In [19]:
import pandas as pd

# Load only the columns the recommender needs, with explicit compact dtypes.
# Paths are relative to the notebook's working directory.
movies_df = pd.read_csv(
    'ml-25m/movies.csv',
    dtype={'movieId': 'int32', 'title': 'str'},
    usecols=['movieId', 'title'],
)

ratings_df = pd.read_csv(
    'ml-25m/ratings.csv',
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'},
    usecols=['userId', 'movieId', 'rating'],
)

print('read csv files: done')

read csv files: done


In [31]:
## data preparation_1

# Number of ratings per movie; a movie is kept when it has at least 50 ratings.
movies_df_cnt = pd.DataFrame(
    ratings_df.groupby('movieId').size(), columns=['count'])
popular_movies = list(set(
    movies_df_cnt[movies_df_cnt['count'] >= 50].index))
movies_filter = ratings_df['movieId'].isin(popular_movies).values


# Number of ratings per user; a user is kept when they gave at least 20 ratings.
users_df_cnt = pd.DataFrame(
    ratings_df.groupby('userId').size(), columns=['count'])
active_users = list(set(
    users_df_cnt[users_df_cnt['count'] >= 20].index))  # noqa
users_filter = ratings_df['userId'].isin(active_users).values

# A rating row survives only if both its movie and its user passed the filters.
df_ratings_filtered = ratings_df.loc[movies_filter & users_filter]

print('set popular_movies and active_users: done')


set popular_movies and active_users: done


In [33]:
## data preparation_2
# Reshape the long ratings table into a movie x user table (rows are movieId,
# columns are userId, cells are ratings); missing ratings are stored as 0.
movie_user_mat = (
    df_ratings_filtered
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)

# movie_user_mat.head(5) # 5 rows × 162242 columns
print('movie-user rating matrix: done')

movie-user rating matrix: done


In [34]:
## data preparation_3
from scipy.sparse import csr_matrix

# Map each movie title to its row position in movie_user_mat. Titles are looked
# up in movies_df in the order of movie_user_mat's index, so position i belongs
# to row i of the matrix. If two rows share a title, the later position wins.
hashmap = dict(
    (title, position)
    for position, title in enumerate(
        movies_df.set_index('movieId').loc[movie_user_mat.index, 'title'].tolist())
)

# Sparse (CSR) copy of the same values, used as the model's input.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

print('hashmap and movie_user_mat_sparse: done')


hashmap and movie_user_mat_sparse: done


In [55]:
## matching
from fuzzywuzzy import fuzz
import time

def fuzzy_matching(hashmap, fav_movie, min_ratio=70):
    """
    Find the stored title that best fuzzy-matches a user-supplied movie name.

    Every title in ``hashmap`` is compared, case-insensitively, with
    ``fav_movie`` using ``fuzz.ratio``. Titles scoring at least ``min_ratio``
    are printed, best score first.

    Parameters
    ----------
    hashmap: dict, maps movie title (str) to an index
    fav_movie: str, name of user input movie
    min_ratio: int, lowest ``fuzz.ratio`` score accepted as a match (default 70)

    Returns
    -------
    The index stored in ``hashmap`` for the best-scoring title, or ``None``
    when no title reaches ``min_ratio`` (after printing 'no match found').
    """
    query = fav_movie.lower()
    match_tuple = []

    for title, idx in hashmap.items():
        ratio = fuzz.ratio(title.lower(), query)
        if ratio >= min_ratio:
            match_tuple.append((title, idx, ratio))

    if not match_tuple:
        # Nothing to index into: report it and let the caller decide what to
        # do, instead of failing on match_tuple[0] with an opaque IndexError.
        print('no match found')
        return None

    # Ascending sort followed by a reversal puts the best score first.
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    print('Found possible matches in our database: '
          '{0}\n'.format([x[0] for x in match_tuple]))

    return match_tuple[0][1]

## inference
def inference(model, data, hashmap, fav_movie, n_recommendation):
    """
    Fit ``model`` on ``data`` and return the neighbours of the matched movie.

    Parameters
    ----------
    model: object with ``fit(data)`` and ``kneighbors(X, n_neighbors=...)``;
        ``kneighbors`` must return ``(distances, indices)`` arrays
    data: rating matrix passed to ``model.fit``; ``data[idx]`` is the query row
    hashmap: dict, maps movie title (str) to a row index of ``data``
    fav_movie: str, name of user input movie
    n_recommendation: int, number of recommendations wanted

    Returns
    -------
    list of ``(index, distance)`` tuples ordered from largest to smallest
    distance. ``n_recommendation + 1`` neighbours are requested and the one
    with the smallest distance is dropped (presumably the query movie itself).
    An empty list is returned when ``fav_movie`` matches no title.
    """
    model.fit(data)
    print('You have input movie: ', fav_movie)
    idx = fuzzy_matching(hashmap, fav_movie)
    if idx is None:
        # No title matched, so there is no row to query.
        return []
    print('Recommendation system start to make inference')
    print('......\n')
    t0 = time.time()
    distances, indices = model.kneighbors(
        data[idx], n_neighbors=n_recommendation + 1)
    # ravel() always yields a 1-D array. squeeze() would collapse a
    # single-neighbour result to a scalar and make zip() below fail.
    neighbours = zip(indices.ravel().tolist(), distances.ravel().tolist())
    # Sort by ascending distance, then walk backwards and stop before
    # position 0, which discards the closest entry.
    raw_recommends = sorted(neighbours, key=lambda x: x[1])[:0:-1]
    print('It took my system {:.2f}s to make inference'.format(time.time()-t0))

    return raw_recommends
          

In [56]:
def make_recommendations(fav_movie, n_recommendations):
    """
    Print the top n movie recommendations for a movie name.

    Uses the notebook-level ``movie_user_mat_sparse`` and ``hashmap`` together
    with a freshly constructed ``NearestNeighbors()`` (library defaults).

    Parameters
    ----------
    fav_movie: str, name of user input movie
    n_recommendations: int, top n recommendations

    Returns
    -------
    None; the recommendations are printed, one per line.
    """
    # Imported here because the notebook's only other import of
    # NearestNeighbors is in a cell further down, so a top-to-bottom run would
    # otherwise reach this call before the name exists.
    from sklearn.neighbors import NearestNeighbors

    raw_recommends = inference(
        NearestNeighbors(), movie_user_mat_sparse, hashmap,
        fav_movie, n_recommendations)
    # hashmap maps title -> index; invert it to turn result indices into titles.
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for rank, (idx, dist) in enumerate(raw_recommends, start=1):
        print('{0}: {1}, with distance of {2}'.format(
            rank, reverse_hashmap[idx], dist))
    

In [57]:
# Demo: ask for the 10 recommendations for a loosely typed title.
make_recommendations(fav_movie='Monsters Inc', n_recommendations=10)

You have input movie:  Monsters Inc
Found possible matches in our database: ['Monsters, Inc. (2001)']

Recommendation system start to make inference
......

It took my system 0.37s to make inference
Recommendations for Monsters Inc:
1: Spider-Man 2 (2004), with distance of 660.0568237304688
2: Harry Potter and the Chamber of Secrets (2002), with distance of 659.912109375
3: Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001), with distance of 656.5908203125
4: Ratatouille (2007), with distance of 654.8234252929688
5: Spider-Man (2002), with distance of 640.3944091796875
6: Ice Age (2002), with distance of 629.60107421875
7: Shrek (2001), with distance of 624.0492553710938
8: Shrek 2 (2004), with distance of 620.7487182617188
9: Incredibles, The (2004), with distance of 606.2330322265625
10: Finding Nemo (2003), with distance of 562.4389038085938


In [27]:
from sklearn.neighbors import NearestNeighbors
# KNN model configured for cosine distance with brute-force search.
# NOTE(review): no other cell visible in this notebook references model_knn;
# make_recommendations builds its own default NearestNeighbors() instead.
model_knn = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
from fuzzywuzzy import fuzz


 def make_recommendations(self, fav_movie, n_recommendations):
        """
        Print the top n movie recommendations for a movie name.

        NOTE(review): this is written as a method (it takes ``self`` and uses
        ``self._prep_data``, ``self._inference`` and ``self.model``), but no
        enclosing class is visible in this notebook. Calling it as a plain
        two-argument function fails with a missing-argument TypeError.

        Parameters
        ----------
        fav_movie: str, name of user input movie
        n_recommendations: int, top n recommendations
        """
        # the two values returned by _prep_data: model input and title lookup
        ratings, title_to_idx = self._prep_data()
        # neighbours of the requested movie as (index, distance) pairs
        neighbours = self._inference(
            self.model, ratings, title_to_idx,
            fav_movie, n_recommendations)
        # invert the lookup so result indices can be shown as titles
        idx_to_title = {position: title for title, position in title_to_idx.items()}
        print('Recommendations for {}:'.format(fav_movie))
        for rank, (idx, dist) in enumerate(neighbours, start=1):
            print('{0}: {1}, with distance '
                  'of {2}'.format(rank, idx_to_title[idx], dist))



TypeError: make_recommendations() missing 1 required positional argument: 'n_recommendations'