In [22]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import fuzz

In [4]:
# Directory that holds the movie/rating CSV files. Keeping it in one place
# means the notebook can be pointed at another copy by editing a single line.
DATA_DIR = "D:/Data/Movie 20M"

# Movie catalogue: only the id and title columns are read.
df_movies = pd.read_csv(
    DATA_DIR + "/movie.csv",
    usecols=['movieId', 'title'],
    dtype={'movieId': 'int32', 'title': 'str'})

# Ratings: only user, movie and rating are read, using 32-bit dtypes.
df_ratings = pd.read_csv(
    DATA_DIR + "/rating.csv",
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

In [7]:
# Sanity check: (rows, columns) of the movies frame, then of the ratings frame.
print(*(frame.shape for frame in (df_movies, df_ratings)))


(27278, 2) (20000263, 3)


In [10]:
# Number of leading rating rows kept for everything that follows.
MAX_RATINGS = 2000000

# Keep only the first MAX_RATINGS rows (positional slice); `df_ratings` is
# rebound, so any code that reads it afterwards sees the truncated frame.
df_ratings = df_ratings.iloc[:MAX_RATINGS]

# Movie-by-user table of ratings; (movie, user) pairs without a rating become 0.
df_movie_features = df_ratings.pivot(
    index='movieId',
    columns='userId',
    values='rating'
).fillna(0)

In [11]:
# Minimum number of ratings a movie needs in order to be kept.
popularity_thres = 50
# One row per movie with the number of ratings it received.
df_movies_cnt = df_ratings.groupby('movieId').size().to_frame(name='count')
# Ids of the movies that reach the threshold.
popular_movies = list(set(
    df_movies_cnt.loc[df_movies_cnt['count'] >= popularity_thres].index
))
# Ratings restricted to those movies.
df_ratings_drop_movies = df_ratings[df_ratings['movieId'].isin(popular_movies)]


In [12]:
# Minimum number of (remaining) ratings a user needs in order to be kept.
ratings_thres = 50
# One row per user with the number of ratings left after the movie filter.
df_users_cnt = df_ratings_drop_movies.groupby('userId').size().to_frame(name='count')
# Ids of the users that reach the threshold.
active_users = list(set(
    df_users_cnt.loc[df_users_cnt['count'] >= ratings_thres].index
))
# Ratings restricted to those users (and, from the previous step, those movies).
df_ratings_drop_users = df_ratings_drop_movies[
    df_ratings_drop_movies['userId'].isin(active_users)
]
                                                                                

In [19]:
# Movie-by-user rating table built from the filtered ratings; missing
# ratings are stored as 0.
movie_user_mat = (
    df_ratings_drop_users
    .pivot(index='movieId', columns='userId', values='rating')
    .fillna(0)
)
# Map each movie title to the position of its row in `movie_user_mat`.
row_titles = df_movies.set_index('movieId').loc[movie_user_mat.index, 'title']
movie_to_idx = dict(zip(row_titles, range(len(row_titles))))
# Same values as the table above, held as a scipy CSR sparse matrix.
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)
print(movie_user_mat)

userId   1      2      3      5      7      8      11     13     14     16     \
movieId                                                                         
1          0.0    0.0    4.0    0.0    0.0    4.0    4.5    4.0    4.5    3.0   
2          3.5    0.0    0.0    3.0    0.0    0.0    0.0    3.0    0.0    0.0   
3          0.0    4.0    0.0    0.0    3.0    5.0    0.0    0.0    0.0    0.0   
4          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
5          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
6          0.0    0.0    0.0    0.0    0.0    3.0    0.0    0.0    0.0    3.0   
7          0.0    0.0    0.0    0.0    3.0    0.0    0.0    0.0    3.5    0.0   
8          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
9          0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
10         0.0    0.0    0.0    0.0    0.0    4.0    2.5    3.0    0.0    0.0   
11         0.0    0.0    0.0

In [48]:
# Brute-force nearest-neighbour search over cosine distance, 20 neighbours
# by default, fitted on the sparse movie-by-user matrix.
neigh = NearestNeighbors(
    n_neighbors=20,
    metric='cosine',
    algorithm='brute',
    n_jobs=-1,
)
# Left as the cell's final expression so the fitted estimator is displayed.
neigh.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [56]:
def fuzzy_matching(mapper, fav_movie, verbose=True, min_ratio=60):
    """Return the index of the title in `mapper` that best matches `fav_movie`.

    Each title is compared case-insensitively with `fav_movie` using
    `fuzz.ratio`; titles scoring at least `min_ratio` are kept as candidates
    and the highest-scoring one wins.

    Parameters
    ----------
    mapper : dict
        Maps a movie title (str) to its index.
    fav_movie : str
        Free-text title to look up.
    verbose : bool, default True
        When true, print the titles of all candidates, best first.
    min_ratio : int, default 60
        Lowest `fuzz.ratio` score a title needs to count as a candidate.

    Returns
    -------
    The index stored in `mapper` for the best candidate, or None (after
    printing a message) when no title reaches `min_ratio`.
    """
    # The query does not change inside the loop, so lower-case it once.
    query = fav_movie.lower()
    candidates = []
    for title, idx in mapper.items():
        score = fuzz.ratio(title.lower(), query)
        if score >= min_ratio:
            candidates.append((title, idx, score))
    if not candidates:
        print('Oops! No match is found')
        return None
    # Stable ascending sort followed by a reversal: best score first, and
    # among equal scores the title that appears later in `mapper` comes first.
    candidates.sort(key=lambda match: match[2])
    candidates.reverse()
    if verbose:
        print('Found possible matches in the database: {0}\n'.format([x[0] for x in candidates]))
    return candidates[0][1]

def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """Print the movies whose rating vectors are closest to `fav_movie`.

    Parameters
    ----------
    model_knn : estimator with `fit(data)` and `kneighbors(X, n_neighbors=...)`
        It is (re)fitted on `data` on every call.
    data : matrix indexable by row, with a `shape` attribute
        One row per movie; row positions must agree with the values in `mapper`.
    mapper : dict
        Maps a movie title (str) to its row position in `data`.
    fav_movie : str
        Free-text title; resolved to a row with `fuzzy_matching`.
    n_recommendations : int
        Maximum number of recommendations to print.

    Returns
    -------
    None. Results are printed. If no title matches, nothing beyond the
    messages already emitted is printed.
    """
    # fit
    model_knn.fit(data)
    # get input movie index
    print('Your input movie:', fav_movie)
    idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if idx is None:
        # fuzzy_matching already reported the miss; data[None] would not
        # select a movie row, so stop here instead of querying the model.
        return
    # One extra neighbour is requested because the query movie is normally
    # among its own neighbours; never ask for more rows than `data` has.
    n_neighbors = min(n_recommendations + 1, data.shape[0])
    distances, indices = model_knn.kneighbors(data[idx], n_neighbors=n_neighbors)
    # ravel() keeps a 1-D list even when a single neighbour is returned
    # (squeeze() would collapse that case to a scalar).
    neighbor_ids = np.asarray(indices).ravel().tolist()
    neighbor_dists = np.asarray(distances).ravel().tolist()
    ranked = sorted(zip(neighbor_ids, neighbor_dists), key=lambda pair: pair[1])
    # Drop the query movie by index rather than by position: another movie
    # tied at the same distance may sort ahead of it.
    raw_recommends = [
        (movie_idx, dist) for movie_idx, dist in ranked if movie_idx != idx
    ][:n_recommendations]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for rank, (movie_idx, dist) in enumerate(raw_recommends, start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, reverse_mapper[movie_idx], dist))

In [57]:
# Free-text title to look up; it is resolved to a catalogue title by fuzzy matching.
my_favorite = 'V for Vendetta '

# Arguments are listed in the order of make_recommendation's signature.
make_recommendation(
    model_knn=neigh,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

Your input movie: V for Vendetta 
Found possible matches in the database: ['V for Vendetta (2006)']

Recommendations for V for Vendetta :
1: Batman Begins (2005), with distance of 0.38366425037384033
2: Sin City (2005), with distance of 0.4006844162940979
3: Dark Knight, The (2008), with distance of 0.41834932565689087
4: Prestige, The (2006), with distance of 0.4343107342720032
5: 300 (2007), with distance of 0.4348687529563904
6: Iron Man (2008), with distance of 0.43665528297424316
7: Casino Royale (2006), with distance of 0.4376547336578369
8: Kill Bill: Vol. 1 (2003), with distance of 0.4464210867881775
9: Lord of the Rings: The Return of the King, The (2003), with distance of 0.44927364587783813
10: Kill Bill: Vol. 2 (2004), with distance of 0.4574657082557678
