# Building a Movie Recommender

In [1]:
import numpy as np
import pandas as pd
import pandas as pandas
import duckdb
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.metrics import ndcg_score
from tqdm.notebook import tqdm

db = duckdb.connect('imdb.duckdb')

We will use the MovieLens 20M Kaggle dataset as provided in the instructions.

However, because there are too many movies and too many users in the dataset to make a user-item matrix, we will trim it down.

Our filter is:
- only movies (not tvShow or other IMDB categories)
- only movies from 2000 or newer

In [2]:
ratings = db.execute("""
                    select userId, imdbId, rating, primaryTitle, year
                    from 
                        (select userId, 
                                'tt' || lpad(cast(b.imdbId as varchar), 7, '0') as imdbId, 
                                rating
                        from ml_rating a
                        left join ml_link b
                        on a.movieId = b.movieId
                        ) ml_movies
                    inner join movies7    
                        on ml_movies.imdbId = movies7.tconst
                    where year > 2000
                     """).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
ratings = ratings.drop_duplicates()

In [4]:
ratings.imdbId.nunique(), ratings.userId.nunique()

(9894, 77805)

First, before we build the recommender, we prepare a method to extract basic movie information from imdb_id values. Then we create a simple function to do a text search on movie titles so we can get their imdb_id values

In [5]:
movies = ratings[['imdbId', 'primaryTitle', 'year']].drop_duplicates()
movies.set_index('imdbId', inplace=True)
movies['year'] = movies['year'].astype(int)

search_movie = lambda search_string: movies[movies.primaryTitle.str.contains(search_string, case=False)] 

In [6]:
# example usage for Iron Man
search_movie('Iron Man')

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0371746,Iron Man,2008
tt1228705,Iron Man 2,2010
tt1300854,Iron Man 3,2013


## Baseline Recommender 

Next we build the movie recommender. Create a sparse user-item interaction matrix and then train a K-Nearest Neighbors on it.

In [41]:
matrix = ratings.pivot(index='userId', columns='imdbId', values='rating').fillna(0)
matrix_sparse = csr_matrix(matrix)

In [54]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(matrix_sparse)

In [85]:
def recommend_movies(user_movies, num_recommendations=5):
    user_vector = np.zeros(matrix.shape[1])

    for movie_id in user_movies:
        if movie_id in matrix.columns:
            user_vector[matrix.columns.get_loc(movie_id)] = 1
    user_vector_sparse = csr_matrix(user_vector)

    distances, indices = model_knn.kneighbors(user_vector_sparse, n_neighbors=num_recommendations)
    recommended_movie_ids = []
    for idx in indices.flatten():
        recommended_movie_ids.extend(matrix.columns[matrix.iloc[idx].to_numpy().nonzero()].tolist())
    recommended_movie_ids = list(set(recommended_movie_ids))[:5]

    return movies.loc[recommended_movie_ids]

Now we can demonstrate the usage of this recommender model. Feed `recommend_movies` a list of imdb_id values then it will show you the recommended movies based on K-Nearest Neighbors.

In [None]:
user_movies = ['tt0371746', 'tt1228705','tt1300854'] # we used the Iron Man movies from earlier
recommend_movies(user_movies)

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0035423,Kate & Leopold,2001
tt0371746,Iron Man,2008
tt0796366,Star Trek,2009
tt0468569,The Dark Knight,2008
tt0800369,Thor,2011


## Evaluate model quality

In [7]:
matrix = ratings.pivot(index='userId', columns='imdbId', values='rating').fillna(0)

# We should create train and test sets from the pivoted matrix
train_matrix, test_matrix = train_test_split(matrix, test_size=0.1, random_state=42)
del matrix # I needed this due to my PC's memory limitations

In [8]:
# Next we build the model on the training set
train_matrix_sparse = csr_matrix(train_matrix)

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(train_matrix_sparse)


def KNN_recommender(user_movies, num_recommendations=5):
    user_vector = np.zeros(train_matrix.shape[1])

    for movie_id in user_movies:
        if movie_id in train_matrix.columns:
            user_vector[train_matrix.columns.get_loc(movie_id)] = 1
    user_vector_sparse = csr_matrix(user_vector)

    distances, indices = model_knn.kneighbors(user_vector_sparse, n_neighbors=num_recommendations)
    recommended_movie_ids = []
    
    for idx in indices.flatten():
        recommended_movie_ids.extend(train_matrix.columns[train_matrix.iloc[idx].to_numpy().nonzero()].tolist())
    recommended_movie_ids = list(set(recommended_movie_ids))[:num_recommendations]

    return recommended_movie_ids

We create the NDCG@k function to be used for evaluation

In [9]:
def dcg_at_k(recommended_items, relevant_items, k):
    recommended_at_k = recommended_items[:k]
    dcg = 0.0
    for i, item in enumerate(recommended_at_k):
        if item in relevant_items:
            dcg += 1 / np.log2(i + 2)
    return dcg

def ndcg_at_k(recommended_items, relevant_items, k):
    dcg_max = dcg_at_k(relevant_items, relevant_items, k)
    if not dcg_max:
        return 0.0
    return dcg_at_k(recommended_items, relevant_items, k) / dcg_max

In [10]:
# We compute the aggregate evaluation metric over the test set
k = 5
ndcg_scores = []

for user_id in tqdm(test_matrix.index):
    user_test_data = test_matrix.loc[user_id]
    relevant_items = user_test_data[user_test_data > 0].index.tolist()
    
    if not relevant_items:
        continue
    
    user_movies = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()
    recommended_items = KNN_recommender(user_movies, num_recommendations=k)
    
    ndcg = ndcg_at_k(recommended_items, relevant_items, k)
    ndcg_scores.append(ndcg)

aggregate_ndcg = np.mean(ndcg_scores)
print(f'Aggregate NDCG@{k}: {aggregate_ndcg}') #0.4019

  0%|          | 0/7781 [00:00<?, ?it/s]

Aggregate NDCG@5: 0.3997455348420781


## Non-negative Matrix Factorization Method

In [None]:
# Build the NMF model
n_components = 50  # Number of latent factors
model_nmf = NMF(n_components=n_components, init='random', random_state=42)
W = model_nmf.fit_transform(train_matrix_sparse)
H = model_nmf.components_

def NMF_recommender(user_movies, num_recommendations=5):
    user_vector = np.zeros(train_matrix.shape[1])

    for movie_id in user_movies:
        if movie_id in train_matrix.columns:
            user_vector[train_matrix.columns.get_loc(movie_id)] = 1

    user_vector_transformed = model_nmf.transform(user_vector.reshape(1, -1))
    scores = np.dot(user_vector_transformed, H)
    recommended_movie_ids = np.argsort(scores.flatten())[::-1][:num_recommendations]

    return train_matrix.columns[recommended_movie_ids].tolist()

# We compute the aggregate evaluation metric over the test set
k = 5
ndcg_scores = []

for user_id in tqdm(test_matrix.index):
    user_test_data = test_matrix.loc[user_id]
    relevant_items = user_test_data[user_test_data > 0].index.tolist()
    
    if not relevant_items:
        continue
    
    user_movies = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()
    recommended_items = NMF_recommender(user_movies, num_recommendations=k)
    
    ndcg = ndcg_at_k(recommended_items, relevant_items, k)
    ndcg_scores.append(ndcg)

aggregate_ndcg = np.mean(ndcg_scores)
print(f'Aggregate NDCG@{k}: {aggregate_ndcg}')



  0%|          | 0/7781 [00:00<?, ?it/s]

Aggregate NDCG@5: 0.8221697347731712


Our KNN model has NDCG@5 of 0.3997 while our NMF model has 0.8221.

By the evaluation metric, the NMF performed much better. It gave recommendations that rank closer to the actual liked movies of the users

## Test running the recommenders with our own inputs

We want to have a qualitative sense of the outputs of the recommenders. We try a group of superhero movies, and then we try a group of romance movies. The observation is that K-Nearest-Neighbors can sometimes be unable to find enough recommendation as required by the parameter `num_recommendations`

In [65]:
search_terms = ['Superman', 'Batman']
pd.concat([search_movie(search_term) for search_term in search_terms],axis=0)

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0348150,Superman Returns,2006
tt1566648,Waiting for Superman,2010
tt0372784,Batman Begins,2005
tt3139072,Son of Batman,2014


In [66]:
user_movies = pd.concat([search_movie(search_term) for search_term in search_terms],axis=0).index.tolist()
movies.loc[KNN_recommender(user_movies, num_recommendations=5)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0372784,Batman Begins,2005


In [67]:
user_movies = pd.concat([search_movie(search_term) for search_term in search_terms],axis=0).index.tolist()
movies.loc[NMF_recommender(user_movies, num_recommendations=10)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0372784,Batman Begins,2005
tt0434409,V for Vendetta,2005
tt0401792,Sin City,2005
tt0381061,Casino Royale,2006
tt0121766,Star Wars: Episode III - Revenge of the Sith,2005
tt0416449,300,2006
tt0482571,The Prestige,2006
tt0379786,Serenity,2005
tt0405422,The 40-Year-Old Virgin,2005
tt0371746,Iron Man,2008


In [59]:
search_terms = ['Iron Man', 'Avengers' ]
pd.concat([search_movie(search_term) for search_term in search_terms],axis=0)

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0371746,Iron Man,2008
tt1228705,Iron Man 2,2010
tt1300854,Iron Man 3,2013
tt0848228,The Avengers,2012


In [60]:
user_movies = pd.concat([search_movie(search_term) for search_term in search_terms],axis=0).index.tolist()
movies.loc[KNN_recommender(user_movies, num_recommendations=10)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt1748122,Moonrise Kingdom,2012
tt0349683,King Arthur,2004
tt1392170,The Hunger Games,2012
tt0317132,Because of Winn-Dixie,2005
tt1049413,Up,2009
tt1250777,Kick-Ass,2010
tt1598778,Contagion,2011
tt1464540,I Am Number Four,2011
tt0432283,Fantastic Mr. Fox,2009
tt1302011,Kung Fu Panda 2,2011


In [61]:
user_movies = pd.concat([search_movie(search_term) for search_term in search_terms],axis=0).index.tolist()
movies.loc[NMF_recommender(user_movies, num_recommendations=10)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0848228,The Avengers,2012
tt1270798,X-Men: First Class,2011
tt1201607,Harry Potter and the Deathly Hallows: Part 2,2011
tt0468569,The Dark Knight,2008
tt1345836,The Dark Knight Rises,2012
tt1228705,Iron Man 2,2010
tt0926084,Harry Potter and the Deathly Hallows: Part 1,2010
tt1392170,The Hunger Games,2012
tt0800369,Thor,2011
tt0458339,Captain America: The First Avenger,2011


In [62]:
search_terms = ['500 Days', 'A Walk to Remember', 'The Notebook', 'About Time' ]
#sorted(pd.concat([search_movie(search_term) for search_term in search_terms],axis=0).primaryTitle.tolist())
pd.concat([search_movie(search_term) for search_term in search_terms],axis=0)

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt1022603,500 Days of Summer,2009
tt0281358,A Walk to Remember,2002
tt0332280,The Notebook,2004
tt2324384,The Notebook,2013
tt0910554,Frequently Asked Questions About Time Travel,2009
tt2194499,About Time,2013


In [63]:
user_movies = [
    'tt1022603', 'tt0281358', 'tt2324384', 'tt2194499'
]
movies.loc[KNN_recommender(user_movies, num_recommendations=10)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt2194499,About Time,2013
tt1022603,500 Days of Summer,2009


In [64]:
movies.loc[NMF_recommender(user_movies, num_recommendations=10)]

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt1022603,500 Days of Summer,2009
tt1375666,Inception,2010
tt0332280,The Notebook,2004
tt1010048,Slumdog Millionaire,2008
tt0361748,Inglourious Basterds,2009
tt0414387,Pride & Prejudice,2005
tt0988595,27 Dresses,2008
tt0457939,The Holiday,2006
tt0458352,The Devil Wears Prada,2006
tt1119646,The Hangover,2009
