# Building a Movie Recommender

In [1]:
import numpy as np
import pandas as pandas
import duckdb
from scipy.sparse import csr_matrix 
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.metrics import ndcg_score
from tqdm.notebook import tqdm

db = duckdb.connect('imdb.duckdb')

We will use the MovieLens 20M Kaggle dataset as provided in the instructions.

However, because there are too many movies and too many users in the dataset to make a user-item matrix, we will trim it down.

Our filter is:
- only movies (not tvShow or other IMDB categories)
- only movies from 2000 or newer

In [2]:
ratings = db.execute("""
                    select userId, imdbId, rating, primaryTitle, year
                    from 
                        (select userId, 
                                'tt' || lpad(cast(b.imdbId as varchar), 7, '0') as imdbId, 
                                rating
                        from ml_rating a
                        left join ml_link b
                        on a.movieId = b.movieId
                        ) ml_movies
                    inner join movies7    
                        on ml_movies.imdbId = movies7.tconst
                    where year > 2000
                     """).fetchdf()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
ratings = ratings.drop_duplicates()

In [4]:
ratings.imdbId.nunique(), ratings.userId.nunique()

(9894, 77805)

First, before we build the recommender, we prepare a method to extract basic movie information from imdb_id values. Then we create a simple function to do a text search on movie titles so we can get their imdb_id values

In [5]:
movies = ratings[['imdbId', 'primaryTitle', 'year']].drop_duplicates()
movies.set_index('imdbId', inplace=True)
movies['year'] = movies['year'].astype(int)

search_movie = lambda search_string: movies[movies.primaryTitle.str.contains(search_string, case=False)] 

In [6]:
# example usage for Iron Man
search_movie('Iron Man')

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0371746,Iron Man,2008
tt1228705,Iron Man 2,2010
tt1300854,Iron Man 3,2013


## Baseline Recommender 

Next we build the movie recommender. Create a sparse user-item interaction matrix and then train a K-Nearest Neighbors on it.

In [41]:
matrix = ratings.pivot(index='userId', columns='imdbId', values='rating').fillna(0)
matrix_sparse = csr_matrix(matrix)

In [54]:
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(matrix_sparse)

In [85]:
def recommend_movies(user_movies, num_recommendations=5):
    user_vector = np.zeros(matrix.shape[1])

    for movie_id in user_movies:
        if movie_id in matrix.columns:
            user_vector[matrix.columns.get_loc(movie_id)] = 1
    user_vector_sparse = csr_matrix(user_vector)

    distances, indices = model_knn.kneighbors(user_vector_sparse, n_neighbors=num_recommendations)
    recommended_movie_ids = []
    for idx in indices.flatten():
        recommended_movie_ids.extend(matrix.columns[matrix.iloc[idx].to_numpy().nonzero()].tolist())
    recommended_movie_ids = list(set(recommended_movie_ids))[:5]

    return movies.loc[recommended_movie_ids]

Now we can demonstrate the usage of this recommender model. Feed `recommend_movies` a list of imdb_id values then it will show you the recommended movies based on K-Nearest Neighbors.

In [None]:
user_movies = ['tt0371746', 'tt1228705','tt1300854'] # we used the Iron Man movies from earlier
recommend_movies(user_movies)

Unnamed: 0_level_0,primaryTitle,year
imdbId,Unnamed: 1_level_1,Unnamed: 2_level_1
tt0035423,Kate & Leopold,2001
tt0371746,Iron Man,2008
tt0796366,Star Trek,2009
tt0468569,The Dark Knight,2008
tt0800369,Thor,2011


## Evaluate model quality

In [5]:
matrix = ratings.pivot(index='userId', columns='imdbId', values='rating').fillna(0)

In [6]:
# We should create train and test sets from the pivoted matrix
train_matrix, test_matrix = train_test_split(matrix, test_size=0.02, random_state=42)
del matrix # I needed this due to my PC's memory limitations

# Next we build the model on the training set
train_matrix_sparse = csr_matrix(train_matrix)

model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(train_matrix_sparse)


def KNN_recommender(user_movies, num_recommendations=5):
    user_vector = np.zeros(train_matrix.shape[1])

    for movie_id in user_movies:
        if movie_id in train_matrix.columns:
            user_vector[train_matrix.columns.get_loc(movie_id)] = 1
    user_vector_sparse = csr_matrix(user_vector)

    distances, indices = model_knn.kneighbors(user_vector_sparse, n_neighbors=num_recommendations)
    recommended_movie_ids = []
    
    for idx in indices.flatten():
        recommended_movie_ids.extend(train_matrix.columns[train_matrix.iloc[idx].to_numpy().nonzero()].tolist())
    recommended_movie_ids = list(set(recommended_movie_ids))[:5]

    return recommended_movie_ids

We create the NDCG@k function to be used for evaluation

In [None]:
def dcg_at_k(recommended_items, relevant_items, k):
    recommended_at_k = recommended_items[:k]
    dcg = 0.0
    for i, item in enumerate(recommended_at_k):
        if item in relevant_items:
            dcg += 1 / np.log2(i + 2)
    return dcg

def ndcg_at_k(recommended_items, relevant_items, k):
    dcg_max = dcg_at_k(relevant_items, relevant_items, k)
    if not dcg_max:
        return 0.0
    return dcg_at_k(recommended_items, relevant_items, k) / dcg_max

In [11]:
# We compute the aggregate evaluation metric over the test set
k = 5
ndcg_scores = []

for user_id in tqdm(test_matrix.index):
    user_test_data = test_matrix.loc[user_id]
    relevant_items = user_test_data[user_test_data > 0].index.tolist()
    
    if not relevant_items:
        continue
    
    user_movies = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()
    recommended_items = KNN_recommender(user_movies, num_recommendations=k)
    
    ndcg = ndcg_at_k(recommended_items, relevant_items, k)
    ndcg_scores.append(ndcg)

# Step 5: Compute the aggregate NDCG
aggregate_ndcg = np.mean(ndcg_scores)
print(f'Aggregate NDCG@{k}: {aggregate_ndcg}')

  0%|          | 0/1557 [00:00<?, ?it/s]

Aggregate NDCG@5: 0.40195392121397444


## Non-negative Matrix Factorization Method

In [15]:
# Initialize and fit NMF model
n_components = 50  # Number of latent features
nmf_model = NMF(n_components=n_components, init='random', random_state=42)
user_features = nmf_model.fit_transform(train_matrix)
movie_features = nmf_model.components_

def NMF_recommender(user_movies, num_recommendations=5):
    try:
        # Create user vector
        user_vector = np.zeros(train_matrix.shape[1])
        for movie_id in user_movies:
            if movie_id in train_matrix.columns:
                user_vector[train_matrix.columns.get_loc(movie_id)] = 1
        
        # Get user features by transforming their vector
        user_features_vector = nmf_model.transform([user_vector])
        
        # Calculate predicted ratings
        predicted_ratings = np.dot(user_features_vector, movie_features)
        
        # Get indices of top rated movies
        recommended_indices = predicted_ratings.argsort()[0][::-1]
        
        # Filter out movies user has already seen
        seen_movies = set(user_movies)
        recommended_movies = []
        for idx in recommended_indices:
            movie_id = train_matrix.columns[idx]
            if movie_id not in seen_movies:
                recommended_movies.append(movie_id)
            if len(recommended_movies) == num_recommendations:
                break
                
        return recommended_movies
        
    except Exception as e:
        print(f"Error making recommendations: {str(e)}")
        return []

In [None]:
user_movies = ['tt0371746', 'tt1228705','tt1300854'] # we used the Iron Man movies from earlier
recommend_movies(user_movies)

In [None]:
# We compute the aggregate evaluation metric over the test set
k = 5
ndcg_scores = []

for user_id in tqdm(test_matrix.index):
    user_test_data = test_matrix.loc[user_id]
    relevant_items = user_test_data[user_test_data > 0].index.tolist()
    
    if not relevant_items:
        continue
    
    user_movies = test_matrix.loc[user_id][test_matrix.loc[user_id] > 0].index.tolist()
    recommended_items = NMF_recommender(user_movies, num_recommendations=k)
    
    ndcg = ndcg_at_k(recommended_items, relevant_items, k)
    ndcg_scores.append(ndcg)

# Step 5: Compute the aggregate NDCG
aggregate_ndcg = np.mean(ndcg_scores)
print(f'Aggregate NDCG@{k}: {aggregate_ndcg}')