# Exploring Collaborative Filtering

### By developing a movie recommendation system

### Movie Dataset
Dataset source - http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [114]:
import pandas as pd
import numpy as np

In [115]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Splitting data for training and validation

In [116]:
np.random.seed(3)
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk].copy()
val = ratings[~msk].copy()

### Encode data
Encode data with continous user and movie ids, if train is passed to the function call, we encode df with the same encoding as train

In [117]:
def proc_col(col, train_col=None):
  # use training col if available
  if train_col is not None:
    uniq = train_col.unique()
  else:
    uniq = col.unique()

  # mapping value to index
  name2idx = {}
  for index, val in enumerate(uniq):
    name2idx[val] = index
  arr = []
  for x in col:
    # uknown ids get encoded as -1
    arr.append(name2idx.get(x, -1))
  arr = np.array(arr)
  return name2idx, arr, len(uniq) # understanding mapping, encoded array, number of unique categories

def encode_data(df, train=None):
  df = df.copy()
  for col_name in ["userId", "movieId"]:
    train_col = None
    if train is not None:
      train_col = train[col_name]
    _, col, _ = proc_col(df[col_name], train_col)
    df[col_name] = col

    # removing the unknowns (value of -1)
    df = df[df[col_name] >= 0]
  return df

In [118]:
df_train = encode_data(train)
df_val = encode_data(val)

### Embedding Layer

In [119]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [120]:
# creating embedding layer (matrix) with 10 rows and 3 columns
# first filled with random numbers, so model can learn later during training
embed = nn.Embedding(10, 3)

### Matrix Factorization Model

In [121]:
class MF(nn.Module):
  def __init__(self, num_users, num_items, emb_size=100):
    super(MF, self).__init__()
    # lookup table for all users
    self.user_emb = nn.Embedding(num_users, emb_size)

    # lookup table for all items
    self.item_emb = nn.Embedding(num_items, emb_size)

    # initialize each randomly
    self.user_emb.weight.data.uniform_(0, 0.05)
    self.item_emb.weight.data.uniform_(0, 0.05)
      
  def forward(self, u, v):
    # replace each row with the embedding layer row
    u = self.user_emb(u)
    v = self.item_emb(v)

    # dot product of u and v
    return (u*v).sum(1)   

### Training the MF model

In [122]:

num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print("num users:", num_users)
print("num items:", num_items)

num users: 610
num items: 8998


In [123]:
# initialize the new matrix factorization model
model = MF(num_users, num_items, emb_size=100)

In [124]:
# check how good the current model is without changing anything
def test_loss(model, unsqueeze=False):
  model.eval()
  users = torch.LongTensor(df_val['userId'].values)
  items = torch.LongTensor(df_val['movieId'].values)
  ratings = torch.FloatTensor(df_val['rating'].values)
  if unsqueeze:
    ratings = ratings.unsqueeze(1)
  y_hat = model(users, items)
  loss = F.mse_loss(y_hat, ratings)
  print(f"test loss: {loss.item():.4f}")

In [125]:
# lr: learning rate for Adam (speed of learning)
# wd: weight decay (L2 regularization) applied by Adam to all model params (your embeddings).
# unsqueeze: whether to reshape targets from shape [N] to [N,1] to match model output if needed.
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
  model.train()
  for i in range(epochs):
    # convert each dataframe columns into pytorch tensors
    users = torch.LongTensor(df_train['userId'].values)
    items = torch.LongTensor(df_train['movieId'].values)
    ratings = torch.FloatTensor(df_train['rating'].values)

    # unsqueeze if we need to match dimensions
    if unsqueeze:
      ratings = ratings.unsqueeze(1)

    # forward pass - predict ratings using the dot product of embeddings
    y_hat = model(users, items)

    # compute the mean squared error between predicted and actual ratings
    loss = F.mse_loss(y_hat, ratings)

    # clear previously stored gradients
    optimizer.zero_grad()

    # compute gradients for all model parameters
    loss.backward()

    # updating embeddings (weights) using the computed gradients
    # to reduce training loss by adjusting parameters
    optimizer.step()
    print(f"Epoch {i+1}/{epochs}, Loss: {loss.item():.4f}")
  test_loss(model, unsqueeze)

### Testing different configurations

In [126]:
train_epocs(model, epochs=10, lr=0.1)

Epoch 1/10, Loss: 12.9140
Epoch 2/10, Loss: 4.8572
Epoch 3/10, Loss: 2.5778
Epoch 4/10, Loss: 3.1057
Epoch 5/10, Loss: 0.8492
Epoch 6/10, Loss: 1.8183
Epoch 7/10, Loss: 2.6539
Epoch 8/10, Loss: 2.1326
Epoch 9/10, Loss: 1.0869
Epoch 10/10, Loss: 0.9740
test loss: 3.0683


In [127]:
train_epocs(model, epochs=15, lr=0.1)

Epoch 1/15, Loss: 1.6437
Epoch 2/15, Loss: 5.7090
Epoch 3/15, Loss: 4.1261
Epoch 4/15, Loss: 1.0724
Epoch 5/15, Loss: 2.8327
Epoch 6/15, Loss: 2.4845
Epoch 7/15, Loss: 0.7608
Epoch 8/15, Loss: 1.2345
Epoch 9/15, Loss: 2.0833
Epoch 10/15, Loss: 1.9896
Epoch 11/15, Loss: 1.1983
Epoch 12/15, Loss: 0.6940
Epoch 13/15, Loss: 1.0968
Epoch 14/15, Loss: 1.4018
Epoch 15/15, Loss: 0.9383
test loss: 1.4314


In [128]:
train_epocs(model, epochs=15, lr=0.01)

Epoch 1/15, Loss: 0.6304
Epoch 2/15, Loss: 0.5552
Epoch 3/15, Loss: 0.5358
Epoch 4/15, Loss: 0.5259
Epoch 5/15, Loss: 0.5131
Epoch 6/15, Loss: 0.4974
Epoch 7/15, Loss: 0.4814
Epoch 8/15, Loss: 0.4658
Epoch 9/15, Loss: 0.4507
Epoch 10/15, Loss: 0.4357
Epoch 11/15, Loss: 0.4205
Epoch 12/15, Loss: 0.4050
Epoch 13/15, Loss: 0.3889
Epoch 14/15, Loss: 0.3726
Epoch 15/15, Loss: 0.3561
test loss: 1.4054


### Matrix Factorization with bias
Currently we assume that all users give ratings centered around the same average and all movies have the same baseline popularity, however this is not true in real life.

Some users are naturally more generous raters forexample they always rate higher and some movies are generally loved by everyone. By adding the bias we can "normalize" these values to increase the accuracy of the model.

In [129]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        # bias for how much a user tends to rate higher or lower than average
        b_u = self.user_bias(u).squeeze()
        # bias for how much an item tends to be rated higher or lower than average
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [130]:
model = MF_bias(num_users, num_items, emb_size=100)

In [131]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

Epoch 1/10, Loss: 12.9126
Epoch 2/10, Loss: 9.1550
Epoch 3/10, Loss: 4.3890
Epoch 4/10, Loss: 1.1587
Epoch 5/10, Loss: 2.4673
Epoch 6/10, Loss: 3.7430
Epoch 7/10, Loss: 2.4491
Epoch 8/10, Loss: 1.0787
Epoch 9/10, Loss: 0.8159
Epoch 10/10, Loss: 1.3178
test loss: 2.8496


In [132]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

Epoch 1/10, Loss: 1.8932
Epoch 2/10, Loss: 1.3251
Epoch 3/10, Loss: 0.9354
Epoch 4/10, Loss: 0.7452
Epoch 5/10, Loss: 0.7225
Epoch 6/10, Loss: 0.7772
Epoch 7/10, Loss: 0.8225
Epoch 8/10, Loss: 0.8215
Epoch 9/10, Loss: 0.7811
Epoch 10/10, Loss: 0.7274
test loss: 1.2240


In [133]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

Epoch 1/10, Loss: 0.6853
Epoch 2/10, Loss: 0.6711
Epoch 3/10, Loss: 0.6592
Epoch 4/10, Loss: 0.6494
Epoch 5/10, Loss: 0.6416
Epoch 6/10, Loss: 0.6355
Epoch 7/10, Loss: 0.6308
Epoch 8/10, Loss: 0.6273
Epoch 9/10, Loss: 0.6248
Epoch 10/10, Loss: 0.6230
test loss: 1.2160


### Neural Network Model
Neural Network Models can learn nonlinear interactions between users and items not just pure similarity like using the Matrix Factorization

In [134]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [135]:
model = CollabFNet(num_users, num_items, emb_size=100)

In [136]:
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True)

Epoch 1/15, Loss: 11.0653
Epoch 2/15, Loss: 5.7462
Epoch 3/15, Loss: 1.3388
Epoch 4/15, Loss: 3.2856
Epoch 5/15, Loss: 2.7545
Epoch 6/15, Loss: 1.3073
Epoch 7/15, Loss: 1.2815
Epoch 8/15, Loss: 1.9777
Epoch 9/15, Loss: 1.5293
Epoch 10/15, Loss: 0.9360
Epoch 11/15, Loss: 1.0160
Epoch 12/15, Loss: 1.3137
Epoch 13/15, Loss: 1.3448
Epoch 14/15, Loss: 1.0803
Epoch 15/15, Loss: 0.7891
test loss: 1.0431


In [137]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

Epoch 1/10, Loss: 0.7910
Epoch 2/10, Loss: 0.9388
Epoch 3/10, Loss: 0.7961
Epoch 4/10, Loss: 0.7277
Epoch 5/10, Loss: 0.8083
Epoch 6/10, Loss: 0.7846
Epoch 7/10, Loss: 0.7158
Epoch 8/10, Loss: 0.7126
Epoch 9/10, Loss: 0.7488
Epoch 10/10, Loss: 0.7465
test loss: 1.0677


In [138]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

Epoch 1/10, Loss: 0.7108
Epoch 2/10, Loss: 0.6964
Epoch 3/10, Loss: 0.6903
Epoch 4/10, Loss: 0.6917
Epoch 5/10, Loss: 0.6943
Epoch 6/10, Loss: 0.6953
Epoch 7/10, Loss: 0.6941
Epoch 8/10, Loss: 0.6905
Epoch 9/10, Loss: 0.6877
Epoch 10/10, Loss: 0.6863
test loss: 1.0497


### Recommend movies based on my preferences

In [139]:
# load movie titles
movies = pd.read_csv('ml-latest-small/movies.csv')
print(f"Total movies available: {len(movies)}")
movies.head()

Total movies available: 9742


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


I want the user to rate 5 movies, but a problem is that I don't know what 5 movies they've watched previously. So giving 5 random movies for them to rate won't work. So my solution to this problem is to make a list of the top 50 most popular movies in the dataset. Now users can just pick at least 5 movies from this list. Picking more movies will make the recommnendation more accurate.

In [140]:
# display top 50 most popular movies that users are likely to have watched
# popularity is determined by the number of ratings a movie has received
movie_ratings_count = ratings.groupby('movieId').size().reset_index(name='count')
popular_movies_df = movies.merge(movie_ratings_count, on='movieId').sort_values('count', ascending=False).head(50)

print("TOP 50 MOST POPULAR MOVIES IN THE DATASET")
print("Pick movies you've watched from this list")
for idx, (_, row) in enumerate(popular_movies_df.iterrows(), 1):
    print(f"{idx:2d}. {row['title']:50s} | Genres: {row['genres']}")

TOP 50 MOST POPULAR MOVIES IN THE DATASET
Pick movies you've watched from this list
 1. Forrest Gump (1994)                                | Genres: Comedy|Drama|Romance|War
 2. Shawshank Redemption, The (1994)                   | Genres: Crime|Drama
 3. Pulp Fiction (1994)                                | Genres: Comedy|Crime|Drama|Thriller
 4. Silence of the Lambs, The (1991)                   | Genres: Crime|Horror|Thriller
 5. Matrix, The (1999)                                 | Genres: Action|Sci-Fi|Thriller
 6. Star Wars: Episode IV - A New Hope (1977)          | Genres: Action|Adventure|Sci-Fi
 7. Jurassic Park (1993)                               | Genres: Action|Adventure|Sci-Fi|Thriller
 8. Braveheart (1995)                                  | Genres: Action|Drama|War
 9. Terminator 2: Judgment Day (1991)                  | Genres: Action|Sci-Fi
10. Schindler's List (1993)                            | Genres: Drama|War
11. Fight Club (1999)                                  | G

Now you can pick at least 5 movies from the list above and give each a rating!

In [141]:
user_movie_ratings = {}  # dictionary to store {movieId: rating}

while len(user_movie_ratings) < 5 or input("\nDo you want to rate another movie? (yes/no): ").lower() == 'yes':
    print("\n" + "-"*80)
    print(f"Movies rated so far: {len(user_movie_ratings)}")
    
    try:
        movie_num = int(input(f"Enter movie number (1-50): "))
        
        if movie_num < 1 or movie_num > 50:
            print("Please enter a number between 1 and 50.")
            continue
        
        selected_movie = popular_movies_df.iloc[movie_num - 1]
        
        # check if user already rated the same movie
        if selected_movie['movieId'] in user_movie_ratings:
            print(f"You already rated '{selected_movie['title']}' with {user_movie_ratings[selected_movie['movieId']]} stars.")
            update = input("Update rating? (yes/no): ")
            if update.lower() != 'yes':
                continue
        
        # get rating
        while True:
            try:
                rating = float(input(f"Rate '{selected_movie['title']}' (1-5): "))
                if 1 <= rating <= 5:
                    user_movie_ratings[selected_movie['movieId']] = rating
                    print(f"✓ Recorded: {selected_movie['title']} = {rating} stars")
                    break
                else:
                    print("Please enter a rating between 1 and 5")
            except ValueError:
                print("Please enter a valid number")
        
    except ValueError:
        print("Invalid input. Please enter a number.")
        continue
    
    if len(user_movie_ratings) >= 5:
        continue_rating = input("You've rated enough! Continue rating more? (yes/no): ")
        if continue_rating.lower() != 'yes':
            break

print(f"Thank you! You've rated {len(user_movie_ratings)} movies.")


--------------------------------------------------------------------------------
Movies rated so far: 0
✓ Recorded: Forrest Gump (1994) = 3.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 1
✓ Recorded: Jurassic Park (1993) = 4.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 2
✓ Recorded: Fight Club (1999) = 5.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 3
✓ Recorded: Saving Private Ryan (1998) = 3.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 4
✓ Recorded: Lion King, The (1994) = 5.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 5
✓ Recorded: Shrek (2001) = 2.0 stars

--------------------------------------------------------------------------------
Movies rated so far: 6


In [142]:
# display your ratings summary
print("\nYOUR RATINGS SUMMARY:")
print("="*80)
rated_movies_df = movies[movies['movieId'].isin(user_movie_ratings.keys())].copy()
rated_movies_df['your_rating'] = rated_movies_df['movieId'].map(user_movie_ratings)
rated_movies_df = rated_movies_df.sort_values('your_rating', ascending=False)

for _, row in rated_movies_df.iterrows():
    stars = '⭐' * int(row['your_rating'])
    print(f"{row['your_rating']:.1f} {stars:10s} {row['title']}")
    print(f"     Genres: {row['genres']}")
    print()


YOUR RATINGS SUMMARY:
5.0 ⭐⭐⭐⭐⭐      Lion King, The (1994)
     Genres: Adventure|Animation|Children|Drama|Musical|IMAX

5.0 ⭐⭐⭐⭐⭐      Mrs. Doubtfire (1993)
     Genres: Comedy|Drama

5.0 ⭐⭐⭐⭐⭐      Fight Club (1999)
     Genres: Action|Crime|Drama|Thriller

5.0 ⭐⭐⭐⭐⭐      Dark Knight, The (2008)
     Genres: Action|Crime|Drama|IMAX

4.0 ⭐⭐⭐⭐       Jurassic Park (1993)
     Genres: Action|Adventure|Sci-Fi|Thriller

4.0 ⭐⭐⭐⭐       Beauty and the Beast (1991)
     Genres: Animation|Children|Fantasy|Musical|Romance|IMAX

4.0 ⭐⭐⭐⭐       Mission: Impossible (1996)
     Genres: Action|Adventure|Mystery|Thriller

3.0 ⭐⭐⭐        Forrest Gump (1994)
     Genres: Comedy|Drama|Romance|War

3.0 ⭐⭐⭐        Saving Private Ryan (1998)
     Genres: Action|Drama|War

2.0 ⭐⭐         Shrek (2001)
     Genres: Adventure|Animation|Children|Comedy|Fantasy|Romance



In [143]:
# get all movies that the user hasn't rated
all_movie_ids = movies['movieId'].values
unrated_movie_ids = [mid for mid in all_movie_ids if mid not in user_movie_ratings.keys()]

# get the encoding mappings from training data
user_mapping, _, _ = proc_col(train['userId'])
movie_mapping, _, _ = proc_col(train['movieId'])

# find similar users based on your ratings
similar_users = []
for rated_movie_id, rating in user_movie_ratings.items():
    similar_user_ratings = train[
        (train['movieId'] == rated_movie_id) & 
        (abs(train['rating'] - rating) <= 1)
    ]['userId'].values
    similar_users.extend(similar_user_ratings)

# get the most similar user
from collections import Counter
if similar_users:
    most_common_user = Counter(similar_users).most_common(1)[0][0]
    encoded_user = user_mapping[most_common_user]
    print(f"Found similar users based on your taste!")
else:
    # fallback to a random active user
    encoded_user = 0
    print("Using default user profile...")

# generating predictions
predictions = []
model.eval()

with torch.no_grad():
    for movie_id in unrated_movie_ids:
        if movie_id in movie_mapping:
            encoded_movie = movie_mapping[movie_id]
            
            u_tensor = torch.LongTensor([encoded_user])
            v_tensor = torch.LongTensor([encoded_movie])
            
            pred_rating = model(u_tensor, v_tensor).item()
            pred_rating = max(0.5, min(5.0, pred_rating))
            
            predictions.append({
                'movieId': movie_id,
                'predicted_rating': pred_rating
            })

# convert to dataframe and sort by predicted rating
predictions_df = pd.DataFrame(predictions)
predictions_df = predictions_df.merge(movies, on='movieId')
predictions_df = predictions_df.sort_values('predicted_rating', ascending=False)

print(f"\nGenerated predictions for {len(predictions_df)} movies!")

Found similar users based on your taste!

Generated predictions for 8988 movies!


In [144]:
# Display top 10 recommendations
print("\n" + "="*80)
print("🎬 TOP 10 MOVIE RECOMMENDATIONS FOR YOU 🎬")
print("="*80)

top_recommendations = predictions_df.head(10)
for idx, (_, row) in enumerate(top_recommendations.iterrows(), 1):
    print(f"\n{idx}. {row['title']}")
    print(f"   Predicted Rating: {row['predicted_rating']:.2f} ⭐")
    print(f"   Genres: {row['genres']}")

print("\n" + "="*80)


🎬 TOP 10 MOVIE RECOMMENDATIONS FOR YOU 🎬

1. Wild Parrots of Telegraph Hill, The (2003)
   Predicted Rating: 4.59 ⭐
   Genres: Documentary

2. Heidi Fleiss: Hollywood Madam (1995)
   Predicted Rating: 4.55 ⭐
   Genres: Documentary

3. Man Bites Dog (C'est arrivé près de chez vous) (1992)
   Predicted Rating: 4.55 ⭐
   Genres: Comedy|Crime|Drama|Thriller

4. I Am Not Your Negro (2017)
   Predicted Rating: 4.51 ⭐
   Genres: Documentary

5. Guess Who's Coming to Dinner (1967)
   Predicted Rating: 4.51 ⭐
   Genres: Drama

6. Come and See (Idi i smotri) (1985)
   Predicted Rating: 4.47 ⭐
   Genres: Drama|War

7. Indignation (2016)
   Predicted Rating: 4.46 ⭐
   Genres: Drama

8. Act of Killing, The (2012)
   Predicted Rating: 4.46 ⭐
   Genres: Documentary

9. On the Beach (1959)
   Predicted Rating: 4.46 ⭐
   Genres: Drama

10. Crossing Delancey (1988)
   Predicted Rating: 4.46 ⭐
   Genres: Comedy|Romance

