# Exploring Collaborative Filtering

### By developing a movie recommendation system

### Movie Dataset
Dataset source - http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [73]:
import pandas as pd
import numpy as np

In [74]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Splitting data for training and validation

In [75]:
np.random.seed(3)
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk].copy()
val = ratings[~msk].copy()

### Encode data
Encode data with continous user and movie ids, if train is passed to the function call, we encode df with the same encoding as train

In [76]:
def proc_col(col, train_col=None):
  # use training col if available
  if train_col is not None:
    uniq = train_col.unique()
  else:
    uniq = col.unique()

  # mapping value to index
  name2idx = {}
  for index, val in enumerate(uniq):
    name2idx[val] = index
  arr = []
  for x in col:
    # uknown ids get encoded as -1
    arr.append(name2idx.get(x, -1))
  arr = np.array(arr)
  return name2idx, arr, len(uniq) # understanding mapping, encoded array, number of unique categories

def encode_data(df, train=None):
  df = df.copy()
  for col_name in ["userId", "movieId"]:
    train_col = None
    if train is not None:
      train_col = train[col_name]
    _, col, _ = proc_col(df[col_name], train_col)
    df[col_name] = col

    # removing the unknowns (value of -1)
    df = df[df[col_name] >= 0]
  return df

In [77]:
df_train = encode_data(train)
df_val = encode_data(val)

### Embedding Layer

In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [79]:
# creating embedding layer (matrix) with 10 rows and 3 columns
# first filled with random numbers, so model can learn later during training
embed = nn.Embedding(10, 3)

### Matrix Factorization Model

In [80]:
class MF(nn.Module):
  def __init__(self, num_users, num_items, emb_size=100):
    super(MF, self).__init__()
    # lookup table for all users
    self.user_emb = nn.Embedding(num_users, emb_size)

    # lookup table for all items
    self.item_emb = nn.Embedding(num_items, emb_size)

    # initialize each randomly
    self.user_emb.weight.data.uniform_(0, 0.05)
    self.item_emb.weight.data.uniform_(0, 0.05)
      
  def forward(self, u, v):
    # replace each row with the embedding layer row
    u = self.user_emb(u)
    v = self.item_emb(v)

    # dot product of u and v
    return (u*v).sum(1)   

### Training the MF model

In [81]:

num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print("num users:", num_users)
print("num items:", num_items)

num users: 610
num items: 8998


In [82]:
# initialize the new matrix factorization model
model = MF(num_users, num_items, emb_size=100)

In [83]:
# check how good the current model is without changing anything
def test_loss(model, unsqueeze=False):
  model.eval()
  users = torch.LongTensor(df_val['userId'].values)
  items = torch.LongTensor(df_val['movieId'].values)
  ratings = torch.FloatTensor(df_val['rating'].values)
  if unsqueeze:
    ratings = ratings.unsqueeze(1)
  y_hat = model(users, items)
  loss = F.mse_loss(y_hat, ratings)
  print(f"test loss: {loss.item():.4f}")

In [84]:
# lr: learning rate for Adam (speed of learning)
# wd: weight decay (L2 regularization) applied by Adam to all model params (your embeddings).
# unsqueeze: whether to reshape targets from shape [N] to [N,1] to match model output if needed.
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
  model.train()
  for i in range(epochs):
    # convert each dataframe columns into pytorch tensors
    users = torch.LongTensor(df_train['userId'].values)
    items = torch.LongTensor(df_train['movieId'].values)
    ratings = torch.FloatTensor(df_train['rating'].values)

    # unsqueeze if we need to match dimensions
    if unsqueeze:
      ratings = ratings.unsqueeze(1)

    # forward pass - predict ratings using the dot product of embeddings
    y_hat = model(users, items)

    # compute the mean squared error between predicted and actual ratings
    loss = F.mse_loss(y_hat, ratings)

    # clear previously stored gradients
    optimizer.zero_grad()

    # compute gradients for all model parameters
    loss.backward()

    # updating embeddings (weights) using the computed gradients
    # to reduce training loss by adjusting parameters
    optimizer.step()
    print(f"Epoch {i+1}/{epochs}, Loss: {loss.item():.4f}")
  test_loss(model, unsqueeze)

### Testing different configurations

In [85]:
train_epocs(model, epochs=10, lr=0.1)

Epoch 1/10, Loss: 12.9108
Epoch 2/10, Loss: 4.8494
Epoch 3/10, Loss: 2.6056
Epoch 4/10, Loss: 3.0974
Epoch 5/10, Loss: 0.8495
Epoch 6/10, Loss: 1.8237
Epoch 7/10, Loss: 2.6599
Epoch 8/10, Loss: 2.1401
Epoch 9/10, Loss: 1.0947
Epoch 10/10, Loss: 0.9769
test loss: 3.0757


In [86]:
train_epocs(model, epochs=15, lr=0.1)

Epoch 1/15, Loss: 1.6413
Epoch 2/15, Loss: 5.7155
Epoch 3/15, Loss: 4.1172
Epoch 4/15, Loss: 1.0737
Epoch 5/15, Loss: 2.8520
Epoch 6/15, Loss: 2.4697
Epoch 7/15, Loss: 0.7537
Epoch 8/15, Loss: 1.2421
Epoch 9/15, Loss: 2.0902
Epoch 10/15, Loss: 1.9920
Epoch 11/15, Loss: 1.1977
Epoch 12/15, Loss: 0.6940
Epoch 13/15, Loss: 1.0995
Epoch 14/15, Loss: 1.4013
Epoch 15/15, Loss: 0.9348
test loss: 1.4304


In [87]:
train_epocs(model, epochs=15, lr=0.01)

Epoch 1/15, Loss: 0.6293
Epoch 2/15, Loss: 0.5548
Epoch 3/15, Loss: 0.5360
Epoch 4/15, Loss: 0.5255
Epoch 5/15, Loss: 0.5120
Epoch 6/15, Loss: 0.4963
Epoch 7/15, Loss: 0.4803
Epoch 8/15, Loss: 0.4648
Epoch 9/15, Loss: 0.4499
Epoch 10/15, Loss: 0.4350
Epoch 11/15, Loss: 0.4197
Epoch 12/15, Loss: 0.4040
Epoch 13/15, Loss: 0.3878
Epoch 14/15, Loss: 0.3714
Epoch 15/15, Loss: 0.3550
test loss: 1.4070


### Matrix Factorization with bias
Currently we assume that all users give ratings centered around the same average and all movies have the same baseline popularity, however this is not true in real life.

Some users are naturally more generous raters forexample they always rate higher and some movies are generally loved by everyone. By adding the bias we can "normalize" these values to increase the accuracy of the model.

In [88]:
class MF_bias(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100):
        super(MF_bias, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.user_bias = nn.Embedding(num_users, 1)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.item_bias = nn.Embedding(num_items, 1)
        self.user_emb.weight.data.uniform_(0,0.05)
        self.item_emb.weight.data.uniform_(0,0.05)
        self.user_bias.weight.data.uniform_(-0.01,0.01)
        self.item_bias.weight.data.uniform_(-0.01,0.01)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        # bias for how much a user tends to rate higher or lower than average
        b_u = self.user_bias(u).squeeze()
        # bias for how much an item tends to be rated higher or lower than average
        b_v = self.item_bias(v).squeeze()
        return (U*V).sum(1) +  b_u  + b_v

In [89]:
model = MF_bias(num_users, num_items, emb_size=100)

In [90]:
train_epocs(model, epochs=10, lr=0.05, wd=1e-5)

Epoch 1/10, Loss: 12.9050
Epoch 2/10, Loss: 9.1493
Epoch 3/10, Loss: 4.3852
Epoch 4/10, Loss: 1.1557
Epoch 5/10, Loss: 2.4683
Epoch 6/10, Loss: 3.7424
Epoch 7/10, Loss: 2.4468
Epoch 8/10, Loss: 1.0769
Epoch 9/10, Loss: 0.8153
Epoch 10/10, Loss: 1.3178
test loss: 2.8503


In [91]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-5)

Epoch 1/10, Loss: 1.8930
Epoch 2/10, Loss: 1.3249
Epoch 3/10, Loss: 0.9351
Epoch 4/10, Loss: 0.7448
Epoch 5/10, Loss: 0.7222
Epoch 6/10, Loss: 0.7773
Epoch 7/10, Loss: 0.8231
Epoch 8/10, Loss: 0.8222
Epoch 9/10, Loss: 0.7818
Epoch 10/10, Loss: 0.7278
test loss: 1.2259


In [92]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-5)

Epoch 1/10, Loss: 0.6853
Epoch 2/10, Loss: 0.6711
Epoch 3/10, Loss: 0.6592
Epoch 4/10, Loss: 0.6494
Epoch 5/10, Loss: 0.6416
Epoch 6/10, Loss: 0.6355
Epoch 7/10, Loss: 0.6309
Epoch 8/10, Loss: 0.6274
Epoch 9/10, Loss: 0.6249
Epoch 10/10, Loss: 0.6232
test loss: 1.2177


### Neural Network Model
Neural Network Models can learn nonlinear interactions between users and items not just pure similarity like using the Matrix Factorization

In [93]:
class CollabFNet(nn.Module):
    def __init__(self, num_users, num_items, emb_size=100, n_hidden=10):
        super(CollabFNet, self).__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        self.lin1 = nn.Linear(emb_size*2, n_hidden)
        self.lin2 = nn.Linear(n_hidden, 1)
        self.drop1 = nn.Dropout(0.1)
        
    def forward(self, u, v):
        U = self.user_emb(u)
        V = self.item_emb(v)
        x = F.relu(torch.cat([U, V], dim=1))
        x = self.drop1(x)
        x = F.relu(self.lin1(x))
        x = self.lin2(x)
        return x

In [94]:
model = CollabFNet(num_users, num_items, emb_size=100)

In [95]:
train_epocs(model, epochs=15, lr=0.05, wd=1e-6, unsqueeze=True)

Epoch 1/15, Loss: 11.8611
Epoch 2/15, Loss: 1.2061
Epoch 3/15, Loss: 11.7211
Epoch 4/15, Loss: 1.6822
Epoch 5/15, Loss: 2.3878
Epoch 6/15, Loss: 4.9893
Epoch 7/15, Loss: 5.5557
Epoch 8/15, Loss: 4.7141
Epoch 9/15, Loss: 3.2336
Epoch 10/15, Loss: 1.7677
Epoch 11/15, Loss: 0.9652
Epoch 12/15, Loss: 1.2551
Epoch 13/15, Loss: 2.1266
Epoch 14/15, Loss: 2.3708
Epoch 15/15, Loss: 1.7475
test loss: 1.1896


In [96]:
train_epocs(model, epochs=10, lr=0.01, wd=1e-6, unsqueeze=True)

Epoch 1/10, Loss: 1.0385
Epoch 2/10, Loss: 0.8155
Epoch 3/10, Loss: 0.9147
Epoch 4/10, Loss: 0.9229
Epoch 5/10, Loss: 0.8455
Epoch 6/10, Loss: 0.7784
Epoch 7/10, Loss: 0.7723
Epoch 8/10, Loss: 0.8033
Epoch 9/10, Loss: 0.8077
Epoch 10/10, Loss: 0.7746
test loss: 1.0122


In [97]:
train_epocs(model, epochs=10, lr=0.001, wd=1e-6, unsqueeze=True)

Epoch 1/10, Loss: 0.7376
Epoch 2/10, Loss: 0.7300
Epoch 3/10, Loss: 0.7250
Epoch 4/10, Loss: 0.7264
Epoch 5/10, Loss: 0.7273
Epoch 6/10, Loss: 0.7259
Epoch 7/10, Loss: 0.7241
Epoch 8/10, Loss: 0.7250
Epoch 9/10, Loss: 0.7218
Epoch 10/10, Loss: 0.7190
test loss: 1.0130


### Recommend movies based on my preferences