# Exploring Collaborative Filtering

### By developing a movie recommendation system

### Movie Dataset
Dataset source - http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [42]:
import pandas as pd
import numpy as np

In [43]:
ratings = pd.read_csv('ml-latest-small/ratings.csv')
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Splitting data for training and validation

In [44]:
np.random.seed(3)
msk = np.random.rand(len(ratings)) < 0.8
train = ratings[msk].copy()
val = ratings[~msk].copy()

### Encode data
Encode data with continous user and movie ids, if train is passed to the function call, we encode df with the same encoding as train

In [45]:
def proc_col(col, train_col=None):
  # use training col if available
  if train_col is not None:
    uniq = train_col.unique()
  else:
    uniq = col.unique()

  # mapping value to index
  name2idx = {}
  for index, val in enumerate(uniq):
    name2idx[val] = index
  arr = []
  for x in col:
    # uknown ids get encoded as -1
    arr.append(name2idx.get(x, -1))
  arr = np.array(arr)
  return name2idx, arr, len(uniq) # understanding mapping, encoded array, number of unique categories

def encode_data(df, train=None):
  df = df.copy()
  for col_name in ["userId", "movieId"]:
    train_col = None
    if train is not None:
      train_col = train[col_name]
    _, col, _ = proc_col(df[col_name], train_col)
    df[col_name] = col

    # removing the unknowns (value of -1)
    df = df[df[col_name] >= 0]
  return df

In [46]:
df_train = encode_data(train)
df_val = encode_data(val)

### Embedding Layer

In [47]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [48]:
# creating embedding layer (matrix) with 10 rows and 3 columns
# first filled with random numbers, so model can learn later during training
embed = nn.Embedding(10, 3)

### Matrix Factorization Model

In [49]:
class MF(nn.Module):
  def __init__(self, num_users, num_items, emb_size=100):
    super(MF, self).__init__()
    # lookup table for all users
    self.user_emb = nn.Embedding(num_users, emb_size)

    # lookup table for all items
    self.item_emb = nn.Embedding(num_items, emb_size)

    # initialize each randomly
    self.user_emb.weight.data.uniform_(0, 0.05)
    self.item_emb.weight.data.uniform_(0, 0.05)
      
  def forward(self, u, v):
    # replace each row with the embedding layer row
    u = self.user_emb(u)
    v = self.item_emb(v)

    # dot product of u and v
    return (u*v).sum(1)   

### Training the MF model

In [50]:

num_users = len(df_train.userId.unique())
num_items = len(df_train.movieId.unique())
print("num users:", num_users)
print("num items:", num_items)

num users: 610
num items: 8998


In [51]:
# initialize the new matrix factorization model
model = MF(num_users, num_items, emb_size=100)

In [52]:
# check how good the current model is without changing anything
def test_loss(model, unsqueeze=False):
  model.eval()
  users = torch.LongTensor(df_val['userId'].values)
  items = torch.LongTensor(df_val['movieId'].values)
  ratings = torch.FloatTensor(df_val['rating'].values)
  if unsqueeze:
    ratings = ratings.unsqueeze(1)
  y_hat = model(users, items)
  loss = F.mse_loss(y_hat, ratings)
  print(f"test loss: {loss.item():.4f}")

In [53]:
# lr: learning rate for Adam (speed of learning)
# wd: weight decay (L2 regularization) applied by Adam to all model params (your embeddings).
# unsqueeze: whether to reshape targets from shape [N] to [N,1] to match model output if needed.
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
  optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
  model.train()
  for i in range(epochs):
    # convert each dataframe columns into pytorch tensors
    users = torch.LongTensor(df_train['userId'].values)
    items = torch.LongTensor(df_train['movieId'].values)
    ratings = torch.FloatTensor(df_train['rating'].values)

    # unsqueeze if we need to match dimensions
    if unsqueeze:
      ratings = ratings.unsqueeze(1)

    # forward pass - predict ratings using the dot product of embeddings
    y_hat = model(users, items)

    # compute the mean squared error between predicted and actual ratings
    loss = F.mse_loss(y_hat, ratings)

    # clear previously stored gradients
    optimizer.zero_grad()

    # compute gradients for all model parameters
    loss.backward()

    # updating embeddings (weights) using the computed gradients
    # to reduce training loss by adjusting parameters
    optimizer.step()
    print(f"Epoch {i+1}/{epochs}, Loss: {loss.item():.4f}")
  test_loss(model, unsqueeze)

### Testing different configurations

In [54]:
train_epocs(model, epochs=10, lr=0.01)

Epoch 1/10, Loss: 12.9140
Epoch 2/10, Loss: 12.5060
Epoch 3/10, Loss: 11.9756
Epoch 4/10, Loss: 11.3311
Epoch 5/10, Loss: 10.5820
Epoch 6/10, Loss: 9.7397
Epoch 7/10, Loss: 8.8184
Epoch 8/10, Loss: 7.8356
Epoch 9/10, Loss: 6.8129
Epoch 10/10, Loss: 5.7758
test loss: 4.7717


In [55]:
train_epocs(model, epochs=20, lr=0.01)

Epoch 1/20, Loss: 4.7544
Epoch 2/20, Loss: 3.8178
Epoch 3/20, Loss: 2.9628
Epoch 4/20, Loss: 2.2228
Epoch 5/20, Loss: 1.6284
Epoch 6/20, Loss: 1.2018
Epoch 7/20, Loss: 0.9494
Epoch 8/20, Loss: 0.8546
Epoch 9/20, Loss: 0.8774
Epoch 10/20, Loss: 0.9614
Epoch 11/20, Loss: 1.0499
Epoch 12/20, Loss: 1.1023
Epoch 13/20, Loss: 1.1020
Epoch 14/20, Loss: 1.0535
Epoch 15/20, Loss: 0.9746
Epoch 16/20, Loss: 0.8869
Epoch 17/20, Loss: 0.8087
Epoch 18/20, Loss: 0.7516
Epoch 19/20, Loss: 0.7196
Epoch 20/20, Loss: 0.7103
test loss: 1.5562


In [56]:
train_epocs(model, epochs=20, lr=0.1)

Epoch 1/20, Loss: 0.7169
Epoch 2/20, Loss: 10.4183
Epoch 3/20, Loss: 1.1558
Epoch 4/20, Loss: 2.2212
Epoch 5/20, Loss: 4.5572
Epoch 6/20, Loss: 5.2357
Epoch 7/20, Loss: 5.0751
Epoch 8/20, Loss: 4.2550
Epoch 9/20, Loss: 2.9255
Epoch 10/20, Loss: 1.7810
Epoch 11/20, Loss: 1.5063
Epoch 12/20, Loss: 1.8308
Epoch 13/20, Loss: 1.9978
Epoch 14/20, Loss: 1.8274
Epoch 15/20, Loss: 1.5248
Epoch 16/20, Loss: 1.2991
Epoch 17/20, Loss: 1.2296
Epoch 18/20, Loss: 1.2934
Epoch 19/20, Loss: 1.3910
Epoch 20/20, Loss: 1.3314
test loss: 2.0961
