# Exploring Collaborative Filtering

### By developing a movie recommendation system

### Movie Dataset
Dataset source - http://files.grouplens.org/datasets/movielens/ml-latest-small.zip

In [1]:
import pandas as pd
import numpy as np

In [2]:
# read the ratings table (userId, movieId, rating, timestamp) from the unzipped dataset folder
ratings_csv = 'ml-latest-small/ratings.csv'
ratings = pd.read_csv(ratings_csv)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


### Splitting data for training and validation

In [3]:
# fixed seed so the random split is identical on every run
np.random.seed(3)
# boolean mask: True for roughly 80% of the rows
msk = np.random.rand(len(ratings)) < 0.8
# masked rows become the training set, the remaining rows the validation set
train, val = ratings.loc[msk].copy(), ratings.loc[~msk].copy()

### Encode data
Encode the data with contiguous user and movie ids. If `train` is passed to the function call, `df` is encoded with the same mapping as `train`.

In [4]:
def proc_col(col, train_col=None):
  """Encode the values of a column as integer indices.

  Args:
    col: column whose values are encoded.
    train_col: optional column that defines the vocabulary; when given,
      its unique values (not those of ``col``) determine the indices.

  Returns:
    A tuple ``(name2idx, codes, n_unique)``: the value -> index mapping,
    a numpy array with one code per entry of ``col``, and the number of
    unique values in the vocabulary.
  """
  # the vocabulary comes from the training column whenever one is supplied
  source = col if train_col is None else train_col
  uniq = source.unique()

  # each distinct value gets its position in the unique-value array
  name2idx = {value: position for position, value in enumerate(uniq)}

  # values missing from the vocabulary are encoded as -1
  codes = np.array([name2idx.get(item, -1) for item in col])
  return name2idx, codes, len(uniq)

def encode_data(df, train=None):
  """Return a copy of ``df`` with ``userId`` and ``movieId`` replaced by integer codes.

  Args:
    df: frame containing ``userId`` and ``movieId`` columns.
    train: optional frame whose columns define the encoding; when given,
      rows of ``df`` whose id does not occur in ``train`` are dropped.

  Returns:
    The encoded (and possibly filtered) copy; the input frame is not modified.
  """
  encoded = df.copy()
  for col_name in ("userId", "movieId"):
    reference = train[col_name] if train is not None else None
    codes = proc_col(encoded[col_name], reference)[1]
    encoded[col_name] = codes

    # ids unknown to the reference column were encoded as -1; discard those rows
    encoded = encoded[encoded[col_name] >= 0]
  return encoded

In [10]:
df_train = encode_data(train)
# The validation set must reuse the training encoding: passing `train` makes the
# ids in df_val refer to the same indices as in df_train and drops validation
# rows whose user or movie never occurs in the training split.
df_val = encode_data(val, train)

### Embedding Layer

In [12]:
# %pip installs into the environment of the running kernel; versions are pinned
# to the ones shown in this cell's recorded output.
%pip install torch==2.9.0 torchvision==0.24.0 torchaudio==2.9.0

Collecting torch
  Downloading torch-2.9.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting torchvision
  Downloading torchvision-0.24.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (5.9 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.0-cp313-cp313-macosx_12_0_arm64.whl.metadata (6.9 kB)
Collecting filelock (from torch)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting fsspec>=0.8.5 (from torch)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Collecting mpmath<1.4,>=1.1.0 (from sympy>=1.13.3->torch)
  Downloading mpmath-1.3.0-py3-none-any.whl.metadata (8.6 kB)
Downloading torch-2.9.0-cp313-none-macosx_11_0_arm64.whl (74.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.5/74.5 MB[0m [31m40.5 MB/s[0m eta [36

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# example embedding layer: a lookup matrix with 10 rows and 3 columns;
# it starts out filled with random numbers that the model can learn during training
embed = nn.Embedding(num_embeddings=10, embedding_dim=3)

### Matrix Factorization Model

In [14]:
class MF(nn.Module):
  """Matrix-factorization model: the prediction is the dot product of a user and an item embedding.

  Args:
    num_users: number of rows in the user embedding table.
    num_items: number of rows in the item embedding table.
    emb_size: length of every embedding vector.
  """

  def __init__(self, num_users, num_items, emb_size=100):
    super().__init__()
    # lookup tables: one vector per user and one vector per item
    self.user_emb = nn.Embedding(num_users, emb_size)
    self.item_emb = nn.Embedding(num_items, emb_size)

    # overwrite the default initialisation with values drawn uniformly from [0, 0.05)
    for table in (self.user_emb, self.item_emb):
      table.weight.data.uniform_(0, 0.05)

  def forward(self, u, v):
    """Return the dot product of the embeddings selected by ``u`` and ``v``.

    ``u`` indexes the user table and ``v`` the item table; the element-wise
    product of the looked-up rows is summed over dimension 1.
    """
    user_vecs = self.user_emb(u)
    item_vecs = self.item_emb(v)
    return (user_vecs * item_vecs).sum(1)

### Training the MF model

In [18]:

# embedding-table sizes: number of distinct encoded user / movie ids in the training frame
num_users, num_items = (df_train[name].nunique() for name in ('userId', 'movieId'))

In [None]:
# instantiate the matrix-factorization model with 100-dimensional embeddings
model = MF(num_users=num_users, num_items=num_items, emb_size=100)

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0, unsqueeze=False):
  optimizer = torch.optim.Adam()