# MIE524 - Lab 5

# Collaborative Filtering

In this notebook we will write a matrix factorization model in PyTorch to solve a recommendation problem.

The MovieLens dataset (ml-latest-small) describes 5-star rating and free-text tagging activity from MovieLens, a movie recommendation service. It contains 100004 ratings and 1296 tag applications across 9125 movies. https://grouplens.org/datasets/movielens/. To get the data:

`http://files.grouplens.org/datasets/movielens/ml-latest-small.zip`

## MovieLens dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
data = pd.read_csv("q1-ratings.csv")

In [None]:
data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Encoding data
We encode the data so that users and movies have contiguous integer ids (0, 1, 2, ...). You can think of this as a categorical encoding of our two categorical variables, userId and movieId.

In [None]:
# Split train and validation before encoding, so that the id encoding can be
# built from the training rows only.
np.random.seed(3)  # fixed seed so the random split is reproducible
msk = np.random.rand(len(data)) < 0.8  # boolean mask, True for roughly 80% of the rows
train = data[msk].copy()
val = data[~msk].copy()

In [None]:
# here is a handy function modified from fast.ai
def proc_col(col, train_col=None):
    """Map the values of a pandas column to consecutive integer ids.

    The id vocabulary is built from ``train_col`` when it is given, otherwise
    from ``col`` itself.

    Returns a tuple ``(name2idx, encoded, n_unique)``: the value -> id dict,
    a numpy array holding the id of every entry of ``col`` (-1 for values that
    are not in the vocabulary), and the vocabulary size.
    """
    vocab_source = col if train_col is None else train_col
    uniq = vocab_source.unique()
    name2idx = dict(zip(uniq, range(len(uniq))))
    encoded = np.array([name2idx.get(value, -1) for value in col])
    return name2idx, encoded, len(uniq)

In [None]:
def encode_data(df, train=None):
    """Encode rating data with contiguous user and movie ids.

    Args:
        df: frame with ``userId`` and ``movieId`` columns; it is not modified.
        train: optional frame with the same two columns. When provided, ``df``
            is encoded with the ids derived from ``train``, and rows whose
            user or movie does not appear in ``train`` are dropped.

    Returns:
        A new frame in which ``userId`` and ``movieId`` hold the encoded ids.
    """
    df = df.copy()
    for col_name in ["userId", "movieId"]:
        train_col = None if train is None else train[col_name]
        _, col, _ = proc_col(df[col_name], train_col)
        df[col_name] = col
        # Unknown ids were encoded as -1; drop those rows. The explicit copy
        # makes the filtered frame independent, so the column assignment in
        # the next iteration does not raise SettingWithCopyWarning.
        df = df[df[col_name] >= 0].copy()
    return df

In [None]:
# check encode implementation
df_t = pd.read_csv("tiny_training2.csv")
df_v = pd.read_csv("tiny_val2.csv")
print(df_t)
df_t_e = encode_data(df_t)  # ids derived from the tiny training set itself
df_v_e = encode_data(df_v, df_t)  # validation rows reuse the training encoding
# NOTE(review): a bare expression that is not the last statement of a cell displays nothing
df_v_e
print(df_t_e)

    userId  movieId  rating
0       11        1       4
1       11       23       5
2        2       23       5
3        2        4       3
4       31        1       4
5       31       23       4
6        4        1       5
7        4        3       2
8       52        1       1
9       52        3       4
10      61        3       5
11       7       23       1
12       7        3       3
    userId  movieId  rating
0        0        0       4
1        0        1       5
2        1        1       5
3        1        2       3
4        2        0       4
5        2        1       4
6        3        0       5
7        3        3       2
8        4        0       1
9        4        3       4
10       5        3       5
11       6        1       1
12       6        3       3


In [None]:
# encoding the train and validation data
df_train = encode_data(train)
df_val = encode_data(val, train)

In [None]:
df_train.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,1,4.0,964981247
2,0,2,4.0,964982224
3,0,3,5.0,964983815
6,0,4,5.0,964980868


## Embedding layer

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# an Embedding module containing 6 embeddings (one per user or item id), each of size 3
# the embedding weights are initialized at random
embed = nn.Embedding(6, 3)

In [None]:
# given a tensor of ids we can "look up" the embedding corresponding to each id
# (ids of shape (1, 6) give a result of shape (1, 6, 3))
a = torch.LongTensor([[1,2,0,4,5,1]])
embed(a)

tensor([[[-0.5296, -0.3023,  1.0801],
         [ 0.7281,  1.6024,  0.8521],
         [ 0.2905, -0.3309, -0.8858],
         [ 0.7525,  0.1204, -1.0548],
         [-0.8111,  0.3071,  1.9387],
         [-0.5296, -0.3023,  1.0801]]], grad_fn=<EmbeddingBackward0>)

## Matrix factorization model

In [None]:
class MF(nn.Module):
    """Matrix factorization model.

    The predicted rating for a (user, item) pair is the dot product of the
    user's embedding and the item's embedding.

    Args:
        num_users: number of rows in the user embedding table.
        num_items: number of rows in the item embedding table.
        emb_size: length of each embedding vector.
    """

    def __init__(self, num_users, num_items, emb_size=100):
        super().__init__()
        self.user_emb = nn.Embedding(num_users, emb_size)
        self.item_emb = nn.Embedding(num_items, emb_size)
        # Start from small non-negative weights drawn uniformly from [0, 0.05].
        nn.init.uniform_(self.user_emb.weight, 0, 0.05)
        nn.init.uniform_(self.item_emb.weight, 0, 0.05)

    def forward(self, u, v):
        """Return one predicted rating per (user id, item id) pair.

        ``u`` and ``v`` are integer id tensors of the same shape; the result
        has that same shape.
        """
        u = self.user_emb(u)
        v = self.item_emb(v)
        # Reduce over the embedding (last) dimension so that scalar, 1-D and
        # batched id tensors all yield one dot product per pair.
        return (u * v).sum(dim=-1)

## Debugging MF model

In [None]:
df_t_e

Unnamed: 0,userId,movieId,rating
0,0,0,4
1,0,1,5
2,1,1,5
3,1,2,3
4,2,0,4
5,2,1,4
6,3,0,5
7,3,3,2
8,4,0,1
9,4,3,4


In [None]:
# sizes matching the encoded tiny training set: user ids 0..6 and movie ids 0..3
num_users = 7
num_items = 4
emb_size = 3

# stand-alone embedding tables used to walk through the dot product step by step
user_emb = nn.Embedding(num_users, emb_size)
item_emb = nn.Embedding(num_items, emb_size)
# one user id and one movie id per rating row of the encoded tiny training set
users = torch.LongTensor(df_t_e.userId.values)
items = torch.LongTensor(df_t_e.movieId.values)

In [None]:
U = user_emb(users)
V = item_emb(items)

In [None]:
U

tensor([[-0.5643, -0.2570, -1.3175],
        [-0.5643, -0.2570, -1.3175],
        [-1.0108, -0.0760, -0.5476],
        [-1.0108, -0.0760, -0.5476],
        [-0.0930, -0.2160,  0.2233],
        [-0.0930, -0.2160,  0.2233],
        [ 1.9618, -1.2234,  1.3440],
        [ 1.9618, -1.2234,  1.3440],
        [-1.5219, -1.5414, -1.3096],
        [-1.5219, -1.5414, -1.3096],
        [-0.0155,  0.5152,  0.4456],
        [ 0.8519, -0.0183, -0.3179],
        [ 0.8519, -0.0183, -0.3179]], grad_fn=<EmbeddingBackward0>)

In [None]:
V

tensor([[-0.2225,  1.9827, -2.0772],
        [-0.5626,  0.5404, -0.4186],
        [-0.5626,  0.5404, -0.4186],
        [-0.7126,  0.0204, -0.0117],
        [-0.2225,  1.9827, -2.0772],
        [-0.5626,  0.5404, -0.4186],
        [-0.2225,  1.9827, -2.0772],
        [-1.1016,  0.4222,  0.1395],
        [-0.2225,  1.9827, -2.0772],
        [-1.1016,  0.4222,  0.1395],
        [-1.1016,  0.4222,  0.1395],
        [-0.5626,  0.5404, -0.4186],
        [-1.1016,  0.4222,  0.1395]], grad_fn=<EmbeddingBackward0>)

In [None]:
# element wise multiplication
U*V

tensor([[ 1.2558e-01, -5.0959e-01,  2.7368e+00],
        [ 3.1746e-01, -1.3890e-01,  5.5151e-01],
        [ 5.6864e-01, -4.1055e-02,  2.2921e-01],
        [ 7.2023e-01, -1.5535e-03,  6.4296e-03],
        [ 2.0701e-02, -4.2825e-01, -4.6383e-01],
        [ 5.2332e-02, -1.1673e-01, -9.3471e-02],
        [-4.3659e-01, -2.4257e+00, -2.7917e+00],
        [-2.1612e+00, -5.1650e-01,  1.8744e-01],
        [ 3.3868e-01, -3.0561e+00,  2.7203e+00],
        [ 1.6765e+00, -6.5072e-01, -1.8265e-01],
        [ 1.7099e-02,  2.1749e-01,  6.2153e-02],
        [-4.7928e-01, -9.8968e-03,  1.3306e-01],
        [-9.3850e-01, -7.7313e-03, -4.4333e-02]], grad_fn=<MulBackward0>)

In [None]:
# what we want is a dot product per row
(U*V).sum(1)

tensor([ 2.3527e+00,  7.3007e-01,  7.5679e-01,  7.2511e-01, -8.7138e-01,
        -1.5787e-01, -5.6540e+00, -2.4902e+00,  2.9202e-03,  8.4313e-01,
         2.9674e-01, -3.5612e-01, -9.9057e-01], grad_fn=<SumBackward1>)

## Training MF model

In [None]:
# embedding table sizes = number of distinct encoded ids in the training set
num_users = df_train["userId"].unique().size
num_items = df_train["movieId"].unique().size
print(num_users, num_items)

610 8998


In [None]:
model = MF(num_users, num_items, emb_size=100)

In [None]:
def train_epocs(model, epochs=10, lr=0.01, wd=0.0):
    """Train ``model`` with full-batch Adam on the notebook-level ``df_train``.

    Each epoch is a single gradient step on the MSE over all training
    ratings; the loss of every epoch is printed. After the last epoch the
    validation loss is printed via ``test_loss``.

    Args:
        model: module called as ``model(users, items)`` returning predictions.
        epochs: number of full-batch optimization steps.
        lr: Adam learning rate.
        wd: Adam weight decay.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    # The training tensors are the same for every epoch, so build them once
    # instead of converting the DataFrame columns on each iteration.
    users = torch.LongTensor(df_train.userId.values) # .cuda()
    items = torch.LongTensor(df_train.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_train.rating.values) #.cuda()
    model.train()
    for _ in range(epochs):
        y_hat = model(users, items)
        loss = F.mse_loss(y_hat, ratings)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(loss.item())
    test_loss(model)

In [None]:
def test_loss(model):
    model.eval()
    users = torch.LongTensor(df_val.userId.values) #.cuda()
    items = torch.LongTensor(df_val.movieId.values) #.cuda()
    ratings = torch.FloatTensor(df_val.rating.values) #.cuda()
    y_hat = model(users, items)
    loss = F.mse_loss(y_hat, ratings)
    print("test loss %.3f " % loss.item())

In [None]:
train_epocs(model, epochs=10, lr=0.1)

12.914663314819336
4.859004020690918
2.5788631439208984
3.108581781387329
0.8500668406486511
1.8190046548843384
2.6562750339508057
2.136134147644043
1.089386224746704
0.973445475101471
test loss 1.852 


In [None]:
train_epocs(model, epochs=15, lr=0.01)

1.640624761581421
1.0038915872573853
0.7122650742530823
0.661577582359314
0.7259314656257629
0.8034321069717407
0.8425079584121704
0.834148645401001
0.7920082211494446
0.7368590235710144
0.6875086426734924
0.655912458896637
0.6451523900032043
0.6501864790916443
0.6612692475318909
test loss 0.821 


Note that these models are sensitive to the weight initialization, the optimization algorithm, and the regularization.

## Check prediction results

In [None]:
u = torch.LongTensor(df_val.userId.values) #.cuda()
v = torch.LongTensor(df_val.movieId.values) #.cuda()
mm = model(u,v).detach().numpy()

In [None]:
mm

array([5.642462, 4.584304, 4.234826, ..., 4.549503, 4.338807, 4.371043],
      dtype=float32)

In [None]:
df_val.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
27066,181,1091,4.0,1146534346
84167,536,124,4.5,1424141439
88486,570,8352,2.0,966901381
8074,56,15,3.0,965798870
94215,598,2954,1.5,1519233780


In [None]:
u = torch.LongTensor(np.array([570])) #.cuda()
v = torch.LongTensor(np.array([8352])) #.cuda()
model(u,v).detach().numpy()

array([2.1619668], dtype=float32)

In [None]:
df_train.sample(5)

Unnamed: 0,userId,movieId,rating,timestamp
30250,210,2679,4.0,1350912713
76334,479,999,4.0,1179178545
89998,584,273,4.0,1307417190
16377,104,372,3.5,1446573297
86784,560,149,3.5,1491092031


In [None]:
u = torch.LongTensor(np.array([560])) #.cuda()
v = torch.LongTensor(np.array([149])) #.cuda()
model(u,v).detach().numpy()

array([4.936886], dtype=float32)

# Zero-Shot Classification
https://huggingface.co/tasks/zero-shot-classification

In [None]:
!pip install transformers



In [None]:
from transformers import pipeline

# Name the task explicitly instead of relying on it being inferred from the model id.
pipe = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent"],
)

config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5227572321891785,
  0.45814111828804016,
  0.014264623634517193,
  0.002685008803382516,
  0.002152056200429797]}

In [None]:
pipe("I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["phone", "tablet", "computer"],
)