<a href="https://colab.research.google.com/github/jdasam/mas1004-2022/blob/main/notebooks/Data_AI_11th_week_Movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Filtering with MovieLens Dataset

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm


In [2]:
# Show tensor values in plain decimal notation rather than scientific notation
torch.set_printoptions(sci_mode=False)

In [3]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2022-11-15 14:32:06--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip.2’


2022-11-15 14:32:08 (819 KB/s) - ‘ml-latest-small.zip.2’ saved [978202/978202]



In [4]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
replace ml-latest-small/links.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [5]:
# Load the movie and rating tables from the extracted archive
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [6]:
# Preview the movie table (columns: movieId, title, genres)
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
# Preview the rating table (columns: userId, movieId, rating, timestamp)
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
class RatingSet:
  """Map-style dataset over a MovieLens-style ratings CSV.

  Raw ``userId`` / ``movieId`` values are remapped to contiguous 0-based
  indices so they can be used directly as rows of an embedding table.

  Attributes:
    ratings: the DataFrame read from ``csv_path``.
    user_ids, movie_ids: sorted lists of the unique raw ids in the table.
    n_users, n_movies: number of unique users / movies.
    user2idx, movie2idx: dicts mapping a raw id to its position in
      ``user_ids`` / ``movie_ids``.
  """

  def __init__(self, csv_path='ml-latest-small/ratings.csv'):
    """Read the ratings table and build the id -> index lookup tables.

    Args:
      csv_path: path of a CSV with at least ``userId``, ``movieId`` and
        ``rating`` columns.
    """
    self.ratings = pd.read_csv(csv_path)

    # list every id of the included users / movies.
    # Both lists are sorted so that the id -> index mapping is deterministic
    # (the iteration order of a plain set is not guaranteed).
    self.user_ids = sorted(set(self.ratings['userId']))
    self.movie_ids = sorted(set(self.ratings['movieId']))

    # how many unique users / movies exist in this dataset
    self.n_users = len(self.user_ids)
    self.n_movies = len(self.movie_ids)

    # we have to find at which index a given raw id sits in the sorted id list
    self.movie2idx = {raw_id: idx for idx, raw_id in enumerate(self.movie_ids)}
    self.user2idx = {raw_id: idx for idx, raw_id in enumerate(self.user_ids)}

  def __len__(self):
    """Return the number of ratings (rows) in the dataset."""
    return len(self.ratings)

  def __getitem__(self, idx):
    """Return ``(user_idx, movie_idx, rating)`` for the ``idx``-th rating row.

    The values are read column by column so that each id keeps its own dtype;
    pulling the whole row with ``iloc[idx]`` would upcast the integer ids to
    float64 before the dictionary lookup.
    """
    raw_user_id = self.ratings['userId'].iloc[idx]
    raw_movie_id = self.ratings['movieId'].iloc[idx]

    user_id = self.user2idx[raw_user_id]
    movie_id = self.movie2idx[raw_movie_id]
    rating = self.ratings['rating'].iloc[idx]

    return user_id, movie_id, rating


# Build the dataset from the default ratings CSV path
dataset = RatingSet()
# set(dataset.ratings['userId'])
# (not displayed: only the last expression of a cell is shown)
dataset.ratings
# dataset.movie_ids.index(1019)
# dataset.movie2idx[1019], dataset.movie_ids[777]
# Fetch one sample: (user index, movie index, rating)
dataset[5000]

(31, 630, 4.0)

In [10]:
# Indexing a pandas DataFrame by row position.
# The row comes back as a single Series, so every value is shown as float64.
dataset.ratings.iloc[0]

userId               1.0
movieId              1.0
rating               4.0
timestamp    964982703.0
Name: 0, dtype: float64

In [17]:
# Wrap the dataset so it yields shuffled mini-batches of 128 samples
dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# Pull a single batch to inspect it; it is unpacked below as
# (user indices, movie indices, ratings)
batch = next(iter(dataloader))
batch

[tensor([181, 176, 181, 297, 273, 116, 424, 324, 538, 287, 131, 181, 427, 605,
          18,  19, 598, 579, 541, 559, 596, 231, 211, 179, 576,  45, 306, 418,
         413, 102, 447, 287, 386, 112,  63, 551, 413, 454, 273, 453, 598, 437,
         102, 273, 253, 545, 589, 431, 599, 379, 464, 494, 501, 589,  76, 176,
         447, 248, 514, 459, 479, 306, 199, 220, 553, 201, 255, 379, 128, 291,
         468,  63,  56, 159, 473,   3, 602, 474, 291,  20, 507, 218, 474, 485,
         479, 598,  67, 599,  42, 181, 317, 199,  90, 176, 216, 181, 559, 491,
         559, 224, 259, 186, 482, 513,   9, 259, 414, 468, 149, 201, 104, 595,
         481, 131, 281, 324, 216, 473, 547, 324, 447, 560, 599, 468, 176, 473,
         131, 423]),
 tensor([5761, 7307, 2390, 5250, 1935,  216, 1208,  978,  963, 4332,  649,   92,
          607, 3859,  387,  835,  636, 4402, 3635, 2466,  658, 6905, 9344,  959,
         1494,  126, 5801,  659, 3929, 7670,  398, 2627, 1266, 1062, 2026, 4040,
         2556,  376, 4600

In [14]:
class MatrixFactorizer(nn.Module):
  """Matrix-factorization rating model.

  Every user and every movie gets a learnable ``n_factor``-dimensional
  embedding; the predicted rating of a (user, movie) pair is the dot product
  of the two embeddings.
  """

  def __init__(self, n_user, n_movie, n_factor):
    """
    Args:
      n_user: number of rows in the user embedding table.
      n_movie: number of rows in the movie embedding table.
      n_factor: embedding dimension shared by users and movies.
    """
    super().__init__()

    self.user_embedding = nn.Embedding(n_user, n_factor)
    self.movie_embedding = nn.Embedding(n_movie, n_factor)

  def forward(self, x):
    """Predict a rating for each (user, movie) pair.

    Args:
      x: a sequence whose first item holds the user indices and whose second
        item holds the movie indices, both as integer tensors of the same
        shape. Extra items (e.g. the ratings of a DataLoader batch) are
        ignored, so a whole batch can be passed as-is.

    Returns:
      Tensor with the same shape as the index tensors, holding the dot product
      of each user embedding with the matching movie embedding.
    """
    user_idx, movie_idx = x[0], x[1]

    user_emb_vec = self.user_embedding(user_idx)
    movie_emb_vec = self.movie_embedding(movie_idx)

    # element-wise product summed over the factor axis == per-pair dot product
    return (user_emb_vec * movie_emb_vec).sum(dim=-1)

# Instantiate the model with 5 latent factors per user / movie
model = MatrixFactorizer(dataset.n_users, dataset.n_movies, n_factor=5)
# Inspect the movie embedding table
model.movie_embedding.weight

Parameter containing:
tensor([[-2.7139, -0.9692, -0.6128, -0.6940,  2.3084],
        [ 0.8686,  1.4290, -0.1051,  0.1433, -0.0877],
        [ 0.5351,  1.4501,  0.5428, -0.1345, -0.3333],
        ...,
        [ 1.5514,  0.9710,  1.0824, -0.5386,  0.5400],
        [ 0.0377, -2.7129, -1.3009, -0.5098, -1.2406],
        [ 0.3800,  1.0245, -0.2848, -0.8668, -0.8347]], requires_grad=True)

In [15]:
# Making a prediction for one (user, movie) pair: take the first sample
user_id, movie_id, rating = dataset[0]

# nn.Embedding is called with tensors, so wrap the integer indices
user_id = torch.tensor(user_id)
movie_id = torch.tensor(movie_id)

# look up the embedding vector of the corresponding user and movie
user_emb_vec = model.user_embedding(user_id)
movie_emb_vec = model.movie_embedding(movie_id)

# get the dot product result
# user_emb_vec, movie_emb_vec, (user_emb_vec * movie_emb_vec)
# dot_result = (user_emb_vec * movie_emb_vec).sum()

# for two 1-D vectors, matmul is the dot product
dot_result = torch.matmul(user_emb_vec, movie_emb_vec)
dot_result

tensor(-0.4179, grad_fn=<DotBackward0>)

In [18]:
# Same prediction, but for a whole batch at once
user_id, movie_id, rating = batch

# one embedding vector per sample in the batch
user_emb_vec = model.user_embedding(user_id)
movie_emb_vec = model.movie_embedding(movie_id)

print(user_emb_vec.shape, movie_emb_vec.shape)
# element-wise product summed over the last (factor) axis: one dot product per sample
(user_emb_vec * movie_emb_vec).sum(dim=-1)

torch.Size([128, 5]) torch.Size([128, 5])


tensor([-0.4425,  2.5511, -1.5023,  0.3361,  1.4652, -1.1858, -1.0929,  0.7490,
        -1.3958,  3.3112,  1.5317,  0.0539,  0.3393, -1.5901,  1.7209, -1.1860,
        -2.2672, -1.8001,  0.0321, -2.8619,  3.0432,  4.2050, -3.3824, -0.2631,
         1.9497,  0.2496,  1.7556,  0.6536,  2.6054, -0.3940,  0.7813, -0.7134,
         1.8206,  1.9378,  5.0987,  0.4230, -0.5187, -5.3864, -1.5774,  0.0146,
        -2.6817,  0.5819, -1.6246,  2.5275,  3.4925, -2.9228, -2.7488, -8.4362,
         0.4238, -3.0844,  0.6985, -2.4748, -1.2272, -0.8409,  0.4309,  0.4241,
         1.7677, -0.1824,  0.3536,  0.0702,  0.6044, -0.0136, -0.6089,  1.4871,
         0.0768, -0.3250, -4.3249,  1.1507,  2.6857, -1.0612, -0.9979, -0.5572,
        -2.0680,  1.3364,  2.7656, -0.1069,  0.0945, -3.2904, -0.0527, -1.1879,
        -0.9639, -1.4756,  3.7180,  0.2125, -2.3871, -1.6099, -5.4729,  0.4599,
        -0.0644,  0.1535,  1.4403,  5.4030, -0.4485, -0.1153, -1.2884,  0.6868,
         0.5835,  4.6144,  3.7662, -4.12

In [None]:

# Small stand-alone embedding table (12 entries, 5 dimensions each) to experiment with
emb_test = nn.Embedding(num_embeddings=12, embedding_dim=5)
emb_test.weight


In [48]:
# Look up entries 5, 8, 1 and 0; the result has one row per requested index
test_ids = torch.tensor([5,8,1,0])
emb_test(test_ids)

tensor([[-0.5702,  0.7185, -0.0459, -0.2225, -0.5316],
        [-2.5704,  0.0466, -1.3711,  0.2310, -0.4315],
        [-0.2621, -1.4530, -0.4934, -0.9529, -0.1507],
        [ 0.1753,  1.1969,  0.4925,  0.5635, -0.1250]],
       grad_fn=<EmbeddingBackward0>)

# Training a model
- Train the model with the `Trainer` class
  - This is taken from Assignment 2
- Now the task is regression
  - We are estimating the rating as a continuous value
- We have to split the dataset into train / valid / test

In [22]:
# 80% / 10% / 10% split; the test set takes the remainder so that the three
# sizes always add up to len(dataset) despite the int() truncation
num_train = int(len(dataset)*0.8)
num_valid = int(len(dataset)*0.1)
num_test = len(dataset) - num_train - num_valid

# A fixed-seed generator makes the split identical on every run; without it
# the train / valid / test membership changes each time the cell is executed.
train_set, valid_set, test_set = torch.utils.data.random_split(
    dataset,
    [num_train, num_valid, num_test],
    generator=torch.Generator().manual_seed(0),
)
# Only the training loader is shuffled
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_set, batch_size=128, shuffle=False)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=128, shuffle=False)

In [None]:
class Trainer:
  """Training / validation loop for a classification model.

  Every batch from the loaders is unpacked as ``(inputs, labels)`` and scored
  with ``nn.NLLLoss`` on log-probabilities, so ``labels`` must hold integer
  class indices and the model must return one logit per class.

  NOTE(review): the rating dataset in this notebook yields three items per
  sample (user index, movie index, rating) with a continuous target, so the
  batch unpacking, the loss and the accuracy bookkeeping need to be adapted
  before this class can train the rating model.

  Attributes:
    best_loss: lowest validation loss seen so far.
    best_acc: highest validation accuracy seen so far.
    train_losses, train_accs: one entry per training batch.
    valid_losses, valid_accs: one entry per epoch.
  """

  def __init__(self, model, train_loader, valid_loader, model_name='resnet'):
    """
    Args:
      model: the ``nn.Module`` to train; it is moved to the GPU if one is available.
      train_loader: loader yielding the training batches.
      valid_loader: loader yielding the validation batches.
      model_name: prefix of the checkpoint file ``{model_name}_best.pt``.
    """
    self.model = model
    self.train_loader = train_loader
    self.valid_loader = valid_loader
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model.to(self.device)
    self.criterion = nn.NLLLoss()
    self.optimizer = torch.optim.Adam(self.model.parameters(), lr=1e-3)
    self.best_loss = torch.inf
    self.best_acc = 0.0
    self.train_losses = []
    self.valid_losses = []
    self.train_accs = []
    self.valid_accs = []
    self.model_name = model_name

  def validation(self):
    """Evaluate the model on the whole validation set.

    Returns:
      ``(mean_loss, mean_acc)`` averaged over every sample of the validation set.
    """
    self.model.eval() # change the model from train mode to evaluation mode
    # Some models work in different ways based on whether they are in the
    # training step or in the inference step

    current_loss = 0
    num_total_correct_pred = 0
    with torch.inference_mode(): # every torch computation under this indent
    # will be run without calculating the gradient or computation history
      for batch in self.valid_loader:
        images, labels = batch
        images, labels = images.to(self.device), labels.to(self.device)
        outputs = self.model(images)
        # log_softmax is computed in a numerically stable way;
        # log(softmax(x)) becomes -inf once a probability underflows to 0
        log_probs = torch.log_softmax(outputs, dim=-1)

        loss = self.criterion(log_probs, labels)
        predicted_classes = torch.argmax(outputs, dim=-1)
        num_acc_pred = (predicted_classes == labels).sum()
        # num_acc_pred is on self.device
        num_total_correct_pred += num_acc_pred.item()
        # in validation stage, we don't care about a single batch's loss
        # we want to see the result for all samples of the validation set

        current_loss += loss.item() * len(labels)
        # instead of adding the mean loss, we add the sum of the loss
        # because the batch size can be different
    mean_loss = current_loss / len(self.valid_loader.dataset)
    mean_acc = num_total_correct_pred / len(self.valid_loader.dataset) # number of total samples in the validation loader
    return mean_loss, mean_acc

  def train_by_number_of_epochs(self, num_epochs):
    """Train for ``num_epochs`` epochs, validating after each one.

    The model weights are saved to ``{model_name}_best.pt`` whenever the
    validation accuracy improves, and the accuracy curves are plotted at the end.
    """
    for epoch in tqdm(range(num_epochs)):
      self.model.train()
      for batch in tqdm(self.train_loader, leave=False):
        images, labels = batch
        images, labels = images.to(self.device), labels.to(self.device)
        self.optimizer.zero_grad()
        outputs = self.model(images) # this is logits
        # torch.nn.NLLLoss expects log-probabilities; see validation() for
        # why log_softmax is used instead of log(softmax(x))
        log_probs = torch.log_softmax(outputs, dim=-1)
        loss = self.criterion(log_probs, labels)

        acc = (torch.argmax(outputs, dim=-1) == labels).sum() / len(labels)
        loss.backward()
        self.optimizer.step()

        self.train_losses.append(loss.item())
        self.train_accs.append(acc.item())
        # don't try self.train_losses.append(loss)
      # training step has ended
      # we want to test our model on the validation set
      valid_loss, valid_acc = self.validation()

      # keep track of the lowest validation loss seen so far
      self.best_loss = min(self.best_loss, valid_loss)

      # is this model the best?
      # let's decide it based on valid_acc
      if valid_acc > self.best_acc:
        self.best_acc = valid_acc

        # If it is the best model, save the model's weights
        models_parameters = self.model.state_dict()
        print(f"Saving best model at epoch {len(self.valid_accs)}, acc: {valid_acc}")
        torch.save(models_parameters, f'{self.model_name}_best.pt')

      self.valid_losses.append(valid_loss)
      self.valid_accs.append(valid_acc)

    # Plot Accuracy curve
    # train_accs has one point per batch, valid_accs one per epoch, so the
    # validation points are placed at the last batch index of each epoch
    plt.plot(self.train_accs)
    plt.plot(range(len(self.train_loader)-1, len(self.train_accs), len(self.train_loader)) ,self.valid_accs)
    plt.title("Accuracy")



## Inference (test)
- Estimating a score for given User ID and Movie ID

## Cosine Similarity
- We can calculate similarity between two embeddings by using Cosine Similarity
  - $\text{cosine similarity} = \frac{\textbf{A}\cdot \textbf{B}}{\|\textbf{A}\|\|\textbf{B}\|} = \frac{\sum_{i=1}^{n}A_iB_i}{\sqrt{\sum_{i=1}^nA_i^2}\sqrt{\sum_{i=1}^nB_i^2}}$ 

# Visualization of Embedding

In [None]:
!pip install -q umap-learn
!pip install -Uq plotly

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import umap
import plotly
import plotly.graph_objs as go
plotly.offline.init_notebook_mode()

def get_umap_embedding(embedding, n_reduced_dimension=2, n_neighbors=15):
  """Project ``embedding`` down to ``n_reduced_dimension`` dimensions with UMAP.

  Args:
    embedding: the data handed to ``UMAP.fit_transform``.
    n_reduced_dimension: passed to UMAP as ``n_components``.
    n_neighbors: passed to UMAP as ``n_neighbors``.

  Returns:
    The result of ``fit_transform`` on ``embedding``.
  """
  return umap.UMAP(
      n_components=n_reduced_dimension,
      n_neighbors=n_neighbors,
  ).fit_transform(embedding)

def make_scatter3d(emb, label):
  """Build a plotly ``Scatter3d`` trace from the first three columns of ``emb``.

  Args:
    emb: 2-D indexable array; columns 0, 1 and 2 become x, y and z.
    label: passed to the trace as its ``text``.

  Returns:
    The ``go.Scatter3d`` trace, drawn as small semi-transparent markers.
  """
  marker_style = dict(size=2, opacity=0.5)
  x_vals, y_vals, z_vals = emb[:, 0], emb[:, 1], emb[:, 2]
  return go.Scatter3d(
    x=x_vals,
    y=y_vals,
    z=z_vals,
    mode='markers',
    marker=marker_style,
    text=label,
  )

def make_3d_plot_with_pyplot(embs, labels, highlighted_titles):
  """Show an interactive 3-D scatter plot of ``embs`` with some points annotated.

  Args:
    embs: 2-D indexable array; columns 0, 1 and 2 are used as x, y and z.
    labels: list with one label per row of ``embs``, used as the point text.
    highlighted_titles: labels whose points get a text annotation in the
      scene. Titles that do not occur in ``labels`` are skipped.
  """
  # Only annotate titles that actually exist in `labels`. Falling back to
  # index 0 for a missing title would annotate an unrelated point.
  highlighted_indices = [labels.index(title) for title in highlighted_titles if title in labels]
  layout = go.Layout(
      margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
      scene=dict(
          annotations = [dict(x=embs[i,0], y=embs[i,1], z=embs[i,2],text=labels[i]) for i in highlighted_indices ]
      )
  )
  data = make_scatter3d(embs,labels)
  plot_figure = go.Figure(data=data, layout=layout)
  plot_figure.update_traces(textposition='top center')
  plot_figure.show(renderer="colab")
