<a href="https://colab.research.google.com/github/jdasam/mas1004-2022/blob/main/notebooks/Data_AI_11th_week_Movie.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Filtering with the MovieLens Dataset

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

In [2]:
# Print tensors in plain decimal notation instead of scientific notation.
torch.set_printoptions(sci_mode=False)

In [3]:
!wget https://files.grouplens.org/datasets/movielens/ml-latest-small.zip

--2022-11-10 06:35:08--  https://files.grouplens.org/datasets/movielens/ml-latest-small.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 978202 (955K) [application/zip]
Saving to: ‘ml-latest-small.zip’


2022-11-10 06:35:09 (2.68 MB/s) - ‘ml-latest-small.zip’ saved [978202/978202]



In [4]:
!unzip ml-latest-small.zip

Archive:  ml-latest-small.zip
   creating: ml-latest-small/
  inflating: ml-latest-small/links.csv  
  inflating: ml-latest-small/tags.csv  
  inflating: ml-latest-small/ratings.csv  
  inflating: ml-latest-small/README.txt  
  inflating: ml-latest-small/movies.csv  


In [5]:
# Load the two tables used in this notebook from the extracted archive:
# movie metadata and the individual user ratings.
movies = pd.read_csv('ml-latest-small/movies.csv')
ratings = pd.read_csv('ml-latest-small/ratings.csv')

In [6]:
# Display the movie table (columns: movieId, title, genres).
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
# Display the ratings table (columns: userId, movieId, rating, timestamp).
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [37]:
class RatingSet:
  """Map-style dataset over the rows of a MovieLens-format ratings CSV.

  Raw ``userId`` / ``movieId`` values are re-mapped to contiguous zero-based
  indices so that they can be used directly as ``nn.Embedding`` lookups.
  """

  def __init__(self, csv_path='ml-latest-small/ratings.csv'):
    """Load the ratings table and build the raw-id -> index lookup tables.

    Args:
      csv_path: Path to a CSV file containing at least the columns
        ``userId``, ``movieId`` and ``rating``.
    """
    self.ratings = pd.read_csv(csv_path)

    # Sorted unique raw ids that occur in *this* table. They are derived from
    # self.ratings (not from any global object) so the class works on a fresh
    # kernel and for any csv_path. Sorting makes the index assignment
    # deterministic for users as well as for movies.
    self.user_ids = sorted(set(self.ratings['userId']))
    self.movie_ids = sorted(set(self.ratings['movieId']))

    # how many unique users / movies exist in this dataset
    self.n_users = len(self.user_ids)
    self.n_movies = len(self.movie_ids)

    # Raw ids are not contiguous (a movieId can be far larger than the number
    # of movies), so map each raw id to its position in the sorted id list.
    self.movie2idx = {movie_id: idx for idx, movie_id in enumerate(self.movie_ids)}
    self.user2idx = {user_id: idx for idx, user_id in enumerate(self.user_ids)}

  def __len__(self):
    """Return the number of ratings (rows) in the dataset."""
    return len(self.ratings)

  def __getitem__(self, idx):
    """Return the ``idx``-th rating as ``(user_index, movie_index, rating)``.

    Args:
      idx: Positional row index into the ratings table.

    Returns:
      A tuple of the zero-based user index, the zero-based movie index and
      the rating value stored in that row.

    Raises:
      IndexError: If ``idx`` is outside the table (raised by ``DataFrame.iloc``).
      KeyError: If the row holds an id that is missing from the lookup tables.
    """
    row = self.ratings.iloc[idx]

    # A row of an all-numeric table comes back as float64; a float id such as
    # 1.0 still hashes and compares equal to the int key 1, so the lookup works.
    user_idx = self.user2idx[row.userId]
    movie_idx = self.movie2idx[row.movieId]

    return user_idx, movie_idx, row.rating


# Build the dataset from the default ratings CSV.
dataset = RatingSet()
# Inspect one sample: (user index, movie index, rating).
dataset[5000]

(31, 630, 4.0)

In [32]:
# Select a single row of the DataFrame by position with .iloc.
# The row comes back as a Series; in the output below every value,
# including the integer ids, is shown as float64.
dataset.ratings.iloc[0]

userId               1.0
movieId              1.0
rating               4.0
timestamp    964982703.0
Name: 0, dtype: float64

In [55]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of 128 samples.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=128, shuffle=True)

# Pull one batch to inspect it. Each dataset item is a
# (user index, movie index, rating) tuple, and the batch is a list of tensors.
# Because shuffle=True, the contents differ from run to run.
batch = next(iter(dataloader))
batch

[tensor([602, 281, 367, 304, 566,  61, 341, 131,  19, 216, 123, 273, 199, 519,
         259, 386, 386, 598, 589, 465, 605,  19, 212, 248, 565, 536, 466, 223,
         193,  50, 519,  27, 404, 381, 159, 219, 473, 139,  72, 566,  67, 317,
         289, 215, 330, 375,  83, 473, 561, 609,  37, 364, 563, 541, 599, 333,
         447,   3, 307, 139, 273, 434, 317, 584, 579, 609, 598, 591, 223,   6,
         431, 607, 447, 284, 551, 488, 245, 244, 598, 526, 219, 161, 598, 596,
         366, 447, 104, 473, 559, 134, 558, 273, 479, 211, 598,  67, 513, 329,
          67, 468, 106, 104, 297, 104, 607, 566, 343,  32, 413, 223, 159, 304,
         551,  96,  99, 379, 176, 479, 597, 417, 345, 409, 107, 248, 413, 194,
         607, 238]),
 tensor([1310, 2257, 2217, 5777, 9367, 6753, 2225,  827, 3520,  253, 4900, 6288,
         6753,  818, 4020, 4163, 5222, 1670, 6071, 6517, 2887, 1971,  349, 8038,
          197, 4153, 1502,  250,  986, 5885,  406, 4939, 1223, 4460, 1801, 1542,
         1059, 1258, 8668

In [53]:
class MatrixFactorizer(nn.Module):
  """Matrix-factorization rating model.

  Every user and every movie owns a learned ``n_factor``-dimensional vector;
  the predicted score for a (user, movie) pair is the dot product of the two.
  """

  def __init__(self, n_user, n_movie, n_factor):
    """Create the two embedding tables.

    Args:
      n_user: Number of distinct users (rows of the user table).
      n_movie: Number of distinct movies (rows of the movie table).
      n_factor: Dimensionality of each embedding vector.
    """
    super().__init__()

    self.user_embedding = nn.Embedding(n_user, n_factor)
    self.movie_embedding = nn.Embedding(n_movie, n_factor)

  def forward(self, x):
    """Predict a score for each (user, movie) pair in ``x``.

    Args:
      x: An indexable whose first two elements are the user-index tensor and
        the movie-index tensor (same shape, integer dtype), e.g. a batch
        ``[user_ids, movie_ids, ratings]`` from the DataLoader. Any further
        elements are ignored.

    Returns:
      A tensor with the same shape as the index tensors holding the dot
      product of each user vector with the matching movie vector.
    """
    user_idx, movie_idx = x[0], x[1]

    user_vec = self.user_embedding(user_idx)
    movie_vec = self.movie_embedding(movie_idx)

    # Element-wise product summed over the factor axis == row-wise dot product;
    # this works for a single pair (0-d indices) and for a whole batch alike.
    return (user_vec * movie_vec).sum(dim=-1)

# Instantiate the model with one 5-dimensional vector per user and per movie.
model = MatrixFactorizer(dataset.n_users, dataset.n_movies, n_factor=5)
# Show the movie embedding table (one row per movie, n_factor columns).
model.movie_embedding.weight

Parameter containing:
tensor([[-0.4785, -0.7349,  0.4144,  0.3459, -0.3760],
        [ 0.3609, -0.8168, -0.7085, -0.5950,  0.2063],
        [ 1.3869,  0.3119, -1.0054, -0.7740,  0.0990],
        ...,
        [ 0.5754,  0.4393,  1.0168, -0.6943,  0.0390],
        [ 0.4661, -0.0200, -0.3364, -0.4844,  0.2236],
        [-1.3954,  2.1226,  0.7522, -1.6019,  2.4323]], requires_grad=True)

In [69]:
# Predict the score of one (user, movie) pair by hand.
user_id, movie_id, rating = dataset[0]

# nn.Embedding expects tensor indices, not plain Python ints.
user_id, movie_id = torch.tensor(user_id), torch.tensor(movie_id)

# Fetch the embedding vector of this user and of this movie.
user_emb_vec = model.user_embedding(user_id)
movie_emb_vec = model.movie_embedding(movie_id)

# The prediction is the dot product of the two vectors, i.e. the same value
# as multiplying them element-wise and summing the result.
dot_result = torch.dot(user_emb_vec, movie_emb_vec)
dot_result

tensor(-0.0615, grad_fn=<DotBackward0>)

In [76]:
# Same prediction as for a single pair, but for a whole mini-batch at once.
user_id, movie_id, rating = batch

# Indexing an embedding with a batch of ids returns one vector per id.
user_emb_vec = model.user_embedding(user_id)
movie_emb_vec = model.movie_embedding(movie_id)

print(user_emb_vec.shape, movie_emb_vec.shape)
# Row-wise dot product: multiply element-wise, then sum over the last (factor) axis.
(user_emb_vec * movie_emb_vec).sum(dim=-1)

torch.Size([128, 5]) torch.Size([128, 5])


tensor([-2.2351,  0.1631,  1.2320,  0.3572, -1.4796,  0.3605, -0.1290,  3.5240,
        -0.4034,  1.4790, -2.1340,  0.9387, -0.5723,  1.6360, -2.9310,  0.1947,
         4.1074,  0.0477, -3.1284,  2.3812,  1.3904, -0.8724, -2.1411,  1.1913,
        -1.8387,  2.9212,  2.5635,  1.4362, -0.1799, -5.2855,  1.5802,  0.1372,
         1.6774, -1.4472,  0.1147, -1.2136, -0.7481,  1.6723, -3.0173,  2.4982,
         0.1609, -0.6296, -0.8426, -1.1950,  0.2637,  0.8032,  2.3075,  0.1663,
         1.8574,  0.4669, -0.2635, -0.2691, -3.6967, -0.9938, -3.4730,  5.0250,
         2.2036, -1.4197, -0.4889, -0.2947, -0.2924, -0.9876,  0.1431, -2.9164,
        -2.7954, -1.2981,  5.5226, -3.2246,  3.9338, -0.0168, -1.0363, -2.4994,
         1.1945, -0.4011,  3.1179,  4.9508, -0.6357,  1.2160,  1.3154,  1.1693,
         1.5246, -0.0071,  5.7332,  0.8066,  1.7512,  3.7719, -2.3335,  2.5403,
        -0.3350, -0.5588, -0.5933,  0.6803, -2.1063,  3.0579,  2.5349, -0.2510,
        -0.6980,  1.3990,  2.8466, -3.49

In [None]:

# Toy embedding table for illustration: 12 ids, each mapped to a 5-dimensional vector.
emb_test = nn.Embedding(num_embeddings=12, embedding_dim=5)
emb_test.weight


In [48]:
# Look up the vectors stored for ids 5, 8, 1 and 0; the result has one row per requested id.
test_ids = torch.tensor([5,8,1,0])
emb_test(test_ids)

tensor([[-0.5702,  0.7185, -0.0459, -0.2225, -0.5316],
        [-2.5704,  0.0466, -1.3711,  0.2310, -0.4315],
        [-0.2621, -1.4530, -0.4934, -0.9529, -0.1507],
        [ 0.1753,  1.1969,  0.4925,  0.5635, -0.1250]],
       grad_fn=<EmbeddingBackward0>)