In [1]:
import pandas as pd
from sklearn import model_selection, preprocessing
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.metrics import mean_squared_error

In [3]:
#%% data import
df = pd.read_csv("ratings.csv")
df.head(2)

print(f"Unique Users: {df.userId.nunique()}, Unique Movies: {df.movieId.nunique()}")

Unique Users: 610, Unique Movies: 9724


In [4]:
#%% Data Class
class MovieDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings
    # len(movie_dataset)
    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        users = self.users[idx]
        movies = self.movies[idx]
        ratings = self.ratings[idx]

        return torch.tensor(users, dtype=torch.long), torch.tensor(movies, dtype=torch.long),torch.tensor(ratings, dtype=torch.long),

In [6]:
#%% Model Class
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_movies, n_embeddings = 32):
        super().__init__()
        self.user_embed = nn.Embedding(n_users, n_embeddings)
        self.movie_embed = nn.Embedding(n_movies, n_embeddings)
        self.out = nn.Linear(n_embeddings * 2, 1)

    def forward(self, users, movies):
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        x = torch.cat([user_embeds, movie_embeds], dim=1)
        x = self.out(x)
        return x

In [7]:
#%% encode user and movie id to start from 0
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df.userId = lbl_user.fit_transform(df.userId.values)
df.movieId = lbl_movie.fit_transform(df.movieId.values)

In [8]:
#%% create train test split
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.2, random_state=42, stratify=df.rating.values)

In [9]:
#%% Dataset Instances
train_dataset = MovieDataset(
    users=df_train.userId.values,
    movies=df_train.movieId.values,
    ratings=df_train.rating.values
)

valid_dataset = MovieDataset(
    users=df_test.userId.values,
    movies=df_test.movieId.values,
    ratings=df_test.rating.values
)

In [10]:
#%% Data Loaders
BATCH_SIZE = 4
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
                          )

test_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
                          )

In [11]:
#%% Model Instance, Optimizer, and Loss Function
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_))

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()

In [12]:
#%% Model Training
NUM_EPOCHS = 1

model.train()
for epoch_i in range(NUM_EPOCHS):
    for users, movies, ratings in train_loader:
        optimizer.zero_grad()
        y_pred = model(users,
                       movies)
        y_true = ratings.unsqueeze(dim=1).to(torch.float32)
        loss = criterion(y_pred, y_true)
        loss.backward()
        optimizer.step()

In [13]:
#%% Model Evaluation
y_preds = []
y_trues = []

model.eval()
with torch.no_grad():
    for users, movies, ratings in test_loader:
        y_true = ratings.detach().numpy().tolist()
        y_pred = model(users, movies).squeeze().detach().numpy().tolist()
        y_trues.append(y_true)
        y_preds.append(y_pred)

mse = mean_squared_error(y_trues, y_preds)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.9164341531951026


In [14]:
#%% Users and Items
user_movie_test = defaultdict(list)

with torch.no_grad():
    for users, movies, ratings in test_loader:
        y_pred = model(users, movies)
        for i in range(len(users)):
            user_id = users[i].item()
            movie_id = movies[i].item()
            pred_rating = y_pred[i][0].item()
            true_rating = ratings[i].item()

            print(f"User: {user_id}, Movie: {movie_id}, Pred: {pred_rating}, True: {true_rating}")
            user_movie_test[user_id].append((pred_rating, true_rating))

[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m
User: 96, Movie: 2982, Pred: 3.596296787261963, True: 3
User: 87, Movie: 2596, Pred: 3.65139102935791, True: 4
User: 340, Movie: 8413, Pred: 3.6239407062530518, True: 4
User: 188, Movie: 2224, Pred: 4.426867961883545, True: 4
User: 316, Movie: 8519, Pred: 3.737367630004883, True: 4
User: 447, Movie: 7207, Pred: 2.6989054679870605, True: 3
User: 273, Movie: 1558, Pred: 2.9440627098083496, True: 3
User: 406, Movie: 6517, Pred: 4.155196189880371, True: 3
User: 67, Movie: 6525, Pred: 2.9194529056549072, True: 3
User: 486, Movie: 2224, Pred: 3.591620445251465, True: 4
User: 609, Movie: 84, Pred: 3.1235203742980957, True: 3
User: 169, Movie: 508, Pred: 3.857842445373535, True: 4
User: 447, Movie: 2348, Pred: 2.9281086921691895, True: 3
User: 559, Movie: 6178, Pred: 2.6491634845733643, True: 3
User: 18, Movie: 695, Pred: 2.7719032764434814, True: 5
User: 303, Movie: 910, Pred: 4.414768218994141, True: 5
User: 393, Movi

In [15]:
#%% Precision and Recall
precisions = {}
recalls = {}

k = 10
thres = 3.5

for uid, user_ratings in user_movie_test.items():
    # Sort user ratings by rating
    user_ratings.sort(key=lambda x: x[0], reverse=True)

    # count of relevant items
    n_rel = sum((rating_true >= thres) for (_, rating_true) in user_ratings)

    # count recommended items that are predicted relevent and within topk
    n_rec_k = sum((rating_pred >= thres) for (rating_pred, _) in user_ratings[:k])

    # count recommended AND relevant item
    n_rel_and_rec_k = sum(
        ((rating_true >= thres) and (rating_pred >= thres))
        for (rating_pred, rating_true) in user_ratings[:k]
    )

    print(f"uid {uid},  n_rel {n_rel}, n_rec_k {n_rec_k}, n_rel_and_rec_k {n_rel_and_rec_k}")
    precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

print(f"Precision @ {k}: {sum(precisions.values()) / len(precisions)}")
print(f"Recall @ {k} : {sum(recalls.values()) / len(recalls)}")

uid 317,  n_rel 69, n_rec_k 10, n_rel_and_rec_k 5
uid 312,  n_rel 35, n_rec_k 10, n_rel_and_rec_k 8
uid 273,  n_rel 58, n_rec_k 5, n_rel_and_rec_k 5
uid 379,  n_rel 125, n_rec_k 10, n_rel_and_rec_k 9
uid 413,  n_rel 267, n_rec_k 10, n_rel_and_rec_k 10
uid 358,  n_rel 5, n_rec_k 5, n_rel_and_rec_k 3
uid 159,  n_rel 33, n_rec_k 0, n_rel_and_rec_k 0
uid 386,  n_rel 61, n_rec_k 10, n_rel_and_rec_k 8
uid 44,  n_rel 46, n_rec_k 10, n_rel_and_rec_k 10
uid 102,  n_rel 38, n_rec_k 10, n_rel_and_rec_k 10
uid 158,  n_rel 12, n_rec_k 3, n_rel_and_rec_k 2
uid 458,  n_rel 4, n_rec_k 4, n_rel_and_rec_k 4
uid 65,  n_rel 46, n_rec_k 10, n_rel_and_rec_k 7
uid 609,  n_rel 121, n_rec_k 10, n_rel_and_rec_k 9
uid 408,  n_rel 8, n_rec_k 10, n_rel_and_rec_k 6
uid 598,  n_rel 41, n_rec_k 0, n_rel_and_rec_k 0
uid 449,  n_rel 6, n_rec_k 7, n_rel_and_rec_k 6
uid 131,  n_rel 11, n_rec_k 6, n_rel_and_rec_k 2
uid 506,  n_rel 0, n_rec_k 0, n_rel_and_rec_k 0
uid 447,  n_rel 95, n_rec_k 1, n_rel_and_rec_k 0
uid 293,  n