In [1]:
import pandas as pd
from sklearn import model_selection, preprocessing
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import defaultdict
from sklearn.metrics import mean_squared_error

In [3]:
#%% data import
df = pd.read_csv("ratings.csv")
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
print(f"Unique Users: {df.userId.nunique()}, Unique Movies: {df.movieId.nunique()}")

Unique Users: 610, Unique Movies: 9724


In [5]:
#%% Data Class
class MovieDataset(Dataset):
    def __init__(self, users, movies, ratings):
        self.users = users
        self.movies = movies
        self.ratings = ratings
    # len(movie_dataset)
    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        users = self.users[idx]
        movies = self.movies[idx]
        ratings = self.ratings[idx]

        return torch.tensor(users, dtype=torch.long), torch.tensor(movies, dtype=torch.long),torch.tensor(ratings, dtype=torch.long),

In [6]:
#%% Model Class
class RecSysModel(nn.Module):
    def __init__(self, n_users, n_movies, n_embeddings = 32):
        super().__init__()
        self.user_embed = nn.Embedding(n_users, n_embeddings)
        self.movie_embed = nn.Embedding(n_movies, n_embeddings)
        self.out = nn.Linear(n_embeddings * 2, 1)

    def forward(self, users, movies):
        user_embeds = self.user_embed(users)
        movie_embeds = self.movie_embed(movies)
        x = torch.cat([user_embeds, movie_embeds], dim=1)
        x = self.out(x)
        return x

In [7]:
#%% encode user and movie id to start from 0
lbl_user = preprocessing.LabelEncoder()
lbl_movie = preprocessing.LabelEncoder()
df.userId = lbl_user.fit_transform(df.userId.values)
df.movieId = lbl_movie.fit_transform(df.movieId.values)

In [10]:
df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,0,0,4.0,964982703
1,0,2,4.0,964981247
2,0,5,4.0,964982224
3,0,43,5.0,964983815
4,0,46,5.0,964982931


In [11]:
df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,609,9416,4.0,1493848402
100832,609,9443,5.0,1493850091
100833,609,9444,5.0,1494273047
100834,609,9445,5.0,1493846352
100835,609,9485,3.0,1493846415


In [12]:
#%% create train test split
df_train, df_test = model_selection.train_test_split(
    df, test_size=0.2, random_state=42, stratify=df.rating.values)

In [13]:
#%% Dataset Instances
train_dataset = MovieDataset(
    users=df_train.userId.values,
    movies=df_train.movieId.values,
    ratings=df_train.rating.values
)

valid_dataset = MovieDataset(
    users=df_test.userId.values,
    movies=df_test.movieId.values,
    ratings=df_test.rating.values
)

In [14]:
#%% Data Loaders
BATCH_SIZE = 4
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
                          )

test_loader = DataLoader(dataset=valid_dataset,
                          batch_size=BATCH_SIZE,
                          shuffle=True
                          )

In [15]:
#%% Model Instance, Optimizer, and Loss Function
model = RecSysModel(
    n_users=len(lbl_user.classes_),
    n_movies=len(lbl_movie.classes_))

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.MSELoss()

In [16]:
#%% Model Training
NUM_EPOCHS = 1

model.train()
for epoch_i in range(NUM_EPOCHS):
    for users, movies, ratings in train_loader:
        optimizer.zero_grad()
        y_pred = model(users,
                       movies)
        y_true = ratings.unsqueeze(dim=1).to(torch.float32)
        loss = criterion(y_pred, y_true)
        loss.backward()
        optimizer.step()

In [17]:
#%% Model Evaluation
y_preds = []
y_trues = []

model.eval()
with torch.no_grad():
    for users, movies, ratings in test_loader:
        y_true = ratings.detach().numpy().tolist()
        y_pred = model(users, movies).squeeze().detach().numpy().tolist()
        y_trues.append(y_true)
        y_preds.append(y_pred)

mse = mean_squared_error(y_trues, y_preds)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 0.9255200539449724


In [None]:
#%% Users and Items
user_movie_test = defaultdict(list)

with torch.no_grad():
    for users, movies, ratings in test_loader:
        y_pred = model(users, movies)
        for i in range(len(users)):
            user_id = users[i].item()
            movie_id = movies[i].item()
            pred_rating = y_pred[i][0].item()
            true_rating = ratings[i].item()

            print(f"User: {user_id}, Movie: {movie_id}, Pred: {pred_rating}, True: {true_rating}")
            user_movie_test[user_id].append((pred_rating, true_rating))

## output:
# User: 379, Movie: 7627, Pred: 3.6014649868011475, True: 5
# User: 607, Movie: 5151, Pred: 2.883819580078125, True: 4
# User: 598, Movie: 1770, Pred: 2.177128314971924, True: 3
# User: 216, Movie: 2526, Pred: 2.3788743019104004, True: 2
# ...

In [None]:
#%% Precision and Recall
precisions = {}
recalls = {}

k = 10
thres = 3.5

for uid, user_ratings in user_movie_test.items():
    # Sort user ratings by rating
    user_ratings.sort(key=lambda x: x[0], reverse=True)

    # count of relevant items
    n_rel = sum((rating_true >= thres) for (_, rating_true) in user_ratings)

    # count recommended items that are predicted relevent and within topk
    n_rec_k = sum((rating_pred >= thres) for (rating_pred, _) in user_ratings[:k])

    # count recommended AND relevant item
    n_rel_and_rec_k = sum(
        ((rating_true >= thres) and (rating_pred >= thres))
        for (rating_pred, rating_true) in user_ratings[:k]
    )

    print(f"uid {uid},  n_rel {n_rel}, n_rec_k {n_rec_k}, n_rel_and_rec_k {n_rel_and_rec_k}")
    precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
    recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

print(f"Precision @ {k}: {sum(precisions.values()) / len(precisions)}")
print(f"Recall @ {k} : {sum(recalls.values()) / len(recalls)}")

## output:
# uid 562,  n_rel 9, n_rec_k 1, n_rel_and_rec_k 0
# uid 183,  n_rel 22, n_rec_k 6, n_rel_and_rec_k 5
# Precision @ 10: 0.6023451730418944
# Recall @ 10 : 0.4067860027043935
# ...