# Dataset preparation

In [1]:
import numpy as np
import pandas as pd
import math
from itertools import product
from tqdm import tqdm
import pickle
import csv
from matplotlib import pyplot as plt
import urllib
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [2]:
# Ignore all warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Load the cleaned user ratings. `rating_df` is kept as loaded; all further
# processing happens on `rating_df_copy`.
rating_df = pd.read_csv('../data/cleaned/cleaned_user_rating.csv')
rating_df_copy = rating_df.copy()
# Only the id, title and image-URL columns of the movie metadata are read.
movies_df = pd.read_csv('../data/cleaned/cleaned_movies_details.csv', usecols=['movie_id', 'title', 'img_url'])

# NOTE(review): the saved output of this cell shows columns named
# `userID`/`movieID`, while later cells index `user_id`/`movie_id` -- the stored
# output looks stale; confirm the CSV header on a fresh run.
rating_df_copy.head()

Unnamed: 0,userID,movieID,rating
0,ur127508339,tt7737800,8
1,ur6312156,tt7737800,8
2,ur150453978,tt7737800,1
3,ur18519281,tt7737800,3
4,ur0806494,tt7737800,7


In [4]:
# Encode the string ids as integer codes. `pd.factorize` returns the codes and
# the array of unique values; position i in `*_mapping` is the raw id of code i.
user_codes, user_mapping = pd.factorize(rating_df_copy['user_id'])
rating_df_copy['user_id_number'] = user_codes
movie_codes, movie_mapping = pd.factorize(rating_df_copy['movie_id'])
rating_df_copy['movie_id_number'] = movie_codes

# The encoding must be one-to-one: as many distinct codes as distinct raw ids.
assert rating_df_copy['user_id_number'].nunique() == rating_df_copy['user_id'].nunique()
assert rating_df_copy['movie_id_number'].nunique() == rating_df_copy['movie_id'].nunique()

# From here on only the numeric columns are needed.
rating_df_copy = rating_df_copy[['user_id_number', 'movie_id_number', 'rating']]
rating_df_copy.head()

Unnamed: 0,userID_number,movieID_number,rating
0,0,0,8
1,1,0,8
2,2,0,1
3,3,0,3
4,4,0,7


In [5]:
# The integer codes start at 0, so (largest code + 1) is used as the number of
# users / movies.
n_users = rating_df_copy['user_id_number'].max() + 1
n_movies = rating_df_copy['movie_id_number'].max() + 1
print("Number of users: ", n_users)
print("Number of movies: ", n_movies)

Number of users:  173693
Number of movies:  18662


In [6]:
# Lookup tables between raw string ids and their integer codes, in both
# directions. Position i of each mapping holds the raw id for code i.
number_to_user_id = dict(enumerate(user_mapping))
number_to_movie_id = dict(enumerate(movie_mapping))
user_id_to_number = {raw_id: code for code, raw_id in enumerate(user_mapping)}
movie_id_to_number = {raw_id: code for code, raw_id in enumerate(movie_mapping)}

In [7]:
# Round-trip sanity check of the lookup tables: raw id -> code -> raw id must
# give back the original id. First for one user id ...
user_id = 'ur6312156'
user_id_number = user_id_to_number[user_id]
print('User_id number: ', user_id_number)
assert(number_to_user_id[user_id_number] == user_id)

# ... then for one movie id.
movie_id = 'tt0062292'
movie_id_number = movie_id_to_number[movie_id]
print('Movie_id number: ', movie_id_number)
assert(number_to_movie_id[movie_id_number] == movie_id)

UserID number:  1
MovieID number:  18660


In [8]:
# Next integer id to hand out to a movie that is not yet in the lookup tables.
# Numbering continues right after the ids already used by the ratings data.
new_id_start = n_movies


def assign_movie_id_number(movie_id):
    """Return the integer id of ``movie_id``, allocating a new one if needed.

    Known ids are looked up in ``movie_id_to_number``. An unknown id receives
    the current value of the module-level counter ``new_id_start``, is
    registered in both ``movie_id_to_number`` and ``number_to_movie_id``, and
    the counter is advanced by one.
    """
    global new_id_start
    known_number = movie_id_to_number.get(movie_id)
    if known_number is not None:
        return known_number

    fresh_number = new_id_start
    movie_id_to_number[movie_id] = fresh_number
    number_to_movie_id[fresh_number] = movie_id
    new_id_start = fresh_number + 1
    return fresh_number


# Give every row of the movie metadata its integer id.
movies_df['movie_id_number'] = movies_df['movie_id'].apply(assign_movie_id_number)

In [9]:
# Plain NumPy view of the ratings: one row per rating, columns in the order
# (user_id_number, movie_id_number, rating) selected above.
rating_data = rating_df_copy.to_numpy()
rating_data[:5]

array([[0, 0, 8],
       [1, 0, 8],
       [2, 0, 1],
       [3, 0, 3],
       [4, 0, 7]])

In [10]:
def split_ratings_by_user(data, train_ratio=0.6, val_ratio=0.2, random_state=420):
    """
    Split each user's ratings into train, validation and test sets.

    Rows are grouped by their first column (the user). Each user's rows are
    shuffled and then cut into three consecutive slices:

    * train: the first ``ceil(n * train_ratio)`` rows,
    * validation: the next ``ceil(n * val_ratio)`` rows,
    * test: whatever remains.

    Because both sizes are rounded up, users with very few ratings can end up
    with an empty validation and/or test slice.

    Args:
        data: 2-D array-like whose rows are ratings and whose first column
            identifies the user.
        train_ratio: Fraction of each user's ratings assigned to train.
        val_ratio: Fraction of each user's ratings assigned to validation.
        random_state: Seed passed to ``np.random.seed``. Note that this seeds
            the *global* NumPy RNG, so any later ``np.random`` draw is affected.

    Returns:
        Tuple ``(train_data, val_data, test_data)`` of 2-D arrays with the same
        columns as ``data``. For empty input, three empty arrays are returned.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    data = np.asarray(data)
    np.random.seed(random_state)

    # np.vstack cannot stack an empty list, so handle "no ratings" explicitly.
    if len(data) == 0:
        empty = data[:0]
        return empty.copy(), empty.copy(), empty.copy()

    # Initialize empty lists for split data
    train_data_list, val_data_list, test_data_list = [], [], []

    # Group ratings by user; dict insertion order keeps users in order of
    # first appearance, which fixes the order of the shuffle calls below.
    user_groups = {}
    for row in data:
        user_groups.setdefault(row[0], []).append(row)

    for user_ratings in user_groups.values():
        # np.array copies the rows, so shuffling never touches `data` itself.
        user_ratings = np.array(user_ratings)
        total_ratings = len(user_ratings)

        # Shuffle user ratings to ensure randomness
        np.random.shuffle(user_ratings)

        # Calculate split indices (both sizes rounded up, test gets the rest)
        train_end = math.ceil(total_ratings * train_ratio)
        val_end = train_end + math.ceil(total_ratings * val_ratio)

        # Append splits
        train_data_list.append(user_ratings[:train_end])
        val_data_list.append(user_ratings[train_end:val_end])
        test_data_list.append(user_ratings[val_end:])

    # Concatenate all users' split data
    train_data = np.vstack(train_data_list)
    val_data = np.vstack(val_data_list)
    test_data = np.vstack(test_data_list)

    return train_data, val_data, test_data

In [11]:
# Per-user split of the ratings using the function's default ratios and seed.
train_data, val_data, test_data = split_ratings_by_user(rating_data)

# AutoRec

In [12]:
def create_interaction_matrix(data, n_users, n_movies):
    """Build a dense rating matrix of shape ``(n_movies, n_users)``.

    Args:
        data: Iterable of ``(user, movie, rating)`` triples. ``user`` and
            ``movie`` are converted with ``int()`` and used as column and row
            index respectively.
        n_users: Number of columns of the result.
        n_movies: Number of rows of the result.

    Returns:
        A float64 NumPy array, zero wherever no rating was given. If the same
        (user, movie) pair occurs more than once, the last rating wins.

    Note:
        The matrix is dense, so it needs ``n_movies * n_users * 8`` bytes.
    """
    ratings_by_movie = np.zeros(shape=(n_movies, n_users))
    for row in data:
        user_idx, movie_idx, score = row
        ratings_by_movie[int(movie_idx), int(user_idx)] = score
    return ratings_by_movie

In [13]:
# Create the (n_movies, n_users) interaction matrices for the training and test
# splits; entries without a rating stay 0.
train_matrix = create_interaction_matrix(train_data, n_users, n_movies)
test_matrix = create_interaction_matrix(test_data, n_users, n_movies)

In [14]:
# User-user AutoRec
class AutoRec(nn.Module):
    """Single-hidden-layer autoencoder over rating vectors of length ``n_users``.

    Architecture: ``Linear(n_users, n_embed) -> Sigmoid -> Dropout ->
    Linear(n_embed, n_users)``. Linear weights use Xavier-uniform
    initialisation and biases start at zero.

    Args:
        n_users: Size of the input and output vectors.
        n_embed: Size of the hidden layer.
        dropout: Dropout probability applied after the sigmoid.
    """

    def __init__(self, n_users, n_embed, dropout=0.5):
        super().__init__()
        self.n_users = n_users
        self.model = nn.Sequential(
            nn.Linear(n_users, n_embed, bias=True),
            nn.Sigmoid(),
            nn.Dropout(dropout),
            nn.Linear(n_embed, n_users, bias=True)
        )

        self.init_parameters()  # Initialize weights and biases

    def init_parameters(self):
        """Apply Xavier-uniform init to every linear layer and zero its bias."""
        for layer in self.model:
            if isinstance(layer, nn.Linear):
                nn.init.xavier_uniform_(layer.weight)  # Xavier Uniform for weights
                if layer.bias is not None:
                    nn.init.zeros_(layer.bias)  # Initialize biases to zero

    def save_model(self, path):
        """Write this instance's ``state_dict`` to ``path``."""
        # Use `self`, not a global named `model`: the method must work for any
        # instance, whatever variable it is bound to.
        torch.save(self.state_dict(), path)

    def load_model(self, path):
        """Load a ``state_dict`` from ``path`` into this instance.

        Tensors are mapped to the CPU on load, so a checkpoint saved on GPU can
        be restored on a machine without one.
        """
        self.load_state_dict(torch.load(path, map_location=torch.device('cpu')))

    def forward(self, input):
        """Return the reconstruction of ``input`` (last dimension ``n_users``)."""
        return self.model(input)

In [15]:
class AutoRecDataset(Dataset):
    """Dataset that serves one row of an interaction matrix per index.

    Args:
        interaction_matrix: 2-D array; its two dimensions are stored as
            ``n_movies`` (rows) and ``n_users`` (columns).
    """

    def __init__(self, interaction_matrix):
        self.data = interaction_matrix
        n_rows, n_cols = interaction_matrix.shape
        self.n_movies = n_rows
        self.n_users = n_cols

    def __len__(self):
        # One sample per matrix row.
        return self.n_movies

    def __getitem__(self, idx):
        # Copy row `idx` into a new float32 tensor.
        return torch.tensor(self.data[idx], dtype=torch.float32)

In [16]:
# Training the model
def train(model, trainloader, criterion, optimizer, epochs):
    """Train ``model`` to reconstruct the observed entries of each batch.

    Every batch is used both as input and as target. Entries equal to 0 are
    treated as "not rated": predictions are multiplied by ``sign(batch)`` so
    those positions contribute nothing to the loss, and the loss is divided by
    the number of observed entries in the batch.

    Args:
        model: Module mapping a ``(batch_size, n_users)`` tensor to a tensor of
            the same shape. It is moved to CUDA when available, else CPU.
        trainloader: Iterable of batches (tensors) with a ``len()``.
        criterion: Loss callable ``criterion(prediction, target)``; assumed to
            use sum reduction, since the result is divided by the observed
            count here -- confirm against the caller.
        optimizer: Optimizer over ``model``'s parameters.
        epochs: Number of passes over ``trainloader``.

    Returns:
        None. The mean batch loss is printed after every epoch.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.train()

    n_batches = len(trainloader)

    for epoch in range(epochs):
        epoch_loss = 0
        pbar = tqdm(trainloader, total=n_batches, desc=f"Epoch {epoch+1}/{epochs}")

        for movie_vector in pbar:
            movie_vector = movie_vector.to(device) # Shape: (batch_size, n_users)

            # 1 where a rating is present, 0 elsewhere (computed once per batch)
            mask = torch.sign(movie_vector)

            # Forward pass
            preds = model(movie_vector)

            # Masked loss, averaged over the observed ratings. The clamp keeps
            # a batch without any rating from dividing by zero, which would
            # produce a NaN loss and NaN gradients.
            n_observed = mask.sum().clamp(min=1)
            loss = criterion(preds * mask, movie_vector) / n_observed
            epoch_loss += loss.item()

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            pbar.set_postfix(loss=loss.item())
        # max(..., 1) avoids a ZeroDivisionError for an empty loader
        print(f"Epoch {epoch+1} Loss: {epoch_loss / max(n_batches, 1)}")

In [17]:
# Hyper-parameters
N_EPOCHS = 50        # passes over the training matrix
BATCH_SIZE = 64      # matrix rows per batch
N_EMBED = 100        # hidden-layer size of the autoencoder
LR = 0.001           # Adam learning rate
LAMBDA_REG = 0.0001  # passed to Adam as weight_decay

# Create the dataset and dataloader
trainset = AutoRecDataset(train_matrix)
trainloader = DataLoader(dataset=trainset, batch_size=BATCH_SIZE, shuffle=True)

# Model, loss, and optimizer
# NOTE(review): no torch seed is set in the cells shown, so weight init,
# dropout and batch order differ between runs.
model = AutoRec(n_users, N_EMBED)  # hidden size is N_EMBED
# Sum reduction: train() itself divides by the number of observed ratings.
criterion = nn.MSELoss(reduction='sum')
optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=LAMBDA_REG)

# Train the model
train(model, trainloader, criterion, optimizer, epochs=N_EPOCHS)

Epoch 1/50: 100%|██████████| 292/292 [00:22<00:00, 12.77it/s, loss=36.4]


Epoch 1 Loss: 37.77017334062759


Epoch 2/50: 100%|██████████| 292/292 [00:17<00:00, 17.16it/s, loss=27.1]


Epoch 2 Loss: 28.59861466002791


Epoch 3/50: 100%|██████████| 292/292 [00:17<00:00, 17.03it/s, loss=21.6]


Epoch 3 Loss: 22.161762779706144


Epoch 4/50: 100%|██████████| 292/292 [00:16<00:00, 17.22it/s, loss=18.8]


Epoch 4 Loss: 17.28366712021501


Epoch 5/50: 100%|██████████| 292/292 [00:17<00:00, 16.93it/s, loss=13.7]


Epoch 5 Loss: 13.664676966732495


Epoch 6/50: 100%|██████████| 292/292 [00:17<00:00, 16.97it/s, loss=11.7]


Epoch 6 Loss: 11.001155281720097


Epoch 7/50: 100%|██████████| 292/292 [00:17<00:00, 17.04it/s, loss=10.6]


Epoch 7 Loss: 9.080464612947752


Epoch 8/50: 100%|██████████| 292/292 [00:17<00:00, 16.91it/s, loss=8.27]


Epoch 8 Loss: 7.706886549518533


Epoch 9/50: 100%|██████████| 292/292 [00:17<00:00, 17.16it/s, loss=6.84]


Epoch 9 Loss: 6.7466292609907175


Epoch 10/50: 100%|██████████| 292/292 [00:17<00:00, 16.89it/s, loss=5.99]


Epoch 10 Loss: 6.057437238627917


Epoch 11/50: 100%|██████████| 292/292 [00:17<00:00, 16.99it/s, loss=6.51]


Epoch 11 Loss: 5.593229238301108


Epoch 12/50: 100%|██████████| 292/292 [00:17<00:00, 16.89it/s, loss=5.51]


Epoch 12 Loss: 5.298122748936692


Epoch 13/50: 100%|██████████| 292/292 [00:17<00:00, 17.07it/s, loss=5.48]


Epoch 13 Loss: 5.0838379549653565


Epoch 14/50: 100%|██████████| 292/292 [00:17<00:00, 16.91it/s, loss=5.34]


Epoch 14 Loss: 4.958191619343953


Epoch 15/50: 100%|██████████| 292/292 [00:17<00:00, 17.08it/s, loss=5.98]


Epoch 15 Loss: 4.878322500072113


Epoch 16/50: 100%|██████████| 292/292 [00:17<00:00, 16.90it/s, loss=5.56]


Epoch 16 Loss: 4.844496690247157


Epoch 17/50: 100%|██████████| 292/292 [00:17<00:00, 17.06it/s, loss=4.89]


Epoch 17 Loss: 4.815044950132501


Epoch 18/50: 100%|██████████| 292/292 [00:17<00:00, 17.03it/s, loss=5.62]


Epoch 18 Loss: 4.837248328613908


Epoch 19/50: 100%|██████████| 292/292 [00:17<00:00, 16.85it/s, loss=6.13]


Epoch 19 Loss: 4.8180368338545705


Epoch 20/50: 100%|██████████| 292/292 [00:17<00:00, 17.04it/s, loss=5.94]


Epoch 20 Loss: 4.81011698588933


Epoch 21/50: 100%|██████████| 292/292 [00:17<00:00, 16.92it/s, loss=4.95]


Epoch 21 Loss: 4.836051862533778


Epoch 22/50: 100%|██████████| 292/292 [00:17<00:00, 17.06it/s, loss=6.58]


Epoch 22 Loss: 4.833446444713906


Epoch 23/50: 100%|██████████| 292/292 [00:17<00:00, 16.89it/s, loss=6.12]


Epoch 23 Loss: 4.855752953927811


Epoch 24/50: 100%|██████████| 292/292 [00:17<00:00, 17.09it/s, loss=5.24]


Epoch 24 Loss: 4.844765690091538


Epoch 25/50: 100%|██████████| 292/292 [00:17<00:00, 16.94it/s, loss=5.63]


Epoch 25 Loss: 4.857727431271174


Epoch 26/50: 100%|██████████| 292/292 [00:17<00:00, 17.07it/s, loss=5.45]


Epoch 26 Loss: 4.85382077220368


Epoch 27/50: 100%|██████████| 292/292 [00:17<00:00, 16.78it/s, loss=5.71]


Epoch 27 Loss: 4.837745431351335


Epoch 28/50: 100%|██████████| 292/292 [00:17<00:00, 17.04it/s, loss=5.62]


Epoch 28 Loss: 4.864205460842341


Epoch 29/50: 100%|██████████| 292/292 [00:17<00:00, 16.79it/s, loss=5.89]


Epoch 29 Loss: 4.825296226429613


Epoch 30/50: 100%|██████████| 292/292 [00:17<00:00, 16.83it/s, loss=5.67]


Epoch 30 Loss: 4.847134463591118


Epoch 31/50: 100%|██████████| 292/292 [00:17<00:00, 17.00it/s, loss=5.38]


Epoch 31 Loss: 4.825492251409243


Epoch 32/50: 100%|██████████| 292/292 [00:17<00:00, 16.80it/s, loss=6.11]


Epoch 32 Loss: 4.836482584476471


Epoch 33/50: 100%|██████████| 292/292 [00:17<00:00, 17.05it/s, loss=5.01]


Epoch 33 Loss: 4.7996935909741545


Epoch 34/50: 100%|██████████| 292/292 [00:17<00:00, 16.80it/s, loss=5.42]


Epoch 34 Loss: 4.780444787789698


Epoch 35/50: 100%|██████████| 292/292 [00:17<00:00, 16.95it/s, loss=5.24]


Epoch 35 Loss: 4.758228048886338


Epoch 36/50: 100%|██████████| 292/292 [00:17<00:00, 16.87it/s, loss=5.93]


Epoch 36 Loss: 4.7724438277009416


Epoch 37/50: 100%|██████████| 292/292 [00:17<00:00, 17.00it/s, loss=5.06]


Epoch 37 Loss: 4.740875525017307


Epoch 38/50: 100%|██████████| 292/292 [00:17<00:00, 16.82it/s, loss=5.39]


Epoch 38 Loss: 4.7148479125271106


Epoch 39/50: 100%|██████████| 292/292 [00:17<00:00, 16.94it/s, loss=4.81]


Epoch 39 Loss: 4.710846317957525


Epoch 40/50: 100%|██████████| 292/292 [00:17<00:00, 16.87it/s, loss=4.4] 


Epoch 40 Loss: 4.679470863244305


Epoch 41/50: 100%|██████████| 292/292 [00:17<00:00, 16.86it/s, loss=6.16]


Epoch 41 Loss: 4.656031131744385


Epoch 42/50: 100%|██████████| 292/292 [00:17<00:00, 17.05it/s, loss=5.36]


Epoch 42 Loss: 4.657174922015569


Epoch 43/50: 100%|██████████| 292/292 [00:17<00:00, 16.80it/s, loss=5.12]


Epoch 43 Loss: 4.625222753988553


Epoch 44/50: 100%|██████████| 292/292 [00:17<00:00, 17.11it/s, loss=5.04]


Epoch 44 Loss: 4.611727398552307


Epoch 45/50: 100%|██████████| 292/292 [00:17<00:00, 16.90it/s, loss=5.71]


Epoch 45 Loss: 4.5865306233706535


Epoch 46/50: 100%|██████████| 292/292 [00:17<00:00, 17.04it/s, loss=5.52]


Epoch 46 Loss: 4.596240743382336


Epoch 47/50: 100%|██████████| 292/292 [00:17<00:00, 16.83it/s, loss=6.03]


Epoch 47 Loss: 4.537053478090731


Epoch 48/50: 100%|██████████| 292/292 [00:17<00:00, 17.03it/s, loss=5.37]


Epoch 48 Loss: 4.542691812123338


Epoch 49/50: 100%|██████████| 292/292 [00:17<00:00, 16.86it/s, loss=5.97]


Epoch 49 Loss: 4.509381546549601


Epoch 50/50: 100%|██████████| 292/292 [00:17<00:00, 16.88it/s, loss=5.14]

Epoch 50 Loss: 4.4869527139075815





In [18]:
def eval(model, valloader):
    """Compute the RMSE of ``model``'s reconstruction over observed entries.

    Each batch from ``valloader`` is fed to the model and the output is
    compared against that same batch. Only non-zero entries (selected with
    ``sign(batch)``) count towards the error.

    Args:
        model: Module mapping a ``(batch_size, n_users)`` tensor to a tensor of
            the same shape. It is moved to CUDA when available, else CPU, and
            put into evaluation mode.
        valloader: Iterable of batches (tensors) with a ``len()``.

    Returns:
        The RMSE as a float, or ``nan`` when the loader contains no observed
        rating at all. The value is also printed.

    Note:
        This function shadows the built-in ``eval``; the name is kept because
        other cells call it.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    model.eval()  # Set the model to evaluation mode

    sum_squared_error = 0.0
    n_observed = 0.0

    with torch.no_grad():  # Disable gradient computation during evaluation
        pbar = tqdm(valloader, total=len(valloader), desc="Evaluating")

        for movie_vector in pbar:
            movie_vector = movie_vector.to(device)  # Shape: (batch_size, n_users)

            # Forward pass
            r_hat = model(movie_vector)

            # Mask for observed ratings only
            mask = torch.sign(movie_vector)  # 1 for observed ratings, 0 otherwise
            squared_error = ((r_hat - movie_vector) * mask) ** 2

            # Accumulate the squared error and the number of observed ratings
            sum_squared_error += squared_error.sum().item()
            n_observed += mask.sum().item()

    # Final RMSE; guard against a loader without any observed rating, which
    # would otherwise raise ZeroDivisionError.
    if n_observed <= 0:
        rmse = float('nan')
    else:
        rmse = math.sqrt(sum_squared_error / n_observed)
    print(f"Validation RMSE: {rmse:.4f}")
    return rmse

In [19]:
# RMSE on the test matrix (printed under the label "Validation RMSE").
# NOTE(review): eval() feeds each row of the loader to the model and compares
# the output against that same row, so here the test ratings are also the
# model input -- confirm this is the intended protocol.
testloader = DataLoader(dataset=AutoRecDataset(test_matrix), batch_size=BATCH_SIZE, shuffle=False)
eval(model, testloader)

Evaluating: 100%|██████████| 292/292 [00:16<00:00, 17.33it/s]

Validation RMSE: 2.8014





2.801445722579956

In [20]:
# Same metric on the training matrix (also printed as "Validation RMSE").
eval(model, trainloader)

Evaluating: 100%|██████████| 292/292 [00:12<00:00, 24.00it/s]

Validation RMSE: 1.8244





1.8244248628616333

In [21]:
# Save the trained model's weights to a file in the current working directory
model.save_model('autorec_embed100.pth')

In [22]:
def evaluate_recommendation(model, train_data, test_data, n_movies, n_top=5,
                            n_negatives=99, min_rating=8):
    """Compute Hit Ratio and NDCG at ``n_top`` with sampled negatives.

    For every user in ``test_data`` that has at least one test rating
    ``>= min_rating``:

    1. ``n_negatives`` movies are sampled (without replacement, via the global
       NumPy RNG) from the movies that appear in neither the user's train nor
       test rows.
    2. Each highly rated test movie is ranked against those negatives, using
       the model output for this user when fed the movies' training rows.
    3. A hit is counted when the test movie is among the top ``n_top``; it
       contributes ``1 / log2(rank + 1)`` to the DCG. Score ties are resolved
       in favour of the test movie.

    Per-user values are averaged over the user's highly rated test movies and
    then over users.

    Args:
        model: Trained model exposing ``n_users``; called on a
            ``(n_candidates, n_users)`` float32 tensor.
        train_data: 2-D array with columns (user, movie, rating).
        test_data: 2-D array with columns (user, movie, rating).
        n_movies: Number of rows of the interaction matrix / candidate pool.
        n_top: Cut-off rank for a hit.
        n_negatives: Negatives sampled per user. Capped at the number of
            unrated movies available, instead of raising when there are fewer.
        min_rating: Smallest test rating that counts as "highly rated".

    Returns:
        Tuple ``(mean_hit_ratio, mean_ndcg)``; ``(nan, nan)`` when no user has
        a highly rated test movie.

    NOTE(review): movies the user rated in the validation split are not
    excluded from the negative pool -- confirm whether that is intended.
    """
    hit_ratios, ndcg_scores = [], []

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    train_matrix = create_interaction_matrix(train_data, model.n_users, n_movies)
    model = model.to(device)
    model.eval()

    all_movies = np.arange(n_movies)  # loop-invariant, built once
    unique_users = np.unique(test_data[:, 0])

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for u in tqdm(unique_users, desc="Evaluating Users"):
            # Get the user's test movies rated at least `min_rating`
            user_test_ratings = test_data[test_data[:, 0] == u]
            high_rated_test_movies = user_test_ratings[user_test_ratings[:, 2] >= min_rating, 1]
            if len(high_rated_test_movies) == 0:
                continue

            # Movies the user rated in neither train nor test
            train_movies = train_data[train_data[:, 0] == u, 1]
            seen_movies = np.concatenate([train_movies, user_test_ratings[:, 1]])
            unrated_movies = all_movies[~np.isin(all_movies, seen_movies)]

            # One set of negatives per user, shared by all of the user's positives
            n_sampled = min(n_negatives, len(unrated_movies))
            sampled_unrated_movies = np.random.choice(unrated_movies, n_sampled, replace=False)

            user_col = int(u)
            hits = 0
            dcg = 0.0
            for high_rated_movie in high_rated_test_movies:
                # Candidate 0 is the positive, the rest are the negatives.
                candidate_movies = np.concatenate(
                    [[high_rated_movie], sampled_unrated_movies]
                ).astype(int)
                inputs = torch.tensor(train_matrix[candidate_movies], dtype=torch.float32).to(device)
                scores = model(inputs)[:, user_col]

                # Rank of the positive = 1 + number of negatives scored
                # strictly higher (ties leave the positive in front).
                rank = 1 + int((scores[1:] > scores[0]).sum().item())

                if rank <= n_top:
                    hits += 1
                    dcg += 1 / math.log2(rank + 1)  # Discount for rank
            hit_ratios.append(hits / len(high_rated_test_movies))
            ndcg_scores.append(dcg / len(high_rated_test_movies))

    if not hit_ratios:
        return float('nan'), float('nan')
    return np.mean(hit_ratios), np.mean(ndcg_scores)

In [23]:
# Evaluate top-5 recommendation quality on the test split
hit_ratio, ndcg = evaluate_recommendation(model, train_data, test_data, n_movies, n_top=5)
print(f"Hit Ratio: {hit_ratio:.4f}, NDCG: {ndcg:.4f}")

Evaluating Users: 100%|██████████| 6084/6084 [11:14<00:00,  9.02it/s]  


Hit Ratio: 0.0686, NDCG: 0.0421
