# 1. Import Libraries

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import math
import zipfile
import os

from google.colab import files
from collections import defaultdict
from datetime import datetime
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import KFold, cross_val_score

from sklearn.decomposition import TruncatedSVD

# 2. Load the Dataset

In [3]:
uploaded = files.upload()

Saving archive.zip to archive (1).zip


In [7]:
with zipfile.ZipFile("archive.zip", "r") as zip_ref:
    zip_ref.extractall("movielens")

# Optional: List extracted files
os.listdir("movielens")

['genome_tags.csv',
 'link.csv',
 'movie.csv',
 'rating.csv',
 'genome_scores.csv',
 'tag.csv']

In [5]:
rating_path = '/content/movielens/rating.csv'
rating_dataset = pd.read_csv(rating_path)
print("Dataset Loaded:\n", rating_dataset.head())

Dataset Loaded:
    userId  movieId  rating            timestamp
0       1        2     3.5  2005-04-02 23:53:47
1       1       29     3.5  2005-04-02 23:31:16
2       1       32     3.5  2005-04-02 23:33:39
3       1       47     3.5  2005-04-02 23:32:07
4       1       50     3.5  2005-04-02 23:29:40


In [6]:
movie_path = '/content/movielens/movie.csv'
movie_dataset = pd.read_csv(movie_path)
print("Dataset Loaded:\n", movie_dataset.head())

Dataset Loaded:
    movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  


In [7]:
print(rating_dataset.shape)

(20000263, 4)


In [8]:
print(rating_dataset.isnull().sum())

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [9]:
print(rating_dataset.dtypes)

userId         int64
movieId        int64
rating       float64
timestamp     object
dtype: object


# 3. Preprocess the Dataset

In [10]:
rating_dataset = rating_dataset.drop(columns=['timestamp'])

In [11]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

rating_dataset['user_index'] = user_encoder.fit_transform(rating_dataset['userId'])
rating_dataset['movie_index'] = movie_encoder.fit_transform(rating_dataset['movieId'])

num_users = rating_dataset['user_index'].nunique()
num_movies = rating_dataset['movie_index'].nunique()

print(f"Users: {num_users}, Movies: {num_movies}")

Users: 138493, Movies: 26744


In [12]:
train_data, test_data = train_test_split(
    rating_dataset[['user_index', 'movie_index', 'rating']],
    test_size=0.2,
    random_state=42
)

# 4. Reference Latent Factor Model (LFM)

In [13]:
train_matrix = csr_matrix((
    train_data['rating'],
    (train_data['user_index'], train_data['movie_index'])
), shape=(num_users, num_movies))

In [14]:
n_components = 20

svd_ref = TruncatedSVD(n_components=n_components, random_state=42)

start_time = datetime.now()
svd_ref.fit(train_matrix)
end_time = datetime.now()

train_time = (end_time - start_time).microseconds
print(f"Training time: {train_time:.2f} mcs")

Training time: 134427.00 mcs


In [15]:
# Transform users in training data
user_features = svd_ref.transform(train_matrix)
item_features = svd_ref.components_.T

In [16]:
test_preds = []
test_truth = []

for _, row in test_data.iterrows():
    u = int(row['user_index'])
    m = int(row['movie_index'])
    true_rating = row['rating']

    if u >= user_features.shape[0] or m >= item_features.shape[0]:
        continue  # skip if out of bounds

    pred_rating = np.dot(user_features[u], item_features[m])
    pred_rating = np.clip(pred_rating, 0.5, 5.0)

    test_preds.append(pred_rating)
    test_truth.append(true_rating)

In [17]:
# Compute MSE and then RMSE manually
mse = mean_squared_error(test_truth, test_preds)
rmse = np.sqrt(mse)

# MAE is safe
mae = mean_absolute_error(test_truth, test_preds)

print(f"Reference LFM (TruncatedSVD) RMSE: {rmse:.4f}")
print(f"Reference LFM (TruncatedSVD) MAE:  {mae:.4f}")

Reference LFM (TruncatedSVD) RMSE: 2.6756
Reference LFM (TruncatedSVD) MAE:  2.4302


# 5. Custom Latent Factor Model (LFM)

In [18]:
class CustomTruncatedSVD:
    def __init__(self, n_components=20, learning_rate=0.01, reg=0.1, n_iter=10):
        self.n_components = n_components
        self.lr = learning_rate
        self.reg = reg
        self.n_iter = n_iter

    def fit(self, train_data, num_users, num_items, batch_size=100000):
        self.num_users = num_users
        self.num_items = num_items

        self.P = np.random.normal(0, 0.01, (num_users, self.n_components))
        self.Q = np.random.normal(0, 0.01, (num_items, self.n_components))

        user_indices = train_data['user_index'].values.astype(np.int32)
        item_indices = train_data['movie_index'].values.astype(np.int32)
        ratings = train_data['rating'].values.astype(np.float32)

        n_samples = len(ratings)

        for epoch in range(self.n_iter):
            perm = np.random.permutation(n_samples)
            total_loss = 0

            for start in range(0, n_samples, batch_size):
                end = min(start + batch_size, n_samples)
                batch_users = user_indices[perm[start:end]]
                batch_items = item_indices[perm[start:end]]
                batch_ratings = ratings[perm[start:end]]

                pred = np.sum(self.P[batch_users] * self.Q[batch_items], axis=1)
                err = np.clip(batch_ratings - pred, -10, 10)

                dP = err[:, np.newaxis] * self.Q[batch_items] - self.reg * self.P[batch_users]
                dQ = err[:, np.newaxis] * self.P[batch_users] - self.reg * self.Q[batch_items]

                dP = np.clip(dP, -1, 1)
                dQ = np.clip(dQ, -1, 1)

                np.add.at(self.P, batch_users, self.lr * dP)
                np.add.at(self.Q, batch_items, self.lr * dQ)

                total_loss += np.sum(err ** 2)

            rmse = np.sqrt(total_loss / n_samples)
            print(f"Epoch {epoch+1}/{self.n_iter}, RMSE: {rmse:.4f}")

    def predict(self, u, i):
        if u >= self.P.shape[0] or i >= self.Q.shape[0]:
            return 3.0  # default rating
        return np.clip(np.dot(self.P[u], self.Q[i]), 0.5, 5.0)

    def evaluate(self, test_data):
        user_idx = test_data['user_index'].values.astype(np.int32)
        item_idx = test_data['movie_index'].values.astype(np.int32)
        true_ratings = test_data['rating'].values

        valid_mask = (user_idx < self.P.shape[0]) & (item_idx < self.Q.shape[0])
        user_idx = user_idx[valid_mask]
        item_idx = item_idx[valid_mask]
        true_ratings = true_ratings[valid_mask]

        preds = np.sum(self.P[user_idx] * self.Q[item_idx], axis=1)
        preds = np.clip(preds, 0.5, 5.0)

        rmse = np.sqrt(mean_squared_error(true_ratings, preds))
        mae = mean_absolute_error(true_ratings, preds)
        return rmse, mae

In [19]:
n_components = 20

svd_cus = CustomTruncatedSVD(n_components=n_components, learning_rate=0.01, reg=0.1, n_iter=10)

start_time = datetime.now()
svd_cus.fit(train_data, num_users, num_movies)
end_time = datetime.now()

train_time = (end_time - start_time).microseconds
print(f"Training time: {train_time:.2f} mcs")

Epoch 1/10, RMSE: 2.3083
Epoch 2/10, RMSE: 1.8703
Epoch 3/10, RMSE: 1.7155
Epoch 4/10, RMSE: 1.5887
Epoch 5/10, RMSE: 1.4970
Epoch 6/10, RMSE: 1.4356
Epoch 7/10, RMSE: 1.4210
Epoch 8/10, RMSE: 1.4307
Epoch 9/10, RMSE: 1.4421
Epoch 10/10, RMSE: 1.4495
Training time: 909477.00 mcs


In [20]:
rmse_custom, mae_custom = svd_cus.evaluate(test_data)
print(f"Custom LFM RMSE: {rmse_custom:.4f}")
print(f"Custom LFM MAE:  {mae_custom:.4f}")

Custom LFM RMSE: 1.2390
Custom LFM MAE:  0.9036
