In [6]:
import numpy as np
import pandas as pd
import os
import re
import random
from typing import List, Dict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
import sys

In [7]:
# Make the parent directory importable so the project's `src` package resolves
# when this notebook is run from its own folder.
sys.path.append('..') 

from src.models.als import ALS
# NOTE(review): wildcard import — `get_data` and `map_id` used below are
# presumably provided by this module; consider importing them explicitly.
from src.data.data_utils import *

In [2]:
# Mount Google Drive at /gdrive; the /gdrive/MyDrive/... paths used by the
# Colab part of this notebook live under this mount point.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [17]:
def evaluate_model(model, validation_partitions):
    """Compute the RMSE of ``model`` over a set of validation partitions.

    Partitions are read one at a time and only a running sum of squared
    errors and a rating count are kept, so memory use is bounded by the
    largest single partition instead of growing with the whole validation set.

    Args:
        model: Object exposing ``batch_predict(df)`` that returns one
            prediction per row of ``df``, in row order.
        validation_partitions: Iterable of dicts, each holding a ``'path'``
            key that points at a parquet file with ``user_id``, ``movie_id``
            and ``rating`` columns.

    Returns:
        The root mean squared error over all ratings, or ``nan`` when there
        are no ratings to evaluate.

    Raises:
        ValueError: If ``batch_predict`` does not return exactly one
            prediction per rating of a partition.
    """
    squared_error_sum = 0.0
    n_ratings = 0

    for partition in tqdm(validation_partitions, desc="Evaluating validation partitions"):

        df = pd.read_parquet(partition['path'], columns=['user_id', 'movie_id', 'rating'])

        # Work on plain float64 arrays so the subtraction is positional and
        # the accumulation does not lose precision on low-precision outputs.
        predictions = np.asarray(model.batch_predict(df), dtype=np.float64)
        ratings = df['rating'].to_numpy(dtype=np.float64)

        # Guard against silent broadcasting (e.g. an (n, 1) prediction array).
        if predictions.shape != ratings.shape:
            raise ValueError(
                f"batch_predict returned shape {predictions.shape} for "
                f"{ratings.shape[0]} ratings in {partition['path']}"
            )

        residuals = predictions - ratings
        squared_error_sum += float(np.dot(residuals, residuals))
        n_ratings += residuals.size

    # No ratings at all: report nan explicitly instead of triggering numpy's
    # "mean of empty slice" warning.
    rmse = np.sqrt(squared_error_sum / n_ratings) if n_ratings else np.nan
    print(f"Validation RMSE: {rmse:.4f}")

    return rmse

In [9]:
def get_model_save_path(dir):
  """Return the first unused ``model_<i>.pkl`` path inside ``dir``.

  Candidates are tried in order (``model_1.pkl``, ``model_2.pkl``, ...) and
  the first one that does not clash with an existing file is returned.

  A candidate counts as taken when either the path itself or the same path
  with a ``.npz`` suffix exists. The notebook saves to ``model_1.pkl`` but
  later loads ``model_1.pkl.npz``, so the saved artifact appears to get a
  ``.npz`` extension appended (NOTE(review): confirm in ``ALS.save_model``).
  Without the second check an earlier model would be silently overwritten.

  Args:
    dir: Directory in which models are stored.

  Returns:
    The path ``<dir>/model_<i>.pkl`` for the smallest free ``i >= 1``.
  """
  i = 1
  while True:
    save_path = os.path.join(dir, f'model_{i}.pkl')
    already_used = os.path.exists(save_path) or os.path.exists(save_path + '.npz')
    if not already_used:
      return save_path
    i += 1

In [7]:
# Inputs: folders holding the train / validation parquet partitions.
data_dir, val_dir = (
    '/gdrive/MyDrive/Netflix_Prize/split/train',
    '/gdrive/MyDrive/Netflix_Prize/split/validation',
)

# Outputs: training checkpoints and the final saved models.
checkpoint_dir, output_dir = (
    '/gdrive/MyDrive/Netflix_Prize/checkpoints',
    '/gdrive/MyDrive/Netflix_Prize/results',
)

In [8]:
# Retrieve training / validation partition info
train_partition_files = get_data(data_dir)
print(f"Number of training partitions: {len(train_partition_files)}")
val_partition_files = get_data(val_dir)
print(f"Number of validation partitions: {len(val_partition_files)}")

# Fixed seed so the same partitions are sampled on every run
seed = 42
random.seed(seed)

# testing: work on a sample of 7 training partitions
sample_train_partitions = random.sample(train_partition_files, 7)

# Pair each sampled training partition with the validation file of the same
# name. The path is rebuilt from val_dir + file name instead of using
# str.replace('train', 'validation'), which would rewrite *every* occurrence
# of 'train' in the path (e.g. in a parent directory name).
sample_val_partitions = [
  {**partition, 'path': os.path.join(val_dir, os.path.basename(partition['path']))}
  for partition in sample_train_partitions
]


print(f"Train EX: {sample_train_partitions[0]}")
print(f"Val EX: {sample_val_partitions[0]}")

Number of training partitions: 34
Number of validation partitions: 34
Train EX: {'path': '/gdrive/MyDrive/Netflix_Prize/split/train/part_1_7.parquet', 'part': 1, 'group': 7}
Val EX: {'path': '/gdrive/MyDrive/Netflix_Prize/split/validation/part_1_7.parquet', 'part': 1, 'group': 7}


In [9]:
# Build user_map and movie_map from the sampled training partitions only.
# NOTE(review): users/movies that appear only in validation or test data are
# presumably absent from these maps — confirm how the model handles them.
user_map, movie_map = map_id(sample_train_partitions)

Mapping IDs: 100%|██████████| 7/7 [00:06<00:00,  1.07it/s]


Map successful for 431670 users, 3720 movies


In [10]:
# The 7 sampled train partitions and their 7 matching validation partitions
# serve both as a quick test run and as the data used for tuning.
train_partitions, val_partitions = sample_train_partitions, sample_val_partitions

In [13]:
# Tuning grid: candidate values per ALS hyper-parameter, keyed by the keyword
# argument name used when constructing the model.
param_grid = dict(
    num_factors=[10, 25, 50],
    lambda_reg=[0.1, 1],
    num_iters=[15, 20],
)

In [12]:
# Train a single ALS configuration on the sampled partitions, save it, then
# score it on the matching validation partitions.
# NOTE(review): long-running cell — the captured run took ~45 minutes for 15 iterations.

# val_interval=5: the captured log shows a checkpoint and an RMSE line every
# 5 iterations, presumably driven by this setting — confirm in ALS.
model = ALS(
    num_factors=50,
    lambda_reg=0.1,
    num_iters=15,
    val_interval=5
)

# Checkpoints are written under checkpoint_dir while fitting (see captured log).
model.fit(
    partitions=train_partitions,
    user_map=user_map,
    movie_map=movie_map,
    checkpoint_dir=checkpoint_dir
)

# Persist the trained model before evaluating, so a failure during evaluation
# does not lose the training run.
model.save_model(get_model_save_path(output_dir))

# NOTE(review): the RMSE logged during fitting (~0.47) is far below the
# validation RMSE printed here (~1.65); worth investigating before tuning.
val_rmse = evaluate_model(model, val_partitions)

Building sparse matrices for 431670 users, and 3720 movies


Building sparse matrix: 100%|██████████| 7/7 [01:23<00:00, 11.89s/it]
ALS Optimization:  27%|██▋       | 4/15 [11:55<32:56, 179.67s/it]


Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_5.npz, in 11.142310380935669 seconds


ALS Optimization:  33%|███▎      | 5/15 [15:04<30:33, 183.34s/it]


Iteration 5 -- RMSE: 0.5020


ALS Optimization:  60%|██████    | 9/15 [27:02<18:00, 180.16s/it]


Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_10.npz, in 10.830766201019287 seconds


ALS Optimization:  67%|██████▋   | 10/15 [30:13<15:17, 183.59s/it]


Iteration 10 -- RMSE: 0.4755


ALS Optimization:  93%|█████████▎| 14/15 [41:56<02:57, 177.36s/it]


Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_15.npz, in 11.345720529556274 seconds


ALS Optimization: 100%|██████████| 15/15 [45:06<00:00, 180.46s/it]


Iteration 15 -- RMSE: 0.4666






Trained Model saved at /gdrive/MyDrive/Netflix_Prize/results/model_1.pkl, in 12.369998216629028 seconds


Evaluating validation partitions: 100%|██████████| 7/7 [00:11<00:00,  1.65s/it]


Validation RMSE: 1.6540


In [11]:
# Retrieve full training data info (local copy of the dataset).
# NOTE: this rebinds data_dir / val_dir, replacing the Drive paths set earlier.
data_dir = '../NETFLIX_DATA/partitions/train'
val_dir = '../NETFLIX_DATA/partitions/validation'

train_partition_files = get_data(data_dir)
val_partition_files = get_data(val_dir)

# Same seed as the training run so the same 7 partitions are selected
random.seed(42)

full_train_partitions = random.sample(train_partition_files, 7)

# Pair each sampled training partition with the validation file of the same
# name. The path is rebuilt from val_dir + file name instead of using
# str.replace('train', 'validation'), which would rewrite *every* occurrence
# of 'train' in the path (e.g. in a parent directory name).
full_val_partitions = [
  {**partition, 'path': os.path.join(val_dir, os.path.basename(partition['path']))}
  for partition in full_train_partitions
]


print(f"Train EX: {full_train_partitions[0]}")
print(f"Val EX: {full_val_partitions[0]}")

Train EX: {'path': '../NETFLIX_DATA/partitions/train/part_1_7.parquet', 'part': 1, 'group': 7}
Val EX: {'path': '../NETFLIX_DATA/partitions/validation/part_1_7.parquet', 'part': 1, 'group': 7}


In [12]:
# Rebuild the ID maps from the locally stored partitions; they are passed to
# ALS.load_model below.
user_map, movie_map = map_id(full_train_partitions)

Mapping IDs: 100%|██████████| 7/7 [00:01<00:00,  4.43it/s]

Map successful for 431670 users, 3720 movies





In [13]:
# Load the held-out test set into memory as a single DataFrame; its 'rating'
# column is compared against the model's predictions further down.
test_path = "../NETFLIX_DATA/test.parquet"

test_df = pd.read_parquet(test_path)
print(f"Test set size: {len(test_df):,} ratings")

Test set size: 1,408,395 ratings


In [14]:
# Path of the previously trained model. Note the '.npz' suffix on top of the
# 'model_1.pkl' name that was reported when the model was saved.
model_path = "../model_checkpoints/ALS/model_1.pkl.npz"

# The ID maps must correspond to the ones used for training.
# NOTE(review): they are rebuilt here from the same seeded sample — confirm
# map_id yields an identical mapping across runs.
model = ALS.load_model(model_path, user_map, movie_map)

# One prediction per row of the test set (compared element-wise with
# test_df['rating'] in the next cell).
predictions = model.batch_predict(test_df)

Loading model from ../model_checkpoints/ALS/model_1.pkl.npz


In [20]:
# Test RMSE: square the per-rating residuals, average them, take the root.
errors = np.square(predictions - test_df['rating'].values)
rmse = np.sqrt(np.mean(errors))

# Re-score the loaded model on the validation partitions (prints its own
# RMSE line), then report the test score underneath it.
evaluate_model(model, full_val_partitions)
print(f"Test RMSE: {rmse:.4f}")

Evaluating validation partitions: 100%|██████████| 7/7 [00:02<00:00,  2.59it/s]

Validation RMSE: 1.6540
Test RMSE: 1.4698



