In [49]:
import numpy as np
import pandas as pd
import os
import re
import random
from typing import List, Dict
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm

from als import ALS

In [29]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [57]:
data_dir = '/gdrive/MyDrive/Netflix_Prize/split/train'
output_dir = '/gdrive/MyDrive/Netflix_Prize/results'
val_dir = '/gdrive/MyDrive/Netflix_Prize/split/validation'
checkpoint_dir = '/gdrive/MyDrive/Netflix_Prize/checkpoints'

In [52]:
def get_data(data_dir: str):
  partition_files = []

  pattern = re.compile(r'part_(\d+)_(\d+)\.parquet$')

  for filename in os.listdir(data_dir):
    match = pattern.match(filename)
    if match:
        file_path = os.path.join(data_dir, filename)
        part_num = int(match.group(1))
        group_num = int(match.group(2))

        partition_files.append({
            'path': file_path,
            'part': part_num,
            'group': group_num
        })

  # sorted partitions in order
  partition_files.sort(key=lambda x: (x['part'], x['group']))
  return partition_files

In [55]:
train_partition_files = get_data(data_dir)
print(f"Number of training partitions: {len(train_partition_files)}")
val_partition_files = get_data(val_dir)
print(f"Number of validation partitions: {len(val_partition_files)}")

# testing
sample_train_partitions = train_partition_files[:5]
sample_val_partitions = val_partition_files[:5]
print(f"Train: {sample_train_partitions[0]}")
print(f"Val: {sample_train_partitions[0]}")

Number of training partitions: 34
Number of validation partitions: 34
Train: {'path': '/gdrive/MyDrive/Netflix_Prize/split/train/part_1_0.parquet', 'part': 1, 'group': 0}
Val: {'path': '/gdrive/MyDrive/Netflix_Prize/split/train/part_1_0.parquet', 'part': 1, 'group': 0}


In [60]:
def map_id(partition_files: List[Dict], sample_size=None):
  """
  Map user and movie ids to denser indicies -> for matrix factorization
  """

  user_ids = set()
  movie_ids = set()

  if sample_size and sample_size < len(partition_files):
    partition_sample = random.sample(partition_files, sample_size)
  else:
    partition_sample = partition_files

  for partition in tqdm(partition_sample, desc="Mapping IDs"):
    df = pd.read_parquet(partition['path'], columns=['user_id', 'movie_id'])

    user_ids.update(df['user_id'].unique())
    movie_ids.update(df['movie_id'].unique())

  user_id_map = {user_id: idx for idx, user_id in enumerate(sorted(user_ids))}
  movie_id_map = {movie_id: idx for idx, movie_id in enumerate(sorted(movie_ids))}

  print(f"Map successful for {len(user_id_map)} users, {len(movie_id_map)} movies")

  return user_id_map, movie_id_map

user_map, movie_map = map_id(train_partition_files, sample_size=5)

Mapping IDs: 100%|██████████| 5/5 [00:05<00:00,  1.11s/it]


Map successful for 428145 users, 2649 movies


In [61]:
# 5 each as a test
train_partitions = sample_train_partitions
val_partitions = sample_val_partitions

In [62]:
# init model - small test values

model = ALS(
    num_factors=10,
    lambda_reg=0.1,
    num_iters=5,
    val_interval=2
)

model.fit(
    partitions=train_partitions,
    user_map=user_map,
    movie_map=movie_map,
    checkpoint_dir=checkpoint_dir
)

model.save_model(output_dir)

Building sparse matrices for 428145 users, and 2649 movies


Building sparse matrix...:  20%|██        | 1/5 [00:05<00:21,  5.33s/it]



Building sparse matrix...:  40%|████      | 2/5 [00:08<00:12,  4.32s/it]



Building sparse matrix...: 100%|██████████| 5/5 [00:28<00:00,  5.70s/it]
ALS Optimization:  20%|██        | 1/5 [01:15<05:00, 75.07s/it]

Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_1.npz
Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_2.npz


ALS Optimization:  40%|████      | 2/5 [02:34<03:52, 77.59s/it]

Iteration 2 -- RMSE: 0.6124


ALS Optimization:  60%|██████    | 3/5 [03:51<02:34, 77.45s/it]

Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_3.npz
Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_4.npz


ALS Optimization:  80%|████████  | 4/5 [05:09<01:17, 77.61s/it]

Iteration 4 -- RMSE: 0.5148


ALS Optimization: 100%|██████████| 5/5 [06:25<00:00, 77.10s/it]

Saved checkpoint to /gdrive/MyDrive/Netflix_Prize/checkpoints/checkpoint_5.npz





Trained Model saved at /gdrive/MyDrive/Netflix_Prize/results


In [63]:
# NEED EVAL AND FIX INCREASING COMPUTATION TIME ISSUE, IMPLEMENT ON-LINE LEARNING TO ADDRESS MEMORY PROBLEM.