In [1]:
import numpy as np
import pandas as pd
import os
import sys
import re
import random
import torch
from typing import List, Dict, Optional
import matplotlib.pyplot as plt
from tqdm import tqdm
import optuna
from optuna.pruners import MedianPruner
from optuna.exceptions import TrialPruned
import warnings

In [2]:
# Notebook-wide tuning knobs; every cell below reads its settings from here.
CONFIG = dict(
    num_partitions=5,        # number of training partitions randomly sampled for tuning
    tuning_epoch=5,          # epochs trained per Optuna trial (passed as num_epochs)
    checkpoint_interval=5,   # forwarded to train_autoencoder as checkpoint_interval
    num_tuning_trials=3,     # n_trials handed to study.optimize
    seed=42,                 # RNG seed for partition sampling; set to None to disable seeding
)

In [3]:
sys.path.append('..')  # put the parent directory on the import path so the `src` package resolves

from src.models.autoencoder import AutoEncoder
# NOTE(review): the two star imports below hide where the helpers used later in
# this notebook (get_data, map_id, train_autoencoder) are defined — presumably
# data_utils and autoencoder_trainer; consider importing them by name.
from src.models.autoencoder_trainer import *
from src.data.data_utils import *

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# (`device` is not referenced again in the cells visible here.)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
# Directory scanned by get_data() for the training partitions.
data_dir = '../NETFLIX_DATA/partitions/train'
# Directory scanned by get_data() for the validation partitions
# (the recorded output shows the same file names as under train/).
val_dir = '../NETFLIX_DATA/partitions/validation'
# Passed to train_autoencoder() as `checkpoint_dir`.
checkpoint_dir = '../model_checkpoints'

In [5]:
# Retrieve training / validation partition metadata.
# Each entry is a dict such as {'path': ..., 'part': ..., 'group': ...}
# (see the "Train EX" line printed at the end of this cell).
train_partition_files = get_data(data_dir)
print(f"Number of training partitions: {len(train_partition_files)}")
val_partition_files = get_data(val_dir)
print(f"Number of validation partitions: {len(val_partition_files)}")

# Seed every RNG this notebook relies on, not only `random`: the partition
# sample below uses `random`, while model training goes through torch/NumPy.
# Seeding just one of them leaves tuning runs non-repeatable.
if CONFIG['seed'] is not None:
    random.seed(CONFIG['seed'])
    np.random.seed(CONFIG['seed'])
    torch.manual_seed(CONFIG['seed'])

# testing: tune on a random subset of the training partitions
sample_train_partitions = random.sample(train_partition_files, CONFIG['num_partitions'])

# Pair every sampled training partition with the validation partition of the
# same file name. Only the leading directory is swapped (data_dir -> val_dir);
# a blanket str.replace('train', 'validation') would also rewrite any other
# occurrence of "train" in the path and would ignore the configured val_dir.
sample_val_partitions = []
for partition in sample_train_partitions:
    val_partition = partition.copy()
    train_path = partition['path']
    if train_path.startswith(data_dir):
        val_partition['path'] = val_dir + train_path[len(data_dir):]
    else:
        # Path is not rooted at data_dir (e.g. get_data normalised it):
        # fall back to the original textual substitution.
        val_partition['path'] = train_path.replace('train', 'validation')
    sample_val_partitions.append(val_partition)


print(f"Train EX: {sample_train_partitions[0]}")
print(f"Val EX: {sample_val_partitions[0]}")

Number of training partitions: 34
Number of validation partitions: 34
Train EX: {'path': '../NETFLIX_DATA/partitions/train/part_1_7.parquet', 'part': 1, 'group': 7}
Val EX: {'path': '../NETFLIX_DATA/partitions/validation/part_1_7.parquet', 'part': 1, 'group': 7}


In [6]:
# Build user_map and movie_map from the sampled *training* partitions only.
# The same two maps are reused below when loading the rating profiles and the
# validation data, and are passed to train_autoencoder() in every trial.
user_map, movie_map = map_id(sample_train_partitions)

Mapping IDs: 100%|██████████| 5/5 [00:01<00:00,  4.15it/s]

Map successful for 423631 users, 2880 movies





In [7]:
# Preload user rating profiles once, up front, so that every tuning trial can
# reuse them (they are handed to train_autoencoder() as `user_data` and
# `validation_data`).
# NOTE: this is slow — in the recorded run each partition took a few minutes
# and the cell was interrupted while loading the third one.

train_user_data = AutoEncoder.load_user_data(partitions=sample_train_partitions, 
                                             user_map=user_map)

# Validation profiles are loaded with the maps built from the training sample.
validation_user_data = AutoEncoder.load_validation_data(partitions=sample_val_partitions, 
                                                        user_map=user_map,
                                                        movie_map=movie_map)

Loading users' rating profile 1/5: 100%|██████████| 1828517/1828517 [01:43<00:00, 17633.02it/s]
Loading users' rating profile 2/5: 100%|██████████| 2772664/2772664 [02:31<00:00, 18299.16it/s]
Loading users' rating profile 3/5:  48%|████▊     | 1337590/2776385 [01:15<01:21, 17663.65it/s]

KeyboardInterrupt



In [None]:
# optuna objective
def objective(trial):
    """Train one autoencoder configuration and return its validation RMSE.

    Samples learning rate, hidden-layer layout, dropout and L2 regularisation
    from the trial, then trains for CONFIG['tuning_epoch'] epochs on the
    preloaded user profiles with a fixed batch size of 512, evaluating every
    epoch.

    Args:
        trial: the Optuna trial used to sample hyperparameters; it is also
            forwarded to train_autoencoder().

    Returns:
        The RMSE returned by train_autoencoder(), converted to a float.

    Raises:
        Any exception raised by train_autoencoder() propagates unchanged. That
        includes optuna.TrialPruned, which the study needs to see in order to
        mark the trial as pruned, so no try/except wrapper is used here.
    """
    # Optuna recommends restricting categorical choices to None/bool/int/float/str
    # when the study lives in persistent storage (this one uses SQLite) and
    # warns about anything else. Sample a string label and decode it into the
    # layer list the trainer expects.
    # NOTE: a study already stored with list-valued "hidden_dims" choices has an
    # incompatible distribution; resume such a study under a new name/DB.
    hidden_dim_choices = {
        "1024-256-128": [1024, 256, 128],
        "512-256-128": [512, 256, 128],
        "256-128": [256, 128],
        "512-128": [512, 128],
    }

    # Suggestion order (learning_rate, hidden_dims, dropout, l2_reg) is kept as before.
    params = {
        "num_epochs": CONFIG['tuning_epoch'],
        "batch_size": 512,
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.001, log=True),
        "hidden_dims": hidden_dim_choices[
            trial.suggest_categorical("hidden_dims", list(hidden_dim_choices))
        ],
        "dropout": trial.suggest_float("dropout", 0.3, 0.7),
        "l2_reg": trial.suggest_float("l2_reg", 0.00001, 0.01, log=True),
        "checkpoint_interval": CONFIG['checkpoint_interval'],
        "eval_interval": 1,
    }

    print(params)

    model, rmse = train_autoencoder(
        train_partitions=sample_train_partitions,
        user_map=user_map,
        movie_map=movie_map,
        validation_partitions=sample_val_partitions,
        checkpoint_dir=checkpoint_dir,
        trial=trial,
        user_data=train_user_data,
        validation_data=validation_user_data,
        **params
    )

    return float(rmse)

In [None]:
# tuning
# NOTE: this silences every warning raised from optuna modules, not a specific one.
warnings.filterwarnings("ignore", module="optuna.*")

study = optuna.create_study(
    study_name=f"autoencoder_tuning_{CONFIG['num_partitions']}_samples", 
    direction='minimize',

    # MedianPruner: pruning stays disabled until 2 trials have finished
    # (n_startup_trials); within a trial it stays disabled until the trial
    # is past step 1 (n_warmup_steps).
    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),

    # Seed the sampler from CONFIG so hyperparameter suggestions are repeatable
    # when a seed is configured; seed=None keeps Optuna's unseeded default.
    sampler=optuna.samplers.TPESampler(seed=CONFIG['seed']),

    # Persisted in SQLite; with load_if_exists=True re-running this cell
    # continues the stored study of the same name instead of failing.
    storage=f"sqlite:///optuna_study_{CONFIG['num_partitions']}_samples.db",
    load_if_exists=True
)

# Stop after CONFIG['num_tuning_trials'] trials or 6 hours, whichever comes first.
study.optimize(objective, n_trials=CONFIG['num_tuning_trials'], timeout=6*3600)

In [None]:
# Report the study's best result: the objective value (the validation RMSE
# returned by `objective`, minimised by the study) and the sampled parameters.
# NOTE(review): presumably raises if no trial has completed yet — confirm
# against the Optuna Study documentation before running on an empty study.
print("Best trial")
print(f"RMSE: {study.best_value:.4f}")
print(f"Params: {study.best_params}")

In [None]:
# Summarise how many trials the study holds and how many ran to completion.
print(f"Total trials run: {len(study.trials)}")
completed_trials = list(
    filter(lambda t: t.state == optuna.trial.TrialState.COMPLETE, study.trials)
)
print(f"Completed trials: {len(completed_trials)}")

# The best value / params are only reported when at least one trial completed.
if completed_trials:
    print(f"Best RMSE so far: {study.best_value:.4f}")
    print(f"Best params: {study.best_params}")

    # Eyeball convergence from the objective values of the last 5 completed trials.
    recent_trials = completed_trials[-5:]
    recent_values = [trial.value for trial in recent_trials]
    print(f"Recent RMSE values: {recent_values}")

In [None]:
# Share of validation users that also have a training profile.
train_users = set(train_user_data.keys())
val_users = set(validation_user_data.keys())
overlap = train_users.intersection(val_users)
if val_users:
    print(f"User overlap: {len(overlap)} / {len(val_users)} = {len(overlap)/len(val_users)*100:.1f}%")
else:
    # No validation users at all: the percentage is undefined, so report that
    # instead of dividing by zero.
    print("User overlap: 0 / 0 (validation set is empty)")

In [None]:
# 1. User overlap: how many user keys occur in both the training and the
#    validation profiles.
print("User overlap analysis:")
print(f"Train users: {len(train_user_data)}")
print(f"Val users: {len(validation_user_data)}")
user_overlap = len(
    set(train_user_data.keys()).intersection(set(validation_user_data.keys()))
)
print(f"Overlapping users: {user_overlap}")

# 2. Is validation too easy? Look at the distribution of its ratings.
#    Each profile entry unpacks to a pair whose second element is the rating.
val_ratings = []
for user_ratings in validation_user_data.values():
    val_ratings.extend(rating for _, rating in user_ratings)
print(f"Validation rating distribution: {np.histogram(val_ratings, bins=5)}")