In [1]:
# Enable auto-reloading so you can edit .py files without restarting the kernel
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import joblib
import os
import sys

# Add the project root to path so we can import 'src'
sys.path.append('../')

from src.data_loader import DataLoader
from src.models import CollaborativeRecommender, ContentRecommender, HybridRecommender, SVDRecommender
from src.utils import tune_alpha, tune_svd
from src.evaluation import ModelEvaluator

# Intercepts calls to NearestNeighbors, SVD, and k-means and routes them through Intel's optimized oneDAL library.
# This often results in a 10x-100x speedup on Intel chips without changing the code logic.
from sklearnex import patch_sklearn
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
# Loader rooted at the shared data directory.
loader = DataLoader(base_path='../data')

# Chronological train/test split: the first 80% of interactions go to train.
# A 120-day half-life is passed so the models that use time decay can weight
# recent interactions more heavily.
train_df, test_df = loader.get_time_split(train_ratio=0.8, half_life_days=120)

# Content-side artifacts (feature matrices plus the item map).
tfidf, vectors, item_map = loader.get_content_data()

# Emit the whole summary with a single call; `sep` puts each entry on its own line.
print(
    "\n>>> Data Summary:",
    f"   Train Interactions: {len(train_df)}",
    f"   Test Interactions:  {len(test_df)}",
    f"   Unique Items:       {len(item_map)}",
    sep="\n",
)

>>> Loading Interactions...
   -> Interactions loaded: 87045 rows
>>> Splitting Data (First 80% Train)...
   -> Applying Time Decay (Half-Life: 120 days)...
   -> Train: 66580 | Test: 20465
   -> Total: 66580 + 20465 = 87045
>>> Loading Content Artifacts...
   -> Loaded features for 15291 items.

>>> Data Summary:
   Train Interactions: 66580
   Test Interactions:  20465
   Unique Items:       15291


# Collaborative Filtering Recommender

**TODO:** Add an explanation of the memory-based collaborative filtering approach used by this model.

This model has no hyperparameters to tune.

In [3]:
# Build the baseline model that needs no tuning.
print(">>> Building Baseline Models...")

# 1. Collaborative Filtering (Memory-Based)
# Fitted on the training split only. No weighting argument is passed here:
# the model detects the 'weight' column in train_df on its own and logs
# that it is using the time weights.
cf_model = CollaborativeRecommender(train_df)
print(">>> Collaborative Model Built.")

>>> Building Baseline Models...
   -> Collaborative Model detected time weights. Using them.
>>> Collaborative Model Built.


# Content-Based Recommender

In [None]:
# 2. Content-Based Filtering
# We need to tune alpha (TF-IDF vs MiniLM): the weight that blends the two
# content representations loaded earlier (tfidf and vectors).
#
# NOTE(review): alpha is selected against test_df. If test_df is also used for
# the final evaluation, the reported metrics will be optimistically biased --
# consider tuning on a validation slice carved out of train_df instead.
# NOTE(review): the recorded progress bar shows ~12 it/s over 20465 entries,
# i.e. roughly 28 minutes per alpha value; consider caching the result.
print("\n>>> Tuning Content Alpha...")
best_content_alpha = tune_alpha(
    model=ContentRecommender(train_df, tfidf, vectors, item_map),  # throwaway instance used only for the search
    test_df=test_df,
    param_name='alpha'  # name of the parameter being tuned
)


>>> Tuning Content Alpha...
>>> Tuning 'alpha' on 20465 users...


alpha=0.0:   7%|▋         | 1430/20465 [01:36<26:25, 12.00it/s]

In [None]:
# Instantiate the content model and apply the tuned blend weight.
content_model = ContentRecommender(train_df, tfidf, vectors, item_map)

# The tuning step ran on a separate, throwaway instance, so this fresh model
# would otherwise keep its default alpha while the message below reports the
# tuned value.
# NOTE(review): assumes the parameter is exposed as the attribute named by
# param_name='alpha' in the tune_alpha call, and that best_content_alpha is
# the bare value -- confirm against src.utils.tune_alpha / ContentRecommender.
content_model.alpha = best_content_alpha
print(f">>> Content Model Built (Alpha: {best_content_alpha})")

# Hybrid Content-Collaborative Recommender