In [1]:
# Enable auto-reloading so you can edit .py files without restarting the kernel
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import joblib
import os
import sys

# Patch scikit-learn BEFORE any module that imports it is loaded.
# patch_sklearn() replaces supported scikit-learn estimators with Intel oneDAL-backed
# implementations, but only sklearn imports executed after the call pick up the
# accelerated versions. Assuming the 'src' modules import scikit-learn at module
# level, patching after importing them would leave them on the stock estimators.
# Which estimators are accelerated (and by how much) depends on the installed
# sklearnex version and the hardware; the code logic itself is unchanged.
from sklearnex import patch_sklearn
patch_sklearn()

# Add the project root to path so we can import 'src'.
# Guarded so that re-running this cell does not stack duplicate entries.
if '../' not in sys.path:
    sys.path.append('../')

from src.data_loader import DataLoader
from src.models import CollaborativeRecommender, ContentRecommender, HybridRecommender, SVDRecommender
from src.utils import tune_alpha, tune_svd, get_tuning_sample, generate_kaggle_submission
from src.evaluation import ModelEvaluator

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
# Shared data settings. The half-life is used for BOTH the chronological split and
# the full dataset, so it lives in one constant to keep the two weightings in sync.
TRAIN_RATIO = 0.8
HALF_LIFE_DAYS = 120

# 1. Initialize Loader
loader = DataLoader(base_path='../data')

# 2. Get Chronological Split (Train/Test)
# Time decay is requested through half_life_days for the models that use it.
train_df, test_df = loader.get_time_split(train_ratio=TRAIN_RATIO, half_life_days=HALF_LIFE_DAYS)

# 3. Get Content Artifacts (Matrices & Map)
tfidf, vectors, item_map = loader.get_content_data()

# 4. Get Full Dataset (same decay setting as the split above)
full_df = loader.get_full_data(half_life_days=HALF_LIFE_DAYS)

# Quick size summary so the split can be eyeballed against the loader's own log.
print("\n>>> Data Summary:")
print(f"   Train Interactions: {len(train_df)}")
print(f"   Test Interactions:  {len(test_df)}")
print(f"   Unique Items:       {len(item_map)}")

>>> Loading Interactions...
   -> Interactions loaded: 87045 rows
>>> Splitting Data (First 80% Train)...
   -> Applying Time Decay (Half-Life: 120 days)...
   -> Train: 66580 | Test: 20465
   -> Total: 66580 + 20465 = 87045
>>> Loading Content Artifacts...
   -> Loaded features for 15291 items.
>>> Preparing Full Dataset (Half-Life: 120 days)...
   -> Full Data Weighted: 87045 rows

>>> Data Summary:
   Train Interactions: 66580
   Test Interactions:  20465
   Unique Items:       15291


# Collaborative-Based Recommender

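Memory-based collaborative filtering: the model is built from user–item interactions only, applying TF-IDF weighting to the interaction matrix and using the time-decay `weight` column when it is present. (TODO: expand this explanation.)

There are no hyperparameters to tune for this model.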

In [4]:
# Baseline 1: memory-based collaborative filtering, fitted on the training split only.
print(">>> Building Baseline Models...")

# Per the original author's note the recommender picks up the 'weight' column of
# train_df on its own, so no extra arguments are passed here.
cf_model = CollaborativeRecommender(train_df)

print(">>> Collaborative Model Built.")

>>> Building Baseline Models...
   -> Applying TF-IDF to Interaction Matrix...
>>> Collaborative Model Built.


In [6]:
# Score the collaborative baseline on the held-out split (top-10 recommendations).
print("\n>>> EVALUATING: Collaborative Filtering")

# The evaluator is given every item id that appears in the full dataset.
all_item_ids = full_df['item_id'].unique()
evaluator = ModelEvaluator(train_df, all_item_ids)

res_cf = evaluator.evaluate(
    cf_model,
    test_df,
    k=10,
    model_name="Collaborative (TF-IDF)",
)

print(f"   -> Score: {res_cf['Hit Rate @ 10']:.4%}")


>>> EVALUATING: Collaborative Filtering
   -> Pre-computing Item Popularity for Novelty metrics...
>>> Evaluating Collaborative (TF-IDF) on 20465 users...


Eval Collaborative (TF-IDF):   0%|          | 0/20465 [00:00<?, ?it/s]

   -> Score: 3.9384%


# Content-Based Recommender

In [25]:
# Content-based filtering: search for the alpha that blends the two content
# representations (TF-IDF vs MiniLM, per the original note).
# NOTE(review): the tuning sample is drawn from test_df, so the selected alpha is
# chosen on test users — consider carving out a separate validation split.
print("\n>>> Tuning Content Alpha... (Sample of user)")

# Restrict tuning to a 1000-user sample to keep the run time manageable.
tuning_df = get_tuning_sample(test_df, n_users=1000)

best_content_alpha = tune_alpha(
    param_name='alpha',
    test_df=tuning_df,
    model=ContentRecommender(train_df, tfidf, vectors, item_map),
)


>>> Tuning Content Alpha... (Sample of user)
   -> Sampling Strategy: Selected 1000 Users
   -> Original Rows: 20465 | Sampled Rows: 2496
>>> Tuning 'alpha' on 2496 users...


alpha=0.0:  42%|████▏     | 1044/2496 [02:58<04:08,  5.84it/s]


KeyboardInterrupt: 

In [7]:
# Content model for the train split.
# The alpha is pinned by hand here (0.5) rather than taken from a tuning run; it is
# only reported in the log line below and handed to the hybrid model later.
best_content_alpha = 0.5

content_model = ContentRecommender(
    train_df,
    tfidf,
    vectors,
    item_map,
)

print(f">>> Content Model Built (Alpha: {best_content_alpha})")

>>> Content Model Built (Alpha: 0.5)


# Hybrid Content-Collaborative Recommender

In [8]:
# Assemble the hybrid from the two train-split models.
# best_content_alpha is forwarded as content_alpha so the content side of the
# hybrid uses the same internal alpha that was set for the content model.
hybrid_model = HybridRecommender(cf_model, content_model, content_alpha=best_content_alpha)

In [7]:
# Grid-search the CF/content blend weight on the existing 'tuning_df' sample,
# which keeps the run fast and consistent with the content-alpha tuning.
# NOTE(review): in the recorded run hybrid_alpha=0.0 scores about the same as the
# standalone collaborative model, which suggests alpha may weight CONTENT rather
# than CF. If so, the interpretation messages below are inverted — confirm against
# HybridRecommender before relying on them.
print(">>> Tuning Hybrid Alpha (CF vs Content)...")

best_hybrid_alpha = tune_alpha(
    model=hybrid_model,
    test_df=tuning_df,
    param_name='hybrid_alpha',
    values=[0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0],
)

# Report both alphas side by side: the content-internal one and the hybrid balance.
print(f"\n>>> Optimal Hybrid Configuration:")
print(f"   -> Content Internal Alpha: {best_content_alpha}")
print(f"   -> Hybrid Balance Alpha:   {best_hybrid_alpha}")

# One-line reading of the result, relative to the 0.5 midpoint.
print(
    "   -> Interpretation: The model leans towards Collaborative Filtering (Social Signals)."
    if best_hybrid_alpha > 0.5
    else "   -> Interpretation: The model leans towards Content Matching (Metadata)."
    if best_hybrid_alpha < 0.5
    else "   -> Interpretation: A perfect 50/50 balance."
)

>>> Tuning Hybrid Alpha (CF vs Content)...
>>> Tuning 'hybrid_alpha' on 2496 users...


hybrid_alpha=0.0: 100%|██████████| 2496/2496 [03:13<00:00, 12.89it/s]


   [0.0] Hit Rate: 3.92628205%


hybrid_alpha=0.2: 100%|██████████| 2496/2496 [03:15<00:00, 12.75it/s]


   [0.2] Hit Rate: 4.36698718%


hybrid_alpha=0.4: 100%|██████████| 2496/2496 [03:16<00:00, 12.71it/s]


   [0.4] Hit Rate: 4.36698718%


hybrid_alpha=0.5: 100%|██████████| 2496/2496 [03:15<00:00, 12.75it/s]


   [0.5] Hit Rate: 4.24679487%


hybrid_alpha=0.6: 100%|██████████| 2496/2496 [03:17<00:00, 12.64it/s]


   [0.6] Hit Rate: 4.12660256%


hybrid_alpha=0.8: 100%|██████████| 2496/2496 [03:23<00:00, 12.26it/s]


   [0.8] Hit Rate: 3.72596154%


hybrid_alpha=1.0: 100%|██████████| 2496/2496 [02:57<00:00, 14.06it/s]

   [1.0] Hit Rate: 2.76442308%

>>> Best hybrid_alpha: 0.2 (Hit Rate: 4.36698718%)

>>> Optimal Hybrid Configuration:
   -> Content Internal Alpha: 0.5
   -> Hybrid Balance Alpha:   0.2
   -> Interpretation: The model leans towards Content Matching (Metadata).





# Save parameters

In [11]:
# Hyperparameters pinned by hand so this cell can run without repeating the slow
# tuning cells.
# NOTE(review): these assignments overwrite whatever the tuning cells produced.
# They match the values reported earlier in this notebook (content alpha 0.5,
# best hybrid_alpha 0.2) — update them if tuning is re-run with different results.
best_content_alpha = 0.5
best_hybrid_alpha = 0.2

final_config = {
    "best_content_alpha": best_content_alpha,
    "best_hybrid_alpha": best_hybrid_alpha,
}

# Create the artifact directory if needed so the dump also works on a fresh
# checkout where '../data/artifacts' does not exist yet.
params_path = '../data/artifacts/best_params.pkl'
os.makedirs(os.path.dirname(params_path), exist_ok=True)
joblib.dump(final_config, params_path)

print(">>> Hyperparameters Saved:")
print(final_config)

>>> Hyperparameters Saved:
{'best_content_alpha': 0.5, 'best_hybrid_alpha': 0.2}


# Retraining on full dataset

In [12]:
# Refit both base models on the full interaction data for the final submission.
print("\n>>> BUILDING FINAL PRODUCTION MODELS...")

# Collaborative model, now built from full_df instead of train_df.
cf_full = CollaborativeRecommender(full_df)
print("   -> Collaborative Model Retrained.")

# Content model on full_df; the content artifacts (tfidf, vectors, item_map) are
# passed through unchanged — per the original note they already cover the full catalog.
content_full = ContentRecommender(full_df, tfidf, vectors, item_map)
print("   -> Content Model Retrained.")


>>> BUILDING FINAL PRODUCTION MODELS...
   -> Applying TF-IDF to Interaction Matrix...
   -> Collaborative Model Retrained.
   -> Content Model Retrained.


In [13]:
# Final hybrid: wraps the two full-data models, with the content alpha read back
# from the saved configuration dict rather than from a loose variable.
hybrid_full = HybridRecommender(cf_full, content_full, content_alpha=final_config['best_content_alpha'])

print(f"   -> Hybrid Model Assembled (Content Alpha: {final_config['best_content_alpha']})")
print("\n>>> Final Model Ready for Submission Generation.")

   -> Hybrid Model Assembled (Content Alpha: 0.5)

>>> Final Model Ready for Submission Generation.


In [15]:
# Produce recommendations for every user id present in the full interaction data.
target_users = full_df['user_id'].unique()

sumbmission = generate_kaggle_submission(
    hybrid_full,
    target_users,
    10,  # presumably the number of recommendations per user — confirm against generate_kaggle_submission
    final_config['best_hybrid_alpha'],
)

>>> Generating predictions for 7838 users...


Generating Submission: 100%|██████████| 7838/7838 [28:16<00:00,  4.62it/s]  


In [16]:
# Display the generated submission table as a visual sanity check before it is saved.
sumbmission

Unnamed: 0,user_id,recommendation
0,0,9759 13261 4512 8096 611 6890 13307 8404 6433 ...
1,1,3222 9926 2553 27 1573 15023 1367 4024 2489 13434
2,2,8999 14990 3055 14991 8474 3062 10715 14824 80...
3,3,1436 14107 794 2553 10372 12087 611 2309 82 2145
4,4,248 7995 3505 6231 4712 1506 3127 13765 11366 ...
...,...,...
7833,7833,5838 10967 4921 667 7127 14778 7306 8498 2350 ...
7834,7834,8999 92 36 14991 7121 3062 3055 114 7178 54
7835,7835,45 9310 9719 12813 92 8369 11045 53 12811 3019
7836,7836,14557 3816 3470 3062 14552 14991 7325 611 3469...


# Save

In [17]:
# Save the submission as CSV (without the DataFrame index).
# The output folder is created first so to_csv does not fail on a fresh checkout
# where '../submissions' does not exist yet.
submission_path = '../submissions/submission.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sumbmission.to_csv(submission_path, index=False)

In [14]:
# Persist all three full-data models with joblib.
# The target folder is created first so the dumps do not fail on a fresh checkout
# where '../models' does not exist yet.
# NOTE(review): hybrid_full was built from cf_full and content_full, so its pickle
# presumably contains copies of both — confirm whether the separate files are needed.
os.makedirs('../models', exist_ok=True)
joblib.dump(cf_full, '../models/cf_model.pkl')
joblib.dump(content_full, '../models/content_model.pkl')
joblib.dump(hybrid_full, '../models/hybrid_model.pkl')

['../models/hybrid_model.pkl']