# H&M Recommendation Models: All Experiments

This notebook contains all experiments from basic baselines to a hybrid model.
Run the cells sequentially from top to bottom.

## 1. Setup

In [None]:
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import torch
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
import glob

from data import HnMLightningDataModule
from models import NeuMF
from tqdm.notebook import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

## 2. Load Data

In [None]:
# Build the data module and run its setup step; later cells read
# dm.train_df, dm.val_df, dm.user_map and dm.item_map from it.
dm = HnMLightningDataModule(
    data_dir='../data',
    batch_size=2048,
    num_workers=4,
)
dm.setup()

# Article metadata indexed by article_id; the id column is read as a
# string rather than being parsed as a number.
articles_df = pd.read_csv(
    '../data/articles.csv',
    dtype={'article_id': str},
).set_index('article_id')

## 3. Evaluation Metric (MAP@12)

In [None]:
def apk(actual, predicted, k=12):
    """Compute average precision at k for one user.

    Args:
        actual: Collection of relevant items.
        predicted: Ranked sequence of recommended items, best first.
        k: Cut-off; only the first ``k`` predictions are scored.

    Returns:
        The average precision at ``k`` as a float. Returns 0.0 when
        ``actual`` is empty or when ``k`` is not positive.
    """
    # A non-positive k has no meaningful score: k == 0 would divide by zero
    # below, and a negative k would slice from the tail and yield a negative
    # denominator.
    if not actual or k <= 0:
        return 0.0

    top_k = predicted[:k]
    score = 0.0
    num_hits = 0
    for rank, item in enumerate(top_k, start=1):
        # Credit a relevant item only at its first occurrence so repeated
        # predictions cannot inflate the score.
        if item in actual and item not in top_k[:rank - 1]:
            num_hits += 1
            score += num_hits / rank
    return score / min(len(actual), k)

def mapk(actual, predicted, k=12):
    """Average the per-user ``apk`` scores over paired inputs.

    Args:
        actual: Sequence of relevant-item collections, one per user.
        predicted: Sequence of ranked prediction lists, paired by position
            with ``actual``.
        k: Cut-off forwarded to ``apk``.

    Returns:
        The mean of the per-user scores, or 0.0 when either input is empty.
    """
    if actual and predicted:
        per_user_scores = [
            apk(truth, recs, k) for truth, recs in zip(actual, predicted)
        ]
        return np.mean(per_user_scores)
    return 0.0

# Validation ground truth: each customer_id mapped to the list of
# article_ids found on that customer's rows of dm.val_df.
val_true_items = (
    dm.val_df
    .groupby('customer_id')['article_id']
    .apply(list)
    .to_dict()
)
val_users = list(val_true_items)
# Built in val_users order, so actuals[i] is the ground truth of val_users[i].
actuals = [val_true_items.get(customer, []) for customer in val_users]

## 4. Model 1: Most Popular Items

In [None]:
# Baseline: recommend the 12 most frequent article_ids of the training
# frame to every validation user.
popular_items = dm.train_df['article_id'].value_counts().head(12).index.tolist()
# Every entry refers to the same list object; it is only read afterwards.
popular_predictions = [popular_items for _ in val_users]
map_popular = mapk(actuals, popular_predictions, k=12)
print(f"[Result] Most Popular Items MAP@12: {map_popular:.6f}")

## 5. Model 2: NeuMF

### 5.1. Training

In [None]:
# Size the model from the user/item counts exposed by the data module.
model = NeuMF(
    num_users=dm.num_users,
    num_items=dm.num_items,
)

# Keep only the single checkpoint with the lowest monitored 'val_loss'.
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath='../models/',
    filename='best-neumf-model',
    save_top_k=1,
    mode='min',
)

# One epoch only; the accelerator is picked automatically.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator='auto',
    logger=True,
    callbacks=[checkpoint_callback],
)
trainer.fit(model, dm)

### 5.2. Evaluation

In [None]:
# Reload the checkpoint chosen by the ModelCheckpoint callback and put the
# model in eval mode on GPU when one is available, otherwise on CPU.
best_model_path = checkpoint_callback.best_model_path
print(f"Loading best NeuMF model from: {best_model_path}")
neumf_model = NeuMF.load_from_checkpoint(best_model_path)
neumf_model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
neumf_model.to(device)

# keys() and values() of the same dict iterate in matching order, so
# position j of all_item_indices corresponds to all_item_ids_mapped[j].
all_item_ids_mapped = list(dm.item_map.keys())
all_item_indices = torch.tensor(list(dm.item_map.values()), dtype=torch.long).to(device)

neumf_predictions = []
# Inference only: disable autograd once for the whole loop.
with torch.no_grad():
    for user_id_str in tqdm(val_users, desc="Generating NeuMF Predictions"):
        user_idx = dm.user_map.get(user_id_str)
        if user_idx is None:
            # Customer has no entry in dm.user_map: fall back to the
            # popularity baseline.
            neumf_predictions.append(popular_items)
            continue

        # Repeat the user index once per item directly on the target device
        # (same dtype/device as all_item_indices) instead of building a
        # Python list of num_items ints and copying it over for every user.
        user_tensor = torch.full_like(all_item_indices, int(user_idx))
        scores = neumf_model(user_tensor, all_item_indices)

        # Positions of the 12 highest scores, mapped back to article ids.
        top_indices = torch.argsort(scores, descending=True)[:12]
        neumf_predictions.append([all_item_ids_mapped[j] for j in top_indices.cpu().numpy()])

map_neumf = mapk(actuals, neumf_predictions, k=12)
print(f"[Result] NeuMF Model MAP@12: {map_neumf:.6f}")

## 6. Model 3: Content-Based Filtering

In [None]:
# Content Feature Engineering
for col in ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'department_name', 'index_name', 'section_name', 'garment_group_name']:
    articles_df[col] = articles_df[col].fillna('')
# Join all metadata columns of a row into one text string. A 'content' column
# left over from an earlier run of this cell is excluded, so re-running the
# cell does not fold the previous text back into the new one.
# NOTE(review): every remaining column is joined, not only the ones filled
# above — confirm that is the intended feature set.
articles_df['content'] = (
    articles_df
    .drop(columns='content', errors='ignore')
    .apply(lambda row: ' '.join(row.astype(str)), axis=1)
)

# TF-IDF Vectorization
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(articles_df['content'])
# Row position of each article in tfidf_matrix (same order as the index).
article_id_to_idx = {article_id: pos for pos, article_id in enumerate(articles_df.index)}

# Generate Predictions
user_history = dm.train_df.groupby('customer_id')['article_id'].apply(list).to_dict()
content_predictions = []
for user_id in tqdm(val_users, desc="Generating Content-Based Predictions"):
    history_articles = user_history.get(user_id, [])
    history_indices = [article_id_to_idx[art_id] for art_id in history_articles if art_id in article_id_to_idx]
    if history_indices:
        # User profile = mean TF-IDF vector of the articles in the history.
        # The mean of a scipy sparse matrix comes back as np.matrix, which
        # recent scikit-learn versions reject, so convert it to an ndarray.
        user_profile_vector = np.asarray(tfidf_matrix[history_indices].mean(axis=0))
        cosine_sims = cosine_similarity(user_profile_vector, tfidf_matrix).flatten()
        # Take the 50 most similar articles, then drop those already in the
        # history; a set keeps each membership test constant-time.
        top_indices = np.argsort(cosine_sims)[::-1][:50]
        purchased = set(history_articles)
        rec_items = [articles_df.index[i] for i in top_indices if articles_df.index[i] not in purchased]
        content_predictions.append(rec_items[:12])
    else:
        # No usable history: fall back to the popularity baseline.
        content_predictions.append(popular_items)

map_content = mapk(actuals, content_predictions, k=12)
print(f"[Result] Content-Based Filtering MAP@12: {map_content:.6f}")

## 7. Model 4: Hybrid (NeuMF + Content-Based)

In [None]:
hybrid_predictions = []
alpha = 0.5 # Weight for NeuMF model
scaler = MinMaxScaler()  # NOTE(review): not used anywhere in this cell

for i, user_id_str in enumerate(tqdm(val_users, desc="Generating Hybrid Predictions")):
    # Rank-based points from the NeuMF list computed earlier:
    # 12 for the first item, 11 for the second, and so on.
    neumf_recs = neumf_predictions[i]
    neumf_scores = {art_id: (12-rank) for rank, art_id in enumerate(neumf_recs)}

    # Same rank-based points for the content-based list computed earlier.
    content_recs = content_predictions[i]
    content_scores = {art_id: (12-rank) for rank, art_id in enumerate(content_recs)}

    # Candidate pool in a fixed order: NeuMF items first, then content-based
    # items not already present. Iterating a set of strings here would make
    # the order of equal-scored items depend on the hash seed, so the
    # reported metric could change between runs.
    all_recs = list(dict.fromkeys([*neumf_recs, *content_recs]))

    # Blend the two point totals; an item missing from a list gets 0 from it.
    hybrid_scores = {
        art_id: alpha * neumf_scores.get(art_id, 0) + (1 - alpha) * content_scores.get(art_id, 0)
        for art_id in all_recs
    }

    # sorted() is stable, so ties keep the candidate-pool order above.
    sorted_hybrid = sorted(hybrid_scores.items(), key=lambda item: item[1], reverse=True)[:12]
    hybrid_predictions.append([art_id for art_id, score in sorted_hybrid])

map_hybrid = mapk(actuals, hybrid_predictions, k=12)
print(f"[Result] Hybrid Model MAP@12: {map_hybrid:.6f}")