# Hybrid Personalized Recommender System
This notebook implements a hybrid recommendation system combining collaborative filtering (ALS) and semantic similarity (SentenceTransformer embeddings), with fast evaluation and good MLOps practices.

In [None]:
# Install the third-party packages for this notebook (-q keeps pip's output quiet).
# NOTE(review): faiss-cpu and mlflow are installed here but not imported in the
# cells visible in this notebook — confirm they are still needed.
!pip install -q datasets implicit sentence-transformers faiss-cpu mlflow

## Configuration
Set sample size, hyperparameters, and device settings.

In [None]:
import torch

# --- Data volume ---
SAMPLE_ROWS = 1_000_000        # rows taken from the streamed review dataset
MIN_USER_INTERACTIONS = 2      # users with fewer reviews are dropped before the split

# --- Collaborative filtering (ALS) ---
ALS_FACTORS = 64               # latent factor count passed to the ALS model
ALS_ITERS = 15                 # ALS training iterations

# --- Hybrid scoring / evaluation ---
ALPHA = 0.6                    # weight of the ALS score; (1 - ALPHA) weights semantic similarity
TOP_K = 10                     # recommendation list length used for evaluation

# --- Text embeddings ---
EMB_BATCH = 128                # number of item texts encoded per batch
EMB_CACHE = "/kaggle/working/product_embeddings_aligned.pt"

# Prefer the GPU when torch can see one; otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

## Load the Dataset via Hugging Face Streaming

In [None]:
from datasets import load_dataset
import pandas as pd

# Open the Electronics review config in streaming mode so that only the first
# SAMPLE_ROWS records are pulled instead of the whole split.
# NOTE(review): trust_remote_code=True allows the dataset repository's loading
# script to run locally — confirm this source is trusted.
ds = load_dataset(
    "McAuley-Lab/Amazon-Reviews-2023",
    "raw_review_Electronics",
    split="full",
    streaming=True,
    trust_remote_code=True,
)

# Materialise the streamed prefix as a DataFrame for the pandas steps below.
sample_iter = ds.take(SAMPLE_ROWS)
reviews = pd.DataFrame(sample_iter)

print("Loaded rows:", len(reviews))
reviews.head()

## Preprocess Data
Standardize columns, drop nulls, remove duplicates.

In [None]:
# Map the dataset's column names onto the names used by the rest of the notebook.
rename_map = {"user_id":"reviewerID","asin":"asin","rating":"overall","text":"reviewText","timestamp":"unixReviewTime","title":"title"}

# Only rename a column when the source exists and the target name is still free;
# identity entries (asin, title) therefore never trigger a rename.
applicable_renames = {
    source: target
    for source, target in rename_map.items()
    if source in reviews.columns and target not in reviews.columns
}
reviews.rename(columns=applicable_renames, inplace=True)

# Keep whichever of the expected columns are actually present.
expected_cols = ["reviewerID", "asin", "overall", "reviewText", "unixReviewTime", "title"]
keep_cols = [name for name in expected_cols if name in reviews.columns]

# Rows need a user, an item and a rating; each (user, item) pair is kept once.
reviews = (
    reviews.loc[:, keep_cols]
    .dropna(subset=["reviewerID", "asin", "overall"])
    .drop_duplicates(subset=["reviewerID", "asin"])
    .copy()
)
print("After preprocessing:", reviews.shape)

## Leave-one-out Train/Test Split
Hold out each user's last review (by timestamp) as the test set; all earlier reviews form the training set.

In [None]:
if "unixReviewTime" in reviews.columns:
    reviews["ts"] = pd.to_datetime(reviews["unixReviewTime"], unit='ms', errors='coerce')
else:
    reviews["ts"] = pd.NaT

reviews.sort_values(["reviewerID","ts"], inplace=True)
user_counts = reviews["reviewerID"].value_counts()
valid_users = user_counts[user_counts >= MIN_USER_INTERACTIONS].index
reviews = reviews[reviews["reviewerID"].isin(valid_users)].copy()

test_idx = reviews.groupby("reviewerID").tail(1).index
test_df = reviews.loc[test_idx].copy()
train_df = reviews.drop(test_idx).copy()

print("Train size:", len(train_df), "Test size:", len(test_df))

## Sample for Fast Experimentation

In [None]:
# Upper bound on training rows used for quick experiments; a falsy value
# (0 / None) disables the down-sampling.
SAMPLE_TRAIN_N = 30_000

# Start from the full training frame and swap in a fixed-seed random subset
# only when the frame is larger than the cap.
train_dev = train_df
if SAMPLE_TRAIN_N and len(train_df) > SAMPLE_TRAIN_N:
    train_dev = train_df.sample(n=SAMPLE_TRAIN_N, random_state=42)
train_dev = train_dev.copy()

print("Using sample size:", len(train_dev))

## Build Maps and Interaction Matrix

In [None]:
from scipy.sparse import coo_matrix

# Distinct users / items in order of first appearance; a position in these
# lists is the row / column index in the interaction matrix.
user_ids = train_dev["reviewerID"].unique().tolist()
item_ids = train_dev["asin"].unique().tolist()

# Forward lookups (id -> index) plus the reverse lookup for items.
user_map = dict(zip(user_ids, range(len(user_ids))))
item_map = dict(zip(item_ids, range(len(item_ids))))
index_to_asin = dict(enumerate(item_ids))

# One matrix entry per training row: (user index, item index) -> rating.
rows = train_dev["reviewerID"].map(user_map).astype(int)
cols = train_dev["asin"].map(item_map).astype(int)
vals = train_dev["overall"].astype(float)

interaction_matrix = coo_matrix(
    (vals, (rows, cols)),
    shape=(len(user_map), len(item_map)),
)
# CSR copy of the same matrix for row slicing.
matrix_csr = interaction_matrix.tocsr()

print("Interaction matrix shape:", interaction_matrix.shape)

## Train ALS Model

In [None]:
from implicit.als import AlternatingLeastSquares

# Item x user transpose of the interaction matrix. It is no longer passed to
# fit(); it is kept only in case another cell still refers to `item_user`.
item_user = matrix_csr.T.tocsr()

als = AlternatingLeastSquares(factors=ALS_FACTORS, regularization=0.1, iterations=ALS_ITERS, calculate_training_loss=False)

# implicit >= 0.5 expects fit() to receive the USER x ITEM matrix (the same
# orientation that recommend() is later given as `matrix_csr[uidx]`). Fitting
# on the transpose swaps user_factors and item_factors, so recommend() would
# return user indices instead of item indices.
als.fit(matrix_csr)

# With the user x item orientation these are (n_users, factors) and
# (n_items, factors) respectively.
print("ALS user factors shape:", als.user_factors.shape)
print("ALS item factors shape:", als.item_factors.shape)

## Product Embeddings Aligned to Item Indices

In [None]:
from sentence_transformers import SentenceTransformer
import torch, os

model = SentenceTransformer("all-mpnet-base-v2", device=DEVICE)

# Item ids listed by their matrix column index, so that row i of `embeddings`
# describes the same item as column i of the interaction matrix.
ordered_asins = sorted(item_map, key=item_map.get)

# Represent each item by up to five of its training reviews joined together.
# Missing review text is dropped first; otherwise astype(str) would turn it
# into the literal string "nan" and that word would be embedded.
grouped_text = (
    train_dev.groupby("asin")["reviewText"]
    .apply(lambda s: " ".join(s.dropna().astype(str).values[:5]))
    .to_dict()
)
ordered_texts = [grouped_text.get(a, "") for a in ordered_asins]

# Remove any earlier cache before encoding, so a stale file computed for a
# different item ordering is not left behind if encoding fails part-way.
if os.path.exists(EMB_CACHE):
    os.remove(EMB_CACHE)

# Encode in fixed-size batches and stack the per-batch tensors.
all_embs = []
for i in range(0, len(ordered_texts), EMB_BATCH):
    batch = ordered_texts[i:i+EMB_BATCH]
    emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
    all_embs.append(emb)
embeddings = torch.cat(all_embs, dim=0)

# Make sure the cache directory exists so the save also works outside Kaggle.
cache_dir = os.path.dirname(EMB_CACHE)
if cache_dir:
    os.makedirs(cache_dir, exist_ok=True)
torch.save(embeddings, EMB_CACHE)

print("Aligned embeddings shape:", embeddings.shape)

## Compute User Profile Embeddings

In [None]:
# Collect every user's training items in a single pass. Filtering the whole
# frame once per user costs O(users x rows); one groupby gives the same lists.
items_by_user = (
    train_dev.groupby("reviewerID", sort=False)["asin"].agg(list).to_dict()
)

# A user's profile is the mean embedding of the items they interacted with,
# kept with a leading dimension of 1 (keepdim=True) for later similarity calls.
user_profile = {}
for user_id in user_map:
    idxs = [item_map[a] for a in items_by_user.get(user_id, ()) if a in item_map]
    if not idxs:
        # No embeddable items for this user: leave them without a profile.
        continue
    user_profile[user_id] = embeddings[idxs].mean(dim=0, keepdim=True)

print("User profiles computed:", len(user_profile))

## Hybrid Recommendation Function

In [None]:
from collections import Counter
from sentence_transformers import util as st_util

def hybrid_recommend(user_id, topn=10, alpha=ALPHA):
    """Recommend items by blending ALS scores with semantic similarity.

    ALS proposes ``topn * 5`` candidates; each candidate is re-scored as
    ``alpha * als_score + (1 - alpha) * cosine(user_profile, item_embedding)``
    and the best ``topn`` are returned.

    Args:
        user_id: Raw user identifier (a key of ``user_map`` for known users).
        topn: Number of item ids to return.
        alpha: Weight of the ALS score; the semantic score gets ``1 - alpha``.

    Returns:
        A list of item ids (asins). Falls back to the ``topn`` most frequent
        items in ``train_dev`` when the user is unknown, when ALS fails, or
        when no candidate maps back to an item id.
    """
    import warnings

    def _popular_items():
        # Built lazily: counting the whole training frame is only worth doing
        # on the fallback paths, not on every successful call.
        return [p for p, _ in Counter(train_dev["asin"]).most_common(topn)]

    if user_id not in user_map:
        return _popular_items()

    uidx = user_map[user_id]
    try:
        item_indices, cf_scores = als.recommend(uidx, matrix_csr[uidx], N=topn * 5)
    except Exception as exc:
        # Best-effort behaviour is kept, but the failure is surfaced rather
        # than silently hidden behind the popularity fallback.
        warnings.warn(
            f"ALS recommend failed ({exc!r}); falling back to popular items",
            RuntimeWarning,
            stacklevel=2,
        )
        return _popular_items()

    uprofile = user_profile.get(user_id)
    n_embedded = embeddings.shape[0]

    candidates = [int(idx) for idx in item_indices]

    # Semantic score defaults to 0.0 when the user has no profile or the
    # candidate index has no embedding row.
    sem_scores = [0.0] * len(candidates)
    if uprofile is not None:
        valid_pos = [
            pos for pos, idx in enumerate(candidates) if 0 <= idx < n_embedded
        ]
        if valid_pos:
            # One batched cosine-similarity call instead of one per candidate.
            cand_embs = embeddings[[candidates[pos] for pos in valid_pos]]
            sims = st_util.cos_sim(uprofile, cand_embs).reshape(-1).tolist()
            for pos, sim in zip(valid_pos, sims):
                sem_scores[pos] = sim

    hybrid_scores = [
        (idx, alpha * float(cf_s) + (1.0 - alpha) * sem_s)
        for idx, cf_s, sem_s in zip(candidates, cf_scores, sem_scores)
    ]
    hybrid_scores.sort(key=lambda pair: -pair[1])

    chosen_asins = [
        index_to_asin[idx] for idx, _ in hybrid_scores[:topn] if idx in index_to_asin
    ]
    return chosen_asins or _popular_items()

## Evaluate System Performance
Evaluate using Precision@K, Recall@K, NDCG@K, Hit Rate, and MAP@K.

In [None]:
import random
import numpy as np

def precision_at_k(recs, true_items, k):
    """Return the fraction of the k slots filled by a relevant item."""
    if k <= 0:
        return 0.0
    return sum(1 for item in recs[:k] if item in true_items) / k


def recall_at_k(recs, true_items, k):
    """Return the fraction of relevant items found in the top k."""
    if not true_items:
        return 0.0
    return sum(1 for item in recs[:k] if item in true_items) / len(true_items)


def ndcg_at_k(recs, true_items, k):
    """Return binary-relevance NDCG over the top k recommendations."""
    dcg = sum(
        1.0 / np.log2(rank + 2)
        for rank, item in enumerate(recs[:k])
        if item in true_items
    )
    # Ideal ranking places every relevant item (up to k) at the top.
    idcg = sum(1.0 / np.log2(rank + 2) for rank in range(min(len(true_items), k)))
    return float(dcg / idcg) if idcg > 0 else 0.0


def hit_rate_at_k(recs, true_items, k):
    """Return 1.0 when any relevant item is in the top k, else 0.0."""
    return 1.0 if any(item in true_items for item in recs[:k]) else 0.0


def average_precision_at_k(recs, true_items, k):
    """Return average precision over the top k recommendations."""
    hit_count, precision_sum = 0, 0.0
    for rank, item in enumerate(recs[:k], start=1):
        if item in true_items:
            hit_count += 1
            precision_sum += hit_count / rank
    denom = min(len(true_items), k)
    return precision_sum / denom if denom else 0.0


random.seed(42)
# TOP_K is deliberately not reassigned here: the value from the configuration
# cell is used so the two cannot drift apart.

# Only users that have both an ALS row and a semantic profile are evaluated.
eval_users = [u for u in test_df["reviewerID"].unique() if u in user_profile and u in user_map]
eval_sample = random.sample(eval_users, min(1000, len(eval_users)))

# Held-out item per user, looked up once instead of scanning test_df per user.
held_out_item = (
    test_df.drop_duplicates(subset="reviewerID")
    .set_index("reviewerID")["asin"]
    .to_dict()
)

precisions, recalls, ndcgs, hits, maps = [], [], [], [], []

for u in eval_sample:
    recs = hybrid_recommend(u, topn=TOP_K, alpha=ALPHA)
    true_items = {held_out_item[u]}
    precisions.append(precision_at_k(recs, true_items, TOP_K))
    recalls.append(recall_at_k(recs, true_items, TOP_K))
    ndcgs.append(ndcg_at_k(recs, true_items, TOP_K))
    hits.append(hit_rate_at_k(recs, true_items, TOP_K))
    maps.append(average_precision_at_k(recs, true_items, TOP_K))

print("\nEvaluation Results:")
if not eval_sample:
    # Averaging empty lists would only produce NaN plus a NumPy warning.
    print("No evaluable users: no test user has both an ALS row and a profile.")
else:
    print(f"Precision@{TOP_K}: {np.mean(precisions):.6f}")
    print(f"Recall@{TOP_K}: {np.mean(recalls):.6f}")
    print(f"NDCG@{TOP_K}: {np.mean(ndcgs):.6f}")
    print(f"HitRate@{TOP_K}: {np.mean(hits):.6f}")
    print(f"MAP@{TOP_K}: {np.mean(maps):.6f}")

## Sample Recommendations for Example Users
Showcase personalized recommendations for a few users.

In [None]:
# Pick up to five evaluated users. The size is clamped because random.sample
# raises ValueError when asked for more elements than the population holds.
sample_users = random.sample(eval_users, min(5, len(eval_users)))

for u in sample_users:
    recommendations = hybrid_recommend(u, topn=TOP_K, alpha=ALPHA)
    print(f"Recommendations for user {u}: {recommendations}")