# Hybrid Personalized Recommender System
This notebook implements a hybrid recommendation system combining collaborative filtering (ALS) and semantic similarity (SentenceTransformer embeddings), with fast evaluation and good MLOps practices.

In [1]:
!pip install -q datasets implicit sentence-transformers faiss-cpu mlflow

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m80.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m50.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Configuration
Set sample size, hyperparameters, and device settings.

In [2]:
SAMPLE_ROWS = 1000_000
MIN_USER_INTERACTIONS = 2
ALS_FACTORS = 64
ALS_ITERS = 15
ALPHA = 0.6
TOP_K = 10
EMB_BATCH = 128
EMB_CACHE = "/kaggle/working/product_embeddings_aligned.pt"
DEVICE = "cuda" if __import__('torch').cuda.is_available() else "cpu"
print("Device:", DEVICE)

Device: cuda


## Load Dataset from Hugging Face Streaming

In [3]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("McAuley-Lab/Amazon-Reviews-2023", "raw_review_Electronics", split="full", streaming=True, trust_remote_code=True)
sample_iter = ds.take(SAMPLE_ROWS)
reviews = pd.DataFrame(sample_iter)
print("Loaded rows:", len(reviews))
reviews.head()

README.md: 0.00B [00:00, ?B/s]

Amazon-Reviews-2023.py: 0.00B [00:00, ?B/s]

Loaded rows: 1000000


Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,3.0,Smells like gasoline! Going back!,First & most offensive: they reek of gasoline ...,[{'small_image_url': 'https://m.media-amazon.c...,B083NRGZMM,B083NRGZMM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1658185117948,0,True
1,1.0,Didn’t work at all lenses loose/broken.,These didn’t work. Idk if they were damaged in...,[],B07N69T6TM,B07N69T6TM,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1592678549731,0,True
2,5.0,Excellent!,I love these. They even come with a carry case...,[],B01G8JO5F2,B01G8JO5F2,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1523093017534,0,True
3,5.0,Great laptop backpack!,I was searching for a sturdy backpack for scho...,[],B001OC5JKY,B001OC5JKY,AGGZ357AO26RQZVRLGU4D4N52DZQ,1290278495000,18,True
4,5.0,Best Headphones in the Fifties price range!,I've bought these headphones three times becau...,[],B013J7WUGC,B07CJYMRWM,AG2L7H23R5LLKDKLBEF2Q3L2MVDA,1676601581238,0,True


## Preprocess Data
Standardize columns, drop nulls, remove duplicates.

In [4]:
rename_map = {"user_id":"reviewerID","asin":"asin","rating":"overall","text":"reviewText","timestamp":"unixReviewTime","title":"title"}
for k,v in rename_map.items():
    if k in reviews.columns and v not in reviews.columns:
        reviews.rename(columns={k:v}, inplace=True)

keep_cols = [c for c in ["reviewerID","asin","overall","reviewText","unixReviewTime","title"] if c in reviews.columns]
reviews = reviews[keep_cols].copy()
reviews.dropna(subset=["reviewerID","asin","overall"], inplace=True)
reviews.drop_duplicates(subset=["reviewerID","asin"], inplace=True)
print("After preprocessing:", reviews.shape)

After preprocessing: (997501, 6)


## Leave-one-out Train/Test Split
Hold out last review per user for test.

In [5]:
if "unixReviewTime" in reviews.columns:
    reviews["ts"] = pd.to_datetime(reviews["unixReviewTime"], unit='ms', errors='coerce')
else:
    reviews["ts"] = pd.NaT

reviews.sort_values(["reviewerID","ts"], inplace=True)
user_counts = reviews["reviewerID"].value_counts()
valid_users = user_counts[user_counts >= MIN_USER_INTERACTIONS].index
reviews = reviews[reviews["reviewerID"].isin(valid_users)].copy()

test_idx = reviews.groupby("reviewerID").tail(1).index
test_df = reviews.loc[test_idx].copy()
train_df = reviews.drop(test_idx).copy()

print("Train size:", len(train_df), "Test size:", len(test_df))

Train size: 812259 Test size: 126792


## Sample for Fast Experimentation

In [6]:
SAMPLE_TRAIN_N = 30_000
if SAMPLE_TRAIN_N and len(train_df) > SAMPLE_TRAIN_N:
    train_dev = train_df.sample(SAMPLE_TRAIN_N, random_state=42).copy()
else:
    train_dev = train_df.copy()

print("Using sample size:", len(train_dev))

Using sample size: 30000


## Build Maps and Interaction Matrix

In [7]:
from scipy.sparse import coo_matrix

user_ids = train_dev["reviewerID"].unique().tolist()
item_ids = train_dev["asin"].unique().tolist()
user_map = {u:i for i,u in enumerate(user_ids)}
item_map = {p:i for i,p in enumerate(item_ids)}
index_to_asin = {i:p for p,i in item_map.items()}

rows = train_dev["reviewerID"].map(user_map).astype(int)
cols = train_dev["asin"].map(item_map).astype(int)
vals = train_dev["overall"].astype(float)

interaction_matrix = coo_matrix((vals, (rows, cols)), shape=(len(user_ids), len(item_ids)))
matrix_csr = interaction_matrix.tocsr()

print("Interaction matrix shape:", interaction_matrix.shape)

Interaction matrix shape: (22054, 23095)


## Train ALS Model

In [8]:
from implicit.als import AlternatingLeastSquares

item_user = matrix_csr.T.tocsr()
als = AlternatingLeastSquares(factors=ALS_FACTORS, regularization=0.1, iterations=ALS_ITERS, calculate_training_loss=False)
als.fit(item_user)

print("ALS user factors shape:", als.user_factors.shape)
print("ALS item factors shape:", als.item_factors.shape)

  check_blas_config()
  check_blas_config()


  0%|          | 0/15 [00:00<?, ?it/s]

ALS user factors shape: (23095, 64)
ALS item factors shape: (22054, 64)


## Product Embeddings Aligned

In [9]:
from sentence_transformers import SentenceTransformer
import torch, os

model = SentenceTransformer("all-mpnet-base-v2", device=DEVICE)
ordered_asins = [None] * len(item_map)
for asin, idx in item_map.items():
    ordered_asins[idx] = asin

grouped_text = train_dev.groupby("asin")["reviewText"].apply(lambda s: " ".join(s.astype(str).values[:5])).to_dict()
ordered_texts = [ grouped_text.get(a, "") for a in ordered_asins ]

if os.path.exists(EMB_CACHE):
    os.remove(EMB_CACHE)

all_embs = []
for i in range(0, len(ordered_texts), EMB_BATCH):
    batch = ordered_texts[i:i+EMB_BATCH]
    emb = model.encode(batch, convert_to_tensor=True, show_progress_bar=False)
    all_embs.append(emb)
embeddings = torch.cat(all_embs, dim=0)
torch.save(embeddings, EMB_CACHE)

print("Aligned embeddings shape:", embeddings.shape)

2025-09-14 21:18:05.415227: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757884685.602923      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757884685.666228      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Aligned embeddings shape: torch.Size([23095, 768])


## Compute User Profile Embeddings

In [10]:
user_profile = {}
for user_id in user_map.keys():
    user_items = train_dev[train_dev["reviewerID"] == user_id]["asin"].tolist()
    idxs = [item_map[a] for a in user_items if a in item_map]
    if not idxs:
        continue
    emb_stack = embeddings[idxs]
    user_profile[user_id] = emb_stack.mean(dim=0, keepdim=True)

print("User profiles computed:", len(user_profile))

User profiles computed: 22054


## Hybrid Recommendation Function

In [11]:
from collections import Counter
from sentence_transformers import util as st_util

def hybrid_recommend(user_id, topn=10, alpha=ALPHA):
    popular_items = [p for p,_ in Counter(train_dev["asin"]).most_common(topn)]

    if user_id not in user_map:
        return popular_items

    uidx = user_map[user_id]
    try:
        item_indices, cf_scores = als.recommend(uidx, matrix_csr[uidx], N=topn * 5)
    except Exception:
        return popular_items

    uprofile = user_profile.get(user_id, None)

    hybrid_scores = []
    for idx, cf_s in zip(item_indices, cf_scores):
        if (idx < 0) or (idx >= embeddings.shape[0]):
            sem_s = 0.0
        else:
            if uprofile is None:
                sem_s = 0.0
            else:
                sem_s = float(st_util.cos_sim(uprofile, embeddings[idx].unsqueeze(0)).cpu().numpy()[0][0])

        final_score = alpha * float(cf_s) + (1.0 - alpha) * float(sem_s)
        hybrid_scores.append((idx, final_score))

    hybrid_scores.sort(key=lambda x: -x[1])
    chosen_asins = [index_to_asin[idx] for idx, _ in hybrid_scores[:topn] if idx in index_to_asin]

    return chosen_asins or popular_items

## Evaluate System Performance
Evaluate using Precision@K, Recall@K, NDCG@K, Hit Rate, and MAP@K.

In [12]:
import math

def precision_at_k(recommended, ground_truth, k):
    return sum([1 for r in recommended[:k] if r in ground_truth]) / k if recommended else 0.0

def recall_at_k(recommended, ground_truth, k):
    return sum([1 for r in recommended[:k] if r in ground_truth]) / len(ground_truth) if ground_truth else 0.0

def ndcg_at_k(recommended, ground_truth, k):
    dcg = sum((1.0 / math.log2(i + 2)) for i, r in enumerate(recommended[:k]) if r in ground_truth)
    idcg = sum((1.0 / math.log2(i + 2)) for i in range(min(len(ground_truth), k)))
    return dcg / idcg if idcg > 0 else 0.0

def hit_rate_at_k(recommended, ground_truth, k):
    return 1.0 if any(r in ground_truth for r in recommended[:k]) else 0.0

def average_precision_at_k(recommended, ground_truth, k):
    if not ground_truth:
        return 0.0
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(recommended[:k]):
        if p in ground_truth:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    return score / min(len(ground_truth), k)


In [13]:
import random
import numpy as np

random.seed(42)
TOP_K = 10

eval_users = [u for u in test_df["reviewerID"].unique() if u in user_profile and u in user_map]
eval_sample = random.sample(eval_users, min(1000, len(eval_users)))

precisions, recalls, ndcgs, hits, maps = [], [], [], [], []

for u in eval_sample:
    recs = hybrid_recommend(u, topn=TOP_K, alpha=ALPHA)
    true_items = set([test_df.loc[test_df["reviewerID"] == u, "asin"].iloc[0]])
    precisions.append(precision_at_k(recs, true_items, TOP_K))
    recalls.append(recall_at_k(recs, true_items, TOP_K))
    ndcgs.append(ndcg_at_k(recs, true_items, TOP_K))
    hits.append(hit_rate_at_k(recs, true_items, TOP_K))
    maps.append(average_precision_at_k(recs, true_items, TOP_K))

print("\nEvaluation Results:")
print(f"Precision@{TOP_K}: {np.mean(precisions):.6f}")
print(f"Recall@{TOP_K}: {np.mean(recalls):.6f}")
print(f"NDCG@{TOP_K}: {np.mean(ndcgs):.6f}")
print(f"HitRate@{TOP_K}: {np.mean(hits):.6f}")
print(f"MAP@{TOP_K}: {np.mean(maps):.6f}")


Evaluation Results:
Precision@10: 0.000200
Recall@10: 0.002000
NDCG@10: 0.000688
HitRate@10: 0.002000
MAP@10: 0.000311


## Sample Recommendations for Example Users
Showcase personalized recommendations for a few users.

In [14]:
sample_users = random.sample(eval_users, 5)

for u in sample_users:
    recommendations = hybrid_recommend(u, topn=TOP_K, alpha=ALPHA)
    print(f"Recommendations for user {u}: {recommendations}")

Recommendations for user AHAYA6QEG3JHAMOYBO4E5DYRVIFQ: ['B0177M4SLK', 'B003YNS0W0', 'B00B3UR496', 'B01IG5J27M', 'B07W84R76K', 'B003EM2WAW', 'B07JHLB3ZD', 'B07MJW5BXZ', 'B01K6PBIO0', 'B07H9VJ15W']
Recommendations for user AGREHSWE5OWSM7PSQ2DRUWQMPU4A: ['B075M9PT6H', 'B004G7U5LC', 'B003L137Y6', 'B00C7T9AJ4', 'B00FRHTTJE', 'B00BW3FUOK', 'B01IG5J27M', 'B00AJSWC0Y', 'B07B9VFT48', 'B00OOJPAGW']
Recommendations for user AGIFLB6HGS3DEZMIYID4N2ESTMVQ: ['B00BWHILCY', 'B012VZ7MUM', 'B0896TPV9P', 'B01IG5J27M', 'B00FRHTTJE', 'B07L3BQ2ZX', 'B00HFRYAGQ', 'B007R5YFS4', 'B003O973OA', 'B07QFWNMJD']
Recommendations for user AEE7BHAFPMY75OFCVUMW2QMTSGEA: ['B000083JZ1', 'B012BMK6C6', 'B07NC9J2M5', 'B00L1UEZS6', 'B01DBGVB7K', 'B00GY0UFFA', 'B074M2HLL4', 'B07T3FT5KJ', 'B07F3GQCMJ', 'B07TPLZY74']
Recommendations for user AF5C2VN2T3YFMG37IF6UNYL32GCA: ['B08CR35RC5', 'B009SKPVO8', 'B00GNRA01S', 'B07CQ2NGB2', 'B00004YK10', 'B004C3AW40', 'B08DLSDK1H', 'B00002NAX7', 'B001W28L2Y', 'B0894PX3HP']
