**Hybrid Recommender Systems**

Combine heterogeneous signals into a Hybrid Recommender System. Implement at least one hybrid strategy (e.g., weighted blending, candidate generation + reranking), and explicitly discuss the optimization trade-offs, why the performance improves or degrades, and who benefits from the hybrid structure

We combined BPR-MF with an Enhanced CB recommender. We implemented two distinct architectural strategies:
1. score-level fusion, where predictions are normalized and combined using a tuned parameter $\alpha$.
2. two-stage pipeline where BPR retrieves the top 100 candidates, and the CB model reranks them for the final top 10.

We then evaluated the models on the test set and performed a segmented analysis on "Cold" vs "Warm" users.

In [2]:
# imports
import sys
sys.path.append("../src")
sys.path.append("../src/evaluation")
sys.path.append("../src/models")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numba import njit
from collections import defaultdict
from utils.data_loader import MovieLensDataLoader
from evaluator import RecommenderEvaluator
from content_based import build_tfidf_features, EnhancedContentBasedRecommender
from numba import prange

# Load the train/validation/test rating splits and the raw movie table.
loader = MovieLensDataLoader()
train, val, test = loader.load_splits()
# Only the middle element of the raw-data tuple (bound to `movies`) is used below.
_, movies, _ = loader.load_raw_data()

# Size model arrays as (max id + 1) so raw ids can be used directly as row indices.
# NOTE(review): derived from the train split only; assumes no larger ids appear
# in val/test or in `movies` -- confirm.
n_users = train['user_id'].max() + 1
n_items = train['item_id'].max() + 1

Train: 797,758 | Val: 97,383 | Test: 105,068
Loaded 1,000,209 ratings
Loaded 3,883 movies
Loaded 6,040 users


In [3]:
# from 07_bpr
@njit(fastmath=True)
def bpr_epoch(user_ids, pos_items_flat, pos_offsets, pos_counts,
              n_items, user_factors, item_factors, user_bias, item_bias,
              lr, reg, n_samples):
    """Run `n_samples` stochastic BPR updates in place and return the mean log-likelihood.

    Each step draws a random user, one of that user's positive items and a
    (rejection-sampled) negative item, then takes one gradient step on
    ``log(sigmoid(x_ui - x_uj))`` with L2 regularisation.

    Parameters:
        user_ids: maps position ``u`` to the id used to index the factor/bias arrays.
        pos_items_flat: all users' positive item ids concatenated; the positives of
            position ``u`` are ``pos_items_flat[pos_offsets[u] : pos_offsets[u] + pos_counts[u]]``.
        pos_offsets, pos_counts: per-position start offset and number of positives.
        n_items: size of the item id range negatives are drawn from.
        user_factors, item_factors, item_bias: model parameters, updated in place.
        user_bias: only read; it is never updated here and cancels out in ``x_ui - x_uj``.
        lr, reg: learning rate and L2 regularisation strength.
        n_samples: number of sampling steps (also the denominator of the returned mean).

    Returns:
        Sum of ``log(sigmoid(x_uij) + 1e-10)`` over processed samples divided by
        ``n_samples``. This is a log-likelihood (<= ~0), so values closer to zero
        are better. Steps skipped for users without positives still count in the
        denominator.
    """
    
    loss = 0.0
    n_users = len(pos_offsets)
    for _ in range(n_samples):
        # Pick a user position uniformly at random; users without positives are skipped.
        u = np.random.randint(0, n_users)
        uid = user_ids[u]
        count = pos_counts[u]
        if count == 0:
            continue
        offset = pos_offsets[u]
        # Positive item: uniform draw from this user's positives.
        pi = pos_items_flat[offset + np.random.randint(0, count)]
        # Negative item: uniform draw from all items, re-drawn while it collides
        # with one of the user's positives (linear scan over the positives).
        ni = np.random.randint(0, n_items)
        attempts = 0
        while attempts < 10:
            found = False
            for k in range(count):
                if pos_items_flat[offset + k] == ni:
                    found = True
                    break
            if not found:
                break
            ni = np.random.randint(0, n_items)
            attempts += 1
        # NOTE(review): after 10 collisions the last re-draw is used without being
        # checked, so a positive item can occasionally act as the negative.
        x_ui = user_bias[uid] + item_bias[pi] + np.dot(user_factors[uid], item_factors[pi])
        x_uj = user_bias[uid] + item_bias[ni] + np.dot(user_factors[uid], item_factors[ni])
        x_uij = x_ui - x_uj
        # sig = sigmoid(-x_uij): the gradient multiplier of log(sigmoid(x_uij)).
        sig = 1.0 / (1.0 + np.exp(x_uij))
        # The 1e-10 guards against log(0) when the sigmoid underflows.
        loss += np.log(1.0 / (1.0 + np.exp(-x_uij)) + 1e-10)
        # Gradient ascent steps with L2 shrinkage.
        # NOTE(review): the two item-factor updates below read user_factors[uid]
        # AFTER it has been updated on the next line, not its pre-step value.
        user_factors[uid] += lr * (sig * (item_factors[pi] - item_factors[ni]) - reg * user_factors[uid])
        item_factors[pi] += lr * (sig * user_factors[uid] - reg * item_factors[pi])
        item_factors[ni] += lr * (-sig * user_factors[uid] - reg * item_factors[ni])
        item_bias[pi] += lr * (sig - reg * item_bias[pi])
        item_bias[ni] += lr * (-sig - reg * item_bias[ni])
    return loss / n_samples

In [4]:
# from 07_bpr
class BPRMF:
    """Matrix-factorisation recommender trained with the pairwise BPR objective.

    Holds user/item latent factors and biases as dense NumPy arrays indexed
    directly by user id / item id, so ids must lie in ``[0, n_users)`` and
    ``[0, n_items)``.
    """

    def __init__(self, n_users, n_items, n_factors=64, lr=0.01, reg=0.001, n_epochs=40):
        """Allocate and randomly initialise the model parameters.

        Parameters:
            n_users, n_items: sizes of the user and item id ranges.
            n_factors: latent dimensionality.
            lr: learning rate passed to ``bpr_epoch``.
            reg: L2 regularisation strength passed to ``bpr_epoch``.
            n_epochs: number of training epochs run by ``fit``.
        """
        self.n_users, self.n_items = n_users, n_items
        self.n_factors, self.lr, self.reg, self.n_epochs = n_factors, lr, reg, n_epochs
        # Seeds NumPy's *global* RNG so the factor initialisation is repeatable.
        # NOTE(review): numba-compiled code reportedly keeps its own RNG state, so
        # this probably does not make the sampling inside bpr_epoch reproducible
        # -- confirm against the numba docs.
        np.random.seed(42)
        self.user_factors = np.random.normal(0, 0.01, (n_users, n_factors))
        self.item_factors = np.random.normal(0, 0.01, (n_items, n_factors))
        self.user_bias = np.zeros(n_users)
        self.item_bias = np.zeros(n_items)

    def fit(self, user_pos_items):
        """Train the model in place.

        Parameters:
            user_pos_items: mapping ``user_id -> iterable of positive item ids``.
                Users missing from the mapping are treated as having no positives.

        Prints the value returned by ``bpr_epoch`` every 10th epoch and for the
        final epoch. With ``n_epochs == 0`` nothing is trained or printed.
        """
        # Flatten the per-user positives into one array plus offsets/counts,
        # which is the layout bpr_epoch expects.
        per_user = [list(user_pos_items.get(uid, [])) for uid in range(self.n_users)]
        uids = np.arange(self.n_users, dtype=np.int64)
        cnts = np.array([len(items) for items in per_user], dtype=np.int64)
        offs = np.zeros(len(per_user), dtype=np.int64)
        offs[1:] = np.cumsum(cnts)[:-1]
        pf = np.array([iid for items in per_user for iid in items], dtype=np.int64)

        # Five sampled triples per observed positive interaction in every epoch.
        n_samples = len(pf) * 5
        last_epoch = self.n_epochs - 1
        for ep in range(self.n_epochs):
            loss = bpr_epoch(uids, pf, offs, cnts, self.n_items,
                             self.user_factors, self.item_factors,
                             self.user_bias, self.item_bias, self.lr, self.reg, n_samples)
            # Log every 10th epoch and the last one exactly once (previously the
            # last epoch was printed twice when it was also a multiple of 10, and
            # an unbound `loss` was referenced when n_epochs == 0).
            if ep % 10 == 0 or ep == last_epoch:
                print(f"BPR epoch {ep:3d}: loss = {loss:.4f}")

    def score_all_items(self, user_id):
        """Return a fresh array of scores for every item for `user_id`.

        Ids outside ``[0, n_users)`` get an all-zero score vector. The user bias
        is not added here (it is the same for every item of a user, so it does
        not affect the ranking).
        """
        if user_id < 0 or user_id >= self.n_users:
            return np.zeros(self.n_items)
        return self.item_bias + self.item_factors @ self.user_factors[user_id]

    def predict_for_user(self, user_id, k=10, train_df=None):
        """Return up to `k` ``(item_id, score)`` pairs, best first.

        Parameters:
            user_id: id of the user to recommend for.
            k: maximum number of recommendations; values larger than the number
                of items are clamped, and ``k <= 0`` yields an empty list.
            train_df: optional DataFrame with ``user_id`` and ``item_id`` columns;
                the user's items in it are excluded from the result.
        """
        scores = self.score_all_items(user_id)
        if train_df is not None:
            seen = train_df.loc[train_df['user_id'] == user_id, 'item_id'].values
            scores[seen] = -np.inf
        # np.argpartition raises when k exceeds the array length, so clamp first.
        k = min(int(k), len(scores))
        if k <= 0:
            return []
        top_k = np.argpartition(scores, -k)[-k:]
        top_k = top_k[np.argsort(scores[top_k])[::-1]]
        # Masked (already seen) items may be selected when fewer than k remain.
        return [(int(i), float(scores[i])) for i in top_k if scores[i] > -np.inf]

In [5]:
# training BPR-MF (our best collaborative ranking model)
# Ratings at or above THRESHOLD are treated as positive (implicit) feedback.
THRESHOLD = 4.0
pos_df = train[train['rating'] >= THRESHOLD]
# user_id -> set of positively rated item ids, the input format of BPRMF.fit
user_pos_items = pos_df.groupby('user_id')['item_id'].apply(set).to_dict()

bpr = BPRMF(n_users, n_items, n_factors=64, lr=0.01, reg=0.001, n_epochs=40)
bpr.fit(user_pos_items)

BPR уpoch   0: loss = -0.3193
BPR уpoch  10: loss = -0.1372
BPR уpoch  20: loss = -0.0696
BPR уpoch  30: loss = -0.0515
BPR уpoch  39: loss = -0.0436


In [6]:
# training enhanced Content-Based model (TF-IDF on movie genres with a popularity weight)
tfidf_matrix, feature_names = build_tfidf_features(movies)
cb = EnhancedContentBasedRecommender(tfidf_matrix, popularity_weight=0.3, min_ratings_threshold=5)
cb.fit(train)
# Notebook display: number of users for which a content profile was built.
len(cb.user_profiles)

# Both models are fit on the `train` split only (val/test are never passed to
# fit), so no held-out ratings leak into training.

6040

**Hybrid strategy A: Weighted Blending**

Linear combination of scores. Since BPR outputs unbounded dot-product scores and the CB model outputs cosine similarities (0 to 1), we must apply Min-Max normalization to both before blending. The formula is: `Score = α * BPR_norm + (1 - α) * CB_norm`

In [7]:
class WeightedBlendHybrid:
    """Score-level hybrid: ``alpha * BPR_norm + (1 - alpha) * CB_norm``.

    Both score vectors are min-max normalised per user (over the items that are
    still eligible) before blending, because the BPR scores are unbounded while
    the content-based scores live on a different scale.
    """

    def __init__(self, bpr_model, cb_model, alpha=0.7, n_items=None):
        """Store the two fitted base models and the blending weight.

        Parameters:
            bpr_model: object exposing ``score_all_items(user_id)``.
            cb_model: object exposing ``user_profiles``, ``normalized_features``,
                ``popularity_weight`` and ``popularity_scores``.
            alpha: weight of the (normalised) BPR score; CB gets ``1 - alpha``.
            n_items: length of the fallback CB score vector for users without a
                content profile; when ``None`` the BPR score vector length is used.
        """
        self.bpr = bpr_model
        self.cb = cb_model
        self.alpha = alpha
        self.n_items = n_items

    def _normalize(self, scores):
        """Return a min-max normalised copy of `scores`; the input is not modified.

        Entries equal to ``-inf`` (masked items) are left untouched and ignored
        when computing the range. If all eligible scores are (almost) equal they
        are mapped to 0.5.
        """
        scores = np.array(scores, dtype=float)  # always a copy
        mask = scores > -np.inf
        if not mask.any():
            return scores
        valid = scores[mask]
        mn, mx = valid.min(), valid.max()
        if mx - mn < 1e-10:
            scores[mask] = 0.5
        else:
            scores[mask] = (valid - mn) / (mx - mn)
        return scores

    def predict_for_user(self, user_id, k=10, train_df=None):
        """Return up to `k` ``(item_id, blended_score)`` pairs, best first.

        Parameters:
            user_id: id of the user to recommend for.
            k: maximum number of recommendations; clamped to the number of items,
                and ``k <= 0`` yields an empty list.
            train_df: optional DataFrame with ``user_id`` and ``item_id`` columns;
                the user's items in it are excluded from the result.
        """
        bpr_scores = np.array(self.bpr.score_all_items(user_id), dtype=float)

        if user_id in self.cb.user_profiles:
            profile = self.cb.user_profiles[user_id]
            pop_w = self.cb.popularity_weight
            # Same similarity/popularity mix as configured on the CB model.
            cb_scores = np.array(
                (1 - pop_w) * (self.cb.normalized_features @ profile)
                + pop_w * self.cb.popularity_scores,
                dtype=float,
            )
        else:
            # No content profile: a constant CB vector, so only BPR decides the order.
            n_fallback = self.n_items if self.n_items is not None else len(bpr_scores)
            cb_scores = np.zeros(n_fallback)

        if train_df is not None:
            seen = train_df.loc[train_df['user_id'] == user_id, 'item_id'].values
            bpr_scores[seen] = -np.inf
            cb_scores[seen] = -np.inf

        bpr_norm = self._normalize(bpr_scores)
        cb_norm = self._normalize(cb_scores)

        # Blend only eligible items. Multiplying -inf by a zero weight would give
        # NaN (alpha == 0 or alpha == 1), so masked entries are zeroed for the
        # arithmetic and re-masked afterwards.
        excluded = (bpr_norm == -np.inf) | (cb_norm == -np.inf)
        final = (self.alpha * np.where(excluded, 0.0, bpr_norm)
                 + (1 - self.alpha) * np.where(excluded, 0.0, cb_norm))
        final[excluded] = -np.inf

        # np.argpartition raises when k exceeds the array length, so clamp first.
        k = min(int(k), len(final))
        if k <= 0:
            return []
        top_k = np.argpartition(final, -k)[-k:]
        top_k = top_k[np.argsort(final[top_k])[::-1]]
        return [(int(i), float(final[i])) for i in top_k if final[i] > -np.inf]

**Hybrid strategy B: Candidate Generation + Reranking**

We use the fast BPR matrix multiplication to generate a "Candidate Pool" of 100 items, and then apply the CB model to rerank only those 100 items for the final top-10 display. The reranking score is a fixed 0.6 / 0.4 mix of the BPR score and the CB similarity.

In [8]:
class CandidateRerankHybrid:
    """Two-stage hybrid: BPR retrieves a candidate pool, CB similarity reranks it.

    The final score of a candidate is
    ``bpr_weight * bpr_score + (1 - bpr_weight) * cb_score``.
    """

    def __init__(self, bpr_model, cb_model, n_candidates=100, n_items=None,
                 bpr_weight=0.6, normalize=False):
        """Store the two fitted base models and the reranking configuration.

        Parameters:
            bpr_model: object exposing ``predict_for_user(user_id, k, train_df)``
                that returns ``(item_id, score)`` pairs.
            cb_model: object exposing ``user_profiles`` and ``normalized_features``.
            n_candidates: size of the candidate pool requested from BPR.
            n_items: stored for interface parity with the other hybrid; not used
                by the reranker itself.
            bpr_weight: weight of the BPR score in the combined score (default
                0.6, i.e. the previously hard-coded 0.6 / 0.4 mix).
            normalize: when True, min-max normalise both score lists within the
                candidate pool before combining. Defaults to False to keep the
                original behaviour, in which raw (unbounded) BPR scores are
                mixed with CB similarities.
        """
        self.bpr = bpr_model
        self.cb = cb_model
        self.n_candidates = n_candidates
        self.n_items = n_items
        self.bpr_weight = bpr_weight
        self.normalize = normalize

    @staticmethod
    def _min_max(values):
        """Min-max scale a 1-D float array to [0, 1]; constant input maps to 0.5."""
        lo, hi = values.min(), values.max()
        if hi - lo < 1e-10:
            return np.full(len(values), 0.5)
        return (values - lo) / (hi - lo)

    def predict_for_user(self, user_id, k=10, train_df=None):
        """Return up to `k` ``(item_id, combined_score)`` pairs, best first.

        Users without a content profile get the BPR ranking unchanged. Candidates
        whose id has no row in the CB feature matrix get a CB score of 0.
        """
        # Never retrieve fewer candidates than the caller asks to receive.
        pool_size = max(self.n_candidates, k)
        candidates = self.bpr.predict_for_user(user_id, k=pool_size, train_df=train_df)
        if not candidates:
            return []

        if user_id not in self.cb.user_profiles:
            return candidates[:k]

        profile = self.cb.user_profiles[user_id]
        features = self.cb.normalized_features
        # shape[0] instead of len(): scipy sparse matrices do not support len().
        n_rows = features.shape[0]

        item_ids = [iid for iid, _ in candidates]
        bpr_scores = np.array([score for _, score in candidates], dtype=float)
        cb_scores = np.array(
            [
                # ravel()[0] accepts both a scalar (dense row) and a length-1
                # array (2-D row) as the product.
                float(np.asarray(features[iid] @ profile).ravel()[0]) if iid < n_rows else 0.0
                for iid in item_ids
            ],
            dtype=float,
        )

        if self.normalize:
            bpr_scores = self._min_max(bpr_scores)
            cb_scores = self._min_max(cb_scores)

        combined = self.bpr_weight * bpr_scores + (1.0 - self.bpr_weight) * cb_scores
        # sorted() is stable, so ties keep the BPR order of the candidate pool.
        reranked = sorted(zip(item_ids, combined.tolist()),
                          key=lambda pair: pair[1], reverse=True)
        return reranked[:k]

In [9]:
# Evaluate both base models on the held-out test split at cut-offs 5, 10 and 20;
# these are the baselines the hybrids are compared against below.
evaluator = RecommenderEvaluator(train, test, k_values=[5, 10, 20])

metrics_bpr = evaluator.evaluate_model(bpr, model_name="BPR-MF")
evaluator.print_metrics(metrics_bpr, "BPR-MF")

metrics_cb = evaluator.evaluate_model(cb, model_name="Content-Based")
evaluator.print_metrics(metrics_cb, "Content-Based")

BPR-MF - Evaluation results
Ranking metrics:
NDCG@ 5: 0.0549
NDCG@10: 0.0649
NDCG@20: 0.0822


Relevance metrics (threshold=4.0):
Recall@ 5: 0.0375
Precision@ 5: 0.0498
Recall@10: 0.0689
Precision@10: 0.0455
Recall@20: 0.1208
Precision@20: 0.0406


Diversity metrics:
Coverage: 0.4580
Popularity bias: 1094.21
Content-Based - Evaluation results
Ranking metrics:
NDCG@ 5: 0.0304
NDCG@10: 0.0359
NDCG@20: 0.0448


Relevance metrics (threshold=4.0):
Recall@ 5: 0.0230
Precision@ 5: 0.0255
Recall@10: 0.0382
Precision@10: 0.0224
Recall@20: 0.0626
Precision@20: 0.0188


Diversity metrics:
Coverage: 0.5499
Popularity bias: 734.40


Blending weight `alpha` dictates how much we trust the collaborative signal vs content signal. We must tune this on the `val` set to avoid overfitting the test set

In [10]:
# Grid-search the blending weight on the validation split (a separate evaluator
# built on `val`), so the test split is not used for model selection.
val_evaluator = RecommenderEvaluator(train, val, k_values=[10])
# NOTE: the grid does not include the pure-model endpoints alpha=0.0 and alpha=1.0.
alphas = [0.3, 0.5, 0.6, 0.7, 0.8, 0.9]
alpha_results = []

for a in alphas:
    model = WeightedBlendHybrid(bpr, cb, alpha=a, n_items=n_items)
    m = val_evaluator.evaluate_model(model, model_name=f"Blend_a{a}")
    alpha_results.append({"alpha": a, "NDCG@10": m['NDCG@10'], "Recall@10": m['Recall@10']})
    print(f"alpha={a:.1f}: NDCG@10={m['NDCG@10']:.4f}, Recall@10={m['Recall@10']:.4f}")

# Select the alpha with the highest validation NDCG@10.
best_alpha = max(alpha_results, key=lambda x: x['NDCG@10'])['alpha']
print(f"best alpha: {best_alpha}")

alpha=0.3: NDCG@10=0.0642, Recall@10=0.0750
alpha=0.5: NDCG@10=0.0720, Recall@10=0.0840
alpha=0.6: NDCG@10=0.0755, Recall@10=0.0887
alpha=0.7: NDCG@10=0.0785, Recall@10=0.0914
alpha=0.8: NDCG@10=0.0811, Recall@10=0.0938
alpha=0.9: NDCG@10=0.0805, Recall@10=0.0922
best alpha: 0.8


A low alpha means that we weight CB more heavily, and this results in worse performance. It suggests that CB is generally a weaker predictor of user tastes than BPR.

increasing alpha results in better metrics. it peaks with alpha = 0.8

Increasing alpha further to 0.9 makes the metrics slightly worse. This suggests that we should not rely on BPR alone: keeping a small CB contribution still helps (note that alpha = 1.0 itself was not part of the grid).

In [11]:
# Evaluate both hybrid strategies on the unseen test split and compare them to
# the underlying base models (same `evaluator` instance as for the base models).

# The blend uses the alpha selected on the validation split.
blend = WeightedBlendHybrid(bpr, cb, alpha=best_alpha, n_items=n_items)
rerank = CandidateRerankHybrid(bpr, cb, n_candidates=100, n_items=n_items)

metrics_blend = evaluator.evaluate_model(blend, model_name=f"Blend(a={best_alpha})")
evaluator.print_metrics(metrics_blend, f"Blend(a={best_alpha})")

metrics_rerank = evaluator.evaluate_model(rerank, model_name="CandGen+Rerank")
evaluator.print_metrics(metrics_rerank, "CandGen+Rerank")

Blend(a=0.8) - Evaluation results
Ranking metrics:
NDCG@ 5: 0.0551
NDCG@10: 0.0657
NDCG@20: 0.0830


Relevance metrics (threshold=4.0):
Recall@ 5: 0.0388
Precision@ 5: 0.0496
Recall@10: 0.0708
Precision@10: 0.0452
Recall@20: 0.1225
Precision@20: 0.0396


Diversity metrics:
Coverage: 0.4806
Popularity bias: 1058.23
CandGen+Rerank - Evaluation results
Ranking metrics:
NDCG@ 5: 0.0555
NDCG@10: 0.0655
NDCG@20: 0.0827


Relevance metrics (threshold=4.0):
Recall@ 5: 0.0384
Precision@ 5: 0.0503
Recall@10: 0.0701
Precision@10: 0.0455
Recall@20: 0.1219
Precision@20: 0.0403


Diversity metrics:
Coverage: 0.4724
Popularity bias: 1066.91


In [12]:
# Side-by-side summary of every model evaluated with `evaluator` so far.
results_df = pd.DataFrame(evaluator.history)
cols = ['Model', 'NDCG@5', 'NDCG@10', 'NDCG@20', 'Recall@10', 'Precision@10', 'Coverage', 'Popularity_Bias']
# Print only the requested columns that actually exist in the history.
print(results_df[[c for c in cols if c in results_df.columns]].round(4).to_string(index=False))

         Model  NDCG@5  NDCG@10  NDCG@20  Recall@10  Precision@10  Coverage  Popularity_Bias
        BPR-MF  0.0549   0.0649   0.0822     0.0689        0.0455    0.4580        1094.2085
 Content-Based  0.0304   0.0359   0.0448     0.0382        0.0224    0.5499         734.4027
  Blend(a=0.8)  0.0551   0.0657   0.0830     0.0708        0.0452    0.4806        1058.2326
CandGen+Rerank  0.0555   0.0655   0.0827     0.0701        0.0455    0.4724        1066.9107


* Both hybrid models match or slightly outperform the pure base models on NDCG and Recall@10, which suggests that the two signals complement each other. CB helps BPR in niche cases with few ratings.

* Blend is slightly more accurate at NDCG@10 and has slightly better coverage than CandGen+Rerank. However, the blend has to score every item with both models, which is computationally expensive, whereas reranking applies the CB model to only 100 candidates; the accuracy drop-off compared to the full Blend is negligible.

* Pure CB has 55% coverage and a low popularity bias of 734. By blending in 20% of CB, the hybrid gains some of that diversity: its coverage increases by about 2 percentage points (from 46% to 48%) and its popularity bias drops from about 1100 to 1060, without sacrificing accuracy.

In [13]:
# checking who benefits: cold or warm start users

# Activity = number of training ratings per user. Users with 31-99 ratings
# fall into neither segment.
user_activity = train.groupby('user_id').size()
cold_users = set(user_activity[user_activity <= 30].index)
warm_users = set(user_activity[user_activity >= 100].index)

print(f"Cold users (<=30 ratings): {len(cold_users)}")
print(f"Warm users (>=100 ratings): {len(warm_users)}")

def segment_ndcg(model, test_df, train_df, user_set, k=10):
    gt = defaultdict(dict)
    for _, row in test_df.iterrows():
        if row['user_id'] in user_set:
            gt[row['user_id']][row['item_id']] = row['rating']
    ndcgs = []
    for uid, truth in gt.items():
        preds = model.predict_for_user(uid, k=k, train_df=train_df)
        rel = [truth.get(iid, 0) for iid, _ in preds[:k]]
        ideal = sorted(truth.values(), reverse=True)
        def dcg(s, k):
            s = np.array(s)[:k]
            return np.sum((2**s - 1) / np.log2(np.arange(2, len(s)+2))) if len(s) > 0 else 0.0
        d, id_ = dcg(rel, k), dcg(ideal, k)
        ndcgs.append(d / id_ if id_ > 0 else 0.0)
    return np.mean(ndcgs) if ndcgs else 0.0

# Compute NDCG@10 on the test split for every model within each user segment.
models = {"BPR-MF": bpr, "Content-Based": cb, f"Blend(a={best_alpha})": blend, "CandGen+Rerank": rerank}
segments = {"Cold (<=30)": cold_users, "Warm (>=100)": warm_users}

user_seg_results = []
for mname, model in models.items():
    # One output row per model, one column per segment.
    row = {"Model": mname}
    for sname, uset in segments.items():
        row[sname] = segment_ndcg(model, test, train, uset, k=10)
    user_seg_results.append(row)

user_seg_df = pd.DataFrame(user_seg_results)
print(user_seg_df.round(4).to_string(index=False))

Cold users (<=30 ratings): 1254
Warm users (>=100 ratings): 2453
         Model  Cold (<=30)  Warm (>=100)
        BPR-MF       0.0730        0.0707
 Content-Based       0.0478        0.0312
  Blend(a=0.8)       0.0806        0.0668
CandGen+Rerank       0.0769        0.0693


* For cold-start users (at most 30 training ratings), the hybrid approach is the clear winner, beating both pure models (0.0806 for Blend vs 0.0730 for BPR-MF). BPR has less signal for these users, and leaning on CB yields better results.
* For warm users (at least 100 training ratings), the pure BPR-MF model actually beats both hybrids. We already have plenty of interaction data for these users and the CB signal is generic, so adding 20% of it only adds noise and dilutes the collaborative predictions.

In [14]:
# Persist the test-set comparison, the cold/warm segment analysis and the
# validation alpha sweep as CSV files.
# NOTE(review): assumes ../experiments/results/ already exists -- confirm
# (DataFrame.to_csv does not create missing directories).
evaluator.save_results("../experiments/results/hybrid_comparison.csv")
user_seg_df.to_csv("../experiments/results/cold_warm_analysis.csv", index=False)
pd.DataFrame(alpha_results).to_csv("../experiments/results/alpha_tuning.csv", index=False)