In [50]:
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import numpy as np
import scipy.sparse as sp
import joblib
import random
from tqdm.notebook import tqdm
from sklearn.metrics.pairwise import cosine_similarity

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [51]:
def load_and_clean_data():
    print(">>> Loading Data...")

    # Load the Full Data
    full_catalog = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/interactions_merged.csv')
    print(">>> Full Catalog loaded...")

    # Load the Split Data
    train_df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/train_marged_interactions.csv')
    test_df  = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/val_merged_interactions.csv')
    print(">>> Split Data loaded...")

    # Load the Map (The bridge between Item IDs and Matrix Rows)
    # This file has currently 87k rows
    index_map = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/02_interaction_index_map.csv')
    print(">>> Index Map loaded...")

    # Load the Matrices (87k rows) and the MiniLM embeddings
    tfidf_bloated = sp.load_npz('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/02_tfidf_matrix.npz')
    print(">>> TF-IDF Matrix loaded...")
    minilm_bloated = np.load('/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/02_minilm_embeddings.npy')
    print(">>> MiniLM Embeddings loaded...")

    print(f">>> Deduplicating {len(index_map)} vector rows to unique items...")

    # Identify the FIRST row index for every unique item_id
    # We drop duplicates based on item_id, keeping the first occurrence.
    unique_map = index_map.drop_duplicates(subset='item_id', keep='first')

    # Get the list of row indices we want to KEEP
    unique_indices = unique_map.index.values

    # Slice the bloated matrices
    # We take only the rows corresponding to unique items
    unique_tfidf = tfidf_bloated[unique_indices]
    unique_minilm = minilm_bloated[unique_indices]

    # Create the New Lookup Dictionary
    # We need to map: item_id -> NEW row index in the sliced matrix
    unique_item_ids = unique_map['item_id'].values
    item_to_row_idx = {item_id: i for i, item_id in enumerate(unique_item_ids)}

    print("\n>>> Data Loaded & Cleaned:")
    print(f">>> Train Interactions: {len(train_df)}")
    print(f">>> Test Interactions:  {len(test_df)}")
    print(f">>> Unique Items in Catalog: {len(item_to_row_idx)}")
    print(f">>> Matrix Shapes: TF-IDF {unique_tfidf.shape}, MiniLM {unique_minilm.shape}")

    return full_catalog, train_df, test_df, unique_tfidf, unique_minilm, item_to_row_idx

In [52]:
full_catalog, train_df, test_df, item_tfidf, item_minilm, item_map = load_and_clean_data()

>>> Loading Data...
>>> Full Catalog loaded...
>>> Split Data loaded...
>>> Index Map loaded...
>>> TF-IDF Matrix loaded...
>>> MiniLM Embeddings loaded...
>>> Deduplicating 87045 vector rows to unique items...

>>> Data Loaded & Cleaned:
>>> Train Interactions: 79207
>>> Test Interactions:  7838
>>> Unique Items in Catalog: 15109
>>> Matrix Shapes: TF-IDF (15109, 15000), MiniLM (15109, 384)


In [53]:
train_df.head()

Unnamed: 0,user_id,item_id,timestamp,datetime,year,month,day,day_of_week,age_days,time_decay,Title,Author,Publisher,Subjects,Subjects_list
0,0,0,1680191000.0,2023-03-30 15:44:30,2023,3,30,3,564,0.917025,classification décimale universelle édition ab...,unknown author,ed du cefal,classification décimale universelle; indexatio...,"['classification décimale universelle', ' inde..."
1,0,1,1680783000.0,2023-04-06 12:13:54,2023,4,6,3,557,0.917991,les interactions dans lenseignement des langue...,cicurel francine 1947,didier,didactiquelangue étrangère enseignement; didac...,"['didactiquelangue étrangère enseignement', ' ..."
2,0,2,1680801000.0,2023-04-06 17:15:08,2023,4,6,3,556,0.91802,histoire de vie et recherche biographique pers...,unknown author,lharmattan,histoires de vie en sociologie; sciences socia...,"['histoires de vie en sociologie', ' sciences ..."
3,0,3,1683715000.0,2023-05-10 10:35:45,2023,5,10,2,523,0.922787,ce livre devrait me permettre de résoudre le c...,mazas sylvain 1980,vraoum,moyenorient; bandes dessinées autobiographique...,"['moyenorient', ' bandes dessinées autobiograp..."
4,0,3,1683715000.0,2023-05-10 10:35:50,2023,5,10,2,523,0.922787,ce livre devrait me permettre de résoudre le c...,mazas sylvain 1980,vraoum,moyenorient; bandes dessinées autobiographique...,"['moyenorient', ' bandes dessinées autobiograp..."


# Shared Utilities

In [54]:
def get_book_titles(item_ids, metadata_df):
    """
    Standalone function to map Item IDs to Titles.
    Works for ANY model (Collaborative or Content).
    """
    # Create a subset of the metadata for these items
    subset = metadata_df[metadata_df['item_id'].isin(item_ids)]

    # Drop duplicates to ensure unique book info
    subset = subset[['item_id', 'Title', 'Author']].drop_duplicates(subset='item_id')

    # Re-index to match the order of the input list (Rank 1, Rank 2...)
    return subset.set_index('item_id').reindex(item_ids)

In [76]:
def tune_alpha(model, test_df, param_name='alpha', k=10, alphas=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0]):
    """
    Universal tuner for any model that accepts an alpha-like weight parameter.

    Args:
        model: The recommender object (Content or Hybrid).
        test_df: The validation dataset.
        param_name: The name of the argument to pass to .recommend() (e.g., 'alpha' or 'hybrid_alpha').
    """
    print(f">>> Tuning '{param_name}' on {len(test_df)} users...")

    results = {}

    for alpha in alphas:
        hits = 0
        total = 0

        # Determine the kwargs for this specific iteration
        # This dynamically creates the argument dict: {'alpha': 0.2} or {'hybrid_alpha': 0.2}
        kwargs = {param_name: alpha, 'top_k': k}

        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"{param_name}={alpha}"):
            user_id = row['user_id']
            target = row['item_id']

            try:
                # We unpack **kwargs to pass the correct parameter name dynamically
                recs = model.recommend(user_id, **kwargs)

                if target in recs:
                    hits += 1
            except:
                pass
            total += 1

        score = hits / total if total > 0 else 0
        results[alpha] = score
        print(f"   [{alpha:.1f}] Hit Rate: {score:.4%}")

    best_val = max(results, key=results.get)
    print(f"\n>>> Winner: {param_name} = {best_val}")
    print(f"   -> Score: {results[best_val]:.4%}")

    return best_val

## TO ADAPT AND ADD MUCH LATER FOR ALL MODELS

In [55]:
def evaluate_model(model, test_df, k=10):
    """
    Calculates Hit Rate @ K.
    metric: "For each user, does the hidden test item appear in the Top K recommendations?"
    """
    print(f"Evaluating Model on {len(test_df)} test users...")

    hits = 0
    total = 0

    # We iterate through the Test Set
    # (Since it's Leave-One-Out, each user has exactly 1 row in test_df)
    for index, row in test_df.iterrows():
        user_id = row['user_id']
        target_item = row['item_id']

        # Ask model for Top K recommendations
        try:
            recs = model.recommend(user_id, top_k=k)

            # Check if target is in recommendations
            if target_item in recs:
                hits += 1
        except:
            pass # Handle errors gracefully (e.g. unknown user)

        total += 1

        # Progress tracker (optional, prints every 1000 users)
        if total % 1000 == 0:
            print(f"   Processed {total} users...")

    hit_rate = hits / total
    print(f"Evaluation Complete.")
    print(f"   -> Hit Rate @ {k}: {hit_rate:.4f} ({hit_rate*100:.2f}%)")
    return hit_rate

# Collaborative-Based Recommender




The `CollaborativeRecommender` we are building uses an Item-Item Nearest Neighbor approach with a Dot Product scoring mechanism.

*To score a candidate book (Target), we compare it to every book the user has actually read (History). If the Target is similar to the History (i.e., read by the same people), it gets a high score.*

For a specific User $u$ and a Candidate Item $i$:
$$Score(u, i) = \sum_{j \in H_u} Similarity(i, j)$$


Where:
- $H_u$: The set of items in User $u$'s history (Training data).
- $j$ : An item the user has already read.
- $Similarity(i, j)$ : The similarity score between the Candidate $i$ and History Item $j$.

**Vectorized Calculation in Code**:
Instead of iterating through every user-item pair, we use Linear Algebra for efficiency. We define a User Profile Vector ($\vec{P}_u$) which represents the "center of gravity" of the user's taste based on the items they interacted with:

$$\vec{P}_u = \sum_{j \in H_u} \vec{V}_j$$

Where $\vec{V}_j$ is the column vector from the Item-User interaction matrix, representing all users who read book $j$.

Then, we calculate scores for all items in the catalog at once using a dot product:

$$\vec{Scores} = R \cdot \vec{P}_u$$

Where $R$ is the Item-User matrix. Note: In the code below, `item_user_matrix` is already transposed to (Items x Users), so we perform `matrix.dot(vector)` directly.

In [56]:
class CollaborativeRecommender:
    def __init__(self, train_df):
        """
        Item-Item Collaborative Filtering using Sparse Matrices.
        """
        self.train_df = train_df

       # 1. Create Mappings (User/Item <-> Matrix Index)
        # We need these to convert IDs to row/col numbers efficiently
        self.user_ids = train_df['user_id'].unique()
        self.item_ids = train_df['item_id'].unique()

        # FIX: Correctly map UserID -> 0..N and ItemID -> 0..M
        self.user2idx = {u: i for i, u in enumerate(self.user_ids)}
        self.item2idx = {item: i for i, item in enumerate(self.item_ids)}
        self.idx2item = {i: item for i, item in enumerate(self.item_ids)}

        # 2. Build Sparse Interaction Matrix (Rows=Users, Cols=Items)
        # We transpose it to (Items x Users) for Item-Item Similarity
        print("   -> Building Sparse Interaction Matrix...")
        rows = train_df['user_id'].map(self.user2idx)
        cols = train_df['item_id'].map(self.item2idx)
        data = np.ones(len(train_df)) # Binary interaction (1 = read)

        # Shape: (N_Users, N_Items)
        self.interaction_matrix = sp.coo_matrix(
            (data, (rows, cols)),
            shape=(len(self.user_ids), len(self.item_ids))
        ).tocsr()

        # Transpose for Item-Item calculations: (N_Items, N_Users)
        self.item_user_matrix = self.interaction_matrix.T.tocsr()

        # 3. Pre-compute Global Popularity (Fallback)
        self.popular_items = train_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10):
        """
        Recommends items based on user's history.
        Logic: Sum of similarity vectors of items the user has read.
        """
        # 1. Handle New Users (Cold Start User not in training)
        if user_id not in self.user2idx:
            return self.popular_items[:top_k]

        # 2. Get User History
        user_idx = self.user2idx[user_id]

        # Get indices of items this user interacted with (Sparse slicing)
        user_history_vec = self.interaction_matrix[user_idx]
        history_indices = user_history_vec.indices

        # If user exists but has 0 history (shouldn't happen with our split, but good safety)
        if len(history_indices) == 0:
            return self.popular_items[:top_k]

        # 3. Compute Scores (Vectorized)
        # Get vectors for items the user liked (Rows corresponding to history items)
        # Shape: (N_History, N_Users)
        relevant_item_vecs = self.item_user_matrix[history_indices]

        # Calculate "User Profile" = Sum of columns (users) who read these books
        # This creates a vector representing the "aggregate readership" of the user's taste
        user_profile = np.asarray(relevant_item_vecs.sum(axis=0)).flatten()

        # Find items that overlap with this profile
        # Score = Item_Vectors (N_Items, N_Users) * User_Profile (N_Users)
        # Result = (N_Items) score vector
        scores = self.item_user_matrix.dot(user_profile)

        # 4. Rank & Filter
        # Sort indices by score descending
        top_indices = scores.argsort()[::-1]

        recommendations = []
        for idx in top_indices:
            item = self.idx2item[idx]

            # Exclude items the user has already read
            if idx not in history_indices:
                recommendations.append(item)

            if len(recommendations) >= top_k:
                break

        return recommendations

In [57]:
cf_model = CollaborativeRecommender(train_df)
print(">>> Collaborative Filtering Model Built.")

   -> Building Sparse Interaction Matrix...
>>> Collaborative Filtering Model Built.


Let's try for on one random user

In [58]:
all_users = train_df['user_id'].unique()
sample_user = random.choice(all_users)
cf_recs = cf_model.recommend(sample_user, top_k=10)

In [69]:
cf_recs

[np.int64(13587),
 np.int64(739),
 np.int64(7115),
 np.int64(14166),
 np.int64(14081),
 np.int64(611),
 np.int64(287),
 np.int64(14553),
 np.int64(5415),
 np.int64(78)]

In [59]:
history_ids = train_df[train_df['user_id'] == sample_user]['item_id'].unique()
print(f">>> User {sample_user} History (Last 5 Read)")
display(get_book_titles(history_ids[-5:], train_df)[['Title', 'Author']])

print(f"\n\n >>> User {sample_user} Recommendations:")
get_book_titles(cf_recs, full_catalog)

>>> User 7384 History (Last 5 Read)


Unnamed: 0_level_0,Title,Author
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
14557,bienvenue,abouet marguerite 1971
3471,aya de yopougon,abouet marguerite 1971
10914,comment le voile est devenu musulman,aboudrar brunonassim
618,le siècle de jeanne,burnand éric 1953
8473,frère dâme roman,diop david




 >>> User 7384 Recommendations:


Unnamed: 0_level_0,Title,Author
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
13587,assassination classroom,matsui yusei
739,lattaque des titans édition colossale,isayama hajime
7115,reine degypte,inudoh chie 1985
14166,chainsaw man,fujimoto tatsuki
14081,akira,otomo katsuhiro
611,larabe du futur,sattouf riad
287,le siècle demma une famille suisse dans les tu...,burnand eric
14553,gotham central,brubaker ed
5415,le château des animaux,delep félix 1994
78,fullmetal alchemist,arakawa hiromu 1973


We save the model for future evaluations

In [60]:
#joblib.dump(cf_model, '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_cf_model.pkl')

In [61]:
cf_score = evaluate_model(cf_model, test_df, k=10) #temporary

📉 Evaluating Model on 7838 test users...
   Processed 1000 users...
   Processed 2000 users...
   Processed 3000 users...
   Processed 4000 users...
   Processed 5000 users...
   Processed 6000 users...
   Processed 7000 users...
✅ Evaluation Complete.
   -> Hit Rate @ 10: 0.0471 (4.71%)


# Content-Based Recommender

Unlike the Collaborative model (which used User History similarity), this model uses **Vector Space Similarity**.

*"Show me books that describe the same topics (TF-IDF) or share the same meaning (MiniLM) as the books I have read in the past."*


1. **User Profile ($\vec{P}_u$):** We calculate the "average" content vector of the user's history:
$$\vec{P}_u = \frac{1}{N} \sum_{j \in History} \vec{V}_j$$ *If you read 3 books about "Space", your average vector points heavily toward "Space".*


2. **Scoring:** We calculate the Cosine Similarity between this User Profile and every book in the catalog ($C$).
$$Score(i) = \cos(\vec{P}_u, \vec{V}_i)$$


3. **Hybrid Content Score:** We combine the two signals (Keyword vs. Meaning) using a weighting parameter $\alpha$.

$$FinalScore = \alpha \cdot Score_{MiniLM} + (1 - \alpha) \cdot Score_{TFIDF}$$

In [62]:
class ContentRecommender:
    def __init__(self, interactions_df, tfidf_matrix, minilm_embeddings, item_to_row_idx):
        """
        Content-Based Recommender using Centroid Profiling.
        """
        self.interactions_df = interactions_df
        self.tfidf_matrix = tfidf_matrix
        self.minilm_embeddings = minilm_embeddings
        self.item_map = item_to_row_idx

        # Create Inverse Map (Row Index -> Item ID) for retrieving results
        self.idx_to_item = {v: k for k, v in item_to_row_idx.items()}

        # Pre-compute Global Popularity (Fallback)
        self.popular_items = interactions_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10, alpha=0.5):
        """
        Generates recommendations based on the user's content profile.
        alpha (0.0 - 1.0): Weight for MiniLM.
                           1.0 = Pure Embedding, 0.0 = Pure Keywords.
        """
        # 1. Get User History from Training Data
        user_history = self.interactions_df[self.interactions_df['user_id'] == user_id]['item_id'].unique()

        # --- CASE A: Cold Start User (No history) ---
        if len(user_history) == 0:
            # We cannot build a profile, so we return popular items
            return self.popular_items[:top_k]

        # 2. Get Matrix Indices for the user's history
        # We filter out items that might not exist in our item_map (just in case)
        valid_indices = [self.item_map[i] for i in user_history if i in self.item_map]

        # Safety check: if they read books we don't have vectors for
        if not valid_indices:
            return self.popular_items[:top_k]

        # 3. Build User Profile (Centroid)
        # Average MiniLM Vectors (Dense)
        user_prof_mini = np.mean(self.minilm_embeddings[valid_indices], axis=0).reshape(1, -1)

        # Average TF-IDF Vectors (Sparse)
        # Note: .mean() on sparse matrix returns a np.matrix, we cast to array
        user_prof_tfidf = np.asarray(self.tfidf_matrix[valid_indices].mean(axis=0))

        # 4. Compute Similarity (The Search)
        # MiniLM Similarity (Dense vs Dense)
        sim_mini = cosine_similarity(user_prof_mini, self.minilm_embeddings).flatten()

        # TF-IDF Similarity (Sparse vs Sparse)
        sim_tfidf = cosine_similarity(user_prof_tfidf, self.tfidf_matrix).flatten()

        # 5. Hybrid Weighted Score
        # alpha controls the blend. 0.5 means 50% semantics, 50% keywords.
        final_scores = (alpha * sim_mini) + ((1 - alpha) * sim_tfidf)

        # 6. Rank & Filter
        # Get indices of top items
        candidate_indices = final_scores.argsort()[::-1]

        recommendations = []
        for idx in candidate_indices:
            item_id = self.idx_to_item.get(idx)

            # Exclude items already read
            if item_id not in user_history:
                recommendations.append(item_id)

            if len(recommendations) >= top_k:
                break

        return recommendations

In [63]:
content_model = ContentRecommender(
    train_df,
    item_tfidf,
    item_minilm,
    item_map
)
print(">>> Content-Based Model Built.")

>>> Content-Based Model Built.


In [64]:
# Test on the SAME random user from before to compare
content_recs = content_model.recommend(sample_user, top_k=10, alpha=0.5)
display(get_book_titles(content_recs, full_catalog)[['Title', 'Author']])

Unnamed: 0_level_0,Title,Author
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4861,le joueur déchecs,sala david
14107,le dieufauve,vehlmann fabien
140,habibi,thompson craig 1975
2553,glaise,bouysse franck 1965
12087,éveils,mancini juliette
5233,harleen,sejic stjepan 1981
2145,merel,lodewick clara 1996
2489,madones et putains,antico nine 1981
8504,létreinte,jim
6913,sigi,arnoux eric


We save the mdoel

In [65]:
joblib.dump(content_model, '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_content_model.pkl')

['/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_content_model.pkl']

In [66]:
content_score = evaluate_model(content_model, test_df, k=10)

📉 Evaluating Model on 7838 test users...
   Processed 1000 users...
   Processed 2000 users...
   Processed 3000 users...
   Processed 4000 users...
   Processed 5000 users...
   Processed 6000 users...
   Processed 7000 users...
✅ Evaluation Complete.
   -> Hit Rate @ 10: 0.0638 (6.38%)


## Find the best hyperparameter for content-based recommender


In [67]:
def tune_content_alpha(model, test_df, k=10):
    print(f">>> Tuning Content Model Alpha (TF-IDF vs MiniLM)...")
    print(f"   -> Testing on {len(test_df)} users (Leave-One-Out)")

    results = {}
    # 0.0 = Pure TF-IDF (Keywords)
    # 1.0 = Pure MiniLM (Meaning)
    alphas = [0.0, 0.2, 0.5, 0.8, 1.0]

    for alpha in alphas:
        hits = 0
        total = 0

        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"Alpha {alpha}"):
            user_id = row['user_id']
            target_item = row['item_id']

            # Get prediction with specific alpha
            try:
                recs = model.recommend(user_id, top_k=k, alpha=alpha)
                if target_item in recs:
                    hits += 1
            except:
                pass
            total += 1

        score = hits / total
        results[alpha] = score
        print(f"   [Alpha {alpha:.1f}] Hit Rate: {score:.4%}")

    best_alpha = max(results, key=results.get)
    print(f"\n>>> Best Content Configuration:")
    print(f"   -> Alpha: {best_alpha}")
    print(f"   -> Score: {results[best_alpha]:.4%}")

    if best_alpha == 0.0:
        print("   -> Insight: Users rely mostly on specific KEYWORDS (TF-IDF).")
    elif best_alpha == 1.0:
        print("   -> Insight: Users rely mostly on general THEMES (MiniLM).")
    else:
        print("   -> Insight: A MIX of keywords and themes works best.")

    return best_alpha

In [68]:
#best_content_alpha = tune_content_alpha(content_model, test_df)
#print(f"\n>>> Content Model optimized with alpha={best_content_alpha}")

>>> Tuning Content Model Alpha (TF-IDF vs MiniLM)...
   -> Testing on 7838 users (Leave-One-Out)


Alpha 0.0:   0%|          | 0/7838 [00:00<?, ?it/s]

   [Alpha 0.0] Hit Rate: 5.3713%


Alpha 0.2:   0%|          | 0/7838 [00:00<?, ?it/s]

   [Alpha 0.2] Hit Rate: 6.0347%


Alpha 0.5:   0%|          | 0/7838 [00:00<?, ?it/s]

   [Alpha 0.5] Hit Rate: 6.3792%


Alpha 0.8:   0%|          | 0/7838 [00:00<?, ?it/s]

   [Alpha 0.8] Hit Rate: 6.0475%


Alpha 1.0:   0%|          | 0/7838 [00:00<?, ?it/s]

   [Alpha 1.0] Hit Rate: 5.3713%

>>> Best Content Configuration:
   -> Alpha: 0.5
   -> Score: 6.3792%
   -> Insight: A MIX of keywords and themes works best.

>>> Content Model optimized with alpha=0.5


In [None]:
best_content_alpha = tune_alpha(content_model, test_df, param_name='alpha')
print(f"\n>>> Content Model optimized with alpha={best_content_alpha}")

# Hybrid Recommender

The Hybrid model represents the final stage of our pipeline. Its goal is to combine the Behavioral Signals (from Collaborative Filtering) and the Semantic Signals (from Content-Based Filtering) into a single, superior recommendation list.

## 1. The Challenge: Incompatible Scales
We cannot simply add the raw scores from our two models because they exist in different mathematical spaces:
- **Collaborative Model**: Returns Dot Product scores (unbounded, e.g., $0.0$ to $50.0+$). High scores depend on the density of user interactions.
- **Content Model**: Returns Cosine Similarity scores (bounded, $0.0$ to $1.0$).

If we performed a simple addition ($Score_{CF} + Score_{Content}$), the Collaborative model would mathematically dominate the decision simply because its numbers are larger, rendering the Content model useless.

## 2. The Solution: Reciprocal Rank Fusion (RRF)
To solve this, we ignore the raw values and look at the **Rank** (Order). If both models agree that a book is "Rank #1," it matters less that one scored it `15.4` and the other `0.98`.

We implement a **Weighted Reciprocal Rank** strategy:

### Step A: Normalization (Rank to Score)
For every item $i$ recommended by a model, we assign a normalized score based on its rank $r$ (where $r=0$ is the 1st recommendation):

$$S(i) = \frac{1}{r + 1}$$

Where:
- 1st Place ($r=0$): Score $= 1.0$
- 2nd Place ($r=1$): Score $= 0.5$
- 3rd Place ($r=2$): Score $= 0.33$
- etc.

### Step B: The Fusion Equation
We calculate the final score by combining the normalized scores from both engines, controlled by a hyperparameter $\beta$ (Hybrid Alpha):

$$FinalScore(u, i) = \beta \cdot S_{CF}(i) + (1 - \beta) \cdot S_{Content}(i)$$

- Where $\beta$ is a tuning parameter between $0.0$ and $1.0$.
- If $\beta > 0.5$, we trust **Social Trends** (what others read) more.
- If $\beta < 0.5$, we trust **Book Content** (what the book is about) more.

In [70]:
class HybridRecommender:
    def __init__(self, cf_model, content_model, content_alpha=0.5):
        """
        Hybrid Recommender combining Behavioral and Semantic signals.

        Args:
            cf_model: The trained CollaborativeRecommender (Behavior)
            content_model: The trained ContentRecommender (Semantics)
            content_alpha: The optimized alpha for the content model (Keywords vs Meaning).
        """
        self.cf_model = cf_model
        self.content_model = content_model
        self.content_alpha = content_alpha

    def recommend(self, user_id, top_k=10, hybrid_alpha=0.5):
        """
        Generates hybrid recommendations using Reciprocal Rank Fusion.

        Args:
            user_id: The user to recommend for.
            top_k: Number of items to return.
            hybrid_alpha (beta): Weight for Collaborative Filtering (0.0 to 1.0).
                                 1.0 = Trust Social Trends (CF)
                                 0.0 = Trust Content Match (Content)
        """
        # 1. Get Candidates from both models
        # We ask for more items (top_k * 50) to increase the chance of finding overlaps
        cf_items = self.cf_model.recommend(user_id, top_k=top_k*50)
        content_items = self.content_model.recommend(user_id, top_k=top_k*50, alpha=self.content_alpha)

        # 2. Normalize: Convert Ranks to Scores (Reciprocal Rank)
        # Score = 1 / (Rank + 1)
        # Rank 0 -> 1.0, Rank 1 -> 0.5, etc.
        cf_scores = {item: 1.0/(i+1) for i, item in enumerate(cf_items)}
        content_scores = {item: 1.0/(i+1) for i, item in enumerate(content_items)}

        # 3. Fusion: Weighted Sum
        # Union of all candidate items
        all_items = set(cf_scores.keys()) | set(content_scores.keys())
        hybrid_scores = {}

        for item in all_items:
            # Get score (0.0 if the model didn't recommend it)
            s_cf = cf_scores.get(item, 0.0)
            s_content = content_scores.get(item, 0.0)

            # The Fusion Equation: Beta * CF + (1 - Beta) * Content
            hybrid_scores[item] = (hybrid_alpha * s_cf) + ((1 - hybrid_alpha) * s_content)

        # 4. Rank & Return
        # Sort by final score descending
        sorted_items = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

        # Return only the item_ids
        return [item for item, score in sorted_items[:top_k]]

In [71]:
hybrid_model = HybridRecommender(
    cf_model,
    content_model,
    content_alpha=best_content_alpha
)

In [73]:
# Test on the SAME random user from before to compare
hybrid_recs = hybrid_model.recommend(sample_user, top_k=10, hybrid_alpha=0.5)
display(get_book_titles(hybrid_recs, full_catalog)[['Title', 'Author']])

Unnamed: 0_level_0,Title,Author
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
4861,le joueur déchecs,sala david
13587,assassination classroom,matsui yusei
739,lattaque des titans édition colossale,isayama hajime
14107,le dieufauve,vehlmann fabien
140,habibi,thompson craig 1975
7115,reine degypte,inudoh chie 1985
2553,glaise,bouysse franck 1965
14166,chainsaw man,fujimoto tatsuki
14081,akira,otomo katsuhiro
12087,éveils,mancini juliette


We save the model

In [74]:
joblib.dump(hybrid_model, '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_hybrid_model.pkl')

['/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_hybrid_model.pkl']

In [75]:
content_score = evaluate_model(hybrid_model, test_df, k=10)

📉 Evaluating Model on 7838 test users...
   Processed 1000 users...
   Processed 2000 users...
   Processed 3000 users...
   Processed 4000 users...
   Processed 5000 users...
   Processed 6000 users...
   Processed 7000 users...
✅ Evaluation Complete.
   -> Hit Rate @ 10: 0.0706 (7.06%)


##Find the best hyperparameter for content-based recommender

In [77]:
best_hybrid_alpha = tune_alpha(hybrid_model, test_df, param_name='hybrid_alpha')
print(f"\n>>> Hybrid Model optimized with alpha={best_hybrid_alpha}")

>>> Tuning 'hybrid_alpha' on 7838 users...


hybrid_alpha=0.0:   0%|          | 0/7838 [00:00<?, ?it/s]

   [0.0] Hit Rate: 6.3792%


hybrid_alpha=0.2:   0%|          | 0/7838 [00:00<?, ?it/s]

   [0.2] Hit Rate: 6.9788%


hybrid_alpha=0.4:   0%|          | 0/7838 [00:00<?, ?it/s]

   [0.4] Hit Rate: 7.0681%


hybrid_alpha=0.6:   0%|          | 0/7838 [00:00<?, ?it/s]

   [0.6] Hit Rate: 6.7492%


hybrid_alpha=0.8:   0%|          | 0/7838 [00:00<?, ?it/s]

   [0.8] Hit Rate: 5.9709%


hybrid_alpha=1.0:   0%|          | 0/7838 [00:00<?, ?it/s]

   [1.0] Hit Rate: 4.7078%

>>> Winner: hybrid_alpha = 0.4
   -> Score: 7.0681%


NameError: name 'best_content_alpha2' is not defined

In [79]:
print(f"\n>>> Hybrid Model optimized with alpha={best_hybrid_alpha}")


>>> Hybrid Model optimized with alpha=0.4


We save all the hyperparameters

In [80]:
config = {
    "best_content_alpha": best_content_alpha,
    "best_hybrid_alpha": best_hybrid_alpha
}

joblib.dump(config, '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_best_params.pkl')

['/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/models/03_best_params.pkl']