In [2]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from tqdm.notebook import tqdm
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Class Definition

Because we are working with Notebooks only, we MUST re-define the Classes (`CollaborativeRecommender`, etc.) in this new notebook before we can load them. Python needs the "blueprint" to understand the saved "object."


WE HAVE TO MAKE THEM INTO A PY FILE FOR THE SUBMISSION BC IT IS CLEANER

In [12]:
class CollaborativeRecommender:
    def __init__(self, train_df):
        """
        Item-Item Collaborative Filtering using Sparse Matrices.
        """
        self.train_df = train_df
        self.user_ids = train_df['user_id'].unique()
        self.item_ids = train_df['item_id'].unique()
        self.user2idx = {u: i for i, u in enumerate(self.user_ids)}
        self.item2idx = {item: i for i, item in enumerate(self.item_ids)}
        self.idx2item = {i: item for i, item in enumerate(self.item_ids)}

        rows = train_df['user_id'].map(self.user2idx)
        cols = train_df['item_id'].map(self.item2idx)
        data = np.ones(len(train_df))

        self.interaction_matrix = sp.coo_matrix(
            (data, (rows, cols)),
            shape=(len(self.user_ids), len(self.item_ids))
        ).tocsr()

        self.item_user_matrix = self.interaction_matrix.T.tocsr()
        self.popular_items = train_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10):
        if user_id not in self.user2idx:
            return self.popular_items[:top_k]

        user_idx = self.user2idx[user_id]
        user_history_vec = self.interaction_matrix[user_idx]
        history_indices = user_history_vec.indices

        if len(history_indices) == 0:
            return self.popular_items[:top_k]

        relevant_item_vecs = self.item_user_matrix[history_indices]
        user_profile = np.asarray(relevant_item_vecs.sum(axis=0)).flatten()
        scores = self.item_user_matrix.dot(user_profile)
        top_indices = scores.argsort()[::-1]

        recommendations = []
        for idx in top_indices:
            item = self.idx2item[idx]

            if idx not in history_indices:
                recommendations.append(item)

            if len(recommendations) >= top_k:
                break

        return recommendations

class ContentRecommender:
    def __init__(self, interactions_df, tfidf_matrix, minilm_embeddings, item_to_row_idx):
        self.interactions_df = interactions_df
        self.tfidf_matrix = tfidf_matrix
        self.minilm_embeddings = minilm_embeddings
        self.item_map = item_to_row_idx

        self.idx_to_item = {v: k for k, v in item_to_row_idx.items()}

        self.popular_items = interactions_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10, alpha=0.5):
        user_history = self.interactions_df[self.interactions_df['user_id'] == user_id]['item_id'].unique()

        if len(user_history) == 0:
            return self.popular_items[:top_k]

        valid_indices = [self.item_map[i] for i in user_history if i in self.item_map]

        if not valid_indices:
            return self.popular_items[:top_k]

        user_prof_mini = np.mean(self.minilm_embeddings[valid_indices], axis=0).reshape(1, -1)
        user_prof_tfidf = np.asarray(self.tfidf_matrix[valid_indices].mean(axis=0))
        sim_mini = cosine_similarity(user_prof_mini, self.minilm_embeddings).flatten()
        sim_tfidf = cosine_similarity(user_prof_tfidf, self.tfidf_matrix).flatten()
        final_scores = (alpha * sim_mini) + ((1 - alpha) * sim_tfidf)
        candidate_indices = final_scores.argsort()[::-1]

        recommendations = []
        for idx in candidate_indices:
            item_id = self.idx_to_item.get(idx)

            if item_id not in user_history:
                recommendations.append(item_id)

            if len(recommendations) >= top_k:
                break

        return recommendations

class HybridRecommender:
    def __init__(self, cf_model, content_model, content_alpha=0.5):
        self.cf_model = cf_model
        self.content_model = content_model
        self.content_alpha = content_alpha

    def recommend(self, user_id, top_k=10, hybrid_alpha=0.5):
        cf_items = self.cf_model.recommend(user_id, top_k=top_k*50)
        content_items = self.content_model.recommend(user_id, top_k=top_k*50, alpha=self.content_alpha)

        cf_scores = {item: 1.0/(i+1) for i, item in enumerate(cf_items)}
        content_scores = {item: 1.0/(i+1) for i, item in enumerate(content_items)}

        all_items = set(cf_scores.keys()) | set(content_scores.keys())
        hybrid_scores = {}

        for item in all_items:
            s_cf = cf_scores.get(item, 0.0)
            s_content = content_scores.get(item, 0.0)
            hybrid_scores[item] = (hybrid_alpha * s_cf) + ((1 - hybrid_alpha) * s_content)

        sorted_items = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

        return [item for item, score in sorted_items[:top_k]]

# Load the data

In [13]:
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/'

def load_data_resources():
    print(f">>> Loading CSVs from {BASE_PATH}data/ ...")
    full_catalog = pd.read_csv(BASE_PATH + 'data/interactions_merged.csv')

    train_df = pd.read_csv(BASE_PATH + 'data/train_marged_interactions.csv')
    val_df   = pd.read_csv(BASE_PATH + 'data/val_merged_interactions.csv')
    index_map = pd.read_csv(BASE_PATH + 'data/02_interaction_index_map.csv')

    print(f"      Full Catalog: {len(full_catalog)}")
    print(f"      Train: {len(train_df)} | Val: {len(val_df)}")
    print(f">>> Loading & Cleaning Matrices...")
    tfidf_bloated = sp.load_npz(BASE_PATH + 'data/02_tfidf_matrix.npz')
    minilm_bloated = np.load(BASE_PATH + 'data/02_minilm_embeddings.npy')

    # Deduplication Logic
    unique_map = index_map.drop_duplicates(subset='item_id', keep='first')
    unique_indices = unique_map.index.values

    unique_tfidf = tfidf_bloated[unique_indices]
    unique_minilm = minilm_bloated[unique_indices]

    # Lookup Dictionary
    unique_item_ids = unique_map['item_id'].values
    item_to_row_idx = {item_id: i for i, item_id in enumerate(unique_item_ids)}

    print(f"      Unique Items: {len(item_to_row_idx)}")
    print(f"      Matrix Shapes: TF-IDF {unique_tfidf.shape}, MiniLM {unique_minilm.shape}")

    return full_catalog, train_df, val_df, unique_tfidf, unique_minilm, item_to_row_idx

def load_model_resources():
    print("\n>>> Loading saved models...")
    models = {}
    model_path = BASE_PATH + 'models/'

    files = {
        'cf': '03_cf_model.pkl',
        'content': '03_content_model.pkl',
        'hybrid': '03_hybrid_model.pkl',
        'config': '03_best_params.pkl'
    }

    for name, filename in files.items():
        try:
            models[name] = joblib.load(model_path + filename)
            print(f">>> Loaded {name}: {filename}")
        except Exception as e:
            print(f">>> Failed to load {name}: {e}")

    return models

In [14]:
full_catalog, train_df, val_df, item_tfidf, item_minilm, item_map = load_data_resources()

# We get the models too
saved_models = load_model_resources()
cf_model = saved_models.get('cf')
content_model = saved_models.get('content')
hybrid_model = saved_models.get('hybrid')
config = saved_models.get('config')

>>> Loading CSVs from /content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/data/ ...
      Full Catalog: 87045
      Train: 79207 | Val: 7838
>>> Loading & Cleaning Matrices...
      Unique Items: 15109
      Matrix Shapes: TF-IDF (15109, 15000), MiniLM (15109, 384)

>>> Loading saved models...
>>> Loaded cf: 03_cf_model.pkl
>>> Loaded content: 03_content_model.pkl
>>> Loaded hybrid: 03_hybrid_model.pkl
>>> Loaded config: 03_best_params.pkl


# Metrics Selection

While our goal is to recommend Top-10 ($K=10$) items to every user, our Leave-One-Out (LOO) split strategy creates a specific constraint for evaluation: **For each user, there is exactly one correct answer (Ground Truth) in the test set.**

We have therefore selected these following metrics to evaluate our models :

## Hit Rate @ 10 (Accuracy)
* **Context**: In a standard multi-item test, this would be the "Recall."
* **LOO Adaptation:** Since there is only 1 target item, Recall is binary. Either we found it (1.0) or we didn't (0.0).
* **Definition:** The percentage of users for whom the hidden test item appears anywhere in the Top-10 recommendations.
* **Why we use it:** It is the fundamental measure of utility. If the Hit Rate is 0, the model is useless.

##  MAP @ 10 (Ranking Quality)
* **Context**: Mean Average Precision usually averages the precision scores across multiple relevant items.
* **LOO Adaptation**: With only one relevant item, the "Average Precision" collapses into the Reciprocal Rank.
* * If the target is at Rank 1: Score = $1/1 = 1.0$
* * If the target is at Rank 10: Score = $1/10 = 0.1$
* * If missing: Score = $0.0$
* **Why we use it**: Hit Rate treats Rank #1 and Rank #10 equally. MAP penalizes the model if the user has to scroll down to find the correct book.

## Novelty (Discovery)
* **Context**: Recommender systems often bias towards popular items (e.g., Harry Potter).
* **Definition**: The Self-Information of the recommended items using $-\log_2(p)$, where $p$ is the item's popularity.
* **Why we use it**: To measure "serendipity." High novelty scores indicate the model is digging deep into the catalog to find niche items, rather than just recommending bestsellers.

## Catalog Coverage (Diversity)
* **Definition**: The percentage of unique items in our total library that were recommended to at least one user.
* **Why we use it**: To detect Mode Collapse. A bad model might achieve a high Hit Rate by recommending the same 50 popular books to everyone. A healthy model utilizes the entire library.

In [15]:
class ModelEvaluator:
    def __init__(self, train_df, full_catalog_ids):
        """
        Args:
            train_df: The training data (needed for Item Popularity/Novelty)
            full_catalog_ids: List of all unique item IDs (needed for Coverage)
        """
        self.train_df = train_df
        self.catalog_items = set(full_catalog_ids)

        # Pre-compute Item Popularity (Probability) for Novelty
        print("   -> Pre-computing Item Popularity for Novelty metrics...")
        item_counts = train_df['item_id'].value_counts()
        total_interactions = len(train_df)
        self.item_probs = (item_counts / total_interactions).to_dict()

    def _calculate_novelty(self, recs):
        """Calculates mean self-information of the recommended list."""
        novelty_score = 0
        for item in recs:
            # P(i) has a small epsilon if missing to avoid log(0)
            p = self.item_probs.get(item, 1e-10)
            novelty_score += -np.log2(p)
        return novelty_score / len(recs) if len(recs) > 0 else 0

    def evaluate(self, model, test_df, k=10, model_name="Model", **kwargs):
        """
        Runs metrics on the test set.
        **kwargs: allows passing 'alpha' or 'hybrid_alpha' dynamically.
        """
        print(f">>> Evaluating {model_name} on {len(test_df)} users...")

        metrics = {
            'hits': 0,       # Hit Rate
            'sum_ap': 0.0,   # MAP (Reciprocal Rank)
            'sum_nov': 0.0,  # Novelty
            'total': 0
        }

        unique_recs = set()

        for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=f"Eval {model_name}"):
            user_id = row['user_id']
            target = row['item_id']

            try:
                model_type = str(type(model))

                if 'Hybrid' in model_type:
                    # Hybrid requires 'hybrid_alpha'
                    h_alpha = kwargs.get('hybrid_alpha', 0.5)
                    recs = model.recommend(user_id, top_k=k, hybrid_alpha=h_alpha)

                elif 'Content' in model_type:
                    # Content requires 'alpha'
                    c_alpha = kwargs.get('alpha', 0.5)
                    recs = model.recommend(user_id, top_k=k, alpha=c_alpha)

                else:
                    # Collaborative (and others) just need user_id
                    recs = model.recommend(user_id, top_k=k)

                # --- METRIC CALCULATION ---
                # 1. Coverage Tracking
                unique_recs.update(recs)

                # 2. Accuracy (Hit Rate & MAP)
                if target in recs:
                    metrics['hits'] += 1
                    rank = recs.index(target) + 1
                    metrics['sum_ap'] += (1.0 / rank)

                # 3. Discovery (Novelty)
                metrics['sum_nov'] += self._calculate_novelty(recs)

            except Exception as e:
                # Fallback for rare errors (e.g. user not in map)
                print(f"   -> Error: {e}")
                pass

            metrics['total'] += 1

        # Summary Stats
        n = metrics['total'] if metrics['total'] > 0 else 1

        final_scores = {
            'Model': model_name,
            'Hit Rate @ 10': metrics['hits'] / n,
            'MAP @ 10': metrics['sum_ap'] / n,
            'Novelty': metrics['sum_nov'] / n,
            'Coverage': len(unique_recs) / len(self.catalog_items)
        }

        print(f"   -> Results: {final_scores}")
        return final_scores

print(">>> Evaluator Class Ready.")

>>> Evaluator Class Ready.


In [16]:
config

{'best_content_alpha': 0.5, 'best_hybrid_alpha': 0.4}

In [17]:
# 1. Instantiate the Evaluator
# We need the full catalog IDs to calculate "Coverage" correctly
all_item_ids = full_catalog['item_id'].unique()
evaluator = ModelEvaluator(train_df, all_item_ids)
results_list = []

   -> Pre-computing Item Popularity for Novelty metrics...


In [None]:
print("\n>>> EVALUATING: Collaborative Filtering...")
res_cf = evaluator.evaluate(cf_model, val_df, k=10, model_name="Collaborative")
results_list.append(res_cf)


>>> EVALUATING: Collaborative Filtering...
>>> Evaluating Collaborative on 7838 users...


Eval Collaborative:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'Collaborative', 'Hit Rate @ 10': 0.047078336310283235, 'MAP @ 10': 0.019112575284430347, 'Novelty': np.float64(12.70286306942157), 'Coverage': 0.5954067112317162}


In [None]:
c_alpha = config.get('best_content_alpha', 0.5) if config else 0.5
print(f"\n>>> EVALUATING: Content-Based (alpha={c_alpha})...")
res_content = evaluator.evaluate(content_model, val_df, k=10, model_name="Content", alpha=c_alpha)
results_list.append(res_content)


>>> EVALUATING: Content-Based (alpha=0.5)...
>>> Evaluating Content on 7838 users...


Eval Content:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'Content', 'Hit Rate @ 10': 0.06379178361826997, 'MAP @ 10': 0.025888376529483934, 'Novelty': np.float64(14.439639453544437), 'Coverage': 0.7190416308160699}


In [None]:
h_alpha = config.get('best_hybrid_alpha', 0.4) if config else 0.4
print(f"\n>>> EVALUATING: Hybrid (beta={h_alpha})...")
res_hybrid = evaluator.evaluate(hybrid_model, val_df, k=10, model_name="Hybrid", alpha=h_alpha)
results_list.append(res_hybrid)


>>> EVALUATING: Hybrid (beta=0.4)...
>>> Evaluating Hybrid on 7838 users...


Eval Hybrid:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'Hybrid', 'Hit Rate @ 10': 0.07055371268180659, 'MAP @ 10': 0.027725529269290422, 'Novelty': np.float64(13.435942632698852), 'Coverage': 0.7338672314514528}


In [None]:
leaderboard = pd.DataFrame(results_list).set_index('Model')
print("\n>>> FINAL RESULTS LEADERBOARD")
display(leaderboard.style.highlight_max(axis=0, color='green'))


>>> FINAL RESULTS LEADERBOARD


Unnamed: 0_level_0,Hit Rate @ 10,MAP @ 10,Novelty,Coverage
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Collaborative,0.047078,0.019113,12.702863,0.595407
Content,0.063792,0.025888,14.439639,0.719042
Hybrid,0.070554,0.027726,13.435943,0.733867


## Interpretation

* Hit Rate @ 10: **Hybrid Wins** (+50% over CF). It finds more relevant books.
* Map @ 10: **Hybrid Wins**. It ranks the correct book higher on the list.
* Novelty: **Content Wins**. Pure semantic matching finds the rarest gems. Hybrid balances this nicely (better than CF).
* Coverage: **Hybrid Wins**. It recommends the widest variety of books (almost 3/4 of the library!).

## Conclusion
* **Collaborative Filtering** struggled (likely due to sparsity and cold items).
* **Content-Based** was the hero, proving that metadata is crucial for this dataset.
* **Hybrid** took the best of both worlds: the accuracy of Content + the slight behavioral nudge from CF to push the really good stuff to the top.

# Segmentation Analysis

Hypothesis: The Content Model is rescuing the "Cold Start Items"

In [18]:
def evaluate_segments(model, test_df, train_df, k=10, model_name="Model", **kwargs):
    print(f">>> Segmenting Evaluation for {model_name}...")

    # 1. Identify Warm vs Cold Items
    train_items = set(train_df['item_id'].unique())

    # Boolean mask for test rows
    is_warm = test_df['item_id'].isin(train_items)

    test_warm = test_df[is_warm]
    test_cold = test_df[~is_warm]

    print(f"   -> Warm Test Cases: {len(test_warm)}")
    print(f"   -> Cold Test Cases: {len(test_cold)}")

    # 2. Run Evaluation on both subsets
    # We reuse the evaluate method but just for Hit Rate to keep it simple
    def get_hit_rate(sub_df):
        hits = 0
        total = 0
        for _, row in tqdm(sub_df.iterrows(), total=len(sub_df), desc=f"{model_name}"):
            user_id = row['user_id']
            target = row['item_id']
            try:
                # Handle model types again
                model_type = str(type(model))
                if 'Hybrid' in model_type:
                    recs = model.recommend(user_id, top_k=k, hybrid_alpha=kwargs.get('hybrid_alpha', 0.5))
                elif 'Content' in model_type:
                    recs = model.recommend(user_id, top_k=k, alpha=kwargs.get('alpha', 0.5))
                else:
                    recs = model.recommend(user_id, top_k=k)

                if target in recs:
                    hits += 1
            except:
                pass
            total += 1
        return hits / total if total > 0 else 0

    hr_warm = get_hit_rate(test_warm)
    hr_cold = get_hit_rate(test_cold)

    return {
        'Model': model_name,
        'HR@10 (Warm)': hr_warm,
        'HR@10 (Cold)': hr_cold
    }

In [None]:
segments = []

# CF
segments.append(evaluate_segments(cf_model, val_df, train_df, model_name="Collaborative"))
# Content
c_alpha = config['best_content_alpha']
segments.append(evaluate_segments(content_model, val_df, train_df, model_name="Content", alpha=c_alpha))
# Hybrid
h_alpha = config['best_hybrid_alpha']
segments.append(evaluate_segments(hybrid_model, val_df, train_df, model_name="Hybrid", hybrid_alpha=h_alpha))

>>> Segmenting Evaluation for Collaborative...
   -> Warm Test Cases: 7678
   -> Cold Test Cases: 160


Collaborative:   0%|          | 0/7678 [00:00<?, ?it/s]

Collaborative:   0%|          | 0/160 [00:00<?, ?it/s]

>>> Segmenting Evaluation for Content...
   -> Warm Test Cases: 7678
   -> Cold Test Cases: 160


Content:   0%|          | 0/7678 [00:00<?, ?it/s]

Content:   0%|          | 0/160 [00:00<?, ?it/s]

>>> Segmenting Evaluation for Hybrid...
   -> Warm Test Cases: 7678
   -> Cold Test Cases: 160


Hybrid:   0%|          | 0/7678 [00:00<?, ?it/s]

Hybrid:   0%|          | 0/160 [00:00<?, ?it/s]

In [None]:
seg_df = pd.DataFrame(segments).set_index('Model')
print("\n>>> COLD START VS WARM START PERFORMANCE")
display(seg_df.style.highlight_max(axis=0, color='green'))


>>> COLD START VS WARM START PERFORMANCE


Unnamed: 0_level_0,HR@10 (Warm),HR@10 (Cold)
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Collaborative,0.048059,0.0
Content,0.061995,0.15
Hybrid,0.06981,0.1125


### Interpretation

* **Collaborative Filtering** (0% on Cold Items): It failed completely on new books. If a book has no history, this model is blind. It cannot recommend what it doesn't know.

* **Content-Based** (15% on Cold Items): It is much much better ! It reads the book titles and subjects, so it can recommend new books even without user history. This proves adding this model is necessary.

* **Hybrid** (Best on Warm Items): It achieved the highest score (~7%) on known books. By combining "what people read" (CF) with "what books are about" (Content), it outperforms both individual models.

# Model Upgrade: Matrix Factorization (SVD)

After submitting the hybrid model recommendation in Kaggle, it is clear that our current model is not good enough. While our Hybrid Model achieved a respectable Hit Rate of 7.06%, a deeper analysis reveals a bottleneck in its behavioral component.

* **The Bottleneck:** Our current** Memory-Based **Collaborative Filtering model only achieved 4.71%. It relies on **Exact Overlaps**, finding similarities only if User A and User B read the exact same book. In our sparse dataset, these overlaps are rare, limiting the model's contribution to the Hybrid.

To break this ceiling and boost the overall Hybrid score, we are upgrading the behavioral engine **from Memory-Based to Model-Based** using **Singular Value Decomposition (SVD)**.

## The Logic: Latent Factors (Hidden Concepts)
Instead of looking for direct matches ("User A read Book X"), SVD looks for Hidden Concepts (Latent Factors) that connect users and items.

### Example:
* The model notices that User A reads Isaac Asimov and Frank Herbert.
* It infers a hidden concept: "Classic Sci-Fi".
* It sees that Book Z is heavily associated with "Classic Sci-Fi".
* **Result**: It recommends Book Z to User A, even if no other user in the dataset has read both Herbert and Book Z.2.

## The Math: Decomposing the Matrix
We take the massive Interaction Matrix ($R$) and decompose it into three smaller matrices:

$$R \approx U \cdot \Sigma \cdot V^T$$

* **$U$ (User Factors)**: Represents users' affinity for hidden concepts (e.g., "How much does User A like Romance?").
* **$V^T$ (Item Factors)**: Represents items' relationship to those concepts (e.g., "How much is Book B a Romance book?").
* **$\Sigma$ (Weights)**: The strength/importance of each concept.

**The Prediction**:To predict the score for User $u$ and Item $i$, we simply calculate the Dot Product of their factor vectors:$$\text{Score}(u, i) = \vec{U}_u \cdot \vec{V}_i$$

In [21]:
class SVDRecommender:
    def __init__(self, train_df, n_components=50):
        """
        Matrix Factorization using Truncated SVD.
        n_components: The number of 'Latent Factors' (Hidden concepts) to keep.
        """
        self.train_df = train_df
        self.n_components = n_components

        # 1. Create Mappings for Matrix construction
        self.user_ids = train_df['user_id'].unique()
        self.item_ids = train_df['item_id'].unique()

        self.user2idx = {u: i for i, u in enumerate(self.user_ids)}
        self.item2idx = {item: idx for idx, item in enumerate(self.item_ids)}
        self.idx2item = {idx: item for idx, item in enumerate(self.item_ids)}

        # 2. Build Sparse Interaction Matrix (Rows=Users, Cols=Items)
        print(f"   -> Building Sparse Matrix ({len(self.user_ids)} users x {len(self.item_ids)} items)...")
        rows = train_df['user_id'].map(self.user2idx)
        cols = train_df['item_id'].map(self.item2idx)

        # We use implicit feedback (1 = interacted)
        data = np.ones(len(train_df))

        self.interaction_matrix = sp.coo_matrix(
            (data, (rows, cols)),
            shape=(len(self.user_ids), len(self.item_ids))
        ).tocsr()

        # 3. Train SVD (The Decomposition)
        print(f"   -> Training SVD (n_components={n_components})...")
        self.svd = TruncatedSVD(n_components=n_components, random_state=42)

        # User Factors (U * Sigma): Shape (N_Users, n_components)
        # Represents: "How much does this User like each Concept?"
        self.user_factors = self.svd.fit_transform(self.interaction_matrix)

        # Item Factors (V^T): Shape (n_components, N_Items)
        # Represents: "How much does this Item belong to each Concept?"
        self.item_factors = self.svd.components_

        # Fallback for Cold Start Users
        self.popular_items = train_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10):
        """
        Predicts scores by multiplying User Vector * Item Matrix.
        """
        # A. Cold Start User Check
        if user_id not in self.user2idx:
            return self.popular_items[:top_k]

        user_idx = self.user2idx[user_id]

        # B. Predict Scores (Dot Product)
        # User Vector: (1, 50)
        user_vec = self.user_factors[user_idx].reshape(1, -1)

        # Scores = User(1, 50) . Item(50, 15000) -> Result(1, 15000)
        # This reconstructs the "predicted rating" for all books
        scores = user_vec.dot(self.item_factors).flatten()

        # C. Filter History (Don't recommend what they already read)
        # We look up the original sparse matrix to find indices of read items
        history_indices = self.interaction_matrix[user_idx].indices

        # Set their score to negative infinity so they drop to the bottom
        scores[history_indices] = -np.inf

        # D. Rank & Return
        # Get top K indices
        top_indices = scores.argsort()[-(top_k):][::-1]

        return [self.idx2item[i] for i in top_indices]

In [32]:
# 1. Train SVD (We start with 50 latent factors as a standard baseline)
svd_model = SVDRecommender(train_df, n_components=50)

print("\n>>> EVALUATING: SVD (50 Factors)...")
res_svd = evaluator.evaluate(svd_model, val_df, k=10, model_name="SVD (CF)")

   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=50)...

>>> EVALUATING: SVD (50 Factors)...
>>> Evaluating SVD (CF) on 7838 users...


Eval SVD (CF):   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD (CF)', 'Hit Rate @ 10': 0.01773411584587905, 'MAP @ 10': 0.007206800811674512, 'Novelty': np.float64(11.247516634221176), 'Coverage': 0.03441657290356741}


## Interpretation

score of 1.7% (vs Memory-Based 4.7%) is probably due topUnderfitting.

50 Latent Factors might be too few to capture the complexity of 15,000 unique books. The model is "blurring" the details too much.

Therefore, we need to find the correct Number of Latent Factor, through tuning:

* Low (e.g., 5-10): The model is "simple." It forces all books into broad buckets like "Fiction" vs "Non-Fiction." It might miss nuances (Underfitting).
* High (e.g., 300): The model is "complex." It can memorize specific details, but it risks memorizing noise or random one-time clicks (Overfitting).

We will run a loop, training the SVD model with different sizes and checking the MAP@10 score.

In [34]:
def tune_svd_components(train_df, val_df, evaluator, k=10):
    print(f">>> Tuning SVD Latent Factors (n_components)...")

    # Range of components to test
    component_list = [10, 25, 50, 100, 150, 200, 250, 300]

    results_list = []

    for n in component_list:
        print(f"\n--- Testing n_components = {n} ---")

        # 1. Train
        try:
            model = SVDRecommender(train_df, n_components=n)

            # 2. Evaluate (We use MAP@K as the decision metric)
            metrics = evaluator.evaluate(model, val_df, k=k, model_name=f"SVD-{n}")

            metrics['n_components'] = n
            results_list.append(metrics)

        except Exception as e:
            print(f"   >>> Error with n={n}: {e}")

    # 3. Create a DataFrame to visualize trade-offs
    results_df = pd.DataFrame(results_list).set_index('n_components')

    print("\n>>> TUNING RESULTS DASHBOARD:")
    # Highlight the best values in each column
    display(results_df.style.highlight_max(axis=0, subset=['Hit Rate @ 10', 'MAP @ 10', 'Novelty', 'Coverage'], color='green'))

    # 4. Pick Winner based on Primary Metric (MAP)
    best_n = results_df['MAP @ 10'].idxmax()
    print(f"\n>>> Winner (Best MAP): n={best_n}")

    return best_n

In [35]:
best_n_components = tune_svd_components(train_df, val_df, evaluator)

print(f"\n>>> Optimal SVD Size found: {best_n_components}")

>>> Tuning SVD Latent Factors (n_components)...

--- Testing n_components = 10 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=10)...
>>> Evaluating SVD-10 on 7838 users...


Eval SVD-10:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-10', 'Hit Rate @ 10': 0.0011482521051288594, 'MAP @ 10': 0.0004004706416035837, 'Novelty': np.float64(9.835799288900676), 'Coverage': 0.008074657488913893}

--- Testing n_components = 25 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=25)...
>>> Evaluating SVD-25 on 7838 users...


Eval SVD-25:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-25', 'Hit Rate @ 10': 0.007017096198009697, 'MAP @ 10': 0.0038177357359546684, 'Novelty': np.float64(10.769570478830973), 'Coverage': 0.015355086372360844}

--- Testing n_components = 50 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=50)...
>>> Evaluating SVD-50 on 7838 users...


Eval SVD-50:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-50', 'Hit Rate @ 10': 0.01773411584587905, 'MAP @ 10': 0.007206800811674512, 'Novelty': np.float64(11.247516634221176), 'Coverage': 0.03441657290356741}

--- Testing n_components = 100 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=100)...
>>> Evaluating SVD-100 on 7838 users...


Eval SVD-100:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-100', 'Hit Rate @ 10': 0.01990303648890023, 'MAP @ 10': 0.008812126109268244, 'Novelty': np.float64(11.856739909702865), 'Coverage': 0.07558408895360381}

--- Testing n_components = 150 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=150)...
>>> Evaluating SVD-150 on 7838 users...


Eval SVD-150:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-150', 'Hit Rate @ 10': 0.023220209237050267, 'MAP @ 10': 0.009451917196239739, 'Novelty': np.float64(12.23901114735522), 'Coverage': 0.12151697663644186}

--- Testing n_components = 200 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=200)...
>>> Evaluating SVD-200 on 7838 users...


Eval SVD-200:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-200', 'Hit Rate @ 10': 0.026282214850727226, 'MAP @ 10': 0.009913951971874925, 'Novelty': np.float64(12.488782782222282), 'Coverage': 0.16698656429942418}

--- Testing n_components = 250 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=250)...
>>> Evaluating SVD-250 on 7838 users...


Eval SVD-250:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-250', 'Hit Rate @ 10': 0.025133962745598367, 'MAP @ 10': 0.009344179961684435, 'Novelty': np.float64(12.645325926006866), 'Coverage': 0.20689655172413793}

--- Testing n_components = 300 ---
   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=300)...
>>> Evaluating SVD-300 on 7838 users...


Eval SVD-300:   0%|          | 0/7838 [00:00<?, ?it/s]

   -> Results: {'Model': 'SVD-300', 'Hit Rate @ 10': 0.027175299821383007, 'MAP @ 10': 0.00961038408729147, 'Novelty': np.float64(12.771791279149324), 'Coverage': 0.2544840823350321}

>>> TUNING RESULTS DASHBOARD:


Unnamed: 0_level_0,Model,Hit Rate @ 10,MAP @ 10,Novelty,Coverage
n_components,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
10,SVD-10,0.001148,0.0004,9.835799,0.008075
25,SVD-25,0.007017,0.003818,10.76957,0.015355
50,SVD-50,0.017734,0.007207,11.247517,0.034417
100,SVD-100,0.019903,0.008812,11.85674,0.075584
150,SVD-150,0.02322,0.009452,12.239011,0.121517
200,SVD-200,0.026282,0.009914,12.488783,0.166987
250,SVD-250,0.025134,0.009344,12.645326,0.206897
300,SVD-300,0.027175,0.00961,12.771791,0.254484



>>> Winner (Best MAP): n=200

>>> Optimal SVD Size found: 200


## Interpration after tuning SDV

SVD is still weaker alone doesn't mean it's useless :

* Memory CF finds Local Neighbors.
* SVD finds Global Structures.
* Content finds Metadata Matches.

So combining all should still boost the score !

# TODO SAVE MODEL + PARAM

# Model Upgrade : Hybrid SVD Content

We extend our Reciprocal Rank Fusion (RRF) formula to handle three inputs.

### Step A: Normalization (Rank to Score)
For every item $i$ recommended by any model, we convert its Rank $r$ into a Score $S$:$$S(i) = \frac{1}{r + 1}$$

##Step B: The Weighted Ensemble Equation
We define three weights: $\alpha, \beta, \gamma$ (where $\alpha + \beta + \gamma = 1.0$).

$$FinalScore(u, i) = \alpha \cdot S_{Memory}(i) + \beta \cdot S_{SVD}(i) + \gamma \cdot S_{Content}(i)$$

* $\alpha$ (Memory Weight): Trust exact history matches.
* $\beta$ (SVD Weight): Trust latent concepts.
* $\gamma$ (Content Weight): Trust metadata/keywords.


In [56]:
best_svd_model = SVDRecommender(train_df, n_components=best_n_components)

   -> Building Sparse Matrix (7838 users x 14979 items)...
   -> Training SVD (n_components=200)...


In [58]:
class Hybrid_SVD:
    def __init__(self, memory_model, svd_model, content_model, content_alpha=0.5):
        """
        3-Way Hybrid Recommender.
        """
        self.memory_model = memory_model
        self.svd_model = svd_model
        self.content_model = content_model
        self.content_alpha = content_alpha

    def recommend(self, user_id, top_k=10, weights=(0.33, 0.33, 0.33)):
        """
        weights: Tuple (w_memory, w_svd, w_content). Must sum to 1.0.
        """
        w_mem, w_svd, w_cont = weights

        # 1. Get Candidates from all 3 models
        mem_items = self.memory_model.recommend(user_id, top_k=top_k*2)
        svd_items = self.svd_model.recommend(user_id, top_k=top_k*2)
        cont_items = self.content_model.recommend(user_id, top_k=top_k*2, alpha=self.content_alpha)

        # 2. Rank-Based Scoring (1.0 for 1st, 0.5 for 2nd...)
        mem_scores = {item: 1.0/(i+1) for i, item in enumerate(mem_items)}
        svd_scores = {item: 1.0/(i+1) for i, item in enumerate(svd_items)}
        cont_scores = {item: 1.0/(i+1) for i, item in enumerate(cont_items)}

        # 3. Weighted Fusion
        all_items = set(mem_scores.keys()) | set(svd_scores.keys()) | set(cont_scores.keys())
        final_scores = {}

        for item in all_items:
            score = (w_mem * mem_scores.get(item, 0.0)) + \
                    (w_svd * svd_scores.get(item, 0.0)) + \
                    (w_cont * cont_scores.get(item, 0.0))
            final_scores[item] = score

        # 4. Sort & Return
        sorted_items = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)
        return [item for item, score in sorted_items[:top_k]]

In [59]:
# Instantiate
hybrid_svd = Hybrid_SVD(
    memory_model=cf_model,
    svd_model=best_svd_model,
    content_model=content_model,
    content_alpha=config.get('best_content_alpha', 0.5)
)
print(">>> Hybrid_SVD Class Ready.")

>>> Hybrid_SVD Class Ready.


## Tuning Hyperparameters

Since we have 3 weights ($\alpha, \beta, \gamma$), we need a nested loop to find the best combination.

In [60]:
def tune_ensemble_weights(model, test_df, k=10):
    print(">>> Tuning Ensemble Weights (Memory vs SVD vs Content)...")

    results = {}

    # Grid Search: Generate combinations that sum to 1.0
    # Step size 0.2 to keep it fast
    steps = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]

    combinations = []
    for w1 in steps:
        for w2 in steps:
            if w1 + w2 <= 1.0:
                w3 = 1.0 - w1 - w2
                # Round to avoid float precision errors (e.g. 0.99999)
                w3 = round(w3, 1)
                combinations.append((w1, w2, w3))

    print(f"   -> Testing {len(combinations)} weight combinations...")

    for weights in tqdm(combinations):
        hits = 0
        total = 0

        # Run on subset if needed, or full set
        for _, row in test_df.iterrows():
            user_id = row['user_id']
            target = row['item_id']
            try:
                recs = model.recommend(user_id, top_k=k, weights=weights)
                if target in recs:
                    hits += 1
            except:
                pass
            total += 1

        score = hits / total
        results[weights] = score

    # Find Winner
    best_weights = max(results, key=results.get)
    print(f"\n>>> Best Ensemble Weights: {best_weights}")
    print(f"   -> Memory CF: {best_weights[0]}")
    print(f"   -> SVD CF:    {best_weights[1]}")
    print(f"   -> Content:   {best_weights[2]}")
    print(f"   -> Hit Rate:  {results[best_weights]:.4%}")

    return best_weights

In [61]:
# Execute
best_ensemble_weights = tune_ensemble_weights(hybrid_svd, val_df)

print(best_ensemble_weights)

>>> Tuning Ensemble Weights (Memory vs SVD vs Content)...
   -> Testing 21 weight combinations...


  0%|          | 0/21 [00:00<?, ?it/s]


>>> Best Ensemble Weights: (0.4, 0.0, 0.6)
   -> Memory CF: 0.4
   -> SVD CF:    0.0
   -> Content:   0.6
   -> Hit Rate:  7.0554%
(0.4, 0.0, 0.6)


In [62]:
# Update Config & Save
config['best_sdv_weights'] = best_ensemble_weights
joblib.dump(config, f'{BASE_PATH}models/03_best_params.pkl')
print("\n>>> Final Config Saved with Ensemble Weights.")


>>> Final Config Saved with Ensemble Weights.
