In [32]:
import pandas as pd
import numpy as np
import scipy.sparse as sp
import joblib
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Class Definition

Because we are working with Notebooks only, we MUST re-define the Classes (`CollaborativeRecommender`, etc.) in this new notebook before we can load them. Python needs the "blueprint" to understand the saved "object."


**TODO:** Move these classes into a `.py` file for the submission, because it is cleaner.

In [33]:
class CollaborativeRecommender:
    """Item-item collaborative filtering built on sparse interaction matrices.

    Items are scored for a user by how many users they share with the items
    already in that user's history. Users that are unknown (or have an empty
    history) receive the most frequently interacted items instead.
    """

    def __init__(self, train_df):
        """
        Item-Item Collaborative Filtering using Sparse Matrices.

        Args:
            train_df: DataFrame with 'user_id' and 'item_id' columns,
                one row per interaction.
        """
        self.train_df = train_df
        self.user_ids = train_df['user_id'].unique()
        self.item_ids = train_df['item_id'].unique()
        # Dense 0..n-1 positions for users/items, in order of first appearance.
        self.user2idx = {u: i for i, u in enumerate(self.user_ids)}
        self.item2idx = {item: i for i, item in enumerate(self.item_ids)}
        self.idx2item = {i: item for i, item in enumerate(self.item_ids)}

        rows = train_df['user_id'].map(self.user2idx)
        cols = train_df['item_id'].map(self.item2idx)
        data = np.ones(len(train_df))

        # users x items. Converting COO -> CSR sums duplicate (user, item)
        # entries, so a pair that appears several times in train_df gets a
        # weight greater than 1.
        self.interaction_matrix = sp.coo_matrix(
            (data, (rows, cols)),
            shape=(len(self.user_ids), len(self.item_ids))
        ).tocsr()

        # items x users, used both to build the user profile and to score items.
        self.item_user_matrix = self.interaction_matrix.T.tocsr()
        # Fallback list for cold-start users (at most 20 items).
        self.popular_items = train_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10):
        """Return up to ``top_k`` item ids the user has not interacted with.

        Args:
            user_id: Id of the user to recommend for.
            top_k: Maximum number of items to return; values <= 0 yield ``[]``.

        Returns:
            List of item ids ordered from highest to lowest score. For unknown
            users or users without history, the first ``top_k`` entries of
            ``popular_items`` (which holds at most 20 items).
        """
        if top_k <= 0:
            return []

        if user_id not in self.user2idx:
            return self.popular_items[:top_k]

        user_idx = self.user2idx[user_id]
        history_indices = self.interaction_matrix[user_idx].indices

        if len(history_indices) == 0:
            return self.popular_items[:top_k]

        # Set lookup instead of scanning the index array once per candidate.
        seen = set(history_indices.tolist())

        # Profile = summed user-vectors of the items in the history; an item's
        # score is the dot product of its own user-vector with that profile.
        relevant_item_vecs = self.item_user_matrix[history_indices]
        user_profile = np.asarray(relevant_item_vecs.sum(axis=0)).flatten()
        scores = self.item_user_matrix.dot(user_profile)
        top_indices = scores.argsort()[::-1]

        recommendations = []
        for idx in top_indices.tolist():
            if idx in seen:
                continue
            recommendations.append(self.idx2item[idx])
            if len(recommendations) >= top_k:
                break

        return recommendations

class ContentRecommender:
    """Content-based recommender blending MiniLM-embedding and TF-IDF similarity.

    A user profile is the mean vector of the items in the user's history, taken
    separately in each feature space. Every catalogue row is scored by a
    weighted mix of the two cosine similarities to that profile.
    """

    def __init__(self, interactions_df, tfidf_matrix, minilm_embeddings, item_to_row_idx):
        """
        Args:
            interactions_df: DataFrame with 'user_id' and 'item_id' columns.
            tfidf_matrix: Row-indexable TF-IDF matrix, one row per item
                (the notebook loads it as a scipy sparse matrix).
            minilm_embeddings: Array of item embeddings, one row per item,
                using the same row order as ``tfidf_matrix``.
            item_to_row_idx: Dict mapping item_id -> row index in both matrices.
        """
        self.interactions_df = interactions_df
        self.tfidf_matrix = tfidf_matrix
        self.minilm_embeddings = minilm_embeddings
        self.item_map = item_to_row_idx

        # Reverse lookup: matrix row -> item_id.
        self.idx_to_item = {v: k for k, v in item_to_row_idx.items()}

        # Fallback list for cold-start users (at most 20 items).
        self.popular_items = interactions_df['item_id'].value_counts().head(20).index.tolist()

    def recommend(self, user_id, top_k=10, alpha=0.5):
        """Return up to ``top_k`` item ids the user has not interacted with.

        Args:
            user_id: Id of the user to recommend for.
            top_k: Maximum number of items to return; values <= 0 yield ``[]``.
            alpha: Weight of the MiniLM similarity; ``1 - alpha`` is applied to
                the TF-IDF similarity.

        Returns:
            List of item ids ordered from highest to lowest blended score. If
            the user has no history, or none of their items are in the
            catalogue map, the first ``top_k`` entries of ``popular_items``.
        """
        if top_k <= 0:
            return []

        user_history = self.interactions_df[self.interactions_df['user_id'] == user_id]['item_id'].unique()

        if len(user_history) == 0:
            return self.popular_items[:top_k]

        valid_indices = [self.item_map[i] for i in user_history if i in self.item_map]

        if not valid_indices:
            return self.popular_items[:top_k]

        user_prof_mini = np.mean(self.minilm_embeddings[valid_indices], axis=0).reshape(1, -1)
        # A sparse .mean() returns np.matrix; convert to a plain 2-D ndarray
        # before handing it to cosine_similarity.
        user_prof_tfidf = np.asarray(self.tfidf_matrix[valid_indices].mean(axis=0))
        sim_mini = cosine_similarity(user_prof_mini, self.minilm_embeddings).flatten()
        sim_tfidf = cosine_similarity(user_prof_tfidf, self.tfidf_matrix).flatten()
        final_scores = (alpha * sim_mini) + ((1 - alpha) * sim_tfidf)
        candidate_indices = final_scores.argsort()[::-1]

        # Set lookup instead of scanning the history array once per candidate.
        seen_items = set(user_history.tolist())

        recommendations = []
        for idx in candidate_indices.tolist():
            item_id = self.idx_to_item.get(idx)

            # Skip rows with no catalogue entry (previously leaked None into
            # the output) and items the user already interacted with.
            if item_id is None or item_id in seen_items:
                continue

            recommendations.append(item_id)
            if len(recommendations) >= top_k:
                break

        return recommendations

class HybridRecommender:
    """Fuses collaborative and content-based rankings with reciprocal-rank scores."""

    def __init__(self, cf_model, content_model, content_alpha=0.5):
        """
        Args:
            cf_model: Object exposing ``recommend(user_id, top_k=...)``.
            content_model: Object exposing
                ``recommend(user_id, top_k=..., alpha=...)``.
            content_alpha: ``alpha`` forwarded to ``content_model.recommend``.
        """
        self.cf_model = cf_model
        self.content_model = content_model
        self.content_alpha = content_alpha

    @staticmethod
    def _reciprocal_rank_scores(ranked_items):
        """Map each item to 1/rank (rank starts at 1); a repeated item keeps its best rank."""
        scores = {}
        for position, item in enumerate(ranked_items):
            scores.setdefault(item, 1.0 / (position + 1))
        return scores

    def recommend(self, user_id, top_k=10, hybrid_alpha=0.5):
        """Return the ``top_k`` items by blended reciprocal-rank score.

        Args:
            user_id: Id of the user to recommend for.
            top_k: Number of items to return.
            hybrid_alpha: Weight of the collaborative score; ``1 - hybrid_alpha``
                is applied to the content score.

        Returns:
            List of item ids, best first. Ties are broken deterministically:
            collaborative candidates come first (in their rank order), then
            content-only candidates (in their rank order).
        """
        # Ask each model for a much larger pool (50x) so the two candidate
        # lists have a chance to overlap before fusion.
        cf_items = self.cf_model.recommend(user_id, top_k=top_k*50)
        content_items = self.content_model.recommend(user_id, top_k=top_k*50, alpha=self.content_alpha)

        cf_scores = self._reciprocal_rank_scores(cf_items)
        content_scores = self._reciprocal_rank_scores(content_items)

        # dict preserves insertion order, unlike the set union used before,
        # whose iteration order made tied scores come out in hash order.
        hybrid_scores = {}
        for item in dict.fromkeys([*cf_scores, *content_scores]):
            s_cf = cf_scores.get(item, 0.0)
            s_content = content_scores.get(item, 0.0)
            hybrid_scores[item] = (hybrid_alpha * s_cf) + ((1 - hybrid_alpha) * s_content)

        # sorted() is stable, so equal scores keep the insertion order above.
        sorted_items = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

        return [item for item, score in sorted_items[:top_k]]

# Load the data and models

In [34]:
# Root of the team project folder on the mounted Google Drive; the data and
# model files loaded below are addressed relative to it.
BASE_PATH = '/content/drive/MyDrive/Colab Notebooks/HEC/Data Science and Machine Learning/Team Project/'

def load_production_resources():
    """Load the full interaction history, item feature matrices and tuned alphas.

    The feature matrices on disk have one row per row of the index map, so an
    item that appears several times has several rows. They are reduced here to
    one row per unique item (the first occurrence is kept).

    Returns:
        Tuple ``(full_catalog, unique_tfidf, unique_minilm, item_to_row_idx, config)``:
        the interactions DataFrame, the de-duplicated TF-IDF matrix, the
        de-duplicated MiniLM embeddings, a dict mapping item_id -> row in both
        matrices, and the hyperparameter dict (defaults if it cannot be loaded).
    """
    print(">>> LOADING FULL PRODUCTION DATA...")

    # 1. Load Full History (The Truth Source)
    # We use this as the "Training Data" for the final models
    full_catalog = pd.read_csv(BASE_PATH + 'data/interactions_merged.csv')
    print(f"   -> Full History Loaded: {len(full_catalog)} rows")

    # 2. Load the Map (87k rows)
    index_map = pd.read_csv(BASE_PATH + 'data/02_interaction_index_map.csv')

    # 3. Load & Deduplicate Matrices
    print("   -> Loading & Cleaning Matrices...")
    tfidf_bloated = sp.load_npz(BASE_PATH + 'data/02_tfidf_matrix.npz')
    minilm_bloated = np.load(BASE_PATH + 'data/02_minilm_embeddings.npy')

    # Deduplication Logic
    # read_csv without index_col yields a 0..n-1 index, so the index labels of
    # the surviving rows double as row positions in the matrices.
    # NOTE(review): assumes row i of the index map describes row i of both
    # matrices - confirm against the notebook that produced them.
    unique_map = index_map.drop_duplicates(subset='item_id', keep='first')
    unique_indices = unique_map.index.values

    # Slice the matrices to keep only unique book vectors
    unique_tfidf = tfidf_bloated[unique_indices]
    unique_minilm = minilm_bloated[unique_indices]

    # Create the "Library Catalog" (Item ID -> Matrix Row)
    unique_item_ids = unique_map['item_id'].values
    item_to_row_idx = {item_id: i for i, item_id in enumerate(unique_item_ids)}

    print(f"   -> Unique Items in Catalog: {len(item_to_row_idx)}")
    print(f"   -> Matrix Shapes: TF-IDF {unique_tfidf.shape}, MiniLM {unique_minilm.shape}")

    # 4. Load Hyperparameters (The Best Alphas we found in Lab)
    # joblib.load unpickles the file: only point this at files you trust.
    try:
        config = joblib.load(BASE_PATH + 'models/03_best_params.pkl')
        print(f"   -> Loaded Best Params: {config}")
    except Exception as exc:
        # Still best-effort, but no longer a bare except: KeyboardInterrupt and
        # SystemExit propagate, and the actual cause is shown to the user.
        print(f"   Config not found. Using defaults. ({type(exc).__name__}: {exc})")
        config = {'best_content_alpha': 0.5, 'best_hybrid_alpha': 0.4}

    return full_catalog, unique_tfidf, unique_minilm, item_to_row_idx, config

# Generate Submission Function

We create a function that can be reused with any of the models.

In [57]:
def generate_kaggle_submission(model, target_user_ids, k=10, **kwargs):
    """
    Build a Kaggle-style submission table from any recommender.

    Args:
        model: Trained model object exposing ``.recommend(user_id, top_k=..., ...)``.
        target_user_ids: Sized collection of user_ids to predict for.
        k: Number of recommendations requested per user.
        **kwargs: Extra arguments forwarded to ``.recommend``
            (e.g., hybrid_alpha=0.6).

    Returns:
        DataFrame with columns 'user_id' and 'recommendation', sorted by
        'user_id'. Each recommendation is a space-separated string of item ids;
        it is empty for a user whose prediction raised an exception.
    """
    print(f">>> Generating predictions for {len(target_user_ids)} users...")

    def _recs_as_string(user_id):
        # One failing user must not abort the whole run: report it and emit an
        # empty prediction for that row.
        try:
            predicted = model.recommend(user_id, top_k=k, **kwargs)
            return " ".join(str(r) for r in predicted)
        except Exception as e:
            print(f">>> Error generating recommendations for user {user_id}: {e}")
            return ""

    # tqdm renders the progress bar while the rows are produced.
    rec_strings = [_recs_as_string(uid) for uid in tqdm(target_user_ids)]

    # Column layout expected by Kaggle.
    table = pd.DataFrame({
        'user_id': target_user_ids,
        'recommendation': rec_strings,
    })

    return table.sort_values(by='user_id')

# Create Recommendations

In [58]:
# Load everything once: full interaction history, de-duplicated item feature
# matrices, the item_id -> matrix-row map, and the tuned alpha values.
full_df, item_tfidf, item_minilm, item_map, best_params = load_production_resources()

>>> LOADING FULL PRODUCTION DATA...
   -> Full History Loaded: 87045 rows
   -> Loading & Cleaning Matrices...
   -> Unique Items in Catalog: 15109
   -> Matrix Shapes: TF-IDF (15109, 15000), MiniLM (15109, 384)
   -> Loaded Best Params: {'best_content_alpha': 0.5, 'best_hybrid_alpha': 0.4}


## Re-training on Full Data

In [59]:
print("\n>>> BUILDING FINAL PRODUCTION MODEL...")

# Step 1 - collaborative filtering fitted on the complete interaction history.
print("   -> Training Collaborative Model (Full Data)...")
cf_full = CollaborativeRecommender(train_df=full_df)

# Step 2 - content-based model over the same history and the item features.
print("   -> Training Content Model (Full Data)...")
content_full = ContentRecommender(
    interactions_df=full_df,
    tfidf_matrix=item_tfidf,
    minilm_embeddings=item_minilm,
    item_to_row_idx=item_map,
)

# Step 3 - hybrid wrapper around both models, using the tuned content alpha.
print(f"   -> Assembling Hybrid Model (Content Alpha: {best_params['best_content_alpha']})...")
hybrid_full = HybridRecommender(
    cf_model=cf_full,
    content_model=content_full,
    content_alpha=best_params['best_content_alpha'],
)

print("\n>>> Final Models Ready for Submission.")


>>> BUILDING FINAL PRODUCTION MODEL...
   -> Training Collaborative Model (Full Data)...
   -> Training Content Model (Full Data)...
   -> Assembling Hybrid Model (Content Alpha: 0.5)...

>>> Final Models Ready for Submission.


In [60]:
# Predict for every user that appears in the interaction history.
target_users = full_df['user_id'].unique()
# Tuned hybrid weight; 0.5 is used only if the key is missing from the config.
optimal_alpha = best_params.get('best_hybrid_alpha', 0.5)

final_submission = generate_kaggle_submission(
    hybrid_full,
    target_users,
    k=10,
    hybrid_alpha=optimal_alpha,
)

>>> Generating predictions for 7838 users...


  0%|          | 0/7838 [00:00<?, ?it/s]

In [61]:
# Display the submission table to eyeball the output format.
final_submission

Unnamed: 0,user_id,recommendation
2,0,13261 6286 4512 8404 8506 1583 9021 2641 5329 ...
130,1,7158 11549 9897 14597 14107 3596 103 2489 1506...
211,2,8504 3055 140 11184 15023 2198 10715 2550 9907...
180,3,1404 611 5412 10475 14107 4861 13021 766 625 2553
856,4,248 4418 8300 14601 9461 1474 14602 5654 2225 ...
...,...,...
2596,7833,10967 7306 12967 9610 1275 4921 611 1370 7308 ...
3349,7834,14991 11184 3062 7324 14990 54 11061 8050 1105...
5052,7835,9310 53 12813 11184 12811 2163 14578 4861 54 1...
5261,7836,3062 14555 3816 13587 14991 14989 14166 14990 ...


In [62]:
# Order by user_id so the file is sorted even if this cell is run on its own.
final_submission = final_submission.sort_values(by='user_id')
# Save the submission. The path is built from BASE_PATH (it resolves to the
# same file as before) instead of repeating the absolute Drive path.
final_submission.to_csv(BASE_PATH + "submissions/submission_HybridRecommender.csv", index=False)

Hybrid recommender system combining Collaborative Filtering and Content-Based (using MiniLM embeddings + TF-IDF matrix) recommenders.

Hyper parameters:
Content-Based -> alpha = 0.5
Hybrid -> alpha = 0.4