In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
train_df = pd.read_csv("../data/processed/train.csv")
train_df.head()


Unnamed: 0,user_id,book_id,rating
0,c82d3a24cf4c6afe923ee906451e7cd9,1499941,4
1,d7941b308469050c693713f00bd9206f,15645,5
2,e744432b77ec0a68d8cf9f0195e92698,19064137,3
3,2b90a2eadd6e47e50ab7c1e868a94262,16128095,4
4,06d2b1d9f173bc4a58824b138d67cc6d,491318,4


In [3]:
user_item_matrix = train_df.pivot_table(
    index='user_id',
    columns='book_id',
    values='rating',
    fill_value=0
)

user_item_matrix.shape


MemoryError: Unable to allocate 9.24 GiB for an array with shape (20020, 61967) and data type float64

In [4]:
# Select top 3000 most-rated books
top_books = (
    train_df['book_id']
    .value_counts()
    .head(3000)
    .index
)

train_subset = train_df[train_df['book_id'].isin(top_books)]
train_subset.shape


(509954, 3)

In [5]:
# Optionally also limit users
top_users = (
    train_subset['user_id']
    .value_counts()
    .head(5000)
    .index
)

train_subset = train_subset[train_subset['user_id'].isin(top_users)]
train_subset.shape


(164454, 3)

In [6]:
user_item_matrix = train_subset.pivot_table(
    index='user_id',
    columns='book_id',
    values='rating',
    fill_value=0
)

user_item_matrix.shape


(5000, 2991)

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

item_similarity = cosine_similarity(user_item_matrix.T)

item_similarity_df = pd.DataFrame(
    item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns
)

item_similarity_df.iloc[:5, :5]


book_id,234,254,286,290,291
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
234,1.0,0.0392,0.0,0.0,0.026215
254,0.0392,1.0,0.0,0.0,0.0
286,0.0,0.0,1.0,0.1226,0.033735
290,0.0,0.0,0.1226,1.0,0.150482
291,0.026215,0.0,0.033735,0.150482,1.0


In [8]:
def predict_rating(user_id, book_id, user_item_matrix, item_similarity_df, k=10):
    """
    Predict rating of user_id for book_id using item-based CF
    """
    # If user or book not in matrix, return None
    if user_id not in user_item_matrix.index or book_id not in user_item_matrix.columns:
        return None

    # Get user's ratings
    user_ratings = user_item_matrix.loc[user_id]

    # Books rated by user
    rated_books = user_ratings[user_ratings > 0]

    if rated_books.empty:
        return None

    # Similarity scores for target book
    similarities = item_similarity_df[book_id]

    # Keep only books rated by user
    similarities = similarities[rated_books.index]

    # Select top-k similar books
    top_k = similarities.sort_values(ascending=False).head(k)

    # Weighted average
    numerator = np.dot(top_k.values, rated_books[top_k.index].values)
    denominator = np.sum(np.abs(top_k.values))

    if denominator == 0:
        return None

    return numerator / denominator


In [9]:
sample_user = user_item_matrix.index[0]
sample_book = user_item_matrix.columns[1]

predict_rating(sample_user, sample_book, user_item_matrix, item_similarity_df)


In [10]:
def recommend_books(user_id, user_item_matrix, item_similarity_df, k=10):
    """
    Recommend top-k books for a user
    """
    if user_id not in user_item_matrix.index:
        return []

    user_ratings = user_item_matrix.loc[user_id]

    # Books not rated by user
    unrated_books = user_ratings[user_ratings == 0].index

    predictions = []

    for book_id in unrated_books:
        pred = predict_rating(
            user_id, book_id,
            user_item_matrix,
            item_similarity_df
        )
        if pred is not None:
            predictions.append((book_id, pred))

    # Sort by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    return predictions[:k]


In [11]:
recommend_books(sample_user, user_item_matrix, item_similarity_df, k=5)


[(22990, 5.0), (87047, 5.0), (133380, 5.0), (145659, 5.0), (150236, 5.0)]

In [14]:
#This freezes the model for the deployment
import pickle

with open("../models/user_item_matrix.pkl", "wb") as f:
    pickle.dump(user_item_matrix, f)

with open("../models/item_similarity.pkl", "wb") as f:
    pickle.dump(item_similarity_df, f)
