In [2]:
# import libraries
import numpy as np
import pandas as pd
from IPython.display import display
from IPython.core.display import HTML
from sklearn.model_selection import train_test_split
from scipy.sparse.linalg import svds
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from tqdm import tqdm

In [3]:
df_books = pd.read_csv("cleaned_books.csv")
df_ratings = pd.read_csv("cleaned_ratings.csv")

In [4]:
# limit the ratings per book to max 1000
df_ratings = df_ratings[df_ratings["book_id"].isin(df_books.index)]
df_ratings = (
    df_ratings.groupby("book_id")
    .apply(lambda x: x.sample(min(1000, len(x))))
    .reset_index(drop=True)
)

In [4]:
len(df_ratings)  

3457285

In [5]:
# Map IDs to indices
user_map = {u: i for i, u in enumerate(df_ratings['user_id'].unique())}
item_map = {b: i for i, b in enumerate(df_ratings['book_id'].unique())}
df_ratings['user_idx'] = df_ratings['user_id'].map(user_map)
df_ratings['item_idx'] = df_ratings['book_id'].map(item_map)

n_users = len(user_map)
n_items = len(item_map)

In [6]:
train_df, test_df = train_test_split(df_ratings, test_size=0.2, random_state=42)

In [7]:
R_train = np.zeros((n_users, n_items))

for row in train_df.itertuples():
    R_train[row.user_idx, row.item_idx] = row.rating

In [8]:
k = 5  # Number of latent factors
U, sigma, Vt = svds(R_train, k=k)
sigma = np.diag(sigma)

predicted_ratings_matrix = np.dot(np.dot(U, sigma), Vt)

In [9]:
def compute_rmse(pred_matrix, test_df):
    preds = []
    truths = []

    for row in test_df.itertuples():
        user_idx = row.user_idx
        item_idx = row.item_idx
        pred_rating = pred_matrix[user_idx, item_idx]
        preds.append(pred_rating)
        truths.append(row.rating)

    rmse = np.sqrt(mean_squared_error(truths, preds))
    return round(rmse, 4)

In [10]:
def compute_mae(pred_matrix, test_df):
    preds = []
    truths = []

    for row in test_df.itertuples():
        user_idx = row.user_idx
        item_idx = row.item_idx
        pred_rating = pred_matrix[user_idx, item_idx]
        preds.append(pred_rating)
        truths.append(row.rating)

    mae = mean_absolute_error(truths, preds)
    return round(mae, 4)

In [11]:
def apk(actual, predicted, k=5):
    if not actual:
        return 0.0
    predicted = predicted[:k]

    score = 0.0
    hits = 0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1
            score += hits / (i + 1)
    return score / min(len(actual), k)

def reciprocal_rank(actual, predicted):
    for i, p in enumerate(predicted):
        if p in actual:
            return 1.0 / (i + 1)
    return 0.0

In [12]:
def pre_map_mrr(pred_matrix, train_df, test_df, k=10, threshold=4.0):
    correct = 0
    total = 0
    total_score = 0
    total_rr = 0
    user_count = 0
    train_user_items = train_df.groupby("user_idx")["item_idx"].apply(set)

    for user in tqdm(test_df['user_idx'].unique(), desc="Processing evaluation (MRR, MAP, Precision)", unit="user"):
        rated_items = train_user_items.get(user, set())
        user_pred = pred_matrix[user]
        
        # Exclude items seen in training
        unseen_items = np.setdiff1d(np.arange(pred_matrix.shape[1]), list(rated_items))
        top_k_items = unseen_items[np.argsort(user_pred[unseen_items])[::-1][:k]]

        # True relevant items in test set for this user
        user_test = test_df[test_df['user_idx'] == user]
        relevant_items = user_test[user_test['rating'] >= threshold]['item_idx'].values
        
        # Calculate precision
        correct += len(set(top_k_items) & set(relevant_items))
        total += k
        
        actual_items = test_df[(test_df['user_idx'] == user) & (test_df['rating'] >= threshold)]['item_idx'].tolist()
        #Calculate MAP and MRR
        if actual_items:
            total_score += apk(actual_items, list(top_k_items), k)
            total_rr += reciprocal_rank(actual_items, list(top_k_items))
            user_count += 1      

    precision = round(correct / total, 4)
    map = round(total_score / user_count, 4) if user_count else 0
    mrr = round(total_rr / user_count, 4) if user_count else 0
    return precision, map, mrr


In [29]:
print("RMSE:", compute_rmse(predicted_ratings_matrix, test_df))
print("MAE:", compute_mae(predicted_ratings_matrix, test_df))

precision, map, mrr = pre_map_mrr(predicted_ratings_matrix, train_df, test_df, k=5, threshold=4.0)
print("MAP@5:", map)
print("MRR@5:", mrr)
print("Precision@5:", precision)


RMSE: 3.9095
MAE: 3.7877


Processing evaluation (MRR, MAP, Precision): 100%|██████████| 53408/53408 [16:37<00:00, 53.53user/s]


MAP@5: 0.0185
MRR@5: 0.0574
Precision@5: 0.0284


In [13]:
def recommend_books(user_id, R_train, predicted_ratings, n=5):
    user_idx = user_map[user_id]
    user_ratings = R_train[user_idx]
    preds = predicted_ratings[user_idx]
    
    # Books not rated by user
    unrated_indices = np.where(user_ratings == 0)[0]
    recommended_indices = unrated_indices[np.argsort(preds[unrated_indices])[::-1][:n]]

    # Map back to book IDs
    item_map_rev = {i: b for b, i in item_map.items()}
    recommended_books = [(item_map_rev[i], preds[i]) for i in recommended_indices]
    return recommended_books

In [14]:
# Function to display images in a DataFrame
def display_images(df, image_column):
    # Create an HTML representation of the DataFrame with images
    html = df.to_html(escape=False, formatters={
        image_column: lambda url: f'<img src="{url}" width="100">'
    })
    display(HTML(html))

In [15]:
rcmd = recommend_books(123, R_train, predicted_ratings_matrix)
rcmd_df = pd.DataFrame(rcmd, columns=["book_id", "predicted_rating"])
rcmd_df = rcmd_df.merge(df_books[['book_id', 'title', 'image_url']], on='book_id', how='left')
rcmd_df['predicted_rating'] = rcmd_df['predicted_rating'].round(4)
display_images(rcmd_df, 'image_url')

Unnamed: 0,book_id,predicted_rating,title,image_url
0,1159,0.134,Stones from the River,
1,437,0.1331,The Invention of Wings,
2,291,0.1306,Cutting for Stone,
3,1198,0.1301,A Little Life,
4,267,0.13,The Nightingale,
