In [None]:
#!pip install -r requirements.txt
!pip install numpy==1.24.3 thefuzz

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import kagglehub
import torch

# Fetch the latest version of the Kaggle dataset; the call returns its local folder.
path = kagglehub.dataset_download("arashnic/book-recommendation-dataset")
# path = '../dataset'

# The three raw tables this notebook works with.
books, ratings, users = (
    pd.read_csv(f"{path}/Books.csv"),
    pd.read_csv(f"{path}/Ratings.csv"),
    pd.read_csv(f"{path}/Users.csv"),
)

# Title fragment used for the first lookup below.
book_of_interest = 'don quixote'

Find the books matching `book_of_interest` in the database

In [None]:
# Case-insensitive substring search on the title.
# regex=False: the query is matched literally, so characters such as '(' or '.'
#              in a title fragment cannot be misread as a pattern.
# na=False:    rows with a missing title count as "no match" instead of producing
#              NaN in the mask (missing rows are only dropped in a later cell).
title_matches = books["Book-Title"].str.contains(
    book_of_interest, case=False, na=False, regex=False
)
lotr_books = books[title_matches]
lotr_books.head()

Books

Get rid of images and nan values

In [None]:
# Leave one image column ('Image-URL-M') for later visualization.
# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
books = books.drop(columns=['Image-URL-S', 'Image-URL-L'], errors='ignore')
# Number of rows with at least one missing value (the author observed only 4),
# so they are simply dropped.
print(books.isna().any(axis=1).sum())
books = books.dropna()

I will deduplicate on ('Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher') tuples later, because deduplicating the books table now could discard ratings that are attached to the duplicate entries.

In [None]:
# Share of rows that repeat an earlier (title, author, year, publisher) combination.
dedup_keys = ['Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']
num_duplicates = books.duplicated(subset=dedup_keys).sum()
duplicate_pct = 100*num_duplicates/len(books)
print(f"Duplicate books {duplicate_pct:.2f}%")

In [None]:
# Coerce the publication year to a number; unparseable values become NaN.
pub_year = pd.to_numeric(books['Year-Of-Publication'], errors='coerce')
# How many years could not be parsed.
print(pub_year.isna().sum())
# Impute the unparseable years with the mean of the parseable ones.
books = books.assign(**{'Year-Of-Publication': pub_year.fillna(pub_year.mean())})

In [None]:
# Ten authors with the most book entries.
# rename_axis names the index explicitly so that reset_index() always yields a
# 'Book-Author' column: pandas < 2.0 leaves the value_counts index unnamed,
# which would produce an 'index' column and break the lookups below.
author_count = (
    books['Book-Author']
    .value_counts()
    .rename_axis('Book-Author')
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
    .head(10)
)

fig, ax = plt.subplots(1, 1, figsize=(10, 4))
ax.bar(author_count['Book-Author'], author_count['Count'], color='salmon')
ax.set_xticks(range(len(author_count)))
ax.set_xticklabels(author_count['Book-Author'], rotation=45, ha='right')
ax.set_xlabel('Author')
ax.set_ylabel('Number of books')
ax.set_title('Top Authors')
plt.show()

Users

In [None]:
# Number of distinct location strings in the users table.
users['Location'].nunique()

In [None]:
# Replace "(city, region, country)" with just the country: keep the last
# comma-separated component, lower-cased and stripped. The vectorized .str
# accessor propagates a missing location as NaN instead of raising
# AttributeError, as a Python-level `x.split` would.
country = users['Location'].str.split(',').str[-1].str.lower().str.strip()
# Remove stray punctuation characters from the country name.
users['Location'] = country.str.replace(r'[!./@"]', '', regex=True)
users['Location'].nunique()

In [None]:
# Show every distinct cleaned location that contains "spa" (case-insensitive).
unique_locations = pd.Series(users['Location'].unique())
unique_locations[unique_locations.str.contains('spa', case=False, na=False)]

Locations are still pretty dirty. There are around 200 recognized countries in the world, yet our data contains 617 distinct values. Many entries are invalid places like "somewhere in space", or country names that are not written in English, contain typos, use an inconsistent format, and so on...

In [None]:
# Count of missing values in each column of the users table.
users.isna().sum()

In [None]:
# Percentage of users whose age is missing.
missing_age_count = users['Age'].isna().sum()
print(f"Missing ages {100*missing_age_count / len(users['Age']):.2f}%")

In [None]:
# Treat missing ages and implausibly small ages (<= 3) as unknown and impute
# them with the mean of the remaining, valid ages.
# The mean must be taken over the VALID rows (Age > 3). Averaging the rows with
# Age <= 3 after zeroing them gives 0, which would make the imputation a no-op.
invalid_age = users['Age'].isna() | (users['Age'] <= 3)
mean_age = users.loc[~invalid_age, 'Age'].mean()
users.loc[invalid_age, 'Age'] = mean_age  # impute by mean

Ratings

In [None]:
# Preview the first rows of the ratings table.
ratings.head()

In [None]:
# Histogram of the ratings 1..10. The bin edges run from 1 to 11, so rating 0 is
# outside the plotted range; align='left' centres each bar on its integer rating.
fig, ax = plt.subplots()
ax.hist(
    ratings['Book-Rating'],
    bins=range(1, 12),
    align='left',
    color='salmon',
    edgecolor='black',
)
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
ax.set_title('Distribution of explicit book ratings')
ax.set_xticks(range(1, 11))
plt.show()

We have explicit and implicit feedback...

In [None]:
# Choose only explicit ratings for now (a rating of 0 is treated as implicit feedback).
ratings_explicit = ratings[ratings['Book-Rating'] != 0]

# Attach book metadata; the image column is not needed for modelling.
book_ratings = ratings_explicit.merge(books.drop('Image-URL-M', axis=1), on='ISBN')

# Share of implicit ratings in the raw ratings table. This is computed from
# ratings_explicit rather than from book_ratings: the inner merge above also
# drops explicit ratings whose ISBN is absent from `books`, and counting those
# as implicit would inflate the percentage.
implicit = 1 - len(ratings_explicit) / len(ratings)
print(f"Implicit ratings percentage: {implicit:.2%}")

Get the user-book rating matrix

In [None]:
# Join the user table onto the rating+book rows, giving one row per rating
# with book and user metadata attached.
complete_df = pd.merge(book_ratings, users, on="User-ID")

Now that we have the metadata combined with ratings, we can deduplicate and clean up the data some more

In [None]:
# Lower-case title and author so that entries differing only in letter case
# compare equal.
complete_df['Book-Title'] = complete_df['Book-Title'].str.lower()
complete_df['Book-Author'] = complete_df['Book-Author'].str.lower()
# Fraction of rows where the same user rated the same (title, author) again.
# `subset` is passed as an ordered list (the documented form) instead of a set.
complete_df.duplicated(['Book-Title', 'Book-Author', 'User-ID']).sum() / len(complete_df)

In [None]:
# Collapse repeated (title, author, user) rows: average the rating and keep the
# first value of every other column.
group_keys = ['Book-Title', 'Book-Author', 'User-ID']
agg_spec = {'Book-Rating': 'mean'}
for col in complete_df.columns:
    if col not in group_keys and col != 'Book-Rating':
        agg_spec[col] = 'first'
complete_df = complete_df.groupby(group_keys, as_index=False).agg(agg_spec)

Calculate the sparsity of our user-book ratings

In [None]:
def compute_sparsity(ratings_df):
    """Print summary counts for a ratings table and return its sparsity.

    Sparsity is ``1 - rows / (distinct users * distinct books)``, i.e. the share
    of user-book pairs that have no row in ``ratings_df``.

    Args:
        ratings_df: DataFrame with 'User-ID' and 'ISBN' columns, one row per rating.

    Returns:
        The sparsity as a float, or NaN when the frame has no users or no books
        (the ratio is undefined there; this previously raised ZeroDivisionError).
    """
    n_users = ratings_df['User-ID'].nunique()
    n_books = ratings_df['ISBN'].nunique()
    print(f"Number of users: {n_users}")
    print(f"Number of books: {n_books}")

    total_possible = n_users * n_books
    actual_ratings = len(ratings_df)
    print(f"Actual ratings: {actual_ratings}")
    print(f"Total possible ratings: {total_possible}")

    if total_possible == 0:
        # Empty input: there is no user-book matrix to measure.
        print("Sparsity of the user-book ratings: undefined (no users or books)")
        return float('nan')

    sparsity = 1 - (actual_ratings / total_possible)
    print(f"Sparsity of the user-book ratings: {sparsity:.7f}")

    return sparsity

In [None]:
# Sparsity of the user-book ratings in complete_df.
sparsity = compute_sparsity(complete_df)

Our data is very sparse. To try to mitigate this, we can filter out less popular books and inexperienced users.

In [None]:
# Per-row counts of how many ratings the row's user / the row's book has.
ratings_per_user = complete_df.groupby('User-ID')['Book-Rating'].transform('size')
ratings_per_book = complete_df.groupby('ISBN')['Book-Rating'].transform('size')

# Keep rows whose user AND book each have at least 10 ratings. Both counts are
# taken on the unfiltered frame.
mask_users = ratings_per_user >= 10
mask_books = ratings_per_book >= 10

filtered_df = complete_df[mask_users & mask_books]

In [None]:
# Sparsity of the user-book ratings in filtered_df.
sparsity = compute_sparsity(filtered_df)

I will train a classical SVD model and use the learned book vectors in latent space to do item to item similarity.

If the root mean square error (RMSE) is around 1.5 on average, we can safely say that the model's rating predictions are off by no more than 1.5 on average, since the RMSE is always greater than or equal to the mean absolute error (MAE).

In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import KFold
from surprise import accuracy

data_df = filtered_df[['User-ID', 'ISBN', 'Book-Rating']].copy()

# Create the model from the surprise library and perform 5-fold cross-validation.
min_c, max_c = data_df['Book-Rating'].min(), data_df['Book-Rating'].max()
reader = Reader(rating_scale=(min_c, max_c))
data = Dataset.load_from_df(data_df, reader)
# random_state seeds the factor initialisation so results are reproducible,
# matching the seeded KFold below.
model = SVD(n_factors=50, random_state=42)

rmse_test, rmse_train = [], []
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# NOTE: `model` and `train` are deliberately left bound to the LAST fold after
# the loop; the retrieval cells below read both of them.
for train, test in kf.split(data):
    model.fit(train)

    # Held-out error for this fold (accuracy.rmse prints it as well).
    preds = model.test(test)
    rmse_curr = accuracy.rmse(preds)
    rmse_test.append(rmse_curr)

    # Error on the data the model was just fitted on.
    train_testset = train.build_testset()
    preds = model.test(train_testset)
    rmse_curr = accuracy.rmse(preds, verbose=False)
    rmse_train.append(rmse_curr)

print(f"Mean test set RMSE: {np.mean(rmse_test):.4f}")
print(f"Mean train set RMSE: {np.mean(rmse_train):.4f}")

In [None]:
import numpy as np

# ISBN -> normalized (stripped, lower-cased) title for every book in the table.
isbn2title = {}
for isbn, title in zip(books['ISBN'], books['Book-Title']):
    isbn2title[isbn] = title.strip().lower()

# Trainset inner item id -> title; "" when the ISBN has no entry in `books`.
inner2title = {}
for inner_id in range(train.n_items):
    try:
        raw_id = train.to_raw_iid(inner_id)
    except ValueError:
        # Skip ids the trainset does not recognise.
        continue
    inner2title[inner_id] = isbn2title.get(raw_id, "").lower()

# Reverse lookup, title -> inner id. Empty titles are left out; when several
# inner ids share a title, the last one seen wins.
title2inner = {}
for iid, title in inner2title.items():
    if title:
        title2inner[title] = iid

def book2book_retrieval(book_title, model, title2inner, inner2title, top_n=10):
    """Return the books whose item vectors are most cosine-similar to a query book.

    Args:
        book_title: Title to look up (case-insensitive; surrounding whitespace
            is ignored). If there is no exact match, the first title that
            contains the query as a substring is used instead.
        model: Object exposing ``qi``, a 2-D array with one item vector per inner id.
        title2inner: Mapping from lower-cased title to inner item id.
        inner2title: Mapping from inner item id to lower-cased title.
        top_n: Maximum number of recommendations to return.

    Returns:
        A list of ``(title, similarity)`` tuples in descending similarity, with
        the similarity rounded to 3 decimals. Empty when the query is blank or
        matches no title.
    """
    key = book_title.strip().lower()
    if not key:
        # A blank query is a substring of every title; treat it as "no match"
        # rather than silently resolving to an arbitrary book.
        print(f"No match found for '{book_title}'")
        return []

    if key not in title2inner:
        # Find a partial match among titles that can be resolved to an inner id.
        key = next(
            (title for title in inner2title.values() if key in title and title in title2inner),
            None,
        )
        if key is None:
            print(f"No match found for '{book_title}'")
            return []
        print(f"No exact match for '{book_title}' found. Showing results for '{key}' instead.")

    inner_id = title2inner[key]

    # Cosine similarity between the query vector and every item vector.
    item_factors = np.asarray(model.qi, dtype=float)
    norms = np.linalg.norm(item_factors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0  # all-zero vectors get similarity 0 instead of NaN
    unit = item_factors / norms
    sims = unit @ unit[inner_id]

    # Rank on the unrounded scores (rounding first would scramble near-ties),
    # skip the query item itself, and round only for display.
    ranked = [int(iid) for iid in np.argsort(-sims, kind='stable') if iid != inner_id]
    return [(inner2title[iid], round(float(sims[iid]), 3)) for iid in ranked[:top_n]]

In [None]:
book_of_interest = 'the lord of the rings'
# Inner id of the exact title, or None when that exact title is not among the
# trained items (a plain [] lookup would raise KeyError and stop the notebook).
print(title2inner.get(book_of_interest))

In [None]:
# Up to 10 books whose item vectors are most cosine-similar to the query title.
book2book_retrieval(book_of_interest, model, title2inner, inner2title, top_n=10)

In [None]:
# import pickle
#
# with open("models/svd_model/model2.pkl", "wb") as f:
#     pickle.dump(model, f)
#
# with open("models/svd_model/inner2title2.pkl", "wb") as f:
#     pickle.dump(inner2title, f)

[EASE](https://arxiv.org/pdf/1905.03375), but with explicit ratings

In [None]:
# Mean of the explicit (non-zero) ratings; used as the cut-off below.
explicit_values = ratings.loc[ratings['Book-Rating'] > 0, 'Book-Rating']
threshold = explicit_values.mean()

# Keep the zero ratings and the explicit ratings at or above the mean;
# explicit ratings below the mean are discarded.
is_zero = ratings['Book-Rating'] == 0
is_high = ratings['Book-Rating'] >= threshold
ratings_implicit = ratings[is_zero | is_high].copy()

# Binarize: any positive rating becomes 1, a zero rating stays 0.
ratings_implicit['Book-Rating'] = (ratings_implicit['Book-Rating'] > 0).astype('int64')

ratings_implicit.head()

In [None]:
# Rebuild the rating + book + user frame, this time from the binarized ratings.
books_without_image = books.drop('Image-URL-M', axis=1)
book_ratings = pd.merge(ratings_implicit, books_without_image, on='ISBN')
complete_df = pd.merge(book_ratings, users, on='User-ID')
compute_sparsity(complete_df)

In [None]:
# Per-row counts of how many ratings the row's user / the row's book has.
ratings_per_user = complete_df.groupby('User-ID')['Book-Rating'].transform('size')
ratings_per_book = complete_df.groupby('ISBN')['Book-Rating'].transform('size')

# Keep users with at least 3 rows and books with at least 10 rows.
mask_users = ratings_per_user >= 3
mask_books = ratings_per_book >= 10

filtered_df = complete_df[mask_users & mask_books]
compute_sparsity(filtered_df)

In [None]:
# Map every ISBN to a contiguous integer id, in order of first appearance.
unique_isbns = filtered_df['ISBN'].unique()
isbn_to_id = {isbn: idx for idx, isbn in enumerate(unique_isbns)}
id_to_isbn = {idx: isbn for isbn, idx in isbn_to_id.items()}

# assign() returns a new frame, so no column is written onto a filtered slice
# (avoids SettingWithCopyWarning) and the cell stays re-runnable.
filtered_df = filtered_df.assign(ISBN_ID=filtered_df['ISBN'].map(isbn_to_id))

# Dense user x item matrix: rows are users, columns are ISBN ids, and missing
# user-book pairs are filled with 0 (one fillna suffices).
# NOTE(review): pivot raises if a (User-ID, ISBN_ID) pair occurs twice; this
# presumably cannot happen in the source ratings — confirm.
X_id_df = filtered_df.pivot(
    index='User-ID',
    columns='ISBN_ID',
    values='Book-Rating',
).fillna(0)

X = torch.tensor(X_id_df.values, dtype=torch.float32)

In [None]:
# Shape of the dense matrix: rows follow the 'User-ID' pivot index, columns the 'ISBN_ID' ids.
print(X.shape)

In [None]:
# Closed-form EASE solution:
#   P = (X^T X + reg * I)^-1
#   B[i, j] = -P[i, j] / P[j, j]  for i != j,   B[j, j] = 0
I = torch.eye(X.shape[1])
reg = 100 # regularization hyperparameter
P_hat = torch.inverse(X.T @ X + reg * I)
# Dividing by the 1-D diagonal broadcasts along rows, i.e. column j is scaled by
# 1 / P[j, j]. This is the same matrix as I - P * diag(1 / diag(P)) off the
# diagonal, without building an extra n x n temporary.
B_hat = -P_hat / torch.diag(P_hat)
B_hat = B_hat.fill_diagonal_(0)  # an item must not be used to predict itself
print(B_hat.shape)

In [None]:
# import pickle

# with open("model10.pkl", "wb") as f:
#     pickle.dump(B_hat, f)

# with open("df10.pkl", "wb") as f:
#     pickle.dump(filtered_df, f)


In [None]:
from thefuzz import process

def ease_item2item(B, book_name, df, score_cutoff=90, top_n=10):
    """Return the titles EASE ranks highest for the book best matching ``book_name``.

    Args:
        B: Square item-item weight tensor. Row/column ``i`` must correspond to
            the i-th entry of ``df['ISBN'].unique()``.
        book_name: Free-text title; fuzzy-matched against the titles in ``df``.
        df: DataFrame with 'ISBN' and 'Book-Title' columns.
        score_cutoff: Minimum fuzzy-match score required to accept the match.
        top_n: Maximum number of titles to return.

    Returns:
        A list of titles ordered from highest to lowest weight, or ``None``
        when no title matches well enough.
    """
    titles_lower = df['Book-Title'].str.lower()
    unique_titles = list(dict.fromkeys(titles_lower.dropna()))
    unique_isbns = df['ISBN'].unique().tolist()

    best_match = process.extractOne(book_name.lower(), unique_titles)
    if not best_match or best_match[1] < score_cutoff:
        return None

    # Rows of the matched title; the first ISBN found represents it.
    print(f'Matched book: {best_match[0]}')
    matched_ISBNs = df.loc[titles_lower == best_match[0], 'ISBN']
    if matched_ISBNs.empty:
        return None

    inner_id = unique_isbns.index(matched_ISBNs.iloc[0])

    # Work on a copy so B is untouched, and never recommend the query item itself.
    scores = B[inner_id].clone()
    scores[inner_id] = float('-inf')
    # topk raises when k exceeds the number of candidates.
    k = max(0, min(top_n, scores.numel() - 1))
    top_indices = torch.topk(scores, k).indices.tolist()

    # Resolve titles from `df` (the argument, not a notebook global) one index
    # at a time, so the ranking order is preserved.
    isbn2title = dict(zip(df['ISBN'], df['Book-Title']))
    return [isbn2title[unique_isbns[i]] for i in top_indices]

In [None]:
book_of_interest = 'the lord of the rings'
titles = ease_item2item(B_hat, book_of_interest, filtered_df)
# ease_item2item returns None when no title matches well enough; iterating over
# None would raise TypeError.
if titles is None:
    print(f"No match found for '{book_of_interest}'")
else:
    for title in titles:
        print(title)

Since the feedback is so sparse, I'll try a NN-based approach that incorporates book and user metadata. For that, I need to do some more data preprocessing.

In [None]:
# split_idx = int(0.8 * len(filtered_df))
# train_df = filtered_df[:split_idx]
# test_df = filtered_df[split_idx:]

# train = torch.utils.data.DataLoader(Loader(train_df, config['year_intervals'], config['age_intervals']),
#                                     batch_size=config['batch_size'], shuffle=True)
# test = torch.utils.data.DataLoader(Loader(test_df, config['year_intervals'], config['age_intervals']),
#                                     batch_size=config['batch_size'], shuffle=False)

# nn_model = Recommender(
#     filtered_df['User-ID'].nunique(),
#     filtered_df['ISBN'].nunique(),
#     filtered_df['Location'].nunique() * len(config['age_intervals']),
#     filtered_df['Book-Author'].nunique() * len(config['year_intervals']))

# optimizer = torch.optim.AdamW(nn_model.parameters(), lr=1e-3, weight_decay=1e-3)

In [None]:
# for i in range(config['epochs']):
#     nn_model.train()
#     train_losses = []
#     for (x, y) in train:
#         optimizer.zero_grad()
#         y_pred = nn_model(x)
#         metric = torch.sqrt(torch.nn.functional.mse_loss(y_pred.sigmoid() * 10, y))
#         loss = torch.nn.functional.binary_cross_entropy_with_logits(y_pred, y / 10)
#         train_losses.append(metric.item())
#         loss.backward()
#         optimizer.step()

#     nn_model.eval()
#     test_losses = []
#     with torch.no_grad():
#         for (x, y) in test:
#             y_pred = nn_model(x).squeeze(-1)
#             loss = torch.sqrt(torch.nn.functional.mse_loss(y_pred.sigmoid() * 10, y))
#             test_losses.append(loss.item())

#     train_rmse = np.mean(train_losses)
#     test_rmse = np.mean(test_losses)
#     print(f"Epoch {i+1}/{config['epochs']} - Train RMSE: {train_rmse:.4f}, Test RMSE: {test_rmse:.4f}")

In [None]:
# def book2book_retrieval(book_title, train, model, book2idx, idx2book, top_n=10):
#     key = book_title.lower()
#     if key not in title2inner:
#         # find a partial match
#         for inner_id, title in inner2title.items():
#             if book_title.lower() in title:
#                 key = title
#                 break
#         else:
#             print(f"No match found for '{book_title}'")
#             return []
#         print(f"No exact match for '{book_title}' found. Showing results for '{key}' instead.")

#     inner_id = title2inner[key] # the inner key in this case is actually the ISBN

#     # cosine sim
#     author = train.ratings.loc[train.ratings['ISBN'] == inner_id, 'Book-Author'].iloc[0]
#     year_bin = train.ratings.loc[train.ratings['ISBN'] == inner_id, 'Year-Of-Publication'].iloc[0]
#     sims = model.item_embeddings()

#     recs = [
#         (inner2title[iid], round(score, 3))
#         for iid, score in enumerate(sims)
#         if iid != inner_id
#     ]
#     recs.sort(key=lambda x: x[1], reverse=True)
#     return recs[:top_n]

In [None]:
# book2book_retrieval()