### Implementing the following recommender systems using the training set to train them, based on user ratings for books in the amazon dataset.
1. Non-personalized recommender
2. Content-based recommender based on K=50 nearest neighbors
3. User-user collaborative filtering
4. SVD

In [1]:
import re
import io
import time
import math
import numpy as np
import pandas as pd
from scipy import sparse
from sparselsh import LSH
from scipy.sparse import vstack
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from datetime import datetime, timedelta
from collections import defaultdict
# Load Surprise libraries
from surprise import KNNBasic
from surprise import Reader
from surprise import Dataset
from surprise import accuracy
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split
from surprise import accuracy

### Load and Pre-Process Data

#### Amazon Books

In [2]:
df_b = pd.read_csv('../data/amazon/books_meta.csv')
ratings_train_df = pd.read_csv('../data/amazon/books_ratings_training.csv')
ratings_test_df = pd.read_csv('../data/amazon/books_ratings_test.csv')

In [3]:
df_b.head()

Unnamed: 0,item_id,title,price,categories
0,1,Parker & Knight,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
1,2,Need You Now - A Story of Hope,13.97,[]
2,3,Jealousy,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
3,4,Meat Eater: Adventures from the Life of an Ame...,49.0,"['Books', 'Cookbooks, Food & Wine', 'Cooking b..."
4,5,The Last Valley: Dien Bien Phu and the French ...,37.43,"['Books', 'History', 'Military']"


In [4]:
duplicates = df_b.duplicated().sum()
print("Number of duplicate rows:", duplicates)

Number of duplicate rows: 0


In [5]:
missing_values = df_b.isnull().sum()
print(missing_values)

item_id          0
title            0
price         7647
categories       0
dtype: int64


In [6]:
# Replace various representations of empty values with None
df_b.replace(['', ' ', '-', np.nan], None, inplace=True)
df_b.head()

Unnamed: 0,item_id,title,price,categories
0,1,Parker & Knight,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
1,2,Need You Now - A Story of Hope,13.97,[]
2,3,Jealousy,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
3,4,Meat Eater: Adventures from the Life of an Ame...,49.0,"['Books', 'Cookbooks, Food & Wine', 'Cooking b..."
4,5,The Last Valley: Dien Bien Phu and the French ...,37.43,"['Books', 'History', 'Military']"


In [7]:
df_b = df_b[df_b['categories']!='[]']

In [8]:
# df_b[df_b['categories']=='[]']

In [9]:
# Function to split categories, remove excessive spaces, and expand into separate rows
def split_and_expand_categories(row):
    categories = eval(row['categories'])
    expanded_categories = []
    for category in categories:
        sub_categories = [x.strip() for x in category.split(',')]
        clean_sub_categories = [re.sub(r'\s+', ' ', sub_cat).strip() for sub_cat in sub_categories]        
        expanded_categories.extend(clean_sub_categories)
    return pd.Series(expanded_categories)

expanded_categories_df = df_b.apply(split_and_expand_categories, axis=1).stack().reset_index(level=1, drop=True).rename('category')
expanded_df = df_b.drop('categories', axis=1).join(expanded_categories_df).reset_index(drop=True)

expanded_df

Unnamed: 0,item_id,title,price,category
0,1,Parker & Knight,0.0,Books
1,1,Parker & Knight,0.0,Mystery
2,1,Parker & Knight,0.0,Thriller & Suspense
3,1,Parker & Knight,0.0,Thrillers & Suspense
4,3,Jealousy,0.0,Books
...,...,...,...,...
185789,56947,The Secret Teachings of All Ages (Reader's Edi...,19.49,Literature & Fiction
185790,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Books
185791,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Mystery
185792,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Thriller & Suspense


In [10]:
expanded_df = expanded_df.drop_duplicates(subset=['item_id', 'category'])
expanded_df 

Unnamed: 0,item_id,title,price,category
0,1,Parker & Knight,0.0,Books
1,1,Parker & Knight,0.0,Mystery
2,1,Parker & Knight,0.0,Thriller & Suspense
3,1,Parker & Knight,0.0,Thrillers & Suspense
4,3,Jealousy,0.0,Books
...,...,...,...,...
185788,56947,The Secret Teachings of All Ages (Reader's Edi...,19.49,Religion & Spirituality
185789,56947,The Secret Teachings of All Ages (Reader's Edi...,19.49,Literature & Fiction
185790,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Books
185791,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Mystery


In [11]:
# For Checking
# tryi = expanded_df[expanded_df['category']=='']
# tryi
# missing_values = df_b.isnull().sum()
# print(missing_values)
# count_comma_in_category = expanded_categories_df['category'].str.contains(',').sum()
# count_comma_in_category

In [12]:
expanded_categories_df = expanded_df

In [13]:
df = df_b.drop(['categories', 'price'], axis=1)
df

Unnamed: 0,item_id,title
0,1,Parker & Knight
2,3,Jealousy
3,4,Meat Eater: Adventures from the Life of an Ame...
4,5,The Last Valley: Dien Bien Phu and the French ...
5,6,Woodstock: Three Days That Rocked the World
...,...,...
56943,56944,"How Much for Just the Planet? (Star Trek, Book..."
56944,56945,M Is For Mitten: A Michigan Alphabet (Discover...
56945,56946,Manga for the Beginner Shoujo: Everything You ...
56946,56947,The Secret Teachings of All Ages (Reader's Edi...


In [14]:
# Merge expanded categories with ratings_train_df
merged_train_df = pd.merge(ratings_train_df, expanded_categories_df, on='item_id')

# Merge expanded categories with ratings_test_df
merged_test_df = pd.merge(ratings_test_df, expanded_categories_df, on='item_id')

In [15]:
merged_train_df.columns

Index(['user_id', 'item_id', 'rating', 'timestamp', 'title', 'price',
       'category'],
      dtype='object')

In [16]:
merged_train_tags = merged_train_df[['user_id', 'item_id', 'title', 'category', 'timestamp']]
merged_train_tags

Unnamed: 0,user_id,item_id,title,category,timestamp
0,3361,53296,Crescent Dawn (Dirk Pitt Adventure),Books,1543536132144
1,3361,53296,Crescent Dawn (Dirk Pitt Adventure),Literature & Fiction,1543536132144
2,3361,53296,Crescent Dawn (Dirk Pitt Adventure),Action & Adventure,1543536132144
3,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),Books,1543536168823
4,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),Children's Books,1543536168823
...,...,...,...,...,...
262164,6954,53231,Pride and Prejudice,Literature & Fiction,1555050587832
262165,6954,53231,Pride and Prejudice,Classics,1555050587832
262166,6322,47262,The Red Sword (The Red Sword Trilogy Book 1),Books,1555050710933
262167,6322,47262,The Red Sword (The Red Sword Trilogy Book 1),Literature & Fiction,1555050710933


In [17]:
# Merge expanded categories with ratings_train_df
merged_train_df_new = pd.merge(ratings_train_df, df, on='item_id')

# Merge expanded categories with ratings_test_df
merged_test_df_new = pd.merge(ratings_test_df, df, on='item_id')
df_rating_test = merged_test_df_new[['user_id', 'item_id', 'title', 'rating', 'timestamp']]
df_ratings = merged_train_df_new[['user_id', 'item_id', 'title', 'rating', 'timestamp']]
df_ratings

Unnamed: 0,user_id,item_id,title,rating,timestamp
0,3361,53296,Crescent Dawn (Dirk Pitt Adventure),5.0,1543536132144
1,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),5.0,1543536168823
2,342,50326,Love Coloring Book: An Adult Coloring Book wit...,5.0,1543536419204
3,10576,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1543536938535
4,3658,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1546568460586
...,...,...,...,...,...
79085,8464,40568,Eternal Flame: A WWII-1980s Time Travel Love S...,4.0,1555050231806
79086,3840,56077,Moskva: 'The new Le Carre' BBC Radio 2 The Sar...,4.0,1555050392664
79087,6841,26078,Space Activity Book for Kids Ages 4-8: A Fun K...,5.0,1555050413806
79088,6954,53231,Pride and Prejudice,5.0,1555050587832


In [18]:
df_ratings_sorted = df_ratings.sort_values(by='timestamp', axis=0, ascending=False)
df_ratings_sorted

Unnamed: 0,user_id,item_id,title,rating,timestamp
79089,6322,47262,The Red Sword (The Red Sword Trilogy Book 1),5.0,1555050710933
79088,6954,53231,Pride and Prejudice,5.0,1555050587832
79087,6841,26078,Space Activity Book for Kids Ages 4-8: A Fun K...,5.0,1555050413806
79086,3840,56077,Moskva: 'The new Le Carre' BBC Radio 2 The Sar...,4.0,1555050392664
79085,8464,40568,Eternal Flame: A WWII-1980s Time Travel Love S...,4.0,1555050231806
...,...,...,...,...,...
12,7282,51145,Geddy Lee's Big Beautiful Book of Bass,5.0,1543537322164
3,10576,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1543536938535
2,342,50326,Love Coloring Book: An Adult Coloring Book wit...,5.0,1543536419204
1,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),5.0,1543536168823


In [19]:
# Checking
# un = df_ratings_sorted['user_id'].unique()
# uni = len(un)
# uni

In [20]:
# Convert the 'timestamp' column from Unix timestamps to datetime
df_ratings_sorted['timestamp'] = pd.to_datetime(df_ratings_sorted['timestamp'], unit='ms')
most_recent_timestamp = df_ratings_sorted['timestamp'].max()
cutoff_date = most_recent_timestamp - timedelta(days=30)
recent_ratings_df = df_ratings_sorted[df_ratings_sorted['timestamp'] > cutoff_date]

In [21]:
recent_ratings_df

Unnamed: 0,user_id,item_id,title,rating,timestamp
79089,6322,47262,The Red Sword (The Red Sword Trilogy Book 1),5.0,2019-04-12 06:31:50.933
79088,6954,53231,Pride and Prejudice,5.0,2019-04-12 06:29:47.832
79087,6841,26078,Space Activity Book for Kids Ages 4-8: A Fun K...,5.0,2019-04-12 06:26:53.806
79086,3840,56077,Moskva: 'The new Le Carre' BBC Radio 2 The Sar...,4.0,2019-04-12 06:26:32.664
79085,8464,40568,Eternal Flame: A WWII-1980s Time Travel Love S...,4.0,2019-04-12 06:23:51.806
...,...,...,...,...,...
58190,10801,42826,The Shy Little Kitten (Little Golden Books),5.0,2019-03-13 07:03:56.295
71038,5746,24688,Dutchman and The Slave: Two Plays,1.0,2019-03-13 07:03:09.407
28948,1285,28288,If She Wakes,4.0,2019-03-13 07:01:20.809
20212,4917,43094,The Nursery (The Bayou Hauntings Book 3),4.0,2019-03-13 06:43:04.751


### Non personalized ranking (Damped Mean)

In [22]:
#USING DAMPED MEAN APPROACH
df_recent_ratings = pd.merge(recent_ratings_df, expanded_categories_df, on=['item_id', 'title'])

avg_ratings = df_recent_ratings.groupby('item_id')['rating'].mean()
num_ratings = df_recent_ratings.groupby('item_id')['rating'].count()

damping_factor = 10
damped_means = (avg_ratings * num_ratings + 2.5 * damping_factor) / (num_ratings + damping_factor)

top_books = damped_means.sort_values(ascending=False).head(20)

top_books_list = []
for item_id in top_books.index:
    book_title = df_recent_ratings.loc[df_recent_ratings['item_id'] == item_id, 'title'].iloc[0]
    top_books_list.append({'item_id': item_id, 'title': book_title, 'damped_mean_rating': top_books[item_id]})

top_books_df = pd.DataFrame(top_books_list)
top_books_df

Unnamed: 0,item_id,title,damped_mean_rating
0,35178,What I've Done (Morgan Dane Book 4),4.621212
1,19942,The Wonky Donkey,4.59434
2,19873,What the Wind Knows,4.537313
3,3593,Becoming,4.463918
4,44721,Giraffes Can't Dance (Board Book),4.456522
5,16590,Say You're Sorry (Morgan Dane Book 1),4.439394
6,26944,Girls of Glass,4.439394
7,30472,Secrets Never Die (Morgan Dane Book 5),4.431034
8,6502,Where the Crawdads Sing,4.428571
9,47416,Minutes to Kill (Scarlet Falls),4.404762


### Content-based recommender based on K = 50 nearest neighbors (using any combination of features: I use only the book categories)

In [23]:
df_tags = merged_train_tags.drop('title',axis=1)
df_tags

Unnamed: 0,user_id,item_id,category,timestamp
0,3361,53296,Books,1543536132144
1,3361,53296,Literature & Fiction,1543536132144
2,3361,53296,Action & Adventure,1543536132144
3,4471,1424,Books,1543536168823
4,4471,1424,Children's Books,1543536168823
...,...,...,...,...
262164,6954,53231,Literature & Fiction,1555050587832
262165,6954,53231,Classics,1555050587832
262166,6322,47262,Books,1555050710933
262167,6322,47262,Literature & Fiction,1555050710933


In [24]:
df_ratings

Unnamed: 0,user_id,item_id,title,rating,timestamp
0,3361,53296,Crescent Dawn (Dirk Pitt Adventure),5.0,1543536132144
1,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),5.0,1543536168823
2,342,50326,Love Coloring Book: An Adult Coloring Book wit...,5.0,1543536419204
3,10576,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1543536938535
4,3658,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1546568460586
...,...,...,...,...,...
79085,8464,40568,Eternal Flame: A WWII-1980s Time Travel Love S...,4.0,1555050231806
79086,3840,56077,Moskva: 'The new Le Carre' BBC Radio 2 The Sar...,4.0,1555050392664
79087,6841,26078,Space Activity Book for Kids Ages 4-8: A Fun K...,5.0,1555050413806
79088,6954,53231,Pride and Prejudice,5.0,1555050587832


In [25]:
df_books = expanded_categories_df
df_books

Unnamed: 0,item_id,title,price,category
0,1,Parker & Knight,0.0,Books
1,1,Parker & Knight,0.0,Mystery
2,1,Parker & Knight,0.0,Thriller & Suspense
3,1,Parker & Knight,0.0,Thrillers & Suspense
4,3,Jealousy,0.0,Books
...,...,...,...,...
185788,56947,The Secret Teachings of All Ages (Reader's Edi...,19.49,Religion & Spirituality
185789,56947,The Secret Teachings of All Ages (Reader's Edi...,19.49,Literature & Fiction
185790,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Books
185791,56948,Tennis Shoe Adventure series: Tennis Shoes Amo...,8.16,Mystery


In [26]:
#transform tags such that they are lower-case, single-word tokens
df_books['category'] = df_books['category'].apply(lambda x: str(x).lower().replace(' ', '_'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_books['category'] = df_books['category'].apply(lambda x: str(x).lower().replace(' ', '_'))


### Step1: Calculate item profiles

In [27]:
# calculates the lexicon of most frequent tags.
tag_frequency_threshold = 500
tag_counts = df_books['category'].value_counts()
df_lexicon = tag_counts[tag_counts >= tag_frequency_threshold].reset_index()
df_lexicon.columns = ['category', 'count']

# Filter df_tags to include only tags in the lexicon
df_tags_filtered = df_books[df_books['category'].isin(df_lexicon['category'])]

# Drop unnecessary columns
df_tags_filtered = df_tags_filtered.drop(columns=['title', 'price'], errors='ignore')

In [28]:
# uni = df_tags_filtered['category'].unique()
# len(uni)

In [29]:
#calculate the sparse feature vector based on the TF-IDF of words in documents
#the TF-IDF vectors are saved as sparse representations into the dataframe
df_features = df_tags_filtered.groupby('item_id')['category'].agg(lambda x: ' '.join(x)).reset_index()
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split(' ')).fit(sorted(df_features['category']))
vectorizer.vocabulary_
df_features['feature_vector'] = df_features['category'].apply(lambda x : vectorizer.transform([x]))
df_features



Unnamed: 0,item_id,category,feature_vector
0,1,books mystery thriller_&_suspense thrillers_&_...,"(0, 43)\t0.6086229425420621\n (0, 42)\t0.54..."
1,3,books mystery thriller_&_suspense thrillers_&_...,"(0, 43)\t0.6086229425420621\n (0, 42)\t0.54..."
2,4,books cookbooks food_&_wine,"(0, 19)\t0.6992576830001189\n (0, 13)\t0.69..."
3,5,books history,"(0, 24)\t0.9775049542055294\n (0, 6)\t0.210..."
4,6,books arts_&_photography music,"(0, 30)\t0.773649364709597\n (0, 6)\t0.1429..."
...,...,...,...
55927,56944,books biographies_&_memoirs,"(0, 6)\t0.22188159751403277\n (0, 5)\t0.975..."
55928,56945,books children's_books history,"(0, 24)\t0.8460677158230178\n (0, 8)\t0.500..."
55929,56946,books arts_&_photography,"(0, 6)\t0.2255749752362588\n (0, 4)\t0.9742..."
55930,56947,books religion_&_spirituality literature_&_fic...,"(0, 36)\t0.9074693422944268\n (0, 29)\t0.38..."


In [30]:
# csr_matrix_feature_vec.nnz

In [31]:
# print("TF-IDF Matrix shape")
# len(vectorizer.vocabulary_)

In [32]:
#tfidf_matrix.nnz

In [33]:
# zero_elements_mask = df_features['feature_vector'].apply(lambda x: x.nnz == 0)
# num_rows_with_zero_elements = zero_elements_mask.sum()
# print("Number of rows with sparse matrices containing zero stored elements:", num_rows_with_zero_elements)


### Step2: Index item profiles into LSH

In [30]:
print(df_features['feature_vector'].iloc[0].shape)

(1, 47)


In [31]:
feature_vectors = df_features['feature_vector'].tolist()
#feature_vectors

In [32]:
# #index all item vectors into LSH
matrix_feature_vec = vstack(feature_vectors)
csr_matrix_feature_vec = csr_matrix(matrix_feature_vec)

In [33]:
# # Ensure no rows with zero elements in CSR matrix
# non_zero_mask = csr_matrix_feature_vec.getnnz(axis=1) > 0
# csr_matrix_feature_vec = csr_matrix_feature_vec[non_zero_mask]

In [34]:
NUM_DIM = len(vectorizer.vocabulary_)
hash_size = 8
num_hashtables = 3

# Create LSH index
lsh = LSH(hash_size=hash_size, input_dim=NUM_DIM, num_hashtables=num_hashtables)

for i, row in df_features.iterrows():
    item_vector = csr_matrix_feature_vec[i]  # Access the item vector directly from the CSR matrix
    lsh.index(item_vector, extra_data=row['item_id'])

In [35]:
# Example query
query_vector = df_features.iloc[0]['feature_vector']
nearest_neighbors = lsh.query(query_vector, num_results=5)

In [36]:
nearest_neighbors_ids = [item[0][1] for item in nearest_neighbors]
nearest_neighbors_ids

[1, 3, 1, 56927, 56925]

In [37]:
neighbL = []
for i in nearest_neighbors_ids:
    # Access the row corresponding to the index i and convert it to a dictionary
    row_dict = df_b[df_b['item_id'] == i].iloc[0].to_dict()
    neighbL.append(row_dict)

# Convert the list into a DataFrame
neighb = pd.DataFrame(neighbL)

neighb

Unnamed: 0,item_id,title,price,categories
0,1,Parker & Knight,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
1,3,Jealousy,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
2,1,Parker & Knight,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
3,56927,"4th of July (Women's Murder Club, No 4)",8.49,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
4,56925,A Killer's Alibi (Philadelphia Legal),7.89,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."


### Step 3: Calculate user profile

In [38]:
df_ratings

Unnamed: 0,user_id,item_id,title,rating,timestamp
0,3361,53296,Crescent Dawn (Dirk Pitt Adventure),5.0,1543536132144
1,4471,1424,Cars (Disney/Pixar Cars) (Little Golden Book),5.0,1543536168823
2,342,50326,Love Coloring Book: An Adult Coloring Book wit...,5.0,1543536419204
3,10576,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1543536938535
4,3658,1953,"Magic Tree House Boxed Set, Books 1-4: Dinosau...",5.0,1546568460586
...,...,...,...,...,...
79085,8464,40568,Eternal Flame: A WWII-1980s Time Travel Love S...,4.0,1555050231806
79086,3840,56077,Moskva: 'The new Le Carre' BBC Radio 2 The Sar...,4.0,1555050392664
79087,6841,26078,Space Activity Book for Kids Ages 4-8: A Fun K...,5.0,1555050413806
79088,6954,53231,Pride and Prejudice,5.0,1555050587832


In [None]:
# restricts the ratings to the set of most popular books (optional, not needed for content-based)
numratings_threshold = 3 #increase this number if you want to filter
df_item_popularity = df_ratings[['item_id','rating']].groupby('item_id').count().reset_index()
df_item_popularity.columns = ['item_id','count'] 
df_item_popularity = df_item_popularity.sort_values(by='count', ascending=False)
df_item_popularity = df_item_popularity[df_item_popularity['count'] >= numratings_threshold]
print(f'Number of books reduced from {len(df_ratings.item_id.unique())} to {len(df_item_popularity.item_id.unique())}')
df_ratings_1 = pd.merge(df_ratings, df_item_popularity, on='item_id', how='inner')[['user_id', 'item_id', 'rating']]
df_ratings_1 = df_ratings_1.sort_values(by='user_id')

df_ratings_1.head()

Number of books reduced from 45433 to 5480


Unnamed: 0,user_id,item_id,rating
2486,2,47886,3.0
8140,2,13194,5.0
14707,2,26600,5.0
7084,2,10107,5.0
21881,2,42004,5.0


In [40]:
df_ratings_1['rating_scaled'] = df_ratings_1.groupby('user_id')['rating'].transform(lambda x: x - x.mean())

In [41]:
# Filter out rating_scaled = 0
df_ratings_1 = df_ratings_1[df_ratings_1['rating_scaled'] != 0]

In [42]:
df_profile = pd.merge(df_ratings_1, df_features[['item_id', 'feature_vector']], on='item_id')

# Scale feature vector by rating
df_profile['feature_vector_scaled'] = df_profile.apply(lambda row: row['rating_scaled'] * row['feature_vector'], axis=1)

In [43]:
uni = df_profile['user_id'].unique()
len(uni)

3359

In [None]:
start = time.time()
#stack all sparse vectors of user's books
df_user_vectors = df_profile[['user_id', 'feature_vector_scaled']].groupby('user_id').agg(sparse.vstack).reset_index()
#compute the average of the vectors without considering the zero entries (this will take a while)
df_user_vectors['feature_vector_scaled'] = df_user_vectors['feature_vector_scaled'].apply(lambda x: csr_matrix(np.nan_to_num(x.sum(axis=0)/x.getnnz(axis=0), 0)))
end = time.time()
print(end - start)
df_user_vectors

3.977790594100952


Unnamed: 0,user_id,feature_vector_scaled
0,2,"(0, 0)\t-0.06319033569412447\n (0, 5)\t-0.0..."
1,6,"(0, 1)\t0.08217123708168471\n (0, 3)\t0.108..."
2,9,"(0, 0)\t0.821474364023621\n (0, 6)\t-0.0081..."
3,13,"(0, 6)\t0.002069850377450097\n (0, 31)\t0.0..."
4,20,"(0, 6)\t-0.0003196934492990306\n (0, 25)\t0..."
...,...,...
3354,13758,"(0, 6)\t0.0034925896678157237\n (0, 7)\t-0...."
3355,13763,"(0, 6)\t-0.0016062668798636746\n (0, 7)\t0...."
3356,13767,"(0, 6)\t0.007411762130932002\n (0, 20)\t0.1..."
3357,13790,


In [45]:
zero_elements_mask = df_user_vectors['feature_vector_scaled'].apply(lambda x: x.nnz == 0)
num_rows_with_zero_elements = zero_elements_mask.sum()

print("Number of rows with sparse matrices containing zero stored elements:", num_rows_with_zero_elements)


Number of rows with sparse matrices containing zero stored elements: 331


In [46]:
df_user_vectors

Unnamed: 0,user_id,feature_vector_scaled
0,2,"(0, 0)\t-0.06319033569412447\n (0, 5)\t-0.0..."
1,6,"(0, 1)\t0.08217123708168471\n (0, 3)\t0.108..."
2,9,"(0, 0)\t0.821474364023621\n (0, 6)\t-0.0081..."
3,13,"(0, 6)\t0.002069850377450097\n (0, 31)\t0.0..."
4,20,"(0, 6)\t-0.0003196934492990306\n (0, 25)\t0..."
...,...,...
3354,13758,"(0, 6)\t0.0034925896678157237\n (0, 7)\t-0...."
3355,13763,"(0, 6)\t-0.0016062668798636746\n (0, 7)\t0...."
3356,13767,"(0, 6)\t0.007411762130932002\n (0, 20)\t0.1..."
3357,13790,


### Step 4: Rank potential recommendation candidates
• Select a target user to provide recommendations to
• Query LSH with the user vector get a rank of candidate items
(Use sparse vector representations if needed)

In [97]:
# Picking a target user to provide recommendations
idx = 700
target_user_id = df_user_vectors.iloc[idx]['user_id']

# Get user rating history
df_user_history = df_ratings[df_ratings['user_id'] == target_user_id]

# Select candidate recommendations for the user
df_recommendation = df_features[~df_features['item_id'].isin(df_user_history['item_id'])]
user_vector = df_user_vectors[df_user_vectors['user_id'] == target_user_id]['feature_vector_scaled'].values[0]

# Query LSH with the user vector
candidate_items = lsh.query(user_vector, num_results=20)
candidate_item_ids = [item[0][1] for item in candidate_items]

df_candidate_recommendations = df_recommendation[df_recommendation['item_id'].isin(candidate_item_ids)]
#print("Candidate Recommendations:")
#df_candidate_recommendations

In [98]:
user = pd.merge(df_user_history, df_b, on='item_id')
user

Unnamed: 0,user_id,item_id,title_x,rating,timestamp,title_y,price,categories
0,2194,2485,Paradox (An FBI Thriller Book 22),5.0,1550237368244,Paradox (An FBI Thriller Book 22),11.99,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
1,2194,32424,Murder in Park Lane (The Detective Lavender My...,5.0,1543639200826,Murder in Park Lane (The Detective Lavender My...,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
2,2194,7621,The Beantown Girls,5.0,1548955398697,The Beantown Girls,0.0,"['Books', 'Literature & Fiction', 'Genre Ficti..."
3,2194,42749,THE OXFORD MYSTERY an absolutely gripping whod...,4.0,1547917185944,THE OXFORD MYSTERY an absolutely gripping whod...,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
4,2194,53559,The Moroccan Girl: A Novel,4.0,1554947939625,The Moroccan Girl: A Novel,9.49,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
5,2194,6119,The Light Over London,5.0,1546131607007,The Light Over London,9.99,"['Books', 'Literature & Fiction', 'Genre Ficti..."
6,2194,1487,The Last Second (6) (A Brit in the FBI),5.0,1548167431945,The Last Second (6) (A Brit in the FBI),14.73,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
7,2194,37747,Night of Camp David,5.0,1553280637128,Night of Camp David,7.99,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
8,2194,7708,THE WINTER MYSTERY an absolutely gripping whod...,5.0,1554841907021,THE WINTER MYSTERY an absolutely gripping whod...,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
9,2194,15377,THE RIVERBOAT MYSTERY an absolutely gripping w...,5.0,1544588267589,THE RIVERBOAT MYSTERY an absolutely gripping w...,0.0,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."


In [99]:
df_candidates = pd.merge(df_candidate_recommendations, df_b, on='item_id')
df_candidates = df_candidates[['item_id', 'title', 'category']]
df_candidates

Unnamed: 0,item_id,title,category
0,28100,Age of Swords: Book Two of The Legends of the ...,books literature_&_fiction action_&_adventure
1,36195,The Protocol: A James Acton Thriller (James Ac...,books literature_&_fiction action_&_adventure
2,36239,Mission Critical (Gray Man),books literature_&_fiction action_&_adventure
3,36249,The Last Wish: Introducing The Witcher,books literature_&_fiction action_&_adventure
4,36252,The Girl in the Tower: A Novel (Winternight Tr...,books literature_&_fiction action_&_adventure
5,36294,Agent in Place (Gray Man),books literature_&_fiction action_&_adventure
6,36343,The Man Who Would Be King,books literature_&_fiction action_&_adventure
7,36601,"Ice Forged (The Ascendant Kingdoms Saga, 1)",books literature_&_fiction action_&_adventure
8,36615,Scourged (The Iron Druid Chronicles),books literature_&_fiction action_&_adventure
9,36696,Call of the Six (The Preston Six),books literature_&_fiction action_&_adventure


### Step 5: Predict ratings of candidate items
• Create another LSH index for users
• Query the user LSH index with the vectors of each recommended book to estimate the predicted rating based on k-nearest neighbor

In [79]:
# Create an LSH index for user profile vectors
# input_dim = len(vectorizer.vocabulary_)
# hash_size_user = 10  
# num_hashtables_user = 3  

# user_lsh = LSH(hash_size=hash_size_user, input_dim=input_dim, num_hashtables=num_hashtables_user)

# for i, row in df_user_vectors.iterrows():
#     user_vector = row['feature_vector_scaled']
#     user_lsh.index(user_vector, extra_data=row['user_id'])

In [80]:
# Back-up when the recommended books have no ratings ^.^'
df_user_avg_rating = df_ratings.groupby('user_id')['rating'].mean().reset_index()
df_user_avg_rating.columns = ['user_id', 'avg_rating']

In [100]:
predicted_ratings = []

for idx, row in df_candidate_recommendations.iterrows():
    book_id = row['item_id']
    book_vector = row['feature_vector']

    similar_users = user_lsh.query(book_vector, num_results=50)  
    similar_user_info = [(item[0][1], item[1]) for item in similar_users]  # (similarity_score, user_id)
    similar_user_ids = [info[0] for info in similar_user_info]

    similar_user_ratings = df_ratings[(df_ratings['user_id'].isin(similar_user_ids)) & (df_ratings['item_id'] == book_id)]
    
    if not similar_user_ratings.empty:
        # Compute the weighted average rating
        rating_sum = 0
        weight_sum = 0
        for user_id, similarity_score in similar_user_info:
            user_rating_row = similar_user_ratings[similar_user_ratings['user_id'] == user_id]
            if not user_rating_row.empty:
                user_rating = user_rating_row['rating'].values[0]
                rating_sum += similarity_score * user_rating
                weight_sum += similarity_score
        
        predicted_rating = rating_sum / weight_sum if weight_sum != 0 else 0
    else:
        target_user_avg_rating = df_user_avg_rating[df_user_avg_rating['user_id'] == target_user_id]['avg_rating'].values[0]
        predicted_rating = target_user_avg_rating

    predicted_ratings.append((book_id, predicted_rating))

In [101]:
df_predicted_ratings = pd.DataFrame(predicted_ratings, columns=['item_id', 'predicted_rating'])
df_final_recommendations = pd.merge(df_candidates, df_predicted_ratings, on='item_id')
df_final_recommendations = df_final_recommendations.sort_values(by='predicted_rating', ascending=False)
df_final_recommendations

Unnamed: 0,item_id,title,category,predicted_rating
16,37085,The Night Window: A Jane Hawk Novel,books literature_&_fiction action_&_adventure,5.0
3,36249,The Last Wish: Introducing The Witcher,books literature_&_fiction action_&_adventure,5.0
0,28100,Age of Swords: Book Two of The Legends of the ...,books literature_&_fiction action_&_adventure,4.85
11,36934,"Endgame: Arisen series, Book 14",books literature_&_fiction action_&_adventure,4.85
18,37785,Fool's Assassin: Book I of the Fitz and the Fo...,books literature_&_fiction action_&_adventure,4.85
17,37112,Slaughter of Eagles,books literature_&_fiction action_&_adventure,4.85
15,37047,Tarnsman of Gor,books literature_&_fiction action_&_adventure,4.85
14,37045,Spellbook Cards: Ranger (Dungeons & Dragons),books literature_&_fiction action_&_adventure,4.85
13,36976,"The Cornwalls Are Gone (Amy Cornwall, 1)",books literature_&_fiction action_&_adventure,4.85
12,36953,Hearts and Minds,books literature_&_fiction action_&_adventure,4.85


### User-User collaborative Filtering

In [102]:
train_df = pd.merge(ratings_train_df, df_b[['item_id','title','categories']],
              on='item_id').drop('timestamp', axis=1)
train_df
test_df = pd.merge(ratings_train_df, df_b[['item_id','title','categories']],
              on='item_id').drop('timestamp', axis=1)

In [103]:
reader = Reader(rating_scale=(1, 5))

# Load the data from the DataFrame
train = Dataset.load_from_df(train_df[['user_id', 'item_id', 'rating']], reader)
test = Dataset.load_from_df(test_df[['user_id', 'item_id', 'rating']], reader)

In [104]:
# Build the trainset
trainset = train.build_full_trainset()
testset = test.build_full_trainset().build_testset()

In [105]:
# Use k-NN algorithm with user-based collaborative filtering and cosine similarity
kk = 50
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 2 }
algo = KNNBasic(k = kk, sim_options = sim_options, verbose = True)

In [106]:
algo.fit(trainset)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2734c53fe10>

In [107]:
def get_top_n_neighbors(user_id, n):
    inner_id = algo.trainset.to_inner_uid(user_id)
    user_inner_ids = algo.trainset.all_users()
    
    similarities = []
    for other_inner_id in user_inner_ids:
        if inner_id != other_inner_id:
            similarity = algo.sim[inner_id, other_inner_id]
            similarities.append((algo.trainset.to_raw_uid(other_inner_id), similarity))
    
    similarities.sort(key=lambda x: x[1], reverse=True)
    return similarities[1:n+1]

In [108]:
def get_recommendations(user_id, top_n_neighbors, num_recommendations):
    user_inner_id = algo.trainset.to_inner_uid(user_id)
    user_rated_items = set(item_id for item_id, _ in algo.trainset.ur[user_inner_id])

    #print(user_rated_items)
    all_items = algo.trainset.all_items()
    recommendations = []
    
    for item_inner_id in all_items:
        item_raw_id = algo.trainset.to_raw_iid(item_inner_id)
        if item_inner_id not in user_rated_items:
            estimated_rating = algo.predict(user_id, item_raw_id).est
            recommendations.append((item_raw_id, estimated_rating))
    
    recommendations.sort(key=lambda x: x[1], reverse=True)
    
    return [(item, rating) for item, rating in recommendations[:num_recommendations]]

In [109]:
# Get top N neighbors for a specific user
user_id = 1300
top_n_neighbors = get_top_n_neighbors(user_id, n=20)
num_recommendations = 20

In [110]:
uni = train_df[train_df['user_id']==1300]
uni

Unnamed: 0,user_id,item_id,rating,title,categories
11778,1300,3196,5.0,This book just ate my dog!,"['Books', ""Children's Books"", 'Animals']"
16657,1300,1062,5.0,"Good Morning, Snowplow!","['Books', ""Children's Books"", 'Science, Nature..."
23065,1300,28076,5.0,FIREBOAT: The Heroic Adventures of the John J....,"['Books', ""Children's Books"", 'History']"
41512,1300,36812,5.0,Round Trip,"['Books', ""Children's Books"", 'Arts, Music & P..."
42748,1300,49563,5.0,Mary Who Wrote Frankenstein (Who Wrote Classics),"['Books', ""Children's Books"", 'Growing Up & Fa..."
44868,1300,36885,5.0,The Wall in the Middle of the Book,"['Books', ""Children's Books"", 'Growing Up & Fa..."


In [111]:
recommendations = get_recommendations(user_id, top_n_neighbors, 20)

In [112]:
rec = pd.DataFrame(recommendations, columns=['item_id', 'estimated_rating'])
rec_uu = pd.merge(rec, df_b[['item_id','title','categories']], on='item_id')
# Print the DataFrame
rec_uu

Unnamed: 0,item_id,estimated_rating,title,categories
0,19942,5,The Wonky Donkey,"['Books', ""Children's Books"", 'Animals']"
1,51566,5,Parts (Picture Puffin Books),"['Books', ""Children's Books"", 'Growing Up & Fa..."
2,51620,5,The Night Gardener,"['Books', ""Children's Books"", 'Arts, Music & P..."
3,14290,5,The Silent Patient,"['Books', 'Mystery, Thriller & Suspense', 'Thr..."
4,45499,5,We Are in a Book!-An Elephant and Piggie Book,"['Books', ""Children's Books"", 'Growing Up & Fa..."
5,40290,5,What Do You Do With a Chance? — New York Times...,"['Books', ""Children's Books"", 'Growing Up & Fa..."
6,3130,5,Gone with the Wind,"['Books', 'Literature & Fiction', 'Genre Ficti..."
7,49764,5,"Tiny, Perfect Things","['Books', ""Children's Books"", 'Growing Up & Fa..."
8,24596,5,"Good Night, Gorilla","['Books', ""Children's Books"", 'Growing Up & Fa..."
9,30936,5,Ed Emberley's Complete Funprint Drawing Book,"['Books', ""Children's Books"", 'Arts, Music & P..."


### SVD

In [114]:
svd = SVD(n_epochs=10)
results = cross_validate(svd, train, measures=['RMSE', 'MAE'], cv=10, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 10 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Fold 6  Fold 7  Fold 8  Fold 9  Fold 10 Mean    Std     
RMSE (testset)    0.7961  0.7803  0.7993  0.8257  0.7963  0.7942  0.7928  0.8223  0.7975  0.8000  0.8005  0.0129  
MAE (testset)     0.5931  0.5831  0.5945  0.6019  0.5892  0.5905  0.5888  0.6013  0.5882  0.5892  0.5920  0.0056  
Fit time          1.41    1.40    1.39    1.39    1.40    1.40    1.38    1.39    1.42    1.38    1.40    0.01    
Test time         0.10    0.11    0.10    0.10    0.10    0.11    0.11    0.11    0.10    0.11    0.10    0.00    


In [115]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2734b977f50>

In [116]:
# Map original IDs to internal IDs
raw_id_to_inner_id = {raw_id: inner_id for raw_id, inner_id in trainset._raw2inner_id_items.items()}

In [117]:
def generate_recommendation(model, user_id, ratings_df, books_df, n_items):
    book_ids = ratings_df["item_id"].unique()
    book_ids_user = ratings_df.loc[ratings_df["user_id"] == user_id, "item_id"]
    book_ids_to_pred = np.setdiff1d(book_ids, book_ids_user)
    
    # Map book_ids_to_pred to internal IDs
    book_ids_to_pred_internal = [raw_id_to_inner_id[book_id] for book_id in book_ids_to_pred if book_id in raw_id_to_inner_id]
    
    test_set = [(user_id, book_id, 4) for book_id in book_ids_to_pred_internal]
    predictions = model.test(test_set)

    pred_ratings = np.array([pred.est for pred in predictions])

    
    index_max = (-pred_ratings).argsort()[:n_items]
    recommendations = []
    for i in index_max:
        if i >= len(book_ids_to_pred_internal):
            print(f"Index {i} is out of bounds for book_ids_to_pred_internal of size {len(book_ids_to_pred_internal)}")
            continue
        book_id_internal = book_ids_to_pred_internal[i]
        book_id = next(key for key, value in raw_id_to_inner_id.items() if value == book_id_internal)
        book_title = books_df[books_df["item_id"] == book_id]["title"].values
        if len(book_title) > 0:
            recommendations.append({
                'item_id': book_id,
                'title': book_title[0],
                'predicted_rating': pred_ratings[i]
            })
        else:
            print(f"Book ID {book_id} not found in books_df")

    recommendations_df = pd.DataFrame(recommendations)
    return recommendations_df

In [118]:
# Finding 20 recommendation for userID = 2
userID = 2
n_items = 20
rec_svd = generate_recommendation(svd,userID,test_df,df_b,n_items)
rec_svd = pd.merge(rec_svd, df_b[['item_id','title', 'categories']], on=['item_id', 'title'])

In [119]:
rec_svd

Unnamed: 0,item_id,title,predicted_rating,categories
0,42882,Fodor's New Orleans 2016 (Full-color Travel Gu...,5.0,"['Books', 'Travel', 'United States']"
1,15940,Real Food for Dogs: 50 Vet-Approved Recipes fo...,5.0,"['Books', 'Crafts, Hobbies & Home', 'Pets & An..."
2,30476,Large Print Word-Finds Puzzle Book-Word Search...,5.0,"['Books', 'Humor & Entertainment', 'Puzzles & ..."
3,42636,Yellow Star,5.0,"['Books', 'Teen & Young Adult', 'Literature & ..."
4,35503,The Husband Hour,4.993249,"['Books', 'Literature & Fiction', 'Genre Ficti..."
5,34592,"How to Lose an Alien in 10 Days (Alienn, Arkan...",4.984946,"['Books', 'Romance', 'Paranormal']"
6,27420,The Unlikely Pilgrimage of Harold Fry: A Novel,4.975693,"['Books', 'Literature & Fiction', 'Genre Ficti..."
7,7406,The Richest Man in Babylon,4.97166,"['Books', 'Business & Money', 'Business Culture']"
8,31470,Moreta: Dragonlady of Pern,4.960599,"['Books', 'Literature & Fiction', 'Action & Ad..."
9,12905,The Black Hole War: My Battle with Stephen Haw...,4.95567,"['Books', 'Biographies & Memoirs', 'Profession..."
