In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import random

In [2]:
df = pd.read_feather('./data.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               135079 non-null  float16
dtypes: float16(2), int32(1), object(4)
memory usage: 15.8+ MB


In [3]:
book = random.sample(df['ISBN'].drop_duplicates().to_list(), 1)[0]
df[df.ISBN == book].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
14790,374234574,Pleading Guilty,Scott Turow,1993.0,Farrar Straus Giroux


In [4]:
PERC_OF_DATA_TO_USE = 100.0

user_rating_counts = df['User-ID'].value_counts()
cutoff_point = int(user_rating_counts.shape[0]*(PERC_OF_DATA_TO_USE/100.0))
users_with_most_ratings = user_rating_counts[:cutoff_point]

book_rating_counts = df['ISBN'].value_counts()
cutoff_point = int(book_rating_counts.shape[0]*(PERC_OF_DATA_TO_USE/100.0))
books_with_most_ratings = book_rating_counts[:cutoff_point]

df = df[df.apply(lambda rating: rating['User-ID'] in users_with_most_ratings, axis=1)]
df = df[df.apply(lambda rating: rating['ISBN'] in books_with_most_ratings, axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               135079 non-null  float16
dtypes: float16(2), int32(1), object(4)
memory usage: 19.0+ MB


In [5]:
user_item_df = df.drop(columns=['Book-Title', 'Book-Author', 'Book-Year-Of-Publication', 'Book-Publisher']).dropna().reset_index(drop=True)
user_item_df['Book-Rating'] = user_item_df['Book-Rating'].astype(np.int16)
user_item_df.head()

Unnamed: 0,ISBN,User-ID,Book-Rating
0,786868716,11400,9
1,151008116,11400,6
2,312195516,11400,7
3,316789089,11400,7
4,743418174,11400,8


In [6]:
user_item_matrix = pd.pivot_table(user_item_df, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)
user_item_matrix

ISBN,000000000,0002005018,0002251760,0002255081,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,...,9724113361,9724119378,9726101794,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138148,B00009EF82
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
424,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278535,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278554,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.decomposition import TruncatedSVD
X = user_item_matrix.T
SVD = TruncatedSVD(n_components=8, random_state=11400464)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(17681, 8)

In [8]:
len(df['ISBN'].drop_duplicates())

18309

In [9]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

(17681, 17681)

In [10]:
col_idx = user_item_matrix.columns.get_loc(book)
corr_specific = corr_mat[col_idx]
corr_specific

array([0.95953078, 0.02848921, 0.12071432, ..., 0.0319275 , 0.60663122,
       0.42182761])

In [11]:
recommendations = pd.DataFrame({'corr_specific':corr_specific,'ISBN': user_item_matrix.columns}).sort_values('corr_specific', ascending=False).head(5)
recommendations

Unnamed: 0,corr_specific,ISBN
4299,1.0,374234574
2875,0.993173,330336630
14116,0.992105,743410513
87,0.991673,20518706
10997,0.991193,553379011


In [12]:
pd.merge(df.drop(columns=['User-ID', 'Book-Rating']).drop_duplicates(), recommendations)

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,corr_specific
0,374234574,Pleading Guilty,Scott Turow,1993.0,Farrar Straus Giroux,1.0
1,743410513,The Unburied,Charles Palliser,2000.0,Washington Square Press,0.992105
2,20518706,The Sun Also Rises (A Scribner classic),Ernest Hemingway,1982.0,Simon &amp; Schuster,0.991673
3,553379011,The Story of B,DANIEL QUINN,1997.0,Bantam,0.991193
4,330336630,The Scold's Bridle,Minette Walters,1995.0,Macmillan Pub Ltd,0.993173
