In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
import matplotlib.pyplot as plt
import random

In [2]:
df = pd.read_feather('./data.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               414719 non-null  int16  
dtypes: float16(1), int16(1), int32(1), object(4)
memory usage: 15.8+ MB


In [3]:
book = random.sample(df['ISBN'].drop_duplicates().to_list(), 1)[0]
df[df.ISBN == book].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
15727,312960743,Revenge Of The Barbeque Queens : At The Barbeq...,Lou Jane Temple,1997.0,St. Martin's Paperbacks


In [4]:
PERC_OF_DATA_TO_USE = 80.0

user_rating_counts = df['User-ID'].value_counts()
cutoff_point = int(user_rating_counts.shape[0]*(PERC_OF_DATA_TO_USE/100.0))
users_with_most_ratings = user_rating_counts[:cutoff_point]

book_rating_counts = df['ISBN'].value_counts()
cutoff_point = int(book_rating_counts.shape[0]*(PERC_OF_DATA_TO_USE/100.0))
books_with_most_ratings = book_rating_counts[:cutoff_point]

df = df[df.apply(lambda rating: rating['User-ID'] in users_with_most_ratings, axis=1)]
df = df[df.apply(lambda rating: rating['ISBN'] in books_with_most_ratings, axis=1)]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 373612 entries, 1 to 414708
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      373612 non-null  object 
 1   Book-Title                366231 non-null  object 
 2   Book-Author               366231 non-null  object 
 3   Book-Year-Of-Publication  366231 non-null  float16
 4   Book-Publisher            366231 non-null  object 
 5   User-ID                   373612 non-null  int32  
 6   Book-Rating               373612 non-null  int16  
dtypes: float16(1), int16(1), int32(1), object(4)
memory usage: 17.1+ MB


In [5]:
user_item_df = df.drop(columns=['Book-Title', 'Book-Author', 'Book-Year-Of-Publication', 'Book-Publisher']).reset_index(drop=True)
user_item_df.head()

Unnamed: 0,ISBN,User-ID,Book-Rating
0,786868716,11400,9
1,151008116,11400,6
2,671021001,11400,0
3,312195516,11400,7
4,446364193,11400,0


In [6]:
user_item_matrix = pd.pivot_table(df, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)
user_item_matrix

ISBN,0002251760,0006475973,0006485200,0006493580,0006496423,000649840X,0006512135,0006514006,0006514855,0006530400,...,857237528,8817125539,8873122933,8878188824,8881930625,9129622425,950491036X,9727722458,9871138148,B00009EF82
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
254,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
388,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
446,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278418,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278535,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278582,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
278633,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.decomposition import TruncatedSVD
X = user_item_matrix.T
SVD = TruncatedSVD(n_components=8, random_state=11400464)
resultant_matrix = SVD.fit_transform(X)
resultant_matrix.shape

(14647, 8)

In [8]:
len(df['ISBN'].drop_duplicates())

14647

In [9]:
corr_mat = np.corrcoef(resultant_matrix)
corr_mat.shape

  c /= stddev[:, None]
  c /= stddev[None, :]


(14647, 14647)

In [10]:
col_idx = user_item_matrix.columns.get_loc(book)
corr_specific = corr_mat[col_idx]
corr_specific

array([-0.10156376,  0.35288014,  0.07859002, ..., -0.20199669,
       -0.01685635,  0.38483213])

In [11]:
recommendations = pd.DataFrame({'corr_specific':corr_specific,'ISBN': user_item_matrix.columns}).sort_values('corr_specific', ascending=False).head(5)
recommendations

Unnamed: 0,corr_specific,ISBN
1857,1.0,312960743
11941,0.881514,743211375
3296,0.854457,373512023
8709,0.825189,525945415
7860,0.811801,451206878


In [12]:
pd.merge(df.drop(columns=['User-ID', 'Book-Rating']).drop_duplicates(), recommendations)

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,corr_specific
0,743211375,From a Buick 8 : A Novel,Stephen King,2002.0,Scribner,0.881514
1,373512023,Whispers In The Woods (Reader's Choice),Helen R Myers,2002.0,Silhouette,0.854457
2,451206878,The Return,Bentley Little,2002.0,Signet Book,0.811801
3,525945415,A Slow Burning,Stanley Pottinger,2000.0,Penguin USA,0.825189
4,312960743,Revenge Of The Barbeque Queens : At The Barbeq...,Lou Jane Temple,1997.0,St. Martin's Paperbacks,1.0
