In [1]:
import random
import pandas as pd
import numpy as np

from scipy.spatial.distance import hamming

In [2]:
# Load the merged book/rating table from a Feather file; the path is relative
# to the current working directory.
df = pd.read_feather('./data.feather')

In [3]:
# Show column dtypes, non-null counts and memory usage of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               135079 non-null  float16
dtypes: float16(2), int32(1), object(4)
memory usage: 15.8+ MB


In [4]:
# Draw a single user ID at random from the distinct IDs in the data.
# No seed is set in this cell, so the chosen user changes between runs.
(user1,) = random.sample(df['User-ID'].drop_duplicates().to_list(), 1)
user1

35859

In [5]:
# Every row that belongs to the sampled user.
user_data = df.loc[df['User-ID'].eq(user1)]
user_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating
22972,0440234743,The Testament,John Grisham,1999.0,Dell,35859,
22973,0971880107,Wild Animus,Rich Shapero,2004.0,Too Far,35859,
22974,0345417623,Timeline,MICHAEL CRICHTON,2000.0,Ballantine Books,35859,
22975,0446310786,To Kill a Mockingbird,Harper Lee,1988.0,Little Brown &amp; Company,35859,
22976,0425182908,Isle of Dogs,Patricia Cornwell,2002.0,Berkley Publishing Group,35859,4.0
...,...,...,...,...,...,...,...
25412,055326351X,,,,,35859,
25413,0451184386,,,,,35859,
25414,0330489461,,,,,35859,
25415,035332306X,,,,,35859,


In [6]:
# Percentage (0-100) of users and of ISBNs, ranked by how many rows they have,
# that the filtering cells keep. 100 keeps everything.
PERC_OF_DATA_TO_USE = 100

In [7]:
# Rows per user, most frequent first (value_counts sorts descending).
user_rating_counts = df['User-ID'].value_counts()
# Number of leading entries to keep, truncated to an integer.
cutoff_point = int(len(user_rating_counts) * (PERC_OF_DATA_TO_USE / 100.0))
# Positional slice: the first `cutoff_point` users in that ordering.
users_with_most_ratings = user_rating_counts.iloc[:cutoff_point]

In [8]:
# Rows per ISBN, most frequent first (value_counts sorts descending).
book_rating_counts = df['ISBN'].value_counts()
# Number of leading entries to keep, truncated to an integer.
cutoff_point = int(len(book_rating_counts) * (PERC_OF_DATA_TO_USE / 100.0))
# Positional slice: the first `cutoff_point` books in that ordering.
books_with_most_ratings = book_rating_counts.iloc[:cutoff_point]

In [9]:
# Keep only rows whose user and book survived the cut-off.
# `x in series` tests membership in the Series *index* (the user IDs / ISBNs
# produced by value_counts), so the vectorised equivalent is isin(<index>).
# This avoids a Python-level loop over every row of the frame.
df = df[df['User-ID'].isin(users_with_most_ratings.index)]
df = df[df['ISBN'].isin(books_with_most_ratings.index)]

In [10]:
# Make sure the sampled user's rows are present even if the filtering above
# removed them, then drop the rows that are now duplicated.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([df, user_data], ignore_index=True).drop_duplicates()

In [11]:
# Re-inspect dtypes and non-null counts after filtering and re-adding user_data.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               135079 non-null  float16
dtypes: float16(2), int32(1), object(4)
memory usage: 19.0+ MB


In [12]:
# Reduce the frame to (ISBN, User-ID, Book-Rating) triples: drop the book
# metadata, discard rows with a missing value, renumber the rows and store
# the rating as a 16-bit integer.
user_item_df = (
    df.drop(
        columns=[
            'Book-Title',
            'Book-Author',
            'Book-Year-Of-Publication',
            'Book-Publisher',
        ]
    )
    .dropna()
    .reset_index(drop=True)
    .astype({'Book-Rating': np.int16})
)
user_item_df.head()

Unnamed: 0,ISBN,User-ID,Book-Rating
0,786868716,11400,9
1,151008116,11400,6
2,312195516,11400,7
3,316789089,11400,7
4,743418174,11400,8


In [13]:
# Users as rows, ISBNs as columns, ratings as cell values; combinations with
# no rating are NaN. pivot_table averages duplicates by default.
user_item_matrix = user_item_df.pivot_table(
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
)
user_item_matrix

ISBN,000000000,0002005018,0002251760,0002255081,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,...,9724113361,9724119378,9726101794,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138148,B00009EF82
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
383,,,,,,,,,,,...,,,,,,,,,,
388,,,,,,,,,,,...,,,,,,,,,,
424,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278535,,,,,,,,,,,...,,,,,,,,,,
278554,,,,,,,,,,,...,,,,,,,,,,
278582,,,,,,,,,,,...,,,,,,,,,,
278633,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# Exclude the target user's own rows so they cannot be matched with themselves.
df = df.loc[df['User-ID'].ne(user1)]

In [15]:
def hamming_distance(user1, user2, user_item_matrix):
    """Return the Hamming distance between two users' rating rows.

    Parameters
    ----------
    user1, user2
        Row labels (user IDs) to look up in ``user_item_matrix``.
    user_item_matrix : pandas.DataFrame
        One row per user, one column per item.

    Returns
    -------
    float
        ``scipy.spatial.distance.hamming`` of the two rows, or ``np.nan`` if
        either user is missing from the matrix or the rows cannot be compared.

    Notes
    -----
    NOTE(review): NaN never compares equal to NaN, so items that neither user
    rated appear to count as mismatches. Distances on a sparse matrix are
    therefore close to 1 - confirm that this is the intended metric.
    """
    try:
        # Label-based row lookup; avoids transposing the whole matrix per call.
        user1_ratings = user_item_matrix.loc[user1]
        user2_ratings = user_item_matrix.loc[user2]
        return hamming(user1_ratings, user2_ratings)
    except (KeyError, ValueError):
        # KeyError: user has no row in the matrix.
        # ValueError: scipy rejected the vectors (e.g. mismatched shapes).
        return np.nan

In [16]:
# Sanity check: distance between the target user and one randomly drawn other user.
user1_ratings = user_item_matrix.loc[user1]
user2_ratings = user_item_matrix.loc[
    random.sample(df['User-ID'].drop_duplicates().to_list(), 1)[0]
]
distance = hamming(user1_ratings, user2_ratings)
distance

0.9999434421130027

In [17]:
# The distance depends only on the user ID, so compute it once per distinct
# user and broadcast it to that user's rows instead of recomputing it for
# every row of the frame.
distance_by_user = {
    user2: hamming_distance(user1, user2, user_item_matrix)
    for user2 in df["User-ID"].drop_duplicates().to_list()
}
# assign() returns a new frame, so no column is written into a filtered
# selection of an earlier frame.
df = df.assign(Distance=df["User-ID"].map(distance_by_user))
df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating,Distance
0,2005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,11400,,1.0
1,786868716,The Five People You Meet in Heaven,Mitch Albom,2003.0,Hyperion,11400,9.0,1.0
2,151008116,Life of Pi,Yann Martel,2002.0,Harcourt,11400,6.0,1.0
3,671021001,She's Come Undone (Oprah's Book Club),Wally Lamb,1998.0,Pocket,11400,,1.0
4,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998.0,Picador USA,11400,7.0,1.0


In [18]:
# Preview rows from nearest to farthest user; rows with a NaN distance sort last.
df.sort_values(by="Distance")

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating,Distance
5416,3462033743,,,,,11676,,0.999095
3630,0380000490,Sunshine: A Novel (An Avon/Flare Book),Norma Klein,1982.0,Harper Mass Market Paperbacks (Mm),11676,8.0,0.999095
3629,0440225272,Homecoming,Belva Plain,1998.0,Dell Publishing Company,11676,8.0,0.999095
3628,0553565370,Scandal in Fair Haven,Carolyn G. Hart,1995.0,Bantam Books,11676,8.0,0.999095
3627,0345394259,Hotel Paradise,Martha Grimes,1997.0,Ballantine Books,11676,7.0,0.999095
...,...,...,...,...,...,...,...,...
414483,0843949074,House of Pain,Sephera Giron,2001.0,Leisure Books,75091,,
414484,0425180727,The White Room,A. J. Matthews,2001.0,Berkley Publishing Group,75091,,
414485,0786015039,The Red Church,Scott Nicholson,2002.0,Zebra Books,75091,,
414486,1551667835,Blind Faith,Christiane Heggan,2001.0,Mira,75091,,


In [19]:
# Number of books to recommend; the neighbour search uses twice this many users.
RECOMMENDATION_AMOUNT = 3

In [20]:
print(user1)
# Consider twice as many neighbours as books to recommend.
neighbours_amount = 2 * RECOMMENDATION_AMOUNT
# Distinct user IDs ordered by ascending distance, truncated positionally.
k_nearest_users = (
    df.loc[df['User-ID'].ne(user1)]
    .sort_values(by="Distance")["User-ID"]
    .drop_duplicates()
    .iloc[:neighbours_amount]
)
k_nearest_users

35859


5416       11676
217062    104636
306373    144241
129977     95359
84599     153662
6325      177458
Name: User-ID, dtype: int32

In [21]:
# Rating rows of the selected neighbours only.
is_neighbour = user_item_matrix.index.isin(k_nearest_users)
nn_ratings = user_item_matrix.loc[is_neighbour]
nn_ratings

ISBN,000000000,0002005018,0002251760,0002255081,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,...,9724113361,9724119378,9726101794,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138148,B00009EF82
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
11676,,8.0,,,,,8.0,6.0,,,...,10.0,,,,7.0,,,1.0,,
95359,,,,,,,,,,,...,,,,,,,,,,
104636,,,,,,,,,,,...,,,,,,,,,,
144241,,,,,,,,,,,...,,,,,,,,,,
153662,,,,,,,,,,,...,,,,,,,,,,
177458,,,,,,,,,,,...,,,,,,,,,,


In [22]:
# ISBNs for which the target user has a (non-NaN) rating in the matrix.
books_read = user_item_matrix.loc[user1].dropna().index
books_read

Index(['0060198524', '0060502258', '0060932813', '0060934417', '0060952768',
       '0061006629', '0061009059', '0061012351', '006103102X', '0061059323',
       ...
       '1557731136', '1558531025', '1558532161', '1558744215', '155874424X',
       '1558745017', '1558745157', '1565122798', '1572970669', '1575663708'],
      dtype='object', name='ISBN', length=312)

In [23]:
# Mean neighbour rating per book. DataFrame.mean skips NaN by default, so
# books no neighbour rated come out as NaN (dropped right after) without the
# "Mean of empty slice" RuntimeWarning that np.nanmean raises per empty column.
avg_rating = nn_ratings.mean(axis=0).dropna()
avg_rating

  results[i] = self.f(v)


ISBN
0002005018        8.0
0002558122        8.0
0006172768        6.0
000648302X        8.0
000649840X        8.0
                 ... 
9023412389        5.0
9722319345        5.0
9724113361       10.0
9726116902        7.0
9770390107900     1.0
Length: 4508, dtype: float64

In [24]:
# Remove books the target user has already rated; ISBNs that are not among
# the candidates are simply ignored.
avg_rating = avg_rating.drop(index=books_read, errors='ignore')
avg_rating

ISBN
0002005018        8.0
0002558122        8.0
0006172768        6.0
000648302X        8.0
000649840X        8.0
                 ... 
9023412389        5.0
9722319345        5.0
9724113361       10.0
9726116902        7.0
9770390107900     1.0
Length: 4352, dtype: float64

In [25]:
# Preview the candidates from highest to lowest mean neighbour rating.
# The sorted result is only displayed, not stored.
avg_rating.sort_values(ascending=False)

ISBN
0449907481       10.0
0345416260       10.0
034540288X       10.0
0689710879       10.0
0689710682       10.0
                 ... 
1551669021        1.0
3100488148        1.0
2266104535        1.0
2253003107        1.0
9770390107900     1.0
Length: 4352, dtype: float64

In [26]:
# ISBNs of the top-rated candidates, best first.
recommended_books = (
    avg_rating.sort_values(ascending=False)
    .head(RECOMMENDATION_AMOUNT)
    .index
)
recommended_books

Index(['0449907481', '0345416260', '034540288X'], dtype='object', name='ISBN')

In [27]:
# Look up the metadata of the recommended books.
# Deduplicate on ISBN: rows for the same book differ in Distance, so a plain
# drop_duplicates() would let one book occupy several of the top slots.
# After sorting, the kept row for each ISBN is the one with the smallest Distance.
(
    df[df['ISBN'].isin(recommended_books)]
    .drop(columns=['User-ID', 'Book-Rating'])
    .sort_values(by="Distance")
    .drop_duplicates(subset=['ISBN'])
    .head(RECOMMENDATION_AMOUNT)
)

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,Distance
220,0449907481,A Thousand Acres (Ballantine Reader's Circle),JANE SMILEY,1992.0,Ballantine Books,0.999095
549,0345416260,Pope Joan (Ballantine Reader's Circle),Donna Woolfolk Cross,1996.0,Ballantine Books,0.999095
591,034540288X,The Lost World,Michael Crichton,1996.0,Ballantine Books,0.999095
