In [1]:
import random
import pandas as pd
import numpy as np

from scipy.spatial.distance import hamming

In [2]:
# Load the combined book/rating table from a Feather file located next to the notebook.
df = pd.read_feather('./data.feather')

In [3]:
# Column names, dtypes and non-null counts of the freshly loaded table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               414719 non-null  int16  
dtypes: float16(1), int16(1), int32(1), object(4)
memory usage: 15.8+ MB


In [4]:
# Choose the user to build recommendations for: one ID drawn at random
# from the distinct User-IDs in the table (no seed is set, so the pick
# changes between runs).
distinct_user_ids = df['User-ID'].drop_duplicates().to_list()
user1 = random.sample(distinct_user_ids, 1)[0]
user1

18250

In [5]:
# Snapshot of every rating row belonging to the sampled user.
is_sampled_user = df['User-ID'].eq(user1)
user_data = df.loc[is_sampled_user]
user_data

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating
390087,0446527793,The Guardian,Nicholas Sparks,2003.0,Warner Books,18250,0
390088,0345443284,While I Was Gone,Sue Miller,1999.0,Ballantine Books,18250,0
390089,0316781266,The Last Time They Met : A Novel,Anita Shreve,2002.0,Back Bay Books,18250,8
390090,0375727132,The Dive From Clausen's Pier : A Novel (Vintag...,ANN PACKER,2003.0,Vintage,18250,0
390091,0385265700,The Book of Ruth (Oprah's Book Club (Paperback)),Jane Hamilton,1990.0,Anchor,18250,0
390092,0385720106,A Map of the World,Jane Hamilton,1999.0,Anchor Books/Doubleday,18250,0
390093,068484477X,STONES FROM THE RIVER,Ursula Hegi,1997.0,Touchstone,18250,5
390094,0525945938,Scarlet Feather,Maeve Binchy,2001.0,Dutton Books,18250,6
390095,0373484232,Stanislaski Sisters,Nora Roberts,2001.0,Silhouette,18250,1
390096,0671620991,Solve Your Child's Sleep Problems,Richard Ferber,1986.0,Fireside,18250,0


In [6]:
# Percentage (0-100) of the distinct users, and separately of the distinct
# books, to keep when the table is reduced to the most frequently occurring ones.
PERC_OF_DATA_TO_USE = 33.0

In [7]:
# Number of rating rows per user; value_counts() lists the most frequent
# User-IDs first, so the leading rows are the most active users.
user_rating_counts = df['User-ID'].value_counts()
cutoff_point = int(len(user_rating_counts) * (PERC_OF_DATA_TO_USE / 100.0))
# Positional cut: keep the first `cutoff_point` entries.
users_with_most_ratings = user_rating_counts.head(cutoff_point)

In [8]:
# Number of rating rows per book; value_counts() lists the most frequent
# ISBNs first, so the leading rows are the most-rated books.
book_rating_counts = df['ISBN'].value_counts()
cutoff_point = int(len(book_rating_counts) * (PERC_OF_DATA_TO_USE / 100.0))
# Positional cut: keep the first `cutoff_point` entries.
books_with_most_ratings = book_rating_counts.head(cutoff_point)

In [9]:
# Keep only ratings made by the most active users, then only those on the
# most-rated books. The IDs to keep are the *index* of each value_counts()
# result (its values are the counts). `Series.isin` does the membership test
# in one vectorised pass instead of running a Python lambda per rating row.
df = df[df['User-ID'].isin(users_with_most_ratings.index)]
df = df[df['ISBN'].isin(books_with_most_ratings.index)]

In [10]:
# Put the sampled user's ratings (captured in `user_data`) back into the
# reduced table, then drop exact duplicate rows in case some of them were
# still present. `pd.concat` is used because `DataFrame.append` was removed
# in pandas 2.0; with `ignore_index=True` the result gets a fresh 0..n-1 index.
df = pd.concat([df, user_data], ignore_index=True).drop_duplicates()

In [11]:
# Size and null counts of the table after it has been reduced and the sampled user's rows re-attached.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 220738 entries, 0 to 220737
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      220738 non-null  object 
 1   Book-Title                218269 non-null  object 
 2   Book-Author               218269 non-null  object 
 3   Book-Year-Of-Publication  218269 non-null  float16
 4   Book-Publisher            218269 non-null  object 
 5   User-ID                   220738 non-null  int32  
 6   Book-Rating               220738 non-null  int16  
dtypes: float16(1), int16(1), int32(1), object(4)
memory usage: 10.1+ MB


In [12]:
# Strip the descriptive book columns so only the (ISBN, User-ID, Book-Rating)
# triples remain, and renumber the rows from 0.
book_metadata_columns = [
    'Book-Title',
    'Book-Author',
    'Book-Year-Of-Publication',
    'Book-Publisher',
]
user_item_df = df.drop(columns=book_metadata_columns)
user_item_df = user_item_df.reset_index(drop=True)
user_item_df.head()

Unnamed: 0,ISBN,User-ID,Book-Rating
0,786868716,11400,9
1,151008116,11400,6
2,671021001,11400,0
3,312195516,11400,7
4,446364193,11400,0


In [13]:
# User x book rating matrix: one row per User-ID, one column per ISBN.
# Cells with no rating for that (user, book) pair are NaN; pivot_table's
# default aggregation (mean) is applied if a pair occurs more than once.
user_item_matrix = user_item_df.pivot_table(
    values='Book-Rating',
    index=['User-ID'],
    columns=['ISBN'],
)
user_item_matrix

ISBN,000649840X,0006547834,0006550681,0006550789,0007110928,0007141076,0007154615,0020125305,0020198906,0020199600,...,3423202327,3426029553,3442437407,3442541751,3492045170,3492238696,3746614007,3822860867,3896672282,8873122933
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,
507,,,,,,,,,,,...,,,,,,,,,,
638,,,,,,,,,,,...,,,,,,,,,,
882,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
278137,,,,,,,,,,,...,,,,,,,,,,
278144,,,,,,,,,,,...,,,,,,,,,,
278188,,,,,,,,,,,...,,,,,,,,,,
278418,,,,,,,,,,,...,,,,,,,,,,


In [14]:
# From here on `df` holds only the *other* users' ratings; the sampled
# user's row is still available in `user_item_matrix`.
df = df.loc[df['User-ID'].ne(user1)]

In [15]:
def hamming_distance(user1, user2, user_item_matrix):
    """Return the Hamming distance between two users' rating rows.

    Parameters
    ----------
    user1, user2
        Row labels of ``user_item_matrix`` identifying the two users.
    user_item_matrix : pandas.DataFrame
        Matrix with one row per user and one column per item.

    Returns
    -------
    float
        The value of ``scipy.spatial.distance.hamming`` for the two rows
        (the fraction of columns in which they differ), or NaN when either
        user has no row in the matrix or the rows cannot be compared.

    Notes
    -----
    NOTE(review): the comparison is element-wise ``!=`` and NaN never equals
    NaN, so a column that neither user has rated counts as a difference.
    On a sparse matrix this pushes every distance close to 1.0 - confirm
    this is the intended similarity measure.
    """
    try:
        # Direct row lookup; avoids transposing the whole matrix on every call.
        user1_ratings = user_item_matrix.loc[user1]
        user2_ratings = user_item_matrix.loc[user2]
        return hamming(user1_ratings, user2_ratings)
    except (KeyError, ValueError):
        # KeyError: unknown user label. ValueError: scipy rejected the inputs
        # (e.g. the lookup did not yield two 1-D vectors of equal length).
        return np.nan

In [16]:
# Spot check: distance between the sampled user and one randomly chosen other user.
user1_ratings = user_item_matrix.loc[user1]
random_other_user = random.sample(df['User-ID'].drop_duplicates().to_list(), 1)[0]
user2_ratings = user_item_matrix.loc[random_other_user]
distance = hamming(user1_ratings, user2_ratings)
distance

1.0

In [None]:
# Distance from the sampled user to every other user. The distance depends
# only on the User-ID, so it is computed once per distinct user and then
# broadcast to all of that user's rating rows with `map`.
other_users = df["User-ID"].unique()
distance_by_user = pd.Series(
    [hamming_distance(user1, user2, user_item_matrix) for user2 in other_users],
    index=other_users,
    dtype=float,
)
# `assign` returns a new frame, so the column is never written into a
# filtered slice of an earlier frame.
df = df.assign(Distance=df["User-ID"].map(distance_by_user))
df.head()

In [None]:
# Preview the ratings ordered from the nearest user to the farthest.
df.sort_values(by="Distance")

In [None]:
# Number of books to recommend to the sampled user.
RECOMMENDATION_AMOUNT = 3

In [None]:
print(user1)
# Use twice as many neighbours as books to recommend.
neighbours_amount = 2 * RECOMMENDATION_AMOUNT
# Order the other users' rows by distance, then keep the first occurrence of
# each User-ID so the leading entries are the nearest distinct users.
others_by_distance = df.loc[df['User-ID'].ne(user1)].sort_values(by="Distance")
k_nearest_users = others_by_distance["User-ID"].drop_duplicates().head(neighbours_amount)
k_nearest_users

In [None]:
# Rating rows of the nearest neighbours, taken from the user x book matrix.
is_neighbour = user_item_matrix.index.isin(k_nearest_users)
nn_ratings = user_item_matrix.loc[is_neighbour]
nn_ratings

In [None]:
# ISBNs for which the sampled user has a (non-NaN) entry in the matrix.
sampled_user_row = user_item_matrix.loc[user1]
books_read = sampled_user_row.dropna().index
books_read

In [None]:
# Mean neighbour rating per book, ignoring missing ratings. Books that none
# of the neighbours rated come out as NaN and are dropped. `DataFrame.mean`
# skips NaN natively, so there is no per-column Python call and no
# "Mean of empty slice" RuntimeWarning for all-NaN columns.
avg_rating = nn_ratings.mean(axis=0, skipna=True).dropna()
avg_rating

In [None]:
# Remove the books the sampled user already has a rating entry for.
already_read = avg_rating.index.isin(books_read)
avg_rating = avg_rating.loc[~already_read]
avg_rating

In [None]:
# Candidate books ordered from the highest to the lowest mean neighbour rating (display only).
avg_rating.sort_values(ascending=False)

In [None]:
# ISBNs of the top RECOMMENDATION_AMOUNT candidates by mean neighbour rating.
ranked_books = avg_rating.sort_values(ascending=False)
recommended_books = ranked_books.index[:RECOMMENDATION_AMOUNT]
recommended_books

In [None]:
# Look up the details of the recommended books.
# Each book appears once per user who rated it, and those rows carry
# different `Distance` values, so de-duplicating on whole rows would let one
# book fill several of the displayed slots. Rows are ordered by distance and
# then reduced to one row per ISBN (the one from the nearest user).
(
    df[df['ISBN'].isin(recommended_books)]
    .drop(columns=['User-ID', 'Book-Rating'])
    .sort_values(["Distance"], ascending=True)
    .drop_duplicates(subset=['ISBN'])
    .head(RECOMMENDATION_AMOUNT)
)