In [1]:
import random
import pandas as pd
import numpy as np

# https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
from scipy.spatial.distance import hamming
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_feather('./data.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               135079 non-null  float16
dtypes: float16(2), int32(1), object(4)
memory usage: 15.8+ MB


In [3]:
def limit_data_used(df, percentage):
    for col in ['User-ID', 'ISBN']:
        rating_counts = df[col].value_counts()
        cutoff_point = int(rating_counts.shape[0]*(percentage/100.0))
        most_ratings = rating_counts[:cutoff_point]
        return df[df.apply(lambda rating: rating[col] in most_ratings, axis=1)]

In [4]:
def hamming_distance(user1, user2, user_item_matrix):
        try:
            user1_ratings = user_item_matrix.transpose()[user1]
            user2_ratings = user_item_matrix.transpose()[user2]
            distance = hamming(user1_ratings, user2_ratings)
        except: 
            distance = np.NaN
        return distance

In [5]:
def get_user_item_recommendations(df, userID, recommendation_amount):
    df = df.append(user_data, ignore_index=True).reset_index(drop=True)
    
    user_item_df = df.drop(columns=['Book-Title', 'Book-Author', 'Book-Year-Of-Publication', 'Book-Publisher']).dropna().reset_index(drop=True)
    user_item_df['Book-Rating'] = user_item_df['Book-Rating'].astype(np.int16)
    
    user_item_matrix = pd.pivot_table(user_item_df, values='Book-Rating', index='User-ID', columns='ISBN')
    df = df[df['User-ID'] != userID]
    
    df["Distance"] = df["User-ID"].apply(lambda user2: hamming_distance(userID, user2, user_item_matrix))
    
    neighbours_amount = recommendation_amount*2
    k_nearest_users = df[df['User-ID'] != userID].sort_values(["Distance"], ascending=True)["User-ID"].drop_duplicates()[:neighbours_amount]
    nn_ratings = user_item_matrix[user_item_matrix.index.isin(k_nearest_users)]
    
    books_read = user_item_matrix.transpose()[userID].dropna().index
    avg_rating = nn_ratings.apply(np.nanmean).dropna()
    avg_rating = avg_rating[~avg_rating.index.isin(books_read)]
    
    recommended_books = avg_rating.sort_values(ascending=False).index[:recommendation_amount]
    return df[df['ISBN'].apply(lambda isbn: isbn in recommended_books)].drop(columns=['User-ID', 'Book-Rating']).sort_values(["Distance"], ascending=True).drop_duplicates()[:recommendation_amount]

In [6]:
user = random.sample(df['User-ID'].drop_duplicates().to_list(), 1)[0]
user_data = df[df['User-ID'] == user]
print(f'User ID : {user}')
user_data.head(5)

User ID : 168064


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating
148791,971880107,Wild Animus,Rich Shapero,2004.0,Too Far,168064,
148792,553582909,Icebound,Dean R. Koontz,2000.0,Bantam Books,168064,8.0
148793,140067477,The Tao of Pooh,Benjamin Hoff,1983.0,Penguin Books,168064,7.0
148794,671867156,Pretend You Don't See Her,Mary Higgins Clark,1998.0,Pocket,168064,8.0
148795,786868716,The Five People You Meet in Heaven,Mitch Albom,2003.0,Hyperion,168064,7.0


In [7]:
get_user_item_recommendations(limit_data_used(df, 20), user, 5)

  results[i] = self.f(v)


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,Distance
2161,0515134465,Bloody Bones (Anita Blake Vampire Hunter (Pape...,Laurell K. Hamilton,1996.0,Jove Books,0.999566
2312,067102177X,"The Clayborne Brides : One Pink Rose, One Whit...",Julie Garwood,1998.0,Pocket,0.999566
4875,0684856069,The Many Lives &amp; Secret Sorrows of Josephi...,Sandra Gulland,1999.0,Touchstone,0.999566
4912,0140268863,The Odyssey,Robert Fagles,1999.0,Penguin USA (Paper),0.999566
5089,0671023888,All She Wanted,Aphrodite Jones,1996.0,Pocket,0.999566


In [8]:
def get_item_item_recommendations(df, bookISBN, recommendation_amount):
    user_item_df = df.drop(columns=['Book-Title', 'Book-Author', 'Book-Year-Of-Publication', 'Book-Publisher']).dropna().reset_index(drop=True)
    user_item_df['Book-Rating'] = user_item_df['Book-Rating'].astype(np.int16)
    user_item_matrix = pd.pivot_table(user_item_df, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)
    
    X = user_item_matrix.T
    SVD = TruncatedSVD(n_components=8, random_state=11400464)
    resultant_matrix = SVD.fit_transform(X)
    
    corr_mat = np.corrcoef(resultant_matrix)
    
    col_idx = user_item_matrix.columns.get_loc(bookISBN)
    corr_specific = corr_mat[col_idx]
    
    recommendations = pd.DataFrame({'corr_specific':corr_specific,'ISBN': user_item_matrix.columns}).sort_values('corr_specific', ascending=False)
    return pd.merge(df[df.ISBN != bookISBN].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates(), recommendations).sort_values('corr_specific', ascending=False).drop(columns=['corr_specific']).head(recommendation_amount)

In [9]:
bookISBN = random.sample(df['ISBN'].drop_duplicates().to_list(), 1)[0]
print(f'Book ISBN : {bookISBN}')
df[df.ISBN == bookISBN].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

Book ISBN : 0515124524


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
32654,515124524,Hot Chocolate,Forster,1999.0,Jove Books


In [10]:
get_item_item_recommendations(df, bookISBN, 5)

Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
15879,0758205511,Bad Boys to Go,Lori Foster,2003.0,BRAVA
12335,0373273134,Night Watch,Suzanne Brockmann,2003.0,Silhouette
8343,0373765193,Scenes of Passion,Suzanne Brockmann,2003.0,Silhouette
8268,0373834691,Caught In The Act,Lori Foster,2001.0,Harlequin
9667,080411952X,The Unsung Hero,Suzanne Brockmann,2000.0,Ivy Books
