In [1]:
import random
import pandas as pd
import numpy as np

# https://docs.scipy.org/doc/scipy/reference/spatial.distance.html
from scipy.spatial.distance import hamming
from sklearn.decomposition import TruncatedSVD

In [2]:
df = pd.read_feather('./data.feather')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414719 entries, 0 to 414718
Data columns (total 7 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   ISBN                      414719 non-null  object 
 1   Book-Title                404698 non-null  object 
 2   Book-Author               404698 non-null  object 
 3   Book-Year-Of-Publication  404698 non-null  float16
 4   Book-Publisher            404698 non-null  object 
 5   User-ID                   414719 non-null  int32  
 6   Book-Rating               414719 non-null  int16  
dtypes: float16(1), int16(1), int32(1), object(4)
memory usage: 15.8+ MB


In [3]:
def limit_data_used(df, percentage):
    for col in ['User-ID', 'ISBN']:
        rating_counts = df[col].value_counts()
        cutoff_point = int(rating_counts.shape[0]*(percentage/100.0))
        most_ratings = rating_counts[:cutoff_point]
        return df[df.apply(lambda rating: rating[col] in most_ratings, axis=1)]

In [4]:
def hamming_distance(user1, user2, user_item_matrix):
        try:
            user1_ratings = user_item_matrix.transpose()[user1]
            user2_ratings = user_item_matrix.transpose()[user2]
            distance = hamming(user1_ratings, user2_ratings)
        except: 
            distance = np.NaN
        return distance

In [5]:
def get_user_item_recommendations(df, userID, recommendation_amount):
    df = df.append(user_data, ignore_index=True).reset_index(drop=True)
    user_item_matrix = pd.pivot_table(df, values='Book-Rating', index='User-ID', columns='ISBN')
    df = df[df['User-ID'] != userID]
    print(df.shape)
    
    df["Distance"] = df["User-ID"].apply(lambda user2: hamming_distance(userID, user2, user_item_matrix))
    
    neighbours_amount = recommendation_amount*2
    k_nearest_users = df[df['User-ID'] != userID].sort_values(["Distance"], ascending=True)["User-ID"].drop_duplicates()[:neighbours_amount]
    nn_ratings = user_item_matrix[user_item_matrix.index.isin(k_nearest_users)]
    
    books_read = user_item_matrix.transpose()[userID].dropna().index
    avg_rating = nn_ratings.apply(np.nanmean).dropna()
    avg_rating = avg_rating[~avg_rating.index.isin(books_read)]
    
    recommended_books = avg_rating.sort_values(ascending=False).index[:recommendation_amount]
    return df[df['ISBN'].apply(lambda isbn: isbn in recommended_books)].drop(columns=['User-ID', 'Book-Rating']).sort_values(["Distance"], ascending=True).drop_duplicates()[:recommendation_amount]

In [6]:
user = random.sample(df['User-ID'].drop_duplicates().to_list(), 1)[0]
user_data = df[df['User-ID'] == user]
print(f'User ID : {user}')
user_data.head(5)

User ID : 172101


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,User-ID,Book-Rating
411091,804108447,Druids,MORGAN LLYWELYN,1992.0,Del Rey,172101,0
411092,880389052,Homeland (Forgotten Realms: The Dark Elf Trilo...,R.A. Salvatore,1990.0,Wizards of the Coast,172101,8
411093,880389206,Exile (Forgotten Realms: Dark Elf Trilogy),R. A. Salvatore,1996.0,Wizards of the Coast,172101,0
411094,886773741,Tailchaser's Song,Tad Williams,1994.0,Daw Books,172101,6
411095,553573985,Gibbon's Decline and Fall,Sheri S. Tepper,1997.0,Bantam Books,172101,0


In [7]:
get_user_item_recommendations(limit_data_used(df, 20), user, 5)

(284524, 7)


  results[i] = self.f(v)


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher,Distance
1385,0440224713,Total Recall: A V.I. Warshawski Novel,Sara Paretsky,2002.0,Dell Publishing Company,0.999889
3533,044022859X,Holes (Readers Circle),LOUIS SACHAR,2001.0,Laurel Leaf,0.999889
3571,0440228654,Chinese Cinderella: The True Story of an Unwan...,Adeline Yen Mah,2001.0,Laurel-Leaf Books,0.999889
3799,0689826990,Hatchet,Gary Paulsen,1999.0,Simon Pulse,0.999889
3952,0590423541,Black Beauty,Anna Sewell,1989.0,Scholastic,0.999889


In [8]:
def get_item_item_recommendations(df, bookISBN, recommendation_amount):
    user_item_matrix = pd.pivot_table(df, values='Book-Rating', index='User-ID', columns='ISBN', fill_value=0)
    
    X = user_item_matrix.T
    SVD = TruncatedSVD(n_components=8, random_state=11400464)
    resultant_matrix = SVD.fit_transform(X)
    
    corr_mat = np.corrcoef(resultant_matrix)
    
    col_idx = user_item_matrix.columns.get_loc(bookISBN)
    corr_specific = corr_mat[col_idx]
    
    recommendations = pd.DataFrame({'corr_specific':corr_specific,'ISBN': user_item_matrix.columns}).sort_values('corr_specific', ascending=False)
    return pd.merge(df[df.ISBN != bookISBN].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates(), recommendations).sort_values('corr_specific', ascending=False).drop(columns=['corr_specific']).head(recommendation_amount)

In [9]:
bookISBN = random.sample(df['ISBN'].drop_duplicates().to_list(), 1)[0]
print(f'Book ISBN : {bookISBN}')
df[df.ISBN == bookISBN].drop(columns=['User-ID', 'Book-Rating']).drop_duplicates()

Book ISBN : 0380844001


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
27805,380844001,A Rose in Winter,Kathleen E. Woodiwiss,1983.0,Avon


In [10]:
get_item_item_recommendations(df, bookISBN, 5)

  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0,ISBN,Book-Title,Book-Author,Book-Year-Of-Publication,Book-Publisher
5495,0515093556,Belinda,Anne Rice,1994.0,Jove Books
8026,059033123X,Titanic: The Long Night,Diane Hoh,1998.0,Scholastic
14362,0345368975,Pegasus in Flight,Anne McCaffrey,1991.0,Del Rey Books
7808,0451190556,The Green Mile: The Bad Death of Eduard Delacr...,Stephen King,1996.0,Signet Book
5524,0446805580,The Executioner's Song,Norman Mailer,1980.0,Warner Books
