In [1]:
import ast
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score

In [2]:
# recommended for MiniBatchKMeans
# Sets the OMP_NUM_THREADS environment variable to "2" for this process.
# NOTE(review): numpy and scikit-learn are already imported in the first cell;
# thread-count variables are typically read when those libraries load, so this
# assignment may need to move above the imports to take effect — confirm.
import os
os.environ["OMP_NUM_THREADS"] = "2"

In [3]:
# load data
# Reads 'final_df.csv' (a path relative to the current working directory) into `data`.
data = pd.read_csv('final_df.csv')

In [4]:
# view data attributes & first 5 rows
# data.shape[0] is the row count; data.columns lists every column name.
print(f"num entries: {data.shape[0]}")
print(f"features: {data.columns}")
data.head()  # last expression in the cell, so the notebook renders the first 5 rows

num entries: 10770
features: Index(['title_x', 'author(s)_x', 'average_rating_x', 'isbn13',
       'language_code_x', 'num_pages_x', 'ratings_count_x',
       'text_reviews_count_x', 'publication_date_x', 'publisher_x', 'title_y',
       'author(s)_y', 'average_rating_y', 'language_code_y', 'num_pages_y',
       'ratings_count_y', 'text_reviews_count_y', 'publication_date_y',
       'publisher_y', 'genres'],
      dtype='object')


Unnamed: 0,title_x,author(s)_x,average_rating_x,isbn13,language_code_x,num_pages_x,ratings_count_x,text_reviews_count_x,publication_date_x,publisher_x,title_y,author(s)_y,average_rating_y,language_code_y,num_pages_y,ratings_count_y,text_reviews_count_y,publication_date_y,publisher_y,genres
0,harry potter and the half-blood prince (harry ...,j.k. rowling/mary grandpré,4.57,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,eng,652.0,2095690.0,27591.0,9/16/2006,Scholastic Inc.,"['fiction', 'magic', 'witches', 'school storie..."
1,harry potter and the order of the phoenix (har...,j.k. rowling/mary grandpré,4.49,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,eng,870.0,2153167.0,29221.0,9/1/2004,Scholastic Inc.,"['magic', 'fiction', 'fantasy', 'coming of age..."
2,harry potter and the chamber of secrets (harry...,j.k. rowling,4.42,9780439554893,eng,352,6333,244,11/1/2003,Scholastic,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,eng,352.0,6333.0,244.0,11/1/2003,Scholastic,"['school stories', 'fiction', 'fantasy', 'magi..."
3,harry potter and the prisoner of azkaban (harr...,j.k. rowling/mary grandpré,4.56,9780439655484,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,eng,435.0,2339585.0,36325.0,5/1/2004,Scholastic Inc.,"['fantasy', 'literature', 'adventure', 'magic'..."
4,harry potter boxed set books 1-5 (harry potte...,j.k. rowling/mary grandpré,4.78,9780439682589,eng,2690,41428,164,9/13/2004,Scholastic,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,eng,2690.0,41428.0,164.0,9/13/2004,Scholastic,[]


In [5]:
# some additional data cleaning & organizing
#
# Produces `df` with, in order: the remaining identifier/text columns, the
# weighted one-hot language columns, the multi-hot genre columns, and the four
# standardized numeric columns.

# Multiplier applied to the one-hot language columns: they become 0/LANG_WEIGHT
# while the genre indicators stay 0/1.
# NOTE(review): presumably so a language difference outweighs any single genre
# difference in the distance computation — confirm the intent.
LANG_WEIGHT = 3

# remove duplicate and uninformative features
df = data.drop(columns=[
    'publisher_x', 'title_y', 'author(s)_y', 'average_rating_y', 'ratings_count_y',
    'text_reviews_count_y', 'language_code_y', 'num_pages_y', 'publisher_y',
    'publication_date_y',
])

# consolidate english language tags; restricted to the language column so an
# identical string in any other column is left untouched
df['language_code_x'] = df['language_code_x'].replace(['en-US', 'en-GB', 'en-CA'], 'eng')

df['genres'] = df['genres'].apply(ast.literal_eval) # change genre data into lists

# one-hot encoding for language codes
# Every derived frame below reuses df.index so that pd.concat(axis=1) lines the
# rows up correctly even if df does not carry the default 0..n-1 index.
onehot = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
langs = pd.DataFrame(
    onehot.fit_transform(df[['language_code_x']]),
    columns=onehot.categories_[0],
    index=df.index,
)
langs *= LANG_WEIGHT

# multi-hot encoding for genres (one 0/1 column per distinct genre label)
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'])
genres = pd.DataFrame(genre_matrix, columns=mlb.classes_, index=df.index)

# swap the raw language/genre columns for their encoded versions
df = pd.concat([df.drop(columns=['language_code_x', 'genres']), langs, genres], axis=1)

# data scaling for KMeans: standardize the numeric columns and re-append them
# as the last columns of df
scaler = StandardScaler()
feats = ['num_pages_x', 'average_rating_x', 'ratings_count_x', 'text_reviews_count_x']
X = df[feats]
X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=feats, index=df.index)
df = pd.concat([df.drop(columns=feats), X_scaled], axis=1)

In [6]:
# view the data again
# Shows the first 5 rows of df after the encoding and scaling done in the previous cell.
df.head()

Unnamed: 0,title_x,author(s)_x,isbn13,publication_date_x,ale,ara,eng,enm,fre,ger,...,world war ii,writing,young adult,young adult fantasy,young readers,zombies,num_pages_x,average_rating_x,ratings_count_x,text_reviews_count_x
0,harry potter and the half-blood prince (harry ...,j.k. rowling/mary grandpré,9780439785969,9/16/2006,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,1.314315,1.792776,18.710349,10.537255
1,harry potter and the order of the phoenix (har...,j.k. rowling/mary grandpré,9780439358071,9/1/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,2.218035,1.567369,19.227874,11.172125
2,harry potter and the chamber of secrets (harry...,j.k. rowling,9780439554893,11/1/2003,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.070663,1.370137,-0.102318,-0.114138
3,harry potter and the prisoner of azkaban (harr...,j.k. rowling/mary grandpré,9780439655484,5/1/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0.41474,1.7646,20.906391,13.939065
4,harry potter boxed set books 1-5 (harry potte...,j.k. rowling/mary grandpré,9780439682589,9/13/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0,9.762854,2.384472,0.213679,-0.145298


In [7]:
# testing KMeans hyperparameter
# Feature matrix = every column of df except the identifier/text columns.
# `cluster` is removed as well when present: a later cell adds that column to
# df, and without this a re-run of this cell after clustering would feed the
# previous labels back in as a feature.
feat_df = (
    df.drop(columns=['isbn13', 'title_x', 'author(s)_x', 'publication_date_x'])
      .drop(columns=['cluster'], errors='ignore')
)

# Fit KMeans for a few candidate cluster counts and print each silhouette score.
for k in [50, 100, 150, 200]:
    km = KMeans(n_clusters=k, random_state=0, n_init='auto')
    labels = km.fit_predict(feat_df)
    score = silhouette_score(feat_df, labels)
    print(k, score)

50 0.10239406171182006
100 0.10029604948082724
150 0.103209480473158
200 0.11101141974118414


In [8]:
# KMeans step
# Cluster the feature matrix into 50 groups and store each book's label in df.
# random_state is fixed (same seed as the KMeans sweep above) so the cluster
# assignments, and the recommendations built on them, are repeatable between runs.
# Full-batch alternative: KMeans(n_clusters=50, n_init='auto')
kmeans = MiniBatchKMeans(n_clusters=50, batch_size=512, random_state=0)
clusters = kmeans.fit_predict(feat_df)

df['cluster'] = clusters



In [9]:
# view cluster sizes, make sure none are <= 5
# The cosine-similarity cell further down picks 5 neighbours from the target's
# own cluster, so a cluster needs at least 6 members (the target plus 5 others).
# This is a visual check only; nothing here enforces the minimum.
df['cluster'].value_counts()

cluster
30    951
4     875
34    825
33    489
14    407
27    401
16    385
20    373
25    338
11    312
41    309
1     291
32    289
3     281
29    273
42    239
17    228
47    226
8     214
37    199
24    188
18    188
43    181
48    179
19    177
0     170
12    152
40    138
38    115
22    114
36    113
26    112
35     97
44     96
10     95
6      91
7      83
2      82
5      73
21     60
49     58
46     49
23     46
31     36
15     36
13     36
9      34
39     27
45     25
28     14
Name: count, dtype: int64

In [10]:
# view data again to see cluster label
# The `cluster` column assigned in the KMeans step is the last column of df.
df.head()

Unnamed: 0,title_x,author(s)_x,isbn13,publication_date_x,ale,ara,eng,enm,fre,ger,...,writing,young adult,young adult fantasy,young readers,zombies,num_pages_x,average_rating_x,ratings_count_x,text_reviews_count_x,cluster
0,harry potter and the half-blood prince (harry ...,j.k. rowling/mary grandpré,9780439785969,9/16/2006,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,1.314315,1.792776,18.710349,10.537255,39
1,harry potter and the order of the phoenix (har...,j.k. rowling/mary grandpré,9780439358071,9/1/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,2.218035,1.567369,19.227874,11.172125,39
2,harry potter and the chamber of secrets (harry...,j.k. rowling,9780439554893,11/1/2003,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0.070663,1.370137,-0.102318,-0.114138,17
3,harry potter and the prisoner of azkaban (harr...,j.k. rowling/mary grandpré,9780439655484,5/1/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,0.41474,1.7646,20.906391,13.939065,39
4,harry potter boxed set books 1-5 (harry potte...,j.k. rowling/mary grandpré,9780439682589,9/13/2004,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,9.762854,2.384472,0.213679,-0.145298,45


In [11]:
# cosine similarity step
# Pick the query book: the isbn13 value stored at row position 457 of df.
# NOTE(review): 457 looks like an arbitrary example row — confirm.
target_isbn = df['isbn13'].iloc[457]
target_isbn  # last expression in the cell, so the chosen ISBN is displayed

np.int64(9780812216271)

In [12]:
# Recommend the 5 books most similar (by cosine similarity) to target_isbn,
# searching only within the target's own cluster.
#
# Row *positions* are used throughout, because feat_matrix is a plain array
# (feat_df is derived from df, so their rows are in the same order) and the
# final lookups use .iloc.
isbn_matches = np.flatnonzero((df['isbn13'] == target_isbn).to_numpy())
if isbn_matches.size == 0:
    raise ValueError(f"isbn13 {target_isbn} not found in df")
isbn_ind = int(isbn_matches[0]) # position of target isbn
target_cluster = df['cluster'].iloc[isbn_ind] # find cluster of target isbn

# positions of the *other* cluster members; the target is removed explicitly
# instead of assuming it always ranks first, which fails when another book has
# an identical feature vector (tied similarity of 1.0)
cluster_inds = np.flatnonzero((df['cluster'] == target_cluster).to_numpy())
cluster_inds = cluster_inds[cluster_inds != isbn_ind]

feat_matrix = feat_df.to_numpy()

if cluster_inds.size:
    target_vec = feat_matrix[isbn_ind].reshape(1, -1)
    cluster_vecs = feat_matrix[cluster_inds]

    similarities = cosine_similarity(target_vec, cluster_vecs)[0]

    # most similar first; fewer than 5 rows come back if the cluster is small
    t5_inds = np.argsort(-similarities, kind='stable')[:5]
    df_inds = cluster_inds[t5_inds].tolist()
else:
    df_inds = [] # the target is the only member of its cluster

print(f"best matches for {df['title_x'].iloc[isbn_ind]}:")
df.iloc[df_inds]

best matches for aeschylus  1: the oresteia: agamemnon/the libation bearers/the eumenides:


Unnamed: 0,title_x,author(s)_x,isbn13,publication_date_x,ale,ara,eng,enm,fre,ger,...,writing,young adult,young adult fantasy,young readers,zombies,num_pages_x,average_rating_x,ratings_count_x,text_reviews_count_x,cluster
461,aeschylus 2: the persians/seven against thebes...,aeschylus/david r. slavitt/smith palmer bovie,9780812216714,1/1/1998,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,-0.426798,0.468506,-0.159233,-0.208395,44
448,euripides: iphigenia at aulis (companions to g...,euripides/pantelis michelakis,9780715629949,3/9/2006,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,-0.791602,0.271274,-0.159233,-0.208785,44
451,aeschylus ii: the suppliant maidens the persi...,aeschylus/david grene/richmond lattimore/seth ...,9780226307947,2/1/1992,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,-0.6092,0.468506,-0.155838,-0.199047,44
2935,the trial of god: (as it was held on february ...,elie wiesel/robert mcafee brown/matthew fox,9780805210538,11/14/1995,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,-0.52629,0.58121,-0.150724,-0.181131,44
425,euripides v: electra / the phoenician women / ...,euripides/david grene/richmond lattimore/emily...,9780226307848,1/15/1969,0.0,0.0,3.0,0.0,0.0,0.0,...,0,0,0,0,0,-0.44338,0.778442,-0.129321,-0.191258,44


### To Do:
1. Wrap the preprocessing and clustering steps in a `sklearn.pipeline.Pipeline`.
2. Put the cosine-similarity code in a function `recommend_books(isbn, df)` that takes an ISBN and the DataFrame of all books and returns a list of the 5 most similar books.
3. Factor repeated steps into helper functions.
4. Investigate ways to make the clusters more evenly sized.