In [1]:
import pandas as pd
import numpy as np


In [2]:
books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [4]:
books_filename

'BX-Books.csv'

In [14]:
books_df = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})



In [15]:
ratings_df = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [16]:
books_df.head()

Unnamed: 0,isbn,title,author
0,195153448,Classical Mythology,Mark P. O. Morford
1,2005018,Clara Callan,Richard Bruce Wright
2,60973129,Decision in Normandy,Carlo D'Este
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,393045218,The Mummies of Urumchi,E. J. W. Barber


In [18]:
del books_df['author']

In [19]:
books_df.head()

Unnamed: 0,isbn,title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [21]:
ratings_df.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [23]:
df=pd.merge(ratings_df,books_df,on='isbn')

In [24]:
df.head()

Unnamed: 0,user,isbn,rating,title
0,276725,034545104X,0.0,Flesh Tones: A Novel
1,2313,034545104X,5.0,Flesh Tones: A Novel
2,6543,034545104X,0.0,Flesh Tones: A Novel
3,8680,034545104X,5.0,Flesh Tones: A Novel
4,10314,034545104X,9.0,Flesh Tones: A Novel


In [25]:
combine_book_rating = df.dropna(axis = 0, subset = ['title'])
combine_book_rating.head()

Unnamed: 0,user,isbn,rating,title
0,276725,034545104X,0.0,Flesh Tones: A Novel
1,2313,034545104X,5.0,Flesh Tones: A Novel
2,6543,034545104X,0.0,Flesh Tones: A Novel
3,8680,034545104X,5.0,Flesh Tones: A Novel
4,10314,034545104X,9.0,Flesh Tones: A Novel


In [26]:
book_ratingCount = (combine_book_rating. 
    groupby(by = ['title'])['rating']. 
    count().
    reset_index().
    rename (columns = {'rating': 'totalRatingCount'}) 
    [['title', 'totalRatingCount']]
                    )


In [28]:
book_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [31]:
#df_test = combine_movie_rating[combine_movie_rating['title'].str.contains('eXistenZ (1999)')]
#df_test

In [32]:
rating_with_totalRatingCount=combine_book_rating.merge(book_ratingCount,left_on='title',right_on='title',how='left')
rating_with_totalRatingCount.head()

Unnamed: 0,user,isbn,rating,title,totalRatingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,60
1,2313,034545104X,5.0,Flesh Tones: A Novel,60
2,6543,034545104X,0.0,Flesh Tones: A Novel,60
3,8680,034545104X,5.0,Flesh Tones: A Novel,60
4,10314,034545104X,9.0,Flesh Tones: A Novel,60


In [33]:
pd.set_option('display.float_format',lambda x:'%.3f'%x)
print(book_ratingCount['totalRatingCount'].describe())

count   241090.000
mean         4.277
std         16.738
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [34]:
popularity_threshold=50
rating_popular_book=rating_with_totalRatingCount.query('totalRatingCount>=@popularity_threshold')
rating_popular_book.head()

Unnamed: 0,user,isbn,rating,title,totalRatingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,60
1,2313,034545104X,5.0,Flesh Tones: A Novel,60
2,6543,034545104X,0.0,Flesh Tones: A Novel,60
3,8680,034545104X,5.0,Flesh Tones: A Novel,60
4,10314,034545104X,9.0,Flesh Tones: A Novel,60


In [35]:
rating_popular_book.shape

(288740, 5)

In [37]:
book_features_df=rating_popular_book.pivot_table(index='title',columns='user',values='rating').fillna(0)
book_features_df.head() 

user,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
from scipy.sparse import csr_matrix

book_features_df_matrix = csr_matrix(book_features_df.values) 
book_features_df_matrix

<2444x47994 sparse matrix of type '<class 'numpy.float32'>'
	with 113910 stored elements in Compressed Sparse Row format>

In [39]:
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric = 'cosine', algorithm= 'brute')
model_knn.fit(book_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [41]:
book_features_df.shape

(2444, 47994)

In [50]:
query_index=np.random.choice(book_features_df.shape[0])
print(query_index)


881


In [51]:
distances,indices=model_knn.kneighbors(book_features_df.iloc[query_index,:].values.reshape(1,-1),n_neighbors=6)
print(indices)

[[ 881 2199 2262 1589 1252  580]]


In [52]:
book_features_df.head()

user,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
for i in range(0,len(distances.flatten())):
    if i==0:
        print('recommendations for {0}:\n'.format(book_features_df.index[query_index]))
    else:
        print('{0}:{1},with distance of {2}:'.format(i,book_features_df.index[indices.flatten()[i]],distances.flatten()[i]))

recommendations for If I'd Killed Him When I Met Him (Elizabeth MacPherson Novels (Paperback)):

1:The Third Victim,with distance of 0.8818414211273193:
2:Three Wishes,with distance of 0.8874393701553345:
3:Star,with distance of 0.890062689781189:
4:One for the Money (Stephanie Plum Novels (Paperback)),with distance of 0.890186607837677:
5:Embraced by the Light,with distance of 0.9033122658729553:
