In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from zipfile import ZipFile
from sklearn.metrics.pairwise import cosine_similarity
import pickle

## Downloading the Dataset

In [46]:
# # kaggle api
# !kaggle datasets download -d arashnic/book-recommendation-dataset

In [47]:
# with ZipFile('book-recommendation-dataset.zip', 'r') as zip:
#   zip.extractall()
#   print("The dataset has been extracted...")

In [48]:
# Load the three raw tables (books, users, ratings) from the local data/ folder.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, which is what triggers the mixed-type
# DtypeWarning. NOTE(review): the stray echo of the Books.csv line in this
# cell's saved output looks like that warning -- confirm it is gone on re-run.
books = pd.read_csv('data/Books.csv', low_memory=False)
users = pd.read_csv('data/Users.csv')
ratings = pd.read_csv('data/Ratings.csv')

  books = pd.read_csv('data/Books.csv')


In [49]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [50]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [51]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [52]:
# (rows, columns) of each table, one per line, in the order books, ratings, users.
print(*(frame.shape for frame in (books, ratings, users)), sep='\n')

(271360, 8)
(1149780, 3)
(278858, 3)


In [53]:
books.isnull().sum()

ISBN                   0
Book-Title             0
Book-Author            2
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [54]:
users.isnull().sum()

User-ID          0
Location         0
Age         110762
dtype: int64

In [55]:
ratings.isnull().sum()

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [56]:
# Number of fully duplicated rows per table, one per line (books, ratings, users).
print(*(frame.duplicated().sum() for frame in (books, ratings, users)), sep='\n')

0
0
0


## Popularity Based Recommendation System

In [57]:
# Attach book metadata to each rating. The inner join keeps only ratings whose
# ISBN also appears in `books`.
ratings_with_names = pd.merge(ratings, books, how='inner', on='ISBN')

In [58]:
# Number of rating rows per title (rows with Book-Rating 0 are counted too).
# Selecting 'Book-Rating' before counting avoids counting every other column,
# and reset_index(name=...) names the result column directly instead of
# renaming it in place afterwards.
num_rating_df = (
    ratings_with_names
    .groupby('Book-Title')['Book-Rating']
    .count()
    .reset_index(name='Num_Ratings')
)

In [59]:
num_rating_df

Unnamed: 0,Book-Title,Num_Ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [60]:
# Mean rating per title. The built-in grouped mean already returns floats for
# the integer ratings, so the per-group Python lambda with astype(float) is
# unnecessary and far slower across the ~241k titles.
avg_rating_df = (
    ratings_with_names
    .groupby('Book-Title')['Book-Rating']
    .mean()
    .reset_index(name='Avg_Rating')
)
avg_rating_df

Unnamed: 0,Book-Title,Avg_Rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [61]:
# One row per title carrying both its rating count and its mean rating.
popular_df = pd.merge(num_rating_df, avg_rating_df, on='Book-Title')
popular_df

Unnamed: 0,Book-Title,Num_Ratings,Avg_Rating
0,A Light in the Storm: The Civil War Diary of ...,4,2.250000
1,Always Have Popsicles,1,0.000000
2,Apple Magic (The Collector's series),1,0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1,8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,1,0.000000
...,...,...,...
241066,Ã?Â?lpiraten.,2,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,4,5.250000
241068,Ã?Â?sterlich leben.,1,7.000000
241069,Ã?Â?stlich der Berge.,3,2.666667


In [62]:
# Keep titles with at least 250 ratings, then take the 50 with the highest
# mean rating.
popular_df = (
    popular_df
    .loc[lambda df: df['Num_Ratings'] >= 250]
    .sort_values('Avg_Rating', ascending=False)
    .head(50)
)

In [63]:
popular_df.columns

Index(['Book-Title', 'Num_Ratings', 'Avg_Rating'], dtype='object')

In [64]:
# Attach book metadata by title. A title can match several rows of `books`
# (one per ISBN), so only the first match per title is kept.
popular_with_meta = pd.merge(popular_df, books, on='Book-Title')
popular_df = popular_with_meta.drop_duplicates(subset='Book-Title', keep='first')

In [65]:
popular_df.shape

(50, 10)

In [66]:
# Keep only the columns needed downstream. Selecting and rebinding (instead of
# dropping in place) leaves the same five columns in the same order, and the
# cell can be re-run safely: an in-place drop raises KeyError the second time
# because the columns are already gone.
popular_df = popular_df[['Book-Title', 'Num_Ratings', 'Avg_Rating', 'Book-Author', 'Image-URL-L']]

In [67]:
# Fixed seed so the rows shown here are the same on every Restart & Run All.
popular_df.sample(5, random_state=42)

Unnamed: 0,Book-Title,Num_Ratings,Avg_Rating,Book-Author,Image-URL-L
103,Fahrenheit 451,409,4.264059,Ray Bradbury,http://images.amazon.com/images/P/3257208626.0...
177,Hard Eight : A Stephanie Plum Novel (A Stephan...,269,3.825279,Janet Evanovich,http://images.amazon.com/images/P/0312983867.0...
63,1984,284,4.454225,George Orwell,http://images.amazon.com/images/P/0451524934.0...
117,Outlander,283,4.173145,DIANA GABALDON,http://images.amazon.com/images/P/0440222915.0...
173,Seven Up (A Stephanie Plum Novel),278,3.888489,Janet Evanovich,http://images.amazon.com/images/P/0312265840.0...


## Collaborative Filtering Based Recommender System

In [68]:
# Users with more than 200 rating rows. Counting only 'Book-Rating' yields the
# same per-user totals as counting every column and then selecting that one,
# without the wasted work. `x` is the boolean mask indexed by User-ID.
x = ratings_with_names.groupby('User-ID')['Book-Rating'].count() > 200
famous_users = x[x].index

In [69]:
# Ratings made by the users selected in `famous_users`.
filtered_ratings = ratings_with_names.loc[
    lambda df: df['User-ID'].isin(famous_users)
]

In [70]:
# Titles with more than 50 rating rows among the selected users. As above,
# only 'Book-Rating' is counted instead of every column. `y` is the boolean
# mask indexed by Book-Title.
y = filtered_ratings.groupby('Book-Title')['Book-Rating'].count() > 50
famous_books = y[y].index

In [71]:
# Ratings restricted to the titles selected in `famous_books`.
final_ratings = filtered_ratings.loc[
    lambda df: df['Book-Title'].isin(famous_books)
]

In [72]:
# Title x user rating matrix: one row per Book-Title, one column per User-ID.
# Title/user pairs with no rating are left missing here.
pt = pd.pivot_table(
    final_ratings,
    values="Book-Rating",
    index='Book-Title',
    columns='User-ID',
)

In [73]:
# Treat "no rating" as 0 so every row is a complete numeric vector.
pt = pt.fillna(0)

In [74]:
# Pairwise cosine similarity between the rows of `pt` (one row per book
# title). The result is a square array whose row/column order follows
# pt.index, which is what recommend() relies on when mapping positions back
# to titles.
similarity_scores = cosine_similarity(pt)

In [75]:
similarity_scores.shape

(679, 679)

In [76]:
def recommend(book_name, top_n=6):
  """Return the books most similar to ``book_name``.

  Similarity is read from the module-level ``similarity_scores`` matrix, whose
  rows and columns follow the order of ``pt.index``; title, author and cover
  URL are looked up in ``books``.

  Args:
    book_name: Title to look up; must match an entry of ``pt.index`` exactly.
    top_n: Maximum number of recommendations to return (default 6).

  Returns:
    A list of ``[title, author, image_url]`` lists, most similar first. The
    queried book itself is never part of the result.

  Raises:
    IndexError: If ``book_name`` is not present in ``pt.index``.
    ValueError: If ``top_n`` is negative.
  """
  if top_n < 0:
    raise ValueError(f"top_n must be non-negative, got {top_n}")

  matches = np.where(pt.index == book_name)[0]
  if matches.size == 0:
    # Same exception type the bare [0][0] lookup raised, but with a message
    # that names the missing title.
    raise IndexError(f"{book_name!r} is not in the similarity matrix")
  index = matches[0]

  # Stable descending order: equal scores keep their pt.index order.
  scores = np.asarray(similarity_scores[index], dtype=float)
  ranked = np.argsort(-scores, kind='stable')
  # Exclude the queried book explicitly. Slicing off position 0 is wrong when
  # another book ties with it at the top score (or when its own row is all
  # zeros), because the queried book is then not the first entry.
  similar_items = [i for i in ranked if i != index][:top_n]

  data = []
  for i in similar_items:
    # Every matching row shares the same title, so the first row is exactly
    # the one drop_duplicates("Book-Title") would keep; look it up once.
    first_match = books[books['Book-Title'] == pt.index[i]].head(1)
    data.append(
      first_match['Book-Title'].tolist()
      + first_match['Book-Author'].tolist()
      + first_match['Image-URL-L'].tolist()
    )

  return data

In [77]:
recommend('The Da Vinci Code')

[['Angels &amp; Demons',
  'Dan Brown',
  'http://images.amazon.com/images/P/0671027360.01.LZZZZZZZ.jpg'],
 ['Touching Evil',
  'Kay Hooper',
  'http://images.amazon.com/images/P/0553583441.01.LZZZZZZZ.jpg'],
 ['Saving Faith',
  'David Baldacci',
  'http://images.amazon.com/images/P/0446608890.01.LZZZZZZZ.jpg'],
 ["The Sweet Potato Queens' Book of Love",
  'JILL CONNER BROWNE',
  'http://images.amazon.com/images/P/0609804138.01.LZZZZZZZ.jpg'],
 ['Middlesex: A Novel',
  'Jeffrey Eugenides',
  'http://images.amazon.com/images/P/0312422156.01.LZZZZZZZ.jpg'],
 ['The Lovely Bones: A Novel',
  'Alice Sebold',
  'http://images.amazon.com/images/P/0316666343.01.LZZZZZZZ.jpg']]

In [79]:
# Persist the artifacts produced above. Each file is opened in a with-block
# so the handle is flushed and closed as soon as the dump finishes; the bare
# open(...) calls never closed their handles explicitly.
for pkl_path, artifact in (
    ('PKL/popular.pkl', popular_df),
    ('PKL/pt.pkl', pt),
    ('PKL/books.pkl', books),
    ('PKL/similarity_scores.pkl', similarity_scores),
):
  with open(pkl_path, 'wb') as pkl_file:
    pickle.dump(artifact, pkl_file)