In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors

In [4]:
# Load the three CSV exports of the book-rating dataset.
# low_memory=False makes pandas parse Books.csv in a single pass so each
# column's dtype is inferred once for the whole file; the default chunked
# inference is what triggers the warning this cell used to emit on this read
# (presumably a mixed-type DtypeWarning - confirm on the next run).
books = pd.read_csv("Books.csv", low_memory=False)
users = pd.read_csv("Users.csv")
ratings = pd.read_csv("Ratings.csv")
print('Books:',books.shape)
print('Users:',users.shape)
print('Ratings:',ratings.shape)

  books = pd.read_csv("Books.csv")


Books: (271360, 8)
Users: (278858, 3)
Ratings: (1149780, 3)


## Books

In [5]:
# Preview the first rows of the books table to see the raw column names.
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [6]:
# Replace the hyphenated CSV headers with short, identifier-friendly names.
books = books.rename(
    columns={
        'Book-Title': 'Title',
        'Book-Author': 'Author',
        'Year-Of-Publication': 'Publication_Year',
    }
)

In [7]:
# Column dtypes and non-null counts for the books table.
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271360 entries, 0 to 271359
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   ISBN              271360 non-null  object
 1   Title             271360 non-null  object
 2   Author            271358 non-null  object
 3   Publication_Year  271360 non-null  object
 4   Publisher         271358 non-null  object
 5   Image-URL-S       271360 non-null  object
 6   Image-URL-M       271360 non-null  object
 7   Image-URL-L       271357 non-null  object
dtypes: object(8)
memory usage: 16.6+ MB


In [8]:
# Missing values per column in the books table.
books.isna().sum()

ISBN                0
Title               0
Author              2
Publication_Year    0
Publisher           2
Image-URL-S         0
Image-URL-M         0
Image-URL-L         3
dtype: int64

In [9]:
# Number of rows that are exact repeats of an earlier row (all columns equal).
books.duplicated().sum()

0

In [10]:
# Switch every cover-image URL column from http to https.
# One loop instead of three copy-pasted statements, and regex=False so the
# pattern is always matched as a literal string regardless of the installed
# pandas version (the default for `regex` changed between major versions).
for url_column in ('Image-URL-S', 'Image-URL-M', 'Image-URL-L'):
    books[url_column] = books[url_column].str.replace('http://', 'https://', regex=False)

## Users

In [11]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [12]:
# Use an underscore in the user id column name so it matches the other tables.
users = users.rename(columns={'User-ID': 'User_ID'})

In [13]:
# Column dtypes and non-null counts for the users table.
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   User_ID   278858 non-null  int64  
 1   Location  278858 non-null  object 
 2   Age       168096 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 6.4+ MB


In [14]:
# Missing values per column in the users table.
users.isna().sum()

User_ID          0
Location         0
Age         110762
dtype: int64

In [15]:
# Number of rows that are exact repeats of an earlier row (all columns equal).
users.duplicated().sum()

0

## Ratings

In [16]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [17]:
# Replace the hyphenated CSV headers with short, identifier-friendly names.
ratings = ratings.rename(
    columns={
        'User-ID': 'User_ID',
        'Book-Rating': 'Rating',
    }
)

In [18]:
# Column dtypes and non-null counts for the ratings table.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   User_ID  1149780 non-null  int64 
 1   ISBN     1149780 non-null  object
 2   Rating   1149780 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [19]:
# Missing values per column in the ratings table.
ratings.isna().sum()

User_ID    0
ISBN       0
Rating     0
dtype: int64

In [20]:
# Number of rows that are exact repeats of an earlier row (all columns equal).
ratings.duplicated().sum()

0

## Top 50 books based on popularity

In [21]:
# Attach book metadata to every rating. The join is an inner join on ISBN, so
# ratings whose ISBN has no row in `books` are dropped.
ratings_with_books = pd.merge(ratings, books, on='ISBN', how='inner')
ratings_with_books.shape

(1031136, 10)

In [22]:
# ratings_with_books.head()

In [23]:
# How many ratings each title received (one row per title).
num = (
    ratings_with_books
    .groupby('Title')['Rating']
    .count()
    .rename('Number_of_Ratings')
    .reset_index()
)
num

Unnamed: 0,Title,Number_of_Ratings
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1
...,...,...
241066,Ã?Â?lpiraten.,2
241067,Ã?Â?rger mit Produkt X. Roman.,4
241068,Ã?Â?sterlich leben.,1
241069,Ã?Â?stlich der Berge.,3


In [24]:
# Mean rating of each title (one row per title).
avg = (
    ratings_with_books
    .groupby('Title')['Rating']
    .mean()
    .rename('Average_Rating')
    .reset_index()
)
avg

Unnamed: 0,Title,Average_Rating
0,A Light in the Storm: The Civil War Diary of ...,2.250000
1,Always Have Popsicles,0.000000
2,Apple Magic (The Collector's series),0.000000
3,"Ask Lily (Young Women of Faith: Lily Series, ...",8.000000
4,Beyond IBM: Leadership Marketing and Finance ...,0.000000
...,...,...
241066,Ã?Â?lpiraten.,0.000000
241067,Ã?Â?rger mit Produkt X. Roman.,5.250000
241068,Ã?Â?sterlich leben.,7.000000
241069,Ã?Â?stlich der Berge.,2.666667


In [25]:
# Combine the per-title rating count and mean rating into one table.
Popularity_df = num.merge(avg, on='Title')
# Fixed seed so the preview shows the same rows after Restart & Run All.
Popularity_df.sample(5, random_state=42)

Unnamed: 0,Title,Number_of_Ratings,Average_Rating
216632,The tartan sell (Penguin crime fiction),2,3.5
70083,Frameshift,10,2.3
87998,Hunting the Wild Pineapple,1,8.0
90345,Illearth War Covenant 2,11,2.272727
164839,Special Edition Using Visual Basic 4 (Using .....,1,0.0


In [26]:
# Keep titles with more than 250 ratings, rank them by mean rating
# (best first) and retain the top 50.
Popularity_df = (
    Popularity_df
    .loc[lambda frame: frame['Number_of_Ratings'] > 250]
    .sort_values(by='Average_Rating', ascending=False)
    .iloc[:50]
)
Popularity_df.head()

Unnamed: 0,Title,Number_of_Ratings,Average_Rating
80434,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804
80422,Harry Potter and the Goblet of Fire (Book 4),387,5.824289
80441,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741
80426,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441
80414,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453


In [28]:
# One metadata row per title: for titles listed more than once, keep the
# first occurrence.
books1 = books.drop_duplicates(subset=['Title'], keep='first')

In [29]:
# Add book metadata (one row per title) to the top-50 popularity table.
df = pd.merge(Popularity_df, books1, on='Title', how='inner')
df.shape

(50, 10)

In [30]:
# Keep only the columns needed to present the popular books, in display order.
Popularity_Final = df.loc[
    :,
    [
        'ISBN',
        'Title',
        'Author',
        'Publication_Year',
        'Publisher',
        'Image-URL-L',
        'Number_of_Ratings',
        'Average_Rating',
    ],
]

In [31]:
# Preview the highest-rated popular books with their metadata.
Popularity_Final.head()

Unnamed: 0,ISBN,Title,Author,Publication_Year,Publisher,Image-URL-L,Number_of_Ratings,Average_Rating
0,0439136350,Harry Potter and the Prisoner of Azkaban (Book 3),J. K. Rowling,1999,Scholastic,https://images.amazon.com/images/P/0439136350....,428,5.852804
1,0439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,https://images.amazon.com/images/P/0439139597....,387,5.824289
2,0590353403,Harry Potter and the Sorcerer's Stone (Book 1),J. K. Rowling,1998,Scholastic,https://images.amazon.com/images/P/0590353403....,278,5.73741
3,043935806X,Harry Potter and the Order of the Phoenix (Boo...,J. K. Rowling,2003,Scholastic,https://images.amazon.com/images/P/043935806X....,347,5.501441
4,0439064872,Harry Potter and the Chamber of Secrets (Book 2),J. K. Rowling,2000,Scholastic,https://images.amazon.com/images/P/0439064872....,556,5.183453


In [32]:
# Thresholds for the collaborative-filtering subset; both are applied with a
# strict ">" comparison in the filtering cells below.
# A title is kept only if it has more than this many ratings from kept users.
aGoodBook = 50
# A user is kept only if they have more than this many ratings.
aGoodUser = 200

In [38]:
# Boolean flag per user: True when the user has more than `aGoodUser` ratings.
x = ratings_with_books.groupby('User_ID')['Rating'].count().gt(aGoodUser)
# Ids of the users that pass the threshold.
good_users_index = x.index[x.to_numpy()]
# Ratings made by those users only.
filtered_ratings = ratings_with_books.loc[
    ratings_with_books['User_ID'].isin(good_users_index)
]
filtered_ratings.shape

(474007, 10)

In [39]:
# Boolean flag per title: True when the kept users gave it more than
# `aGoodBook` ratings.
y = filtered_ratings.groupby('Title')['Rating'].count().gt(aGoodBook)
# Titles that pass the threshold.
good_book_index = y.index[y.to_numpy()]
# Ratings restricted to those titles.
filtered_ratings_books = filtered_ratings.loc[
    filtered_ratings['Title'].isin(good_book_index)
]
filtered_ratings_books.shape

(57236, 10)

In [40]:
# Title x user rating matrix. Several ratings for the same (title, user) pair
# are averaged ('mean' is the pivot_table default); unrated pairs are NaN.
pt = pd.pivot_table(
    filtered_ratings_books,
    values='Rating',
    index='Title',
    columns='User_ID',
    aggfunc='mean',
)

In [41]:
# Treat "user never rated this title" as a rating of 0.
pt = pt.fillna(0)
pt.shape

(679, 810)

In [42]:
# Preview the title x user matrix (rows are titles, columns are user ids).
pt.head()

User_ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
# Independent copy of the rating matrix used for the nearest-neighbour model.
final_dataset = pt.copy(deep=True)

In [44]:
# Compressed sparse row form of the rating matrix's values.
final_dataset_sparse = csr_matrix(final_dataset.to_numpy())

In [46]:
# Brute-force nearest-neighbour search over the title rows, comparing rating
# vectors by cosine distance.
nn_model = NearestNeighbors(algorithm='brute', metric='cosine')
nn_model.fit(final_dataset_sparse)

In [47]:
def recommend_nn(book_name, n_recommendations=5):
    """Print the titles whose rating vectors are closest to ``book_name``.

    Uses the module-level ``nn_model`` (fitted nearest-neighbour model) and
    ``final_dataset`` (title x user rating matrix indexed by title).

    Parameters
    ----------
    book_name : str
        Title to find neighbours for; must be a row label of ``final_dataset``.
    n_recommendations : int, default 5
        Maximum number of titles to print.

    Returns
    -------
    None
        The recommendations are printed, numbered from 1.

    Raises
    ------
    ValueError
        If ``n_recommendations`` is smaller than 1, or if ``book_name`` is not
        a title in ``final_dataset``.
    """
    if n_recommendations < 1:
        raise ValueError("n_recommendations must be at least 1")
    if book_name not in final_dataset.index:
        # Fail with a readable message instead of the empty-array error the
        # model raises when it is queried with zero rows.
        raise ValueError(f"Book {book_name!r} is not in the recommendation dataset")

    # Ask for one extra neighbour because the query title is normally returned
    # as its own closest match; never ask for more rows than the matrix has.
    n_neighbors = min(n_recommendations + 1, len(final_dataset))
    query = final_dataset.loc[[book_name]]
    _, sugg = nn_model.kneighbors(query, n_neighbors=n_neighbors)

    # Drop the query title by name rather than by position: with tied
    # distances it is not guaranteed to be the first neighbour.
    neighbour_titles = [final_dataset.index[position] for position in sugg[0]]
    recommendations = [
        title for title in neighbour_titles if title != book_name
    ][:n_recommendations]

    print('Book Recommendation for',book_name,'are:')
    for rank, title in enumerate(recommendations, start=1):
        print(rank, title)

In [48]:
# Example: print the nearest-neighbour recommendations for one title.
recommend_nn('Message in a Bottle')

Book Recommendation for Message in a Bottle are:
1 Nights in Rodanthe
2 The Mulberry Tree
3 A Walk to Remember
4 River's End
5 Nightmares &amp; Dreamscapes
