## Data Loading

In [1]:
#importing python libraries
import pandas as pd
import numpy as np
import pickle

In [2]:
#loading datasets
df_books = pd.read_csv('Dataset/Books.csv', low_memory=False)
df_ratings = pd.read_csv('Dataset/Ratings.csv')
df_users = pd.read_csv('Dataset/Users.csv')

In [3]:
#set seed for reproducibility
np.random.seed(0)

## Preprocessing on Books dataset

In [4]:
#first five rows of books dataset
df_books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
#number of missing values in books dataset
missing_books_count = df_books.isnull().sum()
missing_books_count

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
#dropping unrequired columns in books dataset
df_books.drop(['Image-URL-S', 'Image-URL-L'], axis = 1, inplace = True)

In [7]:
#uppercasing ISBN
df_books['ISBN'].str.upper()

0         0195153448
1         0002005018
2         0060973129
3         0374157065
4         0393045218
             ...    
271355    0440400988
271356    0525447644
271357    006008667X
271358    0192126040
271359    0767409752
Name: ISBN, Length: 271360, dtype: object

In [8]:
#replacing null author and publisher with other
null_Author = np.where(df_books['Book-Author'].isnull())
null_publisher = np.where(df_books['Publisher'].isnull())

df_books.at[null_Author[0][0],'Book-Author'] = 'Other'
df_books.at[null_publisher[0][0],'Publisher'] = 'Other'
df_books.at[null_publisher[0][1],'Publisher'] = 'Other'

In [9]:
#get all the unique values of year of publication
years = df_books['Year-Of-Publication'].unique().sort()


In [10]:
#checking data for 'DK Publishing Inc'
df_books.loc[df_books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
209538,078946697X,"DK Readers: Creating the X-Men, How It All Beg...",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.0...,http://images.amazon.com/images/P/078946697X.0...
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Book...",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.0...,http://images.amazon.com/images/P/0789466953.0...


In [11]:
#editing data for DK Publishing Inc
df_books.at[209538,'Book-Author'] = 'Other'
df_books.at[209538,'Year-Of-Publication'] = 2000
df_books.at[209538,'Publisher'] = 'DK Publishing Inc'

df_books.at[221678,'Book-Author'] = 'Other'
df_books.at[221678,'Publisher'] = 'DK Publishing Inc'
df_books.at[221678,'Year-Of-Publication'] = 2000

In [12]:
#checking data for 'Gallimard'
df_books.loc[df_books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-M...",2003,Gallimard,http://images.amazon.com/images/P/2070426769.0...,http://images.amazon.com/images/P/2070426769.0...


In [13]:
#editing data for Gallimard
df_books.at[220731 ,'Book-Author'] = 'Other'
df_books.at[220731 ,'Publisher'] = 'Gallimard'
df_books.at[220731 ,'Year-Of-Publication'] = '2003'

In [14]:
#converting year of publication in int data type
df_books['Year-Of-Publication'] = df_books['Year-Of-Publication'].astype(int)

In [15]:
#selecting range which less than 2022
df_books.loc[df_books['Year-Of-Publication'] > 2022, 'Year-Of-Publication'] = 2002

#replacing Invalid years with max year
df_books.loc[df_books['Year-Of-Publication'] == 0, 'Year-Of-Publication'] = 2002

In [16]:
#duplicate rows in books dataset
duplicated_books = df_books.duplicated().sum()

## Preprocessing on Users dataset

In [17]:
#first five rows of users dataset
df_users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [18]:
#number of missing values in users dataset
missing_users_count = df_users.isnull().sum()
print(missing_users_count)

User-ID          0
Location         0
Age         110762
dtype: int64


In [19]:
#splitting location into city, state and country
locations_list = df_users.Location.str.split(', ')
location_count = len(locations_list)
cities_list = []
states_list = []
countries_list = []
for location in range(0, location_count):
    if locations_list[location][0] == '' or locations_list[location][0] == 'n/a' or locations_list[location][0] == ' ':
        cities_list.append('Other')
    else: 
        cities_list.append(locations_list[location][0])

    if (len(locations_list[location]) < 2):
        states_list.append('Other')
        countries_list.append('Other')
    
    else: 
        if locations_list[location][1] == '' or locations_list[location][1] == 'n/a' or locations_list[location][1] == ' ':
            states_list.append('Other')
        else: 
            states_list.append(locations_list[location][1])
        
        if (len(locations_list[location]) < 3):
            countries_list.append('Other')
        
        else: 
            if locations_list[location][2] == '' or locations_list[location][2] == 'n/a' or locations_list[location][2] == ' ':
                countries_list.append('Other')
            else: 
                countries_list.append(locations_list[location][2])



In [20]:
#creating location dataframes
df_city = pd.DataFrame(cities_list, columns=['City'])
df_state = pd.DataFrame(states_list, columns = ['State'])
df_country = pd.DataFrame(countries_list, columns =['Country'])

df_location = pd.concat([df_city, df_state, df_country], axis=1)
df_location

Unnamed: 0,City,State,Country
0,nyc,new york,usa
1,stockton,california,usa
2,moscow,yukon territory,russia
3,porto,v.n.gaia,portugal
4,farnborough,hants,united kingdom
...,...,...,...
278853,portland,oregon,usa
278854,tacoma,washington,united kingdom
278855,brampton,ontario,canada
278856,knoxville,tennessee,usa


In [21]:
#converting location to lowercase
df_location['City'] = df_location['City'].str.lower()
df_location['State'] = df_location['State'].str.lower()
df_location['Country'] = df_location['Country'].str.lower()

In [22]:
#adding locations to df_users
df_users = pd.concat([df_users, df_location], axis = 1)
df_users

Unnamed: 0,User-ID,Location,Age,City,State,Country
0,1,"nyc, new york, usa",,nyc,new york,usa
1,2,"stockton, california, usa",18.0,stockton,california,usa
2,3,"moscow, yukon territory, russia",,moscow,yukon territory,russia
3,4,"porto, v.n.gaia, portugal",17.0,porto,v.n.gaia,portugal
4,5,"farnborough, hants, united kingdom",,farnborough,hants,united kingdom
...,...,...,...,...,...,...
278853,278854,"portland, oregon, usa",,portland,oregon,usa
278854,278855,"tacoma, washington, united kingdom",50.0,tacoma,washington,united kingdom
278855,278856,"brampton, ontario, canada",,brampton,ontario,canada
278856,278857,"knoxville, tennessee, usa",,knoxville,tennessee,usa


In [23]:
#dropping location from users dataset
df_users.drop(['Location'], axis = 1, inplace = True)

In [24]:
#age preprocessing
ages = df_users['Age'].unique().sort()
considerable_age = df_users[df_users['Age'] <= 98] 
considerable_age = considerable_age[considerable_age['Age'] >= 8]
average_age = round(considerable_age['Age'].mean())


In [25]:
#replacing ages that don't fall in range with average
df_users.loc[df_users['Age'] > 98, 'Age'] = average_age
df_users.loc[df_users['Age'] < 8, 'Age'] = average_age

In [26]:
#filling missing age with average age 
#changing age data type to int
df_users['Age'] = df_users['Age'].fillna(average_age)

df_users['Age'] = df_users['Age'].astype(int)

In [27]:
#duplicate users in books dataset
duplicated_users = df_users.duplicated().sum()
duplicated_users

0

## Preprocessing on Ratings dataset

In [28]:
#first five rows of ratings dataset
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [29]:
#number of missing values in ratings dataset
missing_ratings_count = df_ratings.isnull().sum()
missing_ratings_count

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [30]:
#checking data type of 'Book-Rating'
df_ratings.dtypes

User-ID         int64
ISBN           object
Book-Rating     int64
dtype: object

In [31]:
#uppercasing ISBN
df_books['ISBN'].str.upper()

0         0195153448
1         0002005018
2         0060973129
3         0374157065
4         0393045218
             ...    
271355    0440400988
271356    0525447644
271357    006008667X
271358    0192126040
271359    0767409752
Name: ISBN, Length: 271360, dtype: object

In [32]:
#duplicate ratings in books dataset
duplicated_ratings = df_ratings.duplicated().sum()
duplicated_ratings

0

## Dataset Merging

In [33]:
df_recommendation_dataset = pd.merge(df_books, df_ratings, on="ISBN")
df_recommendation_dataset = pd.merge(df_recommendation_dataset, df_users, on="User-ID")

In [34]:
df_recommendation_dataset.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-M,User-ID,Book-Rating,Age,City,State,Country
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,2,0,18,stockton,california,usa
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,8,5,35,timmins,ontario,canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,8,0,35,timmins,ontario,canada
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,8,0,35,timmins,ontario,canada
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,8,0,35,timmins,ontario,canada


In [35]:
#books with ratings
df_books_with_ratings = df_recommendation_dataset[df_recommendation_dataset['Book-Rating'] != 0]
df_books_with_ratings = df_books_with_ratings.reset_index(drop = True)

In [36]:
#books without ratings
df_books_without_ratings = df_recommendation_dataset[df_recommendation_dataset['Book-Rating'] == 0]
df_books_without_ratings = df_books_without_ratings.reset_index(drop = True)

## TOP 50 Books

In [37]:
#calculating total number of ratings for each book
df_ratings_count = df_books_with_ratings.groupby('Book-Title').count()['Book-Rating'].reset_index()
df_ratings_count = df_ratings_count.sort_values('Book-Rating', ascending=False)

In [38]:
#calculating average ratings 
df_average_rating = df_books_with_ratings.groupby('Book-Title').mean(numeric_only = True)['Book-Rating'].reset_index()
df_average_rating.rename(columns={'Book-Rating':'Average-Rating'},inplace=True)
df_average_rating = df_average_rating.sort_values('Average-Rating', ascending=False)


In [39]:
#merging total-ratings and average-ratings dataset
df_popular_books = pd.merge(df_ratings_count, df_average_rating, on="Book-Title")

In [40]:
#filter to consider total-ratings atleast more than 200
df_top_books = df_popular_books[df_popular_books['Book-Rating']>=200].sort_values('Average-Rating',ascending=False)

In [41]:
#merge with books for display
df_top_books = df_top_books.merge(df_books,on='Book-Title').drop_duplicates('Book-Title')[['Book-Title','Book-Author','Image-URL-M', 'Book-Rating', 'Average-Rating']]
df_top_books.reset_index(inplace=True)

In [42]:
def get_top_books():
    top_books = pickle.dump(df_top_books, open('top_books.pkl', 'wb'))
    return top_books
get_top_books()

## Books by same author and publisher

In [43]:
#calculating ratings count on all books
df_total_ratings_count = df_recommendation_dataset.groupby('Book-Title').count()['Book-Rating'].reset_index()
df_total_ratings_count = df_total_ratings_count.sort_values('Book-Rating', ascending=False)

In [44]:
#calculating average ratings on all books
df_average_books_rating = df_recommendation_dataset.groupby('Book-Title').mean(numeric_only = True)['Book-Rating'].reset_index()
df_average_books_rating.rename(columns={'Book-Rating':'Average-Rating'},inplace=True)

In [45]:
# merging all the books
df_all_books = df_total_ratings_count.merge(df_average_books_rating,on='Book-Title')

In [46]:
#calculating aggregared rating
df_author_recommendations = df_all_books.sort_values('Average-Rating', ascending=False)
df_author_recommendations["Aggregated-Rating"] = df_author_recommendations['Book-Rating']*df_author_recommendations['Average-Rating']

In [47]:
#merging with books
df_author_recommendations = df_author_recommendations.merge(df_books,on='Book-Title').drop_duplicates('Book-Title')
df_author_recommendations=df_author_recommendations.sort_values('Aggregated-Rating',ascending=False)

In [48]:
#books by same author
bookname = input()
#Harry Potter and the Chamber of Secrets (Book 2)
dataframe_books = df_author_recommendations[df_author_recommendations['Book-Title'] == bookname]
book_author = dataframe_books['Book-Author']
author_name = book_author.to_string(index=False)
author_recommnedations = df_author_recommendations.loc[df_author_recommendations['Book-Author'] == author_name,:][:6]
author_recommnedations.drop(author_recommnedations.index[author_recommnedations['Book-Title'] == bookname], inplace = True)
author_recommnedations

Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))


Unnamed: 0,Book-Title,Book-Rating,Average-Rating,Aggregated-Rating,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
57032,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453,2882.0,0439064872,J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...
53173,Harry Potter and the Prisoner of Azkaban (Book 3),428,5.852804,2505.0,0439136350,J. K. Rowling,1999,Scholastic,http://images.amazon.com/images/P/0439136350.0...
53243,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,2254.0,0439139597,J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...
54691,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,1909.0,043935806X,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...
53580,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741,1595.0,0590353403,J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...


In [49]:
#books by same publisher
bookname = input()
#Harry Potter and the Chamber of Secrets (Book 2)
dataframe_books = df_author_recommendations[df_author_recommendations['Book-Title'] == bookname]
book_publisher = dataframe_books['Publisher']
publisher_name = book_publisher.to_string(index=False)
publisher_recommnedations = df_author_recommendations.loc[df_author_recommendations['Publisher'] == publisher_name,:][:6]
publisher_recommnedations.drop(publisher_recommnedations.index[publisher_recommnedations['Book-Title'] == bookname], inplace = True)
publisher_recommnedations

Harry Potter and the Prisoner of Azkaban (Book 3)


Unnamed: 0,Book-Title,Book-Rating,Average-Rating,Aggregated-Rating,ISBN,Book-Author,Year-Of-Publication,Publisher,Image-URL-M
57032,Harry Potter and the Chamber of Secrets (Book 2),556,5.183453,2882.0,0439064872,J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439064872.0...
53243,Harry Potter and the Goblet of Fire (Book 4),387,5.824289,2254.0,0439139597,J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...
54691,Harry Potter and the Order of the Phoenix (Boo...,347,5.501441,1909.0,043935806X,J. K. Rowling,2003,Scholastic,http://images.amazon.com/images/P/043935806X.0...
53580,Harry Potter and the Sorcerer's Stone (Book 1),278,5.73741,1595.0,0590353403,J. K. Rowling,1998,Scholastic,http://images.amazon.com/images/P/0590353403.0...
103471,The Thief Lord,124,3.25,403.0,0439404371,Cornelia Funke,2002,Scholastic,http://images.amazon.com/images/P/0439404371.0...


## collaborative filtering

In [50]:
#fetching experienced users who have rated at least 200 books
collaborative_user_data = df_recommendation_dataset.groupby('User-ID').count()['Book-Rating'] > 200
experienced_users = collaborative_user_data[collaborative_user_data].index

df_filtered_collaborative_data = df_recommendation_dataset[df_recommendation_dataset['User-ID'].isin(experienced_users)]

In [51]:
#fetching books with minimum 50 ratings by users
collaborative_rating_data = df_filtered_collaborative_data.groupby('Book-Title').count()['Book-Rating'] > 50
books_with_experienced_ratings = collaborative_rating_data[collaborative_rating_data].index 

df_final_collaborative_data = df_filtered_collaborative_data[df_filtered_collaborative_data['Book-Title'].isin(books_with_experienced_ratings)]

In [52]:
#creating pivot table
df_pivot_table = df_final_collaborative_data.pivot_table(index='Book-Title', columns='User-ID', values='Book-Rating')
df_pivot_table.fillna(0, inplace=True)

In [53]:
df_pivot_table

User-ID,254,2276,2766,2977,3363,4017,4385,6251,6323,6543,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bend in the Road,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
#similarity-scores
from sklearn.metrics.pairwise import cosine_similarity
df_similarity_scores = cosine_similarity(df_pivot_table)

In [55]:
def collaborative_recommendation(book_name):
    book_index = np.where(df_pivot_table.index==book_name)[0][0]
    similar_books = sorted(list(enumerate(df_similarity_scores[book_index])),key=lambda x:x[1],reverse=True)[1:5]
    
    recommended_books = []
    for book in similar_books:
        books = []
        temp_df = df_books[df_books['Book-Title'] == df_pivot_table.index[book[0]]]
        books.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Title'].values))
        books.extend(list(temp_df.drop_duplicates('Book-Title')['Book-Author'].values))
        books.extend(list(temp_df.drop_duplicates('Book-Title')['Image-URL-M'].values))
        
        recommended_books.append(books)
    
    return recommended_books

In [56]:
collaborative_recommendation('1st to Die: A Novel')

[['Along Came a Spider (Alex Cross Novels)',
  'James Patterson',
  'http://images.amazon.com/images/P/0446364193.01.MZZZZZZZ.jpg'],
 ['Roses Are Red (Alex Cross Novels)',
  'James Patterson',
  'http://images.amazon.com/images/P/0446605484.01.MZZZZZZZ.jpg'],
 ['Pop Goes the Weasel',
  'James Patterson',
  'http://images.amazon.com/images/P/0316693286.01.MZZZZZZZ.jpg'],
 ['Violets Are Blue',
  'James Patterson',
  'http://images.amazon.com/images/P/0446611212.01.MZZZZZZZ.jpg']]

In [57]:
pickle.dump(df_pivot_table, open('pivot_table.pkl', 'wb'))
pickle.dump(df_similarity_scores, open('similarity_scores.pkl', 'wb'))
pickle.dump(df_books,open('books.pkl', 'wb'))