In [1]:
import os
import random
import requests

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics.pairwise import cosine_similarity
from PIL import Image

import warnings
warnings.filterwarnings('ignore')

In [2]:
books = pd.read_csv('Books.csv')
users = pd.read_csv('Users.csv')
ratings = pd.read_csv('Ratings.csv')

In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
users.head()

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [6]:
# checking the shape of our three dataframes
print(f'Book data shape: {books.shape}')
print(f'User data shape: {users.shape}')
print(f'Rating data shape: {ratings.shape}')

Book data shape: (271360, 8)
User data shape: (278858, 3)
Rating data shape: (1149780, 3)


In [7]:
# checking the dataframes for empty values
print(f'Empty values in "Books" data:-\n\n{books.isna().sum()}\n')
print(f'Empty values in "Users" data:-\n\n{users.isna().sum()}\n')
print(f'Empty values in "Ratings" data:-\n\n{ratings.isna().sum()}\n')

Empty values in "Books" data:-

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

Empty values in "Users" data:-

User-ID          0
Location         0
Age         110762
dtype: int64

Empty values in "Ratings" data:-

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64



In [8]:
book_rating = books.merge(ratings, on = 'ISBN')
book_rating.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,2,0
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,8,5
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11400,0
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,11676,8
4,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,41385,0


In [9]:
book_rating['Book-Title']

0                                        Classical Mythology
1                                               Clara Callan
2                                               Clara Callan
3                                               Clara Callan
4                                               Clara Callan
                                 ...                        
1031131                           There's a Bat in Bunk Five
1031132                              From One to One Hundred
1031133    Lily Dale : The True Story of the Town that Ta...
1031134                          Republic (World's Classics)
1031135    A Guided Tour of Rene Descartes' Meditations o...
Name: Book-Title, Length: 1031136, dtype: object

In [10]:
def clean_booktitle(title):
    return str(title).title().strip()
    
book_rating['Book-Title'] = book_rating['Book-Title'].apply(clean_booktitle)

In [11]:
book_rating['Book-Title']

0                                        Classical Mythology
1                                               Clara Callan
2                                               Clara Callan
3                                               Clara Callan
4                                               Clara Callan
                                 ...                        
1031131                           There'S A Bat In Bunk Five
1031132                              From One To One Hundred
1031133    Lily Dale : The True Story Of The Town That Ta...
1031134                          Republic (World'S Classics)
1031135    A Guided Tour Of Rene Descartes' Meditations O...
Name: Book-Title, Length: 1031136, dtype: object

In [12]:
print(f'Shape of dataset before filtering {book_rating.shape}')

# filtering out users that have casted less than 200 votes

filters = book_rating['User-ID'].map(book_rating['User-ID'].value_counts()) > 200
book_rating = book_rating[filters]


# filtering out books that have received less than 50 votes

filters = book_rating['Book-Title'].map(book_rating['Book-Title'].value_counts()) > 50
book_rating = book_rating[filters]

print(f'Shape of dataset after filtering {book_rating.shape}')

Shape of dataset before filtering (1031136, 10)
Shape of dataset after filtering (58685, 10)


# Popularity Based Recommender System

This system will present the most popular n-books of the time based on the integer 'n' fed by the user.

In [13]:
def get_n_most_popular_books(book_rating, n = 25):
    rating_count = book_rating.groupby('Book-Title').count()['Book-Rating'].reset_index()
    rating_count.rename(columns={'Book-Rating': 'Num Votes'}, inplace = True)
    
    rating_avg = book_rating.groupby('Book-Title')['Book-Rating'].mean().reset_index()
    rating_avg.rename(columns={'Book-Rating': 'Avg Ratings'}, inplace = True)
    
    popular_books = rating_count.merge(rating_avg, on = 'Book-Title')
    
    def weighted_score(df):
        v = df['Num Votes']
        R = df['Avg Ratings']
        
        return ((v*R) + (m*C)) / (v+m)
    
    C = popular_books['Avg Ratings'].mean()
    m = popular_books['Num Votes'].quantile(0.90)
    
    filters = popular_books['Num Votes'] >= m
    popular_books = popular_books[filters]
    
    popular_books['Popularity Score'] = popular_books.apply(weighted_score, axis = 1)
    popular_books = popular_books.sort_values(by = 'Popularity Score', ascending = False)
    
    columns = ['Book-Title', 'Avg Ratings', 'Num Votes', 'Popularity Score']
    print(f'Top {n} popular books')
    
    return popular_books[columns].reset_index(drop=True).head(n)

In [14]:
display(get_n_most_popular_books(book_rating, 10))

Top 10 popular books


Unnamed: 0,Book-Title,Avg Ratings,Num Votes,Popularity Score
0,Harry Potter And The Prisoner Of Azkaban (Book 3),4.414815,135,3.202962
1,Harry Potter And The Chamber Of Secrets (Book 2),3.988636,176,3.120212
2,To Kill A Mockingbird,3.690608,181,2.960723
3,A Wrinkle In Time,3.723077,130,2.833788
4,The Lovely Bones: A Novel,3.203065,261,2.784616
5,Harry Potter And The Sorcerer'S Stone (Harry P...,3.316384,177,2.735456
6,The Da Vinci Code,3.157407,216,2.70169
7,The Red Tent (Bestselling Backlist),3.083832,167,2.585134
8,The Secret Life Of Bees,2.848039,204,2.496362
9,"Tuesdays With Morrie: An Old Man, A Young Man,...",3.0,139,2.489907
