In [1]:
import pandas as pd
import numpy as np

In [2]:
df_books = pd.read_csv('clubs_book.csv', encoding='latin1')
df_users = pd.read_csv('clubs_user.csv', encoding='latin1')
df_ratings = pd.read_csv('ratings.csv', encoding='latin1', sep=';')

In [3]:
df_ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [4]:
df_books.head()


Unnamed: 0,id,ISBN,title,author,publisher,publication_year
0,1,393310779,The Greek Way,Edith Hamilton,W. W. Norton & Company,1993
1,2,399136487,Damia (Rowan),Anne McCaffrey,Putnam Pub Group,1992
2,3,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,Lone Pine Publishing,1999
3,4,1402201435,The One True Ocean,Sarah Beth Martin,Sourcebooks Landmark,2003
4,5,60926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,Perennial,1995


### Cleaning all data

### df books

In [5]:
df_books.head(10)

Unnamed: 0,id,ISBN,title,author,publisher,publication_year
0,1,393310779,The Greek Way,Edith Hamilton,W. W. Norton & Company,1993
1,2,399136487,Damia (Rowan),Anne McCaffrey,Putnam Pub Group,1992
2,3,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,Lone Pine Publishing,1999
3,4,1402201435,The One True Ocean,Sarah Beth Martin,Sourcebooks Landmark,2003
4,5,60926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,Perennial,1995
5,6,844239062,Astrology (Teach Yourself Books),Jeff Mayo,McGraw-Hill,1992
6,7,425068145,Promise Me the Dawn,Jill Gregory,Berkley Pub Group,1984
7,8,373078188,"Wife, Mother ... Lover? (Harlequin Silhouette ...",Sally Tyler Hayes,Silhouette,1997
8,9,446604623,A King's Cutter (Nathaniel Drinkwater),Richard Woodman,Warner Books (Mm),1997
9,10,1585671274,South from the Limpopo: Travels Through South ...,Dervla Murphy,Overlook Press,2001


In [6]:
# clean all rows that contained some nan
df_books = df_books.dropna(how='any')

### df users

In [7]:
# cleaned all rows that contained some nan
# df_users = df_users.dropna(how='any')

In [8]:
df_users.head()

Unnamed: 0,password,last_login,is_superuser,is_staff,is_active,date_joined,username,id,first_name,last_name,age,email,bio,country
0,Password123,,0,0,1,2022-02-18,@SylviaMiller109044,109044,Sylvia,Miller,64,Sylvia.Miller109044@example.com,Ea optio.,"oklahoma city, oklahoma, usa"
1,Password123,,0,0,1,2022-02-18,@StaceyBegum185532,185532,Stacey,Begum,147,Stacey.Begum185532@example.com,Error.,"san fracisco, california, usa"
2,Password123,,0,0,1,2022-02-18,@DorothyStone145929,145929,Dorothy,Stone,105,Dorothy.Stone145929@example.com,Inventore.,"bad homburg, hessen, germany"
3,Password123,,0,0,1,2022-02-18,@EdwardBradley170814,170814,Edward,Bradley,42,Edward.Bradley170814@example.com,Aliquam.,"messina, sicilia, italy"
4,Password123,,0,0,1,2022-02-18,@JanetSmart104159,104159,Janet,Smart,22,Janet.Smart104159@example.com,Suscipit.,"forest lake, minnesota, usa"


### df ratings

In [9]:
# eliminate nans first
df_ratings = df_ratings.dropna(how='any')

In [10]:
# remove zero rating 
df_ratings.drop(df_ratings[df_ratings['Book-Rating'] == 0].index, inplace = True)

In [11]:
# ratings distribution 
df_ratings.groupby(['Book-Rating']).count() / len(df_ratings) * 100 

Unnamed: 0_level_0,User-ID,ISBN
Book-Rating,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.408144,0.408144
2,0.636197,0.636197
3,1.382615,1.382615
4,2.053169,2.053169
5,11.754072,11.754072
6,8.514288,8.514288
7,17.630185,17.630185
8,23.920437,23.920437
9,15.574249,15.574249
10,18.126644,18.126644


## Model 1

#### top books and authors

In [12]:
df_books

Unnamed: 0,id,ISBN,title,author,publisher,publication_year
0,1,0393310779,The Greek Way,Edith Hamilton,W. W. Norton & Company,1993
1,2,0399136487,Damia (Rowan),Anne McCaffrey,Putnam Pub Group,1992
2,3,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,Lone Pine Publishing,1999
3,4,1402201435,The One True Ocean,Sarah Beth Martin,Sourcebooks Landmark,2003
4,5,0060926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,Perennial,1995
...,...,...,...,...,...,...
495,496,0394586131,Callanetics Countdown,Callan Pinckney,Random House,1990
496,497,0375702008,Mal de amores,Angeles Mastretta,Vintage Books USA,1998
497,498,0373286856,"Stark Lightning (Harlequin Historical, No. 85)",Elaine Rome,Harlequin,1991
498,499,0802135234,Picasso at the Lapin Agile and Other Plays : P...,Steve Martin,Grove Press,1997


##### top 20 highest rated books

In [13]:

# top 20 highest rated books 

top_20_books = df_ratings.groupby('ISBN').agg(['mean', 'count'])['Book-Rating'].reset_index()

# generate score based on mean rating and total number of times the book is rated
minVotes = top_20_books['count'].quantile(0.10) # minimum votes required to be listed in the Top 250
top_20_books = top_20_books[top_20_books['count']>minVotes]
print('minimum votes = ', minVotes)
print(top_20_books.shape)
R = top_20_books['mean'] # average for the book (mean) = (Rating)
v = top_20_books['count'] # number of votes for the book = (votes)
C = top_20_books['mean'].mean() # mean vote across all books
top_20_books['weighted rating'] = (v/(v+minVotes))*R + (minVotes/(v+minVotes))*C
top_20_books = top_20_books.sort_values('weighted rating', ascending=False).reset_index(drop=True)

# get title of books
top_20_books = pd.merge(top_20_books, df_books, on='ISBN')[['title', 'author', 'mean', 'count', 'weighted rating', 
                              'publication_year']].drop_duplicates('title').iloc[:20]
top_20_books

minimum votes =  1.0
(56352, 3)


Unnamed: 0,title,author,mean,count,weighted rating,publication_year
0,The Killer Angels: A Novel,Michael Shaara,10.0,3,9.391798,1993
1,The Bronze Horseman: A Novel,Paullina Simons,10.0,3,9.391798,2001
2,Zen and the Art of Motorcycle Maintenance,Robert M Pirsig,9.75,4,9.313438,1981
3,A Whisper of Eternity,Amanda Ashley,10.0,2,9.189063,2004
4,The Fortunate Four: Other Journeys of the Heart,Joy Kuby,10.0,2,9.189063,2003
5,Masks of the Illuminati,ROBERT A. WILSON,10.0,2,9.189063,1990
6,Little Women (Illustrated Junior Library),Louisa May Alcott,9.0,10,8.869745,1983
7,The Fairy Rebel,Lynne Reid Banks,9.5,2,8.85573,1989
8,Past Reason Hated: An Inspector Banks Mystery,Peter Robinson,9.0,3,8.641798,2000
9,Some of the Parts,T Cooper,9.0,2,8.522397,2002


In [14]:
# top 20 highest rated authors

# drop any duplicates
df_books = df_books.drop_duplicates(['author', 'title'])

# get book-author and title
highest_rated_author = pd.merge(df_books, df_ratings, on='ISBN')[['author', 'Book-Rating', 'title', 'ISBN']]

highest_rated_author = highest_rated_author.groupby('author').agg(['mean', 'count'])['Book-Rating'].reset_index()

# generate score based on mean rating and total number of times the author is rated
m = highest_rated_author['count'].quantile(0.6) # minimum votes required to be listed in the Top 250
highest_rated_author = highest_rated_author[highest_rated_author['count']>m]
print('minimum votes =', m)
print(highest_rated_author.shape)
R = highest_rated_author['mean'] # average for the author (mean) = (Rating)
v = highest_rated_author['count'] # number of votes for the author = (votes)
C = highest_rated_author['mean'].mean() # mean vote across all authors
highest_rated_author['weighted rating'] = (v/(v+m))*R + (m/(v+m))*C
highest_rated_author = highest_rated_author.sort_values('weighted rating', ascending=False).reset_index(drop=True)

highest_rated_author.iloc[:20]


minimum votes = 1.0
(92, 3)


  highest_rated_author = highest_rated_author.groupby('author').agg(['mean', 'count'])['Book-Rating'].reset_index()


Unnamed: 0,author,mean,count,weighted rating
0,Paullina Simons,10.0,3,9.362333
1,Michael Shaara,10.0,3,9.362333
2,Robert M Pirsig,9.75,4,9.289867
3,Amanda Ashley,10.0,2,9.149778
4,ROBERT A. WILSON,10.0,2,9.149778
5,Joy Kuby,10.0,2,9.149778
6,Louisa May Alcott,9.0,10,8.85903
7,Lynne Reid Banks,9.5,2,8.816445
8,Peter Robinson,9.0,3,8.612333
9,Betty Crocker,8.666667,6,8.492762


## Model 2 - collaborative filtering 

#### Based on records from various users provide recommendations based on user similarities

In [15]:
# merge ratings and books to get book titles and drop rows for which title is not available
df_books_ratings = pd.merge(df_ratings, df_books, on='ISBN')

# get total counts of no. of occurrence of book
df_books_ratings['count'] = df_books_ratings.groupby('ISBN').transform('count')['User-ID']

# fetch top 100 books based on count
isbn = df_books_ratings.drop_duplicates('ISBN').sort_values(
    'count', ascending=False).iloc[:100]['ISBN']

# filter out data as per the ISBN
df_books_ratings = df_books_ratings[df_books_ratings['ISBN'].isin(isbn)].reset_index(drop=True)

In [16]:
df_books_ratings.head(20)

Unnamed: 0,User-ID,ISBN,Book-Rating,id,title,author,publisher,publication_year,count
0,278137,055325698X,8,363,The Proteus Operation,James P. Hogan,Spectra Books,1991,4
1,197659,055325698X,8,363,The Proteus Operation,James P. Hogan,Spectra Books,1991,4
2,211344,055325698X,6,363,The Proteus Operation,James P. Hogan,Spectra Books,1991,4
3,234828,055325698X,7,363,The Proteus Operation,James P. Hogan,Spectra Books,1991,4
4,278188,0671644475,5,166,The Conquest,Jude Deveraux,Pocket,1991,14
5,7418,0671644475,7,166,The Conquest,Jude Deveraux,Pocket,1991,14
6,11676,0671644475,9,166,The Conquest,Jude Deveraux,Pocket,1991,14
7,23872,0671644475,7,166,The Conquest,Jude Deveraux,Pocket,1991,14
8,45136,0671644475,5,166,The Conquest,Jude Deveraux,Pocket,1991,14
9,132558,0671644475,8,166,The Conquest,Jude Deveraux,Pocket,1991,14


In [17]:
def get_recommendation(user_index):
    idx = user_index
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get books that are unrated by the given user
    unrated_books = matrix.iloc[idx][matrix.iloc[idx].isna()].index

    # get weighted ratings of unrated books by all other users
    book_ratings = (matrix[unrated_books].T * similarity_matrix[idx]).T

    # get top 100 similar users by skipping the current user
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    # get mean of book ratings by top 100 most similar users for the unrated books
    book_ratings = book_ratings.iloc[[x[0] for x in sim_scores]].mean()
    
    # get rid of null values and sort it based on ratings
    book_ratings = book_ratings.reset_index().dropna().sort_values(0, ascending=False).iloc[:10]
    
    # get recommended book titles in sorted order
    recommended_books = df_books_ratings[df_books_ratings['ISBN'].isin(book_ratings['ISBN'])][['ISBN', 'title']]
    recommended_books = recommended_books.drop_duplicates('ISBN').reset_index(drop=True)
    assumed_ratings = book_ratings[0].reset_index(drop=True)


    return pd.DataFrame({'ISBN':recommended_books['ISBN'], 
                         'Recommended Book':recommended_books['title'], 
                         'Assumed Rating':assumed_ratings})

In [18]:

user_index = 211

recommended_books = get_recommendation(user_index)

# get other highly rated books by user
temp = df_books_ratings[df_books_ratings['User-ID']==matrix.index[user_index]].sort_values(
    'Book-Rating', ascending=False)[['Book-Rating', 'title', 'User-ID']].iloc[:10].reset_index(drop=True)
recommended_books['userId'] = temp['User-ID']
recommended_books['Book Read'] = temp['title']
recommended_books['Rated']= temp['Book-Rating']
recommended_books

NameError: name 'similarity_matrix' is not defined

In [None]:
recommended_books.head()

Unnamed: 0,ISBN,Recommended Book,Assumed Rating,userId,Book Read,Rated
0,0380733285,Past Reason Hated: An Inspector Banks Mystery,9.999782,174642.0,Time Out for Mom . . . Ahhh Moments,7.0
1,0679425411,The Killer Angels: A Novel,9.999782,,,
2,044050306X,Masks of the Illuminati,9.999782,,,
3,0553244582,Zen and the Art of Motorcycle Maintenance,9.999782,,,
4,0060199261,The Bronze Horseman: A Novel,9.999782,,,
