In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [5]:
df_books = pd.read_csv('clubs_book.csv', encoding='latin1')
df_users = pd.read_csv('clubs_user.csv', encoding='latin1')
df_ratings = pd.read_csv('ratings.csv', encoding='latin1', sep=';')

### Cleaning all data

### df books

In [6]:
df_books.head(10)

Unnamed: 0,id,ISBN,title,author,publisher,publication_year
0,1,393310779,The Greek Way,Edith Hamilton,W. W. Norton & Company,1993
1,2,399136487,Damia (Rowan),Anne McCaffrey,Putnam Pub Group,1992
2,3,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,Lone Pine Publishing,1999
3,4,1402201435,The One True Ocean,Sarah Beth Martin,Sourcebooks Landmark,2003
4,5,60926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,Perennial,1995
5,6,844239062,Astrology (Teach Yourself Books),Jeff Mayo,McGraw-Hill,1992
6,7,425068145,Promise Me the Dawn,Jill Gregory,Berkley Pub Group,1984
7,8,373078188,"Wife, Mother ... Lover? (Harlequin Silhouette ...",Sally Tyler Hayes,Silhouette,1997
8,9,446604623,A King's Cutter (Nathaniel Drinkwater),Richard Woodman,Warner Books (Mm),1997
9,10,1585671274,South from the Limpopo: Travels Through South ...,Dervla Murphy,Overlook Press,2001


In [7]:
# columns to keep
df_books_cleaned = df_books[['ISBN','title', 'author','publication_year','publisher']]

In [8]:
df_books_cleaned.head()

Unnamed: 0,ISBN,title,author,publication_year,publisher
0,393310779,The Greek Way,Edith Hamilton,1993,W. W. Norton & Company
1,399136487,Damia (Rowan),Anne McCaffrey,1992,Putnam Pub Group
2,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,1999,Lone Pine Publishing
3,1402201435,The One True Ocean,Sarah Beth Martin,2003,Sourcebooks Landmark
4,60926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,1995,Perennial


In [9]:
# clean all rows that contained some nan
df_books_cleaned = df_books_cleaned.dropna(how='any')

In [10]:
df_books_cleaned.head()

Unnamed: 0,ISBN,title,author,publication_year,publisher
0,393310779,The Greek Way,Edith Hamilton,1993,W. W. Norton & Company
1,399136487,Damia (Rowan),Anne McCaffrey,1992,Putnam Pub Group
2,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,1999,Lone Pine Publishing
3,1402201435,The One True Ocean,Sarah Beth Martin,2003,Sourcebooks Landmark
4,60926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,1995,Perennial


In [11]:
# save books that are published between 1950 to 2016

df_books_cleaned = df_books_cleaned[(df_books_cleaned['publication_year'] > 1950) & (df_books_cleaned['publication_year'] <= 2016)]

In [12]:
df_books_cleaned[['publication_year']].describe()

Unnamed: 0,publication_year
count,491.0
mean,1993.250509
std,7.814976
min,1963.0
25%,1988.0
50%,1995.0
75%,1999.5
max,2004.0


In [13]:
# regex-based replacement of certain characters
df_books_cleaned['author'] = df_books_cleaned['author'].str.replace(r'[^\w\s]+', '')

  df_books_cleaned['author'] = df_books_cleaned['author'].str.replace(r'[^\w\s]+', '')


### df users

In [14]:
# cleaned all rows that contained some nan
# df_users_cleaned = df_users.dropna(how='any')

In [15]:
df_users.head()

Unnamed: 0,password,last_login,is_superuser,is_staff,is_active,date_joined,username,id,first_name,last_name,age,email,bio,country
0,Password123,,0,0,1,2022-02-18,@SylviaMiller109044,109044,Sylvia,Miller,64,Sylvia.Miller109044@example.com,Ea optio.,"oklahoma city, oklahoma, usa"
1,Password123,,0,0,1,2022-02-18,@StaceyBegum185532,185532,Stacey,Begum,147,Stacey.Begum185532@example.com,Error.,"san fracisco, california, usa"
2,Password123,,0,0,1,2022-02-18,@DorothyStone145929,145929,Dorothy,Stone,105,Dorothy.Stone145929@example.com,Inventore.,"bad homburg, hessen, germany"
3,Password123,,0,0,1,2022-02-18,@EdwardBradley170814,170814,Edward,Bradley,42,Edward.Bradley170814@example.com,Aliquam.,"messina, sicilia, italy"
4,Password123,,0,0,1,2022-02-18,@JanetSmart104159,104159,Janet,Smart,22,Janet.Smart104159@example.com,Suscipit.,"forest lake, minnesota, usa"


In [16]:
# clean rows with crazy age
df_users_cleaned = df_users[df_users['age']<=100]
df_users_cleaned 

Unnamed: 0,password,last_login,is_superuser,is_staff,is_active,date_joined,username,id,first_name,last_name,age,email,bio,country
0,Password123,,0,0,1,2022-02-18,@SylviaMiller109044,109044,Sylvia,Miller,64,Sylvia.Miller109044@example.com,Ea optio.,"oklahoma city, oklahoma, usa"
3,Password123,,0,0,1,2022-02-18,@EdwardBradley170814,170814,Edward,Bradley,42,Edward.Bradley170814@example.com,Aliquam.,"messina, sicilia, italy"
4,Password123,,0,0,1,2022-02-18,@JanetSmart104159,104159,Janet,Smart,22,Janet.Smart104159@example.com,Suscipit.,"forest lake, minnesota, usa"
7,Password123,,0,0,1,2022-02-18,@FionaHill1300,1300,Fiona,Hill,20,Fiona.Hill1300@example.com,Aut.,"wauwatosa, wisconsin, usa"
8,Password123,,0,0,1,2022-02-18,@MitchellMiller217681,217681,Mitchell,Miller,89,Mitchell.Miller217681@example.com,Repellat.,"oak hill, virginia, usa"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493,Password123,,0,0,1,2022-02-18,@PamelaEdwards6326,6326,Pamela,Edwards,99,Pamela.Edwards6326@example.com,Iusto.,"baxter, tennessee, usa"
494,Password123,,0,0,1,2022-02-18,@GeorgeBryan47409,47409,George,Bryan,91,George.Bryan47409@example.com,Ut aut.,"vancouver, british columbia, canada"
497,Password123,,0,0,1,2022-02-18,@DonaldTurner214054,214054,Donald,Turner,94,Donald.Turner214054@example.com,Est.,"bern, bern, switzerland"
498,Password123,,0,0,1,2022-02-18,@MarcRose61447,61447,Marc,Rose,21,Marc.Rose61447@example.com,Nemo.,"candler, north carolina, usa"


### df ratings

In [17]:
# eliminate nans first
df_ratings_cleaned = df_ratings.dropna(how='any')

In [18]:
# remove zero rating 
df_ratings_cleaned.drop(df_ratings_cleaned[df_ratings_cleaned ['Book-Rating'] == 0].index, inplace = True)

In [19]:
# ratings distribution 
df_ratings_cleaned.groupby(['Book-Rating']).count() / len(df_ratings_cleaned) * 100 

Unnamed: 0_level_0,User-ID,ISBN
Book-Rating,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.408144,0.408144
2,0.636197,0.636197
3,1.382615,1.382615
4,2.053169,2.053169
5,11.754072,11.754072
6,8.514288,8.514288
7,17.630185,17.630185
8,23.920437,23.920437
9,15.574249,15.574249
10,18.126644,18.126644


In [20]:
# ratings by count
df_ratings_cleaned.groupby(['Book-Rating']).count()

Unnamed: 0_level_0,User-ID,ISBN
Book-Rating,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1770,1770
2,2759,2759
3,5996,5996
4,8904,8904
5,50974,50974
6,36924,36924
7,76457,76457
8,103736,103736
9,67541,67541
10,78610,78610


## Model 1

#### top books and authors

In [21]:
df_books_cleaned

Unnamed: 0,ISBN,title,author,publication_year,publisher
0,0393310779,The Greek Way,Edith Hamilton,1993,W. W. Norton & Company
1,0399136487,Damia (Rowan),Anne McCaffrey,1992,Putnam Pub Group
2,1551051729,Ghost Stories and Mysterious Creatures of Brit...,Barbara Smith,1999,Lone Pine Publishing
3,1402201435,The One True Ocean,Sarah Beth Martin,2003,Sourcebooks Landmark
4,0060926546,"Murder on a Kibbutz : Communal Case, A",Batya Gur,1995,Perennial
...,...,...,...,...,...
495,0394586131,Callanetics Countdown,Callan Pinckney,1990,Random House
496,0375702008,Mal de amores,Angeles Mastretta,1998,Vintage Books USA
497,0373286856,"Stark Lightning (Harlequin Historical, No. 85)",Elaine Rome,1991,Harlequin
498,0802135234,Picasso at the Lapin Agile and Other Plays : P...,Steve Martin,1997,Grove Press


##### top 20 highest rated books

In [22]:

# top 20 highest rated books 

top_20_books = df_ratings_cleaned.groupby('ISBN').agg(['mean', 'count'])['Book-Rating'].reset_index()

# generate score based on mean rating and total number of times the book is rated
minVotes = top_20_books['count'].quantile(0.10) # minimum votes required to be listed in the Top 250
top_20_books = top_20_books[top_20_books['count']>minVotes]
print('minimum votes = ', minVotes)
print(top_20_books.shape)
R = top_20_books['mean'] # average for the book (mean) = (Rating)
v = top_20_books['count'] # number of votes for the book = (votes)
C = top_20_books['mean'].mean() # mean vote across all books
top_20_books['weighted rating'] = (v/(v+minVotes))*R + (minVotes/(v+minVotes))*C
top_20_books = top_20_books.sort_values('weighted rating', ascending=False).reset_index(drop=True)

# get title of books
top_20_books = pd.merge(top_20_books, df_books_cleaned, on='ISBN')[['title', 'author', 'mean', 'count', 'weighted rating', 
                              'publication_year']].drop_duplicates('title').iloc[:20]
top_20_books

minimum votes =  1.0
(56352, 3)


Unnamed: 0,title,author,mean,count,weighted rating,publication_year
0,The Killer Angels: A Novel,Michael Shaara,10.0,3,9.391798,1993
1,The Bronze Horseman: A Novel,Paullina Simons,10.0,3,9.391798,2001
2,Zen and the Art of Motorcycle Maintenance,Robert M Pirsig,9.75,4,9.313438,1981
3,A Whisper of Eternity,Amanda Ashley,10.0,2,9.189063,2004
4,The Fortunate Four: Other Journeys of the Heart,Joy Kuby,10.0,2,9.189063,2003
5,Masks of the Illuminati,ROBERT A WILSON,10.0,2,9.189063,1990
6,Little Women (Illustrated Junior Library),Louisa May Alcott,9.0,10,8.869745,1983
7,The Fairy Rebel,Lynne Reid Banks,9.5,2,8.85573,1989
8,Past Reason Hated: An Inspector Banks Mystery,Peter Robinson,9.0,3,8.641798,2000
9,Some of the Parts,T Cooper,9.0,2,8.522397,2002


In [23]:
# top 20 highest rated authors

# drop any duplicates
df_books_cleaned = df_books_cleaned.drop_duplicates(['author', 'title'])

# get book-author and title
highest_rated_author = pd.merge(df_books_cleaned, df_ratings_cleaned, on='ISBN')[['author', 'Book-Rating', 'title', 'ISBN']]

highest_rated_author = highest_rated_author.groupby('author').agg(['mean', 'count'])['Book-Rating'].reset_index()

# generate score based on mean rating and total number of times the author is rated
m = highest_rated_author['count'].quantile(0.6) # minimum votes required to be listed in the Top 250
highest_rated_author = highest_rated_author[highest_rated_author['count']>m]
print('minimum votes =', m)
print(highest_rated_author.shape)
R = highest_rated_author['mean'] # average for the author (mean) = (Rating)
v = highest_rated_author['count'] # number of votes for the author = (votes)
C = highest_rated_author['mean'].mean() # mean vote across all authors
highest_rated_author['weighted rating'] = (v/(v+m))*R + (m/(v+m))*C
highest_rated_author = highest_rated_author.sort_values('weighted rating', ascending=False).reset_index(drop=True)

highest_rated_author.iloc[:20]


minimum votes = 1.0
(91, 3)


  highest_rated_author = highest_rated_author.groupby('author').agg(['mean', 'count'])['Book-Rating'].reset_index()


Unnamed: 0,author,mean,count,weighted rating
0,Michael Shaara,10.0,3,9.365923
1,Paullina Simons,10.0,3,9.365923
2,Robert M Pirsig,9.75,4,9.292738
3,Amanda Ashley,10.0,2,9.154564
4,Joy Kuby,10.0,2,9.154564
5,ROBERT A WILSON,10.0,2,9.154564
6,Louisa May Alcott,9.0,10,8.860336
7,Lynne Reid Banks,9.5,2,8.82123
8,Peter Robinson,9.0,3,8.615923
9,Betty Crocker,8.666667,6,8.494813


In [24]:
# top 20 as list 

# highest_rated_author.values.tolist()

## Model 2 - collaborative filtering 

#### Based on records from various users provide recommendations based on user similarities

In [25]:
# merge ratings and books to get book titles and drop rows for which title is not available
df_books_ratings = pd.merge(df_ratings_cleaned, df_books_cleaned, on='ISBN')

# get total counts of no. of occurrence of book
df_books_ratings['count'] = df_books_ratings.groupby('ISBN').transform('count')['User-ID']

# fetch top 100 books based on count
isbn = df_books_ratings.drop_duplicates('ISBN').sort_values(
    'count', ascending=False).iloc[:100]['ISBN']

# filter out data as per the ISBN
df_books_ratings = df_books_ratings[df_books_ratings['ISBN'].isin(isbn)].reset_index(drop=True)

In [26]:
# create a user-book rating matrix
matrix = df_books_ratings.pivot(index='User-ID', columns='ISBN', values='Book-Rating')
matrix.head()

ISBN,0028603958,0060199261,0060199865,006105206X,0061083089,0099282194,0140285563,0195108973,0198319746,0310235138,...,1586212044,1592864996,1592980074,1853260126,185326217X,188845136X,3442421705,8408022466,8420429902,950511446X
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3167,,,,,,,,,,,...,,,,,,,,,,
4197,,,,,,,,,,,...,,,,,,,,,,
5499,,,,,,,,,,,...,,,,,,,,,,
7418,,,,,,,,,,,...,,,,,,,,,,
8936,,,,,,,,,,,...,,,,,,,,,,


In [27]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import train_test_split, cross_validate  

In [28]:
reader = Reader(rating_scale=(0, 10))
surprise_data = Dataset.load_from_df(df_books_ratings[['User-ID', 'ISBN', 'Book-Rating']], reader)
trainset, testset = train_test_split(surprise_data, test_size=0.25) 


<surprise.trainset.Trainset at 0x7fcc652181c0>

In [29]:
benchmark = []
# Iterate over all algorithms
for algorithm in [SVD()]:
    # Perform cross validation
    results = cross_validate(algorithm, surprise_data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

  tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))


Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,1.864433,0.027152,0.001106


In [30]:
svd = SVD() 
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fcc65218700>

## create predictions

In [31]:

index_val = 111
# get user id
userId = matrix.index[index_val]
books = []
ratings = [] 
titles = []

for isbn in matrix.iloc[index_val][matrix.iloc[index_val].isna()].index:
    books.append(isbn)
    title = df_books_ratings[df_books_ratings['ISBN']==isbn]['title'].values[0]
    titles.append(title)
    ratings.append(svd.predict(userId, isbn).est)

prediction = pd.DataFrame({'ISBN':books, 'title':titles, 'rating':ratings, 'userId':userId})  
prediction = prediction.sort_values('rating', ascending=False).iloc[:10].reset_index(drop=True)

# get other highly rated books by user
temp = df_books_ratings[df_books_ratings['User-ID']==matrix.index[index_val]].sort_values(
    'Book-Rating', ascending=False)[['Book-Rating', 'title', 'User-ID']].iloc[:10].reset_index(drop=True)
prediction['Book Read'] = temp['title']
prediction['Rated']= temp['Book-Rating']
prediction

Unnamed: 0,ISBN,title,rating,userId,Book Read,Rated
0,0679425411,The Killer Angels: A Novel,8.150466,93047,The Story of the House of Wooden Santas,8.0
1,1551669153,Split Second (Maggie O'Dell Novels (Paperback)),8.107344,93047,,
2,0553244582,Zen and the Art of Motorcycle Maintenance,8.015314,93047,,
3,0448060191,Little Women (Illustrated Junior Library),8.012882,93047,,
4,0380733285,Past Reason Hated: An Inspector Banks Mystery,8.011172,93047,,
5,0028603958,Betty Crocker's New Cookbook: Everything You N...,8.00805,93047,,
6,0517542099,The Hitchhiker's Guide to the Galaxy,8.0009,93047,,
7,0553289322,Scandal,7.948336,93047,,
8,0399136487,Damia (Rowan),7.928162,93047,,
9,055357003X,The Fowlers of Sweet Valley (Sweet Valley Saga),7.911012,93047,,


## Model 3 

### Using mean of other user's weighted ratings based on similarity matrix

In [32]:
from sklearn.metrics.pairwise import cosine_similarity 

In [33]:
# replace NaN with user based average rating in pivot (matrix) dataframe
matrix_imputed = matrix.fillna(matrix.mean(axis=0))

# get similarity between all users
similarity_matrix = cosine_similarity(matrix_imputed.values)

In [34]:
def get_recommendation(user_index):
    idx = user_index
    sim_scores = list(enumerate(similarity_matrix[idx]))

    # get books that are unrated by the given user
    unrated_books = matrix.iloc[idx][matrix.iloc[idx].isna()].index

    # get weighted ratings of unrated books by all other users
    book_ratings = (matrix[unrated_books].T * similarity_matrix[idx]).T

    # get top 100 similar users by skipping the current user
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:101]

    # get mean of book ratings by top 100 most similar users for the unrated books
    book_ratings = book_ratings.iloc[[x[0] for x in sim_scores]].mean()
    
    # get rid of null values and sort it based on ratings
    book_ratings = book_ratings.reset_index().dropna().sort_values(0, ascending=False).iloc[:10]
    
    # get recommended book titles in sorted order
    recommended_books = df_books_ratings[df_books_ratings['ISBN'].isin(book_ratings['ISBN'])][['ISBN', 'title']]
    recommended_books = recommended_books.drop_duplicates('ISBN').reset_index(drop=True)
    assumed_ratings = book_ratings[0].reset_index(drop=True)


    return pd.DataFrame({'ISBN':recommended_books['ISBN'], 
                         'Recommended Book':recommended_books['title'], 
                         'Assumed Rating':assumed_ratings})

In [35]:

user_index = 211

recommended_books = get_recommendation(user_index)

# get other highly rated books by user
temp = df_books_ratings[df_books_ratings['User-ID']==matrix.index[user_index]].sort_values(
    'Book-Rating', ascending=False)[['Book-Rating', 'title', 'User-ID']].iloc[:10].reset_index(drop=True)
recommended_books['userId'] = temp['User-ID']
recommended_books['Book Read'] = temp['title']
recommended_books['Rated']= temp['Book-Rating']
recommended_books

Unnamed: 0,ISBN,Recommended Book,Assumed Rating,userId,Book Read,Rated
0,0380733285,Past Reason Hated: An Inspector Banks Mystery,9.999782,174642.0,Time Out for Mom . . . Ahhh Moments,7.0
1,0679425411,The Killer Angels: A Novel,9.999782,,,
2,044050306X,Masks of the Illuminati,9.999782,,,
3,0553244582,Zen and the Art of Motorcycle Maintenance,9.999782,,,
4,0060199261,The Bronze Horseman: A Novel,9.999782,,,
5,0821773801,Midnight Sun,9.999782,,,
6,0821775294,A Whisper of Eternity,9.999731,,,
7,0380706504,The Fairy Rebel,9.999572,,,
8,0809448777,Wild Animals (A Child's First Library of Learn...,8.999804,,,
9,1592980074,The Fortunate Four: Other Journeys of the Heart,8.999464,,,
