In [1]:
#Importing the relevant packages
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
#Loading the relevant Datasets
books = pd.read_csv('Dataframes/books.csv', encoding='utf-8-sig')
ratings = pd.read_csv('Dataframes/ratings.csv', encoding='utf-8-sig')
to_read = pd.read_csv('Dataframes/to_read.csv', encoding='utf-8-sig')

## Data Preprocessing

### Preprocessing books dataframe

In [3]:
books.shape

(10000, 23)

In [4]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
books = books[['id','title','authors']]

In [6]:
books.rename(columns = {'id':'book_id'}, inplace = True) 

In [7]:
books.head()

Unnamed: 0,book_id,title,authors
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,3,"Twilight (Twilight, #1)",Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


In [8]:
#Removing the subtitle-title inside brackets from the 'title' column
books['title'] = books.title.str.replace(r"\(.*\)",'')

In [9]:
#Remove white space at the end of string
books.title = books.title.str.rstrip()

In [10]:
books.head()

Unnamed: 0,book_id,title,authors
0,1,The Hunger Games,Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré"
2,3,Twilight,Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


### Preprocessing ratings dataframe

In [11]:
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [12]:
ratings.shape

(981756, 3)

In [13]:
#Removing duplicate records
ratings = ratings.drop_duplicates(subset=['book_id', 'user_id'], keep='last')

In [14]:
ratings.shape

(979478, 3)

In [15]:
#Getting unique users count
number_of_users = ratings['user_id'].unique()
len(number_of_users)

53424

In [16]:
#Getting unique books count
number_of_books = ratings['book_id'].unique()
len(number_of_books)

10000

### Let's start to recommend new books to particular user:

In [17]:
#Add user id to get recommended books
df = pd.DataFrame()
df = ratings[ratings['user_id'] == 314]
df.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
200,3,314,3
400,5,314,4
501,6,314,5
1100,12,314,4


In [18]:
#Merging to get the book Id.
books_info = pd.merge(df, books, on='book_id', how='inner')
books_info = books_info[['book_id', 'title', 'rating']].sort_values(by='book_id')
books_info.head()

Unnamed: 0,book_id,title,rating
0,1,The Hunger Games,5
1,3,Twilight,3
2,5,The Great Gatsby,4
3,6,The Fault in Our Stars,5
4,12,Divergent,4


In [19]:
#Get other users that rated same books rated by inserted user
other_users = ratings[ratings['book_id'].isin(books_info['book_id'].values)]
other_users.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [20]:
#Group up the rows by user id
user_subset = other_users.groupby(['user_id'])

In [21]:
#Let's sort these groups too, so users who read common books with input have a higher priority
user_subset = sorted(user_subset, key=lambda x: len(x[1]), reverse=True)

In [22]:
user_subset[1]

(21228,
         book_id  user_id  rating
 39            1    21228       5
 250           3    21228       3
 448           5    21228       4
 537           6    21228       4
 1138         12    21228       5
 1650         17    21228       5
 1948         20    21228       5
 2156         22    21228       4
 2852         29    21228       4
 3540         36    21228       3
 4041         41    21228       4
 4350         44    21228       3
 4556         46    21228       2
 4644         47    21228       5
 4864         49    21228       2
 5161         52    21228       3
 5558         56    21228       3
 6549         66    21228       4
 6844         69    21228       4
 6956         70    21228       4
 9031         91    21228       3
 9748         98    21228       5
 10155       102    21228       4
 10247       103    21228       4
 10446       105    21228       4
 11941       120    21228       3
 12458       125    21228       3
 13758       138    21228       4
 13950

In [23]:
user_subset[1][0]

21228

In [24]:
#Dataframe of that top user group
user_subset[1][1]

Unnamed: 0,book_id,user_id,rating
39,1,21228,5
250,3,21228,3
448,5,21228,4
537,6,21228,4
1138,12,21228,5
1650,17,21228,5
1948,20,21228,5
2156,22,21228,4
2852,29,21228,4
3540,36,21228,3


In [25]:
#Count of other users
len(user_subset)

4735

In [26]:
#Let get to other users. This limit(0-100) is set because we do not want to waste too much time on every user
top_users = user_subset[1:101]

In [27]:
from scipy.stats import pearsonr

pearson_corr = {}

#For every user group in our top_users subset
for user_id, group in top_users:
    #Let's start by sorting the input and current user group
    group = group.sort_values(by='book_id')
    books_info = books_info.sort_values(by='book_id')
    
    nratings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp = books_info[books_info['book_id'].isin(group['book_id'].tolist())]
    
    if nratings<2:
        print(books_info['book_id'].isin(group['book_id'].tolist()))
        continue
    
    #Store them in a temporary variable
    new_user_ratings = temp['rating'].tolist()
    #Store the current user group ratings
    user_ratings = group['rating'].tolist()
    
    corr = pearsonr(new_user_ratings, user_ratings)
    pearson_corr[user_id] = corr[0]

In [28]:
#Converting the output to a dataframe
pearson_df = pd.DataFrame(columns=['user_id', 'similarity_value'], data=pearson_corr.items())
pearson_df.head()

Unnamed: 0,user_id,similarity_value
0,21228,0.330086
1,11927,0.183535
2,2077,0.24765
3,48482,0.2765
4,45269,0.266343


In [29]:
# Get top50 users with the highest similarity values
top_users = pearson_df.sort_values(by='similarity_value', ascending=False)[0:50]
top_users.head()

Unnamed: 0,user_id,similarity_value
76,588,0.7866
57,16913,0.646621
13,48687,0.642461
84,40126,0.640989
52,8060,0.611456


In [30]:
#Getting the book and rating of top users
top_users_rating = top_users.merge(ratings, left_on='user_id', right_on='user_id', how='inner')
top_users_rating.head()

Unnamed: 0,user_id,similarity_value,book_id,rating
0,588,0.7866,1,5
1,588,0.7866,3,1
2,588,0.7866,8,3
3,588,0.7866,9,3
4,588,0.7866,11,4


In [31]:
#Multiplies the similarity by the user's ratings to get weighted rating
top_users_rating['weighted_rating'] = top_users_rating['similarity_value']*top_users_rating['rating']
top_users_rating.head()

Unnamed: 0,user_id,similarity_value,book_id,rating,weighted_rating
0,588,0.7866,1,5,3.932999
1,588,0.7866,3,1,0.7866
2,588,0.7866,8,3,2.359799
3,588,0.7866,9,3,2.359799
4,588,0.7866,11,4,3.146399


In [32]:
#Getting the sum of similarity value and weighted rating by book id
top_users_rating = top_users_rating.groupby('book_id').sum()[['similarity_value','weighted_rating']]
top_users_rating.columns = ['sum_similarity_value','sum_weighted_rating']
top_users_rating.head()

Unnamed: 0_level_0,sum_similarity_value,sum_weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.747763,48.329555
2,7.739889,30.272463
3,9.449594,26.611058
4,8.238325,36.66234
5,5.98594,25.039619


In [33]:
recommendation_df = pd.DataFrame()
#Now we take the weighted average by book id
recommendation_df['weighted_average_score'] = top_users_rating['sum_weighted_rating']/top_users_rating['sum_similarity_value']
recommendation_df['book_id'] = top_users_rating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted_average_score,book_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.496708,1
2,3.911227,2
3,2.816106,3
4,4.450218,4
5,4.183072,5


In [34]:
#Ordering the book by weighted average score
recommendation_df = recommendation_df[recommendation_df['weighted_average_score'] >= 3.0]
recommendation_df = recommendation_df.sort_values(by='weighted_average_score', ascending=False).reset_index(drop=True)
recommendation_df.head()

Unnamed: 0,weighted_average_score,book_id
0,5.0,8609
1,5.0,2852
2,5.0,2513
3,5.0,359
4,5.0,2225


In [35]:
#Finally recommended books for the inputted user
rec = books.loc[books['book_id'].isin(recommendation_df['book_id'].tolist())].reset_index(drop=True)
rec.head()

Unnamed: 0,book_id,title,authors
0,1,The Hunger Games,Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré"
2,4,To Kill a Mockingbird,Harper Lee
3,5,The Great Gatsby,F. Scott Fitzgerald
4,6,The Fault in Our Stars,John Green


### Evaluate Accuracy

In [36]:
#Get recommended books for the particular user from given df
to_read_df = pd.DataFrame()
to_read_df = to_read[to_read['user_id'] == 314]
to_read_df

Unnamed: 0,user_id,book_id
3345,314,11
3346,314,476
3347,314,495
3348,314,6876


In [37]:
 #Compare both lists of recommended books and get the matching books
rec.loc[rec['book_id'].isin(to_read_df['book_id'].tolist())].reset_index(drop=True)

Unnamed: 0,book_id,title,authors
0,11,The Kite Runner,Khaled Hosseini
1,476,The World According to Garp,John Irving
2,495,A Heartbreaking Work of Staggering Genius,Dave Eggers


In [38]:
accuracy = len(rec.loc[rec['book_id'].isin(to_read_df['book_id'].tolist())].reset_index(drop=True)) / len(to_read_df)
accuracy

0.75