In [1]:
#Importing the relevant packages
import pandas as pd
import numpy as np

In [2]:
#Loading the relevant Datasets
books = pd.read_csv('Dataframes/books.csv', encoding='utf-8-sig')
ratings = pd.read_csv('Dataframes/ratings.csv', encoding='utf-8-sig')

## Data Preprocessing

### Preprocessing books dataframe

In [3]:
books.shape

(10000, 23)

In [4]:
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
books = books[['id','title','authors']]

In [6]:
books.rename(columns = {'id':'book_id'}, inplace = True) 

In [7]:
books.head()

Unnamed: 0,book_id,title,authors
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,3,"Twilight (Twilight, #1)",Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


In [8]:
#Removing the subtitle-title inside brackets from the 'title' column
books['title'] = books.title.str.replace(r"\(.*\)",'')

In [9]:
#Remove white space at the end of string
books.title = books.title.str.rstrip()

In [10]:
books.head()

Unnamed: 0,book_id,title,authors
0,1,The Hunger Games,Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré"
2,3,Twilight,Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


### Preprocessing ratings dataframe

In [11]:
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [12]:
ratings.shape

(981756, 3)

In [13]:
number_of_users = ratings['user_id'].unique()
len(number_of_users)

53424

In [14]:
number_of_books = ratings['book_id'].unique()
len(number_of_books)

10000

### Let's start to recommend new books to particular user:

In [15]:
#History of books read by user in the past
user_info = [
            {'title':'Harry Potter and the Prisoner of Azkaban', 'rating':4},
            {'title':'Harry Potter and the Half-Blood Prince', 'rating':5},
            {'title':'Harry Potter and the Deathly Hallows', 'rating':5},
            {'title':'Harry Potter and the Chamber of Secrets', 'rating':5},
            {'title':'The Jungle Book', 'rating':2}
         ] 
books_info = pd.DataFrame(user_info)
books_info

Unnamed: 0,title,rating
0,Harry Potter and the Prisoner of Azkaban,4
1,Harry Potter and the Half-Blood Prince,5
2,Harry Potter and the Deathly Hallows,5
3,Harry Potter and the Chamber of Secrets,5
4,The Jungle Book,2


In [16]:
#Add book_id to the inserted user info
#Filtering out the books by title
books_info_id = books[books['title'].isin(books_info['title'].tolist())]

#Merging to get the book Id.
books_info = pd.merge(books_info_id, books_info)

books_info

Unnamed: 0,book_id,title,authors,rating
0,18,Harry Potter and the Prisoner of Azkaban,"J.K. Rowling, Mary GrandPré, Rufus Beck",4
1,23,Harry Potter and the Chamber of Secrets,"J.K. Rowling, Mary GrandPré",5
2,25,Harry Potter and the Deathly Hallows,"J.K. Rowling, Mary GrandPré",5
3,27,Harry Potter and the Half-Blood Prince,"J.K. Rowling, Mary GrandPré",5
4,1327,The Jungle Book,Rudyard Kipling,2


In [17]:
#Obtaining a list of users who have read the same books
user_subset = ratings[ratings['book_id'].isin(books_info['book_id'].tolist())]
user_subset.head()

Unnamed: 0,book_id,user_id,rating
1700,18,439,4
1701,18,588,5
1702,18,1169,3
1703,18,2077,3
1704,18,2900,4


In [18]:
#Group up the rows by user id
user_subset_group = user_subset.groupby(['user_id'])

In [19]:
#Get one of the user subset data that grouped by user id
user_subset_group.get_group(439)

Unnamed: 0,book_id,user_id,rating
1700,18,439,4
2200,23,439,2
2600,27,439,5


In [20]:
len(user_subset_group.get_group(439))

3

In [21]:
#Let's sort these groups too, so users who read common books with input have a higher priority
user_subset_group = sorted(user_subset_group,  key=lambda x: len(x[1]), reverse=True)

In [22]:
#User 314 has read 4 out of 5 books
user_subset_group[0]

(18100,
         book_id  user_id  rating
 1747         18    18100       5
 2252         23    18100       4
 2441         25    18100       5
 2640         27    18100       4
 132606     1327    18100       5)

In [23]:
user_subset_group[0][0]

18100

In [24]:
#Dataframe of that top user group
user_subset_group[0][1]

Unnamed: 0,book_id,user_id,rating
1747,18,18100,5
2252,23,18100,4
2441,25,18100,5
2640,27,18100,4
132606,1327,18100,5


<img src="pearsons.gif">

In [25]:
#We select the subgroup of users. This limit is set because we do not want to waste too much time on every user
user_subset_group = user_subset_group[0:100]

In [26]:
pearson_correlation = {}

#For every user group in our subset
for name, group in user_subset_group:
    
    #Let's start by sorting the input and current user group
    group = group.sort_values(by='book_id')
    books_info = books_info.sort_values(by='book_id')
    
    nratings = len(group)
    
    #Get the review scores for the movies that they both have in common
    temp = books_info[books_info['book_id'].isin(group['book_id'].tolist())]
    
    #Store them in a temporary variable
    temp_ratinglist = temp['rating'].tolist()
    
    #Store the current user group ratings
    temp_grouplist = group['rating'].tolist()
    
    #Calculate the pearson correlation between two users, so called, x and y
    #For hard code based
    Sxx = sum([i**2 for i in temp_ratinglist]) - pow(sum(temp_ratinglist),2)/float(nratings)
    Syy = sum([i**2 for i in temp_grouplist]) - pow(sum(temp_grouplist),2)/float(nratings)
    Sxy = sum( i*j for i, j in zip(temp_ratinglist, temp_grouplist)) - sum(temp_ratinglist)*sum(temp_grouplist)/float(nratings)
    
    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearson_correlation[name] = Sxy/np.sqrt(Sxx*Syy)
    else:
        pearson_correlation[name] = 0

In [27]:
pearson_correlation.items()

dict_items([(18100, -0.5601120336112013), (21676, 0.9432422182837996), (32635, 0.505291152639912), (41074, -0.34299717028501714), (41318, 0.9432422182838017), (47476, 0.5601120336112013), (588, -0.5222329678670935), (3022, 0), (3922, 0), (5436, 0), (8167, -0.3333333333333333), (9011, -0.3333333333333333), (9731, 0), (9771, -0.5773502691896258), (10140, 0), (10249, 0), (10288, 0), (10751, -0.5222329678670935), (10944, 0), (11285, 0), (11408, 0), (11854, 0), (11927, -0.3333333333333333), (11999, 0), (12471, 0), (12476, -0.3333333333333333), (12946, 0.3333333333333333), (13544, 0), (14546, 0), (14936, 0), (15604, 0.5773502691896258), (15889, -0.8164965809277261), (16913, -0.3333333333333333), (17566, -0.3333333333333333), (18031, 0), (18313, 0), (18361, 0), (19724, 0), (19729, 0.3333333333333333), (19942, -0.3333333333333333), (20076, 0.5773502691896258), (21228, -0.5773502691896258), (21713, 0.3333333333333333), (23612, 0), (24499, 0.17407765595569785), (24845, -0.3333333333333333), (251

In [28]:
pearson_df = pd.DataFrame.from_dict(pearson_correlation, orient='index')
pearson_df.head()

Unnamed: 0,0
18100,-0.560112
21676,0.943242
32635,0.505291
41074,-0.342997
41318,0.943242


In [29]:
pearson_df.columns = ['similarity_value']
pearson_df['user_id'] = pearson_df.index
pearson_df.index = range(len(pearson_df))
pearson_df.head()

Unnamed: 0,similarity_value,user_id
0,-0.560112,18100
1,0.943242,21676
2,0.505291,32635
3,-0.342997,41074
4,0.943242,41318


In [30]:
pearson_df.shape

(100, 2)

In [31]:
top_users = pearson_df.sort_values(by='similarity_value', ascending=False)[0:50]
top_users.head()

Unnamed: 0,similarity_value,user_id
4,0.943242,41318
1,0.943242,21676
40,0.57735,20076
30,0.57735,15604
5,0.560112,47476


In [32]:
top_users_rating = top_users.merge(ratings, left_on='user_id', right_on='user_id', how='inner')
top_users_rating.head()

Unnamed: 0,similarity_value,user_id,book_id,rating
0,0.943242,41318,6,3
1,0.943242,41318,17,4
2,0.943242,41318,18,5
3,0.943242,41318,20,5
4,0.943242,41318,21,5


In [33]:
#Multiplies the similarity by the user's ratings
top_users_rating['weighted_rating'] = top_users_rating['similarity_value']*top_users_rating['rating']
top_users_rating.head()

Unnamed: 0,similarity_value,user_id,book_id,rating,weighted_rating
0,0.943242,41318,6,3,2.829727
1,0.943242,41318,17,4,3.772969
2,0.943242,41318,18,5,4.716211
3,0.943242,41318,20,5,4.716211
4,0.943242,41318,21,5,4.716211


In [34]:
top_users_rating = top_users_rating.groupby('book_id').sum()[['similarity_value','weighted_rating']]
top_users_rating.columns = ['sum_similarity_value','sum_weighted_rating']
top_users_rating.head()

Unnamed: 0_level_0,sum_similarity_value,sum_weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.150164,13.53601
2,3.42674,14.597501
3,3.493325,8.663899
4,2.187259,7.561777
5,2.789278,11.311075


In [35]:
recommendation_df = pd.DataFrame()
#Now we take the weighted average
recommendation_df['weighted_average_score'] = top_users_rating['sum_weighted_rating']/top_users_rating['sum_similarity_value']
recommendation_df['book_id'] = top_users_rating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted_average_score,book_id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,4.296922,1
2,4.25988,2
3,2.48013,3
4,3.457193,4
5,4.055199,5


In [36]:
recommendation_df = recommendation_df.sort_values(by='weighted_average_score', ascending=False).reset_index(drop=True)
recommendation_df.head()

Unnamed: 0,weighted_average_score,book_id
0,5.0,562
1,5.0,513
2,5.0,9966
3,5.0,1084
4,5.0,1270


In [37]:
books.loc[books['book_id'].isin(recommendation_df.head(10)['book_id'].tolist())].reset_index(drop=True)

Unnamed: 0,book_id,title,authors
0,186,The Other Boleyn Girl,Philippa Gregory
1,513,The Hiding Place: The Triumphant True Story of...,"Corrie ten Boom, John Sherrill, Elizabeth Sher..."
2,562,The Way of Kings,Brandon Sanderson
3,1084,To the Lighthouse,Virginia Woolf
4,1190,Rules of Civility,Amor Towles
5,1229,Stellaluna,Janell Cannon
6,1234,Sybil: The Classic True Story of a Woman Posse...,Flora Rheta Schreiber
7,1245,The Brethren,John Grisham
8,1270,Sputnik Sweetheart,"Haruki Murakami, Philip Gabriel"
9,9966,The Ground Beneath Her Feet,Salman Rushdie
