In [1]:
#Importing the relevant packages
import pandas as pd
import numpy as np

In [2]:
#Reading the book catalogue and the user ratings from their CSV files
#(utf-8-sig drops a leading byte-order mark if the files carry one)
books, ratings = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in ('Dataframes/books.csv', 'Dataframes/ratings.csv')
)

## Data Preprocessing

### Preprocessing books dataframe

In [3]:
#Checking the dimensions (rows, columns) of the raw books dataframe
books.shape

(10000, 23)

In [4]:
#Previewing the first rows of the raw books dataframe
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
#Keeping only the columns the recommender needs.
#.copy() makes this an independent frame, so the column assignments in the
#following cells do not trigger pandas' SettingWithCopyWarning.
books = books[['id','title','authors']].copy()

In [6]:
#Renaming 'id' to 'book_id' so it matches the key used by the ratings dataframe.
#Rebinding the result (instead of inplace=True) avoids mutating a frame that
#was derived from another one and keeps the cell safe to re-run.
books = books.rename(columns={'id': 'book_id'})

In [7]:
#Previewing the books dataframe after the column selection and rename
books.head()

Unnamed: 0,book_id,title,authors
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré"
2,3,"Twilight (Twilight, #1)",Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


In [8]:
#Removing the bracketed series/subtitle part from the 'title' column,
#e.g. "Twilight (Twilight, #1)" -> "Twilight " (the trailing space is
#stripped by the rstrip step that follows).
#regex=True must be explicit: since pandas 2.0 str.replace treats the pattern
#as a literal string by default, which would leave every title unchanged.
books['title'] = books['title'].str.replace(r"\(.*\)", '', regex=True)

In [9]:
#Dropping the trailing whitespace left at the end of each title
books['title'] = books['title'].str.rstrip()

In [10]:
#Previewing the books dataframe after cleaning the titles
books.head()

Unnamed: 0,book_id,title,authors
0,1,The Hunger Games,Suzanne Collins
1,2,Harry Potter and the Sorcerer's Stone,"J.K. Rowling, Mary GrandPré"
2,3,Twilight,Stephenie Meyer
3,4,To Kill a Mockingbird,Harper Lee
4,5,The Great Gatsby,F. Scott Fitzgerald


### Preprocessing ratings dataframe

In [11]:
#Previewing the first rows of the ratings dataframe
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [12]:
#Checking the dimensions (rows, columns) of the ratings dataframe
ratings.shape

(981756, 3)

In [13]:
#Counting the distinct users that appear in the ratings
#(note: number_of_users holds the array of distinct ids; its length is the count)
number_of_users = pd.unique(ratings['user_id'])
len(number_of_users)

53424

In [14]:
#Counting the distinct books that appear in the ratings
#(note: number_of_books holds the array of distinct ids; its length is the count)
number_of_books = pd.unique(ratings['book_id'])
len(number_of_books)

10000

### Let's start recommending new books to a particular user:

In [15]:
#Look up the catalogue entry (including book_id) for each title the user rated
def get_books_info(user_info):
    """Attach catalogue details to the books a user has rated.

    Parameters
    ----------
    user_info : list of dict (or anything pd.DataFrame accepts)
        Reading history of the user; must provide a 'title' column.

    Returns
    -------
    pd.DataFrame
        Rows of the global ``books`` frame whose title matches one of the
        user's titles, joined with the user's own columns. The join uses the
        column names both frames share, and titles without a match in
        ``books`` are dropped (pandas' default inner merge).
    """
    #The user's reading history as a dataframe
    rated_books = pd.DataFrame(user_info)

    #Catalogue rows whose title is one of the titles the user supplied
    wanted_titles = rated_books['title'].tolist()
    catalogue_rows = books[books['title'].isin(wanted_titles)]

    #Joining both sides brings the book id next to the user's columns
    return catalogue_rows.merge(rated_books)

In [16]:
#Let's select the subgroup of users.
#Module-level placeholder kept so the name still exists for any code that
#reads it; the function below only works on its own local variable.
user_subset_group = pd.DataFrame()

def get_user_subset_group(books_info, max_users=100):
    """Pick the users who share the most rated books with the input user.

    Parameters
    ----------
    books_info : pd.DataFrame
        The input user's books; must contain a 'book_id' column.
    max_users : int, optional
        Upper bound on the number of users returned (default 100), so the
        similarity step does not have to process every user.

    Returns
    -------
    list of (user_id, pd.DataFrame)
        One tuple per user: the user id and that user's rows of the global
        ``ratings`` frame restricted to the input books, ordered from the
        most to the fewest books in common.
    """
    #Ratings given by anyone to the books the input user has read
    user_subset = ratings[ratings['book_id'].isin(books_info['book_id'].tolist())]

    #Group the rows by user id. The key is passed as a scalar, not as a
    #one-element list: with a list, pandas >= 2.0 yields 1-tuples such as
    #(314,) as group names, which would later fail to match ratings['user_id'].
    user_subset_group = user_subset.groupby('user_id')

    #Users who share more books with the input come first
    user_subset_group = sorted(user_subset_group, key=lambda x: len(x[1]), reverse=True)

    #Cap the number of candidates to keep the similarity step cheap
    return user_subset_group[:max_users]

In [17]:
#Kept so the module-level name still exists for any code that reads it;
#get_pearson_correlation no longer writes into it.
pearson_correlation = {}

def get_pearson_correlation(user_subset_group, books_info):
    """Compute the Pearson correlation between the input user and each candidate.

    Parameters
    ----------
    user_subset_group : iterable of (user_id, pd.DataFrame)
        For every candidate user, their ratings of the input user's books;
        each frame must contain 'book_id' and 'rating' columns.
    books_info : pd.DataFrame
        The input user's books with 'book_id' and 'rating' columns.

    Returns
    -------
    dict
        Maps each user id to its correlation with the input user. The value
        is 0 when either side has no variance (the correlation is undefined).
        A new dict is built on every call, so results of earlier calls can
        never leak into later ones.
    """
    #The input ratings do not change between users, so sort them only once
    books_info = books_info.sort_values(by='book_id')

    correlation_by_user = {}

    #For every user group in our subset
    for name, group in user_subset_group:

        #Sort the current user's ratings the same way as the input so the
        #two rating lists line up book by book
        group = group.sort_values(by='book_id')

        nratings = len(group)

        #Get the input user's ratings for the books that both have in common
        #NOTE(review): assumes one row per book_id on both sides, so both
        #lists have nratings entries - confirm for inputs with repeated titles
        temp = books_info[books_info['book_id'].isin(group['book_id'].tolist())]
        temp_ratinglist = temp['rating'].tolist()

        #The current user's ratings for the same books
        temp_grouplist = group['rating'].tolist()

        #Pearson correlation between the two users, called x and y,
        #computed from the sums of squares and cross products
        Sxx = sum(i**2 for i in temp_ratinglist) - pow(sum(temp_ratinglist), 2)/float(nratings)
        Syy = sum(i**2 for i in temp_grouplist) - pow(sum(temp_grouplist), 2)/float(nratings)
        Sxy = sum(i*j for i, j in zip(temp_ratinglist, temp_grouplist)) - sum(temp_ratinglist)*sum(temp_grouplist)/float(nratings)

        #If the denominator is different from zero, divide; otherwise 0 correlation
        if Sxx != 0 and Syy != 0:
            correlation_by_user[name] = Sxy/np.sqrt(Sxx*Syy)
        else:
            correlation_by_user[name] = 0
    return correlation_by_user

In [18]:
#Turning the {user_id: similarity} mapping into a two-column dataframe
def convert_pearson_df(pearson_correlation):
    """Convert the similarity mapping into a dataframe.

    Parameters
    ----------
    pearson_correlation : dict
        Maps a user id to its similarity value.

    Returns
    -------
    pd.DataFrame
        Columns 'similarity_value' and 'user_id', one row per user, with a
        fresh 0..n-1 index.
    """
    #One row per key; the user ids start out as the index
    similarity_table = pd.DataFrame.from_dict(pearson_correlation, orient='index')
    similarity_table.columns = ['similarity_value']

    #Expose the ids as a regular column, then replace the index by 0..n-1
    similarity_table['user_id'] = similarity_table.index
    return similarity_table.reset_index(drop=True)

In [19]:
#Let's get the weighted rating for the recommendation dataframe
def get_recommendation_df(pearson_df, top_n=50):
    """Score every book by the similarity-weighted average rating of the top users.

    Parameters
    ----------
    pearson_df : pd.DataFrame
        Columns 'similarity_value' and 'user_id'.
    top_n : int, optional
        Number of most similar users to take into account (default 50).

    Returns
    -------
    pd.DataFrame
        Columns 'weighted_average_score' and 'book_id', ordered from the
        highest to the lowest score, with a fresh 0..n-1 index. Empty when no
        user has a positive similarity.
    """
    #Getting the top users based on similarity value
    top_users = pearson_df.sort_values(by='similarity_value', ascending=False).head(top_n)

    #Only positively correlated users may vote. A zero or negative weight makes
    #the weighted average meaningless: the denominator can become 0, and a book
    #liked only by a dissimilar user would otherwise get a high score.
    top_users = top_users[top_users['similarity_value'] > 0]

    #Getting the books and ratings of the top users (reads the global ratings frame)
    top_users_rating = top_users.merge(ratings, on='user_id', how='inner')

    #Multiplying the similarity by the user's rating gives the weighted rating
    top_users_rating['weighted_rating'] = top_users_rating['similarity_value']*top_users_rating['rating']

    #Sum of similarity values and weighted ratings per book; the two columns
    #are selected before summing so no unrelated column gets aggregated
    top_users_rating = top_users_rating.groupby('book_id')[['similarity_value','weighted_rating']].sum()
    top_users_rating.columns = ['sum_similarity_value','sum_weighted_rating']

    recommendation_df = pd.DataFrame()
    #Now we take the weighted average by book id
    recommendation_df['weighted_average_score'] = top_users_rating['sum_weighted_rating']/top_users_rating['sum_similarity_value']
    recommendation_df['book_id'] = top_users_rating.index

    #Ordering the books by weighted average score
    recommendation = recommendation_df.sort_values(by='weighted_average_score', ascending=False).reset_index(drop=True)

    return recommendation

In [28]:
def get_recommend_movies(user_info, num):
    """Recommend books for a user from the books they have already rated.

    Parameters
    ----------
    user_info : list of dict
        The user's reading history, one {'title': ..., 'rating': ...} per book.
    num : int
        Maximum number of recommendations to return.

    Returns
    -------
    pd.DataFrame
        Rows of the global ``books`` frame for the recommended books, ordered
        from the best to the worst weighted average score.
    """
    books_info = get_books_info(user_info)
    user_subset_group = get_user_subset_group(books_info)
    pearson_correlation = get_pearson_correlation(user_subset_group, books_info)
    pearson_df = convert_pearson_df(pearson_correlation)

    #Ids of the best-scoring books, best first, and the rank of each id
    top_book_ids = get_recommendation_df(pearson_df).head(num)['book_id'].tolist()
    rank_by_id = {book_id: rank for rank, book_id in enumerate(top_book_ids)}

    #Finally the recommended books for the inputted user. The isin filter
    #returns rows in catalogue order, so they are re-ordered by rank to keep
    #the best recommendation first (relies on books having a unique index).
    recommend_movies = books.loc[books['book_id'].isin(top_book_ids)]
    ranks = recommend_movies['book_id'].map(rank_by_id)
    recommend_movies = recommend_movies.loc[ranks.sort_values().index]

    print("\n Recommended Movies(",num,"):")
    return recommend_movies

In [29]:
#Sample reading history: each (title, rating) pair becomes one record
user_info = [
    {'title': title, 'rating': rating}
    for title, rating in (
        ('Harry Potter and the Prisoner of Azkaban', 4),
        ('Harry Potter and the Half-Blood Prince', 5),
        ('Harry Potter and the Deathly Hallows', 5),
        ('Harry Potter and the Chamber of Secrets', 5),
        ('The Jungle Book', 2),
    )
]
#Asking for the ten best recommendations for this user
get_recommend_movies(user_info, 10)


 Recommended Movies( 10 ):


Unnamed: 0,book_id,title,authors
185,186,The Other Boleyn Girl,Philippa Gregory
512,513,The Hiding Place: The Triumphant True Story of...,"Corrie ten Boom, John Sherrill, Elizabeth Sher..."
561,562,The Way of Kings,Brandon Sanderson
1083,1084,To the Lighthouse,Virginia Woolf
1189,1190,Rules of Civility,Amor Towles
1228,1229,Stellaluna,Janell Cannon
1233,1234,Sybil: The Classic True Story of a Woman Posse...,Flora Rheta Schreiber
1244,1245,The Brethren,John Grisham
1269,1270,Sputnik Sweetheart,"Haruki Murakami, Philip Gabriel"
9965,9966,The Ground Beneath Her Feet,Salman Rushdie
