## Aim: Multiple recommendations based on the implementation of two different recommendation engine:
#Popularity based recommender, Content based

Dataset: Goodreads
    https://www.kaggle.com/jealousleopard/goodreadsbooks/download
        

In [22]:
# load libraries and data

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv("books.csv")
df.describe()

Unnamed: 0,bookID,average_rating,isbn13,num_pages,ratings_count,text_reviews_count
count,11127.0,11127.0,11127.0,11127.0,11127.0,11127.0
mean,21310.938887,3.933631,9759888000000.0,336.376921,17936.41,541.854498
std,13093.358023,0.352445,442896400000.0,241.127305,112479.4,2576.176608
min,1.0,0.0,8987060000.0,0.0,0.0,0.0
25%,10287.0,3.77,9780350000000.0,192.0,104.0,9.0
50%,20287.0,3.96,9780590000000.0,299.0,745.0,46.0
75%,32104.5,4.135,9780870000000.0,416.0,4993.5,237.5
max,45641.0,5.0,9790010000000.0,6576.0,4597666.0,94265.0


In [8]:
df.head(2).transpose()

Unnamed: 0,0,1
bookID,1,2
title,Harry Potter and the Half-Blood Prince (Harry ...,Harry Potter and the Order of the Phoenix (Har...
authors,J.K. Rowling/Mary GrandPré,J.K. Rowling/Mary GrandPré
average_rating,4.57,4.49
isbn,0439785960,0439358078
isbn13,9.78044e+12,9.78044e+12
language_code,eng,eng
num_pages,652,870
ratings_count,2095690,2153167
text_reviews_count,27591,29221


## Popularity based recommender

In [9]:
def popularityRecommender(df):
    
    #Define the minimum vote count
    minimum_vote_count = 0.75* df['ratings_count'].max()
    
    #Define C – the mean rating
    mean_rating = df['average_rating'].mean()

    df['weighted_rating'] = (((df['ratings_count']/(df['ratings_count']+minimum_vote_count)) * df['average_rating']) +
                             ((minimum_vote_count/(df['ratings_count']+minimum_vote_count))*mean_rating))

    recommendations = df.sort_values(by = 'weighted_rating',ascending = False).head(5)
    
    return(recommendations) 

In [10]:
top5 = popularityRecommender(df)
top5

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,weighted_rating
3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,043965548X,9780440000000.0,eng,435,2339585,36325,5/1/2004,Scholastic Inc.,4.186825
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,0439785960,9780440000000.0,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,4.174188
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,0439358078,9780440000000.0,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,4.147497
4416,15881,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling/Mary GrandPré,4.42,0439064864,9780440000000.0,eng,341,2293963,34692,6/2/1999,Arthur A. Levine Books / Scholastic Inc.,4.127931
23,34,The Fellowship of the Ring (The Lord of the Ri...,J.R.R. Tolkien,4.36,0618346252,9780620000000.0,eng,398,2128944,13670,9/5/2003,Houghton Mifflin Harcourt,4.096386


##Content-based Recommender

In [15]:
#TF-IDF Vectorizer on the author data for each book.
cbr = TfidfVectorizer(stop_words = 'english')

df['authors'] = df['authors'].fillna('')
tfidf_matrix = cbr.fit_transform(df['authors'])

tfidf_matrix.shape

df['authors'][0]

'J.K. Rowling/Mary GrandPré'

In [18]:
# Use the lambda function to split the description into words and count the length of the paragraph
df['new_column'] = df.authors.apply(lambda x: len(str(x).split(' ')))
df['new_column']


0        3
1        3
2        2
3        3
4        3
        ..
11122    5
11123    3
11124    3
11125    3
11126    2
Name: new_column, Length: 11127, dtype: int64

In [23]:
#the vector representing the importance of the words in the document. 
tfidf_matrix[0]
print(tfidf_matrix[0])

#Assign the instance of our recommender function.
distance_matrix = cosine_similarity(tfidf_matrix)
# Re-create the indices of our list of books by removing any duplicates if required
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

  (0, 2938)	0.6844068970166837
  (0, 4989)	0.44554584401479924
  (0, 6583)	0.5771274557644394


In [25]:
print(distance_matrix.size)
print(distance_matrix.shape)

123810129
(11127, 11127)


In [26]:
##The function

def ContentBasedRecommender(title, indices, distance_matrix):
    id_ = indices[title] #Fetch the index of the movie we will enter
    
    #List of tuples with distance for each movie to the entered movie (2 cols = id and distance)
    distances = list(enumerate(distance_matrix[id_])) 
    
    #sort by the distance function, which is in column[1]
    distances = sorted(distances, key=lambda x: x[1], reverse = True) 
    
    distances = distances[1:6] # Get the 5 best scores , not including itself
    print(distances)
    
    # get the indices of the top 5
    recommendations = [distance[0] for distance in distances] 
    
    # return those recommendation names by pulling title from the given 5 indices
    return df['title'].iloc[recommendations] 

In [27]:
## Example Book
ContentBasedRecommender("In a Sunburned Country", indices,distance_matrix)

[(13, 1.0), (14, 1.0), (15, 1.0), (16, 1.0), (17, 1.0)]


13                          Bill Bryson's African Diary
14    Bryson's Dictionary of Troublesome Words: A Wr...
15                               In a Sunburned Country
16    I'm a Stranger Here Myself: Notes on Returning...
17    The Lost Continent: Travels in Small Town America
Name: title, dtype: object