In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

books = pd.read_csv('books.csv', encoding = 'ISO-8859-1')
books.describe()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn13,original_publication_year,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5
count,10000.0,10000.0,10000.0,10000.0,10000.0,9415.0,9979.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,5264697.0,5471214.0,8646183.0,75.7127,9754393000000.0,1981.987674,4.002191,54001.24,59687.32,2919.9553,1345.0406,3110.885,11475.8938,19965.7,23789.81
std,2886.89568,7575462.0,7827330.0,11751060.0,170.470728,442824600000.0,152.576665,0.254427,157370.0,167803.8,6124.378132,6635.626263,9717.123578,28546.449183,51447.36,79768.89
min,1.0,1.0,1.0,87.0,1.0,195170300.0,-1750.0,2.47,2716.0,5510.0,3.0,11.0,30.0,323.0,750.0,754.0
25%,2500.75,46275.75,47911.75,1008841.0,23.0,9780000000000.0,1990.0,3.85,13568.75,15438.75,694.0,196.0,656.0,3112.0,5405.75,5334.0
50%,5000.5,394965.5,425123.5,2719524.0,40.0,9780000000000.0,2004.0,4.02,21155.5,23832.5,1402.0,391.0,1163.0,4894.0,8269.5,8836.0
75%,7500.25,9382225.0,9636112.0,14517750.0,67.0,9780000000000.0,2011.0,4.18,41053.5,45915.0,2744.25,885.0,2353.25,9287.0,16023.5,17304.5
max,10000.0,33288640.0,35534230.0,56399600.0,3455.0,9790000000000.0,2017.0,4.82,4780653.0,4942365.0,155254.0,456191.0,436802.0,793319.0,1481305.0,3011543.0


In [2]:
books = books.loc[books['average_rating'] >= 3.5]

print(books.shape)
print(books.columns)

(9661, 23)
Index(['id', 'book_id', 'best_book_id', 'work_id', 'books_count', 'isbn',
       'isbn13', 'authors', 'original_publication_year', 'original_title',
       'title', 'language_code', 'average_rating', 'ratings_count',
       'work_ratings_count', 'work_text_reviews_count', 'ratings_1',
       'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5', 'image_url',
       'small_image_url'],
      dtype='object')


In [3]:
#ratings = pd.read_csv('ratings.csv', encoding = 'ISO-8859-1')
#ratings.head()

In [4]:
book_tags = pd.read_csv('book_tags.csv', encoding = 'ISO-8859-1')
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [5]:
tags = pd.read_csv('tags.csv')
# drop the 'to-read', 'favorites' and 'currently-reading' tag
tags.drop([30574, 11557, 8717], inplace = True)
print(tags.tail())

       tag_id    tag_name
34247   34247   Ｃhildrens
34248   34248   Ｆａｖｏｒｉｔｅｓ
34249   34249       Ｍａｎｇａ
34250   34250      ＳＥＲＩＥＳ
34251   34251  ｆａｖｏｕｒｉｔｅｓ


In [6]:
book_tags_df = pd.merge(book_tags, tags, on = 'tag_id', how = 'inner').sort_values(by = 'count', ascending = False)

#row = [i for i, row in book_tags_df.iterrows() if row['tag_name'] == 'to-read']
#book_tags_df.drop(row, inplace = True)
book_tags_df.head()
book_tags_df.shape

(970272, 4)

In [7]:
#to_read = pd.read_csv('to_read.csv')
#to_read.head()

In [8]:
# author based recommender

tfidf = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 3), min_df = 0, stop_words = 'english')
#tfidf = TfidfVectorizer(stop_words = 'english')
tfidf_matrix = tfidf.fit_transform(books['authors'])
tfidf_matrix.shape
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [9]:
#Construct a reverse map of indices and book titles
indices = pd.Series(books.index, index = books['title']).drop_duplicates()

In [10]:
# Function that takes in book titles as input and outputs most similar books
def get_recommendations(title, cosine_sim = cosine_sim):
    
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:11]
    book_indices = [i[0] for i in sim_scores]
    return books['title'].iloc[book_indices]

In [11]:
books['title'].head(20)

0               The Hunger Games (The Hunger Games, #1)
1     Harry Potter and the Sorcerer's Stone (Harry P...
2                               Twilight (Twilight, #1)
3                                 To Kill a Mockingbird
4                                      The Great Gatsby
5                                The Fault in Our Stars
6                                            The Hobbit
7                                The Catcher in the Rye
8                 Angels & Demons  (Robert Langdon, #1)
9                                   Pride and Prejudice
10                                      The Kite Runner
11                            Divergent (Divergent, #1)
12                                                 1984
13                                          Animal Farm
14                            The Diary of a Young Girl
15     The Girl with the Dragon Tattoo (Millennium, #1)
16                 Catching Fire (The Hunger Games, #2)
17    Harry Potter and the Prisoner of Azkaban (

In [12]:
# tag based recommender

books_with_tags = pd.merge(books, book_tags_df, left_on = 'book_id', right_on = 'goodreads_book_id', how = 'inner')
books_with_tags['tag_name'] = books_with_tags['tag_name'].astype('category')
#books_with_tags.info()
tfidf1 = TfidfVectorizer(analyzer = 'word', ngram_range = (1, 3), min_df = 0, stop_words = 'english')
tfidf_matrix1 = tfidf1.fit_transform(books_with_tags['tag_name'].head(20000))
cosine_sim1 = linear_kernel(tfidf_matrix1, tfidf_matrix1)

In [13]:
# tag and author based recommender

temp_df = books_with_tags.groupby('book_id')['tag_name'].apply(' '.join).reset_index()
temp_df.head()

Unnamed: 0,book_id,tag_name
0,1,fantasy young-adult fiction harry-potter books...
1,2,fantasy children children-s all-time-favorites...
2,3,fantasy young-adult fiction harry-potter books...
3,5,fantasy young-adult fiction harry-potter books...
4,6,fantasy young-adult fiction harry-potter owned...


In [14]:
books0 = pd.merge(books, temp_df, on = 'book_id', how = 'inner')
#books0.head()

In [15]:
# function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(' ', '')) for i in x]
    else:
        if isinstance(x, str):
            return str.lower(x.replace(' ', ''))
        else:
            return ''

In [16]:
books0['authors'].apply(clean_data)

0                                          suzannecollins
1                               j.k.rowling,marygrandprì©
2                                          stepheniemeyer
3                                               harperlee
4                                       f.scottfitzgerald
5                                               johngreen
6                                           j.r.r.tolkien
7                                            j.d.salinger
8                                                danbrown
9                                              janeausten
10                                         khaledhosseini
11                                           veronicaroth
12                   georgeorwell,erichfromm,celì¢lìïster
13                                           georgeorwell
14      annefrank,eleanorroosevelt,b.m.mooyaart-doubleday
15                                stieglarsson,regkeeland
16                                         suzannecollins
17            

In [17]:
def create_soup(x):
    return ''.join(x['authors']) + '' + ''.join(x['tag_name'])

In [18]:
books0['soup'] = books0.apply(create_soup, axis = 1)

In [19]:
# Use the CountVectorizer() instead of TF-IDF. 
# This is because one do not want to down-weight the presence of an author if he or she has written relatively more books.
# But will still fit_transform a tfidfvectorizer for comparison 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

cv = CountVectorizer(analyzer = 'word', ngram_range=(1, 3), min_df = 0, stop_words = 'english')
count_matrix = cv.fit_transform(books0['soup'])
cosine_sim0 = cosine_similarity(count_matrix, count_matrix)
#books0.reset_index()

In [20]:
tfidf2 = TfidfVectorizer(analyzer = 'word', ngram_range=(1, 3), min_df = 0, stop_words = 'english')
tfidf_matrix2 = tfidf2.fit_transform(books0['soup'])
cosine_sim2 = linear_kernel(tfidf_matrix2, tfidf_matrix2)

In [21]:
print('author-based recommendations:')
print(get_recommendations('The Great Gatsby'), '\n')
print('tag-based recommendations:')
print(get_recommendations('The Great Gatsby', cosine_sim1), '\n')
print('author and tag based recommendations using CountVectorizer:')
print(get_recommendations('The Great Gatsby', cosine_sim0), '\n')
print('author and tag based recommendations using tf-idf:')
print(get_recommendations('The Great Gatsby', cosine_sim2))

author-based recommendations:
1183                                  Tender Is the Night
2303                                This Side of Paradise
3254                             The Beautiful and Damned
3640                  The Curious Case of Benjamin Button
7408                                    The Short Stories
8683    The Billionaire's Obsession ~ Simon (The Billi...
1279                                           The Aeneid
922     The Alchemyst (The Secrets of the Immortal Nic...
1633    The Magician (The Secrets of the Immortal Nich...
2016    The Sorceress (The Secrets of the Immortal Nic...
Name: title, dtype: object 

tag-based recommendations:
97         The Girl Who Played with Fire (Millennium, #2)
197                                      The Color Purple
590       The Absolutely True Diary of a Part-Time Indian
813                          Tempted (House of Night, #6)
1095    America (The Book): A Citizen's Guide to Democ...
1204      Hard-Boiled Wonderland and the End

Author-based and tag-based recommenders have awful performance obviously. Although, when it comes to tag and author based recommender, the results of countvectorizer and tfidfvectorizer only have relatively small differences.

In [22]:
#print('author-based recommendations:')
#print(get_recommendations('Brave New World'), '\n')
#print('tag-based recommendations:')
#print(get_recommendations('Brave New World', cosine_sim1), '\n')
#print('author and tag based recommendations using CountVectorizer:')
#print(get_recommendations('Brave New World', cosine_sim0), '\n')
#print('author and tag based recommendations using TfidfVectorizer:')
#print(get_recommendations('Brave New World', cosine_sim2))

dict = {'CountVectorizer': [x for i,x in enumerate(get_recommendations('Animal Farm', cosine_sim0))], 
        'TfidfVectorizer': [x for i,x in enumerate(get_recommendations('Animal Farm', cosine_sim2))]}
results = pd.DataFrame(data = dict)
results

Unnamed: 0,CountVectorizer,TfidfVectorizer
0,Lord of the Flies,1984
1,The Great Gatsby,The Great Gatsby
2,1984,Animal Farm / 1984
3,Of Mice and Men,Keep the Aspidistra Flying
4,Brave New World,Lord of the Flies
5,The Catcher in the Rye,"Cry, the Beloved Country"
6,Fahrenheit 451,Of Mice and Men
7,The Old Man and the Sea,The Fall of the House of Usher
8,Silas Marner,A Modest Proposal
9,To Kill a Mockingbird,A Modest Proposal and Other Satirical Works
