# Building a Book Recommender System

In [2]:
import pandas as pd
import numpy as np
import re 

from IPython.display import display
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.mode.chained_assignment = None

# Read in and Inspect Data

In [3]:
my_books = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\Books\goodreads_read.csv')
all_books = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\Books\books.csv')
ratings = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\Books\ratings.csv')
to_read = pd.read_csv(r'C:\Users\jbean\Dropbox\Other\Python\Books\to_read.csv')

In [4]:
my_books.head()

Unnamed: 0,Book Id,Title,Author,Author l-f,Additional Authors,ISBN,ISBN13,My Rating,Average Rating,Publisher,Binding,Number of Pages,Year Published,Original Publication Year,Date Read,Date Added,Bookshelves,Bookshelves with positions,Exclusive Shelf,My Review,Spoiler,Private Notes,Read Count,Recommended For,Recommended By,Owned Copies,Original Purchase Date,Original Purchase Location,Condition,Condition Description,BCID,to_recommend
0,23460932,Cash Landing,James Grippando,"Grippando, James",,62295454.0,9780060000000.0,3,3.55,Harper,Hardcover,384.0,2015.0,2015.0,,3/11/2018,audiobook,audiobook (#32),read,,,,1,,,0,,,,,,1
1,23168356,"House Rivals (Joe DeMarco, #10)",Mike Lawson,"Lawson, Mike",,802123600.0,9780800000000.0,4,3.99,Atlantic Monthly Press,Hardcover,285.0,2015.0,2015.0,,4/9/2018,audiobook,audiobook (#103),read,,,,1,,,0,,,,,,1
2,33613129,The Right Time,Danielle Steel,"Steel, Danielle",,,,3,4.21,Delacorte Press,Kindle Edition,336.0,2017.0,,,3/11/2018,audiobook,audiobook (#87),read,,,,1,,,0,,,,,,1
3,34121119,Camino Island,John Grisham,"Grisham, John",,385543026.0,9780390000000.0,4,3.73,Doubleday,Hardcover,290.0,2017.0,2017.0,,3/11/2018,audiobook,audiobook (#70),read,,,,1,,,0,,,,,,1
4,9509287,"The Sixth Man (Sean King & Michelle Maxwell, #5)",David Baldacci,"Baldacci, David",,446573108.0,9780450000000.0,3,4.14,Grand Central Publishing,Hardcover,416.0,2011.0,2011.0,,3/11/2018,audiobook,audiobook (#78),read,,,,1,,,0,,,,,,1


In [5]:
all_books.head()

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng,4.44,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [6]:
ratings.head()

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4
2,2,260,5
3,2,9296,5
4,2,2318,3


In [7]:
to_read.head()

Unnamed: 0,user_id,book_id
0,9,8
1,15,398
2,15,275
3,37,7173
4,34,380


# Recommendation Algorithm

In [76]:
def book_recommendation_system(my_books = my_books, all_books = all_books, ratings = ratings, to_read = to_read,
                              series = 'Yes', new_author_only = 'No', number_of_similar_quantile = 0.99, english_only = 'Yes', 
                              rec_weight_avg_rating = 0.5, rec_weight_read_ratio = 0.4, rec_weight_perc_4_5 = 0.1, 
                              rec_weight_perc_1_2 = -0.1, rec_weight_author_previous = 0.1, return_dataset = 'No',
                              num_similar_ratings = 50):
    
    """
    An algorithm to predict the books that I should read next based on what I've read to-date. The data is limited to the
    top 10,000 books on Goodreads. The 'Series' toggle allows you to determine whether you want a series or not; the 
    'new_author' toggle allows you to filter for new or previous authors; 'english_only' filters for only English
    language books; If you want the full data set, toggle 'return_dataset' to 'Yes'.
    
    To use with your own data, you would need to export your data, create a "to_recommend" column where you decide which 
    books you want to be used with the recommendor. If all, create a "to_recommend" column and put a '1' for all values.
    
    """
    
    print('Getting recommendations...')
    
    # Filter out books that I haven't yet read or am currently reading

    my_books = my_books[~my_books['Exclusive Shelf'].isin(['to read','currently reading'])]
    
    # Extract relevant columns from my books

    my_books = my_books[['Book Id', 'Title','Author','ISBN','ISBN13','My Rating','Average Rating',
                     'Publisher','Number of Pages','Year Published','to_recommend']]
    
    # Rename columns to python-friendly format

    my_books = my_books.rename(columns = {'Book Id':'book_id', 'My Rating':'my_rating','Average Rating':'average_rating',
                          'Number of Pages':'num_pages','Year Published':'year_published'})
    
    # Drop irrelevant columns from all books

    all_books = all_books.drop(['image_url','small_image_url'], axis=1)
    
    # Standardize any english-language book and filter for only english

    all_books.language_code = all_books.language_code.replace(to_replace = ['en-US','en-CA','en-GB','en'], 
                                                          value = ['eng','eng','eng','eng'])
    
    if english_only == 'Yes':
        
        all_books = all_books[all_books.language_code == 'eng']
    
    # Add a column to all books indicating if I've read the book

    all_books['jordan_read'] = [1 if i else 0 for i in all_books.best_book_id.isin(my_books.book_id)]
    all_books['to_recommend'] = [1 if i else 0 for i in all_books.best_book_id.\
                                 isin(my_books[my_books.to_recommend == 1].book_id)]
    
    # Add my rating to the books that I've read

    all_books = pd.merge(all_books, my_books[['book_id','my_rating']], left_on = 'best_book_id', right_on = 'book_id', 
                         how = 'left', copy = False)
    
    # Drop overlapping columns
    
    all_books.drop(['book_id_y'], axis=1, inplace=True)
    all_books.rename(columns = {'book_id_x':'book_id'}, inplace=True)
    
    # Merge addtional columns from my books that didn't match on book_id
    
    all_books = pd.merge(all_books, my_books[['Title','Author','my_rating','to_recommend']], left_on = ['title','authors'], 
                     right_on = ['Title','Author'], how = 'left')
    
    # Ensure that the ratings and other information carries over if no match on book_id
    
    all_books['my_rating_x'] = [j if pd.notnull(j) else all_books.loc[i,'my_rating_y'] for i,j in 
                                all_books['my_rating_x'].items()]
    all_books['to_recommend_x'] = [j if j == 1 else all_books.loc[i, 'to_recommend_y'] for 
                                   i, j in all_books.to_recommend_x.items()]
    all_books['to_recommend_x'] = [i if pd.notnull(i) else 0 for i in all_books.to_recommend_x]

    all_books['jordan_read'] = [1 if pd.notnull(all_books.loc[i,'my_rating_x']) else 0 for i in range(len(all_books))]
    
    # Drop columns that aren't relevant and rename others
    
    all_books = all_books.drop(['Title','Author','my_rating_y','to_recommend_y'], axis = 1).rename(
    columns = {'to_recommend_x':'to_recommend', 'my_rating_x':'my_rating'})
    
    # Match to ratings on only the books that I rated higher than a 3
    
    ratings = pd.merge(ratings, all_books[all_books.my_rating > 3][['book_id','my_rating','to_recommend']], left_on = 'book_id', 
                   right_on = 'book_id', how = 'left')
    
    # Identify the most similar users by grouping on user ID and summing the books that I want recommended
    
    most_similar_readers = ratings.groupby(by='user_id').agg({'to_recommend':'sum','my_rating':'mean'}).\
                           sort_values(by='to_recommend', ascending=False)

    # Keep only the readers that are higher than the median value
    
    most_similar_readers = most_similar_readers[most_similar_readers.to_recommend >= most_similar_readers.\
                            to_recommend.quantile(number_of_similar_quantile )].sort_values(by='to_recommend', ascending=False)
    
    print('\nWow! There are %i similar readers to you...' %len(most_similar_readers))
    
    # Isolate only the ratings of my most similar readers
    
    most_similar_by_book = ratings[ratings.user_id.isin(most_similar_readers.index)].groupby(by='book_id').agg(
    {'user_id':'count','rating':'mean'}).sort_values(by='rating', ascending=False)
    
    print('\n...who have read a total of %i books!'%most_similar_by_book.user_id.sum())

    # Rename columns to be more descriptive
    
    most_similar_by_book = most_similar_by_book.rename(columns={'user_id':'num_similar_ratings',
                                                            'rating':'avg_rating_similar_users'})
    
    # Identify the books marked as "to-read" by similar users
    
    most_to_read = to_read[to_read.user_id.isin(most_similar_readers.index)].groupby(by='book_id').count().sort_values(by=
                                                                                                'user_id',ascending=False)

    most_to_read = most_to_read.rename(columns={'user_id':'number_to_read'})
    
    # Create a new data frame to merge the number of reads and ratings by similar users
    
    all_books_rec = pd.merge(all_books, most_similar_by_book, left_on = 'book_id', right_index=True, how = 'left')
    all_books_rec = pd.merge(all_books_rec, most_to_read, left_on='book_id', right_index=True, how = 'left')
    
    # Fill in missing values for number to read
    
    all_books_rec['number_to_read'] = all_books_rec['number_to_read'].fillna(0)
    
    # Create a new column that has the number of ratings plus "to read" for each book
    
    all_books_rec['rated_plus_to_read'] = all_books_rec.num_similar_ratings + all_books_rec.number_to_read

    print("\nThe most read new book is: %s" % all_books_rec[all_books_rec.jordan_read == 0].sort_values(by='num_similar_ratings', 
                                                                    ascending = False)[:1]['title'].iloc[0])
    
    # Create a ratio of rated plus to read to total reads in the data set
    
    all_books_rec['rated_plus_to_read_ratio'] = all_books_rec.rated_plus_to_read / all_books_rec['work_ratings_count']
    
    # Create individual ratios for each variable
    
    all_books_rec['rated_ratio'] = all_books_rec.num_similar_ratings / all_books_rec['work_ratings_count']
    all_books_rec['to_read_ratio'] = all_books_rec.number_to_read / all_books_rec['work_ratings_count']
    
    def percent_rating(rating_number, df = all_books_rec):
        """
        Returns the percentage of ratings for each value
        """
        return df['ratings_%s'%rating_number] / df.work_ratings_count
    
    all_books_rec['percent_1'] = percent_rating(1)
    all_books_rec['percent_2'] = percent_rating(2)
    all_books_rec['percent_3'] = percent_rating(3)
    all_books_rec['percent_4'] = percent_rating(4)
    all_books_rec['percent_5'] = percent_rating(5)

    # Create a variable for very high and very low ratings
    
    all_books_rec['percent_4_5'] = all_books_rec.percent_4 + all_books_rec.percent_5
    all_books_rec['percent_1_2'] = all_books_rec.percent_1 + all_books_rec.percent_2
    
    # Add a column flag on whether the author has been read previously or not
    
    all_books_rec['author_read_previously'] = [1 if i else 0 for i in all_books_rec.authors.isin(my_books.Author)]
    
    # Pull only the relevant recommendation columns
    
    all_books_rec_cols = ['authors','title','language_code','jordan_read', 'to_recommend', 'my_rating','num_similar_ratings',
                     'avg_rating_similar_users','rated_plus_to_read_ratio','author_read_previously','percent_1_2',
                     'percent_4_5']

    recs_data = all_books_rec[all_books_rec_cols]
    
    # Standardize columns between 0 and 1 to offset different scales

    from sklearn.preprocessing import MinMaxScaler

    cols_to_standardize = ['avg_rating_similar_users','rated_plus_to_read_ratio','percent_1_2','percent_4_5']
    
    
    for col in cols_to_standardize:

        min_max_scaler = MinMaxScaler()

        recs_data.loc[:,col] = min_max_scaler.fit_transform(np.array(recs_data.loc[:,col]).reshape(-1,1))
        
    # Create recommendation weights based on user inputs
    
    recommendation_weights = {'avg_rating_similar_users':rec_weight_avg_rating,
                         'rated_plus_to_read_ratio':rec_weight_read_ratio,
                         'percent_4_5':rec_weight_perc_4_5,
                          'percent_1_2':rec_weight_perc_1_2,
                         'author_read_previously':rec_weight_author_previous}
    
    # Create the weighted value of each row, store it in a list, then add that list to the data frame
    
    rec_weights = []
    
    for i in range(len(recs_data)):

        total_val = 0

        for j in cols_to_standardize:

            total_val += recommendation_weights.get(j) * recs_data.loc[i, j]

        rec_weights.append(total_val)

    recs_data.loc[:,'recommendation_weights'] = rec_weights
    
    # Eliminate series if toggle set to "No"
    
    if series == 'No':
        
        recs_data = recs_data[~(recs_data.title.str.contains('#'))]
        
    # Include only new authors if toggle set to "Yes"
    
    if new_author_only == 'Yes':
        
        recs_data = recs_data[recs_data.author_read_previously == 0]
    
    # Filter for only books that I haven't read
    
    recs_data = recs_data[recs_data.jordan_read == 0]
    
    
    # If a series is in the recommendation data set, see whether I've read any books in the series and if yes, remove
    # any values that had an average rating below 4
    
    for i in recs_data.title:
        
        if '#' in i:
            
            try:
                relevant_books = my_books[my_books.Title.str.contains(i.split('(')[1].split(',')[0])]
                            
                average_relevant_books = relevant_books['my_rating'].mean()
            
                if average_relevant_books < 4:
                
                    recs_data = recs_data[~recs_data.title.str.contains(i.split('(')[1].split(',')[0])]
                    
            except:
                
                continue
    
    # Remove any "Boxed Set" books as they don't represent a single book
    
    recs_data = recs_data[~(recs_data.title.str.contains('Boxed Set') | recs_data.title.str.contains('Boxset'))]
    
    # Get the top recommendations by total score, popularity, and rating for books that have more than the values
    # set by the user
    
    top_rec_by_weight = recs_data[recs_data.num_similar_ratings > num_similar_ratings].sort_values(by='recommendation_weights', 
                                                                                   ascending=False)[:1]
    
    top_rec_by_popularity = recs_data[recs_data.num_similar_ratings > num_similar_ratings].sort_values(by = 'rated_plus_to_read_ratio', 
                                                                                       ascending=False)[:1]
    
    top_rec_by_rating = recs_data[recs_data.num_similar_ratings > num_similar_ratings].sort_values(by='avg_rating_similar_users', 
                                                                                   ascending=False)[:1]
    
    
    print('\nIf you want to read the highest rated book by your similar users, you should read %s by %s' %(
        top_rec_by_rating['title'].iloc[0], top_rec_by_rating['authors'].iloc[0]))
    
    print('\nTaking into account all factors, we recommend that you should read %s by %s!' %(
        top_rec_by_popularity['title'].iloc[0], top_rec_by_popularity['authors'].iloc[0]))
    
    # Returns the full data set of recommendation values if set by the user
    
    if return_dataset == 'Yes':
        
        return recs_data    

In [77]:
book_recommendation_system()

Getting recommendations...

Wow! There are 627 similar readers to you...

...who have read a total of 73584 books!

The most read new book is: Gone Girl

If you want to read the highest rated book by your similar users, you should read The Help by Kathryn Stockett

Taking into account all factors, we recommend that you should read True Blue by David Baldacci!


In [78]:
book_recommendation_system(series = 'No')

Getting recommendations...

Wow! There are 627 similar readers to you...

...who have read a total of 73584 books!

The most read new book is: Gone Girl

If you want to read the highest rated book by your similar users, you should read The Help by Kathryn Stockett

Taking into account all factors, we recommend that you should read True Blue by David Baldacci!


In [79]:
book_recommendation_system(new_author_only = 'Yes')

Getting recommendations...

Wow! There are 627 similar readers to you...

...who have read a total of 73584 books!

The most read new book is: Gone Girl

If you want to read the highest rated book by your similar users, you should read The Help by Kathryn Stockett

Taking into account all factors, we recommend that you should read The Survivor (Mitch Rapp, #14) by Vince Flynn, Kyle Mills, Armand Schultz!


In [82]:
book_recommendation_system(number_of_similar_quantile = 0.95)

Getting recommendations...

Wow! There are 3826 similar readers to you...

...who have read a total of 447984 books!

The most read new book is: Gone Girl

If you want to read the highest rated book by your similar users, you should read Lonesome Dove by Larry McMurtry

Taking into account all factors, we recommend that you should read Radiant Angel (John Corey, #7) by Nelson DeMille!


In [83]:
book_recommendation_system(new_author_only = 'Yes', number_of_similar_quantile = 0.95, rec_weight_avg_rating = 0.6, 
                           rec_weight_read_ratio = 0.3, rec_weight_perc_4_5 = 0.1, rec_weight_perc_1_2 = 0, 
                           rec_weight_author_previous = 0)

Getting recommendations...

Wow! There are 3826 similar readers to you...

...who have read a total of 447984 books!

The most read new book is: Gone Girl

If you want to read the highest rated book by your similar users, you should read Lonesome Dove by Larry McMurtry

Taking into account all factors, we recommend that you should read Descent by Tim Johnston!
