In [1]:
# Load the raw CSV tables, the saved embedding models, and the preprocessed string features
from tensorflow import keras
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Raw CSV tables: read in the listed order and unpacked into one frame per file.
ratings_df, to_read_df, books_df, tags_df, book_tags_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ratings.csv',
        'data/to_read.csv',
        'data/books.csv',
        'data/tags.csv',
        'data/book_tags.csv',
    )
)

# Saved Keras embedding models, loaded from the working directory.
nn_embedding_1, nn_embedding_2 = (
    keras.models.load_model(model_path)
    for model_path in (
        "neural_networks_embedding_1.h5",
        "neural_networks_embedding_2.h5",
    )
)

# Preprocessed per-book text used for free-text similarity scoring.
combined_string_features_df = pd.read_csv('combined_string_features_df.csv')

In [2]:
# Here we build the features that can be matched exactly against the user's query string.

In [3]:
# Enrich the raw book/tag pairs in one chained expression:
#   1. attach each tag's name via tag_id;
#   2. attach the catalogue book_id via goodreads_book_id (inner join, so
#      pairs without a matching row in books_df are dropped).
goodreads_to_book_id = books_df[['book_id','goodreads_book_id']]
book_tags_df = (
    book_tags_df
    .merge(tags_df, on = 'tag_id')
    .merge(goodreads_to_book_id, on = 'goodreads_book_id', how = 'inner')
)
book_tags_df.head()

Unnamed: 0,goodreads_book_id,tag_id,count,tag_name,book_id
0,1,30574,167697,to-read,27
1,1,11305,37174,fantasy,27
2,1,11557,34173,favorites,27
3,1,8717,12986,currently-reading,27
4,1,33114,12716,young-adult,27


In [4]:
# Columns of books_df that a query string can be compared against verbatim,
# plus book_id so a match can be mapped back to a book.
exact_match_columns = [
    'book_id',
    'isbn',
    'isbn13',
    'authors',
    'original_publication_year',
    'original_title',
    'title',
    'language_code',
]
books_exact_match_df = books_df.loc[:, exact_match_columns]
books_exact_match_df

Unnamed: 0,book_id,isbn,isbn13,authors,original_publication_year,original_title,title,language_code
0,1,439023483,9.780439e+12,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng
1,2,439554934,9.780440e+12,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,Harry Potter and the Sorcerer's Stone (Harry P...,eng
2,3,316015849,9.780316e+12,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US
3,4,61120081,9.780061e+12,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng
4,5,743273567,9.780743e+12,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng
...,...,...,...,...,...,...,...,...
9995,9996,441019455,9.780441e+12,Ilona Andrews,2010.0,Bayou Moon,"Bayou Moon (The Edge, #2)",eng
9996,9997,067973371X,9.780680e+12,Robert A. Caro,1990.0,Means of Ascent,"Means of Ascent (The Years of Lyndon Johnson, #2)",eng
9997,9998,039330762X,9.780393e+12,Patrick O'Brian,1977.0,The Mauritius Command,The Mauritius Command,eng
9998,9999,61711527,9.780062e+12,Peggy Orenstein,2011.0,Cinderella Ate My Daughter: Dispatches from th...,Cinderella Ate My Daughter: Dispatches from th...,eng


In [5]:
def _distinct_non_null(column):
    """Return the set of distinct non-null values held in a pandas Series."""
    return set(column.dropna().unique())


# One lookup set per exact-match feature; membership tests against these sets
# decide whether a query string names a known tag, ISBN, author, and so on.
past_seen_tags = _distinct_non_null(book_tags_df['tag_name'])

past_seen_isbn = _distinct_non_null(books_exact_match_df['isbn'])

# NOTE: isbn13 is displayed as a float column (e.g. 9.780439e+12), so this set
# holds floats rather than strings.
past_seen_isbn13 = _distinct_non_null(books_exact_match_df['isbn13'])

past_seen_authors = _distinct_non_null(books_exact_match_df['authors'])

# Publication years are stored as floats (e.g. 2008.0). Calling str() on them
# directly would yield '2008.0', which never equals a typed year such as
# '2008', so each year is converted to a whole number before being stringified.
past_seen_original_publication_year = {
    str(int(year))
    for year in _distinct_non_null(books_exact_match_df['original_publication_year'])
}

past_seen_original_title = _distinct_non_null(books_exact_match_df['original_title'])

past_seen_title = _distinct_non_null(books_exact_match_df['title'])

past_seen_language_code = _distinct_non_null(books_exact_match_df['language_code'])

<font size="5">Driver function for the recommender system</font>

In [6]:
def _parse_whole_number(text):
    """Return ``text`` as an int when it spells a finite whole number, else None.

    Both integer spellings ('2008') and float spellings ('2008.0') are accepted.
    """
    import math

    try:
        value = float(text)
    except (TypeError, ValueError):
        return None
    if not math.isfinite(value) or not value.is_integer():
        return None
    return int(value)


def _query_string_similarity(query_string, book_ids):
    """Score every preprocessed book string against the query with TF-IDF.

    Returns a DataFrame with columns ``string_similarity`` (min-max scaled to
    the range 0..3) and ``book_id``. ``book_ids`` must have one entry per row
    of ``combined_string_features_df``, in the same order.
    """
    vect = TfidfVectorizer(min_df=1, stop_words="english")
    corpus = [query_string] + combined_string_features_df.combined_string_features.tolist()
    tfidf = vect.fit_transform(corpus)

    # Only the query row (row 0) against the candidate rows is needed, so the
    # full dense N x N pairwise matrix is never materialised.
    similarity_to_query = (tfidf[:1] @ tfidf[1:].T).toarray().ravel()

    # scale the string similarity measure
    scaler = MinMaxScaler(feature_range = (0,3))
    scaled_similarity = scaler.fit_transform(similarity_to_query.reshape(-1,1)).ravel()

    return pd.DataFrame({
        'string_similarity': scaled_similarity,
        'book_id': book_ids,
    })


def book_recommender_prototype(query_user_id,query_string,number_of_recommendation):
    """Recommend books the user has not rated yet, guided by a query string.

    The query is first tested for an exact match against tag name, ISBN,
    ISBN-13, authors, original publication year, original title, title and
    language code, in that order. When several features match, the last one
    in that order decides the candidate books. If anything matched, the
    candidates are ranked by the mean of the two embedding networks'
    predicted ratings. Otherwise every book is a candidate and the mean also
    includes a TF-IDF similarity between the query and each book's
    preprocessed string features.

    Parameters
    ----------
    query_user_id : user id passed to both embedding networks and used to
        look up the books this user has already rated in ``ratings_df``.
    query_string : str, the user's query.
    number_of_recommendation : int, maximum number of titles to return.

    Returns
    -------
    list of str
        ``original_title`` values, best first, skipping books the user has
        rated and books without a string ``original_title``.
    """
    all_potential_book_id = None
    found_some_exact_match_feature = False

    # ISBN-13 and publication year are numeric columns, so they are compared
    # as numbers; a raw string comparison against them can never match.
    numeric_query = _parse_whole_number(query_string)

    # (frame, column, known values). ``None`` as the known-value set marks a
    # numeric column. Order matters: a later match overrides an earlier one.
    exact_match_features = (
        (book_tags_df, 'tag_name', past_seen_tags),
        (books_exact_match_df, 'isbn', past_seen_isbn),
        (books_exact_match_df, 'isbn13', None),
        (books_exact_match_df, 'authors', past_seen_authors),
        (books_exact_match_df, 'original_publication_year', None),
        (books_exact_match_df, 'original_title', past_seen_original_title),
        (books_exact_match_df, 'title', past_seen_title),
        (books_exact_match_df, 'language_code', past_seen_language_code),
    )
    for feature_df, column, known_values in exact_match_features:
        if known_values is None:
            if numeric_query is None:
                continue
            matches = pd.to_numeric(feature_df[column], errors='coerce') == numeric_query
        else:
            if query_string not in known_values:
                continue
            matches = feature_df[column] == query_string
        if matches.any():
            all_potential_book_id = feature_df.loc[matches, 'book_id']
            found_some_exact_match_feature = True

    if found_some_exact_match_feature:
        # De-duplicate BEFORE sizing the user_id column: a match can list the
        # same book more than once, and mismatched column lengths would raise.
        candidate_book_ids = all_potential_book_id.unique()
    else:
        # No exact match: fall back to scoring every book by string similarity.
        # NOTE(review): assumes book ids are exactly 1..10000 and row-aligned
        # with combined_string_features_df -- confirm against the data.
        candidate_book_ids = list(range(1,10001))
        string_similarity_df = _query_string_similarity(query_string, candidate_book_ids)

    # we now ask the neural networks to predict
    # how the query user would rate those candidate book_ids
    neural_nets_query_df = pd.DataFrame({
        'user_id': [query_user_id]*len(candidate_book_ids),
        'book_id': candidate_book_ids,
    })
    model_inputs = [neural_nets_query_df['user_id'], neural_nets_query_df['book_id']]
    potential_book_recommendation_rating1 = nn_embedding_1.predict(model_inputs)
    potential_book_recommendation_rating2 = nn_embedding_2.predict(model_inputs)

    # average the neural network ratings if there is an exact string feature match
    # otherwise, average the string similarity along with the two neural network ratings
    potential_book_recommendation_df = pd.DataFrame({
        'book_id': neural_nets_query_df['book_id'],
        'neural_nets_rating1': potential_book_recommendation_rating1[:,0],
        'neural_nets_rating2': potential_book_recommendation_rating2[:,0],
    })
    rating_columns = ['neural_nets_rating1','neural_nets_rating2']
    if not found_some_exact_match_feature:
        potential_book_recommendation_df = potential_book_recommendation_df.merge(string_similarity_df,on = 'book_id')
        rating_columns.append('string_similarity')
    potential_book_recommendation_df['overall_rating'] = potential_book_recommendation_df[rating_columns].mean(axis = 1)
    potential_book_recommendation_df = potential_book_recommendation_df.sort_values(by = ['overall_rating'],ascending = False)

    # lastly, we don't want to recommend a book that the user has already seen
    query_user_seen_books = ratings_df.loc[ratings_df['user_id'] == query_user_id, 'book_id']
    ranked_book_ids = potential_book_recommendation_df['book_id']
    unseen_book_ids = ranked_book_ids[~ranked_book_ids.isin(query_user_seen_books)]

    # Look all titles up in one pass (first row per book_id), keeping rank order.
    title_by_book_id = books_df.drop_duplicates(subset = 'book_id').set_index('book_id')['original_title']
    ranked_titles = title_by_book_id.reindex(unseen_book_ids.tolist()).tolist()

    # Missing titles come back as NaN (a float); only real strings are returned.
    output_book_title = [book_title for book_title in ranked_titles if isinstance(book_title, str)]

    return output_book_title[:number_of_recommendation]



<font size="5">Example Use Cases</font>

In [7]:
# Exact-match example: the query is an author name that appears verbatim in
# the books table's `authors` column, so only that author's books are candidates.
query_user_id = 99
query_string = 'F. Scott Fitzgerald'
number_of_recommendation = 5

book_recommender_prototype(query_user_id,query_string,number_of_recommendation)

['Tender Is the Night',
 'This Side of Paradise',
 'The Beautiful and Damned',
 'The Great Gatsby',
 'The Curious Case of Benjamin Button']

In [8]:
# Free-text example: presumably this sentence matches none of the exact-match
# features, in which case every book is scored by TF-IDF similarity to the
# query averaged with the two networks' predicted ratings.
query_user_id = 99
query_string = 'I want a book about love'
number_of_recommendation = 5

book_recommender_prototype(query_user_id,query_string,number_of_recommendation)

['P.S. I Still Love You',
 'Endless Summer',
 'Accordance',
 'The Kiss of Deception',
 'Edenbrooke']

In [9]:
# Second free-text example for the same user, on a different topic
# (presumably again no exact-match feature applies).
query_user_id = 99
query_string = 'show me something interesting about philosophy'
number_of_recommendation = 5

book_recommender_prototype(query_user_id,query_string,number_of_recommendation)

['Philosophische Untersuchungen',
 'Logisch-Philosophische Abhandlung',
 'Sein und Zeit',
 'Die fröhliche Wissenschaft',
 'Phänomenologie des Geistes']

In [10]:
# Exact-match example on ISBN: the query equals the `isbn` value of a single
# book, so that book is the only candidate and at most one title is returned.
query_user_id = 99
query_string = '439554934' # ISBN of the book to look up
number_of_recommendation = 5

book_recommender_prototype(query_user_id,query_string,number_of_recommendation)

["Harry Potter and the Philosopher's Stone"]

In [11]:
# Same free-text query as an earlier example, but for a different user.
# The recommendations differ from user 99's, showing that the ranking is
# personalized by the networks' per-user rating predictions.

query_user_id = 999
query_string = 'I want a book about love'
number_of_recommendation = 5

book_recommender_prototype(query_user_id,query_string,number_of_recommendation)

['Lady Midnight',
 'P.S. I Still Love You',
 'Edenbrooke',
 'Me Before You',
 'Accordance']