<h2> Model Implementation

In [2]:
import pandas as pd
import numpy as np
import time
import random

from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [3]:
# Load the data
df_ratings = pd.read_csv('/DataScience/Final Capstone Files/ModelRatings.csv')
df_books = pd.read_csv('/DataScience/Final Capstone Files/ModelBooks.csv')

# Global Parameters
SVD_FACTORS = 60
SVD_EPOCHS = 20

Higher numbers improve accuracy but increase runtime. Higher factors and epochs don't improve RMSE much, but seem to diversify (and therefore improve) recommendations anecdotally.

In [13]:
def get_top_isbn(cluster, n):
    # Returns the nth best ISBN from a given cluster, and returns false if n > number of isbns in cluster
    if n > len(df_books[df_books.Cluster == cluster]):
        return False
    sorted_scores = sorted(df_books.PopularityScore[df_books.Cluster == cluster])
    return df_books.ISBN[(df_books.PopularityScore == sorted_scores[-n]) & (df_books.Cluster == cluster)].iloc[0]

def get_titles(isbn_list):
    # Converts a list of ISBNs into their titles. ISBNs need to be in list form, so a singular string
    # of an ISBN should be put into brackets.
    title = []
    for isbn in isbn_list:
        title.append(df_books.Title[df_books.ISBN == isbn].iloc[0])
    return title

def recommend_books_for_userid(userid, top_n):
    # Recommend the top n books for a user in the database.
    # Returns the singular top ISBN for the top n clusters. if they're already read, 
    # return the next best in that cluster.
    cluster_ratings, tops, recommendations = [], [], []
    svd = SVD(n_factors = SVD_FACTORS, n_epochs=SVD_EPOCHS)
    reader = Reader(rating_scale=(1,10))
    data = Dataset.load_from_df(df_ratings[['User', 'Cluster', 'Rating']], reader)
    svd.fit(data.build_full_trainset())
        
    for cluster in range(len(df_ratings.Cluster.unique())):
        cluster_ratings.append(svd.predict(userid, cluster)[3])
    
    while len(tops) < top_n:
        for i in range(len(cluster_ratings)):
            if cluster_ratings[i] == max(cluster_ratings):
                tops.append(i)
                cluster_ratings[i] = 0
                break
                
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, n)
            if rec not in df_ratings.ISBN[df_ratings.User == userid]:
                recommendations.append(rec)
                go = False
            n += 1
    return recommendations

def recommend_books_for_new_data(isbn_list, rating_list, top_n):
    # Takes as inputs a list of ISBNs a second list of ordered ratings, and returns the top n ISBNs.    
    cluster_list = []
    for isbn in isbn_list:
        cluster_list.append(df_books.Cluster[df_books.ISBN == isbn].iloc[0]) # Convert isbn_list to clusters
    new_user = pd.DataFrame()
    new_user['User'] = [-1] * len(cluster_list) # Set userid as -1 becuase that isn't in our data
    new_user['Cluster'] = cluster_list
    new_user['Rating'] = rating_list
    
    reader = Reader(rating_scale=(1,10))
    svd = SVD(n_factors=SVD_FACTORS, n_epochs=SVD_EPOCHS)
    train_set = pd.concat([df_ratings[['User', 'Cluster', 'Rating']], new_user])
    train_set.reset_index(drop=True, inplace=True)
    data = Dataset.load_from_df(train_set, reader)
    
    svd.fit(data.build_full_trainset())
    cluster_preds = [] # Gets populated with SVD predictions for each cluster
    for cluster in range(len(train_set.Cluster.unique())):
        cluster_preds.append(svd.predict(-1, cluster, verbose=False)[3])
    
    tops = [] # Finds the highest rated n clusters according to SVD
    while len(tops) < top_n:
        for pred in range(len(cluster_preds)):
            if cluster_preds[pred] == max(cluster_preds):
                tops.append(pred)
                cluster_preds[pred] = 0
                break
    
    recommendations = [] # For each of the clusters, find the best book that hasn't been read yet.
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, n)
            if rec not in isbn_list:
                recommendations.append(rec)
                go = False
            n += 1           
    return recommendations

def input_ratings(max_ratings):
    # This function is a user facing way to gather books they have read and what they rate them in order
    # to generate recommendations.
    print("Please rate books 1 - 10, press x to begin recommending early, any other input means not read")
    isbns, ratings, asked_books, go = [], [], [], True
    while (len(isbns) < max_ratings) & (go == True):
        book = -1
        while(len(df_ratings[df_ratings.ISBN == book]) < 30) or (book in asked_books):
            book = random.choice(df_books.ISBN)
        asked_books.append(book)
        user_input = input('{} '.format(get_titles([book])[0]))
        if user_input in ['1','2','3','4','5','6','7','8','9','10']:
            isbns.append(book)
            ratings.append(user_input)
        elif user_input == 'x':
            go = False
    print(get_titles(recommend_books_for_new_data(isbns, ratings, 3)))

<h2> Examples for each of the 5 functions

In [6]:
print([get_top_isbn(2, 1)])
print([get_top_isbn(32, 1)])
print([get_top_isbn(0, 5)])

['0070064601']
['0152050167']
['0446365505']


These 3 ISBNs are: 
1. The best book from the 2nd cluster
2. The best book from the 32nd cluster
3. The 5th best book from the 0th cluster.

In [8]:
print(get_titles([get_top_isbn(2, 1)]))
print(get_titles([get_top_isbn(32, 1)]))
print(get_titles([get_top_isbn(0, 5)]))

['Family: The Ties That Bind and Gag!']
['The Whale Rider']
['Pleading Guilty']


The 'get_titles' function converts ISBNs into titles

In [10]:
print(recommend_books_for_userid(df_ratings.User.unique()[37], 5))
print(get_titles(recommend_books_for_userid(df_ratings.User.unique()[37], 5)))

['0345339738', '0060938455', '950491036X', '0394404289', '0440498058']
['To Kill a Mockingbird', 'Night', 'Six by Seuss: A Treasury of Dr. Seuss Classics', 'Griffin &amp; Sabine: An Extraordinary Correspondence', 'Seabiscuit: An American Legend']


The top 5 recommended books for the 37th user in our data. Note that none of these books have been previously rated by this user. The recommender always returns new books.

In [18]:
isbns = ['8445071408', '8472236552', '8408043641', '950491036X']
ratings = [10, 10, 10, 10]
print('New Ratings:')
print(get_titles(isbns), '\n')
print('My Recommendations:')
print(get_titles(recommend_books_for_new_data(isbns, ratings, 5)))

New Ratings:
['El Senor De Los Anillos: LA Comunidad Del Anillo (Lord of the Rings (Spanish))', 'UN Viejo Que Leia Novelas De Amor/the Old Men Who Read Love Stories (ColecciÃ³n Andanzas)', 'La Sombra Del Viento/The Shadow Of The Wind', 'La Sombra del Viento'] 

My Recommendations:
['El principito (Spanish)', '100 Selected Poems by E. E. Cummings', 'The Return of the King (The Lord of the Rings, Part 3)', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', "I Know This Much Is True (Oprah's Book Club)"]


I tried entering 4 books in Spanish and rating them 10/10 and the recommender system returns firstly another book in Spanish! It only returns 1 Spanish book because there is only 1 cluster of books in Spanish. The other recommendations must come from clusters that are aggreable to Spanish speakers according to my data.

In [20]:
isbns = ['8445071408', '8472236552', '8408043641', '950491036X']
ratings = [1, 1, 1, 1]
print('New Ratings:')
print(get_titles(isbns), '\n')
print('My Recommendations:')
print(get_titles(recommend_books_for_new_data(isbns, ratings, 5)))

New Ratings:
['El Senor De Los Anillos: LA Comunidad Del Anillo (Lord of the Rings (Spanish))', 'UN Viejo Que Leia Novelas De Amor/the Old Men Who Read Love Stories (ColecciÃ³n Andanzas)', 'La Sombra Del Viento/The Shadow Of The Wind', 'La Sombra del Viento'] 

My Recommendations:
['The Meaning Of Life', 'Traveling Light: Releasing the Burdens You Were Never Intended to Bear', "Ender's Game (Ender Wiggins Saga (Paperback))", 'The Two Towers (The Lord of the Rings, Part 2)', 'The Joy Luck Club']


The same thing, but I rated the 4 Spanish books a 1/10 to see what would happen. It did not recommend any spanish books... But seems to recommend Lord of the Rings very highly regardless of the users taste in Spanish speaking books.

The last function is a user-interactive function that must be used, not shown.