<h2>Model Implementation</h2>

What I want from this notebook:
- The ability to further evaluate the model subjectively
- Generate recommendations for users in the dataset
- Generate recommendations from new data
- Implement all of this for both the fast and the performance models
- Compare book recommendations for the fast, performance, and baseline models

In [106]:
import pandas as pd
import numpy as np
import time
import random

from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
# From the parameter tuning notebook we need: 1. the averaged ratings_df, 2. df_books with clusters but without the vectors

In [4]:
# SVD hyper-parameters. Higher numbers improve accuracy but increase runtime.
# Note: higher factors and epochs don't improve RMSE much, but seem to diversify
# (and therefore improve) recommendations anecdotally.
SVD_FACTORS, SVD_EPOCHS = 60, 20

# Load the data produced by the parameter tuning notebook.
# The rest of this notebook reads User / ISBN / Cluster / Rating from df_ratings
# and ISBN / Title / Cluster / PopularityScore from df_books.
df_books = pd.read_csv('/DataScience/Final Capstone Files/ModelBooks.csv')
df_ratings = pd.read_csv('/DataScience/Final Capstone Files/ModelRatings.csv')

In [112]:
def get_top_isbn(cluster, n):
    """Return the ISBN of the n-th most popular book in a cluster.

    Books are the rows of ``df_books`` whose ``Cluster`` equals ``cluster``,
    ranked by ``PopularityScore`` from highest to lowest. Ranking is done on
    the rows themselves (stable sort), so books with equal scores occupy
    distinct consecutive ranks instead of all resolving to the same ISBN.

    Args:
        cluster: Cluster id to look up in ``df_books.Cluster``.
        n: 1-based rank inside the cluster (1 = most popular).

    Returns:
        The ISBN at rank ``n``, or ``False`` when ``n`` is smaller than 1 or
        larger than the number of books in the cluster.
    """
    # Filter once and reuse; the original filtered the frame three times.
    cluster_books = df_books[df_books.Cluster == cluster]
    if n < 1 or n > len(cluster_books):
        return False
    # mergesort is stable, which keeps the order of tied scores deterministic.
    ranked = cluster_books.sort_values('PopularityScore', ascending=False, kind='mergesort')
    return ranked.ISBN.iloc[n - 1]

def get_titles(isbn_list):
    """Map each ISBN in ``isbn_list`` to its title in ``df_books``.

    ``isbn_list`` must be an iterable of ISBNs; wrap a single ISBN string in
    a list before passing it. The first matching row of ``df_books`` supplies
    the title, and the result keeps the input order.
    """
    return [
        df_books.loc[df_books['ISBN'] == isbn, 'Title'].iloc[0]
        for isbn in isbn_list
    ]

def recommend_books_for_userid(userid, top_n):
    """Recommend up to ``top_n`` books for a user already present in ``df_ratings``.

    An SVD model is fitted on the (User, Cluster, Rating) triples of
    ``df_ratings`` on every call. Clusters are then ranked by the rating the
    model predicts for ``userid`` and, walking down that ranking, the most
    popular book of each cluster that the user has not rated yet is taken
    until ``top_n`` books have been collected.

    Args:
        userid: Raw user id as it appears in ``df_ratings.User``.
        top_n: Number of recommendations wanted.

    Returns:
        A list of ISBNs, best predicted cluster first. It is shorter than
        ``top_n`` only when there are not enough clusters with an unread book.
    """
    svd = SVD(n_factors=SVD_FACTORS, n_epochs=SVD_EPOCHS)
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings[['User', 'Cluster', 'Rating']], reader)
    svd.fit(data.build_full_trainset())

    # Score the cluster ids that actually occur in the data rather than
    # assuming they are exactly 0..k-1.
    clusters = sorted(df_ratings.Cluster.unique())
    predicted = {cluster: svd.predict(userid, cluster).est for cluster in clusters}
    # sorted() is stable, so equally rated clusters keep ascending-id order.
    ranked_clusters = sorted(clusters, key=predicted.get, reverse=True)

    # Build a set of values: `x in Series` tests the index labels, not the
    # ISBNs, so the Series itself cannot be used for the membership check.
    already_read = set(df_ratings.ISBN[df_ratings.User == userid])

    recommendations = []
    for cluster in ranked_clusters:
        if len(recommendations) >= top_n:
            break
        n = 1
        while True:
            rec = get_top_isbn(cluster, n)
            if rec is False:
                # Every book of this cluster has been read (or the cluster has
                # no books); move on instead of recommending `False`.
                break
            if rec not in already_read:
                recommendations.append(rec)
                break
            n += 1
    return recommendations

def recommend_books_for_new_data(isbn_list, rating_list, top_n):
    """Recommend up to ``top_n`` books for a person who is not in ``df_ratings``.

    The supplied ratings are attached to a temporary user id (-1), appended to
    the known ratings, and an SVD model is fitted on the combined data on every
    call. Clusters are ranked by the rating predicted for that temporary user
    and, walking down the ranking, the most popular book of each cluster that
    is not in ``isbn_list`` is taken until ``top_n`` books are collected.
    The elapsed wall-clock time is printed before returning.

    Args:
        isbn_list: ISBNs the person has rated; each must exist in ``df_books``.
        rating_list: Ratings on the 1-10 scale, in the same order as
            ``isbn_list``. Numeric strings are accepted.
        top_n: Number of recommendations wanted.

    Returns:
        A list of ISBNs, best predicted cluster first. It is shorter than
        ``top_n`` only when there are not enough clusters with an unread book.

    Raises:
        IndexError: If an ISBN in ``isbn_list`` is not present in ``df_books``.
        ValueError: If the two input lists differ in length or a rating is not
            numeric.
    """
    start = time.time()
    new_user_id = -1  # Assumed not to collide with any real id in df_ratings.User.

    # Translate each rated ISBN into the cluster it belongs to.
    cluster_list = [df_books.Cluster[df_books.ISBN == isbn].iloc[0] for isbn in isbn_list]

    known_ratings = df_ratings[['User', 'Cluster', 'Rating']]
    if cluster_list:
        new_user = pd.DataFrame({
            'User': [new_user_id] * len(cluster_list),
            'Cluster': cluster_list,
            'Rating': [float(rating) for rating in rating_list],
        })
        train_set = pd.concat([known_ratings, new_user], ignore_index=True)
    else:
        # Nothing to add; skipping the concat avoids dtype changes caused by
        # merging with an empty frame.
        train_set = known_ratings

    reader = Reader(rating_scale=(1, 10))
    svd = SVD(n_factors=SVD_FACTORS, n_epochs=SVD_EPOCHS)
    data = Dataset.load_from_df(train_set, reader)
    svd.fit(data.build_full_trainset())

    # Score the cluster ids that actually occur in the training data rather
    # than assuming they are exactly 0..k-1.
    candidate_clusters = sorted(set(df_ratings.Cluster.unique()) | set(cluster_list))
    predicted = {
        cluster: svd.predict(new_user_id, cluster, verbose=False).est
        for cluster in candidate_clusters
    }
    # sorted() is stable, so equally rated clusters keep ascending-id order.
    ranked_clusters = sorted(candidate_clusters, key=predicted.get, reverse=True)

    already_read = set(isbn_list)
    recommendations = []
    for cluster in ranked_clusters:
        if len(recommendations) >= top_n:
            break
        n = 1
        while True:
            rec = get_top_isbn(cluster, n)
            if rec is False:
                # Cluster exhausted (or empty): skip it instead of
                # recommending `False`.
                break
            if rec not in already_read:
                recommendations.append(rec)
                break
            n += 1

    print('Runtime:', time.time() - start)

    return recommendations

def input_ratings(max_ratings, min_book_ratings=30, top_n=3):
    """Interactively collect book ratings and print recommendations.

    Randomly chosen books from ``df_books`` that have at least
    ``min_book_ratings`` rows in ``df_ratings`` are shown one at a time. The
    answers '1' to '10' are recorded as ratings, 'x' stops the questioning
    early, and anything else counts as "not read". Questioning also stops once
    ``max_ratings`` ratings are collected or no eligible book is left. The
    titles of the resulting recommendations are printed; nothing is returned.

    Args:
        max_ratings: Maximum number of ratings to collect.
        min_book_ratings: Minimum number of existing ratings a book needs to
            be offered for rating.
        top_n: Number of recommendations to print.
    """
    print("Please rate books 1 - 10, press x to begin recommending early, any other input means not read")
    valid_ratings = {str(score) for score in range(1, 11)}

    # Count ratings per ISBN once instead of rescanning df_ratings per draw.
    rating_counts = df_ratings.ISBN.value_counts()
    well_rated = set(rating_counts.index[rating_counts >= min_book_ratings])
    candidates = [isbn for isbn in dict.fromkeys(df_books.ISBN) if isbn in well_rated]
    # Shuffling and walking the list asks each book at most once and always
    # terminates, unlike rejection sampling when eligible books run out.
    random.shuffle(candidates)

    isbns, ratings = [], []
    for book in candidates:
        if len(isbns) >= max_ratings:
            break
        user_input = input('{} '.format(get_titles([book])[0])).strip()
        if user_input in valid_ratings:
            isbns.append(book)
            ratings.append(int(user_input))  # Store numbers, not strings.
        elif user_input.lower() == 'x':
            break
    print(get_titles(recommend_books_for_new_data(isbns, ratings, top_n)))

In [32]:
# Title of the most popular book (rank 1) in cluster 2.
get_titles([get_top_isbn(cluster=2, n=1)])

['Family: The Ties That Bind and Gag!']

In [83]:
# Titles of three recommendations for the user at position 39 of the unique-user array.
get_titles(
    recommend_books_for_userid(userid=df_ratings['User'].unique()[39], top_n=3)
)

['A Prayer for Owen Meany',
 'Maus a Survivors Tale: My Father Bleeds History',
 'A Wrinkle In Time']

In [81]:
# Books rated by the user at position 37 of the unique-user array.
# NOTE(review): the recommendation cell above queries position 39, not 37 - confirm which user was intended.
df_books.loc[
    df_books['ISBN'].isin(
        df_ratings.loc[df_ratings['User'] == df_ratings['User'].unique()[37], 'ISBN']
    )
]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,AverageRating,NumberRatings,PopularityScore,Cluster
189,0316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",The spirit of fourteen-year-old Susie Salmon d...,8.185290,707,3.664922,112
212,0385503857,Oryx and Crake,Margaret Atwood,2003,Nan A. Talese,A stunning and provocative new novel by the in...,8.375000,32,2.593408,179
327,0140293248,The Girls' Guide to Hunting and Fishing,Melissa Bank,2000,Penguin Books,Hailed by critics as the debut of a major lite...,6.909091,165,-3.664076,101
332,015600710X,Strange Fits of Passion: A Novel,Anita Shreve,1999,Harvest Books,The reader is left to uncover the truth in thi...,8.153846,26,1.717491,247
347,0385505833,Skipping Christmas,JOHN GRISHAM,2001,Doubleday,Imagine a year without Christmas. No crowded m...,7.441667,120,-0.885848,390
403,015216250X,So You Want to Be a Wizard: The First Book in ...,Diane Duane,2001,Magic Carpet Books,Something stopped Nita's hand as it ran along ...,7.823529,17,0.557658,45
451,0394820371,The Phantom Tollbooth,Norton Juster,1993,Yearling Books,"Hailed as “a classic. . . . humorous, full of ...",8.551724,58,3.756005,297
477,0345443284,While I Was Gone,Sue Miller,1999,Ballantine Books,Jo Becker has every reason to be content. She ...,7.021739,138,-2.980798,509
526,0380789035,American Gods,Neil Gaiman,2002,HarperTorch,Shadow is a man with a past. But now he wants ...,8.013245,151,1.939402,20
528,0060964049,"Tales of the City (Tales of the City Series, V...",Armistead Maupin,1989,Perennial,"San Francisco, 1976. A naïve young secretary, ...",8.146341,41,1.929724,391


In [86]:
# Every book assigned to cluster 5.
df_books.loc[df_books['Cluster'] == 5]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,AverageRating,NumberRatings,PopularityScore,Cluster
37,8445071408,El Senor De Los Anillos: LA Comunidad Del Anil...,J. R. R. Tolkien,2001,Minotauro,"En la adormecida e idílica Comarca, un joven h...",8.076923,13,1.154798,5
928,844507119X,Fahrenheit 451 - T.D. -,Ray Bradbury,1997,Minotauro,"Guy Montag es un bombero, y el trabajo de un b...",8.333333,6,1.266116,5
1521,8472236552,UN Viejo Que Leia Novelas De Amor/the Old Men ...,Luis Sepulveda,1993,Tusquets Editores,"Antonio José Bolívar Proaño vive en El Idilio,...",7.941176,17,0.890977,5
1718,8433969978,El Libro de Las Ilusiones,Paul Auster,2003,Anagrama,"David Zimmer, un escritor y profesor de litera...",8.454545,11,1.985085,5
1719,950491036X,La Sombra del Viento,Carlos Ruiz Zafon,2003,Planeta,"Un amanecer de 1945, un muchacho es conducido ...",9.555556,9,4.238128,5
1840,8408043641,La Sombra Del Viento/The Shadow Of The Wind,Carlos Ruiz Zafon,2003,Planeta Pub Corp,"Un amanecer de 1945, un muchacho es conducido ...",7.9,20,0.818732,5
2149,846630679X,La caverna = A caverna,Jose Saramago,2002,Punto de Lectura,"Una pequeña alfarería, un centro comercial gig...",8.307692,13,1.746709,5
2575,8420633127,Ficciones,Jorge Luis Borges,1997,Alianza Editorial,es quizás el libro más famoso de Jorge Luis B...,8.6,5,1.566465,5
3556,8472237966,Historia De Una Gaviota Y Del Gato (Andanzas),Luis Sepulveda,2002,Tusquets Editores,"Luis Sepúlveda, a quien el público de lengua e...",8.222222,9,1.308495,5
4227,0345310020,Chronicle of a Death Foretold,GABRIEL GARCIA MARQUEZ,1984,Ballantine Books,Acaso sea 'Crónica de una muerte anunciada' la...,8.1875,16,1.554866,5


In [102]:
# New reader who gives four cluster-5 books the lowest rating (1): show three recommendations.
get_titles(
    recommend_books_for_new_data(
        isbn_list=['8445071408', '8472236552', '8408043641', '950491036X'],
        rating_list=[1, 1, 1, 1],
        top_n=3,
    )
)

Runtime: 1.398488998413086


['Maus a Survivors Tale: My Father Bleeds History',
 'The Diamond Age',
 'A Suitable Boy : Novel, A']

In [101]:
# Same four cluster-5 books, now given the highest rating (10): show three recommendations.
get_titles(
    recommend_books_for_new_data(
        isbn_list=['8445071408', '8472236552', '8408043641', '950491036X'],
        rating_list=[10, 10, 10, 10],
        top_n=3,
    )
)

Runtime: 1.499297857284546


['El principito (Spanish)',
 'The Return of the King (The Lord of the Rings, Part 3)',
 '100 Selected Poems by E. E. Cummings']

In [96]:
# Look up the catalogue row(s) whose title is exactly 'La Sombra del Viento'.
df_books.loc[df_books['Title'] == 'La Sombra del Viento']

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,AverageRating,NumberRatings,PopularityScore,Cluster
1719,950491036X,La Sombra del Viento,Carlos Ruiz Zafon,2003,Planeta,"Un amanecer de 1945, un muchacho es conducido ...",9.555556,9,4.238128,5


In [111]:
# Interactive demo: collect at most three ratings from the keyboard, then print recommendations.
input_ratings(max_ratings=3)

Please rate books 1 - 10, press x to begin recommending early, any other input means not read
Dude, Where's My Country? x
Runtime: 1.3763458728790283
['The Return of the King (The Lord of the Rings, Part 3)', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', 'Seabiscuit: An American Legend']
