<h2> Model Implementation

In [1]:
import pandas as pd
import numpy as np
import time
import random
from scipy.spatial.distance import cosine

from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Reader
from surprise import Dataset

In [146]:
# Load the data
df_ratings = pd.read_csv('/DataScience/Final Capstone Files/ModelRatings.csv')
df_books = pd.read_csv('/DataScience/Final Capstone Files/ModelBooks.csv')

# Global Parameters
SVD_FACTORS = 150
SVD_EPOCHS = 30

Higher numbers improve recommendations but increase runtime. Higher factors and epochs don't improve RMSE much, but seem to diversify (and therefore improve) recommendations anecdotally. My goal was to keep recommendation time under 5 seconds, but much faster times are possible.

In [138]:
def get_top_isbn(cluster, n):
    # Returns the nth best ISBN from a given cluster, and returns false if n > number of isbns in cluster
    if n > len(df_books[df_books.Cluster == cluster]):
        return False
    sorted_scores = sorted(df_books.PopularityScore[df_books.Cluster == cluster])
    return df_books.ISBN[(df_books.PopularityScore == sorted_scores[-n]) & (df_books.Cluster == cluster)].iloc[0]

def get_titles(isbn_list):
    # Converts a list of ISBNs into their titles. ISBNs need to be in list form, so a singular string
    # of an ISBN should be put into brackets.
    title = []
    for isbn in isbn_list:
        title.append(df_books.Title[df_books.ISBN == isbn].iloc[0])
    return title

def recommend_books_for_userid(userid, top_n):
    # Recommend the top n books for a user in the database.
    # Returns the singular top ISBN for the top n clusters. if they're already read, 
    # return the next best in that cluster.
    cluster_ratings, tops, recommendations = [], [], []
    svd = SVD(n_factors = SVD_FACTORS, n_epochs=SVD_EPOCHS)
    reader = Reader(rating_scale=(1,10))
    data = Dataset.load_from_df(df_ratings[['User', 'Cluster', 'Rating']], reader)
    svd.fit(data.build_full_trainset())
        
    for cluster in range(len(df_ratings.Cluster.unique())):
        cluster_ratings.append(svd.predict(userid, cluster)[3])
    
    while len(tops) < top_n:
        for i in range(len(cluster_ratings)):
            if cluster_ratings[i] == max(cluster_ratings):
                tops.append(i)
                cluster_ratings[i] = 0
                break
                
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, n)
            if rec not in df_ratings.ISBN[df_ratings.User == userid]:
                recommendations.append(rec)
                go = False
            n += 1
    return recommendations

def recommend_books_for_new_data(isbn_list, rating_list, top_n):
    # Takes as inputs a list of ISBNs a second list of ordered ratings, and returns the top n ISBNs.    
    cluster_list = []
    for isbn in isbn_list:
        cluster_list.append(df_books.Cluster[df_books.ISBN == isbn].iloc[0]) # Convert isbn_list to clusters
    new_user = pd.DataFrame()
    new_user['User'] = [-1] * len(cluster_list) # Set userid as -1 becuase that isn't in our data
    new_user['Cluster'] = cluster_list
    new_user['Rating'] = rating_list
    
    reader = Reader(rating_scale=(1,10))
    svd = SVD(n_factors=SVD_FACTORS, n_epochs=SVD_EPOCHS)
    train_set = pd.concat([df_ratings[['User', 'Cluster', 'Rating']], new_user])
    train_set.reset_index(drop=True, inplace=True)
    data = Dataset.load_from_df(train_set, reader)
    
    svd.fit(data.build_full_trainset())
    cluster_preds = [] # Gets populated with SVD predictions for each cluster
    for cluster in range(len(train_set.Cluster.unique())):
        cluster_preds.append(svd.predict(-1, cluster, verbose=False)[3])
    tops = [] # Finds the highest rated n clusters according to SVD
    while len(tops) < top_n:
        for pred in range(len(cluster_preds)):
            if cluster_preds[pred] == max(cluster_preds):
                tops.append(pred)
                cluster_preds[pred] = 0
                break
    
    recommendations = [] # For each of the clusters, find the best book that hasn't been read yet.
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, n)
            if rec not in isbn_list:
                recommendations.append(rec)
                go = False
            n += 1           
    return recommendations

def input_ratings(max_ratings):
    # This function is a user facing way to gather books they have read and what they rate them in order
    # to generate recommendations.
    print("Please rate books 1 - 10, press x to begin recommending early, any other input means not read")
    isbns, ratings, asked_books, go = [], [], [], True
    while (len(isbns) < max_ratings) & (go == True):
        book = -1
        while(len(df_ratings[df_ratings.ISBN == book]) < 30) or (book in asked_books):
            book = random.choice(df_books.ISBN)
        asked_books.append(book)
        user_input = input('{} '.format(get_titles([book])[0]))
        if user_input in ['1','2','3','4','5','6','7','8','9','10']:
            isbns.append(book)
            ratings.append(user_input)
        elif user_input == 'x':
            go = False
    print(get_titles(recommend_books_for_new_data(isbns, ratings, 3)))

<h2> Examples for each of the 5 functions

<h3> get_top_isbn </h3>

returns the nth best isbn from a given cluster

In [139]:
print([get_top_isbn(2, 1)])
print([get_top_isbn(32, 1)])
print([get_top_isbn(0, 5)])

['0070064601']
['0152050167']
['0446365505']


These 3 ISBNs are: 
1. The best book from the 2nd cluster
2. The best book from the 32nd cluster
3. The 5th best book from the 0th cluster.

<h3> get_titles </h3>

The 'get_titles' function converts ISBNs into titles:

In [140]:
print(get_titles([get_top_isbn(2, 1)]))
print(get_titles([get_top_isbn(32, 1)]))
print(get_titles([get_top_isbn(0, 5)]))

['Family: The Ties That Bind and Gag!']
['The Whale Rider']
['Pleading Guilty']


In [145]:
input_ratings(5)

Please rate books 1 - 10, press x to begin recommending early, any other input means not read
Unnatural Selections 
Johnny Got His Gun 
The Jury 
Letters for Emily 4
The Night Listener : A Novel 1
Dragon Tears 6
The Janson Directive 
Watership Down 2
The Renegades of Pern (Dragonriders of Pern (Paperback)) 
Black Friday 
Big Trouble 
Kushiel's Dart 
Vertical Run 
The Meaning Of Life 
Once a Thief 8
['The Da Vinci Code', 'Remember When (Roberts, Nora)', 'Johnny Got His Gun']


In [141]:
print(recommend_books_for_userid(df_ratings.User.unique()[37], 5))
print(get_titles(recommend_books_for_userid(df_ratings.User.unique()[37], 5)))

['0553272535', '0394404289', '067942895X', '0618002227', '0345339738']
['The Complete Collected Poems of Maya Angelou', 'The Giver (21st Century Reference)', 'Night', 'Little House in the Big Woods', "Harry Potter and the Sorcerer's Stone (Book 1)"]


The top 5 recommended books for the 37th user in our data. Note that none of these books have been previously rated by this user. The recommender always returns new books.

In [142]:
start = time.time()
isbns = ['8445071408', '8472236552', '8408043641', '950491036X']
ratings = [10, 10, 10, 10]
print('My Books:')
for title in range(len(isbns)):
    print(ratings[0], '/ 10', get_titles([isbns[title]])[0])
print('\nMachine Recommendations:')
for title in get_titles(recommend_books_for_new_data(isbns, ratings, 5)):
    print(title)
print('\nRuntime:', time.time() - start)

My Books:
10 / 10 El Senor De Los Anillos: LA Comunidad Del Anillo (Lord of the Rings (Spanish))
10 / 10 UN Viejo Que Leia Novelas De Amor/the Old Men Who Read Love Stories (ColecciÃ³n Andanzas)
10 / 10 La Sombra Del Viento/The Shadow Of The Wind
10 / 10 La Sombra del Viento

Machine Recommendations:
El principito (Spanish)
The Giving Tree
The Return of the King (The Lord of the Rings, Part 3)
I Know This Much Is True (Oprah's Book Club)
The Importance of Being Earnest

Runtime: 4.234344005584717


I tried entering 4 books in Spanish and rating them 10/10 and the recommender system returns firstly another book in Spanish! It only returns 1 Spanish book because there is only 1 cluster of books in Spanish. The other recommendations must come from clusters that are aggreable to Spanish speakers according to my data.

In [20]:
isbns = ['8445071408', '8472236552', '8408043641', '950491036X']
ratings = [1, 1, 1, 1]
print('New Ratings:')
print(get_titles(isbns), '\n')
print('My Recommendations:')
print(get_titles(recommend_books_for_new_data(isbns, ratings, 5)))

New Ratings:
['El Senor De Los Anillos: LA Comunidad Del Anillo (Lord of the Rings (Spanish))', 'UN Viejo Que Leia Novelas De Amor/the Old Men Who Read Love Stories (ColecciÃ³n Andanzas)', 'La Sombra Del Viento/The Shadow Of The Wind', 'La Sombra del Viento'] 

My Recommendations:
['The Meaning Of Life', 'Traveling Light: Releasing the Burdens You Were Never Intended to Bear', "Ender's Game (Ender Wiggins Saga (Paperback))", 'The Two Towers (The Lord of the Rings, Part 2)', 'The Joy Luck Club']


The same thing, but I rated the 4 Spanish books a 1/10 to see what would happen. It did not recommend any spanish books... But seems to recommend Lord of the Rings very highly regardless of the users taste in Spanish speaking books.

The last function is a user-interactive function that must be used, not shown.

In [49]:
start = time.time()
isbns = ['0451523415', '1573225487', '0312995423', '0440224675', '0743424425', '0590021117']
ratings = [10, 10, 1, 1, 1, 10]
print('My Books:')
for title in range(len(isbns)):
    print(ratings[title], '/ 10', get_titles([isbns[title]])[0])
print('\nMachine Recommendations:')
for title in get_titles(recommend_books_for_new_data(isbns, ratings, 5)):
    print(title)
print('\nRuntime:', time.time() - start)

My Books:
10 / 10 Little Women (Signet Classic)
10 / 10 The Romance Reader
1 / 10 Digital Fortress : A Thriller
1 / 10 Hannibal
1 / 10 The Shining
10 / 10 Little House On the Prairie

Machine Recommendations:
The Secret Life of Bees
On the Banks of Plum Creek
The Velveteen Rabbit
Harry Potter and the Sorcerer's Stone (Book 1)
\Surely You're Joking, Mr. Feynman!\": Adventures of a Curious Character"

Runtime: 2.4221839904785156


In [147]:
start = time.time()
isbns = ['0451523415', '1573225487', '0312995423', '0440224675', '0743424425', '0590021117']
ratings = [1, 1, 10, 10, 10, 1]
print('My Books:')
for title in range(len(isbns)):
    print(ratings[title], '/ 10', get_titles([isbns[title]])[0])
print('\nMachine Recommendations:')
for title in get_titles(recommend_books_for_new_data(isbns, ratings, 5)):
    print(title)
print('\nRuntime:', time.time() - start)

My Books:
1 / 10 Little Women (Signet Classic)
1 / 10 The Romance Reader
10 / 10 Digital Fortress : A Thriller
10 / 10 Hannibal
10 / 10 The Shining
1 / 10 Little House On the Prairie

Machine Recommendations:
Traveling Light: Releasing the Burdens You Were Never Intended to Bear
The Stand: The Complete &amp; Uncut Edition
Against All Enemies: Inside America's War on Terror
Classics of Western Literature: Bloom County 1986-1989
Good Grief : A Novel

Runtime: 3.9836130142211914


In [24]:
n = 5
df_books.iloc[n*50 : (n+1)*50]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,AverageRating,NumberRatings,PopularityScore,Cluster
250,0385721420,Three Junes,JULIA GLASS,2003,Anchor,"A luminous first novel, set in Greece, Scotlan...",7.333333,54,-1.170237,296
251,0451202341,Back Roads,Tawni O'Dell,2001,Signet Book,Meet Harley Altmyer. His mother's in prison fo...,7.27451,51,-1.384753,171
252,0743418190,In Her Shoes : A Novel,Jennifer Weiner,2002,Atria Books,The Feller sisters are equal but opposite. Mag...,7.68,25,0.171564,291
253,0385335407,The Kiss,Danielle Steel,2001,Delacorte Press,"In her 53rd bestselling novel, Danielle Steel ...",7.416667,12,-0.521915,144
254,0060923245,Sweet Hereafter Movie Tie-In : A Novel,Russell Banks,1992,Perennial,When fourteen children from the small town of ...,8.117647,17,1.390956,53
255,0060913509,In Country RI,Bobbie Ann Mason,1986,Perennial,"In the summer of 1984, the war in Vietnam come...",6.2,5,-2.296186,491
256,0553272837,Brazen Virtue,Nora Roberts,1996,Bantam Books,When a superstar mystery writer visits her sis...,7.103448,29,-1.761945,7
257,0440225078,Ghost Moon,Karen Robards,2001,Dell Publishing Company,"Nine years after leaving in disgrace, Olivia M...",7.888889,9,0.576087,310
258,0425181111,Strangers,Dean R. Koontz,2002,Berkley Publishing Group,Six strangers are unaccountably seized by nigh...,7.571429,28,-0.184178,360
259,0380732688,Milk and Honey (Peter Decker/Rina Lazarus Novels),Faye Kellerman,2002,Avon,In the silent pre-dawn city hours -- alone wit...,8.0,14,0.985159,137


In [42]:
df_books.Blurb[df_books.Title == 'Traveling Light: Releasing the Burdens You Were Never Intended to Bear'].iloc[0]

"Weary travelers. You've seen them -- everything they own crammed into their luggage. Staggering through terminals and hotel lobbies with overstuffed suitcases, trunks, duffels, and backpacks.,Backs ache. Feet burn. Eyelids droop.,We've all seen people like that.,At times, we are people like that -- if not with our physical luggage, then at least with our spiritual load.,We all lug loads we were never intended to carry. Fear. Worry. Discontent. No wonder we get so weary. We're worn out from carrying that excess baggage. Wouldn't it be nice to lose some of those bags?,That's the invitation of Max Lucado. With the Twenty-third Psalm as our guide, let's release some of the burdens we were never intended to bear.,Using these verses as a guide, Max Lucado walks us through a helpful inventory of our burdens. May God use this Psalm to remind you to release the burdens you were never meant to bear."

In [40]:
df_books[df_books.Cluster == 216]

Unnamed: 0,ISBN,Title,Author,Year,Publisher,Blurb,AverageRating,NumberRatings,PopularityScore,Cluster
110,0451186648,Silent Snow,Steve Thayer,2000,Signet Book,Rick Beanblossom is the Twin Cities' top inves...,7.692308,13,0.168279,216
190,0440224675,Hannibal,Thomas Harris,2000,Dell Publishing Company,You remember Hannibal Lecter: gentleman. geniu...,7.158416,101,-2.161190,216
204,0843136685,Sheldon &amp; Mrs. Levine: An Excruciating Cor...,Sam Bobrick,1994,Price Stern Sloan,"Humor, Letters sent from Mrs. Levine to Sheldo...",4.200000,5,-5.515062,216
1163,055357549X,Holy Fire: A Novel (Bantam Spectra Book),Bruce Sterling,1997,Bantam Books,"Bruce Sterling, named ""one of the best thinker...",8.111111,9,1.064359,216
1220,0425144569,For All Eternity,Linda Lael Miller,1994,Berkley Publishing Group,She is a creature of the night. Beautiful. Sed...,8.571429,7,1.838356,216
1494,0449219631,Jaws,Peter Benchley,1991,Fawcett Books,"Jaws,The suspense. The shocks. The shark. It's...",7.571429,7,-0.107554,216
1497,0451458125,"Fool Moon (The Dresden Files, Book 2)",Jim Butcher,2001,New American Library,Lost Items Found. Paranormal Investigations. C...,8.200000,10,1.320071,216
1602,0882893327,The Voodoo Queen: A Novel (Pelican Pouch Series),Robert Tallant,1983,Pelican Publishing Company,Witch? Sorceress? Daughter of Satan? Thief? Sa...,7.800000,5,0.278915,216
1766,0312924585,Silence of the Lambs,Thomas Harris,1991,St. Martin's Press,Hannibal Lecter. The ultimate villain of moder...,8.366071,112,3.488721,216
1831,0811825558,The Worst-Case Scenario Survival Handbook,Joshua Piven,1999,Chronicle Books,Danger! It lurks at every corner. Volcanoes. S...,7.700000,40,0.270393,216


<h2> Model Evaluation </h2>

MAP and MAR don't seem like good evaluation metrics here because there are 10,000 items in the dataset, and most of our users have not read a high enough percentage of the books.

Coverage: Our recommender system has 548 clusters and 10,000 items, making it's coverage %5.48...Except that if a user has already read a book it can recommend another so in another sense the coverage is % 100

In [133]:
SVD_FACTORS = 50
SVD_EPOCHS  = 12
num_users = len(df_ratings.User.unique())
recs_per_user = 20

print('Number of users in dataset:', num_users)
print('Total number of Clusters:', len(df_books.Cluster.unique()))

user_vectors = np.zeros((num_users, len(df_books)))
user_ids = df_ratings.User.unique()[:num_users]
unique_recommendations = []
for userid in range(num_users):
    recommendations = recommend_books_for_userid(user_ids[userid], recs_per_user)
    for rec in recommendations:
        if recnot in unique_recommendations:
            unique_recommendations.append(rec)
print('Total number of unique recommednations for {} users:'.format(num_users), len(unique_recommendations))
print('with {} rec per user'.format(recs_per_user))
print('Coverage: %', 100 * len(unique_recommendations) / len(df_books))

Number of users in dataset: 495
Total number of Clusters: 548
Total number of unique recommednations for 495 users: 138
with 20 rec per user
Coverage: % 1.2580909836812837


In [136]:
SVD_FACTORS = 200
SVD_EPOCHS = 40
num_users = len(df_ratings.User.unique())
recs_per_user = 20

print('Number of users in dataset:', num_users)
print('Total number of Clusters:', len(df_books.Cluster.unique()))

user_vectors = np.zeros((num_users, len(df_books)))
user_ids = df_ratings.User.unique()[:num_users]
unique_recommendations, unique_clusters = [], []
for userid in range(num_users):
    recommendations = recommend_books_for_userid(user_ids[userid], recs_per_user)
    for rec in recommendations:
        if rec not in unique_recommendations:
            unique_recommendations.append(rec)
        cluster = df_books.Cluster[df_books.ISBN == rec].iloc[0]
        if cluster not in unique_clusters:
            unique_clusters.append(cluster)
print('Total number of unique recommendations for {} users:'.format(num_users), len(unique_recommendations))
print('with {} rec per user'.format(recs_per_user))
print('ISBN Coverage: %', 100 * len(unique_recommendations) / len(df_books))
print('Cluster Coverage: %', 100 * len(unique_clusters) / len(df_books.Cluster.unique()))

Number of users in dataset: 495
Total number of Clusters: 548
Total number of unique recommendations for 495 users: 443
with 20 rec per user
ISBN Coverage: % 4.038654389643541
Cluster Coverage: % 80.83941605839416


In [129]:
# Personalization
SVD_FACTORS = 100
SVD_EPOCHS = 25

num_users = len(df_ratings.User.unique())
print('Number of users in dataset:', num_users)
user_vectors = np.zeros((num_users, len(df_books)))
user_ids = df_ratings.User.unique()[:num_users]
for userid in range(num_users):
    recommendations = recommend_books_for_userid(user_ids[userid], 20)
    for isbn in range(len(df_books)):
        if df_books.ISBN[isbn] in recommendations:
            user_vectors[userid][isbn] = 1
            
mean_cosine, count = 0, 0
for i in range(num_users):
    for j in range(i + 1, num_users):
        mean_cosine += cosine(user_vectors[i], user_vectors[j])
        count += 1
mean_cosine /= count
mean_cosine = 1 - mean_cosine
print(mean_cosine)

Number of users in dataset: 495
0.22407434670590076
