In [1]:
import pandas as pd
import numpy as np
import time

from surprise.prediction_algorithms.matrix_factorization import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [2]:
fast_df = pd.read_csv('/DataScience/Final Capstone Files/fast_df.csv')
perf_df = pd.read_csv('/DataScience/Final Capstone Files/performance_df.csv')

In [3]:
# Setting up ratings 

df_ratings = pd.read_csv('/DataScience/BX-CSV-Dump/BX-Book-Ratings.csv', sep=';', error_bad_lines=False, encoding = "latin-1")
df_ratings.rename(columns={'User-ID': 'User', 'Book-Rating': 'Rating'}, inplace=True)
df_ratings = df_ratings[df_ratings['Rating'] > 0]
df_ratings.reset_index(inplace=True, drop=True)

fast_ratings = df_ratings.copy()
fast_ratings = fast_ratings[fast_ratings.ISBN.isin(fast_df.ISBN.unique())]
cluster_list = []
for isbn in fast_ratings.ISBN:
    cluster_list.append(fast_df.Cluster[fast_df.ISBN == isbn].iloc[0])
fast_ratings['Cluster'] = cluster_list

perf_ratings = df_ratings.copy()
perf_ratings = perf_ratings[perf_ratings.ISBN.isin(perf_df.ISBN.unique())]
cluster_list = []
for isbn in perf_ratings.ISBN:
    cluster_list.append(perf_df.Cluster[perf_df.ISBN == isbn].iloc[0])
perf_ratings['Cluster'] = cluster_list

In [4]:
print('Cross Validation on SVD for our fast, performance, and baseline models.')
reader = Reader(rating_scale=(1,10))

# Base
print('\nBaseline:')
start = time.time()
base_svd = SVD(n_factors=500, n_epochs=40)
base_data = Dataset.load_from_df(df_ratings, reader)
cross_validate(base_svd, base_data, ['RMSE'], cv=4, verbose=True)
print(time.time() - start)


# Fast
print('\nFast:')
start = time.time()
cluster_dict = {}
for user in fast_ratings.User.unique():
    x = fast_ratings[fast_ratings.User == user]
    cluster_dict[user] = {}
    for cluster in x.Cluster.unique():
        cluster_dict[user][cluster] = np.mean(x.Rating[x.Cluster == cluster])

new_ratings = []
for rating in range(len(fast_ratings)):
    new_ratings.append(cluster_dict[fast_ratings.iloc[rating, 0]][fast_ratings.iloc[rating, 3]])
test_eval = fast_ratings.copy()
test_eval['Rating'] = new_ratings

x = test_eval[['User', 'Cluster', 'Rating']]
fast_svd = SVD(n_factors=30, n_epochs=10)
fast_data = Dataset.load_from_df(x, reader)
cross_validate(fast_svd, fast_data, ['RMSE'], cv=4, verbose=True)
print(time.time() - start)

# Performance
print('\nPerformance:')
start = time.time()
cluster_dict = {}
for user in perf_ratings.User.unique():
    x = perf_ratings[perf_ratings.User == user]
    cluster_dict[user] = {}
    for cluster in x.Cluster.unique():
        cluster_dict[user][cluster] = np.mean(x.Rating[x.Cluster == cluster])

new_ratings = []
for rating in range(len(perf_ratings)):
    new_ratings.append(cluster_dict[perf_ratings.iloc[rating, 0]][perf_ratings.iloc[rating, 3]])
test_eval = perf_ratings.copy()
test_eval['Rating'] = new_ratings

x = test_eval[['User', 'Cluster', 'Rating']]
perf_svd = SVD(n_factors=500, n_epochs=40)
perf_data  = Dataset.load_from_df(x, reader)
cross_validate(perf_svd, perf_data, ['RMSE'], cv=4, verbose=True)
print(time.time() - start)

Cross Validation on SVD for our fast, performance, and baseline models.

Baseline:
Evaluating RMSE of algorithm SVD on 4 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Mean    Std     
RMSE (testset)    1.6599  1.6609  1.6618  1.6592  1.6605  0.0010  
Fit time          261.47  268.43  276.38  272.78  269.76  5.55    
Test time         2.45    2.16    2.20    1.78    2.15    0.24    
1099.3573048114777

Fast:
Evaluating RMSE of algorithm SVD on 4 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Mean    Std     
RMSE (testset)    1.3729  1.3724  1.3925  1.3795  1.3793  0.0081  
Fit time          4.43    4.32    4.60    4.12    4.37    0.17    
Test time         1.09    1.17    0.95    0.95    1.04    0.09    
262.05074524879456

Performance:
Evaluating RMSE of algorithm SVD on 4 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Mean    Std     
RMSE (testset)    1.4909  1.4833  1.4880  1.4866  1.4872  0.0027  
Fit time          210.44  181.92  174.80 

In [40]:
df_ratings.head()

Unnamed: 0,User,ISBN,Rating
0,276726,0155061224,5
1,276729,052165615X,3
2,276729,0521795028,6
3,276736,3257224281,8
4,276737,0600570967,6


In [63]:
def get_top_isbn(cluster, df, n):
    sorted_scores = sorted(df.PopularityScore[df.Cluster == cluster])
    return df.ISBN[df.PopularityScore == sorted_scores[-n]].iloc[0]

def get_titles(isbn_list, df):
    title = []
    for isbn in isbn_list:
        title.append(df.Title[df.ISBN == isbn].iloc[0])
    return title

def recommend_books_for_userid(userid, df, top_n):
    # predict all clusters
    cluster_ratings = []
    if len(df) == len(fast_df):
        svd = SVD(n_factors = 30, n_epochs=10)      
    else:
        svd = SVD(n_factors = 500, n_epochs=40)
        
    reader = Reader(rating_scale=(1,10))
    data = Dataset.load_from_df(df[['User', 'Cluster', 'Rating']], reader)
    svd.fit(data.build_full_trainset())
        
    for cluster in range(len(df.Cluster.unique())):
        cluster_ratings.append(svd.predict(userid, cluster)[3])
    tops = []
    while len(tops) < top_n:
        for i in range(len(cluster_ratings)):
            if cluster_ratings[i] == max(cluster_ratings):
                tops.append(i)
                cluster_ratings[i] = 0
                break
                
    recommendations = []
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, perf_df, n)
            if rec not in df_ratings.ISBN[df_ratings.User == userid]:
                recommendations.append(rec)
                go = False
            n += 1
    return recommendations

def recommend_books_for_new_data(isbn_list, rating_list, df, top_n):
    
    start = time.time()
    
    cluster_list = []
    for isbn in isbn_list:
        cluster_list.append(df.Cluster[df.ISBN == isbn].iloc[0])
    new_user = pd.DataFrame()
    new_user['User'] = [-1] * len(cluster_list)
    new_user['Cluster'] = cluster_list
    new_user['Rating'] = rating_list
    
    print('Phase 1:', time.time() - start)
    start = time.time()
    
    reader = Reader(rating_scale=(1,10))
    svd = SVD(n_factors=30, n_epochs=10)
    train_set = pd.concat([fast_ratings, new_user])
    train_set.reset_index(drop=True, inplace=True)
    data = Dataset.load_from_df(train_set, reader)
    
    print('Phase 2:', time.time() - start)
    start = time.time()
    
    svd.fit(data.build_full_trainset())
    cluster_preds = []
    for cluster in range(len(train_set.Cluster.unique())):
        cluster_preds.append(svd.predict(-1, cluster, verbose=False)[3])
        
    print('Phase 3:', time.time() - start)
    start = time.time()
    
    tops = []
    while len(tops) < top_n:
        for pred in range(len(cluster_preds)):
            if cluster_preds[pred] == max(cluster_preds):
                tops.append(pred)
                cluster_preds[pred] = 0
                break
    print('Phase 4:', time.time() - start)
    start = time.time()
    
    recommendations = []
    for cluster in tops:
        go, n = True, 1
        while go == True:
            rec = get_top_isbn(cluster, perf_df, n)
            if rec not in fast_ratings.ISBN[fast_ratings.User == userid]:
                recommendations.append(rec)
                go = False
            n += 1

    print('Phase 5:', time.time() - start)
                    
    return recommendations

def input_ratings(max_ratings):
    print("Please rate books 1 - 10, press x to begin recommending early, any other input means not read")
    isbns = []
    ratings = []
    asked_books = []
    go = True
    while (len(isbns) < max_ratings) & (go == True):
        book = -1
        while(len(df_ratings2[df_ratings2.ISBN == book]) < 30) or (book in asked_books):
            book = random.choice(popularity_df.ISBN)
        asked_books.append(book)
        user_input = input('{} '.format(get_titles([book])[0]))
        if user_input in ['1','2','3','4','5','6','7','8','9','10']:
            isbns.append(book)
            ratings.append(user_input)
        elif user_input == 'x':
            go = False
    print(get_titles(recommend_books_for_new_data(isbns, ratings, 3)))

In [116]:
start = time.time()
books = recommend_books_for_userid(fast_ratings.User.unique()[1237], fast_ratings, 3)
print(get_titles(books, fast_df))
print(time.time() - start)

["The Hitchhiker's Guide to the Galaxy", 'Chobits (Chobits)', 'A Little Princess']
326.04071497917175


In [114]:
get_titles(fast_ratings.ISBN[fast_ratings.User == fast_ratings.User.unique()[1237]], fast_df)

['The Poisonwood Bible',
 'The Night Listener : A Novel',
 'Of Mice and Men (Penguin Great Books of the 20th Century)',
 'The Lost World',
 'Shopaholic Ties the Knot',
 'Carriers',
 'Earthshine: A Novel',
 '20,000 Leagues Under the Sea (Bantam Classics)',
 'Sense and Sensibility (Bantam Classics)',
 'Soulmates',
 "Don't Know Much About Geography: Everything You Need to Know About the World but Never Learned",
 'The Devil Wears Prada : A Novel',
 'A Density of Souls',
 'Naked Lunch',
 'The Front Runner',
 'Blue Lawn']

In [115]:
fast_ratings[fast_ratings.User == fast_ratings.User.unique()[1237]]

Unnamed: 0,User,ISBN,Rating,Cluster
8707,4622,0060175400,10,54
8708,4622,006093090X,4,57
8709,4622,0140177396,8,91
8710,4622,034540288X,4,85
8711,4622,0385336179,10,94
8712,4622,0425154882,7,95
8713,4622,0440219892,6,49
8715,4622,0553212524,8,85
8716,4622,0553213342,5,62
8717,4622,0553251503,5,73
