# Dataset
https://www.kaggle.com/zygmunt/goodbooks-10k

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [14]:
# Book metadata and the (book_id, user_id, rating) table, both read from ./data
books_metadata = pd.read_csv('./data/books.csv')
ratings_data = pd.read_csv('./data/ratings.csv')

# Preview the first ten ratings
ratings_data.head(n=10)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
5,1,2077,4
6,1,2487,4
7,1,2900,5
8,1,3662,4
9,1,3922,5


# Create the surprise dataset
The framework expects a Dataset object with three fields: userIDs, itemIDs, and ratings\
The Dataset object can be loaded directly from a pandas dataframe, or from a csv file\
A Reader class is used to parse the Dataset (each line is a (userID, itemID, rating) triple)



In [15]:
from surprise import Dataset
from surprise import Reader

# rating_scale is the (lowest, highest) possible rating; adjust it for other data
reader = Reader(rating_scale=(1, 5))

# Column order matters: user, item, then that user's rating for the item
data = Dataset.load_from_df(
    df=ratings_data.loc[:, ['user_id', 'book_id', 'rating']],
    reader=reader,
)

# Train the model
Surprise supports several recommender system algorithms\
See https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
The details have been abstracted away, so running a different algorithm is as simple as instantiating a different class\
<code> for algorithm in [SVD(),KNNBasic(),KNNWithMeans(),KNNWithZScore(), CoClustering(), SlopeOne(), SVDpp(), BaselineOnly()]:\
    #do stuff here using the Surprise algorithm API
</code>

In [16]:
from surprise import SVD
from surprise.model_selection import cross_validate

# SVD model trained for 10 epochs, printing one progress line per epoch
svd = SVD(n_epochs=10, verbose=True)

# 3-fold cross-validation reporting RMSE and MAE for every fold
cross_validate(algo=svd, data=data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8558  0.8571  0.8555  0.8561  0.0007  
MAE (testset)     0.6753  0.6760  0.6745  0.6753  0.0006  
Fit time          28.67   30.48   29.42   29.52   0.74    
Test time         5.71    4.89    5.30    5.30    0.33    


{'test_rmse': array([0.85578815, 0.85705833, 0.85547575]),
 'test_mae': array([0.67531517, 0.67601235, 0.6744978 ]),
 'fit_time': (28.66756844520569, 30.475956439971924, 29.416333436965942),
 'test_time': (5.706516981124878, 4.888370752334595, 5.297398567199707)}

# Ready to predict
Now that we are satisfied with the prediction accuracy we can use the full dataset to make predictions

In [17]:
# Build a trainset from all of the ratings held in `data`
trainset = data.build_full_trainset()

# Refit the SVD model on that trainset; this replaces the cross-validation fits
svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1cc34eb8a60>

# Make some predictions

Call the <code>.predict</code> method\
The <code>est</code> value is the ratings prediction for the query

In [18]:
# Estimated rating of item 100 by user 10; the number of interest is the `est` field.
# NOTE(review): predict() presumably expects raw (dataset) IDs, not Surprise's
# inner IDs - confirm against the Surprise documentation.
svd.predict(uid=10, iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=4.028136979372539, details={'was_impossible': False})

## User and item mappings
Surprise refers to the user and item IDs provided in the data set as <code>raw user</code> and <code>raw item</code> IDs\
There are some useful functions for converting between <code>internal</code> and <code>raw</code> user IDs\
<code>.trainset.to_raw_uid\
.trainset.to_raw_iid\
.trainset.to_inner_uid\
.trainset.to_inner_iid
</code>

In [19]:
# Raw (dataset) user ID that corresponds to inner user ID 10
trainset.to_raw_uid(10)

5379

In [20]:
# Raw (dataset) item ID that corresponds to inner item ID 100
trainset.to_raw_iid(100)

101

## Custom predictions
We can go under the hood of the SVD model and use the user and item embeddings and biases to make custom predictions\
Surprise has some useful functions for mapping raw user and item IDs to the inner uid and iid respectively\

To demonstrate making a custom prediction, we can define a dot product prediction\

In [21]:
# Raw IDs exactly as they appear in the ratings data (the raw values
# returned by to_raw_uid(10) and to_raw_iid(100) above)
user = 5379
item = 101

uid = trainset.to_inner_uid(user) #map the raw user ID to the inner uid
iid = trainset.to_inner_iid(item)  #map the raw item ID to the inner iid
print(uid, iid)


10 100


In [22]:
uvector = svd.pu[uid] #user embedding: the row of svd.pu for inner user id `uid`
print(uvector)

[-5.81082863e-02 -2.32527091e-02 -1.24756261e-01  1.35591632e-01
  1.18585389e-01 -1.94933588e-02 -6.54001093e-02 -2.88187968e-02
 -9.37582622e-02  8.01520218e-02 -1.17874494e-01  1.75216017e-02
 -8.64168510e-02  1.86225672e-02 -2.97290703e-02  1.51635941e-01
  3.68247614e-02  2.80391034e-01  1.35803257e-01  5.17600251e-02
  2.99344118e-02 -7.88225850e-02 -4.45167837e-02  4.75651527e-02
 -8.71605462e-02  4.30358086e-02  1.01875490e-02 -1.32967589e-01
 -8.19796466e-02 -9.40823156e-02 -1.28297262e-03 -1.27267324e-01
 -6.79648256e-02  9.30337102e-02  1.70656026e-01  2.31275396e-03
 -1.88771915e-01  8.09973087e-02 -9.66459912e-02 -7.25819536e-03
 -7.94156446e-02  7.67816262e-02  1.06881049e-01 -3.41988189e-02
  4.19124582e-02  9.72208258e-02 -2.08582696e-02 -1.07459366e-01
 -9.72261978e-02  4.42551029e-02  6.17436554e-02 -6.42290897e-02
 -1.30673359e-01 -8.55306319e-02 -5.78578299e-02  4.61501003e-02
 -5.52830179e-02 -4.25145778e-02 -2.95152029e-02 -1.91261241e-01
  1.63502367e-01 -6.75528

In [23]:
ivector = svd.qi[iid] #item embedding: the row of svd.qi for inner item id `iid`
print(ivector)

[-0.02378776 -0.15380413  0.17565923 -0.03419546 -0.04165622 -0.04454289
 -0.01546681  0.23271852 -0.00386554  0.09491584  0.02855786  0.00261349
  0.09381375 -0.21346082 -0.19223475  0.12747813  0.0745868  -0.10038401
 -0.07131176  0.01091166 -0.04587154  0.02146601 -0.04437396  0.02788106
  0.02576024 -0.06907332  0.05070058 -0.03646311 -0.02376708 -0.07432859
  0.17896058 -0.10859144  0.08490717 -0.1442716   0.02666435  0.17632242
  0.15858055 -0.07253314 -0.0680954   0.05931636  0.01002315  0.11373707
  0.06897613 -0.10931415 -0.10880217  0.15888985  0.0647619   0.07298371
 -0.02878435 -0.21046617  0.06447144 -0.02207366  0.05506398 -0.1095088
  0.02132166 -0.24151452  0.02355563 -0.25422441 -0.01416074 -0.10717286
 -0.07200942 -0.15783712  0.09717583  0.11382237 -0.02599864  0.01087289
 -0.01917045 -0.0971077  -0.08247491 -0.28442451  0.11531487  0.01051354
 -0.19213853  0.10221332 -0.15383829  0.02082981 -0.14130888 -0.09470028
  0.06724253  0.05602134  0.1858196  -0.22831001 -0.

In [24]:
ubias = svd.bu[uid] #user bias term for inner user id `uid`
print(ubias)

0.03411892056129956


In [25]:
ibias = svd.bi[iid] #item bias term for inner item id `iid`
print(ibias)

0.049357048056888696


In [26]:
# Baseline of the manual estimate below: the biases and the dot product are added to it
global_mean = svd.trainset.global_mean
print(global_mean)

3.8565335989797873


In [27]:
# Reassemble the estimate by hand:
# global mean + user bias + item bias + (user embedding . item embedding)
dp = np.dot(uvector,ivector)
est = global_mean + ubias + ibias + dp
print(est)

3.9295855379797024


# Custom recommendations

### Define some helper methods

In [28]:
import difflib
import random

def get_book_id(book_title, metadata):

    """
    Gets the book ID for a book title based on the closest match in the metadata dataframe.

    Parameters
    ----------
    book_title : str
        Title to look up; it does not have to match a stored title exactly.
    metadata : pandas.DataFrame
        Book metadata with at least the columns 'title' and 'id'.

    Returns
    -------
    The 'id' value of the first row whose title is the closest match.

    Raises
    ------
    IndexError
        If no title in `metadata` is similar enough to `book_title`.
    """

    # Fast path: an exact title is always its own closest match, so the
    # fuzzy scan over every title can be skipped.
    exact_ids = metadata.loc[metadata['title'] == book_title, 'id']
    if len(exact_ids) > 0:
        return exact_ids.values[0]

    existing_titles = list(metadata['title'].values)
    # Only the single best candidate is used, so ask for just one
    closest_titles = difflib.get_close_matches(book_title, existing_titles, n=1)

    if not closest_titles:
        # IndexError is what the unchecked closest_titles[0] used to raise;
        # the type is kept and the message made descriptive.
        raise IndexError(
            'No title in the metadata closely matches {!r}'.format(book_title)
        )

    return metadata.loc[metadata['title'] == closest_titles[0], 'id'].values[0]

def get_book_info(book_id, metadata):

    """
    Looks up the rows of the metadata dataframe whose 'id' equals book_id and
    returns their id, isbn, authors, title and original_title as a list of
    dicts (one dict per matching row; an empty list if nothing matches).
    """

    wanted_columns = ['id', 'isbn', 'authors', 'title', 'original_title']
    matching_rows = metadata.loc[metadata['id'] == book_id, wanted_columns]
    return matching_rows.to_dict(orient='records')

def predict_review(user_id, book_title, model, metadata):

    """
    Estimates the rating (on a scale of 1-5) that a user would give to the
    book whose title is the closest match for book_title.
    """

    matched_id = get_book_id(book_title, metadata)
    return model.predict(uid=user_id, iid=matched_id).est

Custom recommendation: Return the first book that the user would rate >= 4

In [29]:
def generate_recommendation(user_id, model, metadata, thresh=4):

    """
    Generates a book recommendation for a user based on a rating threshold.

    Books are visited in random order and the first one whose predicted
    rating is at or above the threshold is returned.

    Parameters
    ----------
    user_id : raw user ID, as it appears in the ratings data
    model : fitted model exposing .predict(uid=..., iid=...) with an .est result
    metadata : pandas.DataFrame with the columns read by get_book_info
    thresh : minimum predicted rating for a book to be recommended

    Returns
    -------
    The get_book_info() result for the chosen book, or None when no book
    reaches the threshold.
    """

    # Work on IDs directly. Going through titles meant two fuzzy title
    # searches per candidate, and books sharing a title always resolved to
    # the first ID carrying that title.
    book_ids = metadata['id'].tolist()
    random.shuffle(book_ids)

    for book_id in book_ids:
        rating = model.predict(uid=user_id, iid=book_id).est
        if rating >= thresh:
            return get_book_info(book_id, metadata)

    return None


In [30]:
generate_recommendation(1000, svd, books_metadata)

[{'id': 2334,
  'isbn': '1408816032',
  'authors': 'Madeline Miller',
  'title': 'The Song of Achilles',
  'original_title': 'The Song of Achilles'}]

Find the top k recommendations\
The following runs a bit slow since it is an exhaustive search\
It can be optimized for each user based on pre-computed user and item embeddings\
As long as the rating information is not updated, the user-item embeddings will be the same

In [31]:
def generate_recommendation_top_k(user_id, model, metadata, thresh=4, k=5):

    """
    Generates up to k book recommendations for a user based on a rating threshold.

    Books are visited in random order and collected as soon as their
    predicted rating is at or above the threshold; the search stops once k
    books have been collected. The result is therefore the first k
    qualifying books found, not necessarily the k highest-rated ones.

    Parameters
    ----------
    user_id : raw user ID, as it appears in the ratings data
    model : fitted model exposing .predict(uid=..., iid=...) with an .est result
    metadata : pandas.DataFrame with the columns read by get_book_info
    thresh : minimum predicted rating for a book to be recommended
    k : number of recommendations at which the search stops

    Returns
    -------
    List of book-info dicts; shorter than k when fewer books qualify.
    """

    # Work on IDs directly. Going through titles meant two fuzzy title
    # searches per candidate, and books sharing a title resolved to the same
    # ID, so one book could be listed twice.
    book_ids = metadata['id'].tolist()
    random.shuffle(book_ids)

    recommendations = []

    for book_id in book_ids:
        rating = model.predict(uid=user_id, iid=book_id).est
        if rating >= thresh:
            recommendations.append(get_book_info(book_id, metadata)[0])
            if len(recommendations) == k:
                break

    return recommendations

In [32]:
generate_recommendation_top_k(1000, svd, books_metadata)

[{'id': 4749,
  'isbn': '743449746',
  'authors': 'Stephen E. Ambrose',
  'title': 'D-Day, June 6, 1944: The Battle for the Normandy Beaches',
  'original_title': 'D-Day June 6, 1944: The Climactic Battle of WWII'},
 {'id': 9725,
  'isbn': nan,
  'authors': 'Neal Shusterman',
  'title': 'UnDivided (Unwind, #4)',
  'original_title': 'UnDivided'},
 {'id': 7630,
  'isbn': '1592289444',
  'authors': 'Slavomir Rawicz',
  'title': 'The Long Walk: The True Story of a Trek to Freedom',
  'original_title': 'The Long Walk: The True Story of a Trek to Freedom'},
 {'id': 1426,
  'isbn': '60878061',
  'authors': 'Jodi Picoult',
  'title': 'Keeping Faith',
  'original_title': 'Keeping Faith'},
 {'id': 81,
  'isbn': '074324754X',
  'authors': 'Jeannette Walls',
  'title': 'The Glass Castle',
  'original_title': 'The Glass Castle'}]

# Custom algorithm
https://surprise.readthedocs.io/en/stable/building_custom_algo.html

Extend the AlgoBase class\
Provide implementations for the <code>fit</code> and <code>estimate</code> methods\
As a general rule, where applicable, call the parent method before providing custom implementations

In [33]:
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate

class MyOwnAlgorithm(AlgoBase):
    """
    Baseline algorithm that predicts the same value for every (user, item)
    pair: the mean of all ratings in the trainset it was fitted on.
    """

    def __init__(self):
        # The base initialiser has to run before any custom set-up.
        super().__init__()

    def fit(self, trainset):
        """Compute and store the mean training rating; returns self."""

        # The base fit goes first: the code below reads self.trainset,
        # which this class never assigns itself.
        super().fit(trainset)

        # Averaged by hand for illustration; trainset.global_mean could be
        # used instead.
        ratings = [rating for (_, _, rating) in self.trainset.all_ratings()]
        self.the_mean = np.mean(ratings)

        return self

    def estimate(self, u, i):
        """Return the stored mean rating, whatever u and i are."""
        return self.the_mean

In [34]:
# Cross-validate the custom algorithm on the same data; no `cv` or `measures`
# are passed, so cross_validate's defaults apply
algo = MyOwnAlgorithm()
cross_validate(algo, data, verbose=True)

Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9824  0.9864  0.9824  0.9833  0.9852  0.9839  0.0016  
MAE (testset)     0.7853  0.7889  0.7858  0.7862  0.7881  0.7869  0.0014  
Fit time          0.57    0.83    0.82    0.91    0.87    0.80    0.12    
Test time         1.69    1.66    1.63    2.11    1.77    1.77    0.17    


{'test_rmse': array([0.98244045, 0.9864007 , 0.98237982, 0.98327633, 0.98520317]),
 'test_mae': array([0.78534157, 0.78888395, 0.78584321, 0.78620255, 0.78812397]),
 'fit_time': (0.5670411586761475,
  0.8260617256164551,
  0.817063570022583,
  0.9080674648284912,
  0.8700647354125977),
 'test_time': (1.6881277561187744,
  1.655123233795166,
  1.6321206092834473,
  2.1071600914001465,
  1.7721333503723145)}

In [76]:
#from surprise import *

# One summary record per algorithm: mean of every cross-validation metric
# across the folds, plus the algorithm's class name.
benchmark = []

for algorithm in [SVD(), MyOwnAlgorithm()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=2, verbose=False)

    # Average each metric over the folds. Plain dict records are used because
    # Series.append no longer exists in pandas 2.x, and they keep the metric
    # columns numeric instead of object-typed.
    record = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()

    # The class name, read directly rather than parsed out of the repr
    record['Algorithm'] = type(algorithm).__name__
    benchmark.append(record)

pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.857134,44.899949,8.799812
MyOwnAlgorithm,0.983941,0.543883,6.178328


dict_keys([314, 439, 588, 1169, 1185, 2077, 2487, 2900, 3662, 3922, 5379, 5461, 5885, 6630, 7563, 9246, 10140, 10146, 10246, 10335, 10610, 10944, 11854, 11927, 12471, 13282, 13544, 15494, 16377, 16913, 17434, 17663, 17984, 18031, 18313, 18361, 20076, 20467, 20848, 21228, 21487, 21713, 22602, 23576, 23612, 24326, 24389, 24499, 24834, 24845, 25164, 25182, 25214, 26145, 26629, 26661, 28158, 28767, 29123, 29703, 30681, 31001, 32055, 32305, 32592, 32635, 32748, 32923, 33065, 33697, 33716, 33872, 33890, 37284, 37834, 38080, 38082, 38475, 39423, 41074, 42404, 43985, 44243, 44397, 45269, 45493, 46977, 47476, 47746, 47800, 48482, 49298, 50104, 50342, 51166, 51460, 51480, 51838, 52036, 53245, 3022, 5115, 5436, 6063, 6342, 8167, 9731, 10111, 10288, 10509, 10751, 11285, 11408, 11691, 11692, 11868, 11945, 12874, 12946, 13794, 14372, 14546, 14603, 15604, 17566, 17643, 19526, 19724, 19729, 19942, 21217, 21676, 21733, 27499, 30313, 30944, 32918, 36099, 42508, 42810, 46421, 47478, 48559, 48687, 50096, 

# Link to a dashboard

In [None]:
import dash
import dash_html_components as html
import dash_core_components as dcc

import pandas as pd
import dash_table

def _uid_to_username(uid):
    """Map a Surprise inner user id to the raw user id used in the ratings data."""
    return trainset.to_raw_uid(uid)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

# The dropdown values are inner ids, so the default selection is an inner id too
_default_userID = 1000
_default_un = _uid_to_username(_default_userID)
# Recommend for the raw id: that is the id the caption shows and the kind of
# id the model is queried with elsewhere in this notebook
df  = pd.DataFrame(generate_recommendation_top_k(_default_un, svd, books_metadata))

#Make data for drop down: the raw id is shown to the user, the inner id is the value
options_data = [
    {'label': trainset.to_raw_uid(inner), 'value': inner}
    for inner in trainset.all_users()
]


app = dash.Dash(__name__, external_stylesheets=external_stylesheets)

dropdown = html.Div([
    dcc.Dropdown(
        id='demo-dropdown',
        options=options_data,
        value=_default_userID
    ),
    html.Div(id='dd-output-container', children='Book recommendations for user {}'.format(_default_un))
])


table = dash_table.DataTable(
    id='table',
    columns=[{"name": i, "id": i} for i in df.columns],
    data=df.to_dict('records'),
)

app.layout = html.Div([dropdown, table])


@app.callback(
    [dash.dependencies.Output('dd-output-container', 'children'),
     dash.dependencies.Output('table', 'data')],
    [dash.dependencies.Input('demo-dropdown', 'value')])
def update_output(value):
    """
    Refresh the caption and the recommendation table when the dropdown changes.

    `value` is the inner user id stored as the dropdown option value. It is
    converted to the raw user id, which is used both in the caption and to
    generate the recommendations.

    Returns the caption string and the table rows as a list of dicts.
    """
    # Clearing the dropdown sends None; keep what is currently displayed
    if value is None:
        raise dash.exceptions.PreventUpdate

    _un = _uid_to_username(int(value))
    df = pd.DataFrame(generate_recommendation_top_k(_un, svd, books_metadata))
    return 'Book recommendations for user {}'.format(_un), df.to_dict('records')


# Start the Dash server only when this code is executed as the main module
if __name__ == '__main__':
    app.run_server()

# References
https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802 \
https://www.kaggle.com/zygmunt/goodbooks-10k \
https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
https://surprise.readthedocs.io/en/stable/building_custom_algo.html