# Dataset
https://www.kaggle.com/zygmunt/goodbooks-10k

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [2]:
# Read the book metadata and the user ratings from the local data folder.
books_metadata = pd.read_csv('./data/books.csv')
ratings_data = pd.read_csv('./data/ratings.csv')

# Preview the first ten ratings.
ratings_data.head(n=10)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
5,1,2077,4
6,1,2487,4
7,1,2900,5
8,1,3662,4
9,1,3922,5


# Create the surprise dataset
The framework expects a Dataset object with three fields: userIDs, itemIDs, and rating\
The Dataset object can be loaded directly from the pandas dataframe, or from a csv\
A Reader class is used to parse the Dataset (each line is a (userID, itemID, rating) tuple)



In [7]:
try:
    from surprise import Dataset
except ImportError:
    !conda install -c conda-forge scikit-surprise -y
    from surprise import Dataset

from surprise import Reader

# Ratings run from 1 to 5 here; pass a different tuple for another scale.
reader = Reader(rating_scale=(1, 5))

# Column order matters to Surprise: user id, item id, then the user's rating.
data = Dataset.load_from_df(
    ratings_data.loc[:, ['user_id', 'book_id', 'rating']],
    reader,
)

# Train the model
Surprise supports several recommender system algorithms\
See https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
The details have been abstracted away, so executing different algorithms is as simple as creating new classes\
<code> for algorithm in [SVD(),KNNBasic(),KNNWithMeans(),KNNWithZScore(), CoClustering(), SlopeOne(), SVDpp(), BaselineOnly()]:\
    #do stuff here using the Surprise algorithm API
</code>

In [9]:
from surprise import SVD
from surprise.model_selection import cross_validate

# Train for three epochs and log each one.
svd = SVD(n_epochs=3, verbose=True)

# 3-fold cross validation reporting RMSE and MAE for every fold.
cross_validate(svd, data, cv=3, measures=['RMSE', 'MAE'], verbose=True)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 0
Processing epoch 1
Processing epoch 2
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8875  0.8885  0.8902  0.8887  0.0011  
MAE (testset)     0.7077  0.7095  0.7102  0.7091  0.0011  
Fit time          7.16    7.81    7.81    7.59    0.31    
Test time         5.15    4.63    4.54    4.77    0.27    


{'test_rmse': array([0.88746273, 0.88851717, 0.89015986]),
 'test_mae': array([0.70768728, 0.70948701, 0.7101959 ]),
 'fit_time': (7.157114028930664, 7.806339740753174, 7.814021825790405),
 'test_time': (5.152777194976807, 4.625096559524536, 4.544297933578491)}

# Ready to predict
Now that we are satisfied with the prediction accuracy we can use the full dataset to make predictions

In [10]:
# Turn the whole of `data` into a single trainset.
trainset = data.build_full_trainset() #Prepare all the data we have for training

# Fit the SVD model created earlier on that trainset; the fitted model is the cell output.
svd.fit(trainset) #Train the model

Processing epoch 0
Processing epoch 1
Processing epoch 2


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f9b6ad632e0>

# Make some predictions

Call the <code>.predict</code> method\
The <code>est</code> value is the ratings prediction for the query

In [12]:
# Estimated rating of item 100 by user 10.
# NOTE(review): Surprise's predict() is documented to take *raw* ids (the values
# from the ratings file), not inner ids — confirm against the Surprise docs.
svd.predict(uid=10, iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=4.054441370797001, details={'was_impossible': False})

## User and item mappings
Surprise refers to the user and item IDs provided in the data set as <code>raw user</code> and <code>raw item</code> IDs\
There are some useful functions for converting between <code>internal</code> and <code>raw</code> user IDs\
<code>.trainset.to_raw_uid\
.trainset.to_raw_iid\
.trainset.to_inner_uid\
.trainset.to_inner_iid
</code>

In [13]:
# Raw user id that the trainset maps to inner user id 10.
trainset.to_raw_uid(10)

5379

In [14]:
# Raw item (book) id that the trainset maps to inner item id 100.
trainset.to_raw_iid(100)

101

## Custom predictions
Can go under the hood of the SVD and use the user and item embeddings, and biases to make custom predictions\
Surprise has some useful functions for mapping raw user and item IDs to the internal uid and iid respectively\

To demonstrate making a custom prediction, we can define a dot product prediction\

In [15]:
# Raw ids; Surprise preserves the type of the raw user and item ids.
user, item = 5379, 101

# Translate the raw ids into Surprise's inner ids.
iid = trainset.to_inner_iid(item)
uid = trainset.to_inner_uid(user)
print(uid, iid)


10 100


In [16]:
# svd.pu is indexed with the inner user id computed from the raw id above.
uvector = svd.pu[uid] #extract the user embedding
print(uvector)

[-8.46516253e-02 -1.80309517e-02 -3.86988943e-02  3.28080519e-02
  2.16188698e-02 -3.45825544e-02 -1.94436508e-01  1.03234706e-01
 -1.02937169e-01  3.96642074e-02  1.21778241e-03 -1.40578330e-01
  3.53408770e-03 -5.13682382e-03 -3.85221543e-02 -9.79488566e-02
  1.78449769e-02  1.39383840e-01  9.52608565e-02 -9.96451116e-02
 -5.66504679e-02 -6.47967722e-02  8.35129584e-02 -6.39213449e-02
  1.21616903e-01  1.33954250e-01 -5.02637368e-03  8.62671531e-02
  3.86005371e-02 -1.43919190e-01  3.61908289e-02  6.62229710e-03
 -1.13684351e-01  1.08505001e-02 -1.17555302e-01 -1.56395946e-01
  5.07502638e-02 -7.38891108e-02  1.09686369e-02 -1.88371140e-02
 -1.02219768e-02  2.34453578e-02 -4.23290111e-02 -1.41049948e-02
  1.04194775e-01 -1.58551764e-01 -2.09813765e-02 -1.22386611e-01
  1.72667695e-01 -4.73269647e-02 -1.46604671e-01  1.36401160e-01
 -2.52086121e-02 -9.06468823e-02  9.95892558e-02 -3.70157712e-02
  7.02213429e-02 -3.09394791e-02  6.58677420e-02  4.44365084e-02
 -4.70962136e-02  1.27114

In [17]:
# svd.qi is indexed with the inner item id computed from the raw id above.
ivector = svd.qi[iid] #extract the item embedding
print(ivector)

[ 0.00820788  0.01007331 -0.11177818  0.06305484  0.07670616  0.0611004
  0.12343619 -0.06171201 -0.00659903 -0.01782585  0.0612415  -0.06326322
  0.10982753 -0.01090559 -0.05679156 -0.0786478   0.07102125 -0.00165396
  0.04730088  0.0639501  -0.02847888  0.08069979  0.07783425 -0.01639082
 -0.04881306  0.04410616  0.15406206 -0.06788968  0.01585481 -0.07401798
  0.07959198 -0.10021469 -0.04011538  0.01401352  0.00303613 -0.1114238
 -0.20390406  0.08165459 -0.01418364 -0.09003103 -0.11693414 -0.13593316
 -0.02444804  0.16151701  0.03237139  0.02225994 -0.03145099 -0.09472469
 -0.00586675  0.05263351 -0.03287573 -0.00400836 -0.10360958 -0.06031982
  0.16979077 -0.05874401 -0.05477965 -0.01749883 -0.0259682   0.08039657
  0.03382609  0.04213696 -0.02704677 -0.18044848 -0.19700545  0.11909409
  0.14304131  0.05689868  0.0591437   0.28775542 -0.12050541 -0.1060383
  0.00425844  0.03817375  0.09803934 -0.12300912 -0.24121863 -0.10585198
  0.20442726 -0.28935598 -0.02507758  0.25149491 -0.06

In [18]:
# Bias term of the selected user, looked up by inner user id.
ubias = svd.bu[uid] #user bias
print(ubias)

0.06412638515474735


In [19]:
# Bias term of the selected item, looked up by inner item id.
ibias = svd.bi[iid] #item bias
print(ibias)

0.02485504402293399


In [20]:
# Global mean stored on the trainset the model was fitted on; it is the
# baseline that the biases and the dot product are added to below.
global_mean = svd.trainset.global_mean #Think of this as 'zero' i.e., normalized data
print(global_mean)

3.8565335989797873


In [21]:
# Manual estimate: global mean + user bias + item bias + <user vector, item vector>.
dp = uvector.dot(ivector)
est = global_mean + ubias + ibias + dp
print(est)

3.94831524298186


# Custom recommendations

### Define some helper methods

In [22]:
import difflib
import random

def get_book_id(book_title, metadata):
    
    """
    Gets the book ID for a book title based on the closest match in the metadata dataframe.

    A title that is present verbatim is resolved directly. Only when there is
    no exact match is a fuzzy search (``difflib.get_close_matches``) run over
    all titles, and the best-scoring title is used.

    Parameters
    ----------
    book_title : str
        Title (or approximate title) to look up.
    metadata : pandas.DataFrame
        Book metadata containing at least the columns ``'title'`` and ``'id'``.

    Returns
    -------
    The ``'id'`` value of the first row whose title equals the matched title.

    Raises
    ------
    IndexError
        If no title in ``metadata`` is close enough to ``book_title``. The
        exception type matches what the unguarded lookup used to raise, but
        the message now names the title that could not be found.
    """
    
    titles = metadata['title']

    # Fast path: an exact title is always the best fuzzy match (similarity 1.0),
    # so the scan over every title can be skipped without changing the result.
    exact_ids = metadata.loc[titles == book_title, 'id']
    if len(exact_ids) > 0:
        return exact_ids.values[0]

    # Fall back to the titles that are similar, best match first.
    closest_titles = difflib.get_close_matches(book_title, list(titles.values))
    if not closest_titles:
        raise IndexError('No book title similar to {!r} found in metadata'.format(book_title))

    return metadata.loc[titles == closest_titles[0], 'id'].values[0]

def get_book_info(book_id, metadata):
    """
    Look up the descriptive fields of one book.

    Selects the rows of ``metadata`` whose ``'id'`` equals ``book_id`` and
    returns them as a list of dicts limited to the keys ``'id'``, ``'isbn'``,
    ``'authors'``, ``'title'`` and ``'original_title'``. The list is empty
    when no row matches.
    """
    wanted_columns = ['id', 'isbn', 'authors', 'title', 'original_title']
    matching_rows = metadata.loc[metadata['id'] == book_id, wanted_columns]
    return matching_rows.to_dict(orient='records')

def predict_review(user_id, book_title, model, metadata):
    """
    Estimate the rating (on a scale of 1-5) a user would give to a book.

    The title is resolved to a book id with ``get_book_id`` and the model's
    ``predict`` is queried for that user/book pair; the ``est`` field of the
    resulting prediction is returned.
    """
    prediction = model.predict(uid=user_id, iid=get_book_id(book_title, metadata))
    return prediction.est

Custom recommendation: Return the first book that the user would rate >= 4

In [23]:
def generate_recommendation(user_id, model, metadata, thresh=4):
    
    """
    Generates a book recommendation for a user based on a rating threshold. Only
    books with a predicted rating at or above the threshold will be recommended.

    The titles are visited in random order and the first book whose predicted
    rating reaches ``thresh`` is returned.

    Parameters
    ----------
    user_id : raw user id passed to ``model.predict`` as ``uid``.
    model : fitted model exposing ``predict(uid=..., iid=...)`` whose result has ``est``.
    metadata : pandas.DataFrame with the columns used by ``get_book_id`` / ``get_book_info``.
    thresh : minimum predicted rating for a book to be recommended.

    Returns
    -------
    list of dict or None
        The ``get_book_info`` records of the recommended book, or ``None``
        when no book reaches the threshold.
    """
    
    book_titles = list(metadata['title'].values)
    random.shuffle(book_titles)
    
    for book_title in book_titles:
        # Resolve the title once and reuse the id for both the prediction and
        # the info lookup; the title search is the expensive part of the loop.
        book_id = get_book_id(book_title, metadata)
        rating = model.predict(uid=user_id, iid=book_id).est
        if rating >= thresh:
            return get_book_info(book_id, metadata)

    # No book reached the threshold.
    return None


In [24]:
# One recommendation for user 1000 from the fitted SVD model, using the default threshold.
generate_recommendation(1000, svd, books_metadata)

[{'id': 8200,
  'isbn': '1926760689',
  'authors': 'pleasefindthis, Iain S. Thomas, Jon Ellis',
  'title': 'I Wrote This For You',
  'original_title': 'I Wrote This For You'}]

Find the top k recommendations\
The following runs a bit slow since it is an exhaustive search\
It can be optimized for each user, based on pre-computed user and item embeddings\
As long as the rating information is not updated, the user-item embeddings will be the same

In [25]:
def generate_recommendation_top_k(user_id, model, metadata, thresh=4, k=5):
    
    """
    Generates up to k book recommendations for a user based on a rating threshold.

    The titles are visited in random order and the first ``k`` distinct books
    whose predicted rating is at or above ``thresh`` are returned. The result
    is therefore not sorted by predicted rating, and it is not necessarily
    the k highest-rated books.

    Parameters
    ----------
    user_id : raw user id passed to ``model.predict`` as ``uid``.
    model : fitted model exposing ``predict(uid=..., iid=...)`` whose result has ``est``.
    metadata : pandas.DataFrame with the columns used by ``get_book_id`` / ``get_book_info``.
    thresh : minimum predicted rating for a book to be recommended.
    k : maximum number of recommendations; ``k <= 0`` yields an empty list.

    Returns
    -------
    list of dict
        One ``get_book_info`` record per recommended book (possibly fewer than
        ``k``, possibly empty).
    """
    
    # Without this guard the length check below never fires for k <= 0, and
    # every book above the threshold would be returned.
    if k <= 0:
        return []

    book_titles = list(metadata['title'].values)
    random.shuffle(book_titles)
    
    recommendations = []
    seen_book_ids = set()  # several titles can resolve to the same book id
    
    for book_title in book_titles:
        # Resolve the title once and reuse the id for prediction and lookup.
        book_id = get_book_id(book_title, metadata)
        if book_id in seen_book_ids:
            continue
        seen_book_ids.add(book_id)

        rating = model.predict(uid=user_id, iid=book_id).est
        if rating >= thresh:
            recommendations.append(get_book_info(book_id, metadata)[0])
            if len(recommendations) >= k:
                break
            
    return recommendations

In [26]:
# Up to five recommendations (default k) for user 1000 from the fitted SVD model.
generate_recommendation_top_k(1000, svd, books_metadata)

[{'id': 3963,
  'isbn': '553268929',
  'authors': 'Pat Conroy',
  'title': 'The Great Santini',
  'original_title': 'The Great Santini'},
 {'id': 389,
  'isbn': '076531178X',
  'authors': 'Brandon Sanderson',
  'title': 'The Final Empire (Mistborn, #1)',
  'original_title': 'Mistborn : The Final Empire'},
 {'id': 5768,
  'isbn': '62267175',
  'authors': 'Nicole  Williams',
  'title': 'Crush (Crash, #3)',
  'original_title': 'Crush'},
 {'id': 7799,
  'isbn': '316084255',
  'authors': 'Yana Toboso, Tomo Kimura',
  'title': 'Black Butler, Vol. 2 (Black Butler, #2)',
  'original_title': '黒執事 II [Kuroshitsuji II]'},
 {'id': 2988,
  'isbn': '1557091552',
  'authors': 'Carolyn Keene, Russell H. Tandy, Sara Paretsky',
  'title': 'The Secret of the Old Clock (Nancy Drew, #1)',
  'original_title': 'The Secret of the Old Clock (Nancy Drew Mystery Stories, #1)'}]

# Custom algorithm
https://surprise.readthedocs.io/en/stable/building_custom_algo.html

Extend the AlgoBase class\
Provide implementations for the <code>fit</code> and <code>estimate</code> methods\
General rule, where applicable, call the parent method before providing custom implementations

In [27]:
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate

class MyOwnAlgorithm(AlgoBase):
    """Baseline algorithm: every estimate is the mean of all ratings in the trainset."""

    def __init__(self):
        # The base initialiser has to run before any custom setup.
        super().__init__()

    def fit(self, trainset):
        """Store the trainset via the base class, then compute the mean rating."""
        # Base-class fit first; it sets self.trainset, which is read below.
        super().fit(trainset)

        # trainset.global_mean would give the same number; it is computed by
        # hand here to show how the ratings can be iterated.
        ratings = [rating for (_, _, rating) in self.trainset.all_ratings()]
        self.the_mean = np.mean(ratings)

        return self

    def estimate(self, u, i):
        """Return the mean rating, regardless of the user ``u`` and item ``i``."""
        return self.the_mean

In [28]:
# Cross-validate the custom algorithm on the same data as the SVD model, with
# cross_validate's default measures and number of folds.
algo = MyOwnAlgorithm()
cross_validate(algo, data, verbose=True)

Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9850  0.9815  0.9832  0.9833  0.9867  0.9839  0.0018  
MAE (testset)     0.7868  0.7848  0.7868  0.7861  0.7899  0.7869  0.0017  
Fit time          0.52    1.24    1.24    1.24    1.22    1.09    0.28    
Test time         2.21    2.25    2.19    2.16    2.22    2.20    0.03    


{'test_rmse': array([0.98499663, 0.98151183, 0.9832459 , 0.98327713, 0.98666977]),
 'test_mae': array([0.78680473, 0.78477723, 0.78680755, 0.78613047, 0.78987618]),
 'fit_time': (0.522496223449707,
  1.2396535873413086,
  1.2383008003234863,
  1.2426872253417969,
  1.2155101299285889),
 'test_time': (2.2112576961517334,
  2.247276782989502,
  2.1896824836730957,
  2.1584644317626953,
  2.215437650680542)}

In [29]:
# Compare the algorithms by their mean cross-validated RMSE (lower is better).
benchmark = []

for algorithm in [SVD(), MyOwnAlgorithm()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=2, verbose=False)
    
    # Average every metric over the folds and tag the row with the class name.
    # The row is built as a plain dict because Series.append was removed in
    # pandas 2.0; type(...).__name__ replaces parsing the object's repr.
    row = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    row['Algorithm'] = type(algorithm).__name__
    benchmark.append(row)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.856887,37.201967,7.36761
MyOwnAlgorithm,0.98394,0.534446,4.831557


# Link to a dashboard

In [34]:
!pip3 install sixer

Collecting sixer
  Downloading sixer-1.6.1-py3-none-any.whl (20 kB)
Installing collected packages: sixer
Successfully installed sixer-1.6.1


In [37]:
from jupyter_dash import JupyterDash

In [38]:
# Show the current module name; this value is passed to JupyterDash(...) in the dashboard cell.
__name__

'__main__'

In [44]:
try:
    import dash
except ImportError:
    !conda install -c conda-forge -c plotly jupyter-dash -y
    import dash
    
import dash_html_components as html
import dash_core_components as dcc

import pandas as pd
import dash_table

from six import iteritems

def _uid_to_username(uid):
    """Translate a Surprise inner user id into the raw user id from the ratings data."""
    return trainset.to_raw_uid(uid)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
proxy_port = 80

# Keys of the records produced by get_book_info(). Fixing the columns up front
# keeps the table header intact even when a user gets no recommendations.
_table_columns = ['id', 'isbn', 'authors', 'title', 'original_title']

# The dropdown carries Surprise *inner* user ids as values, while the model has
# to be queried with *raw* user ids. Every lookup therefore converts the inner
# id first; otherwise the table would show recommendations for a different
# user than the one named in the label.
_default_userID = 1000
_default_un = _uid_to_username(_default_userID)
df  = pd.DataFrame(generate_recommendation_top_k(_default_un, svd, books_metadata),
                   columns=_table_columns)

#Make data for drop down: label = raw user id, value = inner user id
options_data = [
    {'label': trainset.to_raw_uid(inner), 'value': inner}
    for inner in trainset.all_users()
]


# Outside Jupyter, dash.Dash(__name__, external_stylesheets=external_stylesheets)
# can be used instead of JupyterDash.
app = JupyterDash(__name__, 
                  external_stylesheets=external_stylesheets,
                  requests_pathname_prefix='/proxy/' + str(proxy_port) + '/')

dropdown = html.Div([
    dcc.Dropdown(
        id='demo-dropdown',
        options=options_data,
        value=_default_userID
    ),
    html.Div(id='dd-output-container', children='Book recommendations for user {}'.format(_default_un))
])


table = dash_table.DataTable(
    id='table',
    columns=[{"name": i, "id": i} for i in df.columns],
    data=df.to_dict('records'),
)

app.layout = html.Div([dropdown, table])


@app.callback(
    [dash.dependencies.Output('dd-output-container', 'children'), dash.dependencies.Output("table", "data")],
    [dash.dependencies.Input('demo-dropdown', 'value')])
def update_output(value):
    """Refresh the caption and the table for the user picked in the dropdown.

    ``value`` is the inner user id of the selected option, or ``None`` when the
    dropdown has been cleared; in that case the current output is left as is.
    """
    if value is None:
        raise dash.exceptions.PreventUpdate

    # Convert to the raw id before asking the model for predictions.
    raw_uid = _uid_to_username(int(value))
    recommendations = pd.DataFrame(
        generate_recommendation_top_k(raw_uid, svd, books_metadata),
        columns=_table_columns)
    return 'Book recommendations for user {}'.format(raw_uid), recommendations.to_dict('records')


srv = app.run_server(debug=True, use_reloader=False, port=proxy_port)
srv

Dash app running on http://127.0.0.1:80/proxy/80/


In [43]:


# NOTE: this cell was originally marked "not work".
## This is a sharable version of the link (for Paperspace)
import os

# Both variables are read with .get() so that a kernel where they are not set
# gets a clear message instead of a KeyError.
nid = os.environ.get('PAPERSPACE_NOTEBOOK_ID')
cid = os.environ.get('PAPERSPACE_CLUSTER_ID')
if nid and cid:
    print('https://'+nid+'.'+cid+'.paperspacegradient.com/proxy/80/')
else:
    print('PAPERSPACE_NOTEBOOK_ID / PAPERSPACE_CLUSTER_ID are not set; no shareable Paperspace link available.')

https://nbnt35tkym.clg07azjl.paperspacegradient.com/proxy/80/


# References
https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802 \
https://www.kaggle.com/zygmunt/goodbooks-10k \
https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
https://surprise.readthedocs.io/en/stable/building_custom_algo.html