# Dataset
https://www.kaggle.com/zygmunt/goodbooks-10k

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## Load data

In [2]:
# Ratings table: the preview below shows one (book_id, user_id, rating) triple per row.
ratings_data = pd.read_csv('./data/ratings.csv')
# Book metadata; later cells look books up through its 'title' and 'id' columns.
books_metadata = pd.read_csv('./data/books.csv')
# Preview the first ten ratings.
ratings_data.head(10)

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4
5,1,2077,4
6,1,2487,4
7,1,2900,5
8,1,3662,4
9,1,3922,5


# Create the surprise dataset
The framework expects a Dataset object with three fields: userIDs, itemIDs, and ratings\
The Dataset object can be loaded directly from a pandas dataframe, or from a csv file\
A Reader class is used to parse the Dataset (each line is a (userID, itemID, rating) triple)



In [3]:
# Import Surprise; if it is missing, install it from conda-forge and retry the import.
try:
    from surprise import Dataset
except ImportError:
    !conda install -c conda-forge scikit-surprise -y
    from surprise import Dataset

from surprise import Reader

# rating_scale gives the (lowest, highest) rating the reader should expect.
reader = Reader(rating_scale=(1, 5)) #Can change scale based on your requirements

# Column order matters: it must be user id, item id, then the user's rating for that item.
data = Dataset.load_from_df(ratings_data[['user_id', 'book_id', 'rating']], reader) 

# Train the model
Surprise supports several recommender system algorithms\
See https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
The details have been abstracted away, so executing different algorithms is as simple as creating new classes\
<code> for algorithm in [SVD(),KNNBasic(),KNNWithMeans(),KNNWithZScore(), CoClustering(), SlopeOne(), SVDpp(), BaselineOnly()]:\
    #do stuff here using the Surprise algorithm API
</code>

In [4]:
from surprise import SVD
from surprise.model_selection import cross_validate

# SVD model trained for only three epochs; verbose=True prints one line per epoch.
svd = SVD(verbose=True, n_epochs=3) 
# 3-fold cross validation reporting RMSE and MAE per fold (see the table printed below).
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=3, verbose=True) #Auto-magic cross validation....NOICE!!!

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 0
Processing epoch 1
Processing epoch 2
Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8898  0.8877  0.8887  0.8887  0.0009  
MAE (testset)     0.7098  0.7086  0.7090  0.7091  0.0005  
Fit time          7.06    7.71    7.67    7.48    0.30    
Test time         4.50    4.29    4.11    4.30    0.16    


{'test_rmse': array([0.88977336, 0.88766441, 0.88873584]),
 'test_mae': array([0.70977587, 0.7086191 , 0.70903517]),
 'fit_time': (7.056204319000244, 7.707362413406372, 7.6737446784973145),
 'test_time': (4.503728151321411, 4.287333011627197, 4.109003067016602)}

# Ready to predict
Now that we are satisfied with the prediction accuracy we can use the full dataset to make predictions

In [5]:
# Build a training set from every available rating (no held-out test split).
trainset = data.build_full_trainset() #Prepare all the data we have for training

# Refit the same SVD instance on the full training set; later cells read its learned parameters.
svd.fit(trainset) #Train the model

Processing epoch 0
Processing epoch 1
Processing epoch 2


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2148938c70>

# Make some predictions

Call the <code>.predict</code> method\
The <code>est</code> value is the ratings prediction for the query

In [7]:
# Predict the rating of user 10 for item 100.
# NOTE(review): per the Surprise docs, predict() takes *raw* ids (the ids found in ratings.csv),
# so 10 and 100 here are raw ids, not the inner ids used further down — confirm.
svd.predict(uid=10, iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=3.8870123194261397, details={'was_impossible': False})

## User and item mappings
Surprise refers to the user and item IDs provided in the data set as <code>raw user</code> and <code>raw item</code> IDs\
There are some useful functions for converting between <code>internal</code> and <code>raw</code> user IDs\
<code>.trainset.to_raw_uid\
.trainset.to_raw_iid\
.trainset.to_inner_uid\
.trainset.to_inner_iid
</code>

In [8]:
# Map inner user id 10 back to the raw user id from the ratings data.
trainset.to_raw_uid(10)

5379

In [9]:
# Map inner item id 100 back to the raw item (book) id from the ratings data.
trainset.to_raw_iid(100)

101

## Custom predictions
Can go under the hood of the SVD and use the user and item embeddings, and biases to make custom predictions\
Surprise has some useful functions for mapping raw user and item IDs to the inner uid and iid respectively

To demonstrate making a custom prediction, we can define a dot product prediction

In [10]:
# Raw ids, exactly as they appear in the ratings data (these are ids, not embeddings).
user = 5379 # raw user id; the lookup below must use the same type as the ratings data
item = 101  # raw item (book) id

uid = trainset.to_inner_uid(user) #map the raw user ID to the internal uid
iid = trainset.to_inner_iid(item)  #map the raw item ID to the internal iid 
# The inner ids are what index the model's parameter arrays in the following cells.
print(uid, iid)


10 100


In [11]:
# svd.pu is indexed by inner user id; this row is the latent-factor vector for `uid`.
uvector = svd.pu[uid] #extract the user embedding
print(uvector)

[-0.04359509 -0.09204349 -0.093612   -0.01658127  0.06962907 -0.15567284
 -0.02952315 -0.14161182 -0.09841204  0.00904953 -0.05779087 -0.02953855
  0.09129887  0.0326396  -0.06582377  0.16631687 -0.07839788  0.09137723
 -0.03481    -0.12541538  0.04574812 -0.06807655  0.06837357  0.08166956
 -0.11127101  0.01546851 -0.17154329 -0.09146493  0.02771905  0.04835724
  0.08189842 -0.05858128  0.18992326  0.10535358  0.08080988  0.08979732
 -0.07059684 -0.01789224 -0.04214804  0.06134328 -0.16475552 -0.07077482
  0.21788742 -0.083333    0.02714778 -0.02651905  0.04181547 -0.01907267
 -0.00780944 -0.04238715 -0.07333258 -0.00318903  0.00631478 -0.27643375
 -0.09444518 -0.00777795 -0.04930344 -0.03966269 -0.17194925  0.04766953
  0.14830174  0.04025439  0.06597677  0.25616563  0.0391612  -0.07472876
  0.07340879 -0.06016934 -0.08679573 -0.05311435  0.04958604 -0.0161306
  0.08497214 -0.05302005 -0.11700575 -0.07891167 -0.12127615  0.05781533
  0.08123776  0.03904038 -0.1057517  -0.02942429  0.

In [12]:
# svd.qi is indexed by inner item id; this row is the latent-factor vector for `iid`.
ivector = svd.qi[iid] #extract the item embedding
print(ivector)

[-0.0613442   0.18020941 -0.03727923 -0.04073174  0.17539209 -0.01411787
  0.04776542  0.19474819  0.32823774 -0.07860561  0.00722265 -0.18498294
  0.0288283  -0.01977797 -0.0117363   0.02034766  0.11230405 -0.06430811
  0.10172178 -0.00909409  0.07358057 -0.09509257  0.04821051  0.06487116
  0.04127878 -0.13465126 -0.1837986  -0.09306715  0.03550737 -0.11089367
  0.11246511 -0.09364572 -0.11336519 -0.08142238  0.02841446  0.09061202
  0.18937792  0.02317409  0.06869623  0.03303629 -0.07476576 -0.11921036
  0.09619974 -0.00215544  0.04830497  0.13876177 -0.16543141 -0.0732465
  0.07362238  0.23579607 -0.1536954  -0.09112581 -0.14792187 -0.16961425
  0.03676392 -0.04796331  0.09594699  0.09024747  0.08289047 -0.00153326
 -0.17349679 -0.1310743   0.00585269  0.10474739  0.04353726  0.10631792
 -0.08606463 -0.06196225 -0.04327081  0.19666438  0.0179961   0.0659792
  0.0347804  -0.13913764 -0.08914445 -0.07444772  0.01475166  0.08771635
  0.02679021 -0.09124956 -0.00434249  0.1351138   0.0

In [13]:
# Learned bias term for this user, indexed by inner user id.
ubias = svd.bu[uid] #user bias
print(ubias)

0.08420910496802402


In [14]:
# Learned bias term for this item, indexed by inner item id.
ibias = svd.bi[iid] #item bias
print(ibias)

0.04968198274942113


In [15]:
# Global mean of the training set the model was fitted on; the manual estimate
# in the next cell adds the biases and the dot product on top of this baseline.
global_mean = svd.trainset.global_mean #Think of this as 'zero' i.e., normalized data
print(global_mean)

3.8565335989797873


In [16]:
# Rebuild the estimate by hand: global mean + user bias + item bias + <user factors, item factors>.
dp = np.dot(uvector,ivector)
est = global_mean + ubias + ibias + dp
# NOTE(review): this is the estimate for raw user 5379 / raw item 101 (inner ids 10 / 100).
# It is not expected to match the earlier svd.predict(uid=10, iid=100) output, because that
# call passed 10 and 100 as raw ids (per the Surprise docs, predict() takes raw ids — confirm).
print(est)

4.041352244631293


# Custom recommendations

### Define some helper methods

In [17]:
import difflib
import random

def get_book_id(book_title, metadata):
    
    """
    Gets the book ID for a book title based on the closest match in the metadata dataframe.

    Parameters
    ----------
    book_title : str
        Title to look up; it does not have to match a catalogue title exactly.
    metadata : pandas.DataFrame
        Book metadata with at least the columns 'title' and 'id'.

    Returns
    -------
    The 'id' value of the first row whose title is the best match.

    Raises
    ------
    IndexError
        If no title in `metadata` is similar enough to `book_title`.
    """
    
    titles = metadata['title']
    
    # Fast path: an exact title is always its own closest match, so the
    # fuzzy search over the whole catalogue can be skipped.
    exact_ids = metadata.loc[titles == book_title, 'id']
    if len(exact_ids) > 0:
        return exact_ids.values[0]
    
    # Missing titles are dropped because difflib only compares strings.
    existing_titles = titles.dropna().tolist()
    closest_titles = difflib.get_close_matches(book_title, existing_titles, n=1)
    
    if not closest_titles:
        # Same exception type the bare `closest_titles[0]` lookup used to raise,
        # but with a message that names the title that could not be resolved.
        raise IndexError('No book title in the metadata closely matches {!r}'.format(book_title))
    
    return metadata.loc[titles == closest_titles[0], 'id'].values[0]

def get_book_info(book_id, metadata):
    
    """
    Looks up the rows of the metadata dataframe whose 'id' equals book_id and
    returns their id, isbn, authors, title and original_title as a list of
    dicts (one dict per matching row; an empty list when nothing matches).
    """
    
    wanted_columns = ['id', 'isbn', 'authors', 'title', 'original_title']
    is_requested_book = metadata['id'] == book_id
    matching_rows = metadata.loc[is_requested_book, wanted_columns]
    return matching_rows.to_dict(orient='records')

def predict_review(user_id, book_title, model, metadata):
    
    """
    Estimates the rating a user would give to a book identified by its title.

    The title is resolved to a book id with get_book_id, then the model's
    predict method is queried and the `est` field of its result is returned.
    """
    
    matched_book_id = get_book_id(book_title, metadata)
    return model.predict(uid=user_id, iid=matched_book_id).est

Custom recommendation: Return the first book that the user would rate >= 4

In [18]:
def generate_recommendation(user_id, model, metadata, thresh=4):
    
    """
    Generates a book recommendation for a user based on a rating threshold. Only
    books with a predicted rating at or above the threshold will be recommended

    Books are visited in random order and the first one that clears the
    threshold is returned.

    Parameters
    ----------
    user_id : id of the user, passed unchanged to model.predict as `uid`.
    model : fitted model exposing predict(uid=..., iid=...) with an `est` result.
    metadata : pandas.DataFrame with the book metadata (needs an 'id' column).
    thresh : minimum predicted rating a book needs to be recommended.

    Returns
    -------
    The get_book_info() records for the recommended book, or None when no book
    reaches the threshold.
    """
    
    # Iterate over the ids directly. Going through the titles forced a fuzzy
    # title search over the whole catalogue for every candidate (twice for the
    # winner), only to arrive back at these ids.
    candidate_ids = metadata['id'].tolist()
    random.shuffle(candidate_ids)
    
    for book_id in candidate_ids:
        rating = model.predict(uid=user_id, iid=book_id).est
        if rating >= thresh:
            return get_book_info(book_id, metadata)
    
    # Nothing cleared the threshold.
    return None


In [19]:
# Recommend one book for user 1000 using the default threshold (thresh=4).
# Candidates are shuffled, so the result can differ from run to run.
generate_recommendation(1000, svd, books_metadata)

[{'id': 1224,
  'isbn': '375708111',
  'authors': 'Brian Greene',
  'title': 'The Elegant Universe: Superstrings, Hidden Dimensions, and the Quest for the Ultimate Theory',
  'original_title': 'The Elegant Universe: Superstrings, Hidden Dimensions, and the Quest for the Ultimate Theory'}]

Find the top k recommendations\
The following runs a bit slow since it is an exhaustive search\
It can be optimized for each user based on pre-computed user and item embeddings\
As long as the rating information is not updated, the user-item embeddings will stay the same

In [20]:
def generate_recommendation_top_k(user_id, model, metadata, thresh=4, k=5):
    
    """
    Generates up to k book recommendations for a user based on a rating threshold.

    Books are visited in random order and the first k whose predicted rating is
    at or above the threshold are returned. Note that these are the first k
    qualifying books found, not the k highest-rated ones.

    Parameters
    ----------
    user_id : id of the user, passed unchanged to model.predict as `uid`.
    model : fitted model exposing predict(uid=..., iid=...) with an `est` result.
    metadata : pandas.DataFrame with the book metadata (needs an 'id' column).
    thresh : minimum predicted rating a book needs to be recommended.
    k : maximum number of recommendations to return.

    Returns
    -------
    A list with at most k book-info dicts; empty if k <= 0 or nothing qualifies.
    """
    
    recommendations = []
    
    # A non-positive k asks for nothing. Without this guard the `== k` stop
    # condition could never fire and every qualifying book would be returned.
    if k <= 0:
        return recommendations
    
    # Iterate over ids rather than titles: this avoids a fuzzy title search per
    # candidate and keeps books that share a title from being added twice.
    candidate_ids = metadata['id'].tolist()
    random.shuffle(candidate_ids)
    
    for book_id in candidate_ids:
        rating = model.predict(uid=user_id, iid=book_id).est
        if rating < thresh:
            continue
        
        book_records = get_book_info(book_id, metadata)
        if book_records:
            recommendations.append(book_records[0])
        if len(recommendations) >= k:
            break
            
    return recommendations

In [21]:
# Up to five recommendations (default k=5, thresh=4) for user 1000.
generate_recommendation_top_k(1000, svd, books_metadata)

[{'id': 9096,
  'isbn': '1595540547',
  'authors': 'Charles Martin',
  'title': 'When Crickets Cry',
  'original_title': 'When Crickets Cry'},
 {'id': 8924,
  'isbn': '067088278X',
  'authors': 'Janet Ahlberg, Allan Ahlberg',
  'title': 'Each Peach Pear Plum',
  'original_title': 'Each Peach Pear Plum'},
 {'id': 189,
  'isbn': '618640150',
  'authors': 'J.R.R. Tolkien',
  'title': 'The Lord of the Rings (The Lord of the Rings, #1-3)',
  'original_title': 'The Lord of the Rings'},
 {'id': 7283,
  'isbn': '64400964',
  'authors': 'Maud Hart Lovelace, Lois Lenski',
  'title': 'Betsy-Tacy (Betsy-Tacy, #1)',
  'original_title': 'Betsy-Tacy'},
 {'id': 4475,
  'isbn': '571203132',
  'authors': 'Nikos Kazantzakis, Νίκος Καζαντζάκης',
  'title': 'Zorba the Greek',
  'original_title': 'Βίος και πολιτεία του Αλέξη Ζορμπά'}]

# Custom algorithm
https://surprise.readthedocs.io/en/stable/building_custom_algo.html

Extend the AlgoBase class\
Provide implementations for the <code>fit</code> and <code>estimate</code> methods\
General rule, where applicable, call the parent method before providing custom implementations

In [22]:
from surprise import AlgoBase
from surprise import Dataset
from surprise.model_selection import cross_validate

class MyOwnAlgorithm(AlgoBase):
    """Baseline algorithm that predicts one constant for every (user, item) pair.

    The constant is the mean of all ratings in the training set passed to fit().
    """

    def __init__(self):
        # The base class initialiser has to run before anything else.
        super().__init__()

    def fit(self, trainset):
        """Compute the mean training rating and return the fitted instance."""

        # Run the base fit first; self.trainset is read right after it.
        super().fit(trainset)

        # Collect the rating of every (user, item, rating) triple. The same
        # number is also exposed as trainset.global_mean.
        observed_ratings = [rating for _user, _item, rating
                            in self.trainset.all_ratings()]
        self.the_mean = np.mean(observed_ratings)

        return self

    def estimate(self, u, i):
        """Return the stored mean, ignoring the inner user id u and item id i."""

        return self.the_mean

In [23]:
# Cross-validate the constant-mean baseline with cross_validate's defaults
# (the printed report below shows 5 splits with RMSE and MAE).
algo = MyOwnAlgorithm()
cross_validate(algo, data, verbose=True)

Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9838  0.9855  0.9851  0.9838  0.9816  0.9839  0.0014  
MAE (testset)     0.7863  0.7881  0.7874  0.7875  0.7851  0.7869  0.0010  
Fit time          0.52    1.22    1.27    1.26    1.24    1.10    0.29    
Test time         1.70    2.25    1.78    2.27    2.26    2.05    0.26    


{'test_rmse': array([0.98376972, 0.98547785, 0.9850672 , 0.98383845, 0.98155107]),
 'test_mae': array([0.78631431, 0.7880795 , 0.78738093, 0.78750004, 0.78511995]),
 'fit_time': (0.5243387222290039,
  1.2246849536895752,
  1.2693052291870117,
  1.2638347148895264,
  1.2415225505828857),
 'test_time': (1.6957714557647705,
  2.253708839416504,
  1.7781527042388916,
  2.2743003368377686,
  2.2574896812438965)}

In [24]:
#from surprise import *

# Compare the algorithms on their mean 2-fold cross-validation scores.
# One plain dict per algorithm is collected and turned into a DataFrame once,
# which avoids Series.append (removed in pandas 2.0) and keeps the metric columns numeric.
benchmark = []

for algorithm in [SVD(), MyOwnAlgorithm()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE'], cv=2, verbose=False)
    
    # Average every reported measure (test_rmse, fit_time, test_time) over the folds.
    mean_scores = pd.DataFrame.from_dict(results).mean(axis=0).to_dict()
    
    # The class name labels the row; no need to parse the object's repr.
    mean_scores['Algorithm'] = type(algorithm).__name__
    benchmark.append(mean_scores)
    
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse') 

Unnamed: 0_level_0,test_rmse,fit_time,test_time
Algorithm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
SVD,0.85714,32.749216,6.896137
MyOwnAlgorithm,0.983942,0.554503,4.488219


# Link to a dashboard

In [25]:
# Install the extra packages needed by the dashboard cells below.
# NOTE(review): the dashboard cell imports `iteritems` from `six`, while this installs `sixer` —
# confirm that `sixer` is the intended package.
!pip3 install sixer
!conda install -c conda-forge -c plotly jupyter-dash -y



In [28]:
from jupyter_dash import JupyterDash
from IPython.display import display, HTML

In [33]:
import dash    
import dash_html_components as html
import dash_core_components as dcc

import pandas as pd
import dash_table

from six import iteritems

def _uid_to_username(uid):
    """Translate a Surprise inner user id into the raw user id from the ratings data."""
    return trainset.to_raw_uid(uid)

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
proxy_port = 8080

# The dropdown stores *inner* user ids as its values and shows the raw ids as labels.
# The recommender (via model.predict) works with *raw* ids, so every inner id is
# converted before recommendations are generated. Otherwise the table would show
# books for a different user than the one named in the caption.
_default_userID = 1000
_default_un = _uid_to_username(_default_userID)
df  = pd.DataFrame(generate_recommendation_top_k(_default_un, svd, books_metadata))

#Make data for drop down: one option per user known to the training set
options_data = [
    {'label': _uid_to_username(inner), 'value': inner}
    for inner in trainset.all_users()
]


#app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app = JupyterDash(__name__, 
                  external_stylesheets=external_stylesheets,
                  requests_pathname_prefix='/proxy/' + str(proxy_port) + '/')

dropdown = html.Div([
    dcc.Dropdown(
        id='demo-dropdown',
        options=options_data,
        value=_default_userID
    ),
    html.Div(id='dd-output-container', children='Book recommendations for user {}'.format(_default_un))
])


table = dash_table.DataTable(
    id='table',
    columns=[{"name": i, "id": i} for i in df.columns],
    data=df.to_dict('records'),
)

app.layout = html.Div([dropdown, table])


@app.callback(
    [dash.dependencies.Output('dd-output-container', 'children'), dash.dependencies.Output("table", "data")],
    [dash.dependencies.Input('demo-dropdown', 'value')])
def update_output(value):
    """Refresh the caption and the table when another user is picked in the dropdown.

    `value` is the inner user id stored in the selected dropdown option.
    """
    # Clearing the dropdown sends None; keep the current output instead of failing on int(None).
    if value is None:
        raise dash.exceptions.PreventUpdate

    # Convert inner -> raw once and use the raw id for both the recommender and the caption.
    _un = _uid_to_username(int(value))
    df  = pd.DataFrame(generate_recommendation_top_k(_un, svd, books_metadata))
    return 'Book recommendations for user {}'.format(_un), df.to_dict('records')

# HTML snippet with a link that opens the dashboard through the notebook proxy
js = "<b style='color: red'>Please click on <a href='/proxy/" + str(proxy_port) + "/' target='_blank'>here</a> to open the dash</b>"
display(HTML(js))

srv = app.run_server(debug=True, use_reloader=False, port=proxy_port)
srv

  func()


Dash app running on http://127.0.0.1:8080/proxy/8080/


In [None]:


# NOTE: the snippet below does not work yet
## This is a sharable version of the link (for Paperspace)
#import os
#nid = os.environ['PAPERSPACE_NOTEBOOK_ID']
#cid = os.environ['PAPERSPACE_CLUSTER_ID']
#print('https://'+nid+'.'+cid+'.paperspacegradient.com/proxy/80/')

https://nbnt35tkym.clg07azjl.paperspacegradient.com/proxy/80/


# References
https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802 \
https://www.kaggle.com/zygmunt/goodbooks-10k \
https://surprise.readthedocs.io/en/stable/prediction_algorithms_package.html \
https://surprise.readthedocs.io/en/stable/building_custom_algo.html