In [3]:
from scipy import spatial

# Four sample points in 2-D; c is the reference point compared to the others
a, b, c, d = [1., 2.], [2., 4.], [2.5, 4.], [4.5, 5.]

# Euclidean (straight-line) distance from c to each of a, b and d
ca, cb, cd = (spatial.distance.euclidean(c, other) for other in (a, b, d))

# print() joins its arguments with a single space, giving "ca = <value>"
print("ca =", ca)
print("cb =", cb)
print("cd =", cd)

ca = 2.5
cb = 0.5
cd = 2.23606797749979


In [4]:
from scipy import spatial
# Cosine distance = 1 - Sc(A, B), where the cosine similarity Sc(A, B) = cos(theta)
# and theta is the angle between the two vectors
ca_dist, cb_dist, cd_dist, ab_dist = (
    spatial.distance.cosine(u, v)
    for u, v in ((c, a), (c, b), (c, d), (a, b))
)

print("cosine dists:")
print("ca =", ca_dist)
print("cb =", cb_dist)
print("cd =", cd_dist)
print("ab =", ab_dist)

cosine dists:
ca = 0.004504527406047898
cb = 0.004504527406047898
cd = 0.015137225946083022
ab = 0.0


In [5]:
from surprise import SVD
from surprise import Dataset 
from surprise.model_selection import cross_validate

# Load the built-in MovieLens 100k dataset
data = Dataset.load_builtin('ml-100k')

In [6]:
# Use the SVD algorithm (with its default parameters) for reducing the
# dimensionality of the data
algo = SVD()

# Run 5-fold cross validation, reporting RMSE and MAE for every fold.
# As the cell's last expression, the returned results dict is displayed as well.
# NOTE(review): no random seed is set here, so the exact scores presumably
# differ from run to run - confirm whether reproducibility is needed.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9388  0.9335  0.9422  0.9266  0.9396  0.9361  0.0055  
MAE (testset)     0.7380  0.7373  0.7440  0.7321  0.7383  0.7379  0.0038  
Fit time          0.55    0.59    0.57    0.57    0.57    0.57    0.01    
Test time         0.09    0.09    0.09    0.09    0.11    0.09    0.01    


{'test_rmse': array([0.93875732, 0.93347213, 0.94219172, 0.92657735, 0.93958452]),
 'test_mae': array([0.73798887, 0.73729206, 0.74404991, 0.7320549 , 0.73832417]),
 'fit_time': (0.5500338077545166,
  0.5883688926696777,
  0.572803258895874,
  0.5700273513793945,
  0.5689244270324707),
 'test_time': (0.08746910095214844,
  0.08813977241516113,
  0.08902573585510254,
  0.08841609954833984,
  0.1124715805053711)}

In [7]:
# The Dataset module is used to load data from files,
# from pandas DataFrames, and from built-in datasets.
# The returned dataset object is what this cell displays.
Dataset.load_builtin()
# Dataset.load_from_file()
# Dataset.load_from_df()

# The Reader class is used to parse a file containing ratings.
# Default format - each rating is stored on a separate line in the order
# 'user', 'item', 'rating'. Order and separator are configured using these params:
# 1. line_format - string that stores the order of the data with field names
#     e.g. "item user rating"
# 2. sep - separator between fields, such as ','
# 3. rating_scale - specify the rating scale - default = (1, 5)
# 4. skip_lines - number of lines to skip at the beginning of the file

<surprise.dataset.DatasetAutoFolds at 0x7fb4dc097ca0>

In [8]:
# load_data.py

import pandas as pd
from surprise import Dataset
from surprise import Reader

# Toy ratings: users A-D each rated items 1 and 2, while the new user "E"
# has rated only item 1
ratings_dict = dict(
    item=[1, 2, 1, 2, 1, 2, 1, 2, 1],
    user=list("AABBCCDDE"),
    rating=[1, 2, 2, 4, 2.5, 4, 4.5, 5, 3],
)

# One DataFrame column per key, in insertion order (item, user, rating)
df = pd.DataFrame(data=ratings_dict)
# Reader configured with a rating scale of 1 to 5
reader = Reader(rating_scale=(1,5))
print(df)
print(reader)

# Load the pandas DataFrame into a Surprise dataset; the columns are passed
# in user, item, rating order
data = Dataset.load_from_df(df[["user", "item", "rating"]], reader)
print(data)

# Load the built-in MovieLens 100k dataset
movieLens = Dataset.load_builtin('ml-100k')
print(movieLens)

   item user  rating
0     1    A     1.0
1     2    A     2.0
2     1    B     2.0
3     2    B     4.0
4     1    C     2.5
5     2    C     4.0
6     1    D     4.5
7     2    D     5.0
8     1    E     3.0
<surprise.reader.Reader object at 0x7fb518580eb0>
<surprise.dataset.DatasetAutoFolds object at 0x7fb51dfef700>
<surprise.dataset.DatasetAutoFolds object at 0x7fb4dc0a5fa0>


In [9]:
# recommender.py

# KNN With Means = centered cosine algorithm
# name - similarity metric to use (cosine, msd, pearson, pearson_baseline)
# user_based - boolean: True for user-based, False for item-based
# min_support - min number of common items needed between users to consider similarity
from surprise import KNNWithMeans

# Item-based cosine similarity: user_based=False means similarities are
# computed between items rather than between users
sim_options = dict(name="cosine", user_based=False)
algo = KNNWithMeans(sim_options=sim_options)
print(algo)

<surprise.prediction_algorithms.knns.KNNWithMeans object at 0x7fb4dc0a5b20>


In [10]:
# Training normally uses N-fold cross-validation.
# The MovieLens 100K data ships with 5 splits (5-fold cross-validation):
# u1.base, u1.test, u2.base, u2.test ... u5.base, u5.test
# from load_data import data  - `data` comes from the load_data cell above
# from recommender import algo - `algo` comes from the recommender cell above

# Build a training set from the full dataset and fit the algorithm on it
trainingSet = data.build_full_trainset()
algo.fit(trainingSet)

Computing the cosine similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x7fb4dc0a5b20>

In [11]:
# Predict what user 'E' would rate movie 2
prediction = algo.predict('E', 2)
# .est is the estimated rating; as the last expression it is the cell's output
prediction.est

4.15

In [12]:
# Tuning the Algo Params
# GridSearchCV tries combos of params and reports the best params for accuracy
from surprise import KNNWithMeans
from surprise import Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin("ml-100k")

# Candidate similarity settings: 2 metrics x 3 min_support values x
# item-based / user-based
sim_options = dict(
    name=["msd", "cosine"],
    min_support=[3, 4, 5],
    user_based=[False, True],
)
param_grid = dict(sim_options=sim_options)

# Grid search over KNNWithMeans with 3-fold cross-validation, scored by RMSE and MAE
gs = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
gs.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [13]:
# Best score and the parameters that achieved it, one per line,
# first for RMSE and then for MAE
print(gs.best_score["rmse"], gs.best_params["rmse"], sep="\n")
print(gs.best_score["mae"], gs.best_params["mae"], sep="\n")

0.9442179979419243
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}
0.7407435682907325
{'sim_options': {'name': 'msd', 'min_support': 3, 'user_based': False}}


In [14]:
# Means that our Centered KNN Algo works best with
# 1. Item based
# 2. MSD similarity metric
# 3. Min support of 3

# For model-based approaches, we can use Surprise to check which values work best:
# 1. n_epochs - number of iterations of SGD, an iterative method used in statistics to minimize a function
# 2. lr_all - learning rate for all params, param that decides how much params are adjusted
# 3. reg_all - regularization term for all params, penalty term to prevent overfitting
#NOTE: No similarity metrics in matrix factorization

In [15]:
# Best values for SVD algorithm - Singular Value Decomposition
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin("ml-100k")

# Candidate SVD hyper-parameters: two values each for the number of epochs,
# the learning rate and the regularization term
param_grid = dict(
    n_epochs=[5, 10],
    lr_all=[0.002, 0.005],
    reg_all=[0.4, 0.6],
)

# Grid search over SVD with 3-fold cross-validation, scored by RMSE and MAE
gs = GridSearchCV(
    SVD,
    param_grid,
    measures=["rmse", "mae"],
    cv=3,
)
gs.fit(data)



In [16]:
# Grid-search results for SVD: best score and the parameters that achieved it,
# one per line, first for RMSE and then for MAE
print(gs.best_score["rmse"], gs.best_params["rmse"], sep="\n")
print(gs.best_score["mae"], gs.best_params["mae"], sep="\n")

0.9640578222001827
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
0.7725480658363456
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}
