<a href="https://colab.research.google.com/github/jhancuch/recommederAnalysis/blob/main/svd_v_svdpp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

I analyze both the SVD and SVD++ algorithms from the Surprise package to demonstrate their usefulness for our final project, OnMart product recommendations. I use the practice transactions CSV file and conduct a comparative analysis using run time, cross-validation, and exhaustive grid search.

### Data Ingestion

In [None]:
!pip3 install surprise

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 27.4 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1633732 sha256=3a313b3ba9e7aa6f915022046d09742b5fc1555fb46f90ead544f5f3e34a8876
  Stored in directory: /root/.cache/pip/wheels/76/44/74/b498c42be47b2406bd27994e16c5188e337c657025ab400c1c
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.1 surprise-0.1


In [None]:
import numpy as np
import pandas as pd

import os
import csv
from collections import defaultdict

import surprise
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import SVDpp
from surprise.model_selection import GridSearchCV
from surprise import accuracy
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

In [None]:
# Fix NumPy's global random state so repeated runs of the notebook are comparable.
SEED = 1234
np.random.seed(SEED)

In [None]:
# Practice transactions file hosted in the project's GitHub repository.
transactions_url = "https://raw.githubusercontent.com/jhancuch/onmart/main/data/Transactions.csv"
transactions = pd.read_csv(transactions_url)

# Preview the first rows to confirm the columns loaded as expected.
transactions.head()

Unnamed: 0,CustomerID,FirstName,LastName,CreditCardNumber,OrderID,Customer_Occupation,Credit_Card_Number,Order_ID,Order_Date,Expected_Delivery_Date,...,Product_Name,Category,Manufacturer,Review_Rating,Delivery_Tracking_ID,Delivery_Type,Delivery_Zip_Code,Transaction_Status,Order_Returned,Order_Delivered_on_Time
0,566-26-0321,Rebeka,Stokes,xxxx-xxxx-xxxx-2650,50787284-d78d-4b6a-b93f-7419b0d46590,Lawyer,12345789032,98,4/7/2020,4/11/2020,...,Fitbit small,fitness,Fitbit small,3,98,Pickup,10048,Disputed,1,1
1,274-18-0017,Rahsaan,Koch,xxxx-xxxx-xxxx-5055,f61639d7-28de-4507-a63a-71e99921d8f9,Lawyer,12345789032,131,5/10/2020,5/14/2020,...,Fitbit small,fitness,Fitbit small,3,131,Pickup,10048,Disputed,1,1
2,455-28-7950,Zion,Osinski,xxxx-xxxx-xxxx-0150,7b1caf69-bd5b-441c-9115-6fa31ba1c313,Lawyer,12345789032,318,11/13/2020,11/17/2020,...,Fitbit small,fitness,Fitbit small,3,318,Pickup,10048,Disputed,1,1
3,254-72-5938,Ward,Brekke,xxxx-xxxx-xxxx-5280,cb58e8e9-572b-4d9e-af9f-dbecf9dfd3c5,Actor,12345789032,528,2/12/2020,2/16/2020,...,Fitbit large,fitness,Fitbit large,5,528,Pickup,10048,Disputed,0,1
4,563-47-8401,Terrence,Franecki,xxxx-xxxx-xxxx-1019,a9d66460-a48f-419f-8d39-7818290a3041,Actor,12345789032,561,2/12/2020,2/16/2020,...,Fitbit large,fitness,Fitbit large,5,561,Pickup,10048,Disputed,0,1


## Data Preparation 

In [None]:
# Build the (user, item, rating) frame consumed by the rest of the notebook.
#
# Missing values are dropped BEFORE the dtype casts: casting first would turn a
# missing id into the literal string 'nan' (which dropna() can no longer see)
# and would raise on a missing rating when casting to int.
data = (
    transactions[['CustomerID', 'Product_Name', 'Review_Rating']]
    .dropna()
    .rename(columns={
        'CustomerID': 'user',
        'Product_Name': 'item',
        'Review_Rating': 'rating',
    })
    .astype({'user': str, 'item': str, 'rating': int})
)
data.head()

Unnamed: 0,user,item,rating
0,566-26-0321,Fitbit small,3
1,274-18-0017,Fitbit small,3
2,455-28-7950,Fitbit small,3
3,254-72-5938,Fitbit large,5
4,563-47-8401,Fitbit large,5


In [None]:
# Persist the ratings frame (with a header row) ...
data.to_csv('transaction_train.csv', index=False)

# ... then copy every record into a second file WITHOUT the header row, which is
# the file the Surprise reader loads later on.
with open("transaction_train.csv", "r") as src, \
        open("transaction_test.csv", "w", newline='') as dst:
    records = csv.reader(src, delimiter=',')
    next(records, None)  # discard the "user,item,rating" header line
    csv.writer(dst, delimiter=",").writerows(rec for rec in records if rec)

In [None]:
def get_top_n(predictions, n=3):
    '''Return the top-N recommendations for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm. Each prediction is
            unpacked as a 5-tuple (uid, iid, true rating, estimate, details);
            only uid, iid and the estimate are used.
        n(int): The number of recommendations to output for each user. Default
            is 3.

    Returns:
    A dict (a ``defaultdict(list)``) where keys are user (raw) ids and values
    are lists of tuples:
        [(raw item id, rating estimation), ...] of size at most n, ordered by
        rating estimation from highest to lowest.
    '''

    # First group the (item, estimate) pairs by user.
    top_n = defaultdict(list)
    for uid, iid, _, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then keep only the n highest estimates per user. Re-assigning the value
    # of an existing key does not change the dict's size, so it is safe while
    # iterating over the keys.
    for uid in top_n:
        ranked = sorted(top_n[uid], key=lambda pair: pair[1], reverse=True)
        top_n[uid] = ranked[:n]

    return top_n

## SVD Model

### Data

In [None]:
# Each line of the file is parsed as a comma-separated "user,item,rating" record.
svd_reader = Reader(line_format='user item rating', sep=',')
svd_data = Dataset.load_from_file('transaction_test.csv', reader=svd_reader)

# Trainset built from the whole dataset; used for the final fit below.
svd_trainset = svd_data.build_full_trainset()

### Grid Search

In [None]:
# Candidate hyper-parameters: 3 epoch counts x 4 learning rates x 4
# regularisation strengths = 48 combinations, all of which are evaluated.
svd_param_grid = {
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005, 0.007, 0.009],
    'reg_all': [0.0, 0.2, 0.4, 0.6],
}

# Score every combination with 5-fold cross-validation on RMSE and MAE.
svd_gs = GridSearchCV(
    SVD,
    svd_param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [None]:
%timeit
svd_gs.fit(svd_data)

In [None]:
# Report the best RMSE found by the grid search, followed by the
# hyper-parameter combination that achieved it.
svd_best_rmse = svd_gs.best_score['rmse']
svd_best_rmse_params = svd_gs.best_params['rmse']

print(svd_best_rmse)
print(svd_best_rmse_params)

0.19453025599059537
{'n_epochs': 10, 'lr_all': 0.007, 'reg_all': 0.0}


### Create Model

In [None]:
# Configure SVD with the RMSE-best hyper-parameters straight from the grid
# search instead of hand-copying them from the printed output, so the model
# cannot drift out of sync with the search if the data or grid changes.
svd_algorithm = SVD(**svd_gs.best_params['rmse'])

In [None]:
%time
svd_algorithm.fit(svd_trainset)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.25 µs


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb9dc49e910>

In [None]:
# Build the "anti" testset: all (user, item) pairs that are NOT in the training
# set. These are the candidate pairs whose ratings get predicted next.
svd_testset = svd_trainset.build_anti_testset()

In [None]:
%time
svd_predictions = svd_algorithm.test(svd_testset)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.11 µs


In [None]:
# Keep, for every user, the three items with the highest SVD rating estimates.
top_n = get_top_n(svd_predictions, n=3)

In [None]:
# Show the recommended item ids for the first nine users only.
for uid, user_ratings in list(top_n.items())[:9]:
  print(uid, [item_id for item_id, _ in user_ratings])

566-26-0321 ['VR7', 'Redmi', 'Harman Kardon']
274-18-0017 ['VR7', 'Redmi', 'Harman Kardon']
455-28-7950 ['Fitbit large', 'Logitech', 'VR7']
254-72-5938 ['Logitech', 'Sony', 'Redmi']
563-47-8401 ['VR7', 'Sony', 'Redmi']
848-43-9454 ['Fitbit large', 'VR7', 'Sony']
056-77-2913 ['VR7', 'Redmi', 'Fitbit small']
404-86-6657 ['VR7', 'Sony', 'Samsung s10']
653-90-2575 ['Logitech', 'VR7', 'Sony']


### Validation

In [None]:
# 5-fold cross-validation of the configured SVD model, measuring RMSE and MAE;
# verbose=True prints the per-fold table (scores plus fit/test times) below.
cross_validate(svd_algorithm, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.1965  0.1957  0.1975  0.1929  0.1984  0.1962  0.0019  
MAE (testset)     0.0950  0.0907  0.0959  0.0914  0.0943  0.0935  0.0020  
Fit time          0.43    0.42    0.43    0.43    0.41    0.42    0.01    
Test time         0.03    0.03    0.02    0.02    0.02    0.03    0.00    


{'fit_time': (0.42838048934936523,
  0.4199085235595703,
  0.4312012195587158,
  0.4263026714324951,
  0.41325998306274414),
 'test_mae': array([0.09503722, 0.09070615, 0.09586213, 0.0913729 , 0.09431822]),
 'test_rmse': array([0.1965307 , 0.19569974, 0.19749821, 0.19288698, 0.19840951]),
 'test_time': (0.026318788528442383,
  0.02803182601928711,
  0.024149179458618164,
  0.024747848510742188,
  0.024579763412475586)}

## SVD++ Model

### Data

In [None]:
# Each line of the file is parsed as a comma-separated "user,item,rating" record.
svdpp_reader = Reader(line_format='user item rating', sep=',')
svdpp_data = Dataset.load_from_file('transaction_test.csv', reader=svdpp_reader)

# Trainset built from the whole dataset; used for the final fit below.
svdpp_trainset = svdpp_data.build_full_trainset()

### Grid Search

In [None]:
# Same candidate grid as used for SVD: 3 epoch counts x 4 learning rates x 4
# regularisation strengths = 48 combinations, all of which are evaluated.
svdpp_param_grid = {
    'n_epochs': [5, 10, 20],
    'lr_all': [0.002, 0.005, 0.007, 0.009],
    'reg_all': [0.0, 0.2, 0.4, 0.6],
}

# Score every combination with 5-fold cross-validation on RMSE and MAE.
svdpp_gs = GridSearchCV(
    SVDpp,
    svdpp_param_grid,
    measures=['rmse', 'mae'],
    cv=5,
)

In [None]:
%timeit
svdpp_gs.fit(svdpp_data)

In [None]:
# Report the best RMSE found by the grid search, followed by the
# hyper-parameter combination that achieved it.
svdpp_best_rmse = svdpp_gs.best_score['rmse']
svdpp_best_rmse_params = svdpp_gs.best_params['rmse']

print(svdpp_best_rmse)
print(svdpp_best_rmse_params)

0.19800136697519322
{'n_epochs': 10, 'lr_all': 0.009, 'reg_all': 0.0}


### Model

In [None]:
# Configure SVD++ with the RMSE-best hyper-parameters straight from the grid
# search instead of hand-copying them from the printed output, so the model
# cannot drift out of sync with the search if the data or grid changes.
svdpp_algorithm = SVDpp(**svdpp_gs.best_params['rmse'])

In [None]:
%time
svdpp_algorithm.fit(svdpp_trainset)

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.25 µs


<surprise.prediction_algorithms.matrix_factorization.SVDpp at 0x7fb9dc538950>

In [None]:
# Build the "anti" testset: all (user, item) pairs that are NOT in the training
# set. These are the candidate pairs whose ratings get predicted next.
svdpp_testset = svdpp_trainset.build_anti_testset()

In [None]:
%time
svdpp_predictions = svdpp_algorithm.test(svdpp_testset)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.87 µs


In [None]:
# Keep, for every user, the three items with the highest SVD++ rating estimates.
# NOTE: this rebinds `top_n`, replacing the SVD recommendations computed earlier.
top_n = get_top_n(svdpp_predictions, n=3)

In [None]:
# Show the recommended item ids for the first nine users only.
for uid, user_ratings in list(top_n.items())[:9]:
  print(uid, [item_id for item_id, _ in user_ratings])

566-26-0321 ['VR7', 'Redmi', 'Harman Kardon']
274-18-0017 ['VR7', 'Redmi', 'Harman Kardon']
455-28-7950 ['Fitbit large', 'VR7', 'Logitech']
254-72-5938 ['Logitech', 'Sony', 'Redmi']
563-47-8401 ['VR7', 'Sony', 'Redmi']
848-43-9454 ['Fitbit large', 'VR7', 'Sony']
056-77-2913 ['VR7', 'Redmi', 'Fitbit small']
404-86-6657 ['VR7', 'Sony', 'Samsung s10']
653-90-2575 ['VR7', 'Logitech', 'Sony']


### Validation

In [None]:
# 5-fold cross-validation of the configured SVD++ model, measuring RMSE and MAE;
# verbose=True prints the per-fold table (scores plus fit/test times) below.
cross_validate(svdpp_algorithm, svdpp_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVDpp on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2014  0.1981  0.1964  0.1981  0.2011  0.1990  0.0019  
MAE (testset)     0.1013  0.0980  0.0982  0.0980  0.1035  0.0998  0.0022  
Fit time          0.92    0.91    0.93    0.92    0.92    0.92    0.00    
Test time         0.06    0.07    0.06    0.08    0.06    0.06    0.01    


{'fit_time': (0.9237861633300781,
  0.9141921997070312,
  0.926285982131958,
  0.9182925224304199,
  0.9237527847290039),
 'test_mae': array([0.10128751, 0.09801132, 0.098166  , 0.09799862, 0.1034702 ]),
 'test_rmse': array([0.20139426, 0.1980771 , 0.19641264, 0.19814595, 0.20107043]),
 'test_time': (0.05914950370788574,
  0.06871414184570312,
  0.05883288383483887,
  0.07536935806274414,
  0.057782649993896484)}