In [1]:
import gzip
import json
from typing import Iterator

import pandas as pd
from surprise import (
    accuracy,
    BaselineOnly,
    Dataset,
    KNNBasic,
    NormalPredictor,
    Reader,
    SlopeOne,
    SVD,
)
from surprise.model_selection import cross_validate, KFold

# Table of Contents

- [Step B.1](#Step-B.1)
- [Step B.2](#Step-B.2)
- [Step B.3](#Step-B.3)
- [Step B.4](#Step-B.4)

In [2]:
# Download the “small” 5-core subset for the category "Digital Music" into
# the data/ directory (-P sets the directory the file is saved under).
# Dataset source: https://nijianmo.github.io/amazon/index.html
# NOTE(review): --backups=1 presumably keeps a single backup of an earlier
# download when this cell is re-run — confirm against the wget manual.
# NOTE(review): the archive is fetched over plain HTTP and is not checksummed.

!wget --backups=1 http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz -P data/

--2022-02-20 00:35:49--  http://deepyeti.ucsd.edu/jianmo/amazon/categoryFilesSmall/Digital_Music_5.json.gz
Resolving deepyeti.ucsd.edu (deepyeti.ucsd.edu)... 169.228.63.50
Connecting to deepyeti.ucsd.edu (deepyeti.ucsd.edu)|169.228.63.50|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19408584 (19M) [application/octet-stream]
Saving to: ‘data/Digital_Music_5.json.gz’


2022-02-20 00:36:40 (376 KB/s) - ‘data/Digital_Music_5.json.gz’ saved [19408584/19408584]



## Step B.1

In [3]:
def inspect_df(df: pd.DataFrame, n: int = 5) -> pd.DataFrame:
    """Print the shape of ``df`` and hand back its first ``n`` rows.

    Args:
        df: The DataFrame to look at.
        n: How many leading rows to return (defaults to 5).

    Returns:
        The result of ``df.head(n)``, suitable as a cell's display value.
    """
    rows_and_columns = df.shape
    print(f"shape: {rows_and_columns}")

    preview = df.head(n)
    return preview

In [4]:
def parse(filepath: str) -> Iterator[dict]:
    """Lazily yield one decoded JSON object per line of a gzip-compressed file.

    Args:
        filepath: Path to a gzip file holding one JSON document per line.

    Yields:
        The object decoded from each non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened or is not valid gzip data.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # The context manager guarantees the handle is closed both when the file
    # is exhausted and when the consumer abandons the generator early.
    with gzip.open(filepath, "rb") as file_obj:
        for line in file_obj:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            yield json.loads(line)

In [5]:
def file_to_dataframe(filepath: str) -> pd.DataFrame:
    """Load the essential review fields of a gzip JSON-lines file into a DataFrame.

    Args:
        filepath: Path passed on to ``parse``; every parsed record must provide
            the keys ``reviewerID``, ``asin``, ``overall`` and ``unixReviewTime``.

    Returns:
        A DataFrame with the columns ``user``, ``item``, ``rating`` and
        ``timestamp`` and one row per record, indexed 0..n-1 in file order.
        A file without records yields an empty frame that still has these
        four columns.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    columns = ["user", "item", "rating", "timestamp"]

    # Keep only the essential data; collecting plain records and building the
    # frame once avoids the manual row counter and the dict-of-dicts detour.
    records = [
        {
            "user": d["reviewerID"],
            "item": d["asin"],
            "rating": d["overall"],
            "timestamp": d["unixReviewTime"],
        }
        for d in parse(filepath)
    ]

    # Passing the columns explicitly fixes their order and keeps them present
    # even when there are no records at all.
    return pd.DataFrame(records, columns=columns)

In [6]:
# Load the downloaded reviews as a DataFrame (user, item, rating, timestamp).
review_data = file_to_dataframe("data/Digital_Music_5.json.gz")

In [7]:
# Reader configured for ratings on a scale from 1 to 5.
reader = Reader(rating_scale=(1, 5))

In [8]:
# Only the user, item and rating columns are handed to Surprise; the
# timestamp column is left out.
data = Dataset.load_from_df(review_data[["user", "item", "rating"]], reader)

# Show the shape and the first 10 rows of the frame held by the dataset.
inspect_df(data.df, 10)

shape: (169781, 3)


Unnamed: 0,user,item,rating
0,A2TYZ821XXK2YZ,3426958910,5.0
1,A3OFSREZADFUDY,3426958910,5.0
2,A2VAMODP8M77NG,3426958910,5.0
3,AAKSLZ9IDTEH0,3426958910,4.0
4,A3OH43OZJLKI09,5557706259,5.0
5,A6JP9YYJCK3WO,5557706259,4.0
6,A20Z0JD766DPG6,5557706259,5.0
7,AXQ89O6YRT7AX,5557706259,5.0
8,A1M379MK8MKGUF,5557706259,5.0
9,AA762OWLWCPQ2,5557706259,5.0


## Step B.2

In [9]:
# Collects the cross_validate() result of each algorithm, keyed by a display name.
cv_results = {}

In [10]:
# NormalPredictor: algorithm predicting a random rating based on the
# distribution of the training set, which is assumed to be normal.
#
# The folds come from a KFold seeded with the same value as the KFold used for
# SVD further down, so this run uses that same reproducible 5-fold split
# instead of a fresh random one on every execution. The predictions themselves
# are still random draws.
cv_results["NormalPredictor"] = cross_validate(
    algo=NormalPredictor(),
    data=data,
    verbose=True,
    cv=KFold(n_splits=5, random_state=123456789),
)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8940  0.8989  0.8961  0.8954  0.8927  0.8954  0.0021  
MAE (testset)     0.5933  0.5951  0.5947  0.5932  0.5937  0.5940  0.0008  
Fit time          0.13    0.18    0.19    0.17    0.17    0.17    0.02    
Test time         0.21    0.20    0.27    0.20    0.19    0.22    0.03    


In [11]:
# BaselineOnly: algorithm predicting the baseline estimate for given user and item.
#
# The folds come from a KFold seeded with the same value as the KFold used for
# SVD further down, so this run uses that same reproducible 5-fold split
# instead of a fresh random one on every execution.
cv_results["BaselineOnly"] = cross_validate(
    algo=BaselineOnly(),
    data=data,
    verbose=True,
    cv=KFold(n_splits=5, random_state=123456789),
)

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Evaluating RMSE, MAE of algorithm BaselineOnly on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5943  0.5902  0.5990  0.5930  0.5820  0.5917  0.0056  
MAE (testset)     0.3724  0.3704  0.3723  0.3695  0.3676  0.3704  0.0018  
Fit time          0.42    0.42    0.48    0.41    0.47    0.44    0.03    
Test time         0.14    0.15    0.15    0.14    0.15    0.15    0.01    


In [12]:
# KNNBasic: a basic collaborative filtering algorithm, configured here as
# item-item ("user_based": False) with cosine similarity.
#
# The folds come from a KFold seeded with the same value as the KFold used for
# SVD further down, so this run uses that same reproducible 5-fold split
# instead of a fresh random one on every execution.
cv_results["KNNBasic (cosine) item-based"] = cross_validate(
    algo=KNNBasic(
        sim_options={
            "name": "cosine",
            "user_based": False,
        }
    ),
    data=data,
    verbose=True,
    cv=KFold(n_splits=5, random_state=123456789),
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.6131  0.6100  0.6162  0.6084  0.6049  0.6105  0.0039  
MAE (testset)     0.3150  0.3139  0.3170  0.3146  0.3122  0.3145  0.0016  
Fit time          4.89    4.73    5.06    4.75    4.78    4.84    0.12    
Test time         0.86    0.74    0.66    0.67    0.60    0.71    0.09    


In [13]:
# SlopeOne: a simple yet accurate item-based collaborative filtering algorithm.
# This is a straightforward implementation of the SlopeOne algorithm:
# https://arxiv.org/abs/cs/0702144
#
# The folds come from a KFold seeded with the same value as the KFold used for
# SVD further down, so this run uses that same reproducible 5-fold split
# instead of a fresh random one on every execution.
cv_results["SlopeOne"] = cross_validate(
    algo=SlopeOne(),
    data=data,
    verbose=True,
    cv=KFold(n_splits=5, random_state=123456789),
)

Evaluating RMSE, MAE of algorithm SlopeOne on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.5785  0.5863  0.5874  0.5859  0.5768  0.5830  0.0044  
MAE (testset)     0.2706  0.2746  0.2758  0.2756  0.2687  0.2730  0.0029  
Fit time          2.59    2.70    2.60    2.84    2.84    2.71    0.11    
Test time         0.54    0.55    0.53    0.54    0.55    0.54    0.01    


In [14]:
# NOTE(review): this user-based KNN run is disabled and the reason is not
# recorded here (presumably its cost — TODO confirm). Either re-enable it or
# delete the cell.
# """
# A basic user-user collaborative filtering algorithm.
# """
# cv_results["KNNBasic (cosine) user-based"] = cross_validate(
#     algo=KNNBasic(
#         sim_options={
#             "name": "cosine",
#             "user_based": True,
#         }
#     ),
#     data=data,
#     verbose=True,
#     cv=5,
# )

In [15]:
# Evaluate SVD fold by fold on a fixed, seeded 5-fold split.
SEED = 123456789

kf = KFold(n_splits=5, random_state=SEED)

# Seeding the algorithm as well as the splitter makes the scores below
# repeatable across kernel restarts; seeding only the folds is not enough
# because SVD draws its initial factors from an RNG.
algo = SVD(random_state=SEED)

# Per-fold RMSE values, kept so they can be summarised or compared later
# instead of only being printed.
svd_fold_rmse = []

for trainset, testset in kf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error (rmse() also returns the value).
    svd_fold_rmse.append(accuracy.rmse(predictions, verbose=True))

RMSE: 0.5611
RMSE: 0.5651
RMSE: 0.5572
RMSE: 0.5657
RMSE: 0.5604


## Step B.3

## Step B.4