# SYMA

In [1]:
import pandas as pd
import os

## Loading Data

In [2]:
PATH_DATA = os.path.join("..", "data")

train_session_df = pd.read_csv(os.path.join(PATH_DATA, "train_sessions.csv"))
train_purchase_df = pd.read_csv(os.path.join(PATH_DATA, "train_purchases.csv"))

candidate_items_df = pd.read_csv(os.path.join(PATH_DATA, "candidate_items.csv"))
item_features_df = pd.read_csv(os.path.join(PATH_DATA, "item_features.csv"))

In [3]:
train_session_df.describe()

Unnamed: 0,session_id,item_id
count,4743820.0,4743820.0
mean,2218286.0,14022.11
std,1281012.0,8177.893
min,3.0,2.0
25%,1110000.0,6946.0
50%,2214788.0,14033.0
75%,3325631.0,21000.0
max,4440001.0,28143.0


In [4]:
train_purchase_df.describe()

Unnamed: 0,session_id,item_id
count,1000000.0,1000000.0
mean,2221071.0,13978.825051
std,1281018.0,8187.993593
min,3.0,3.0
25%,1112741.0,6977.0
50%,2220268.0,13922.0
75%,3329927.0,20879.0
max,4440001.0,28143.0


In [5]:
candidate_items_df.describe()

Unnamed: 0,item_id
count,4990.0
mean,14007.035271
std,8218.231425
min,4.0
25%,6833.5
50%,14108.5
75%,21200.0
max,28137.0


In [6]:
item_features_df.describe()

Unnamed: 0,item_id,feature_category_id,feature_value_id
count,471751.0,471751.0,471751.0
mean,14058.539477,42.424597,486.345578
std,8107.465455,22.186285,258.865151
min,2.0,1.0,1.0
25%,7060.0,25.0,273.0
50%,14045.0,47.0,512.0
75%,21063.0,61.0,708.0
max,28143.0,73.0,905.0


## Data exploration

*How many different items does exist?*

In [7]:
distinct_item_number = len(item_features_df.item_id.unique())
print("Unique item number :", distinct_item_number)
print("Item id are unique : ", item_features_df.item_id.nunique() == len(item_features_df.item_id.unique()))

Unique item number : 23691
Item id are unique :  True


*How many different sessions does exist?*

In [8]:
distinct_session_number = len(pd.concat([train_session_df.session_id, train_purchase_df.session_id]).unique())
print("Unique user number :", distinct_session_number)

Unique user number : 1000000


*Does session always look an item before buying it?*

In [9]:
import numpy as np

print("A user never look at one item before buying it.")
pd.merge(train_purchase_df, train_session_df, on=['session_id','item_id'], how='left', indicator='Exist')["Exist"].value_counts()

A user never look at one item before buying it.


left_only     1000000
right_only          0
both                0
Name: Exist, dtype: int64

*Can a session look at items without buying any?*

In [10]:
print("Every session bought exactly one item.")

pd.merge(train_purchase_df, train_session_df, on=['session_id'], how='left', indicator='Exist')["Exist"].value_counts()

Every session bought exactly one item.


both          4743820
left_only           0
right_only          0
Name: Exist, dtype: int64

*What is the average number of different items every user usually look?*

In [11]:
print("Average number of items seen by user :", train_session_df.groupby("session_id").count()["item_id"].mean())

Average number of items seen by user : 4.74382


*What will be the size of our rating matrix?*

In [12]:
print("Size of the maximum full rating matrix : ", (distinct_session_number * distinct_item_number, 3))

Size of the maximum full rating matrix :  (23691000000, 3)


## SVD++

We want to create ratings given by every session for every item. We will first choose the following rating system:
- If the user has seen the item, we will give it a rating of 1.
- If the user purchased the item, we will give it a rating of 2. 

In [13]:
# ----------------------------- WE CREATE RATINGS ---------------------------- #
train_rating_df = pd.concat([train_session_df.assign(rating=1), train_purchase_df.assign(rating=2)])
train_rating_df.describe()

Unnamed: 0,session_id,item_id,rating
count,5743820.0,5743820.0,5743820.0
mean,2218771.0,14014.57,1.1741
std,1281013.0,8179.668,0.3791956
min,3.0,2.0,1.0
25%,1110573.0,6952.0,1.0
50%,2215782.0,14017.0,1.0
75%,3326251.0,20969.0,1.0
max,4440001.0,28143.0,2.0


In [14]:
# ---------------------------- SHUFFLE AND RENAME ---------------------------- #
train_ratings_df_shuffled = train_rating_df.sample(len(train_rating_df))
train_ratings_df_shuffled.rename(columns={"session_id" : "user_id", "rating" : "raw_ratings"}, inplace=True)

In [15]:
# -------------- WE REDUCE THE SIZE OF OUR DATASET FOR RESEARCH -------------- #

train_set_df_reduced = train_ratings_df_shuffled[:10000]

In [16]:
# ----------------------------- WE CREATE OUR SET ---------------------------- #

import surprise

rating_reader = surprise.Reader(rating_scale=(1, 2))
dataset = surprise.dataset.Dataset.load_from_df(df=train_set_df_reduced[["user_id", "item_id", "raw_ratings"]], reader=rating_reader)

In [17]:
import surprise
import sklearn.model_selection

train_set_df, test_set_df = sklearn.model_selection.train_test_split(train_set_df_reduced)

rating_reader = surprise.Reader(rating_scale=(1, 2))
train_set = surprise.dataset.Dataset.load_from_df(df=train_set_df[["user_id", "item_id", "raw_ratings"]], reader=rating_reader)
test_set = surprise.dataset.Dataset.load_from_df(df=test_set_df[["user_id", "item_id", "raw_ratings"]], reader=rating_reader)

In [18]:
# ------------------------- WE TRAIN OUR FIRST MODEL ------------------------- #

model = surprise.SVD()

surprise.model_selection.cross_validate(model, train_set, measures=["RMSE", "MAE"], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.4007  0.3775  0.3869  0.3798  0.3780  0.3846  0.0087  
MAE (testset)     0.3050  0.2913  0.2967  0.2942  0.2919  0.2958  0.0050  
Fit time          0.42    0.38    0.38    0.37    0.34    0.38    0.02    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.40073756, 0.37754943, 0.38685571, 0.37975056, 0.3780363 ]),
 'test_mae': array([0.30503739, 0.29134224, 0.29667013, 0.29417354, 0.29190019]),
 'fit_time': (0.41888904571533203,
  0.3776090145111084,
  0.3796830177307129,
  0.36913299560546875,
  0.34375572204589844),
 'test_time': (0.0073473453521728516,
  0.0070340633392333984,
  0.0073549747467041016,
  0.007274627685546875,
  0.007431983947753906)}

# Let's compare our models

In [19]:
model_list = [surprise.NormalPredictor(), surprise.BaselineOnly(), surprise.KNNBaseline(), surprise.KNNBasic(), surprise.KNNWithMeans(), surprise.KNNWithZScore(), surprise.SlopeOne(), surprise.SVD(), surprise.SVDpp(), surprise.NMF(), surprise.CoClustering(), surprise.SlopeOne()]

result = {}
for model in model_list:
    scores = surprise.model_selection.cross_validate(model, train_set, measures=["RMSE", "MAE"], cv=5, verbose=False)
    result[model.__class__.__name__] = (scores["test_rmse"].mean(), scores["test_mae"].mean())

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

In [20]:
# ------------------------- BEST ALGORITHMS WITH RMSE ------------------------ #
sorted(result.items(), key=lambda x: x[1][0])

[('NMF', (0.38369588696351264, 0.2938766559314043)),
 ('KNNBasic', (0.3837257562018383, 0.2945646222222222)),
 ('KNNWithZScore', (0.3841332625352265, 0.2942441777777777)),
 ('SVD', (0.384250641636696, 0.295959336022494)),
 ('KNNWithMeans', (0.3842585331632329, 0.29434948888888884)),
 ('CoClustering', (0.38436807181465554, 0.2946157482084649)),
 ('BaselineOnly', (0.3844058431766527, 0.29594424904499)),
 ('KNNBaseline', (0.38442519243575096, 0.2959769510388076)),
 ('SlopeOne', (0.3845823851505409, 0.2944956888888889)),
 ('SVDpp', (0.3852439814173815, 0.29590232904094227)),
 ('NormalPredictor', (0.4803733198128734, 0.34588960079227177))]

In [21]:
# ------------------------- BEST ALGORITHMS WITH MAE ------------------------- #
sorted(result.items(), key=lambda x: x[1][1])

[('NMF', (0.38369588696351264, 0.2938766559314043)),
 ('KNNWithZScore', (0.3841332625352265, 0.2942441777777777)),
 ('KNNWithMeans', (0.3842585331632329, 0.29434948888888884)),
 ('SlopeOne', (0.3845823851505409, 0.2944956888888889)),
 ('KNNBasic', (0.3837257562018383, 0.2945646222222222)),
 ('CoClustering', (0.38436807181465554, 0.2946157482084649)),
 ('SVDpp', (0.3852439814173815, 0.29590232904094227)),
 ('BaselineOnly', (0.3844058431766527, 0.29594424904499)),
 ('SVD', (0.384250641636696, 0.295959336022494)),
 ('KNNBaseline', (0.38442519243575096, 0.2959769510388076)),
 ('NormalPredictor', (0.4803733198128734, 0.34588960079227177))]

In [22]:
# ------------------------------ BEST ALGORITHMS ----------------------------- #
import numpy as np
sorted(result.items(), key=lambda x: np.mean(x[1]))

[('NMF', (0.38369588696351264, 0.2938766559314043)),
 ('KNNBasic', (0.3837257562018383, 0.2945646222222222)),
 ('KNNWithZScore', (0.3841332625352265, 0.2942441777777777)),
 ('KNNWithMeans', (0.3842585331632329, 0.29434948888888884)),
 ('CoClustering', (0.38436807181465554, 0.2946157482084649)),
 ('SlopeOne', (0.3845823851505409, 0.2944956888888889)),
 ('SVD', (0.384250641636696, 0.295959336022494)),
 ('BaselineOnly', (0.3844058431766527, 0.29594424904499)),
 ('KNNBaseline', (0.38442519243575096, 0.2959769510388076)),
 ('SVDpp', (0.3852439814173815, 0.29590232904094227)),
 ('NormalPredictor', (0.4803733198128734, 0.34588960079227177))]

## Now perform some Grid Search

In [27]:
class MyCrossValidation:
    def __init__(self, params):
        self.SVD_list = [
            (surprise.SVD(**args, verbose=False), args)
            for args in list(sklearn.model_selection.ParameterGrid(params))
        ]
        self.full_train_set = train_set.build_full_trainset()

    def __train_test_model(self, svd_model, params, verbose=1):
        svd_model.fit(
            self.full_train_set,
        )
        predictions = svd_model.test(self.full_train_set.build_testset())
        score = surprise.accuracy.rmse(
            predictions, verbose=True if verbose == 2 else False
        )
        if verbose == 1:
            print("Params {} :".format(str(params)), score)
        return (params, score)

    def __call__(self, verbose=1):
        res = []
        while len(self.SVD_list):
            svd_model, params = self.SVD_list.pop()
            if verbose == 1:
                print("{} left".format(len(self.SVD_list)), end=" --- ")
            res.append(self.__train_test_model(svd_model, params, verbose))
            del svd_model
        return sorted(res, key=lambda x: x[1])

In [32]:
# ----------------------------- FIND BEST PARAMS ----------------------------- #
params = {
    "biased" : [False],
    "init_std_dev" : [0.5, 1, 5],
    "lr_all" : [0.001],
    "reg_bu" : [0.05, 0.1, 0.5],
    "reg_bi" : [0.005, 0.01, 0.05],
    "reg_qi" : [0.005, 0.01, 0.05],
    "reg_pu" : [0.0005, 0.001, 0.005],
}

best_model = MyCrossValidation(params)()

242 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.005, 'reg_qi': 0.05} : 0.9026155442740667
241 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.005, 'reg_qi': 0.01} : 0.9019266173524731
240 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.005, 'reg_qi': 0.005} : 0.9025194111872448
239 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.001, 'reg_qi': 0.05} : 0.902583424200978
238 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.001, 'reg_qi': 0.01} : 0.9035917745211172
237 left --- Params {'biased': False, 'init_std_dev': 5, 'lr_all': 0.001, 'reg_bi': 0.05, 'reg_bu': 0.5, 'reg_pu': 0.001, 'reg_qi': 0.005} : 0.9030834966496667
236 left --- Params {'biased': False, 'init_s

In [33]:
best_model[:5]

[({'biased': False,
   'init_std_dev': 1,
   'lr_all': 0.001,
   'reg_bi': 0.05,
   'reg_bu': 0.05,
   'reg_pu': 0.005,
   'reg_qi': 0.05},
  0.09989684766237528),
 ({'biased': False,
   'init_std_dev': 1,
   'lr_all': 0.001,
   'reg_bi': 0.01,
   'reg_bu': 0.5,
   'reg_pu': 0.005,
   'reg_qi': 0.01},
  0.09996056277436083),
 ({'biased': False,
   'init_std_dev': 1,
   'lr_all': 0.001,
   'reg_bi': 0.05,
   'reg_bu': 0.5,
   'reg_pu': 0.005,
   'reg_qi': 0.05},
  0.10001164725099751),
 ({'biased': False,
   'init_std_dev': 1,
   'lr_all': 0.001,
   'reg_bi': 0.005,
   'reg_bu': 0.05,
   'reg_pu': 0.005,
   'reg_qi': 0.01},
  0.1000452258791643),
 ({'biased': False,
   'init_std_dev': 1,
   'lr_all': 0.001,
   'reg_bi': 0.005,
   'reg_bu': 0.05,
   'reg_pu': 0.0005,
   'reg_qi': 0.01},
  0.10066874107263071)]

In [None]:
final_model = surprise.SVD(**best_model[0][1])