# SYMA

In [1]:
import pandas as pd
import os

## Loading Data

In [2]:
PATH_DATA = os.path.join("..", "data")


def _read_data_csv(filename):
    """Read one CSV file stored under PATH_DATA and return it as a DataFrame."""
    return pd.read_csv(os.path.join(PATH_DATA, filename))


# Training data: the session log and the purchase log.
train_session_df = _read_data_csv("train_sessions.csv")
train_purchase_df = _read_data_csv("train_purchases.csv")

# Item-level tables: the candidate items and the item features.
candidate_items_df = _read_data_csv("candidate_items.csv")
item_features_df = _read_data_csv("item_features.csv")

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the session log.
train_session_df.describe()

Unnamed: 0,session_id,item_id
count,4743820.0,4743820.0
mean,2218286.0,14022.11
std,1281012.0,8177.893
min,3.0,2.0
25%,1110000.0,6946.0
50%,2214788.0,14033.0
75%,3325631.0,21000.0
max,4440001.0,28143.0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the purchase log.
train_purchase_df.describe()

Unnamed: 0,session_id,item_id
count,1000000.0,1000000.0
mean,2221071.0,13978.825051
std,1281018.0,8187.993593
min,3.0,3.0
25%,1112741.0,6977.0
50%,2220268.0,13922.0
75%,3329927.0,20879.0
max,4440001.0,28143.0


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the candidate item table.
candidate_items_df.describe()

Unnamed: 0,item_id
count,4990.0
mean,14007.035271
std,8218.231425
min,4.0
25%,6833.5
50%,14108.5
75%,21200.0
max,28137.0


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns of the item feature table.
item_features_df.describe()

Unnamed: 0,item_id,feature_category_id,feature_value_id
count,471751.0,471751.0,471751.0
mean,14058.539477,42.424597,486.345578
std,8107.465455,22.186285,258.865151
min,2.0,1.0,1.0
25%,7060.0,25.0,273.0
50%,14045.0,47.0,512.0
75%,21063.0,61.0,708.0
max,28143.0,73.0,905.0


## Data exploration

*How many distinct items exist?*

In [7]:
# Number of distinct item ids that appear in the item feature table.
distinct_item_number = len(item_features_df["item_id"].unique())
print("Unique item number :", distinct_item_number)
# NOTE(review): both sides of this comparison count the distinct ids of the same
# column, so it can only differ when the column contains NaN. It does not test
# whether each item id occupies a single row -- confirm what was meant to be checked.
print(
    "Item id are unique : ",
    item_features_df["item_id"].nunique() == len(item_features_df["item_id"].unique()),
)

Unique item number : 23691
Item id are unique :  True


*How many distinct sessions exist?*

In [8]:
# A session id may appear in the session log, the purchase log, or both,
# so the distinct ids are counted over the concatenation of the two columns.
distinct_session_number = len(
    pd.concat(
        [train_session_df["session_id"], train_purchase_df["session_id"]]
    ).unique()
)
print("Unique user number :", distinct_session_number)

Unique user number : 1000000


*Does a session always view an item before buying it?*

In [9]:
import numpy as np

print("A user never look at one item before buying it.")
# Left-join every purchase onto the session log on (session_id, item_id).
# The indicator column reports "both" when the purchased item also appears in
# that session's log and "left_only" when it does not.
train_purchase_df.merge(
    train_session_df,
    how="left",
    on=["session_id", "item_id"],
    indicator="Exist",
)["Exist"].value_counts()

A user never look at one item before buying it.


left_only     1000000
right_only          0
both                0
Name: Exist, dtype: int64

*Can a session look at items without buying any?*

In [10]:
print("Every session bought exactly one item.")

# An outer join keeps the session ids found in only one of the two tables, so the
# indicator can actually answer the question:
#   - "right_only": sessions present in the session log without any purchase row,
#   - "left_only":  purchases whose session has no row in the session log,
#   - "both":       rows whose session id is present in both tables.
# A left join from the purchase table can never produce "right_only", which made
# a zero in that row meaningless.
pd.merge(train_purchase_df, train_session_df, on=['session_id'], how='outer', indicator='Exist')["Exist"].value_counts()

Every session bought exactly one item.


both          4743820
left_only           0
right_only          0
Name: Exist, dtype: int64

*What is the average number of items a session looks at?*

In [11]:
# Count the session-log rows of each session, then average those counts over all sessions.
print(
    "Average number of items seen by user :",
    train_session_df.groupby("session_id")["item_id"].count().mean(),
)

Average number of items seen by user : 4.74382


*What will be the size of our rating matrix?*

In [12]:
# One row per (session, item) pair and 3 columns
# (presumably session id, item id and rating -- confirm).
full_rating_matrix_shape = (distinct_session_number * distinct_item_number, 3)
print("Size of the maximum full rating matrix : ", full_rating_matrix_shape)

Size of the maximum full rating matrix :  (23691000000, 3)


## SVD++

We want to create ratings given by every session for every item. We will first choose the following rating system:
- If the user has seen the item, we will give it a rating of 1.
- If the user purchased the item, we will give it a rating of 2. 

In [13]:
# ----------------------------- WE CREATE RATINGS ---------------------------- #
# Stack the two logs into one long table: rows coming from the session log get
# rating 1, rows coming from the purchase log get rating 2.
train_rating_df = pd.concat(
    [
        train_session_df.assign(rating=1),
        train_purchase_df.assign(rating=2),
    ]
)
train_rating_df.describe()

Unnamed: 0,session_id,item_id,rating
count,5743820.0,5743820.0,5743820.0
mean,2218771.0,14014.57,1.1741
std,1281013.0,8179.668,0.3791956
min,3.0,2.0,1.0
25%,1110573.0,6952.0,1.0
50%,2215782.0,14017.0,1.0
75%,3326251.0,20969.0,1.0
max,4440001.0,28143.0,2.0


In [14]:
# ---------------------------- SHUFFLE AND RENAME ---------------------------- #
# frac=1 draws every row exactly once, i.e. a full shuffle. The fixed
# random_state makes the row order -- and any subset later sliced from it --
# identical on every run.
# Renaming is chained instead of done in place so that re-running this cell
# always rebuilds the frame from train_rating_df.
train_ratings_df_shuffled = (
    train_rating_df
    .sample(frac=1, random_state=42)
    .rename(columns={"session_id" : "user_id", "rating" : "raw_ratings"})
)

In [15]:
# -------------- WE REDUCE THE SIZE OF OUR DATASET FOR RESEARCH -------------- #

# Keep only the first 10 000 rows of the shuffled ratings.
train_set_df_reduced = train_ratings_df_shuffled.iloc[:10000]

In [16]:
# ----------------------------- WE CREATE OUR SET ---------------------------- #

import surprise

# Ratings take the values 1 and 2, so the reader is given the scale (1, 2).
rating_reader = surprise.Reader(rating_scale=(1, 2))
# The frame handed to surprise holds the user, item and rating columns, in that order.
dataset = surprise.dataset.Dataset.load_from_df(
    df=train_set_df_reduced[["user_id", "item_id", "raw_ratings"]],
    reader=rating_reader,
)

In [17]:
import surprise
import sklearn.model_selection

# Split the reduced ratings into a training part and a held-out part.
# random_state is fixed so the same rows land in each part on every run.
train_set_df, test_set_df = sklearn.model_selection.train_test_split(train_set_df_reduced, random_state=42)

rating_reader = surprise.Reader(rating_scale=(1, 2))
# Columns handed to surprise: user, item, rating -- in that order.
rating_columns = ["user_id", "item_id", "raw_ratings"]
train_set = surprise.dataset.Dataset.load_from_df(df=train_set_df[rating_columns], reader=rating_reader)
test_set = surprise.dataset.Dataset.load_from_df(df=test_set_df[rating_columns], reader=rating_reader)

In [18]:
# ------------------------- WE TRAIN OUR FIRST MODEL ------------------------- #

model = surprise.SVD()

# 5-fold cross-validation on the training set, reporting RMSE and MAE per fold.
# n_jobs=-1 asks for all available CPUs.
surprise.model_selection.cross_validate(
    model,
    train_set,
    measures=["RMSE", "MAE"],
    cv=5,
    verbose=True,
    n_jobs=-1,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3893  0.3746  0.3664  0.3705  0.3797  0.3761  0.0079  
MAE (testset)     0.2898  0.2821  0.2776  0.2790  0.2851  0.2827  0.0044  
Fit time          0.39    0.38    0.34    0.36    0.34    0.36    0.02    
Test time         0.01    0.01    0.01    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.38929695, 0.37464803, 0.36638123, 0.37048045, 0.37971612]),
 'test_mae': array([0.28978332, 0.2820518 , 0.27760589, 0.27902141, 0.28514466]),
 'fit_time': (0.3886566162109375,
  0.3816335201263428,
  0.3351600170135498,
  0.3603065013885498,
  0.3379371166229248),
 'test_time': (0.013039112091064453,
  0.007359504699707031,
  0.007533073425292969,
  0.0070400238037109375,
  0.007886648178100586)}

# Let's compare our models

In [19]:
# One instance of every algorithm to benchmark. Each algorithm appears exactly
# once: results are keyed by class name, so a repeated entry would only be
# cross-validated a second time and overwrite the first score.
model_list = [
    surprise.NormalPredictor(),
    surprise.BaselineOnly(),
    surprise.KNNBaseline(),
    surprise.KNNBasic(),
    surprise.KNNWithMeans(),
    surprise.KNNWithZScore(),
    surprise.SlopeOne(),
    surprise.SVD(),
    surprise.SVDpp(),
    surprise.NMF(),
    surprise.CoClustering(),
]

# Maps algorithm class name -> (mean RMSE, mean MAE) over the 5 folds.
result = {}
for model in model_list:
    scores = surprise.model_selection.cross_validate(model, train_set, measures=["RMSE", "MAE"], cv=5, verbose=False)
    result[model.__class__.__name__] = (scores["test_rmse"].mean(), scores["test_mae"].mean())

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matr

In [20]:
# ------------------------- BEST ALGORITHMS WITH RMSE ------------------------ #
# Each entry is (algorithm name, (mean RMSE, mean MAE)); rank ascending on the RMSE component.
sorted(result.items(), key=lambda name_and_scores: name_and_scores[1][0])

[('KNNBasic', (0.3756149370180923, 0.28216844444444444)),
 ('NMF', (0.37567029358443, 0.28165728130769346)),
 ('KNNWithZScore', (0.37570280584278837, 0.2816638222222222)),
 ('KNNWithMeans', (0.3758726344201063, 0.28182364444444447)),
 ('CoClustering', (0.3761587694225965, 0.2822100277982957)),
 ('KNNBaseline', (0.3762555199073618, 0.2828159731757588)),
 ('BaselineOnly', (0.37627617440355254, 0.2830257197443878)),
 ('SlopeOne', (0.37636238971266467, 0.28215519999999994)),
 ('SVD', (0.37653609601052046, 0.2831849467549553)),
 ('SVDpp', (0.3766360484081435, 0.2827790485735288)),
 ('NormalPredictor', (0.4721333813770521, 0.33600701838575764))]

In [21]:
# ------------------------- BEST ALGORITHMS WITH MAE ------------------------- #
# Each entry is (algorithm name, (mean RMSE, mean MAE)); rank ascending on the MAE component.
sorted(result.items(), key=lambda name_and_scores: name_and_scores[1][1])

[('NMF', (0.37567029358443, 0.28165728130769346)),
 ('KNNWithZScore', (0.37570280584278837, 0.2816638222222222)),
 ('KNNWithMeans', (0.3758726344201063, 0.28182364444444447)),
 ('SlopeOne', (0.37636238971266467, 0.28215519999999994)),
 ('KNNBasic', (0.3756149370180923, 0.28216844444444444)),
 ('CoClustering', (0.3761587694225965, 0.2822100277982957)),
 ('SVDpp', (0.3766360484081435, 0.2827790485735288)),
 ('KNNBaseline', (0.3762555199073618, 0.2828159731757588)),
 ('BaselineOnly', (0.37627617440355254, 0.2830257197443878)),
 ('SVD', (0.37653609601052046, 0.2831849467549553)),
 ('NormalPredictor', (0.4721333813770521, 0.33600701838575764))]

In [22]:
# ------------------------------ BEST ALGORITHMS ----------------------------- #
import numpy as np
# Rank ascending on the average of the two error measures (mean RMSE, mean MAE).
sorted(result.items(), key=lambda name_and_scores: np.mean(name_and_scores[1]))

[('NMF', (0.37567029358443, 0.28165728130769346)),
 ('KNNWithZScore', (0.37570280584278837, 0.2816638222222222)),
 ('KNNWithMeans', (0.3758726344201063, 0.28182364444444447)),
 ('KNNBasic', (0.3756149370180923, 0.28216844444444444)),
 ('CoClustering', (0.3761587694225965, 0.2822100277982957)),
 ('SlopeOne', (0.37636238971266467, 0.28215519999999994)),
 ('KNNBaseline', (0.3762555199073618, 0.2828159731757588)),
 ('BaselineOnly', (0.37627617440355254, 0.2830257197443878)),
 ('SVDpp', (0.3766360484081435, 0.2827790485735288)),
 ('SVD', (0.37653609601052046, 0.2831849467549553)),
 ('NormalPredictor', (0.4721333813770521, 0.33600701838575764))]

## Now perform some Grid Search

In [33]:
# ------------------------- CREATING CROSS VALIDATION ------------------------ #

import sklearn.model_selection
from multiprocessing import Pool

class MyCrossValidation():
    """Exhaustive search over ``surprise.SVD`` hyper-parameters, run in parallel.

    Calling an instance fits one SVD model per parameter combination and
    returns the ``(model, params, rmse)`` triples sorted by ascending RMSE.

    NOTE(review): despite the class name, nothing is held out. Every candidate
    is fitted on the full training set and scored on the ratings of that same
    training set, so the RMSE measures fit to the training data, not
    generalisation.
    """

    def __init__(self, params, data=None):
        """Build one untrained SVD model per combination of ``params``.

        Args:
            params: Mapping from SVD keyword-argument name to the list of
                values to try; expanded with ``ParameterGrid``.
            data: Object exposing ``build_full_trainset()``. Defaults to the
                notebook-level ``train_set`` when omitted.
        """
        if data is None:
            data = train_set
        self.SVD_list = [
            (surprise.SVD(**args), args)
            for args in sklearn.model_selection.ParameterGrid(params)
        ]
        self.full_train_set = data.build_full_trainset()

    # Single leading underscore on purpose: this bound method is handed to
    # Pool.map and therefore pickled. A double-underscore (name-mangled) method
    # is pickled under its unmangled name and may fail to resolve in the worker
    # processes on some Python versions.
    def _train_test_model(self, args):
        """Fit one candidate and score it on the training ratings.

        Args:
            args: ``(svd_model, params)`` pair taken from ``self.SVD_list``.

        Returns:
            ``(svd_model, params, rmse)``.
        """
        svd_model, params = args
        svd_model.fit(self.full_train_set)
        predictions = svd_model.test(self.full_train_set.build_testset())
        return (svd_model, params, surprise.accuracy.rmse(predictions))

    def __call__(self):
        """Evaluate every candidate once and return them sorted by RMSE.

        Returns:
            List of ``(model, params, rmse)`` triples in ascending RMSE order;
            empty when there is no candidate.
        """
        if not self.SVD_list:
            return []
        with Pool() as pool:
            # One map over the whole grid: each candidate is trained exactly once.
            res = pool.map(self._train_test_model, self.SVD_list)
        return sorted(res, key=lambda x : x[2])

RMSE: 0.2846
RMSE: 0.4123
RMSE: 0.2633
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2408
RMSE: 0.4123
RMSE: 0.2197
RMSE: 0.4123
RMSE: 0.2846
RMSE: 0.4123
RMSE: 0.2633
RMSE: 0.4123
RMSE: 0.2408
RMSE: 0.4123
RMSE: 0.2197
RMSE: 0.2854
RMSE: 0.4123
RMSE: 0.2624
RMSE: 0.4123
RMSE: 0.2412
RMSE: 0.4123
RMSE: 0.2208
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2861

RMSE: 0.4123
RMSE: 0.2624RMSE: 0.2437
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2208
RMSE: 0.4123
RMSE: 0.2855
RMSE: 0.2630
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2407
RMSE: 0.4123
RMSE: 0.2234
RMSE: 0.4123
RMSE: 0.2851
RMSE: 0.2627
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2392
RMSE: 0.2216
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2845
RMSE: 0.2627
RMSE: 0.4123
RMSE: 0.2396
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2211
RMSE: 0.4123
RMSE: 0.2858
RMSE: 0.2620
RMSE: 0.4123
RMSE: 0.4123
RMSE: 0.2413
RMSE: 0.2197
RMSE: 0.4123


In [34]:
# ----------------------------- FIND BEST PARAMS ----------------------------- #
# Hyper-parameter grid: each key names a setting and each value lists the
# candidates to try for it.
# NOTE(review): the output recorded under this cell reports an `n_factors`
# value that this grid does not define, and nothing in this cell launches the
# search -- the cell looks edited after its last run; confirm.
params = dict(biased=[True, False])

(<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f624b9f7160>,
 {'biased': True, 'n_factors': 200},
 0.21965668734981583)