# SYMA

In [1]:
import pandas as pd
import os

## Loading Data

In [2]:
PATH_DATA = os.path.join("..", "data")

train_session_df = pd.read_csv(os.path.join(PATH_DATA, "train_sessions.csv"))
train_purchase_df = pd.read_csv(os.path.join(PATH_DATA, "train_purchases.csv"))

candidate_items_df = pd.read_csv(os.path.join(PATH_DATA, "candidate_items.csv"))
item_features_df = pd.read_csv(os.path.join(PATH_DATA, "item_features.csv"))

In [3]:
train_session_df.describe()

Unnamed: 0,session_id,item_id
count,4743820.0,4743820.0
mean,2218286.0,14022.11
std,1281012.0,8177.893
min,3.0,2.0
25%,1110000.0,6946.0
50%,2214788.0,14033.0
75%,3325631.0,21000.0
max,4440001.0,28143.0


In [4]:
train_purchase_df.describe()

Unnamed: 0,session_id,item_id
count,1000000.0,1000000.0
mean,2221071.0,13978.825051
std,1281018.0,8187.993593
min,3.0,3.0
25%,1112741.0,6977.0
50%,2220268.0,13922.0
75%,3329927.0,20879.0
max,4440001.0,28143.0


In [5]:
candidate_items_df.describe()

Unnamed: 0,item_id
count,4990.0
mean,14007.035271
std,8218.231425
min,4.0
25%,6833.5
50%,14108.5
75%,21200.0
max,28137.0


In [6]:
item_features_df.describe()

Unnamed: 0,item_id,feature_category_id,feature_value_id
count,471751.0,471751.0,471751.0
mean,14058.539477,42.424597,486.345578
std,8107.465455,22.186285,258.865151
min,2.0,1.0,1.0
25%,7060.0,25.0,273.0
50%,14045.0,47.0,512.0
75%,21063.0,61.0,708.0
max,28143.0,73.0,905.0


## Data exploration

*How many different items does exist?*

In [7]:
distinct_item_number = len(item_features_df.item_id.unique())
print("Unique item number :", distinct_item_number)
print("Item id are unique : ", item_features_df.item_id.nunique() == len(item_features_df.item_id.unique()))

Unique item number : 23691
Item id are unique :  True


*How many different sessions does exist?*

In [8]:
distinct_session_number = len(pd.concat([train_session_df.session_id, train_purchase_df.session_id]).unique())
print("Unique user number :", distinct_session_number)

Unique user number : 1000000


*Does session always look an item before buying it?*

In [9]:
import numpy as np

print("A user never look at one item before buying it.")
pd.merge(train_purchase_df, train_session_df, on=['session_id','item_id'], how='left', indicator='Exist')["Exist"].value_counts()

A user never look at one item before buying it.


left_only     1000000
right_only          0
both                0
Name: Exist, dtype: int64

*Can a session look at items without buying any?*

In [10]:
print("Every session bought exactly one item.")

pd.merge(train_purchase_df, train_session_df, on=['session_id'], how='left', indicator='Exist')["Exist"].value_counts()

Every session bought exactly one item.


both          4743820
left_only           0
right_only          0
Name: Exist, dtype: int64

*What is the average number of different items every user usually look?*

In [11]:
print("Average number of items seen by user :", train_session_df.groupby("session_id").count()["item_id"].mean())

Average number of items seen by user : 4.74382


*What will be the size of our rating matrix?*

In [12]:
print("Size of the maximum full rating matrix : ", (distinct_session_number * distinct_item_number, 3))

Size of the maximum full rating matrix :  (23691000000, 3)


## SVD++

We want to create ratings given by every session for every item. We will first choose the following rating system:
- If the user has seen the item, we will give it a rating of 1.
- If the user purchased the item, we will give it a rating of 2. 

In [13]:
# ----------------------------- WE CREATE RATINGS ---------------------------- #
train_rating_df = pd.concat([train_session_df.assign(rating=1), train_purchase_df.assign(rating=2)])
train_rating_df.describe()

Unnamed: 0,session_id,item_id,rating
count,5743820.0,5743820.0,5743820.0
mean,2218771.0,14014.57,1.1741
std,1281013.0,8179.668,0.3791956
min,3.0,2.0,1.0
25%,1110573.0,6952.0,1.0
50%,2215782.0,14017.0,1.0
75%,3326251.0,20969.0,1.0
max,4440001.0,28143.0,2.0


In [14]:
# --------------------------- TRAIN TEST SET SPLIT --------------------------- #
import sklearn.model_selection


train_ratings_df_shuffled = train_rating_df.sample(len(train_rating_df))

train_set, test_set = sklearn.model_selection.train_test_split(train_ratings_df_shuffled, test_size=0.2, random_state=42)

In [15]:
# -------------- WE REDUCE THE SIZE OF OUR DATASET FOR RESEARCH -------------- #

train_set_reduced = train_set[:10000]

In [16]:
# ----------------------------- WE CREATE OUR SET ---------------------------- #

import surprise

rating_reader = surprise.Reader(rating_scale=(1, 2))
train_set = surprise.dataset.Dataset.load_from_df(df=train_set_reduced[["session_id", "item_id", "rating"]], reader=rating_reader)
test_set = surprise.dataset.Dataset.load_from_df(df=test_set[["session_id", "item_id", "rating"]], reader=rating_reader)

In [17]:
# ------------------------- WE TRAIN OUR FIRST MODEL ------------------------- #

model = surprise.SVD()

surprise.model_selection.cross_validate(model, train_set, measures=["RMSE", "MAE"], cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.3844  0.3818  0.3865  0.3793  0.3834  0.3831  0.0024  
MAE (testset)     0.2949  0.2930  0.2956  0.2906  0.2943  0.2937  0.0017  
Fit time          0.46    0.50    0.44    0.48    0.47    0.47    0.02    
Test time         0.02    0.01    0.02    0.01    0.01    0.01    0.00    


{'test_rmse': array([0.3844343 , 0.38177514, 0.38649137, 0.37933862, 0.3834367 ]),
 'test_mae': array([0.29489116, 0.29295948, 0.29561399, 0.29064797, 0.29428904]),
 'fit_time': (0.4564938545227051,
  0.5017485618591309,
  0.438387393951416,
  0.4776496887207031,
  0.4745056629180908),
 'test_time': (0.015576839447021484,
  0.009366989135742188,
  0.01679062843322754,
  0.009724617004394531,
  0.010237932205200195)}

# Let's compare our models

In [18]:
model_list = [surprise.NormalPredictor(), surprise.BaselineOnly(), surprise.KNNBaseline(), surprise.KNNBasic(), surprise.KNNWithMeans(), surprise.KNNWithZScore(), surprise.SlopeOne(), surprise.SVD(), surprise.SVDpp(), surprise.NMF(), surprise.CoClustering(), surprise.SlopeOne()]

result = {}
for model in model_list:
    scores = surprise.model_selection.cross_validate(model, train_set, measures=["RMSE", "MAE"], cv=5, verbose=False)
    result[model.__class__.__name__] = (scores["test_rmse"].mean(), scores["test_mae"].mean())

Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...


: 

: 

In [None]:
# ------------------------- BEST ALGORITHMS WITH RMSE ------------------------ #
sorted(result.items(), key=lambda x: x[1][0])

[('SVD', (0.3795323715181679, 0.29001879870668973)),
 ('BaselineOnly', (0.3799472231153003, 0.2901737096361625)),
 ('KNNBaseline', (0.3801831994698909, 0.2901044349344999)),
 ('SVDpp', (0.38065748297560986, 0.2896657723644876)),
 ('NMF', (0.3811061017480477, 0.28942517230474896)),
 ('KNNBasic', (0.38114509738811597, 0.2905057958333333)),
 ('KNNWithMeans', (0.3815050259896631, 0.2896348)),
 ('SlopeOne', (0.3817983521556306, 0.2897450125)),
 ('KNNWithZScore', (0.3820940947667248, 0.29010767499999995)),
 ('CoClustering', (0.3824299486433041, 0.2907501434982519)),
 ('NormalPredictor', (0.47464981981254295, 0.33872879395316297))]

In [None]:
# ------------------------- BEST ALGORITHMS WITH MAE ------------------------- #
sorted(result.items(), key=lambda x: x[1][1])

[('NMF', (0.3811061017480477, 0.28942517230474896)),
 ('KNNWithMeans', (0.3815050259896631, 0.2896348)),
 ('SVDpp', (0.38065748297560986, 0.2896657723644876)),
 ('SlopeOne', (0.3817983521556306, 0.2897450125)),
 ('SVD', (0.3795323715181679, 0.29001879870668973)),
 ('KNNBaseline', (0.3801831994698909, 0.2901044349344999)),
 ('KNNWithZScore', (0.3820940947667248, 0.29010767499999995)),
 ('BaselineOnly', (0.3799472231153003, 0.2901737096361625)),
 ('KNNBasic', (0.38114509738811597, 0.2905057958333333)),
 ('CoClustering', (0.3824299486433041, 0.2907501434982519)),
 ('NormalPredictor', (0.47464981981254295, 0.33872879395316297))]

In [None]:
# ------------------------------ BEST ALGORITHMS ----------------------------- #
import numpy as np
sorted(result.items(), key=lambda x: np.mean(x[1]))

[('SVD', (0.3795323715181679, 0.29001879870668973)),
 ('BaselineOnly', (0.3799472231153003, 0.2901737096361625)),
 ('KNNBaseline', (0.3801831994698909, 0.2901044349344999)),
 ('SVDpp', (0.38065748297560986, 0.2896657723644876)),
 ('NMF', (0.3811061017480477, 0.28942517230474896)),
 ('KNNWithMeans', (0.3815050259896631, 0.2896348)),
 ('SlopeOne', (0.3817983521556306, 0.2897450125)),
 ('KNNBasic', (0.38114509738811597, 0.2905057958333333)),
 ('KNNWithZScore', (0.3820940947667248, 0.29010767499999995)),
 ('CoClustering', (0.3824299486433041, 0.2907501434982519)),
 ('NormalPredictor', (0.47464981981254295, 0.33872879395316297))]

## Now perform some Grid Search

In [None]:
import sklearn.model_selection
from multiprocessing import Pool

params = {
    "n_factors" : [50, 100, 150, 200],
    "biased" : [True, False],
}

SVD_list = [(surprise.SVD(**args), args) for args in list(sklearn.model_selection.ParameterGrid(params))]

def train_test_model(svd_model, params):
    svd_model.fit(train_set)
    predictions = svd_model.test(test_set)
    return (svd_model, params, predictions)

with Pool() as pool:
# for svd_model, svd_params in SVD_list:
    res = pool.map(train_test_model, SVD_list)

AttributeError: 'DataFrame' object has no attribute 'global_mean'