# SYMA

In [76]:
import pandas as pd
import os

## Loading Data

In [77]:
PATH_DATA = os.path.join("..", "data")

train_session_df = pd.read_csv(os.path.join(PATH_DATA, "train_sessions.csv"))
train_purchase_df = pd.read_csv(os.path.join(PATH_DATA, "train_purchases.csv"))

candidate_items_df = pd.read_csv(os.path.join(PATH_DATA, "candidate_items.csv"))
item_features_df = pd.read_csv(os.path.join(PATH_DATA, "item_features.csv"))

In [78]:
train_session_df.describe()

Unnamed: 0,session_id,item_id
count,4743820.0,4743820.0
mean,2218286.0,14022.11
std,1281012.0,8177.893
min,3.0,2.0
25%,1110000.0,6946.0
50%,2214788.0,14033.0
75%,3325631.0,21000.0
max,4440001.0,28143.0


In [79]:
train_purchase_df.describe()

Unnamed: 0,session_id,item_id
count,1000000.0,1000000.0
mean,2221071.0,13978.825051
std,1281018.0,8187.993593
min,3.0,3.0
25%,1112741.0,6977.0
50%,2220268.0,13922.0
75%,3329927.0,20879.0
max,4440001.0,28143.0


In [80]:
candidate_items_df.describe()

Unnamed: 0,item_id
count,4990.0
mean,14007.035271
std,8218.231425
min,4.0
25%,6833.5
50%,14108.5
75%,21200.0
max,28137.0


In [81]:
item_features_df.describe()

Unnamed: 0,item_id,feature_category_id,feature_value_id
count,471751.0,471751.0,471751.0
mean,14058.539477,42.424597,486.345578
std,8107.465455,22.186285,258.865151
min,2.0,1.0,1.0
25%,7060.0,25.0,273.0
50%,14045.0,47.0,512.0
75%,21063.0,61.0,708.0
max,28143.0,73.0,905.0


## Data exploration

*How many different items does exist?*

In [82]:
distinct_item_number = len(item_features_df.item_id.unique())
print("Unique item number :", distinct_item_number)
print("Item id are unique : ", item_features_df.item_id.nunique() == len(item_features_df.item_id.unique()))

Unique item number : 23691
Item id are unique :  True


*How many different sessions does exist?*

In [83]:
distinct_session_number = len(pd.concat([train_session_df.session_id, train_purchase_df.session_id]).unique())
print("Unique user number :", distinct_session_number)

Unique user number : 1000000


*Does session always look an item before buying it?*

In [84]:
import numpy as np

print("A user never look at one item before buying it.")
pd.merge(train_purchase_df, train_session_df, on=['session_id','item_id'], how='left', indicator='Exist')["Exist"].value_counts()

A user never look at one item before buying it.


left_only     1000000
right_only          0
both                0
Name: Exist, dtype: int64

*Can a session look at items without buying any?*

In [85]:
print("Every session bought exactly one item.")

pd.merge(train_purchase_df, train_session_df, on=['session_id'], how='left', indicator='Exist')["Exist"].value_counts()

Every session bought exactly one item.


both          4743820
left_only           0
right_only          0
Name: Exist, dtype: int64

## SVD++

We want to create ratings given by every session for every item. We will first choose the following rating system:
- If the user has seen the item, we will give it a rating of 1.
- If the user purchased the item, we will give it a rating of 2. 

*What will be the size of our rating matrix?*

In [86]:
print("Size of the maximum full rating matrix : ", (distinct_session_number * distinct_item_number, 3))

Size of the maximum full rating matrix :  (23691000000, 3)


In [87]:

# ----------------------------- WE CREATE RATINGS ---------------------------- #

train_rating_df = pd.concat([train_session_df.assign(rating=1), train_purchase_df.assign(rating=2)])
train_ratings_df_shuffled = train_rating_df.sample(len(train_rating_df))
train_ratings_df_shuffled.describe()

Unnamed: 0,session_id,item_id,rating
count,5743820.0,5743820.0,5743820.0
mean,2218771.0,14014.57,1.1741
std,1281013.0,8179.668,0.3791956
min,3.0,2.0,1.0
25%,1110573.0,6952.0,1.0
50%,2215782.0,14017.0,1.0
75%,3326251.0,20969.0,1.0
max,4440001.0,28143.0,2.0


In [88]:
# -------------- WE REDUCE THE SIZE OF OUR DATASET FOR RESEARCH -------------- #

train_rating_df = train_rating_df[:100000]

In [89]:
# --------------------------- WE CREATE A TRAIN SET -------------------------- #

import surprise

rating_reader = surprise.Reader(rating_scale=(1, 2))
trainset_df = surprise.dataset.Dataset.load_from_df(df=train_rating_df[["session_id", "item_id", "rating"]], reader=rating_reader)

In [90]:
KFOLD_SPLIT_NUMBER = 10

recommandation_algorithm = surprise.SVD()
comparizon_algorithm = surprise.prediction_algorithms.random_pred.NormalPredictor()
kfold_split = surprise.model_selection.KFold(n_splits=KFOLD_SPLIT_NUMBER)

for trainset, testset in kfold_split.split(trainset_df):
    
    # train and test algorithm.
    comparizon_algorithm.fit(trainset)
    recommandation_algorithm.fit(trainset)

    predictions = recommandation_algorithm.test(testset)
    predictions_comparizon = comparizon_algorithm.test(testset)

    # Compute and print Root Mean Squared Error
    print("SVD RMSE :", surprise.accuracy.rmse(predictions, verbose=True))
    print("Random normal law distribution RMSE :", surprise.accuracy.rmse(predictions_comparizon, verbose=True))

RMSE: 0.0601
SVD RMSE : 0.06012020438539535
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0588
SVD RMSE : 0.05883645111131686
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0586
SVD RMSE : 0.05863753265548446
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0588
SVD RMSE : 0.05878795893723654
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0605
SVD RMSE : 0.06052888975007615
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0608
SVD RMSE : 0.060794088022735504
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0603
SVD RMSE : 0.060288039404525956
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0597
SVD RMSE : 0.05967860561007402
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0592
SVD RMSE : 0.059239246851956924
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
RMSE: 0.0591
SVD RMSE : 0.059134250468023224
RMSE: 0.0000
Random normal law distribution RMSE : 0.0
