# Baseline Model

In [1]:
import numpy as np
import pandas as pd
import yaml
import glob
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import KFold
from surprise.model_selection import cross_validate

In [None]:
# Load the movielens-100k dataset (download it if needed).
# NOTE(review): apart from the preview frame built from it, nothing in the
# cosmetic-shop pipeline below reads `data`; this looks like leftover
# Surprise tutorial code — confirm whether it can be removed.
data = Dataset.load_builtin('ml-100k')

In [None]:
# Tabulate the dataset's raw ratings with named columns.
df = pd.DataFrame(
    data.raw_ratings,
    columns=['user_id', 'item_id', 'rating', 'timestamp'],
)

In [None]:
# Preview the first rows of the ratings frame.
df.head()

# Load Config


In [3]:
# Show the kernel's current working directory.
%pwd

'/Users/CPang/Ying/GNN-eCommerce/notebooks'

In [4]:
# Step up one directory, presumably so that relative paths such as
# "params.yaml" resolve from the repository root.
# NOTE: this is not idempotent — re-running the cell moves up another level.
%cd ..

/Users/CPang/Ying/GNN-eCommerce


In [34]:
# Read the project parameters; the relative path is resolved against the
# current working directory.
with open("params.yaml") as config_file:
    config = yaml.safe_load(config_file)

# Display the parsed configuration.
config

{'base': {'random_seed': 42},
 'data': {'cosmetic_shop': 'data/raw/cosmetic-shop-ecommerce-events/'},
 'training': {'event_type_weights': {'view': 0.01,
   'cart': 0.2,
   'remove_from_cart': -0.19,
   'purchase': 1.0}},
 'reports': None}

# Load Dataset and EDA

In [6]:
# Collect the CSV files of the cosmetic shop dataset. glob returns matches in
# a filesystem-dependent order, so sort them: the row order of the combined
# frame (and therefore any later head()/positional subset) is then the same on
# every machine and every run.
path = config['data']['cosmetic_shop']
# Strip a trailing slash first so the pattern does not contain "//".
csv_files = sorted(glob.glob(path.rstrip("/") + "/*.csv"))
if not csv_files:
    # pd.concat would otherwise fail with an unhelpful "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Lazily read each CSV file; this generator is consumed exactly once by pd.concat.
df_list = (pd.read_csv(file) for file in csv_files)

# Concatenate all DataFrames into one frame with a fresh 0..n-1 index.
all_events = pd.concat(df_list, ignore_index=True)

In [7]:
# Report the size of the combined frame, then show a random sample of rows.
# The sample is seeded from the project config so the displayed rows are the
# same on every run.
print(all_events.shape)
all_events.sample(10, random_state=config['base']['random_seed'])

(20692840, 9)


Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
15450247,2019-11-24 12:44:04 UTC,remove_from_cart,5842141,1487580006317032337,,,1.43,419246960,a9bf9099-52bd-4270-8996-eb40f360ddae
7889880,2020-02-01 16:23:36 UTC,view,5808664,2089259162625114209,,ardell,6.37,520347806,9c834180-5e71-465a-9e9c-188dfaf60f96
12874681,2019-11-07 17:03:54 UTC,view,5836725,1487580011702517887,,bergamo,22.51,563538360,cc501cc6-1cbb-4659-864d-dc01aafc3784
9114872,2020-02-09 17:46:30 UTC,cart,5882417,1487580009286598681,,zinger,0.44,545348327,35a80fca-8f38-4098-b7de-5085c1594411
15420664,2019-11-24 10:02:13 UTC,purchase,5875432,2084144451428549153,,,1.94,509688411,75b4914c-d209-4feb-9711-8e64ecd9de86
1422242,2020-01-12 19:22:10 UTC,view,5706479,1487580013950664926,,irisk,11.35,543105680,c8e6d876-16e2-40d5-8add-f3ee83732b6a
20514804,2019-10-30 14:13:58 UTC,view,5738800,1487580013858390233,,concept,5.56,565779164,b3f53999-b965-4e3a-95ed-610edf92e24f
19442119,2019-10-22 05:52:01 UTC,view,5793261,1487580010100293687,,,22.21,562526648,4f5b573f-a137-4d05-8551-6212a7f68e47
8727429,2020-02-06 20:19:38 UTC,remove_from_cart,5723529,1487580005268456287,,runail,2.94,552908674,f6defebb-9ecf-40fb-9667-c7d6a702dd08
19635286,2019-10-23 13:33:21 UTC,view,5730206,1487580005092295511,,,10.32,527472834,a1165777-9882-4618-8f93-e89964401294


In [13]:
# Count distinct users (a missing value, if present, counts as one value).
users_count = all_events['user_id'].nunique(dropna=False)
print("Unique users: ", users_count)

# Count distinct products the same way.
products_count = all_events['product_id'].nunique(dropna=False)
print("Unique products: ", products_count)

# List the distinct event types.
print("Event types: ", all_events['event_type'].unique().tolist())


Unique users:  1639358
Unique products:  54571
Event types:  ['view', 'cart', 'remove_from_cart', 'purchase']


In [40]:
# Development-only subset: the first 100000 events, restricted to the three
# columns used to build the interaction matrix.
all_events_mini = (
    all_events
    .iloc[:100000]
    .loc[:, ['user_id', 'product_id', 'event_type']]
)


In [73]:
def interaction_matrix(events, event_type_weights):
    """Aggregate raw events into one weighted interaction per (user, product) pair.

    Parameters
    ----------
    events : pandas.DataFrame
        Event log with at least the columns ``user_id``, ``product_id`` and
        ``event_type``. The frame is not modified.
    event_type_weights : Mapping
        Weight contributed by a single event of each event type.

    Returns
    -------
    pandas.DataFrame
        One row per (user_id, product_id) pair with the columns ``user_id``,
        ``product_id``, ``event_type`` (list of the pair's event types) and
        ``weight``. The weight is the sum of the pair's event weights, except
        that a negative total is replaced by 0.01 and a total above 1.0 is
        capped at 1.0.

    Raises
    ------
    KeyError
        If ``events`` contains an event type that has no entry in
        ``event_type_weights``.
    """
    columns = ['user_id', 'product_id', 'event_type']

    # Fail loudly on unmapped event types; Series.map would otherwise turn
    # them into NaN weights silently.
    unknown = set(events['event_type'].unique()) - set(event_type_weights)
    if unknown:
        raise KeyError(
            f"No weight configured for event type(s): {sorted(unknown, key=str)}"
        )

    # Use the weights passed by the caller (not a notebook global). assign()
    # returns a new frame, so the caller's data is left untouched and no
    # SettingWithCopyWarning is raised.
    weighted = events[columns].assign(
        weight=events['event_type'].map(event_type_weights)
    )

    i_matrix = (
        weighted
        .groupby(['user_id', 'product_id'], as_index=False)
        .agg({'event_type': list, 'weight': 'sum'})
    )

    # Squash the summed weight: negative totals become a minimal positive
    # signal (0.01); totals above 1.0 are capped at 1.0.
    total = i_matrix['weight']
    i_matrix['weight'] = total.where(total >= 0, 0.01).clip(upper=1.0)

    return i_matrix

In [74]:
# Build the interaction matrix for the development subset, passing the
# event-type weights from the config, then display it.
im = interaction_matrix(all_events_mini, config['training']['event_type_weights'])
im

Unnamed: 0,user_id,product_id,event_type,weight
0,38560385,5861734,[view],0.01
1,43695999,5686278,[view],0.01
2,43695999,5712736,[view],0.01
3,43695999,5730212,[view],0.01
4,43695999,5730214,[view],0.01
...,...,...,...,...
61751,595853462,5804820,[view],0.01
61752,595853555,5714119,[view],0.01
61753,595853918,5863525,[view],0.01
61754,595853961,5677422,[view],0.01


In [75]:
from surprise import Dataset, Reader

# Wrap the (user, product, weight) columns in a Surprise dataset, declaring
# the weights to lie on a 0-1 rating scale.
reader = Reader(rating_scale=(0, 1))
surprise_dataset = Dataset.load_from_df(
    im.loc[:, ['user_id', 'product_id', 'weight']],
    reader,
)

In [78]:
# Inspect the raw ratings stored on the Surprise dataset.
df = pd.DataFrame(surprise_dataset.raw_ratings)
df

Unnamed: 0,0,1,2,3
0,38560385,5861734,0.01,
1,43695999,5686278,0.01,
2,43695999,5712736,0.01,
3,43695999,5730212,0.01,
4,43695999,5730214,0.01,
...,...,...,...,...
61751,595853462,5804820,0.01,
61752,595853555,5714119,0.01,
61753,595853918,5863525,0.01,
61754,595853961,5677422,0.01,


# Further Work
Use the event time to influence each event's weight, i.e. more recent events should carry more weight.

In [80]:
# Use the famous SVD algorithm, seeded from the project config so that the
# random initialisation is the same on every run.
seed = config['base']['random_seed']
algo = SVD(random_state=seed)

# Run 5-fold cross-validation and print results. An explicit, seeded KFold
# replaces cv=5 so that the fold assignment is reproducible as well.
folds = KFold(n_splits=5, random_state=seed, shuffle=True)
cross_validate(algo, surprise_dataset, measures=['RMSE', 'MAE'], cv=folds, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.2166  0.2165  0.2189  0.2218  0.2174  0.2183  0.0020  
MAE (testset)     0.1406  0.1399  0.1414  0.1429  0.1412  0.1412  0.0010  
Fit time          0.72    0.69    0.64    0.65    0.65    0.67    0.03    
Test time         0.08    0.06    0.06    0.06    0.06    0.06    0.01    


{'test_rmse': array([0.2166297 , 0.21653958, 0.2189495 , 0.22178635, 0.21735138]),
 'test_mae': array([0.14056058, 0.13987806, 0.14136231, 0.14288741, 0.14124248]),
 'fit_time': (0.7152256965637207,
  0.688176155090332,
  0.6414568424224854,
  0.6458718776702881,
  0.6473128795623779),
 'test_time': (0.0792088508605957,
  0.060974836349487305,
  0.06274080276489258,
  0.061547040939331055,
  0.0601191520690918)}