# eCommerce behaviour dataset

In [4]:
import pandas as pd
import numpy as np
from numpy import random
import json

In [6]:
# == ecommerceBehaviourOct.csv == Data Source: https://www.kaggle.com/datasets/mkechinov/ecommerce-behavior-data-from-multi-category-store
#ds = pd.read_csv('../data/ecommerceBehaviour.csv')

# == eCommerceBehaviourOct.pkl == Data cleaning and memory data type adjustments applied
ds = pd.read_pickle('../data/ecommerceBehaviourCleaned.pkl')


# 1 Initial exploration

## 1.1 Dataset info

In [7]:
ds.shape

(26560620, 9)

In [8]:
ds.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
1,2019-10-01 00:00:00+00:00,view,3900821,2053013552326770905,appliances.environment.water_heater,aqua,33.1875,554748717,9333dfbd-b87a-4708-9857-6336556b0fcc
3,2019-10-01 00:00:01+00:00,view,1307067,2053013558920217191,computers.notebook,lenovo,251.75,550050854,7c90fc70-0e80-4590-96f3-13c02c18c713
4,2019-10-01 00:00:04+00:00,view,1004237,2053013555631882655,electronics.smartphone,apple,1082.0,535871217,c6bd7419-2748-4c56-95b4-8cec9ff8b80d
5,2019-10-01 00:00:05+00:00,view,1480613,2053013561092866779,computers.desktop,pulser,908.5,512742880,0d0d91c2-c9c2-4e81-90a5-86594dec0db9
8,2019-10-01 00:00:10+00:00,view,28719074,2053013565480109009,apparel.shoes.keds,baden,102.6875,520571932,ac1cd4e5-a3ce-4224-a2d7-ff660a105880


In [9]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26560620 entries, 1 to 42448762
Data columns (total 9 columns):
 #   Column         Dtype              
---  ------         -----              
 0   event_time     datetime64[ns, UTC]
 1   event_type     category           
 2   product_id     int32              
 3   category_id    int64              
 4   category_code  object             
 5   brand          object             
 6   price          float16            
 7   user_id        int32              
 8   user_session   object             
dtypes: category(1), datetime64[ns, UTC](1), float16(1), int32(2), int64(1), object(3)
memory usage: 1.5+ GB


In [10]:
ds.nunique(axis=0, dropna=False)

event_time       2558113
event_type             3
product_id         60371
category_id          248
category_code        126
brand               1731
price               8465
user_id          2323036
user_session     6419693
dtype: int64

## 1.2 Graph info

In [11]:
CustomerNodes = ds['user_id'].nunique()
ProductNodes = ds['product_id'].nunique() 
print('Number of customer nodes: ',CustomerNodes,'\n')
print('Number of product nodes: ', ProductNodes,'\n')
print('Number of edges: ',str(len(ds)))
ds['event_type'].value_counts()

Number of customer nodes:  2323036 

Number of product nodes:  60371 

Number of edges:  26560620


view        25201706
cart          809407
purchase      549507
Name: event_type, dtype: int64

# 2 Data cleaning

## 2.1 Rows: Missing values and nonsense data

In [12]:
ds.isnull().sum(axis = 0)

event_time       0
event_type       0
product_id       0
category_id      0
category_code    0
brand            0
price            0
user_id          0
user_session     0
dtype: int64

In [79]:
ds = ds.dropna()

In [13]:
ds[ds['price'] == 0].shape[0]

0

In [81]:
ds.drop(ds[ds['price'] == 0].index, inplace = True)

## 2.2 Columns: Data types

In [76]:
ds['event_time'] = pd.to_datetime(ds['event_time'],yearfirst=True,utc=True)

In [71]:
ds['event_type'] = ds['event_type'].astype('category')

In [72]:
ds['category_code'] = ds['category_code'].astype(object)
ds['brand'] = ds['brand'].astype(object)

In [75]:
for col in ds:

   colDataType = ds[col].dtype

   if str(colDataType)[:3] == 'int' or str(colDataType)[:5] == 'float':

      maxValue = ds[col].min()
      minValue = ds[col].max()

      if str(colDataType)[:3] == 'int':
         if minValue > np.iinfo(np.int8).min and maxValue < np.iinfo(np.int8).max:
            ds[col] = ds[col].astype(np.int8)
         elif minValue > np.iinfo(np.int16).min and maxValue < np.iinfo(np.int16).max:
            ds[col] = ds[col].astype(np.int16)
         elif minValue > np.iinfo(np.int32).min and maxValue < np.iinfo(np.int32).max:
            ds[col] = ds[col].astype(np.int32)
         elif minValue > np.iinfo(np.int64).min and maxValue < np.iinfo(np.int64).max:
            ds[col] = ds[col].astype(np.int64)

      if str(colDataType)[:5] == 'float':
         if minValue > np.finfo(np.float16).min and maxValue < np.finfo(np.float16).max:
            ds[col] = ds[col].astype(np.float16)
         elif minValue > np.finfo(np.float32).min and maxValue < np.finfo(np.float32).max:
            ds[col] = ds[col].astype(np.float32)
         else:
            ds[col] = ds[col].astype(np.float64)

# 3 Users and Products grouping

In [14]:
ds_view=ds[ds['event_type']=='view']
ds_cart=ds[ds['event_type']=='cart']
ds_purchase=ds[ds['event_type']=='purchase']

## 3.1 Users

#### Interactions

In [15]:
user_interactions = ds.groupby(by=['user_id'],as_index=False).agg(user_interactions=('user_id','count'))
user_view_interactions = ds_view.groupby(by=['user_id'],as_index=False).agg(user_view_interactions=('user_id','count'))
user_cart_interactions = ds_cart.groupby(by=['user_id'],as_index=False).agg(user_cart_interactions=('user_id','count'))
user_purchase_interactions = ds_purchase.groupby(by=['user_id'],as_index=False).agg(user_purchase_interactions=('user_id','count'))
print('Number of customers by event:')
print('- All events: ' + str(len(user_interactions))+ ' customers' + '   (Interactions/edges mean: ' + str(user_interactions['user_interactions'].mean()) + ')')
print('- View: ' + str(len(user_view_interactions))+ ' customers' + '   (Interactions mean: ' + str(user_view_interactions['user_view_interactions'].mean()) + ')')
print('- Cart: ' + str(len(user_cart_interactions))+ ' customers' + '   (Interactions mean: ' + str(user_cart_interactions['user_cart_interactions'].mean()) + ')')
print('- Purchase: ' + str(len(user_purchase_interactions))+ ' customers' + '   (Interactions mean: ' + str(user_purchase_interactions['user_purchase_interactions'].mean()) + ')')


Number of customers by event:
- All events: 2323036 customers   (Interactions/edges mean: 11.433580882948004)
- View: 2322867 customers   (Interactions mean: 10.849396887553183)
- Cart: 294902 customers   (Interactions mean: 2.7446643291669774)
- Purchase: 263445 customers   (Interactions mean: 2.085850936628139)


In [16]:
user_interactions['user_interactions'].describe()

count    2.323036e+06
mean     1.143358e+01
std      2.544709e+01
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      1.100000e+01
max      6.043000e+03
Name: user_interactions, dtype: float64

In [17]:
user_view_interactions['user_view_interactions'].describe()

count    2.322867e+06
mean     1.084940e+01
std      2.445638e+01
min      1.000000e+00
25%      2.000000e+00
50%      4.000000e+00
75%      1.100000e+01
max      6.043000e+03
Name: user_view_interactions, dtype: float64

In [18]:
user_cart_interactions['user_cart_interactions'].describe()

count    294902.000000
mean          2.744664
std           4.154872
min           1.000000
25%           1.000000
50%           2.000000
75%           3.000000
max         494.000000
Name: user_cart_interactions, dtype: float64

In [19]:
user_purchase_interactions['user_purchase_interactions'].describe()

count    263445.000000
mean          2.085851
std           3.618213
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max         322.000000
Name: user_purchase_interactions, dtype: float64

#### RFM

In [20]:
user_purchase_rfm = ds_purchase.groupby(by=['user_id'],as_index=False).agg(recency=('event_time','max'),frequency=('user_id','count'),monetary=('price','sum'))

last_event_time = ds_purchase['event_time'].max()
user_purchase_rfm['recency'] = last_event_time - user_purchase_rfm['recency']
user_purchase_rfm['recency'] = user_purchase_rfm['recency'] / np.timedelta64(1,'h')

In [21]:
user_purchase_rfm.quantile(q=[0.25,0.5,0.75])

Unnamed: 0,user_id,recency,frequency,monetary
0.25,517519214.0,160.136667,1.0,145.375
0.5,536585624.0,328.573056,1.0,288.0
0.75,555555894.0,510.8825,2.0,738.5


In [22]:
Quartile_segments = user_purchase_rfm.quantile(q=[0.25,0.5,0.75]).to_dict()

def setScoringRecency (score):

   if score <= Quartile_segments['recency'][0.25]:
      return 4
   elif score <= Quartile_segments['recency'][0.5]:
      return 3
   elif score <= Quartile_segments['recency'][0.75]:
      return 2
   else:
      return 1

def setScoringFrequency (score):

   if score <= Quartile_segments['frequency'][0.25]:
      return 1
   elif score <= Quartile_segments['frequency'][0.5]:
      return 2
   elif score <= Quartile_segments['frequency'][0.75]:
      return 3
   else:
      return 4

def setScoringMonetary (score):

   if score <= Quartile_segments['monetary'][0.25]:
      return 1
   elif score <= Quartile_segments['monetary'][0.5]:
      return 2
   elif score <= Quartile_segments['monetary'][0.75]:
      return 3
   else:
      return 4

user_purchase_rfm_scores = user_purchase_rfm
user_purchase_rfm_scores['recency_q'] = user_purchase_rfm['recency'].apply(setScoringRecency)
user_purchase_rfm_scores['frequency_q'] = user_purchase_rfm['frequency'].apply(setScoringFrequency)
user_purchase_rfm_scores['monetary_q'] = user_purchase_rfm['monetary'].apply(setScoringRecency)

In [23]:
user_purchase_rfm_scores['rfm_str_segmentation'] = user_purchase_rfm_scores['recency_q'].map(str) + user_purchase_rfm_scores['frequency_q'].map(str) + user_purchase_rfm_scores['monetary_q'].map(str)
user_purchase_rfm_scores['rfm_int_scoring'] = user_purchase_rfm_scores['recency_q'].map(int) + user_purchase_rfm_scores['frequency_q'].map(int) + user_purchase_rfm_scores['monetary_q'].map(int)

In [24]:
user_purchase_rfm_scores = user_purchase_rfm_scores[['user_id','rfm_str_segmentation','rfm_int_scoring']]

In [25]:
user_purchase_rfm_scores

Unnamed: 0,user_id,rfm_str_segmentation,rfm_int_scoring
0,303160429,212,5
1,340041246,212,5
2,384989212,314,8
3,401021311,313,7
4,403013066,331,7
...,...,...,...
263440,566265448,411,6
263441,566267483,434,11
263442,566270060,434,11
263443,566274637,411,6


## 3.2 Products

### Interactions

In [26]:
product_interactions = ds.groupby(by=['product_id'],as_index=False).agg(product_interactions=('product_id','count'))
product_view_interactions = ds_view.groupby(by=['product_id'],as_index=False).agg(product_view_interactions=('product_id','count'))
product_cart_interactions = ds_cart.groupby(by=['product_id'],as_index=False).agg(product_cart_interactions=('product_id','count'))
product_purchase_interactions = ds_purchase.groupby(by=['product_id'],as_index=False).agg(product_purchase_interactions=('product_id','count'))
print('Number of products by event:')
print('- All events: ' + str(len(product_interactions)) + ' products' + '   (Interactions mean: ' + str(product_interactions['product_interactions'].mean()) + ')')
print('- View: ' + str(len(product_view_interactions)) + ' products' + '  (Interactions mean: ' + str(product_view_interactions['product_view_interactions'].mean()) + ')')
print('- Cart: ' + str(len(product_cart_interactions)) + ' products' + '  (Interactions mean: ' + str(product_cart_interactions['product_cart_interactions'].mean()) + ')')
print('- Purchase: ' + str(len(product_purchase_interactions)) + ' products' + '  (Interactions mean: ' + str(product_purchase_interactions['product_purchase_interactions'].mean()) + ')')

Number of products by event:
- All events: 60371 products   (Interactions mean: 439.9566016796144)
- View: 60371 products  (Interactions mean: 417.4472180351493)
- Cart: 6750 products  (Interactions mean: 119.91214814814815)
- Purchase: 16737 products  (Interactions mean: 32.831869510664994)


In [27]:
product_interactions['product_interactions'].describe()

count     60371.000000
mean        439.956602
std        4896.841211
min           1.000000
25%           9.000000
50%          34.000000
75%         129.000000
max      500354.000000
Name: product_interactions, dtype: float64

In [28]:
product_view_interactions['product_view_interactions'].describe()

count     60371.000000
mean        417.447218
std        4353.750795
min           1.000000
25%           9.000000
50%          34.000000
75%         128.000000
max      419287.000000
Name: product_view_interactions, dtype: float64

In [29]:
product_cart_interactions['product_cart_interactions'].describe()

count     6750.000000
mean       119.912148
std       1096.419850
min          1.000000
25%          2.000000
50%          6.000000
75%         23.000000
max      52123.000000
Name: product_cart_interactions, dtype: float64

In [30]:
product_purchase_interactions['product_purchase_interactions'].describe()

count    16737.000000
mean        32.831870
std        403.472662
min          1.000000
25%          1.000000
50%          2.000000
75%          7.000000
max      28944.000000
Name: product_purchase_interactions, dtype: float64

# 4 Dataset sampling

### 4.1 Sampling of User indexed dataframes

#### Interactions

In [31]:
user_interactions_sample = user_interactions.loc[user_interactions['user_interactions'] > 921]
user_view_interactions_sample = user_view_interactions.loc[user_view_interactions['user_view_interactions'] > 750]
user_cart_interactions_sample = user_cart_interactions.loc[user_cart_interactions['user_cart_interactions'] > 65]
user_purchase_interactions_sample = user_purchase_interactions.loc[user_purchase_interactions['user_purchase_interactions'] > 66]
print('Number of customers sampled by event:')
print('- All events: ' + str(len(user_interactions_sample)) + ' customers')
print('- View: ' + str(len(user_view_interactions_sample)) + ' customers')
print('- Cart: ' + str(len(user_cart_interactions_sample)) + ' customers')
print('- Purchase: ' + str(len(user_purchase_interactions_sample)) + ' customers')


Number of customers sampled by event:
- All events: 60 customers
- View: 111 customers
- Cart: 106 customers
- Purchase: 97 customers


### 4.2 Sampling of Product indexed dataframes

#### Interactions

In [33]:
product_interactions_sample = product_interactions.loc[product_interactions['product_interactions'] > 53600]
product_view_interactions_sample = product_view_interactions.loc[product_view_interactions['product_view_interactions'] > 29200]
product_cart_interactions_sample = product_cart_interactions.loc[product_cart_interactions['product_cart_interactions'] > 1290]
product_purchase_interactions_sample = product_purchase_interactions.loc[product_purchase_interactions['product_purchase_interactions'] > 800]
print('Number of products sampled by event:')
print('- All events: ' + str(len(product_interactions_sample)) + ' products')
print('- View: ' + str(len(product_view_interactions_sample)) + ' products')
print('- Cart: ' + str(len(product_cart_interactions_sample)) + ' products')
print('- Purchase: ' + str(len(product_purchase_interactions_sample)) + ' products')

Number of products sampled by event:
- All events: 50 products
- View: 100 products
- Cart: 103 products
- Purchase: 103 products


### 4.3 Dataset sampling

#### -> All events

##### Sample 1 - Small network

* Customer number of interactions > 921
* Product number of interactions > 53600

In [257]:
ds_sample_1 = ds[ds.user_id.isin(user_interactions_sample.user_id)]
ds_sample_1 = ds_sample_1[ds_sample_1.product_id.isin(product_interactions_sample.product_id)]

In [258]:
customerNodes = str(ds_sample_1['user_id'].nunique())
productNodes = str(ds_sample_1['product_id'].nunique())
events = str(len(ds_sample_1))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 50
- Number of product nodes: 50
- Number of edges: 12046


##### Sample 2 - Big network

* Customer number of interactions > 768
* Product number of interactions > 30700

In [247]:
ds_sample_2 = ds[ds.user_id.isin(user_interactions_sample.user_id)]
ds_sample_2 = ds_sample_2[ds_sample_2.product_id.isin(product_interactions_sample.product_id)]

In [248]:
customerNodes = str(ds_sample_2['user_id'].nunique())
productNodes = str(ds_sample_2['product_id'].nunique())
events = str(len(ds_sample_2))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 100
- Number of product nodes: 100
- Number of edges: 25671


#### -> 3.3.1 View

##### Sample 1 - Small network

* Customer number of interactions > 899
* Product number of interactions > 50250

In [312]:
ds_view_sample_1 = ds_view[ds_view.user_id.isin(user_view_interactions_sample.user_id)]
ds_view_sample_1 = ds_view_sample_1[ds_view_sample_1.product_id.isin(product_view_interactions_sample.product_id)]

In [313]:
customerNodes = str(ds_view_sample_1['user_id'].nunique())
productNodes = str(ds_view_sample_1['product_id'].nunique())
events = str(len(ds_view_sample_1))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 49
- Number of product nodes: 50
- Number of edges: 10221


##### Sample 2 - Big network

* Customer number of interactions > 700
* Product number of interactions > 23500

In [371]:
ds_view_sample_2 = ds_view[ds_view.user_id.isin(user_view_interactions_sample.user_id)]
ds_view_sample_2 = ds_view_sample_2[ds_view_sample_2.product_id.isin(product_view_interactions_sample.product_id)]

In [372]:
customerNodes = str(ds_view_sample_2['user_id'].nunique())
productNodes = str(ds_view_sample_2['product_id'].nunique())
events = str(len(ds_view_sample_2))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 99
- Number of product nodes: 100
- Number of edges: 21340


#### -> 3.3.1 Cart

##### Sample 1 - Small network

* Customer number of interactions > 95
* Product number of interactions > 3300

In [406]:
ds_cart_sample_1 = ds_cart[ds_cart.user_id.isin(user_cart_interactions_sample.user_id)]
ds_cart_sample_1 = ds_cart_sample_1[ds_cart_sample_1.product_id.isin(product_cart_interactions_sample.product_id)]

In [407]:
customerNodes = str(ds_cart_sample_1['user_id'].nunique())
productNodes = str(ds_cart_sample_1['product_id'].nunique())
events = str(len(ds_cart_sample_1))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 50
- Number of product nodes: 50
- Number of edges: 4584


##### Sample 2 - Big network

* Customer number of interactions > 65
* Product number of interactions > 1290

In [445]:
ds_cart_sample_2 = ds_cart[ds_cart.user_id.isin(user_cart_interactions_sample.user_id)]
ds_cart_sample_2 = ds_cart_sample_2[ds_cart_sample_2.product_id.isin(product_cart_interactions_sample.product_id)]

In [446]:
customerNodes = str(ds_cart_sample_2['user_id'].nunique())
productNodes = str(ds_cart_sample_2['product_id'].nunique())
events = str(len(ds_cart_sample_2))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 100
- Number of product nodes: 100
- Number of edges: 8490


#### -> 3.3.1 Purchase

##### Sample 1 - Small network

* Customer number of interactions > 110
* Product number of interactions > 1400

In [485]:
ds_purchase_sample_1 = ds_purchase[ds_purchase.user_id.isin(user_purchase_interactions_sample.user_id)]
ds_purchase_sample_1 = ds_purchase_sample_1[ds_purchase_sample_1.product_id.isin(product_purchase_interactions_sample.product_id)]

In [486]:
customerNodes = str(ds_purchase_sample_1['user_id'].nunique())
productNodes = str(ds_purchase_sample_1['product_id'].nunique())
events = str(len(ds_purchase_sample_1))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 49
- Number of product nodes: 50
- Number of edges: 3400


##### Sample 2 - Big network

* Customer number of interactions > 65
* Product number of interactions > 800

In [512]:
ds_purchase_sample_2 = ds_purchase[ds_purchase.user_id.isin(user_purchase_interactions_sample.user_id)]
ds_purchase_sample_2 = ds_purchase_sample_2[ds_purchase_sample_2.product_id.isin(product_purchase_interactions_sample.product_id)]

In [513]:
customerNodes = str(ds_purchase_sample_2['user_id'].nunique())
productNodes = str(ds_purchase_sample_2['product_id'].nunique())
events = str(len(ds_purchase_sample_2))
print('Graph data:')
print('- Number of customer nodes: ' + customerNodes)
print('- Number of product nodes: ' + productNodes)
print('- Number of edges: ' + events)

Graph data:
- Number of customer nodes: 95
- Number of product nodes: 100
- Number of edges: 6508


# 5 Graph data modelling

In [514]:
#ds_graph = ds_sample
ds_graph = ds_purchase_sample_2

graph = dict()
nodes = []
edges = []

users = dict()
users_index = 0
products = dict()
products_index = 0

for row in range(0,len(ds_graph)):

   # == Edge data == 
   event = ds_graph.iloc[row].event_type

   # == User node data ==
   user_id = ds_graph.iloc[row].user_id
   user_session = ds_graph.iloc[row].user_session
   #rfm
   if(user_id in user_purchase_rfm_scores['user_id'].values):
      rfm = user_purchase_rfm_scores.loc[user_purchase_rfm_scores['user_id'] == user_id].rfm_str_segmentation.item()
   else:
      rfm = 'Only view customer'
   #Interactions
   userInteractions = user_interactions.loc[user_interactions['user_id'] == user_id].user_interactions.item()
   if(user_id in user_view_interactions['user_id'].values):
      userViewInteractions = user_view_interactions.loc[user_view_interactions['user_id'] == user_id].user_view_interactions.item()
   else:
      userViewInteractions = 0
   if(user_id in user_cart_interactions['user_id'].values):
      userCartInteractions = user_cart_interactions.loc[user_cart_interactions['user_id'] == user_id].user_cart_interactions.item()
   else:
      userCartInteractions = 0
   if(user_id in user_purchase_interactions['user_id'].values):
      userPurchaseInteractions = user_purchase_interactions.loc[user_purchase_interactions['user_id'] == user_id].user_purchase_interactions.item()
   else:
      userPurchaseInteractions = 0

   # == Product node data ==
   product_id = ds_graph.iloc[row].product_id
   category = ds_graph.iloc[row].category_code
   brand = ds_graph.iloc[row].brand
   price = ds_graph.iloc[row].price
   #Interactions
   productInteractions = product_interactions.loc[product_interactions['product_id'] == product_id].product_interactions.item()
   if(product_id in product_view_interactions['product_id'].values):
      productViewInteractions = product_view_interactions.loc[product_view_interactions['product_id'] == product_id].product_view_interactions.item()
   else:
      productViewInteractions = 0
   if(product_id in product_cart_interactions['product_id'].values):
      productCartInteractions = product_cart_interactions.loc[product_cart_interactions['product_id'] == product_id].product_cart_interactions.item()
   else:
      productCartInteractions= 0
   if(product_id in product_purchase_interactions['product_id'].values):
      productPurchaseInteractions = product_purchase_interactions.loc[product_purchase_interactions['product_id'] == product_id].product_purchase_interactions.item()
   else:
      productPurchaseInteractions = 0

   # == Node common property ==
   date = ds_graph.iloc[row].event_time
   date = date.to_pydatetime().strftime('%d-%m-%Y')

   # == Nodes array ==
   if(user_id not in users):
      users[user_id] = users_index
      users_index += users_index
      nodes.append({'id': str(user_id), 'date': date, 'class': 'customer', 'sess': user_session, 'rfm': rfm,'int': str(userInteractions), 'int_v': str(userViewInteractions), 'int_c': str(userCartInteractions), 'int_p': str(userPurchaseInteractions)})

   if(product_id not in products):
      products[product_id] = products_index
      products_index += products_index
      nodes.append({'id': str(product_id), 'date': date, 'class': 'product', 'cat': str(category), 'bra': str(brand), 'pri': str(price),'int': str(productInteractions), 'int_v': str(productViewInteractions), 'int_c': str(productCartInteractions), 'int_p': str(productPurchaseInteractions)})
   
   # == Edges array ==
   edges.append({'source':str(user_id), 'target':str(product_id), 'event': str(event)})

# == Graph array ==
graph = {'nodes': nodes, 'edges': edges}

In [488]:
print('Nodes: ' + str(len(nodes)) + '\nEdges: ' + str(len(edges)))

Nodes: 99
Edges: 3400


In [515]:
with open('/Users/juanmanuel/Code/src/jjmpalma.github.io/data_json/purchase_events_big_network.json', 'w') as outfile:
   json.dump(graph, outfile)