In [1]:
import pandas as pd
from datetime import datetime

import numpy as np

# Paths to the CSV inputs, relative to the notebook's working directory.
FILE_CATEGORY_TREE = '../data/category_tree.csv'
FILE_EVENTS = '../data/events.csv'
# Item properties are read from two separate part files.
FILE_ITEM_PROPERTIES_1 = '../data/item_properties_part1.csv'
FILE_ITEM_PROPERTIES_2 = '../data/item_properties_part2.csv'
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
FILE_ITEM_PROPERTIES_ALL = '../data/item_properties_all.csv'

In [2]:
category_tree = pd.read_csv(FILE_CATEGORY_TREE)

In [3]:
category_tree.head()

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0


In [4]:
category_tree.shape

(1669, 2)

In [5]:
category_tree[category_tree.parentid.isnull()].sort_values('categoryid')

Unnamed: 0,categoryid,parentid
1629,140,
5,231,
1395,250,
1142,378,
1624,395,
939,431,
1628,653,
1322,659,
1657,679,
1111,755,


In [6]:
def find_root(x, df):
    """Return the top-level ancestor of category ``x`` in a category tree.

    Starting at ``x``, follows ``parentid`` links until it reaches a category
    whose ``parentid`` is NaN, and returns that category's id.

    Parameters
    ----------
    x : scalar
        Category id to start from; must appear in ``df['categoryid']``.
    df : pandas.DataFrame
        Frame with ``categoryid`` and ``parentid`` columns. A NaN ``parentid``
        marks a root category.

    Returns
    -------
    scalar
        ``x`` itself when it is already a root, otherwise the ``parentid``
        value that points at the root.

    Raises
    ------
    IndexError
        If ``x`` or one of its ancestors is not listed in ``df['categoryid']``
        (same exception type the previous implementation raised).
    ValueError
        If the parent links form a cycle, which would otherwise loop forever.
    """
    # Build the child -> parent lookup once per call instead of scanning the
    # whole frame twice per tree level. setdefault keeps the first row seen
    # for a repeated categoryid.
    parent_of = {}
    for child, parent in zip(df['categoryid'].tolist(), df['parentid'].tolist()):
        parent_of.setdefault(child, parent)

    visited = set()
    while True:
        if x in visited:
            raise ValueError('cycle detected in category tree at category %r' % (x,))
        visited.add(x)
        if x not in parent_of:
            raise IndexError('category %r is not present in df.categoryid' % (x,))
        parent = parent_of[x]
        if np.isnan(parent):
            return x
        x = parent

In [7]:
# Resolve the top-level ancestor of every category.
# assign() attaches the values by position and overwrites an existing
# 'top_parent' column, so the result does not depend on category_tree's index
# labels and the cell can be re-run without adding a duplicate column.
top_parents = [find_root(c, category_tree) for c in category_tree.categoryid]
category_tree = category_tree.assign(top_parent=top_parents)

In [8]:
# Number of categories under each top-level parent, largest first.
top_parent_summary = (
    category_tree.groupby('top_parent')['categoryid']
    .count()
    .reset_index()
    .rename(columns={'categoryid': 'num_categories'})
)
top_parent_summary.sort_values('num_categories', ascending=False)

Unnamed: 0,top_parent,num_categories
0,140.0,290
22,1600.0,187
6,653.0,154
21,1579.0,135
20,1532.0,130
4,395.0,120
18,1482.0,101
2,250.0,97
15,1224.0,70
5,431.0,66


In [9]:
top_parent_summary.shape

(25, 2)

25 top-level categories 

In [10]:
events = pd.read_csv(FILE_EVENTS)

In [11]:
events.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


In [12]:
events.describe(include='all')

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
count,2756101.0,2756101.0,2756101,2756101.0,22457.0
unique,,,3,,
top,,,view,,
freq,,,2664312,,
mean,1436424000000.0,701922.9,,234922.5,8826.497796
std,3366312000.0,405687.5,,134195.4,5098.99629
min,1430622000000.0,0.0,,3.0,0.0
25%,1433478000000.0,350566.0,,118120.0,4411.0
50%,1436453000000.0,702060.0,,236067.0,8813.0
75%,1439225000000.0,1053437.0,,350715.0,13224.0


In [13]:
events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2756101 entries, 0 to 2756100
Data columns (total 5 columns):
timestamp        int64
visitorid        int64
event            object
itemid           int64
transactionid    float64
dtypes: float64(1), int64(3), object(1)
memory usage: 105.1+ MB


In [14]:
events.event.value_counts()

view           2664312
addtocart        69332
transaction      22457
Name: event, dtype: int64

In [15]:
events.itemid.unique().shape

(235061,)

235,061 distinct products appear in view/addtocart/transaction events

In [16]:
def convert_to_local(x, tz=None):
    """Convert a Unix timestamp in milliseconds to a ``datetime``.

    Parameters
    ----------
    x : int or float
        Milliseconds since the Unix epoch.
    tz : datetime.tzinfo, optional
        Time zone of the result. With the default ``None`` the result is a
        naive datetime in the local time zone of the machine running the
        notebook, so the values differ between machines. Pass an explicit
        zone (e.g. ``timezone.utc``) for reproducible output.

    Returns
    -------
    datetime.datetime
        Naive local time when ``tz`` is None, otherwise an aware datetime
        in ``tz``.
    """
    return datetime.fromtimestamp(x / 1000, tz)

In [17]:
# Derive a datetime column from the millisecond `timestamp` column.
events['local_date_time'] = events['timestamp'].apply(convert_to_local)

In [18]:
events[events.visitorid == 1150086].sort_values('local_date_time').head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,local_date_time
230843,1434034517389,1150086,view,133542,,2015-06-11 07:55:17.389
240552,1434035735608,1150086,view,167873,,2015-06-11 08:15:35.608
243885,1434036006651,1150086,view,231726,,2015-06-11 08:20:06.651
229405,1434036288806,1150086,view,427777,,2015-06-11 08:24:48.806
243973,1434036332155,1150086,view,398115,,2015-06-11 08:25:32.155
243884,1434036403191,1150086,addtocart,398115,,2015-06-11 08:26:43.191
237133,1434036525614,1150086,transaction,398115,7510.0,2015-06-11 08:28:45.614
229386,1434036543539,1150086,view,398115,,2015-06-11 08:29:03.539
240457,1434036727711,1150086,view,203425,,2015-06-11 08:32:07.711
237227,1434036891672,1150086,view,458489,,2015-06-11 08:34:51.672


In [19]:
# Order each visitor's events chronologically, then store the gap to that
# visitor's next event (same unit as `timestamp`; NaN on a visitor's last event).
events.sort_values(by=['visitorid', 'local_date_time'], inplace=True)
events['time_diff'] = (
    events.groupby('visitorid')['timestamp']
    .diff(periods=-1)
    .mul(-1)
)

In [20]:
# Gap to the same visitor's next event, in seconds (timestamps are in ms).
# Recomputed from `timestamp` instead of rescaling `time_diff` in place, so
# re-running this cell does not divide by 1000 a second time. Relies on
# `events` still being sorted by visitor and time.
events['time_diff'] = events.groupby('visitorid')['timestamp'].diff(periods=-1) * -1 / 1000

In [21]:
# Mean gap and number of gaps per visitor, most active visitors first.
(
    events.groupby('visitorid')['time_diff']
    .agg(['mean', 'count'])
    .reset_index()
    .sort_values('count', ascending=False)
    .head()
)

Unnamed: 0,visitorid,mean,count
1150086,1150086,1086.141548,7756
530559,530559,1863.020505,4327
152963,152963,1378.517402,3023
895999,895999,4698.376979,2473
163561,163561,1124.994173,2409


In [22]:
events[events.visitorid == 280150].sort_values('local_date_time').head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,local_date_time,time_diff
807459,1439361235385,280150,view,208756,,2015-08-11 23:33:55.385,0.015
814692,1439361235400,280150,view,208756,,2015-08-11 23:33:55.400,


In [23]:
events.agg({'local_date_time':['min','max']})

Unnamed: 0,local_date_time
min,2015-05-02 20:00:04.384
max,2015-09-17 19:59:47.788


In [24]:
item_properties = pd.read_csv(FILE_ITEM_PROPERTIES_1)

In [25]:
# Derive a datetime column from the millisecond `timestamp` column.
item_properties['local_date_time'] = item_properties['timestamp'].apply(convert_to_local)

In [26]:
item_properties[item_properties.itemid.isin([133542])].sort_values('timestamp')

Unnamed: 0,timestamp,itemid,property,value,local_date_time
1223968,1431226800000,133542,112,679677,2015-05-09 20:00:00
2108400,1431226800000,133542,678,596263 643643,2015-05-09 20:00:00
7426099,1431226800000,133542,6,984245 1322342,2015-05-09 20:00:00
9768161,1432436400000,133542,810,n168.000 424566,2015-05-23 20:00:00
6557805,1433041200000,133542,928,769062,2015-05-30 20:00:00
10563297,1433041200000,133542,810,n168.000 424566,2015-05-30 20:00:00
6858405,1433041200000,133542,686,769062,2015-05-30 20:00:00
9569377,1433646000000,133542,810,n168.000 424566,2015-06-06 20:00:00
9966945,1434250800000,133542,810,n168.000 424566,2015-06-13 20:00:00
7745268,1435460400000,133542,839,596263 643643,2015-06-27 20:00:00


In [27]:
item_properties_2 = pd.read_csv(FILE_ITEM_PROPERTIES_2)

In [28]:
# Derive a datetime column from the millisecond `timestamp` column.
item_properties_2['local_date_time'] = item_properties_2['timestamp'].apply(convert_to_local)

In [29]:
item_properties_2[item_properties_2.itemid.isin([133542])].sort_values('timestamp')

Unnamed: 0,timestamp,itemid,property,value,local_date_time
4421443,1431226800000,133542,159,519769,2015-05-09 20:00:00
6760379,1431226800000,133542,888,1237022 553907 1042151,2015-05-09 20:00:00
4342960,1431226800000,133542,776,485843,2015-05-09 20:00:00
2961207,1431226800000,133542,917,1042151,2015-05-09 20:00:00
2579930,1431226800000,133542,available,0,2015-05-09 20:00:00
2567116,1431226800000,133542,categoryid,366,2015-05-09 20:00:00
1409439,1431226800000,133542,790,n3480.000,2015-05-09 20:00:00
6561596,1431831600000,133542,888,1237022 553907 1042151,2015-05-16 20:00:00
1065032,1431831600000,133542,283,984245 1322342 596263 643643 1237022 553907 10...,2015-05-16 20:00:00
6959162,1432436400000,133542,888,1237022 553907 1042151,2015-05-23 20:00:00


In [30]:
# Stack the two property part files into one frame. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# equivalent. Row labels are kept as-is (no ignore_index), matching what
# append did, so labels from the two parts may repeat.
item_properties_master = pd.concat([item_properties, item_properties_2])

In [31]:
item_properties_master[item_properties_master.itemid.isin([133542])].sort_values(['property','local_date_time'])

Unnamed: 0,timestamp,itemid,property,value,local_date_time
1223968,1431226800000,133542,112,679677,2015-05-09 20:00:00
4421443,1431226800000,133542,159,519769,2015-05-09 20:00:00
2454499,1435460400000,133542,19,n108.000 350726 30603 832471,2015-06-27 20:00:00
8718260,1433041200000,133542,202,1237022 553907 1042151,2015-05-30 20:00:00
7253748,1439694000000,133542,227,827388,2015-08-15 20:00:00
773218,1435460400000,133542,28,150169 610517,2015-06-27 20:00:00
1065032,1431831600000,133542,283,984245 1322342 596263 643643 1237022 553907 10...,2015-05-16 20:00:00
1374117,1435460400000,133542,364,580082,2015-06-27 20:00:00
875790,1439694000000,133542,521,769062,2015-08-15 20:00:00
1633064,1439694000000,133542,550,769062,2015-08-15 20:00:00


In [32]:
item_properties_master[item_properties_master.itemid.isin([167873])].sort_values(['property','local_date_time'])

Unnamed: 0,timestamp,itemid,property,value,local_date_time
3318724,1431226800000,167873,112,679677,2015-05-09 20:00:00
3540962,1439694000000,167873,119,18800,2015-08-15 20:00:00
6963406,1433646000000,167873,159,519769,2015-06-06 20:00:00
2897962,1439694000000,167873,202,993008 n1097064.000,2015-08-15 20:00:00
3282127,1433041200000,167873,235,1277294,2015-05-30 20:00:00
6986210,1431226800000,167873,283,305351 322391 1254637 993008 n1097064.000 3223...,2015-05-09 20:00:00
7118066,1431226800000,167873,364,583869,2015-05-09 20:00:00
7874336,1432436400000,167873,369,199526 n324.000,2015-05-23 20:00:00
475313,1433041200000,167873,480,1007882,2015-05-30 20:00:00
5593672,1431831600000,167873,6,305351 322391,2015-05-16 20:00:00


In [33]:
# One row per distinct (itemid, property) pair.
item_property_unique = item_properties_master[['itemid', 'property']].drop_duplicates()

In [34]:
# Number of distinct items carrying each property, most common first.
property_count = (
    item_property_unique.groupby('property')['itemid']
    .count()
    .sort_values(ascending=False)
    .reset_index()
)

In [35]:
property_count[property_count.itemid == 417053]

Unnamed: 0,property,itemid
0,categoryid,417053
1,283,417053
2,888,417053
3,790,417053
4,764,417053
5,available,417053
6,112,417053
7,159,417053
8,364,417053


The properties above are present for all items. Let's see what the unique values are for these properties.

In [36]:
# Ten most frequent values recorded for property '364'.
(
    item_properties_master.loc[item_properties_master['property'] == '364', 'value']
    .value_counts()
    .sort_values(ascending=False)
    .reset_index()
    .head(10)
)

Unnamed: 0,index,value
0,621884,18
1,845786,18
2,397659,18
3,702816,18
4,190497,18
5,350641,18
6,1286447,18
7,824320,18
8,664944,18
9,1251632,18


In [37]:
item_properties_master[item_properties_master.value.str.contains(r'[^\s]')]

Unnamed: 0,timestamp,itemid,property,value,local_date_time
0,1435460400000,460429,categoryid,1338,2015-06-27 20:00:00
1,1441508400000,206783,888,1116713 960601 n277.200,2015-09-05 20:00:00
2,1439089200000,395014,400,n552.000 639502 n720.000 424566,2015-08-08 20:00:00
3,1431226800000,59481,790,n15360.000,2015-05-09 20:00:00
4,1431831600000,156781,917,828513,2015-05-16 20:00:00
5,1436065200000,285026,available,0,2015-07-04 20:00:00
6,1434250800000,89534,213,1121373,2015-06-13 20:00:00
7,1431831600000,264312,6,319724,2015-05-16 20:00:00
8,1433646000000,229370,202,1330310,2015-06-06 20:00:00
9,1434250800000,98113,451,1141052 n48.000,2015-06-13 20:00:00


In [48]:
events.to_csv('../data/events_enhanced.csv', index=False)

In [49]:
category_tree.to_csv('../data/category_tree_parent.csv', index=False)

In [50]:
item_properties_master.to_csv('../data/item_properties_master.csv', index=False)

In [67]:
# Keep only events on or after 2015-08-15 (compared against local_date_time).
events_trimmed = events.loc[events['local_date_time'] >= datetime(2015, 8, 15)]

In [68]:
events_trimmed.shape

(599871, 7)

In [69]:
events_trimmed.event.value_counts()

view           579229
addtocart       15666
transaction      4976
Name: event, dtype: int64

In [70]:
# unique visitors
events_trimmed[events_trimmed.event == 'transaction'].visitorid.unique().shape

(2672,)

In [71]:
# Ids of visitors with at least one 'transaction' event in the trimmed window,
# followed by the shape of all trimmed events belonging to those visitors.
visitors = events_trimmed.loc[events_trimmed['event'] == 'transaction', 'visitorid'].unique()
events_trimmed.loc[events_trimmed['visitorid'].isin(visitors)].shape

(43737, 7)

In [None]:
# Open question: how to tell a visitor's individual sessions apart.
# Working idea: session start = first view, session end = last transaction.
buy_visitors = events_trimmed.loc[events_trimmed['visitorid'].isin(visitors)]

In [74]:
# Features still to derive: high-level category, availability, session time,
# hour of day, day of week.
buy_visitors.loc[buy_visitors['visitorid'] == 264]

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,local_date_time,time_diff
1282567,1441645752830,264,view,161949,,2015-09-07 10:09:12.830,1096.545
1279890,1441646849375,264,view,161949,,2015-09-07 10:27:29.375,213.134
1270212,1441647062509,264,view,459835,,2015-09-07 10:31:02.509,1.579
1266750,1441647064088,264,addtocart,459835,,2015-09-07 10:31:04.088,2.096
1270337,1441647066184,264,addtocart,161949,,2015-09-07 10:31:06.184,219.43
1276395,1441647285614,264,transaction,459835,8445.0,2015-09-07 10:34:45.614,0.094
1283314,1441647285708,264,transaction,161949,8445.0,2015-09-07 10:34:45.708,
