In [1]:
import pandas as pd
from datetime import datetime

import numpy as np

FILE_CATEGORY_TREE = '../data/category_tree.csv'
FILE_EVENTS = '../data/events.csv'
FILE_ITEM_PROPERTIES_1 = '../data/item_properties_part1.csv'
FILE_ITEM_PROPERTIES_2 = '../data/item_properties_part2.csv'
FILE_ITEM_PROPERTIES_ALL = '../data/item_properties_all.csv'

### Original EDA analysis

``` python
category_tree = pd.read_csv(FILE_CATEGORY_TREE)

category_tree.head()

category_tree.shape

category_tree[category_tree.parentid.isnull()].sort_values('categoryid')

def find_root(x, df):
    while True: 
        if np.isnan(df.loc[df.categoryid == x, 'parentid'].unique()[0]):
            return x
        else:
            x = df.loc[df.categoryid == x, 'parentid'].unique()[0]

l = []
for c in category_tree.categoryid:
    l.append(find_root(c,category_tree))

category_tree = pd.concat([category_tree,pd.Series(l,name='top_parent')],axis=1)

top_parent_summary = category_tree.groupby('top_parent')['categoryid'].count().reset_index()
top_parent_summary.rename(columns = {'categoryid':'num_categories'}, inplace=True)
top_parent_summary.sort_values('num_categories',ascending=False)

top_parent_summary.shape

25 top-level categories 

events = pd.read_csv(FILE_EVENTS)

events.head()

events.describe(include='all')

events.info()

events.event.value_counts()

events.itemid.unique().shape

235,061 products view/addtocart/transcation

def convert_to_local(x):
    return datetime.fromtimestamp(x/1000)

events['local_date_time'] = events.timestamp.apply(convert_to_local)

events[events.visitorid == 1150086].sort_values('local_date_time').head(10)

events.sort_values(['visitorid','local_date_time'], inplace=True)
events['time_diff'] = events.groupby('visitorid')['timestamp'].diff(periods=-1) *-1

events.time_diff = events.time_diff / 1000 # convert from milliseconds to seconds

events.groupby('visitorid')['time_diff'].agg(['mean','count']).reset_index().sort_values('count', ascending=False).head()

events[events.visitorid == 280150].sort_values('local_date_time').head(10)

events.agg({'local_date_time':['min','max']})

item_properties = pd.read_csv(FILE_ITEM_PROPERTIES_1)

item_properties['local_date_time'] = item_properties.timestamp.apply(convert_to_local)

item_properties[item_properties.itemid.isin([133542])].sort_values('timestamp')

item_properties_2 = pd.read_csv(FILE_ITEM_PROPERTIES_2)

item_properties_2['local_date_time'] = item_properties_2.timestamp.apply(convert_to_local)

item_properties_2[item_properties_2.itemid.isin([133542])].sort_values('timestamp')

item_properties_master = item_properties.append(item_properties_2)

item_properties_master[item_properties_master.itemid.isin([133542])].sort_values(['property','local_date_time'])

item_properties_master[item_properties_master.itemid.isin([167873])].sort_values(['property','local_date_time'])

item_property_unique = item_properties_master.loc[:,['itemid','property']].drop_duplicates()

property_count = item_property_unique.groupby('property')['itemid'].count().sort_values(ascending=False).reset_index()

property_count[property_count.itemid == 417053]

Above properties are represented of all items. Let's see what the unique values are for these properties.

item_properties_master.loc[item_properties_master.property == '364','value'].value_counts().sort_values(ascending=False).reset_index().head(10)

item_properties_master[item_properties_master.value.str.contains(r'[^\s]')]
```

### New EDA Analysis

In [2]:
events = pd.read_csv('../data/events_enhanced.csv')
events.local_date_time = pd.to_datetime(events.local_date_time)

In [3]:
category_tree = pd.read_csv('../data/category_tree_parent.csv')

In [4]:
item_properties_master = pd.read_csv('../data/item_properties_master.csv')
item_properties_master.local_date_time = pd.to_datetime(item_properties_master.local_date_time)

### Reduce data set size and begin to build MVP feature set

In [5]:
events_trimmed = events[events.local_date_time >= datetime(2015, 8, 15)]

print(f'Trimmed events {events_trimmed.shape[0]:,}')
print()

print(f'Count of events: \n{events_trimmed.event.value_counts()}')
print()
# unique visitors
print(f'Visitors that bought something {events_trimmed[events_trimmed.event == "transaction"].visitorid.unique().shape[0]:,}')
print()

# all visitors where at least one session ended in a transaction
visitors = events_trimmed[events_trimmed.event == 'transaction'].visitorid.unique()
print(f'Events for visitors who bought something {events_trimmed[events_trimmed.visitorid.isin(visitors)].shape[0]:,}')
buy_visitors = events_trimmed[events_trimmed.visitorid.isin(visitors)]

# calculate the session_id
# session_id identifes each pattern of view...transaction for each visitor as unique
l = []
for v in visitors:
    v_df = buy_visitors[buy_visitors.visitorid == v].sort_values('local_date_time')
    prev_event = 'view'
    session_id = 1
    for i in v_df.index:
        if prev_event == 'transaction' and v_df.loc[i,'event'] != 'transaction':
            session_id += 1

        prev_event = v_df.loc[i,'event']
        l.append(session_id)

# assign each session_id and make it unique
buy_visitors['session_id'] = l
buy_visitors.session_id = buy_visitors.visitorid.astype('str') + '_' + buy_visitors.session_id.astype('str')

# group by session_id and remove those sessions without a transaction
grouped_events = buy_visitors.groupby(['session_id','event'])['visitorid'].count().reset_index()
valid_sessions = grouped_events[grouped_events.event == 'transaction'].session_id
buy_visitors = buy_visitors[buy_visitors.session_id.isin(valid_sessions)]

print()
print(f'Sessions that have at least one transaction {buy_visitors.shape[0]:,}')

# calcaulte session length feature
buy_visitors.time_diff = buy_visitors.time_diff.shift(1)
buy_visitors.rename(columns={'time_diff':'session_length'}, inplace=True)

# calculate hour of day and day of week
buy_visitors['session_hour'] = buy_visitors.local_date_time.dt.hour
buy_visitors['session_dow'] = buy_visitors.local_date_time.dt.dayofweek

# get category_id
category_property = item_properties_master[item_properties_master.property == 'categoryid']

event_category_df = buy_visitors.merge(category_property, how='left',on='itemid')
print(f'Rows when left join {event_category_df.shape[0]:,}')

event_category_df = buy_visitors.merge(category_property, how='inner',on='itemid')
print(f'Rows when inner join {event_category_df.shape[0]:,}')

print('Using inner join for now, and will come back later.')

# get all categories that were set prior to the visitor seeing the page
event_category_df = event_category_df[event_category_df.local_date_time_x > event_category_df.local_date_time_y]

category_max_date_time = event_category_df.groupby(['local_date_time_x','itemid','session_id'])['local_date_time_y'].max().reset_index()
event_category_df = event_category_df.merge(category_max_date_time, how='inner', on=['local_date_time_x','itemid','session_id','local_date_time_y'])

# clean up the df
drop_c = ['timestamp_y', 'property', 'local_date_time_y']
event_category_df.drop(columns=drop_c, inplace=True)

rename_c = {'timestamp_x':'timestamp', 'local_date_time_x':'local_date_time','value':'category_id'}
event_category_df.rename(columns=rename_c, inplace=True)

# set it back to buy_visitors
buy_visitors = event_category_df
del(event_category_df)
print(f'Updated shape of the feature DF {buy_visitors.shape[0]:,}')

Trimmed events 599,871

Count of events: 
view           579229
addtocart       15666
transaction      4976
Name: event, dtype: int64

Visitors that bought something 2,672

Events for visitors who bought something 43,737


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value



Sessions that have at least one transaction 37,096
Rows when left join 69,523
Rows when inner join 68,260
Using inner join for now, and will come back later.
Updated shape of the feature DF 35,036


In [6]:
buy_visitors.head()
# TODO Available and then figure out the t-1 thing.
# Thinking of adding a feature to start with transaction and count backwards within each session ordered by time

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,local_date_time,session_length,session_id,session_hour,session_dow,category_id
0,1441645752830,264,view,161949,,2015-09-07 10:09:12.830,,264_1,10,0,1421
1,1441646849375,264,view,161949,,2015-09-07 10:27:29.375,1096.545,264_1,10,0,1421
2,1441647066184,264,addtocart,161949,,2015-09-07 10:31:06.184,2.096,264_1,10,0,1421
3,1441647285708,264,transaction,161949,8445.0,2015-09-07 10:34:45.708,0.094,264_1,10,0,1421
4,1441647062509,264,view,459835,,2015-09-07 10:31:02.509,213.134,264_1,10,0,1421
