In [1]:
import pandas as pd
import json
from itertools import groupby, chain

## Load datasets

In [2]:
df_events = pd.read_csv('./events.csv')
df_item1 = pd.read_csv('./item_properties_part1.csv')
df_item2 = pd.read_csv('./item_properties_part2.csv')
df_item = pd.concat([df_item1, df_item2])

### See some items in the datasets

In [3]:
df_events.head(10)

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,
5,1433224086234,972639,view,22556,
6,1433221923240,810725,view,443030,
7,1433223291897,794181,view,439202,
8,1433220899221,824915,view,428805,
9,1433221204592,339335,view,82389,


In [4]:
df_item.head(10)

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
5,1436065200000,285026,available,0
6,1434250800000,89534,213,1121373
7,1431831600000,264312,6,319724
8,1433646000000,229370,202,1330310
9,1434250800000,98113,451,1141052 n48.000


### Filter out unnecessary records

In [5]:
df_item_cats = df_item.loc[df_item['property'] == 'categoryid']
df_item_cats.sort_values(['itemid', 'timestamp']).head(10)

Unnamed: 0,timestamp,itemid,property,value
8220985,1431226800000,0,categoryid,209
8428979,1431226800000,1,categoryid,1114
9157980,1431226800000,2,categoryid,1305
3828355,1433041200000,3,categoryid,1171
8767464,1431831600000,4,categoryid,1038
9682959,1432436400000,5,categoryid,1061
562977,1431226800000,6,categoryid,1091
8724941,1435460400000,7,categoryid,512
1286993,1435460400000,8,categoryid,173
5032420,1431226800000,10,categoryid,1301


In [6]:
df_item_cats_collapsed = df_item_cats.sort_values('timestamp', ascending=True).drop_duplicates(['itemid', 'value'])
df_item_cats_collapsed.sort_values('itemid').head(10)

Unnamed: 0,timestamp,itemid,property,value
8220985,1431226800000,0,categoryid,209
8428979,1431226800000,1,categoryid,1114
9157980,1431226800000,2,categoryid,1305
3828355,1433041200000,3,categoryid,1171
8767464,1431831600000,4,categoryid,1038
9682959,1432436400000,5,categoryid,1061
562977,1431226800000,6,categoryid,1091
8724941,1435460400000,7,categoryid,512
1286993,1435460400000,8,categoryid,173
5032420,1431226800000,10,categoryid,1301


### Joint two tables

In [7]:
df_events_cats = pd.merge(df_item_cats_collapsed, df_events, on='itemid')
df_events_cats.sort_values('visitorid').head(10)

Unnamed: 0,timestamp_x,itemid,property,value,timestamp_y,visitorid,event,transactionid
838575,1431226800000,357564,categoryid,256,1442004759591,0,view,
1595267,1431831600000,285930,categoryid,1188,1442004589439,0,view,
1706702,1432436400000,67045,categoryid,333,1442004917175,0,view,
1363344,1431226800000,72028,categoryid,1192,1439487966444,1,view,
2538349,1435460400000,325215,categoryid,299,1438970013790,2,view,
1669020,1432436400000,259884,categoryid,299,1438970212664,2,view,
2538347,1435460400000,325215,categoryid,299,1438971657845,2,view,
2538350,1435460400000,325215,categoryid,299,1438969904567,2,view,
1184212,1431226800000,342816,categoryid,444,1438970905669,2,view,
1184211,1431226800000,342816,categoryid,444,1438971444375,2,view,


Remove invalid entries

In [8]:
df_events_cats = df_events_cats[df_events_cats.timestamp_x < df_events_cats.timestamp_y]

Make concise table

In [9]:
df_events_cats['event_title'] = df_events_cats[['event', 'value']].apply(lambda x: ':'.join(x), axis=1)

In [10]:
df_event_seqs = df_events_cats[['visitorid', 'timestamp_y', 'event_title', 'itemid']]
df_event_seqs.columns = ['visitorid', 'timestamp', 'event_title', 'itemid']
df_event_seqs = df_event_seqs.sort_values(['visitorid', 'timestamp'])
df_event_seqs.head(50)

Unnamed: 0,visitorid,timestamp,event_title,itemid
1595267,0,1442004589439,view:1188,285930
838575,0,1442004759591,view:256,357564
1706702,0,1442004917175,view:333,67045
1363344,1,1439487966444,view:1192,72028
2538350,2,1438969904567,view:299,325215
2538349,2,1438970013790,view:299,325215
1669020,2,1438970212664,view:299,259884
1921503,2,1438970468920,view:299,216305
1184212,2,1438970905669,view:444,342816
1184211,2,1438971444375,view:444,342816


## Group by visitor

Convert to dictionaries

In [11]:
events = df_event_seqs.sort_values('visitorid').to_dict(orient='records')

Group by visitor id

In [12]:
event_seqs = []
for k, g in groupby(events, lambda x: x['visitorid']):
    event_seq = list(g)
    event_seqs.append({
        'visitorId': event_seq[0]['visitorid'],
        'events': [{
            'timestamp': event['timestamp'],
            'title': event['event_title'],
            'itemId': event['itemid']
        } for event in event_seq]
    })

In [13]:
event_seqs[0:10]

[{'events': [{'itemId': 285930,
    'timestamp': 1442004589439,
    'title': 'view:1188'},
   {'itemId': 357564, 'timestamp': 1442004759591, 'title': 'view:256'},
   {'itemId': 67045, 'timestamp': 1442004917175, 'title': 'view:333'}],
  'visitorId': 0},
 {'events': [{'itemId': 72028,
    'timestamp': 1439487966444,
    'title': 'view:1192'}],
  'visitorId': 1},
 {'events': [{'itemId': 325215,
    'timestamp': 1438969904567,
    'title': 'view:299'},
   {'itemId': 325215, 'timestamp': 1438970013790, 'title': 'view:299'},
   {'itemId': 259884, 'timestamp': 1438970212664, 'title': 'view:299'},
   {'itemId': 216305, 'timestamp': 1438970468920, 'title': 'view:299'},
   {'itemId': 342816, 'timestamp': 1438970905669, 'title': 'view:444'},
   {'itemId': 342816, 'timestamp': 1438971444375, 'title': 'view:444'},
   {'itemId': 216305, 'timestamp': 1438971463170, 'title': 'view:299'},
   {'itemId': 325215, 'timestamp': 1438971657845, 'title': 'view:299'}],
  'visitorId': 2},
 {'events': [{'itemId'

In [14]:
ts_diff = 30 * 60 * 1000
def split_events(event_seq):
    seqs = []
    seq = {
        'visitorId': event_seq['visitorId'],
        'events': [event_seq['events'][0]]
    }
    seqs.append(seq)
    for i in range(1, len(event_seq['events'])):
        event = event_seq['events'][i]
        previous_event = event_seq['events'][i - 1]
        if (event['timestamp'] - previous_event['timestamp'] > ts_diff):
            seq = {
                'visitorId': event_seq['visitorId'],
                'events': [event]
            }
            seqs.append(seq)
        else:
            seq['events'].append(event)
    return seqs

In [15]:
split_events(list(filter(lambda x: len(x['events']) > 40, event_seqs))[0])

[{'events': [{'itemId': 293512,
    'timestamp': 1438360638722,
    'title': 'view:589'},
   {'itemId': 395407, 'timestamp': 1438360764412, 'title': 'view:589'},
   {'itemId': 270798, 'timestamp': 1438360782479, 'title': 'view:589'},
   {'itemId': 270798, 'timestamp': 1438360805941, 'title': 'view:589'},
   {'itemId': 178991, 'timestamp': 1438360958824, 'title': 'view:1135'},
   {'itemId': 80661, 'timestamp': 1438361004536, 'title': 'view:1135'},
   {'itemId': 270798, 'timestamp': 1438361162414, 'title': 'view:589'},
   {'itemId': 80661, 'timestamp': 1438361189536, 'title': 'view:1135'}],
  'visitorId': 1879},
 {'events': [{'itemId': 270798,
    'timestamp': 1438710985871,
    'title': 'view:589'},
   {'itemId': 380440, 'timestamp': 1438622869427, 'title': 'view:586'},
   {'itemId': 270798, 'timestamp': 1438622894828, 'title': 'view:589'},
   {'itemId': 80661, 'timestamp': 1438622912948, 'title': 'view:1135'}],
  'visitorId': 1879},
 {'events': [{'itemId': 125819,
    'timestamp': 1438

In [16]:
event_seqs_split30mins = list(chain.from_iterable(map(split_events, event_seqs)))

In [20]:
event_seqs_split30mins[0:10]

[{'events': [{'itemId': 285930,
    'timestamp': 1442004589439,
    'title': 'view:1188'},
   {'itemId': 357564, 'timestamp': 1442004759591, 'title': 'view:256'},
   {'itemId': 67045, 'timestamp': 1442004917175, 'title': 'view:333'}],
  'visitorId': 0},
 {'events': [{'itemId': 72028,
    'timestamp': 1439487966444,
    'title': 'view:1192'}],
  'visitorId': 1},
 {'events': [{'itemId': 325215,
    'timestamp': 1438969904567,
    'title': 'view:299'},
   {'itemId': 325215, 'timestamp': 1438970013790, 'title': 'view:299'},
   {'itemId': 259884, 'timestamp': 1438970212664, 'title': 'view:299'},
   {'itemId': 216305, 'timestamp': 1438970468920, 'title': 'view:299'},
   {'itemId': 342816, 'timestamp': 1438970905669, 'title': 'view:444'},
   {'itemId': 342816, 'timestamp': 1438971444375, 'title': 'view:444'},
   {'itemId': 216305, 'timestamp': 1438971463170, 'title': 'view:299'},
   {'itemId': 325215, 'timestamp': 1438971657845, 'title': 'view:299'}],
  'visitorId': 2},
 {'events': [{'itemId'

In [18]:
f = open('ecommerce-eventseq.jsonl', 'w')
for event_seq in event_seqs_split30mins:
    f.write(json.dumps(event_seq))
    f.write('\n')
f.flush()