In [1]:
import numpy as np
import pandas as pd
# Show up to 100 rows when a frame is displayed.
# Use the fully-qualified key: the bare 'max_rows' pattern is ambiguous on
# recent pandas (it also matches 'styler.render.max_rows') and raises OptionError.
pd.set_option('display.max_rows', 100)
from datetime import datetime, timezone, timedelta

In [5]:
# Load the prepared training split; the file is tab-delimited.
data = pd.read_csv(
    './prepared/event1_train_full.0.txt',
    delimiter='\t',
)

In [6]:
# Peek at the first ten rows to confirm the columns parsed as expected.
data.iloc[:10]

Unnamed: 0,Time,UserId,ItemId,SessionId
0,1438969904,2,325215,2
1,1438970013,2,325215,2
2,1438970212,2,259884,2
3,1438970468,2,216305,2
4,1438970905,2,342816,2
5,1438971444,2,342816,2
6,1438971463,2,216305,2
7,1438971657,2,325215,2
8,1440916778,6,253615,7
9,1440916823,6,344723,7


In [9]:
# Relabel the columns positionally with snake_case names
# (the order must match the loaded frame: Time, UserId, ItemId, SessionId).
data.columns = pd.Index(['timestamp', 'user_id', 'item_id', 'session_id'])

In [10]:
# Persist the renamed training frame as CSV (comma is to_csv's default
# separator). The row index is written as well, as an unnamed first column.
data.to_csv('train.csv')

In [12]:
# Same treatment for the test split: load the tab-delimited file, relabel
# the columns positionally, and export it as comma-separated CSV.
test = pd.read_csv('./prepared/event1_test.0.txt', delimiter='\t')
test.columns = pd.Index(['timestamp', 'user_id', 'item_id', 'session_id'])
test.to_csv('test.csv')

In [12]:
# Longest gap, in seconds, tolerated between two consecutive events before
# the later one is treated as the start of a new session.
MAX_INTERVAL = int(timedelta(minutes=30).total_seconds())  # 1800 s

In [14]:
# Read the raw event log, keeping only its first four columns, then label
# them positionally (comma is read_csv's default separator).
data = pd.read_csv('./raw/events.csv', usecols=[0, 1, 2, 3])
data.columns = pd.Index(['Time', 'UserId', 'Type', 'ItemId'])

In [15]:
# Inspect the first ten raw events.
data.iloc[:10]

Unnamed: 0,Time,UserId,Type,ItemId
0,1433221332117,257597,view,355908
1,1433224214164,992329,view,248676
2,1433221999827,111016,view,318965
3,1433221955914,483717,view,253185
4,1433221337106,951259,view,367447
5,1433224086234,972639,view,22556
6,1433221923240,810725,view,443030
7,1433223291897,794181,view,439202
8,1433220899221,824915,view,428805
9,1433221204592,339335,view,82389


In [16]:
# Convert epoch milliseconds to whole epoch seconds.
# Integer floor division keeps the values exact (no float round-trip), and the
# explicit int64 cast avoids the platform-dependent width of plain `int`.
# NOTE: this overwrites 'Time' in place, so re-running the cell divides again.
data['Time'] = (data['Time'] // 1000).astype('int64')

In [17]:
# Sanity-check the converted range: (latest, earliest) in epoch seconds.
data.Time.max(), data.Time.min()

(1442545187, 1430622004)

In [6]:
# Sessionization, step 1: order events by user and then chronologically, and
# flag every row whose gap to the preceding row exceeds MAX_INTERVAL.
data.sort_values(['UserId', 'Time'], inplace=True)
# Gaps between consecutive rows; one element fewer than there are rows.
tdiff = np.diff(data['Time'].to_numpy())
# The first row has no predecessor, so it is always marked as a split point.
split_time = np.concatenate(([True], tdiff > MAX_INTERVAL))

In [7]:
# Sessionization, step 2: flag rows whose user differs from the row before.
# The first row has no predecessor, so it is always marked as a split point.
user_ids = data['UserId'].to_numpy()
split_user = np.concatenate(([True], user_ids[1:] != user_ids[:-1]))

In [8]:
# A session boundary is any row flagged by either criterion:
# a long pause or a change of user.
split_session = split_time | split_user

In [9]:
# Running sum of the boundary flags: each boundary increments the counter,
# so every row receives the number of the session it belongs to.
session_ids = split_session.cumsum()
data['SessionId'] = session_ids
data.sort_values(by=['SessionId', 'Time'], inplace=True)

In [11]:
# Keep only rows whose event type is 'view'; every other type is dropped.
data = data.loc[data['Type'] == 'view']

In [12]:
# Report basic statistics of the current frame; the span is shown as UTC dates.
data_start = datetime.fromtimestamp(data['Time'].min(), tz=timezone.utc)
data_end = datetime.fromtimestamp(data['Time'].max(), tz=timezone.utc)

summary = 'Loaded data set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}\n\tSpan: {} / {}\n\n'
print(summary.format(
    len(data),
    data['SessionId'].nunique(),
    data['ItemId'].nunique(),
    data_start.date().isoformat(),
    data_end.date().isoformat(),
))

Loaded data set
	Events: 2664312
	Sessions: 1755766
	Items: 234838
	Span: 2015-05-03 / 2015-09-18




In [14]:
# Positional relabel to snake_case
# (the frame's column order is Time, UserId, Type, ItemId, SessionId).
data.columns = pd.Index(['timestamp', 'user_id', 'type', 'item_id', 'session_id'])

In [17]:
# Inspect the first ten rows of the sessionized, renamed frame.
data.iloc[:10]

Unnamed: 0,timestamp,user_id,type,item_id,session_id
0,1442004589,0,view,285930,1
1,1442004759,0,view,357564,1
2,1442004917,0,view,67045,1
3,1439487966,1,view,72028,2
4,1438969904,2,view,325215,3
5,1438970013,2,view,325215,3
6,1438970212,2,view,259884,3
7,1438970468,2,view,216305,3
8,1438970905,2,view,342816,3
9,1438971444,2,view,342816,3


In [16]:
# Replace the index with a fresh 0..n-1 range, discarding the old labels
# instead of keeping them as a column.
data = data.reset_index(drop=True)

In [19]:
# Write the frame to CSV (comma is to_csv's default separator).
# The row index is written as well, as an unnamed first column.
data.to_csv('train-item-views.csv')

In [20]:
# Time range of the exported rows: (earliest, latest) in epoch seconds.
data['timestamp'].min(), data['timestamp'].max()

(1430622011, 1442545187)