In [1]:
import time
import random
import ciso8601
import numpy as np
import pandas as pd
import datetime as dt

from multiprocessing import Pool

In [2]:
## load data
# Read only the first three columns of the click log; the column names are
# assigned afterwards because the file is read with header=None.
column_dtypes = {0: np.int32, 1: str, 2: np.int64}
raw_data = pd.read_csv(
    './yoochoose-clicks.dat',
    sep=',',
    header=None,
    usecols=[0, 1, 2],
    dtype=column_dtypes,
)
raw_data.columns = ['SessionId', 'TimeStr', 'ItemId']
raw_data.shape

(33003944, 3)

In [3]:
# Fraction of the distinct session ids kept by the down-sampling step.
sample_rate = 0.1
# Seed Python's `random` module so the drawn session ids are repeatable.
random.seed(1050)


In [4]:
# Down-sample whole sessions: draw `sample_rate` of the distinct session ids
# without replacement and keep every click that belongs to a drawn session.
# NOTE: this overwrites `raw_data`, so re-running the cell samples the sample;
# reload the file first if the cell has to be executed again.
u_sessid = raw_data.SessionId.unique()
s_sessid = random.sample(u_sessid.tolist(), int(len(u_sessid)*sample_rate))
# Series.isin builds the same boolean mask as np.in1d, which NumPy 2.0
# deprecates. The explicit copy detaches the result from the full frame so
# that adding columns later does not raise SettingWithCopyWarning.
raw_data = raw_data[raw_data.SessionId.isin(s_sessid)].copy()
raw_data.shape

(3301436, 3)

In [5]:
# Display the sampled click log (columns: SessionId, TimeStr, ItemId).
raw_data

Unnamed: 0,SessionId,TimeStr,ItemId
15,6,2014-04-06T16:58:20.848Z,214701242
16,6,2014-04-06T17:02:26.976Z,214826623
55,21,2014-04-07T09:01:28.552Z,214838503
56,21,2014-04-07T09:03:39.903Z,214838503
57,21,2014-04-07T09:04:00.598Z,214838503
...,...,...,...
33003922,11299818,2014-09-24T19:55:12.619Z,214855209
33003923,11299819,2014-09-25T08:29:28.743Z,214854855
33003924,11299819,2014-09-25T08:30:27.345Z,214854815
33003925,11299819,2014-09-25T08:31:39.129Z,214854815


In [6]:
# Number of worker processes used by parallelize_dataframe; the frame is also
# split into this many chunks.
num_cores = 8

def timestr_to_timestamp(df):
    """Return ``df`` with a ``timestamp`` column derived from ``TimeStr``.

    Every ``TimeStr`` value is parsed with ``ciso8601.parse_datetime`` and
    converted to a float with ``datetime.timestamp()``.

    Args:
        df: DataFrame that has a ``TimeStr`` column of date-time strings
            accepted by ``ciso8601.parse_datetime``.

    Returns:
        A new DataFrame holding the columns of ``df`` plus ``timestamp``
        (an existing ``timestamp`` column is replaced). The input frame is
        not modified, so the function can be called on slices or chunks of a
        larger frame without writing into them.
    """
    # assign() builds a new frame instead of adding the column to the
    # caller's object in place.
    return df.assign(
        timestamp=df.TimeStr.apply(lambda x: ciso8601.parse_datetime(x).timestamp())
    )

def parallelize_dataframe(df, func, n_cores=None):
    """Apply ``func`` to row-wise chunks of ``df`` and concatenate the results.

    Args:
        df: DataFrame to process.
        func: Callable that takes one DataFrame chunk and returns a
            DataFrame. When more than one worker is used it is sent to the
            worker processes through ``Pool.map``.
        n_cores: Number of chunks and of worker processes. Defaults to the
            module-level ``num_cores``.

    Returns:
        The per-chunk results concatenated in chunk order.

    Raises:
        ValueError: If the resolved worker count is smaller than 1.
    """
    workers = num_cores if n_cores is None else int(n_cores)
    if workers < 1:
        raise ValueError('n_cores must be >= 1, got {}'.format(workers))

    # Split the row positions and slice with iloc. Calling np.array_split on
    # the DataFrame itself goes through DataFrame.swapaxes, which pandas has
    # deprecated; the chunk boundaries produced here are the same.
    row_groups = np.array_split(np.arange(len(df)), workers)
    df_split = [df.iloc[rows] for rows in row_groups]

    if workers == 1:
        # Nothing to parallelize: skip process start-up and pickling.
        return pd.concat([func(chunk) for chunk in df_split])

    # The context manager shuts the pool down even when func raises, so
    # worker processes are not left behind after a failure.
    with Pool(workers) as pool:
        return pd.concat(pool.map(func, df_split))

In [7]:
# Parse each TimeStr into a float timestamp and drop the string column.
# `data` is built as a new frame, so `raw_data` keeps its original columns:
# the cell can be re-run, and nothing is written into the filtered slice
# (the in-place column assignment raised SettingWithCopyWarning).
data = raw_data.assign(
    timestamp=raw_data.TimeStr.apply(lambda x: ciso8601.parse_datetime(x).timestamp())
).drop(columns='TimeStr')

In [8]:
# Display the frame after TimeStr has been replaced by the numeric timestamp.
data

Unnamed: 0,SessionId,ItemId,timestamp
15,6,214701242,1.396804e+09
16,6,214826623,1.396804e+09
55,21,214838503,1.396861e+09
56,21,214838503,1.396861e+09
57,21,214838503,1.396861e+09
...,...,...,...
33003922,11299818,214855209,1.411589e+09
33003923,11299819,214854855,1.411634e+09
33003924,11299819,214854815,1.411634e+09
33003925,11299819,214854815,1.411634e+09


In [9]:
# Order the events by session, and by time within each session.
data = data.sort_values(by=['SessionId', 'timestamp'], ascending=True)


In [10]:
## data length by sessionid
session_lengths = data.groupby('SessionId').size()
print("length:", len(session_lengths))
print("min length", min(session_lengths))
print("max length", max(session_lengths))

length: 924972
min length 1
max length 200


In [11]:
# Display the per-session event counts (Series indexed by SessionId).
session_lengths

SessionId
6           2
21          6
36          2
41          5
53          4
           ..
11562112    2
11562122    2
11562131    3
11562151    2
11562157    2
Length: 924972, dtype: int64

In [12]:
# Keep only sessions that contain at least two events.
# Series.isin builds the same mask as np.in1d, which NumPy 2.0 deprecates.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [13]:
# Number of events per item, indexed by ItemId.
item_supports = data.groupby('ItemId').size()
print("length:", len(item_supports))
# Series reductions print the same values as the builtin min()/max().
print("min length", item_supports.min())
print("max length", item_supports.max())

length: 34452
min length 1
max length 13262


In [14]:
# Display the per-item event counts (Series indexed by ItemId).
item_supports

ItemId
214507226        5
214507228        1
214507331     1282
214507365      155
214507385       63
              ... 
643078907        1
643078950        3
1178799879       1
1178804544       1
1178833843       1
Length: 34452, dtype: int64

In [15]:
# Keep only events whose item occurs at least five times in the data.
# Series.isin builds the same mask as np.in1d, which NumPy 2.0 deprecates.
data = data[data.ItemId.isin(item_supports[item_supports>=5].index)]

In [16]:
# Events of items seen fewer than 5 times were just removed, so session
# lengths are recomputed and sessions left with fewer than two events dropped.
session_lengths = data.groupby('SessionId').size()
# Series.isin builds the same mask as np.in1d, which NumPy 2.0 deprecates.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [17]:
# Train/test split by session end time: a session whose last event lies
# within the final 86400 seconds of the data goes to the test set, every
# other session to the training set.
tmax = data.timestamp.max()
session_max_times = data.groupby('SessionId')['timestamp'].max()
split_point = tmax-86400
# Collect the SessionId index on each side of the split point.
ends_before_split = session_max_times < split_point
ends_after_split = session_max_times >= split_point
session_train = session_max_times[ends_before_split].index
session_test = session_max_times[ends_after_split].index

In [18]:
# Materialize the two splits from the session id sets.
# Series.isin builds the same masks as np.in1d, which NumPy 2.0 deprecates.
train = data[data.SessionId.isin(session_train)]
test = data[data.SessionId.isin(session_test)]
# Drop test events on items that never occur in the training split.
test = test[test.ItemId.isin(train.ItemId)]
# Removing those events can leave test sessions with a single event; drop
# them, as the validation split does with its own sessions.
tslength = test.groupby('SessionId').size()
test = test[test.SessionId.isin(tslength[tslength>=2].index)]

In [19]:
# Display the test split.
test

Unnamed: 0,SessionId,ItemId,timestamp
32209768,11255868,214853754,1.412011e+09
32209769,11255868,214577709,1.412011e+09
32209770,11255868,214853754,1.412011e+09
32209802,11255882,214855046,1.411965e+09
32209803,11255882,214854913,1.411965e+09
...,...,...,...
32232694,11561912,214853092,1.412001e+09
32232695,11561912,214853092,1.412002e+09
32232400,11561946,214586805,1.412004e+09
32232401,11561946,214586805,1.412006e+09


In [20]:
# Display the full training split.
train

Unnamed: 0,SessionId,ItemId,timestamp
15,6,214701242,1.396804e+09
16,6,214826623,1.396804e+09
55,21,214838503,1.396861e+09
56,21,214838503,1.396861e+09
57,21,214838503,1.396861e+09
...,...,...,...
32230538,11562131,214854542,1.411823e+09
32230502,11562151,214536296,1.411769e+09
32230503,11562151,214536296,1.411769e+09
32230485,11562157,214580372,1.411648e+09


In [21]:
file_type='sample'
## save processed data
# Each split is summarized, written as a tab-separated file without the
# index, and its path echoed, in the order train then test.
train_path = './rsc15_train_{}.txt'.format(file_type)
test_path = './rsc15_test_{}.txt'.format(file_type)
for summary, frame, path in (
    ('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}', train, train_path),
    ('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}', test, test_path),
):
    print(summary.format(len(frame), frame.SessionId.nunique(), frame.ItemId.nunique()))
    frame.to_csv(path, sep='\t', index=False)
    print(path)

Full train set
	Events: 3140310
	Sessions: 791707
	Items: 21878
./rsc15_train_sample.txt
Test set
	Events: 7451
	Sessions: 1618
	Items: 1675
./rsc15_test_sample.txt


In [22]:
# Split a validation set off the training data: sessions whose last event
# lies within the final 86400 seconds of `train` go to `valid`, the rest to
# `train_tr`.
tmax = train.timestamp.max()
session_max_times = train.groupby('SessionId').timestamp.max()
session_train = session_max_times[session_max_times < tmax-86400].index
session_valid = session_max_times[session_max_times >= tmax-86400].index

# Series.isin builds the same masks as np.in1d, which NumPy 2.0 deprecates.
train_tr = train[train.SessionId.isin(session_train)]
valid = train[train.SessionId.isin(session_valid)]
# Keep only validation events on items present in train_tr, then drop the
# validation sessions left with fewer than two events.
valid = valid[valid.ItemId.isin(train_tr.ItemId)]
tslength = valid.groupby('SessionId').size()
valid = valid[valid.SessionId.isin(tslength[tslength>=2].index)]


In [23]:
# Display the training split that remains after the validation sessions were split off.
train_tr

Unnamed: 0,SessionId,ItemId,timestamp
15,6,214701242,1.396804e+09
16,6,214826623,1.396804e+09
55,21,214838503,1.396861e+09
56,21,214838503,1.396861e+09
57,21,214838503,1.396861e+09
...,...,...,...
32230538,11562131,214854542,1.411823e+09
32230502,11562151,214536296,1.411769e+09
32230503,11562151,214536296,1.411769e+09
32230485,11562157,214580372,1.411648e+09


In [24]:
# Each split is summarized, written as a tab-separated file without the
# index, and its path echoed, in the order train_tr then valid.
trn_path = './rsc15_train_{}_trn.txt'.format(file_type)
valid_path = './rsc15_train_{}_valid.txt'.format(file_type)
for summary, frame, path in (
    ('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}', train_tr, trn_path),
    ('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}', valid, valid_path),
):
    print(summary.format(len(frame), frame.SessionId.nunique(), frame.ItemId.nunique()))
    frame.to_csv(path, sep='\t', index=False)
    print(path)



Train set
	Events: 3134339
	Sessions: 790449
	Items: 21878
./rsc15_train_sample_trn.txt
Validation set
	Events: 5971
	Sessions: 1258
	Items: 1399
./rsc15_train_sample_valid.txt
