In [1]:
import time
import random
import ciso8601
import numpy as np
import pandas as pd
import datetime as dt

from multiprocessing import Pool

## Data Source
The raw click log used below (`yoochoose-clicks.dat`) comes from the RecSys Challenge 2015: https://2015.recsyschallenge.com/challenge.html

In [2]:
## set data path
# Template for pointing the notebook at your own directories (kept commented out):
#PATH_TO_ORIGINAL_DATA = '/PATH/TO/raw/'
#PATH_TO_PROCESSED_DATA = '/PATH/TO/processed/'
# NOTE(review): the active values below are absolute, machine-specific paths;
# anyone else running this notebook has to edit them first.
# Both end with '/' because later cells build file names by plain string concatenation.
PATH_TO_ORIGINAL_DATA = '/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/raw/'
PATH_TO_PROCESSED_DATA = '/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/processed/'

In [3]:
## load data
# Read only the first three columns of the headerless click log; they are
# named SessionId / TimeStr / ItemId right after loading. TimeStr is read as a
# string here and converted to a numeric timestamp in a later cell.
# NOTE(review): the trailing backslash is presumably needed so that %time sees
# the whole read_csv call as one line — keep it when editing.
%time raw_data = pd.read_csv(PATH_TO_ORIGINAL_DATA + 'yoochoose-clicks.dat', sep=',', \
                   header=None, usecols=[0,1,2], dtype={0:np.int32, 1:str, 2:np.int64})
raw_data.columns = ['SessionId', 'TimeStr', 'ItemId']
# last expression: (rows, columns) of the loaded frame
raw_data.shape

CPU times: user 28.2 s, sys: 4.06 s, total: 32.3 s
Wall time: 32.3 s


(33003944, 3)

In [4]:
## parameters
# sampling       : when True, a later cell keeps only a random subset of sessions
# sample_rate    : fraction of unique session ids kept when sampling is on
# single_process : when True, timestamps are parsed in the notebook process
#                  instead of a multiprocessing pool
sampling = True
sample_rate = 0.1
single_process = True

# suffix embedded in the names of the files written at the end of the notebook
file_type = "sample" if sampling else "full"
file_type

'sample'

In [5]:
## sampling
### The raw data is large, so to keep the tutorial quick to run,
### sample it by SessionId (a session is kept or dropped as a whole).
random.seed(1050)  # fixed seed so the same sessions are drawn on every run
if sampling:
    u_sessid = raw_data.SessionId.unique()
    s_sessid = random.sample(u_sessid.tolist(), int(len(u_sessid)*sample_rate))
    # Series.isin replaces the deprecated np.in1d. The .copy() makes the subset
    # an independent frame, so adding the 'timestamp' column in a later cell
    # does not raise SettingWithCopyWarning.
    raw_data = raw_data[raw_data.SessionId.isin(s_sessid)].copy()
raw_data.shape

(3301436, 3)

In [6]:
## multi processing: convert TimeStr to timestamp
# number of worker processes (and of DataFrame chunks) used by parallelize_dataframe
num_cores = 8

def timestr_to_timestamp(df):
    """Add a 'timestamp' column derived from the 'TimeStr' column.

    Every TimeStr value is parsed with ciso8601.parse_datetime and turned into
    a number via datetime.timestamp(). The new column is written onto `df`
    itself, and that same frame is returned.
    """
    def _to_epoch(time_str):
        return ciso8601.parse_datetime(time_str).timestamp()

    df['timestamp'] = df['TimeStr'].apply(_to_epoch)
    return df

def parallelize_dataframe(df, func):
    """Apply `func` to row-wise chunks of `df` in worker processes and re-join the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process; it is cut into `num_cores` consecutive row chunks.
    func : callable
        Receives one chunk and returns a DataFrame. It is handed to
        multiprocessing.Pool.map, so it must be picklable
        (e.g. a module-level function).

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through the deprecated DataFrame.swapaxes path. The
    # resulting chunks are the same consecutive row ranges.
    row_chunks = np.array_split(np.arange(len(df)), num_cores)
    df_split = [df.iloc[rows] for rows in row_chunks]
    pool = Pool(num_cores)
    try:
        parts = pool.map(func, df_split)
    finally:
        # release the worker processes even when func raises in one of them
        pool.close()
        pool.join()
    return pd.concat(parts)

In [7]:
## transpose timestr to timestamp
if single_process:
    ## single processing
    %time raw_data['timestamp'] = raw_data.TimeStr.apply(lambda x: ciso8601.parse_datetime(x).timestamp())
    data = raw_data
    del(data['TimeStr'])
else:
    ## multi processing
    %time data = parallelize_dataframe(raw_data, timestr_to_timestamp)
    del(data['TimeStr'])

CPU times: user 3.47 s, sys: 644 ms, total: 4.11 s
Wall time: 4.11 s


In [8]:
## sorted by sessionid, timestamp
%time data = data.sort_values(['SessionId','timestamp'])
data[:5]

CPU times: user 3.3 s, sys: 204 ms, total: 3.5 s
Wall time: 3.5 s


Unnamed: 0,SessionId,ItemId,timestamp
15,6,214701242,1396804000.0
16,6,214826623,1396804000.0
55,21,214838503,1396861000.0
56,21,214838503,1396861000.0
57,21,214838503,1396861000.0


In [9]:
## data length by sessionid
# number of events (rows) per session
session_lengths = data.groupby('SessionId').size()
print("length:", len(session_lengths))
# Series.min()/.max() are vectorized; the builtins iterate the Series in Python
print("min length", session_lengths.min())
print("max length", session_lengths.max())

length: 924972
min length 1
max length 200


In [10]:
## filter by session length
### Keep only sessions that contain at least 2 events.
# Series.isin replaces the deprecated np.in1d and yields the same boolean mask.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [11]:
## data length by itemid
# number of events (rows) per item
item_supports = data.groupby('ItemId').size()
print("length:", len(item_supports))
# Series.min()/.max() are vectorized; the builtins iterate the Series in Python
print("min length", item_supports.min())
print("max length", item_supports.max())

length: 34452
min length 1
max length 13262


In [12]:
## filter by item length
### Keep only items that occur at least 5 times.
# Series.isin replaces the deprecated np.in1d and yields the same boolean mask.
data = data[data.ItemId.isin(item_supports[item_supports>=5].index)]

In [13]:
## filter by session length
### Removing rare items can leave sessions with a single event,
### so the session-length filter is applied once more.
session_lengths = data.groupby('SessionId').size()
# Series.isin replaces the deprecated np.in1d and yields the same boolean mask.
data = data[data.SessionId.isin(session_lengths[session_lengths>=2].index)]

In [14]:
## split train & test set
### Sessions whose last event falls within the final 24 hours of the data
### form the test set; all earlier sessions form the training set.
tmax = data.timestamp.max()
session_max_times = data.groupby('SessionId').timestamp.max()
# 86400 seconds = 24 hours
session_train = session_max_times[session_max_times < tmax-86400].index
session_test = session_max_times[session_max_times >= tmax-86400].index
### Build the train & test frames; the test set keeps only items that
### also occur in the training set.
# Series.isin replaces the deprecated np.in1d and yields the same boolean masks.
train = data[data.SessionId.isin(session_train)]
test = data[data.SessionId.isin(session_test)]
test = test[test.ItemId.isin(train.ItemId)]

In [15]:
## filter by session length
### Dropping unseen items from the test set can leave sessions with a single
### event, so the session-length filter is applied once more.
tslength = test.groupby('SessionId').size()
# Series.isin replaces the deprecated np.in1d and yields the same boolean mask.
test = test[test.SessionId.isin(tslength[tslength>=2].index)]

In [16]:
## save processed data
# For each split: print a summary, write a tab-separated file without the
# index column, then echo the path that was written.
train_path = PATH_TO_PROCESSED_DATA + 'rsc15_train_{}.txt'.format(file_type)
print('Full train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
    len(train), train.SessionId.nunique(), train.ItemId.nunique()))
train.to_csv(train_path, sep='\t', index=False)
print(train_path)

test_path = PATH_TO_PROCESSED_DATA + 'rsc15_test_{}.txt'.format(file_type)
print('Test set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(
    len(test), test.SessionId.nunique(), test.ItemId.nunique()))
test.to_csv(test_path, sep='\t', index=False)
print(test_path)

Full train set
	Events: 3140310
	Sessions: 791707
	Items: 21878
/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/processed/rsc15_train_sample.txt
Test set
	Events: 7451
	Sessions: 1618
	Items: 1675
/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/processed/rsc15_test_sample.txt


In [17]:
## make validation set
### Split a validation set off the training data with the same procedure:
### sessions ending within the final 24 hours of `train` become validation data.
tmax = train.timestamp.max()
session_max_times = train.groupby('SessionId').timestamp.max()
# 86400 seconds = 24 hours
session_train = session_max_times[session_max_times < tmax-86400].index
session_valid = session_max_times[session_max_times >= tmax-86400].index
# Series.isin replaces the deprecated np.in1d and yields the same boolean masks.
train_tr = train[train.SessionId.isin(session_train)]
valid = train[train.SessionId.isin(session_valid)]
# keep only validation items that also occur in the reduced training set
valid = valid[valid.ItemId.isin(train_tr.ItemId)]
# the item filter can leave single-event sessions; drop them
tslength = valid.groupby('SessionId').size()
valid = valid[valid.SessionId.isin(tslength[tslength>=2].index)]
print('Train set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(train_tr), 
        train_tr.SessionId.nunique(), train_tr.ItemId.nunique()))
train_tr.to_csv(PATH_TO_PROCESSED_DATA + 'rsc15_train_{}_trn.txt'.format(file_type), 
                sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'rsc15_train_{}_trn.txt'.format(file_type))
print('Validation set\n\tEvents: {}\n\tSessions: {}\n\tItems: {}'.format(len(valid), 
        valid.SessionId.nunique(), valid.ItemId.nunique()))
valid.to_csv(PATH_TO_PROCESSED_DATA + 'rsc15_train_{}_valid.txt'.format(file_type), 
             sep='\t', index=False)
print(PATH_TO_PROCESSED_DATA + 'rsc15_train_{}_valid.txt'.format(file_type))

Train set
	Events: 3134339
	Sessions: 790449
	Items: 21878
/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/processed/rsc15_train_sample_trn.txt
Validation set
	Events: 5971
	Sessions: 1258
	Items: 1399
/home/khlee/git/recommendation/GRU4Rec_TensorFlow/data/processed/rsc15_train_sample_valid.txt
