In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# List the data directory (sizes in human-readable units) to see which input and output files are present.
ls -lah ../data/

total 24G
drwxrwxr-x 5 ubuntu ubuntu 4.0K May  3 09:35 [0m[01;34m.[0m/
drwxrwxr-x 8 ubuntu ubuntu 4.0K Apr 30 17:30 [01;34m..[0m/
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 [01;34m.ipynb_checkpoints[0m/
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 click_data.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 [01;34mkenkoooos[0m/
-rw-r--r-- 1 ubuntu ubuntu  18G May  2 14:38 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 [01;34mraw[0m/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 [01;31mraw.zip[0m


In [3]:
%time
merge = pd.read_feather("../data/click_data.feather", nthreads=4)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.3 µs


In [4]:
# Number of rows whose `is_test` flag is 0/False.
# Summing the boolean mask avoids materialising a filtered copy of the whole
# frame just to read its length; int() keeps the result a plain Python int,
# as `.shape[0]` returned before.
train_size = int((merge.is_test == 0).sum())

In [5]:
# (rows, columns) of the frame as loaded, before any derived columns are added.
merge.shape

(203693876, 9)

In [6]:
# Column dtypes as loaded from the feather file.
merge.dtypes

app                      uint16
channel                  uint16
click_id                 uint32
device                   uint16
ip                       uint32
is_attributed             uint8
os                       uint16
click_time       datetime64[ns]
is_test                    bool
dtype: object

### Derive time features from `click_time`

In [7]:
%%time
merge['dow'] = merge.click_time.dt.dayofweek.astype('uint8')
merge['hour'] = merge.click_time.dt.hour.astype('uint8')
merge['minute'] = merge.click_time.dt.minute.astype('uint8')
merge['min5'] = ((merge.click_time.dt.minute/5).apply(lambda x: x//1)*5).astype('uint8')
merge['second'] = merge.click_time.dt.second.astype('uint8')

CPU times: user 1min 54s, sys: 11 s, total: 2min 5s
Wall time: 2min 2s


In [8]:
%%time
MOD = int(1e+8)
seconds = (merge.hour.astype('uint32')*60*60
           + merge.minute.astype('uint32')*60
           + merge.second.astype('uint32'))
merge['sin_time'] = np.sin(2*np.pi*seconds/(24*60*60)).round(8) + 0
merge['cos_time'] = np.cos(2*np.pi*seconds/(24*60*60)).round(8) + 0
del seconds

CPU times: user 17.1 s, sys: 6.79 s, total: 23.9 s
Wall time: 19.5 s


In [9]:
# Spot-check the cyclic encoding: print the first row found at each
# quarter-day hour (0, 6, 12, 18) with its sin/cos values.
for probe_hour in (0, 6, 12, 18):
    print(merge.loc[merge.hour == probe_hour, ['sin_time', 'cos_time']].head(1))

         sin_time  cos_time
9308085       0.0       1.0
          sin_time  cos_time
29409346       1.0       0.0
          sin_time  cos_time
47448221       0.0      -1.0
         sin_time  cos_time
3571125      -1.0       0.0


In [10]:
# Column dtypes after adding the derived time columns.
merge.dtypes

app                      uint16
channel                  uint16
click_id                 uint32
device                   uint16
ip                       uint32
is_attributed             uint8
os                       uint16
click_time       datetime64[ns]
is_test                    bool
dow                       uint8
hour                      uint8
minute                    uint8
min5                      uint8
second                    uint8
sin_time                float64
cos_time                float64
dtype: object

In [11]:
# Final shape, plus a small explicit preview. Asking for the first rows with
# head() avoids handing the whole frame to the notebook renderer.
print(merge.shape)
merge.head(10)

(203693876, 16)


Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,hour,minute,min5,second,sin_time,cos_time
0,20,259,99999999,1,14901,0,17,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
1,2,477,99999999,2,5729,0,37,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
2,8,145,99999999,1,105475,0,19,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
3,26,121,99999999,1,93021,0,13,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
4,20,259,99999999,1,78507,0,30,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
5,14,379,99999999,1,97463,0,14,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
6,15,315,99999999,1,95766,0,27,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
7,12,245,99999999,1,156391,0,19,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
8,15,138,99999999,1,73555,0,10,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000
9,15,153,99999999,1,5314,0,19,2017-11-06 16:00:00,False,0,16,0,0,0,-0.866025,-0.500000


In [12]:
# Persist the frame, including the derived time columns, to ../data/merge.feather.
merge.to_feather("../data/merge.feather")