In [1]:
import gc
import time
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
ls -lah ../data/

total 13G
drwxrwxr-x 5 ubuntu ubuntu 4.0K Apr 30 07:21 ./
drwxrwxr-x 8 ubuntu ubuntu 4.0K Apr 30 17:30 ../
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 .ipynb_checkpoints/
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 click_data.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 kenkoooos/
-rw-r--r-- 1 ubuntu ubuntu 6.3G May  2 13:22 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 raw/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 raw.zip


In [3]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=4)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 8.34 µs


In [4]:
# Number of training rows, i.e. rows whose `is_test` flag is 0/False.
# Summing the boolean mask counts them directly; indexing `merge` with the mask
# would first build a filtered copy of the entire frame just to read its length.
train_size = int((merge.is_test == 0).sum())

In [5]:
# (rows, columns) of the loaded frame.
merge.shape

(145713707, 16)

In [6]:
# Column dtypes as loaded, before any key columns are added.
merge.dtypes

app                      uint16
channel                  uint16
click_id                 uint32
device                   uint16
ip                       uint32
is_attributed             uint8
os                       uint16
click_time       datetime64[ns]
is_test                    bool
dow                       uint8
hour                      uint8
minute                    uint8
min5                      uint8
second                    uint8
sin_time                float64
cos_time                float64
dtype: object

### get "ip_os_device"
### get "ip_os_device_dow_hour"
### get "ip_os_device_dow_hour_min5"
ip_os_device_dowは，testデータがhour単位で虫食いなので作成しない

In [7]:
# Value ranges of the columns that get packed into composite keys below;
# the maxima decide how many decimal digits each column needs.
print('ip:     ', merge['ip'].min(), merge['ip'].max())
print('os:     ', merge['os'].min(), merge['os'].max())
print('device: ', merge['device'].min(), merge['device'].max())

ip:      0 364778
os:      0 956
device:  0 4227


In [8]:
%%time
merge['ip_os_device'] = \
    (merge.device.astype('uint64')
     + merge.os.astype('uint64')*10000
     + merge.ip.astype('uint64')*10000000)

merge['ip_os_device_dow_hour'] = \
    (merge.hour.astype('uint64')
     + merge.dow.astype('uint64')*100
     + merge.device.astype('uint64')*1000
     + merge.os.astype('uint64')*10000000
     + merge.ip.astype('uint64')*10000000000)
    
merge['ip_os_device_dow_hour_min5'] = \
    (merge.min5.astype('uint64')
     + merge.hour.astype('uint64')*100
     + merge.dow.astype('uint64')*10000
     + merge.device.astype('uint64')*100000
     + merge.os.astype('uint64')*1000000000
     + merge.ip.astype('uint64')*1000000000000)

CPU times: user 8.56 s, sys: 11.1 s, total: 19.7 s
Wall time: 19.7 s


In [9]:
# Column dtypes again, to check the newly added key columns and their dtype.
merge.dtypes

app                                   uint16
channel                               uint16
click_id                              uint32
device                                uint16
ip                                    uint32
is_attributed                          uint8
os                                    uint16
click_time                    datetime64[ns]
is_test                                 bool
dow                                    uint8
hour                                   uint8
minute                                 uint8
min5                                   uint8
second                                 uint8
sin_time                             float64
cos_time                             float64
ip_os_device                          uint64
ip_os_device_dow_hour                 uint64
ip_os_device_dow_hour_min5            uint64
dtype: object

In [10]:
# Last rows of the frame, to eyeball the packed key values against their source columns.
merge.tail()

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,hour,minute,min5,second,sin_time,cos_time,ip_os_device,ip_os_device_dow_hour,ip_os_device_dow_hour_min5
145713702,9,127,18790464,1,99442,99,13,2017-11-10 14:59:59,True,4,14,59,55,59,-0.707107,-0.707107,994420130001,994420130001414,99442013000141455
145713703,23,153,18790465,1,88046,99,37,2017-11-10 14:59:59,True,4,14,59,55,59,-0.707107,-0.707107,880460370001,880460370001414,88046037000141455
145713704,18,265,18790467,1,81398,99,17,2017-11-10 14:59:59,True,4,14,59,55,59,-0.707107,-0.707107,813980170001,813980170001414,81398017000141455
145713705,27,122,18790466,1,123236,99,13,2017-11-10 14:59:59,True,4,14,59,55,59,-0.707107,-0.707107,1232360130001,1232360130001414,123236013000141455
145713706,12,265,18790468,2,73516,99,27,2017-11-10 14:59:59,True,4,14,59,55,59,-0.707107,-0.707107,735160270002,735160270002414,73516027000241455


### get "ip_os_device_app_encoded"
### get "ip_os_device_channel_encoded"
stringで作ってhashすると時間かかりすぎてアレだったのでLabelEncoderを使用

In [11]:
# Value ranges of `app` and `channel`; both are appended to the ip/os/device
# key below using a 3-digit slot (multiplier 1000).
print('app: ', merge['app'].min(), merge['app'].max())
print('channel: ', merge['channel'].min(), merge['channel'].max())

app:  0 768
channel:  0 500


In [12]:
%%time
le = LabelEncoder()
merge['ip_os_device_app_encoded'] = \
    le.fit_transform(merge.ip_os_device*1000 + merge.app.astype('uint64')).astype('uint32')
del le

CPU times: user 40.9 s, sys: 3.03 s, total: 43.9 s
Wall time: 43.9 s


In [13]:
%%time
le = LabelEncoder()
merge['ip_os_device_channel_encoded'] = \
    le.fit_transform(merge.ip_os_device*1000 + merge.channel.astype('uint64')).astype('uint32')
del le

CPU times: user 41.6 s, sys: 3.04 s, total: 44.7 s
Wall time: 44.7 s


In [14]:
# Largest label in each encoded column; both must stay within uint32 range
# for the casts above to be lossless.
print(merge['ip_os_device_app_encoded'].max())
print(merge['ip_os_device_channel_encoded'].max())

21160759
37126014


In [15]:
# Write the frame, including the key columns added in this notebook, to the
# feather file. NOTE(review): this is the same path the frame was read from,
# so the input file is overwritten in place.
MERGED_FEATHER_PATH = "../data/merge.feather"
merge.to_feather(MERGED_FEATHER_PATH)