In [1]:
import gc
import time
from logzero import logger
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
ls -lah ../data/

total 17G
drwxrwxr-x 5 ubuntu ubuntu 4.0K Apr 30 07:21 ./
drwxrwxr-x 7 ubuntu ubuntu 4.0K Apr 28 16:26 ../
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 16:52 .ipynb_checkpoints/
-rw-r--r-- 1 ubuntu ubuntu 4.8G Apr 30 14:27 click_data.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 29 14:21 kenkoooos/
-rw-r--r-- 1 ubuntu ubuntu  11G Apr 30 14:43 merge.feather
drwxrwxr-x 2 ubuntu ubuntu 4.0K Apr 28 17:13 raw/
-rw-rw-r-- 1 ubuntu ubuntu 1.4G Apr 28 17:20 raw.zip


In [3]:
%time
merge = pd.read_feather("../data/merge.feather", nthreads=4)

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 6.2 µs


In [4]:
# Uncomment the next line to restrict the run to the first 1,000,000 rows.
# merge = merge[:1000000]
# Force a garbage-collection pass after loading; the displayed value is the
# number returned by gc.collect().
gc.collect()

14

In [5]:
# Number of rows with is_test == 0. Summing the boolean mask gives the same
# count as filtering the frame and reading its length, without materialising
# a filtered copy of every column first.
train_size = int((merge.is_test == 0).sum())

In [6]:
# Row/column count of the loaded frame.
merge.shape

(145713707, 21)

In [7]:
# Column dtypes as loaded, before any feature is added.
merge.dtypes

app                                     uint16
channel                                 uint16
click_id                                uint32
device                                  uint16
ip                                      uint32
is_attributed                            uint8
os                                      uint16
click_time                      datetime64[ns]
is_test                                   bool
dow                                      uint8
hour                                     uint8
minute                                   uint8
min5                                     uint8
second                                   uint8
sin_time                               float64
cos_time                               float64
ip_os_device                            uint64
ip_os_device_dow_hour                   uint64
ip_os_device_dow_hour_min5              uint64
ip_os_device_app_encoded                uint32
ip_os_device_channel_encoded            uint32
dtype: object

### count

In [8]:
# Group keys for which a per-key row count is computed in the next cell.
columns = ['ip_os_device_dow_hour', 'ip_os_device_dow_hour_min5']

In [9]:
%%time
for col in columns:
    merge = pd.merge(merge,
                     merge.sort_index().sort_values([col])
                         .groupby(col, as_index=False).ip
                         .count()
                         .rename(columns={"ip": "count_by_" + col})
                    )
    logger.info("{} {}".format(col, merge["count_by_" + col].max()))

[I 180430 14:51:06 <timed exec>:8] ip_os_device_dow_hour 9284
[I 180430 14:53:37 <timed exec>:8] ip_os_device_dow_hour_min5 1620


CPU times: user 5min 3s, sys: 1min 7s, total: 6min 10s
Wall time: 6min 10s


In [10]:
%%time
for col in columns:
    merge["count_by_" + col] = merge["count_by_" + col].astype('uint32')

CPU times: user 568 ms, sys: 592 ms, total: 1.16 s
Wall time: 1.16 s


In [11]:
# Shape and first eight rows after the count_by_* columns were added.
print(merge.shape)
merge.head(8)

(145713707, 23)


Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,...,second,sin_time,cos_time,ip_os_device,ip_os_device_dow_hour,ip_os_device_dow_hour_min5,ip_os_device_app_encoded,ip_os_device_channel_encoded,count_by_ip_os_device_dow_hour,count_by_ip_os_device_dow_hour_min5
0,23,153,99999999,1,76508,0,35,2017-11-07 03:00:00,False,1,...,0,0.707107,0.707107,765080350001,765080350001103,76508035000110300,8759992,16206049,7,7
1,18,134,99999999,1,76508,0,35,2017-11-07 03:02:24,False,1,...,24,0.714473,0.699663,765080350001,765080350001103,76508035000110300,8759990,16206047,7,7
2,15,315,99999999,1,76508,0,35,2017-11-07 03:02:29,False,1,...,29,0.714727,0.699403,765080350001,765080350001103,76508035000110300,8759989,16206054,7,7
3,11,469,99999999,1,76508,0,35,2017-11-07 03:02:32,False,1,...,32,0.71488,0.699248,765080350001,765080350001103,76508035000110300,8759985,16206062,7,7
4,3,379,99999999,1,76508,0,35,2017-11-07 03:02:36,False,1,...,36,0.715083,0.69904,765080350001,765080350001103,76508035000110300,8759984,16206057,7,7
5,2,452,99999999,1,76508,0,35,2017-11-07 03:02:37,False,1,...,37,0.715134,0.698988,765080350001,765080350001103,76508035000110300,8759983,16206061,7,7
6,14,439,99999999,1,76508,0,35,2017-11-07 03:02:38,False,1,...,38,0.715185,0.698936,765080350001,765080350001103,76508035000110300,8759988,16206059,7,7
7,3,280,99999999,1,23907,0,18,2017-11-07 03:00:00,False,1,...,0,0.707107,0.707107,239070180001,239070180001103,23907018000110300,2725266,5029631,20,7


In [12]:
# Column dtypes after the count_by_* columns were added and cast.
merge.dtypes

app                                            uint16
channel                                        uint16
click_id                                       uint32
device                                         uint16
ip                                             uint32
is_attributed                                   uint8
os                                             uint16
click_time                             datetime64[ns]
is_test                                          bool
dow                                             uint8
hour                                            uint8
minute                                          uint8
min5                                            uint8
second                                          uint8
sin_time                                      float64
cos_time                                      float64
ip_os_device                                   uint64
ip_os_device_dow_hour                          uint64
ip_os_device_dow_hour_min5  

### variety of app and channel

In [13]:
# Group keys for which the number of distinct apps/channels is computed next.
columns = ['ip_os_device_dow_hour', 'ip_os_device_dow_hour_min5']

In [14]:
%%time
for col in columns:
    merge = pd.merge(merge,
                     merge[[col, 'app', 'channel']]
                         .groupby(col, as_index=False)
                         .agg({'app':'nunique', 'channel':'nunique'})
                         .rename(columns={"app": "app_variety_" + col, "channel": "chan_variety_" + col}))
    logger.info("{} {}".format(col, merge["app_variety_" + col].max()))
    logger.info("{} {}".format(col, merge["chan_variety_" + col].max()))

[I 180430 14:57:04 <timed exec>:7] ip_os_device_dow_hour 43
[I 180430 14:57:05 <timed exec>:8] ip_os_device_dow_hour 119
[I 180430 15:00:51 <timed exec>:7] ip_os_device_dow_hour_min5 33
[I 180430 15:00:53 <timed exec>:8] ip_os_device_dow_hour_min5 90


CPU times: user 6min 12s, sys: 1min 2s, total: 7min 14s
Wall time: 7min 14s


In [15]:
%%time
for j in columns:
    for k in ['app', 'chan']:
        merge[k + "_variety_" + j] = merge[k + "_variety_" + j].astype('uint16')

CPU times: user 1.36 s, sys: 1.22 s, total: 2.57 s
Wall time: 2.57 s


In [16]:
# Column dtypes after the variety columns were added and cast.
merge.dtypes

app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

### rank

In [17]:
# Group keys for which a running in-group position (rank) is computed next.
columns = ['ip_os_device_dow_hour', 'ip_os_device_dow_hour_min5']

In [18]:
%%time
merge.sort_index(inplace=True)
for col in columns:
    merge['rank_by_' + col] = merge.sort_index().groupby(col, as_index=False).cumcount() + 1
    logger.info("{}".format(col))

[I 180430 15:01:34 <timed exec>:4] ip_os_device_dow_hour
[I 180430 15:02:16 <timed exec>:4] ip_os_device_dow_hour_min5


CPU times: user 1min, sys: 21.1 s, total: 1min 21s
Wall time: 1min 20s


In [19]:
%%time
for j in columns:
    merge["rank_by_" + j] = merge["rank_by_" + j].astype('uint32')

CPU times: user 540 ms, sys: 620 ms, total: 1.16 s
Wall time: 1.16 s


In [20]:
# Column dtypes after the rank columns were added and cast.
merge.dtypes

app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

### previous values / click interval

In [21]:
# Columns needed to look up a neighbouring click: the group key, the in-group
# rank, and the three values carried over from the neighbour.
columns = ['ip_os_device_dow_hour', 'rank_by_ip_os_device_dow_hour', 'click_time', 'app', 'channel']
prev_names = {name: name + "_prev" for name in ("click_time", "app", "channel")}

# Shifting the rank up by one lets the row of rank r be joined (on key + rank)
# to the row of rank r + 1, i.e. each row is offered as the "previous" click
# of its successor.
merge_prev = merge[columns].rename(columns=prev_names)
merge_prev['rank_by_ip_os_device_dow_hour'] += 1

In [22]:
# Peek at the shifted lookup table (ranks now start at 2).
merge_prev.head()

Unnamed: 0,ip_os_device_dow_hour,rank_by_ip_os_device_dow_hour,click_time_prev,app_prev,channel_prev
0,765080350001103,2,2017-11-07 03:00:00,23,153
1,765080350001103,3,2017-11-07 03:02:24,18,134
2,765080350001103,4,2017-11-07 03:02:29,15,315
3,765080350001103,5,2017-11-07 03:02:32,11,469
4,765080350001103,6,2017-11-07 03:02:36,3,379


In [23]:
%%time
merge = pd.merge(merge,
                 merge_prev,
                 on=['ip_os_device_dow_hour', 'rank_by_ip_os_device_dow_hour'],
                 how='left')

del merge_prev
gc.collect()

CPU times: user 1min 57s, sys: 36.7 s, total: 2min 34s
Wall time: 2min 34s


In [24]:
# First rows after the join: the *_prev columns are missing (NaT / NaN) where
# no previous row was matched.
merge.head()

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,...,count_by_ip_os_device_dow_hour_min5,app_variety_ip_os_device_dow_hour,chan_variety_ip_os_device_dow_hour,app_variety_ip_os_device_dow_hour_min5,chan_variety_ip_os_device_dow_hour_min5,rank_by_ip_os_device_dow_hour,rank_by_ip_os_device_dow_hour_min5,click_time_prev,app_prev,channel_prev
0,23,153,99999999,1,76508,0,35,2017-11-07 03:00:00,False,1,...,7,7,7,7,7,1,1,NaT,,
1,18,134,99999999,1,76508,0,35,2017-11-07 03:02:24,False,1,...,7,7,7,7,7,2,2,2017-11-07 03:00:00,23.0,153.0
2,15,315,99999999,1,76508,0,35,2017-11-07 03:02:29,False,1,...,7,7,7,7,7,3,3,2017-11-07 03:02:24,18.0,134.0
3,11,469,99999999,1,76508,0,35,2017-11-07 03:02:32,False,1,...,7,7,7,7,7,4,4,2017-11-07 03:02:29,15.0,315.0
4,3,379,99999999,1,76508,0,35,2017-11-07 03:02:36,False,1,...,7,7,7,7,7,5,5,2017-11-07 03:02:32,11.0,469.0


In [25]:
%%time
merge['app_prev'] = merge.app_prev.fillna(9999).astype('uint16')
merge['app_prev'] = merge.app_prev.where((merge.app == merge.app_prev) | (merge.app_prev == 9999), 8888)

CPU times: user 2.54 s, sys: 1.43 s, total: 3.96 s
Wall time: 3.96 s


In [26]:
%%time
merge['channel_prev'] = merge.channel_prev.fillna(9999).astype('uint16')
merge['channel_prev'] = merge.channel_prev.where((merge.channel == merge.channel_prev) | (merge.channel_prev == 9999), 8888)

CPU times: user 2.32 s, sys: 1.12 s, total: 3.44 s
Wall time: 3.44 s


In [27]:
%%time
merge['interval_prev'] = (merge.click_time - merge.click_time_prev).dt.seconds.fillna(9999).astype('uint16')

CPU times: user 16min 37s, sys: 5.45 s, total: 16min 42s
Wall time: 16min 42s


In [28]:
# Rows whose app_prev was not replaced by the 8888 sentinel, i.e. rows with no
# previous click (9999) or whose previous click used the same app.
merge[merge.app_prev != 8888].head()

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,...,app_variety_ip_os_device_dow_hour,chan_variety_ip_os_device_dow_hour,app_variety_ip_os_device_dow_hour_min5,chan_variety_ip_os_device_dow_hour_min5,rank_by_ip_os_device_dow_hour,rank_by_ip_os_device_dow_hour_min5,click_time_prev,app_prev,channel_prev,interval_prev
0,23,153,99999999,1,76508,0,35,2017-11-07 03:00:00,False,1,...,7,7,7,7,1,1,NaT,9999,9999,9999
7,3,280,99999999,1,23907,0,18,2017-11-07 03:00:00,False,1,...,6,7,3,2,1,1,NaT,9999,9999,9999
21,12,259,99999999,1,23907,0,18,2017-11-07 03:25:53,False,1,...,6,7,2,2,15,1,2017-11-07 03:12:55,12,8888,778
27,12,265,99999999,1,81489,0,17,2017-11-07 03:00:00,False,1,...,2,2,2,2,1,1,NaT,9999,9999,9999
29,20,259,99999999,1,123642,0,22,2017-11-07 03:00:00,False,1,...,6,7,6,7,1,1,NaT,9999,9999,9999


In [29]:
# Column dtypes after the previous-click features were added.
merge.dtypes

app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

### subsequent values / click interval

In [30]:
# Same lookup as for the previous click, mirrored: reuses `columns` from the
# previous-click section and shifts the rank down by one so that the row of
# rank r is joined to the row of rank r - 1, i.e. each row is offered as the
# "next" click of its predecessor.
next_names = {name: name + "_next" for name in ("click_time", "app", "channel")}

merge_next = merge[columns].rename(columns=next_names)
merge_next['rank_by_ip_os_device_dow_hour'] -= 1

In [31]:
# Peek at the shifted lookup table (ranks now start at 0).
merge_next.head()

Unnamed: 0,ip_os_device_dow_hour,rank_by_ip_os_device_dow_hour,click_time_next,app_next,channel_next
0,765080350001103,0,2017-11-07 03:00:00,23,153
1,765080350001103,1,2017-11-07 03:02:24,18,134
2,765080350001103,2,2017-11-07 03:02:29,15,315
3,765080350001103,3,2017-11-07 03:02:32,11,469
4,765080350001103,4,2017-11-07 03:02:36,3,379


In [32]:
%%time
merge = pd.merge(merge,
                 merge_next,
                 on=['ip_os_device_dow_hour', 'rank_by_ip_os_device_dow_hour'],
                 how='left')
del merge_next
gc.collect()

CPU times: user 2min 7s, sys: 41.8 s, total: 2min 49s
Wall time: 2min 49s


In [33]:
%%time
merge['app_next'] = merge.app_next.fillna(9999).astype('uint16')
merge['app_next'] = merge.app_next.where((merge.app == merge.app_next) | (merge.app_next == 9999), 8888)

CPU times: user 2.61 s, sys: 1.36 s, total: 3.97 s
Wall time: 3.97 s


In [34]:
%%time
merge['channel_next'] = merge.channel_next.fillna(9999).astype('uint16')
merge['channel_next'] = merge.channel_next.where((merge.channel == merge.channel_next) | (merge.channel_next == 9999), 8888)

CPU times: user 2.37 s, sys: 1.08 s, total: 3.46 s
Wall time: 3.45 s


In [35]:
%%time
merge['interval_next'] = (merge.click_time - merge.click_time_next).dt.seconds.fillna(9999).astype('uint16')

CPU times: user 17min 2s, sys: 7.79 s, total: 17min 10s
Wall time: 17min 9s


In [36]:
# Spot-check four consecutive rows (positions 37 to 40) with all prev/next features.
merge.iloc[37:41]

Unnamed: 0,app,channel,click_id,device,ip,is_attributed,os,click_time,is_test,dow,...,rank_by_ip_os_device_dow_hour,rank_by_ip_os_device_dow_hour_min5,click_time_prev,app_prev,channel_prev,interval_prev,click_time_next,app_next,channel_next,interval_next
37,9,258,99999999,1,123642,0,22,2017-11-07 03:00:04,False,1,...,9,9,2017-11-07 03:00:04,8888,8888,0,2017-11-07 03:00:04,8888,8888,0
38,24,178,99999999,1,123642,0,22,2017-11-07 03:00:04,False,1,...,10,10,2017-11-07 03:00:04,8888,8888,0,2017-11-07 03:00:04,8888,8888,0
39,8,259,99999999,1,123642,0,22,2017-11-07 03:00:04,False,1,...,11,11,2017-11-07 03:00:04,8888,8888,0,2017-11-07 03:00:04,8888,8888,0
40,9,442,99999999,1,123642,0,22,2017-11-07 03:00:04,False,1,...,12,12,2017-11-07 03:00:04,8888,8888,0,2017-11-07 03:00:04,8888,8888,0


In [None]:
# Final shape and column dtypes before the frame is written out.
print(merge.shape)
merge.dtypes

(145713707, 37)


app                                                uint16
channel                                            uint16
click_id                                           uint32
device                                             uint16
ip                                                 uint32
is_attributed                                       uint8
os                                                 uint16
click_time                                 datetime64[ns]
is_test                                              bool
dow                                                 uint8
hour                                                uint8
minute                                              uint8
min5                                                uint8
second                                              uint8
sin_time                                          float64
cos_time                                          float64
ip_os_device                                       uint64
ip_os_device_d

In [None]:
# Persist the enriched frame.
# NOTE(review): this is the same path the notebook loads `merge` from, so the
# input file is overwritten and a re-run from the top would start from the
# already-augmented data — consider writing to a separate output path.
merge.to_feather("../data/merge.feather")