In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import gc 
import time 
from tqdm import tqdm_notebook


plt.figure(figsize=(12,9))
%matplotlib inline 


In [2]:
# Load the train/test frames from the HDF5 store and report how long the read took.
t1 = time.time()
# mode='r': the store is only read here. The default mode ('a') would create an
# empty file when the path is wrong and then fail later with a confusing KeyError.
with pd.HDFStore('../input/raw_data.h5', mode='r') as store:
    print(store.keys())
    train_df = store['train_df']
    test_df = store['test_df']
t2 = time.time()

print('loading time: {:.1f} sec'.format(t2-t1))

['/test_df', '/train_df']
loading time: 7.7 sec


* add `hour` and `day` columns derived from `click_time`

In [3]:
# Split the click timestamp into hour-of-day and day-of-month, stored as uint8
# to keep the frame small.
click_ts = train_df['click_time'].dt
train_df['hour'] = click_ts.hour.astype('uint8')
train_df['day'] = click_ts.day.astype('uint8')
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day
0,83230,3,1,13,379,2017-11-06 14:32:21,0,14,6
1,17357,3,1,19,379,2017-11-06 14:33:34,0,14,6
2,35810,3,1,13,379,2017-11-06 14:34:12,0,14,6
3,45745,14,1,13,478,2017-11-06 14:34:52,0,14,6
4,161007,3,1,13,379,2017-11-06 14:35:08,0,14,6


In [4]:
# Prototype features on an independent copy of the first 1,000,000 training rows.
subtrn_df = train_df.head(1000000).copy()

In [5]:
# Summarise column dtypes, non-null counts and memory usage of the subsample.
subtrn_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000000 entries, 0 to 999999
Data columns (total 9 columns):
ip               1000000 non-null uint32
app              1000000 non-null uint16
device           1000000 non-null uint16
os               1000000 non-null uint16
channel          1000000 non-null uint16
click_time       1000000 non-null datetime64[ns]
is_attributed    1000000 non-null uint8
hour             1000000 non-null uint8
day              1000000 non-null uint8
dtypes: datetime64[ns](1), uint16(4), uint32(1), uint8(3)
memory usage: 29.6 MB


# Features
* click count by (ip, app, device, os, channel, day, hour)

In [6]:
# Preview the first rows of the subsample before adding features.
subtrn_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day
0,83230,3,1,13,379,2017-11-06 14:32:21,0,14,6
1,17357,3,1,19,379,2017-11-06 14:33:34,0,14,6
2,35810,3,1,13,379,2017-11-06 14:34:12,0,14,6
3,45745,14,1,13,478,2017-11-06 14:34:52,0,14,6
4,161007,3,1,13,379,2017-11-06 14:35:08,0,14,6


In [7]:
groupby_cols = ['ip','app','device','os','channel','day','hour']

# Number of clicks that share the same (ip, app, device, os, channel, day, hour).
clicks = (
    subtrn_df.groupby(groupby_cols)['click_time']
    .count()
    .reset_index()
    .rename(columns={'click_time': 'click_cnt'})
)

# Attach the count to every row. A click_cnt left over from a previous run is
# dropped first, so re-executing this cell does not produce click_cnt_x /
# click_cnt_y. The aggregate is not sorted: a left merge keeps the row order of
# subtrn_df regardless of the order of `clicks`.
subtrn_df = (
    subtrn_df.drop(columns=['click_cnt'], errors='ignore')
    .merge(clicks, how='left', on=groupby_cols)
)


In [8]:
# How many rows fall into each click_cnt value.
subtrn_df['click_cnt'].value_counts()

1      458047
2      165146
3       91209
4       51436
6       38532
5       30650
7       17332
8       14952
9       13617
10       9960
11       8547
12       7428
13       6045
15       5400
14       4704
16       4416
17       3944
18       3600
19       2812
24       2592
20       2380
21       2310
22       2266
33       2112
31       2077
23       2047
26       1898
27       1809
25       1750
30       1470
        ...  
144       144
143       143
142       142
140       140
137       137
135       135
134       134
133       133
129       129
122       122
121       121
117       117
115       115
113       113
112       112
111       111
108       108
107       107
105       105
101       101
95         95
92         92
91         91
90         90
89         89
88         88
85         85
82         82
78         78
61         61
Name: click_cnt, Length: 141, dtype: int64

In [9]:
# Total is_attributed per click_cnt value.
subtrn_df.groupby('click_cnt').is_attributed.agg('sum')

click_cnt
1      1254.0
2       257.0
3        95.0
4        33.0
5        18.0
6        14.0
7         3.0
8         4.0
9         4.0
10        0.0
11        1.0
12        0.0
13        2.0
14        0.0
15        1.0
16        0.0
17        0.0
18        2.0
19        0.0
20        0.0
21        0.0
22        0.0
23        0.0
24        0.0
25        0.0
26        1.0
27        0.0
28        0.0
29        0.0
30        0.0
        ...  
129       0.0
133       0.0
134       0.0
135       0.0
137       0.0
140       0.0
142       0.0
143       0.0
144       0.0
147       0.0
150       0.0
153       0.0
157       0.0
159       0.0
165       0.0
167       0.0
168       0.0
172       0.0
183       0.0
187       0.0
193       0.0
198       0.0
201       0.0
210       0.0
218       0.0
221       0.0
222       0.0
232       0.0
244       0.0
248       0.0
Name: is_attributed, Length: 141, dtype: float64

* previous and second-previous click time, and the time elapsed since each, by (app, ip, os, device)

In [10]:
# Within each (app, ip, os, device) group, look one and two rows back to get the
# preceding click times, then compute how long before the current click each
# of them happened. Rows without such a predecessor get NaT.
# NOTE(review): shift() follows row order, so this presumes subtrn_df is already
# ordered by click_time; confirm.
click_times = subtrn_df.groupby(['app','ip','os','device'])['click_time']
lagged = [('prev', click_times.shift(1)), ('prev2', click_times.shift(2))]

# Timestamps first, then the differences, so the column order stays
# prev/prev2 time followed by prev/prev2 diff.
for prefix, earlier in lagged:
    subtrn_df[prefix + '_click_time'] = earlier
for prefix, earlier in lagged:
    subtrn_df[prefix + '_click_diff'] = subtrn_df['click_time'] - earlier

subtrn_df.tail(5)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day,click_cnt,prev_click_time,prev2_click_time,prev_click_diff,prev2_click_diff
999995,29748,9,1,12,134,2017-11-06 16:21:51,0,16,6,1,NaT,NaT,NaT,NaT
999996,124520,12,1,15,178,2017-11-06 16:21:51,0,16,6,1,NaT,NaT,NaT,NaT
999997,206446,18,1,42,107,2017-11-06 16:21:51,0,16,6,1,NaT,NaT,NaT,NaT
999998,167577,12,1,13,265,2017-11-06 16:21:51,0,16,6,1,2017-11-06 16:12:11,NaT,00:09:40,NaT
999999,121848,24,1,19,105,2017-11-06 16:21:51,0,16,6,1,NaT,NaT,NaT,NaT


In [13]:
# Inspect every click of one example (ip, app, os, device) combination.
# (To browse all rows that have a previous click instead, filter on
# ~subtrn_df.prev_click_diff.isnull().)
mask_ = pd.Series(True, index=subtrn_df.index)
for col, val in [('ip', 108942), ('app', 3), ('os', 19), ('device', 1)]:
    mask_ &= subtrn_df[col] == val
subtrn_df[mask_]

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day,click_cnt,prev_click_time,prev2_click_time,prev_click_diff,prev2_click_diff
52,108942,3,1,19,379,2017-11-06 15:04:28,0,15,6,4,NaT,NaT,NaT,NaT
190,108942,3,1,19,379,2017-11-06 15:45:18,0,15,6,4,2017-11-06 15:04:28,NaT,00:40:50,NaT
205,108942,3,1,19,379,2017-11-06 15:46:12,0,15,6,4,2017-11-06 15:45:18,2017-11-06 15:04:28,00:00:54,00:41:44
271,108942,3,1,19,379,2017-11-06 15:49:03,0,15,6,4,2017-11-06 15:46:12,2017-11-06 15:45:18,00:02:51,00:03:45
85389,108942,3,1,19,130,2017-11-06 16:01:31,0,16,6,1,2017-11-06 15:49:03,2017-11-06 15:46:12,00:12:28,00:15:19
245676,108942,3,1,19,280,2017-11-06 16:04:48,0,16,6,4,2017-11-06 16:01:31,2017-11-06 15:49:03,00:03:17,00:15:45
299694,108942,3,1,19,135,2017-11-06 16:05:45,0,16,6,2,2017-11-06 16:04:48,2017-11-06 16:01:31,00:00:57,00:04:14
300432,108942,3,1,19,115,2017-11-06 16:05:45,0,16,6,2,2017-11-06 16:05:45,2017-11-06 16:04:48,00:00:00,00:00:57
300972,108942,3,1,19,417,2017-11-06 16:05:46,0,16,6,1,2017-11-06 16:05:45,2017-11-06 16:05:45,00:00:01,00:00:01
340228,108942,3,1,19,135,2017-11-06 16:06:30,0,16,6,2,2017-11-06 16:05:46,2017-11-06 16:05:45,00:00:44,00:00:45


* cumulative click count by (ip, app, os, device, hour, day)

In [18]:
# 0-based running index of each click within its (ip, app, os, device, hour, day) group.
gp = subtrn_df.groupby(['ip','app','os','device','hour','day'])
subtrn_df['cumcnt'] = gp.click_time.cumcount()

# Show a sample of rows that are not the first click of their group.
# The filter must be the mask built here (`mask_`); a bare `mask` is not
# defined anywhere in the notebook and fails on a fresh kernel.
mask_ = subtrn_df.cumcnt != 0
subtrn_df[mask_].head(10)

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day,click_cnt,prev_click_time,prev2_click_time,prev_click_diff,prev2_click_diff,cumcnt
74,89489,3,1,13,379,2017-11-06 15:20:38,0,15,6,2,2017-11-06 15:13:23,NaT,00:07:15,NaT,1
90,109591,3,1,19,379,2017-11-06 15:39:16,0,15,6,3,2017-11-06 15:30:16,NaT,00:09:00,NaT,1
146,53964,3,1,18,379,2017-11-06 15:42:38,0,15,6,2,2017-11-06 15:41:39,NaT,00:00:59,NaT,1
190,108942,3,1,19,379,2017-11-06 15:45:18,0,15,6,4,2017-11-06 15:04:28,NaT,00:40:50,NaT,1
196,84774,3,1,13,379,2017-11-06 15:45:32,0,15,6,3,2017-11-06 15:44:55,NaT,00:00:37,NaT,1
200,42102,3,1,13,379,2017-11-06 15:45:45,0,15,6,2,2017-11-06 15:39:47,NaT,00:05:58,NaT,1
203,109591,3,1,19,379,2017-11-06 15:46:02,0,15,6,3,2017-11-06 15:39:16,2017-11-06 15:30:16,00:06:46,00:15:46,2
205,108942,3,1,19,379,2017-11-06 15:46:12,0,15,6,4,2017-11-06 15:45:18,2017-11-06 15:04:28,00:00:54,00:41:44,2
208,108881,3,1,13,379,2017-11-06 15:46:15,0,15,6,4,2017-11-06 15:41:58,NaT,00:04:17,NaT,1
213,76683,3,1,13,379,2017-11-06 15:46:25,0,15,6,2,2017-11-06 15:46:22,NaT,00:00:03,NaT,1


# CV loop
Stratified 5-fold cross-validation split over the full training data.

In [9]:
from sklearn.model_selection import StratifiedKFold

# Stratify the folds on the target, without shuffling. random_state is
# deliberately not passed: it only has an effect together with shuffle=True,
# and recent scikit-learn versions raise a ValueError when it is set without it.
stf = StratifiedKFold(n_splits=5)

# Print the validation indices of each fold to check how the folds are laid out.
for trn_idx, val_idx in stf.split(X=train_df, y=train_df.is_attributed):
    print(val_idx)

[       0        1        2 ..., 36983273 36983274 36983275]
[36243525 36243796 36244437 ..., 74683566 74683569 74683930]
[ 73959321  73959322  73959323 ..., 110950996 110950997 110950998]
[107487770 107487834 107488225 ..., 148717424 148717443 148718406]
[147921222 147921223 147921224 ..., 184903887 184903888 184903889]


In [10]:
# Drop the index arrays left over from the last fold and force a garbage-collection pass.
del trn_idx,val_idx
gc.collect()

1682