In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 
import gc 
from tqdm import tqdm_notebook,tqdm

In [44]:
frm = 0 
to = frm + 1000000
val_size = 10000
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'is_attributed' : 'uint8',
        'click_id'      : 'uint32',
        }
print('loading train data...',frm,to)
train_df = pd.read_csv("../input/train.csv.zip",
                       parse_dates=['click_time'], 
                       skiprows=range(1,frm), 
                       nrows=to-frm, 
                       dtype=dtypes, 
                       usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'])
print('loading test data...')
test_df = pd.read_csv("../input/test.csv.zip", 
                      nrows=100000, 
                      parse_dates=['click_time'], 
                      dtype=dtypes, 
                      usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'])

loading train data... 0 1000000
loading test data...


In [45]:
train_df.dtypes

ip                       uint32
app                      uint16
device                   uint16
os                       uint16
channel                  uint16
click_time       datetime64[ns]
is_attributed             uint8
dtype: object

In [46]:
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed
0,83230,3,1,13,379,2017-11-06 14:32:21,0
1,17357,3,1,19,379,2017-11-06 14:33:34,0
2,35810,3,1,13,379,2017-11-06 14:34:12,0
3,45745,14,1,13,478,2017-11-06 14:34:52,0
4,161007,3,1,13,379,2017-11-06 14:35:08,0


In [47]:
train_df['day'] = train_df['click_time'].dt.day.astype(np.uint8)
train_df['hour'] = train_df['click_time'].dt.hour.astype(np.uint8)

# Features 


1. How many **unique** `channel` by `ip`
2. How many **cumcount** `app` by `ip`,`device`,`os` (person)
3. How many **unique** `hour` by `ip`,`day`
4. How many **unique** `app` by `ip`
5. How many **unique** `app` by `ip`,`os`
6. How many **unique** `device` by `ip`
7. How many **unique** `app` by `channel`
8. How many **cumcount** `os` by `ip` --- why ???? WE NEED IT??
9. How many **unique** `app` by `ip`,`device`,`os`

In [96]:
''.join(groupby)

'ip'

In [139]:
# selcols = ['ip','channel']
# groupby = ['ip']

# aggregator = 'nunique'

def encode_agg_feature(selcols, groupby, aggregator = 'nunique'):
    usecols = [e for e in selcols if e not in  groupby]
    if aggregator == 'nunique':
        gp = train_df[selcols].groupby(groupby)[usecols].nunique().reset_index().\
            rename(columns = {
                usecols[-1] : usecols[-1] + '_nunique_' + '_'.join(groupby)
            })
        df = train_df.merge(gp, how='left', on=groupby)

    elif aggregator == 'cumcount':
        gp = train_df[selcols].groupby(groupby)[usecols].cumcount()
        df = train_df.copy()
        df[usecols[-1] + '_cumcnt_' + '_'.join(groupby)] = gp.values
    
    elif aggregator == 'count':
        gp = train_df[selcols].groupby(groupby)[usecols].count().reset_index().\
            rename(columns = {
                usecols[-1] : usecols[-1] + '_cnt_' + '_'.join(groupby)
            })
        df = train_df.merge(gp, how='left', on=groupby)
        
    return df.iloc[:,-1]

In [129]:
df1 = encode_agg_feature(groupby=['ip'], selcols=['ip','channel'],aggregator='nunique')
df2 = encode_agg_feature(selcols=['app','ip','device','os'], groupby=['ip','device','os'], aggregator='cumcount')
df3 = encode_agg_feature(selcols=['hour','ip','day'], groupby=['ip','day'], aggregator='nunique')
df4 = encode_agg_feature(selcols=['app', 'ip'], groupby=['ip'])
df5 = encode_agg_feature(selcols=['app', 'ip', 'os'], groupby=['ip','os'])
df6 = encode_agg_feature(selcols=['device', 'ip'], groupby=['ip'])
df7 = encode_agg_feature(selcols=['app', 'channel'], groupby=['channel'])
df8 = encode_agg_feature(selcols=['os', 'ip'], groupby=['ip'], aggregator='cumcount')
df9 = encode_agg_feature(selcols=['app', 'ip','device','os'], groupby=['ip','device','os'])


In [137]:
train_df[df1.columns[-1]] = df1.iloc[:,-1]
train_df[df2.columns[-1]] = df2.iloc[:,-1]
train_df[df3.columns[-1]] = df3.iloc[:,-1]
train_df[df4.columns[-1]] = df4.iloc[:,-1]
train_df[df5.columns[-1]] = df5.iloc[:,-1]
train_df[df6.columns[-1]] = df6.iloc[:,-1]
train_df[df7.columns[-1]] = df7.iloc[:,-1]
train_df[df8.columns[-1]] = df8.iloc[:,-1]
train_df[df9.columns[-1]] = df9.iloc[:,-1]
# df1['temp']=df1.iloc[:,-1]

In [138]:
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,day,hour,channel_nunique_ip,app_cumcnt_ip_device_os,hour_nunique_ip_day,app_nunique_ip,app_nunique_ip_os,device_nunique_ip,app_nunique_channel,os_cumcnt_ip,app_nunique_ip_device_os
0,83230,3,1,13,379,2017-11-06 14:32:21,0,6,14,46,0,2,19,16,1,6,0,16
1,17357,3,1,19,379,2017-11-06 14:33:34,0,6,14,42,0,3,20,11,1,6,0,11
2,35810,3,1,13,379,2017-11-06 14:34:12,0,6,14,35,0,2,17,10,3,6,0,10
3,45745,14,1,13,478,2017-11-06 14:34:52,0,6,14,81,0,2,31,24,6,2,0,24
4,161007,3,1,13,379,2017-11-06 14:35:08,0,6,14,1,0,1,1,1,1,6,0,1


## Next Click 

In [145]:
D=2**26
train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
    + "_" + train_df['os'].astype(str)).apply(hash) % D
click_buffer= np.full(D, 3000000000, dtype=np.uint32)

train_df['epochtime']= train_df['click_time'].astype(np.int64) // 10 ** 9
next_clicks= []
for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
    next_clicks.append(click_buffer[category]-t)
    click_buffer[category]= t

In [146]:
QQ= list(reversed(next_clicks))

In [148]:
train_df['nextClick'] = QQ

In [149]:
train_df.head().T

Unnamed: 0,0,1,2,3,4
ip,83230,17357,35810,45745,161007
app,3,3,3,14,3
device,1,1,1,1,1
os,13,19,13,13,13
channel,379,379,379,478,379
click_time,2017-11-06 14:32:21,2017-11-06 14:33:34,2017-11-06 14:34:12,2017-11-06 14:34:52,2017-11-06 14:35:08
is_attributed,0,0,0,0,0
day,6,6,6,6,6
hour,14,14,14,14,14
channel_nunique_ip,46,42,35,81,1


In [81]:
selcols = ['ip','channel']


# unique_channel = train_df[['ip','channel']].groupby('ip')['channel'].nunique().reset_index().\
#     rename(columns={
#         'channel' : 'unique_channel'
#     })
    
# train_df.merge(unique_channel,how='left',on='ip')


0            0
1            0
2            0
3            0
4            0
5            0
6            0
7            0
8            0
9            0
10           0
11           0
12           0
13           0
14           0
15           0
16           0
17           0
18           0
19           0
20           0
21           0
22           0
23           0
24           0
25           0
26           0
27           0
28           0
29           0
          ... 
999970       9
999971      12
999972      26
999973      12
999974      45
999975      76
999976      81
999977    1100
999978      10
999979       1
999980      24
999981       1
999982      47
999983       2
999984       2
999985       7
999986       0
999987       3
999988      16
999989      18
999990      30
999991       3
999992     627
999993     149
999994       8
999995       6
999996       1
999997       0
999998      36
999999       9
Length: 1000000, dtype: int64

In [73]:
selcols = ['ip','device','os','app']
gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].cumcount()    
gp1 = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].count()

In [78]:
gp

ip      device  os 
9       1       13      2
10      1       22      1
20      1       10      1
                13      2
25      1       13      9
                18      1
27      1       13      7
                22     14
31      1       20     11
36      1       10     40
39      0       24      1
        1       13     10
                18      2
45      1       14      3
                47      2
52      1       19      1
59      1       18     19
                19      5
                27      1
        3032    607     2
60      1       10      1
61      1       13      1
63      1       13     14
                18      2
81      1       6       1
                13     12
                19      1
83      1       8       6
85      1       13      1
88      1       3       1
                       ..
212673  1       22      9
212677  1       19      2
212690  1       13     21
                20      1
                23      1
212695  1       20      8
212699  1       16

In [74]:
gp1.reset_index().app.cumsum()

0               2
1               3
2               4
3               6
4              15
5              16
6              23
7              37
8              48
9              88
10             89
11             99
12            101
13            104
14            106
15            107
16            126
17            131
18            132
19            134
20            135
21            136
22            150
23            152
24            153
25            165
26            166
27            172
28            173
29            174
           ...   
133848     999858
133849     999860
133850     999881
133851     999882
133852     999883
133853     999891
133854     999900
133855     999902
133856     999904
133857     999911
133858     999935
133859     999937
133860     999938
133861     999941
133862     999949
133863     999956
133864     999957
133865     999958
133866     999960
133867     999972
133868     999973
133869     999976
133870     999977
133871     999983
133872    

In [10]:
selcols = ['ip','channel']
i = 0
gp = train_df[selcols].groupby(by=selcols[0:len(selcols)-1])[selcols[len(selcols)-1]].nunique().reset_index().\
                    rename(index=str, columns={selcols[len(selcols)-1]: 'X'+str(i)})

In [19]:
train_df['os'].astype(str)

0        13
1        19
2        13
3        13
4        13
5        16
6        23
7        19
8        13
9        22
10       25
11       13
12       19
13       18
14       18
15       19
16       13
17       19
18       13
19       19
20       18
21       18
22        3
23       13
24       17
25       19
26       17
27       13
28       20
29       19
         ..
99970    19
99971     8
99972    14
99973    18
99974     8
99975     9
99976    17
99977    17
99978    27
99979    19
99980    10
99981    19
99982    17
99983     8
99984    13
99985    22
99986    19
99987    15
99988    25
99989    19
99990    13
99991    19
99992    13
99993     1
99994    13
99995    19
99996    13
99997    18
99998    13
99999    20
Name: os, Length: 100000, dtype: object

In [21]:
D = 2**26
train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
+ "_" + train_df['os'].astype(str)).apply(hash) % D

In [22]:
train_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,category
0,83230,3,1,13,379,2017-11-06 14:32:21,0,45840466
1,17357,3,1,19,379,2017-11-06 14:33:34,0,2828707
2,35810,3,1,13,379,2017-11-06 14:34:12,0,41578010
3,45745,14,1,13,478,2017-11-06 14:34:52,0,26340760
4,161007,3,1,13,379,2017-11-06 14:35:08,0,45114231


In [23]:
click_buffer= np.full(D, 3000000000, dtype=np.uint32)

In [28]:
D=2**26
train_df['category'] = (train_df['ip'].astype(str) + "_" + train_df['app'].astype(str) + "_" + train_df['device'].astype(str) \
    + "_" + train_df['os'].astype(str)).apply(hash) % D
click_buffer= np.full(D, 3000000000, dtype=np.uint32)

train_df['epochtime']= train_df['click_time'].astype(np.int64) // 10 ** 9 
next_clicks= []
for category, t in zip(reversed(train_df['category'].values), reversed(train_df['epochtime'].values)):
    next_clicks.append(click_buffer[category]-t)
    click_buffer[category]= t


0    1509978741000000000
1    1509978814000000000
2    1509978852000000000
3    1509978892000000000
4    1509978908000000000
Name: click_time, dtype: int64