In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
%matplotlib inline 
import gc 
from tqdm import tqdm_notebook,tqdm

In [2]:
# Run configuration. Set `debug` to a truthy value for a small, fast dry run.
debug = 0

# Row count of the training file (presumably the CSV line count minus the
# header line -- TODO confirm) or a small stand-in while debugging.
nrows = 100000 if debug else 184903891 - 1
# Number of training rows read in one go.
nchunk = 400000 if debug else 40000000
# Number of rows at the end of the chunk held out for validation.
val_size = 25000 if debug else 2500000
# The chunk covers training rows [frm, to): it starts a fixed distance before
# the end of the data and spans `nchunk` rows.
frm = nrows - (75000 if debug else 75000000)
to = frm + nchunk

In [3]:
# Narrow integer dtypes so the raw columns take as little memory as possible.
dtypes = dict(
    ip='uint32',
    app='uint16',
    device='uint16',
    os='uint16',
    channel='uint16',
    is_attributed='uint8',
    click_id='uint32',
)

print('loading train data...',frm,to)
# skiprows=range(1,frm) drops file lines 1..frm-1 and keeps line 0 (the header);
# nrows then limits the read to the to-frm rows of the chunk.
train_df = pd.read_csv(
    "../input/train.csv.zip",
    usecols=['ip','app','device','os', 'channel', 'click_time', 'is_attributed'],
    dtype=dtypes,
    parse_dates=['click_time'],
    skiprows=range(1,frm),
    nrows=to-frm,
)

print('loading test data...')
# The test file is read in full (pass nrows=100000 here for a quick experiment).
test_df = pd.read_csv(
    "../input/test.csv.zip",
    usecols=['ip','app','device','os', 'channel', 'click_time', 'click_id'],
    dtype=dtypes,
    parse_dates=['click_time'],
)

loading train data... 109903890 149903890
loading test data...


In [4]:
# Remember where the training rows end so the frame can be split again later.
len_train = len(train_df)

# Stack test below train so every feature is computed over both at once.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. As with append, the original row labels are
# kept (no ignore_index), so the combined index contains duplicated labels --
# later cells must therefore address rows by position, not by label.
train_df = pd.concat([train_df, test_df])

# The standalone test frame is now redundant; release its memory.
del test_df
gc.collect()

14

In [5]:
# Preview the first rows of the stacked train+test frame.
train_df.head()

Unnamed: 0,app,channel,click_id,click_time,device,ip,is_attributed,os
0,23,153,,2017-11-08 12:33:34,1,103202,0.0,16
1,3,280,,2017-11-08 12:33:34,1,143414,0.0,19
2,12,178,,2017-11-08 12:33:34,1,173096,0.0,17
3,3,280,,2017-11-08 12:33:34,1,8210,0.0,42
4,3,130,,2017-11-08 12:33:34,1,5746,0.0,19


In [6]:
# Split the click timestamp into day-of-month and hour-of-day; both fit in uint8.
click_dt = train_df['click_time'].dt
train_df['day'] = click_dt.day.astype(np.uint8)
train_df['hour'] = click_dt.hour.astype(np.uint8)

# Features 


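1. Number of **unique** `channel` values per `ip`
2. Running click count (**cumcount**) within each (`ip`, `device`, `os`) group, i.e. per "person"; the column is named after `app`
3. Number of **unique** `hour` values per (`ip`, `day`)
4. Number of **unique** `app` values per `ip`
5. Number of **unique** `app` values per (`ip`, `os`)
6. Number of **unique** `device` values per `ip`
7. Number of **unique** `app` values per `channel`
8. Running click count (**cumcount**) within each `ip`; the column is named after `os`. **TODO:** it is unclear why this feature is needed — decide whether to keep it.
9. Number of **unique** `app` values per (`ip`, `device`, `os`)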

In [7]:
def encode_agg_feature(selcols, groupby, aggregator = 'nunique', df = None):
    """Compute one group-wise feature, aligned row-for-row with the input frame.

    Parameters
    ----------
    selcols : list of str
        Columns taking part: the ``groupby`` keys plus the column(s) to
        aggregate. The last non-key column gives the feature its name.
    groupby : list of str
        Key columns that define the groups.
    aggregator : {'nunique', 'cumcount', 'count', 'var', 'mean'}
        Statistic to compute per group. ``'cumcount'`` numbers the rows inside
        each group (0, 1, 2, ...); the others yield one value per group that is
        broadcast back to every row of that group.
    df : pandas.DataFrame, optional
        Frame to compute on. Defaults to the notebook-level ``train_df``.

    Returns
    -------
    pandas.Series
        One value per row of ``df``, in row order, named
        ``<target>_<tag>_<keys joined by '_'>`` with tag one of
        ``nunique``, ``cumcnt``, ``cnt``, ``var``, ``mean``.

    Raises
    ------
    ValueError
        If ``aggregator`` is not supported, or if ``selcols`` holds no column
        besides the ``groupby`` keys.
    """
    if df is None:
        df = train_df

    # Tag embedded in the feature name for each supported aggregator.
    name_tags = {
        'nunique': 'nunique',
        'cumcount': 'cumcnt',
        'count': 'cnt',
        'var': 'var',
        'mean': 'mean',
    }
    if aggregator not in name_tags:
        raise ValueError(
            'unsupported aggregator {!r}; expected one of {}'.format(aggregator, sorted(name_tags))
        )

    usecols = [e for e in selcols if e not in groupby]
    if not usecols:
        raise ValueError('selcols must contain at least one column that is not a groupby key')

    feature_name = usecols[-1] + '_' + name_tags[aggregator] + '_' + '_'.join(groupby)
    grouped = df[selcols].groupby(groupby)[usecols]

    if aggregator == 'cumcount':
        # cumcount is already one value per row (indexed like df), so no merge is needed.
        return grouped.cumcount().rename(feature_name)

    # Aggregator names are passed as strings so pandas uses its own groupby
    # implementations (e.g. sample variance for 'var').
    gp = grouped.agg(aggregator).reset_index().rename(columns = {usecols[-1] : feature_name})

    # Broadcast the per-group values back to the rows. Only the key columns are
    # merged: this avoids copying the whole frame for every feature and avoids
    # '_x'/'_y' suffixes when the feature column already exists in df.
    return df[groupby].merge(gp, how='left', on=groupby).iloc[:, -1]

In [8]:
# Features 1-9, one spec per entry: (message, selcols, groupby keys, aggregator).
# The message is printed before the computation and the resulting column name after it.
feature_specs = [
    ('feature1 : number of unique "channel" by ip', ['ip','channel'], ['ip'], 'nunique'),
    ('feature2 : cumcount of "app" by (ip,device,os)', ['app','ip','device','os'], ['ip','device','os'], 'cumcount'),
    ('feature3 : number of unique "hour" by (ip, day)', ['hour','ip','day'], ['ip','day'], 'nunique'),
    ('feature4 : number of unique "app" by ip', ['app', 'ip'], ['ip'], 'nunique'),
    ('feature5 : number of unique "app" by (ip,os)', ['app', 'ip', 'os'], ['ip','os'], 'nunique'),
    ('feature6 : number of unique "device" by ip', ['device', 'ip'], ['ip'], 'nunique'),
    ('feature7 : number of unique "app" by channel', ['app', 'channel'], ['channel'], 'nunique'),
    ('feature8 : cumcount of "os" by ip ', ['os', 'ip'], ['ip'], 'cumcount'),
    ('feature9 : number of unique "app" by (ip,device,os)', ['app', 'ip','device','os'], ['ip','device','os'], 'nunique'),
]

results = []
for message, spec_cols, spec_keys, spec_agg in feature_specs:
    print(message, end='\t')
    feature = encode_agg_feature(selcols=spec_cols, groupby=spec_keys, aggregator=spec_agg)
    print(feature.name)
    results.append(feature)

f1, f2, f3, f4, f5, f6, f7, f8, f9 = results

# Drop the helper names so f1..f9 remain the only references to the Series.
del feature_specs, results, message, spec_cols, spec_keys, spec_agg, feature


feature1 : number of unique "channel" by ip	channel_nunique_ip
feature2 : cumcount of "app" by (ip,device,os)	app_cumcnt_ip_device_os
feature3 : number of unique "hour" by (ip, day)	hour_nunique_ip_day
feature4 : number of unique "app" by ip	app_nunique_ip
feature5 : number of unique "app" by (ip,os)	app_nunique_ip_os
feature6 : number of unique "device" by ip	device_nunique_ip
feature7 : number of unique "app" by channel	app_nunique_channel
feature8 : cumcount of "os" by ip 	os_cumcnt_ip
feature9 : number of unique "app" by (ip,device,os)	app_nunique_ip_device_os


In [9]:
# Attach each computed feature under its own name. `.values` makes the
# assignment positional, so the Series' index is ignored.
for feat in (f1, f2, f3, f4, f5, f6, f7, f8, f9):
    train_df[feat.name] = feat.values
del feat  # do not keep an extra reference to the last Series

In [10]:
# List the columns now present, to confirm the new feature columns were attached.
train_df.columns

Index(['app', 'channel', 'click_id', 'click_time', 'device', 'ip',
       'is_attributed', 'os', 'day', 'hour', 'channel_nunique_ip',
       'app_cumcnt_ip_device_os', 'hour_nunique_ip_day', 'app_nunique_ip',
       'app_nunique_ip_os', 'device_nunique_ip', 'app_nunique_channel',
       'os_cumcnt_ip', 'app_nunique_ip_device_os'],
      dtype='object')

In [11]:
# Preview the first rows transposed, so every column reads as one line.
train_df.head().T

Unnamed: 0,0,1,2,3,4
app,23,3,12,3,3
channel,153,280,178,280,130
click_id,,,,,
click_time,2017-11-08 12:33:34,2017-11-08 12:33:34,2017-11-08 12:33:34,2017-11-08 12:33:34,2017-11-08 12:33:34
device,1,1,1,1,1
ip,103202,143414,173096,8210,5746
is_attributed,0,0,0,0,0
os,16,19,17,42,19
day,8,8,8,8,8
hour,12,12,12,12,12


## count 

- feature 10 : count by (ip,day,hour)
- feature 11 : count by (ip,app)
- feature 12 : count by (ip,app,os)

In [12]:
# Count features 10-12: (message, selcols, groupby keys). `channel` is only the
# column being counted; it ends up in the feature name.
count_specs = [
    ('feature 10 : count by (ip,day,hour)', ['ip','day','hour','channel'], ['ip','day','hour']),
    ('feature 11 : count by (ip,app)', ['ip','app','channel'], ['ip','app']),
    ('feature 12 : count by (ip,app,os)', ['ip','app','channel','os'], ['ip','app','os']),
]

count_results = []
for message, spec_cols, spec_keys in count_specs:
    print(message, end='\t')
    feature = encode_agg_feature(selcols=spec_cols, groupby=spec_keys, aggregator='count')
    print(feature.name)
    count_results.append(feature)

f10, f11, f12 = count_results

# Drop the helper names so f10..f12 remain the only references to the Series.
del count_specs, count_results, message, spec_cols, spec_keys, feature

feature 10 : count by (ip,day,hour)	channel_cnt_ip_day_hour
feature 11 : count by (ip,app)	channel_cnt_ip_app
feature 12 : count by (ip,app,os)	channel_cnt_ip_app_os


## var, mean hour

- feature 13 : var of hour by (ip,day,channel)
- feature 14 : var of hour by (ip,app,os)
- feature 15 : var of day by (ip,app,channel)
- feature 16 : mean of hour by (ip,app,channel)

In [13]:
# Variance / mean features 13-16: (message, selcols, groupby keys, aggregator).
# The aggregated column is the one in selcols that is not a groupby key.
moment_specs = [
    ('feature 13: var of hour by (ip,day,channel)', ['ip','day','channel','hour'], ['ip','day','channel'], 'var'),
    ('feature 14: var of hour by (ip,app,os)', ['ip','app','os','hour'], ['ip','app','os'], 'var'),
    ('feature 15: var of day by (ip,app,channel)', ['ip','app','channel','day'], ['ip','app','channel'], 'var'),
    ('feature 16: mean of hour by (ip,app,channel)', ['ip','app','channel','hour'], ['ip','app','channel'], 'mean'),
]

moment_results = []
for message, spec_cols, spec_keys, spec_agg in moment_specs:
    print(message, end='\t')
    feature = encode_agg_feature(selcols=spec_cols, groupby=spec_keys, aggregator=spec_agg)
    print(feature.name)
    moment_results.append(feature)

f13, f14, f15, f16 = moment_results

# Drop the helper names so f13..f16 remain the only references to the Series.
del moment_specs, moment_results, message, spec_cols, spec_keys, spec_agg, feature


feature 13: var of hour by (ip,day,channel)	hour_var_ip_day_channel
feature 14: var of hour by (ip,app,os)	hour_var_ip_app_os
feature 15: var of day by (ip,app,channel)	day_var_ip_app_channel
feature 16: mean of hour by (ip,app,channel)	hour_mean_ip_app_channel


In [14]:
# Attach the count / variance / mean features under their own names.
# `.values` makes the assignment positional, so the Series' index is ignored.
for feat in (f10, f11, f12, f13, f14, f15, f16):
    train_df[feat.name] = feat.values
del feat  # do not keep an extra reference to the last Series

In [15]:
# The standalone feature Series are no longer needed once their values have
# been copied into train_df; drop the names so the memory can be reclaimed.
del f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f12,f13,f14,f15,f16

In [16]:
# Run a garbage-collection pass right after deleting the feature Series.
gc.collect()

562

# Next Click 

In [17]:
# Stand-in "next click" timestamp for the last click of a group: its nextClick
# becomes NO_NEXT_CLICK_EPOCH - epochtime, the same sentinel as the previous
# buffer-based implementation used.
NO_NEXT_CLICK_EPOCH = 3000000000

# One integer id per distinct (ip, app, device, os) combination.
# This replaces hash("ip_app_device_os") % 2**26:
#   * Python salts str hashes per process, so those ids changed between kernels;
#   * the modulo let unrelated combinations collide and share a "next click";
#   * building and hashing one string per row is very slow.
# ngroup() numbers the groups in sorted key order, so the ids are reproducible
# and collision-free. `.values` keeps the assignment positional.
train_df['category'] = train_df.groupby(['ip', 'app', 'device', 'os']).ngroup().values

# Click time as whole seconds since the epoch.
train_df['epochtime']= train_df['click_time'].astype(np.int64) // 10 ** 9 ## pretty slow

# Timestamp of the following row of the same category, in row order
# (NaN for the last row of each category), computed without a Python loop.
following_epoch = train_df.groupby('category')['epochtime'].shift(-1)

# Seconds until the next click of the same category. Plain arrays are used so
# the subtraction is positional; next_clicks is a numpy int64 array, one entry per row.
next_clicks = (
    following_epoch.fillna(NO_NEXT_CLICK_EPOCH).values.astype(np.int64)
    - train_df['epochtime'].values
)
del following_epoch
# train_df['nextClick'] = next_clicks





In [18]:
### time consuming ###
# selcols = ['ip','device','os','app','click_time']
# groupby = ['ip','device','os','app']
# temp_df[selcols].groupby(groupby).click_time.transform(lambda x:x.diff().shift(-1)).dt.seconds

In [19]:
train_df['nextClick'] = next_clicks
# Previous row's nextClick (NaN for the first row). `.values` is required:
# train_df carries duplicated index labels (train rows and test rows both start
# at 0), so assigning the Series itself would align by label and hand the test
# rows the values belonging to the first train rows.
train_df['nextClick_shift1'] = pd.Series(next_clicks).shift(1).values

### dtypes 

In [20]:
print("vars and data type: ")
train_df.info()
print('downcast to uint8,16, float16,32')


def _shrink_unsigned(series, preferred):
    """Cast `series` to `preferred`, widening when the values do not fit.

    A plain ``astype('uint8')`` / ``astype('uint16')`` wraps around silently
    when a value exceeds the target range. This helper tries `preferred` first
    and then each wider unsigned dtype, printing a note whenever it has to
    deviate. A series containing negative values (or no values) is returned
    unchanged.
    """
    low, high = series.min(), series.max()
    min_width = np.dtype(preferred).itemsize
    for candidate in ('uint8', 'uint16', 'uint32', 'uint64'):
        if np.dtype(candidate).itemsize < min_width:
            continue
        if low >= 0 and high <= np.iinfo(candidate).max:
            if candidate != preferred:
                print('{}: max {} does not fit {}, using {}'.format(series.name, high, preferred, candidate))
            return series.astype(candidate)
    print('{}: range {}..{} has no unsigned fit, left as {}'.format(series.name, low, high, series.dtype))
    return series


# Preferred dtype per integer column; the trailing number is the maximum noted
# when the notebook was written (it is not guaranteed for other data).
unsigned_targets = [
    ('channel_nunique_ip', 'uint8'),         # 93
    ('app_cumcnt_ip_device_os', 'uint16'),   # 1860
    ('hour_nunique_ip_day', 'uint8'),        # 3
    ('app_nunique_ip', 'uint8'),             # 41
    ('app_nunique_ip_os', 'uint8'),          # 29
    ('device_nunique_ip', 'uint8'),          # 8
    ('app_nunique_channel', 'uint8'),        # 77
    ('os_cumcnt_ip', 'uint16'),              # 6436
    ('app_nunique_ip_device_os', 'uint8'),   # 29
    ('channel_cnt_ip_day_hour', 'uint16'),   # 6437
    ('channel_cnt_ip_app', 'uint16'),        # 1725
    ('channel_cnt_ip_app_os', 'uint16'),     # 519
    ('category', 'uint32'),
    ('epochtime', 'uint32'),
    ('nextClick', 'uint32'),
]
for column, preferred in unsigned_targets:
    train_df[column] = _shrink_unsigned(train_df[column], preferred)

# Variance / mean features are stored as half-precision floats.
float16_columns = [
    'hour_var_ip_day_channel',
    'hour_var_ip_app_os',
    'day_var_ip_app_channel',
    'hour_mean_ip_app_channel',
]
for column in float16_columns:
    train_df[column] = train_df[column].astype('float16')

del unsigned_targets, float16_columns, column, preferred
train_df.info()

vars and data type: 
<class 'pandas.core.frame.DataFrame'>
Int64Index: 58790469 entries, 0 to 18790468
Data columns (total 30 columns):
app                         uint16
channel                     uint16
click_id                    float64
click_time                  datetime64[ns]
device                      uint16
ip                          uint32
is_attributed               float64
os                          uint16
day                         uint8
hour                        uint8
channel_nunique_ip          int64
app_cumcnt_ip_device_os     int64
hour_nunique_ip_day         int64
app_nunique_ip              int64
app_nunique_ip_os           int64
device_nunique_ip           int64
app_nunique_channel         int64
os_cumcnt_ip                int64
app_nunique_ip_device_os    int64
channel_cnt_ip_day_hour     int64
channel_cnt_ip_app          int64
channel_cnt_ip_app_os       int64
hour_var_ip_day_channel     float64
hour_var_ip_app_os          float64
day_var_ip_app_channel    

In [21]:
# Run a garbage-collection pass after the dtype conversions.
gc.collect()

38

___

# Save

In [22]:
# (rows, columns) of the full feature frame before it is split.
train_df.shape

(58790469, 30)

In [23]:
# Split the stacked frame back by position: rows before len_train came from the
# training file, the rest from the test file. The last val_size training rows
# become the validation set. train_df is rebound last because the other two
# slices are taken from the full frame.
test_df = train_df.iloc[len_train:]
val_df = train_df.iloc[len_train - val_size:len_train]
train_df = train_df.iloc[:len_train - val_size]

print("train size: ", len(train_df))
print("valid size: ", len(val_df))
print("test size : ", len(test_df))


train size:  37500000
valid size:  2500000
test size :  18790469


In [24]:
# Show the chunk-specific file stem built from the row range [frm, to).
# NOTE(review): the '../../' prefix printed here differs from the
# '../input/feat/' directory the HDFStore is written to -- confirm which is intended.
print('../../feat_{}_{}'.format(frm,to))

../../feat_109903890_149903890


In [25]:
# Persist the three splits in one HDF5 file named after the row range.
# The context manager closes the file even if one of the writes fails.
with pd.HDFStore('../input/feat/feat_{}_{}.h5'.format(frm,to)) as store:
    store['train_df'] = train_df
    store['valid_df'] = val_df
    store['test_df'] = test_df

## Load HDF

Reopen the saved HDF5 file to check its keys and read the test and validation frames back.

In [29]:
# Reopen the saved feature file, list its keys, and read back the test and
# validation frames (the training frame is not reloaded here).
with pd.HDFStore(
    '../input/feat/feat_109903890_149903890.h5'
) as store:
    print(store.keys())
    test_df, val_df = store['test_df'], store['valid_df']

['/test_df', '/train_df', '/valid_df']


In [31]:
# (rows, columns) of the test frame read back from the HDF5 file.
test_df.shape

(18790469, 30)

_____
# PLAYGROUND

In [27]:
# Feature names paired with their importance scores, kept side by side so the
# two lists cannot drift out of step.
importance_pairs = [
    ('nextClick', 138),
    ('nextClick_shift', 1),
    ('app', 232),
    ('device', 27),
    ('os', 147),
    ('channel', 305),
    ('hour', 105),
    ('day', 0),
    ('ip_tcount', 40),
    ('ip_tchan_count', 2),
    ('ip_app_count', 32),
    ('ip_app_os_count', 21),
    ('ip_app_os_var', 10),
    ('ip_app_channel_var_day', 7),
    ('ip_app_channel_mean_hour', 8),
    ('X0', 52),
    ('X1', 9),
    ('X2', 9),
    ('X3', 33),
    ('X4', 11),
    ('X5', 28),
    ('X6', 13),
    ('X7', 7),
    ('X8', 35),
]
cols = [name for name, _ in importance_pairs]
importances = [score for _, score in importance_pairs]

In [28]:
# Rank the features from most to least important.
pd.DataFrame(dict(col=cols, imp=importances)).sort_values('imp', ascending=False)

Unnamed: 0,col,imp
5,channel,305
2,app,232
4,os,147
0,nextClick,138
6,hour,105
15,X0,52
8,ip_tcount,40
23,X8,35
18,X3,33
10,ip_app_count,32
