In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
%matplotlib inline 
import gc,time

In [2]:
with pd.HDFStore('../input/raw_data.h5') as store:
    print(store.keys())
    test_df = store['test_df']
    train_df = store['train_df']

['/test_df', '/train_df']


In [3]:
debug = 1
if debug:
    nrows = 100000
    nchunk = 400000
    val_size = 25000
    frm = nrows - 75000
    to = frm + nchunk

In [4]:
def encode_agg_feature(trn_df, selcols, groupby, aggregator = 'nunique'):
    usecols = [e for e in selcols if e not in  groupby]
    if aggregator == 'nunique':        
        gp = trn_df[selcols].groupby(groupby)[usecols].nunique().reset_index().\
            rename(columns = {
                usecols[-1] : usecols[-1] + '_nunique_' + '_'.join(groupby)
            })
        df = trn_df.merge(gp, how='left', on=groupby)

    elif aggregator == 'cumcount':
        gp = trn_df[selcols].groupby(groupby)[usecols].cumcount()
        df = trn_df.copy()
        df[usecols[-1] + '_cumcnt_' + '_'.join(groupby)] = gp.values
    
    elif aggregator == 'count':
        gp = trn_df[selcols].groupby(groupby)[usecols].count().reset_index().\
            rename(columns = {
                usecols[-1] : usecols[-1] + '_cnt_' + '_'.join(groupby)
            })
        df = trn_df.merge(gp, how='left', on=groupby)
    elif aggregator in ['var','mean']:
        agg=np.var if aggregator == 'var' else np.mean            
        gp = trn_df[selcols].groupby(groupby).agg(agg).reset_index().\
            rename(columns = {
                usecols[-1] : usecols[-1] + '_'+ str(aggregator) +'_' + '_'.join(groupby)
            })
        df = trn_df.merge(gp, how='left', on=groupby)
    return df.iloc[:,-1]

In [5]:
trn_df = train_df.iloc[frm:to,:].copy()
trn_df.shape

(400000, 9)

CV loop to built features 
- X_tr, X_val
- train on X_tr, val on X_val

importanta feature 

#### next click

- next click time by ('ip','app','os','device')
- next click time 2 by ...
- cnt by next click

#### feat_base

- channel cnt by (ip,day,hour)
- channel cnt by (ip,app)
- channel nunique by ip
- device nunique by ip
- app nunique by ip

#### mean encode 
- ip mean target
- app mean target 
- channel mean target 
- os mean target
- device mean target

In [6]:
2**16

65536

In [7]:
trn_df_tmp =trn_df.copy()
print('next click time by(ip,app,os,device)...')
gp = trn_df_tmp.groupby(['ip','app','os','device'])
print('shift 1...')
trn_df_tmp['shift_1'] = gp.click_time.shift(-1)
print('shift 2...')
trn_df_tmp['shift_2'] = gp.click_time.shift(-2)
print('timedelta to uint16...')
trn_df_tmp['next_click_dt'] = ((trn_df_tmp.shift_1 - trn_df_tmp.click_time) / np.timedelta64(1,'s')).fillna(50000).astype('uint16')
trn_df_tmp['next_click_dt2'] = ((trn_df_tmp.shift_2 - trn_df_tmp.click_time) / np.timedelta64(1,'s')).fillna(50000).astype('uint16')

print('cnt by next click...')
nextclick_cnt = trn_df_tmp.next_click_dt.value_counts().reset_index().rename(columns={'index':'next_click_dt','next_click_dt':'next_click_dt_cnt'})
trn_df_tmp = trn_df_tmp.merge(nextclick_cnt,how='left')
del nextclick_cnt; gc.collect()
print('complete')

next click time by(ip,app,os,device)...
shift 1...
shift 2...
timedelta to uint16...
cnt by next click...
complete


In [9]:
print('feature1 : number of unique "channel" by ip',end='\t')
f1 = encode_agg_feature(trn_df_tmp,groupby=['ip'], selcols=['ip','channel'],aggregator='nunique')
print(f1.name)

# print('feature2 : cumcount of "app" by (ip,device,os)',end='\t')
# f2 = encode_agg_feature(trn_df_tmp,selcols=['app','ip','device','os'], groupby=['ip','device','os'], aggregator='cumcount')
# print(f2.name)

# print('feature3 : number of unique "hour" by (ip, day)',end='\t')
# f3 = encode_agg_feature(trn_df_tmp,selcols=['hour','ip','day'], groupby=['ip','day'], aggregator='nunique')
# print(f3.name)

print('feature4 : number of unique "app" by ip', end='\t')
f4 = encode_agg_feature(trn_df_tmp,selcols=['app', 'ip'], groupby=['ip'])
print(f4.name)

# print('feature5 : number of unique "app" by (ip,os)',end='\t')
# f5 = encode_agg_feature(trn_df_tmp,selcols=['app', 'ip', 'os'], groupby=['ip','os'])
# print(f5.name)

print('feature6 : number of unique "device" by ip',end='\t')
f6 = encode_agg_feature(trn_df_tmp,selcols=['device', 'ip'], groupby=['ip'])
print(f6.name)

print('feature7 : number of unique "app" by channel',end='\t')
f7 = encode_agg_feature(trn_df_tmp,selcols=['app', 'channel'], groupby=['channel'])
print(f7.name)

# print('feature8 : cumcount of "os" by ip ',end='\t')
# f8 = encode_agg_feature(trn_df_tmp,selcols=['os', 'ip'], groupby=['ip'], aggregator='cumcount')
# print(f8.name)

print('feature9 : number of unique "app" by (ip,device,os)',end='\t')
f9 = encode_agg_feature(trn_df_tmp,selcols=['app', 'ip','device','os'], groupby=['ip','device','os'])
print(f9.name)

print('feature 10 : count by (ip,day,hour)',end='\t')
f10 = encode_agg_feature(trn_df_tmp,selcols=['ip','day','hour','channel'],groupby=['ip','day','hour'],aggregator='count')
print(f10.name)

print('feature 11 : count by (ip,app)', end='\t')
f11 = encode_agg_feature(trn_df_tmp,selcols=['ip','app','channel'],groupby=['ip','app'],aggregator='count')
print(f11.name)

# print('feature 12 : count by (ip,app,os)',end='\t')
# f12 = encode_agg_feature(selcols=['ip','app','channel','os'],groupby=['ip','app','os'],aggregator='count')
# print(f12.name)

trn_df_tmp[f1.name] = f1.values
trn_df_tmp[f4.name] = f4.values
trn_df_tmp[f6.name] = f6.values
trn_df_tmp[f7.name] = f7.values
trn_df_tmp[f9.name] = f9.values
trn_df_tmp[f10.name]= f10.values
trn_df_tmp[f11.name]= f11.values

# del f1,f4,f6,f7,f9,f10,f11;gc.collect()

feature1 : number of unique "channel" by ip	channel_nunique_ip
feature4 : number of unique "app" by ip	app_nunique_ip
feature6 : number of unique "device" by ip	device_nunique_ip
feature7 : number of unique "app" by channel	app_nunique_channel
feature9 : number of unique "app" by (ip,device,os)	app_nunique_ip_device_os
feature 10 : count by (ip,day,hour)	channel_cnt_ip_day_hour
feature 11 : count by (ip,app)	channel_cnt_ip_app


In [12]:
trn_df.shape

(400000, 9)

In [16]:
from sklearn.model_selection import StratifiedKFold

y_tr = trn_df.is_attributed.values.astype(np.int8) # target 
skf = StratifiedKFold(4, random_state=0)
skf.get_n_splits(X=trn_df,y=y_tr)

cols = ['ip','app','channel','os','device']
for fold, (tr_idx, val_idx) in enumerate(skf.split(trn_df,y_tr)):
    
    ## generate features 
    print(fold,tr_idx.shape,val_idx.shape)
    
    X_tr ,X_val = trn_df.iloc[tr_idx], trn_df.iloc[val_idx]
    print('mean encoding...')
    for col in cols:
        means = X_val[col].map(X_tr.groupby(col).is_attributed.mean())
        X_val[col + '_mean_target'] = means
        print(col,end='\n\t')
    trn_df.iloc[val_idx] = X_val
prior = trn_df.is_attributed.mean()
trn_df[[col+'_mean_target' for col in cols]] = trn_df[[col+'_mean_target' for col in cols]].fillna(prior)

0 (299999,) (100001,)
mean encoding...
ip
	app

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



	channel
	os
	device
	1 (299999,) (100001,)
mean encoding...
ip
	app
	channel
	os
	device
	2 (300001,) (99999,)
mean encoding...
ip
	app
	channel
	os
	device
	3 (300001,) (99999,)
mean encoding...
ip
	app
	channel
	os
	device
	

KeyError: "['ip_mean_target' 'app_mean_target' 'channel_mean_target' 'os_mean_target'\n 'device_mean_target'] not in index"

In [17]:
trn_df.head()

Unnamed: 0,ip,app,device,os,channel,click_time,is_attributed,hour,day
25000,88591,11,1,13,219,2017-11-06 16:00:31,0,16,6
25001,20924,9,1,13,134,2017-11-06 16:00:31,0,16,6
25002,8531,17,1,47,280,2017-11-06 16:00:31,0,16,6
25003,21927,12,1,22,178,2017-11-06 16:00:31,0,16,6
25004,74803,64,1,6,459,2017-11-06 16:00:31,0,16,6


In [11]:
train_new = train_df.copy()
cols = ['ip','app','device','os','channel']
# cols_sets = [
#     ['ip','app','os','device'],
#     ['ip'],
    
# ]
for col in cols:
    train_new[col + '_mean_target'] = 0
    

for tr_idx, val_idx in skf.split(train_df,y_tr):
    print('traing idx:{}'.format(tr_idx))
    X_tr ,X_val = train_df.loc[tr_idx], train_df.loc[val_idx]
    for col in cols:
        means = X_val[col].map(X_tr.groupby(col).is_attributed.mean())
        X_val[col + '_mean_target'] = means
    train_new.loc[val_idx] = X_val
prior = train_df.is_attributed.mean()
# train_new.fillna(prior, inplace=True)
train_new[[col+'_mean_target' for col in cols]] = train_new[[col+'_mean_target' for col in cols]].fillna(prior)

(400001, 7)

25000