# Second Level 

In [1]:
import gc
from itertools import combinations

import numpy as np
import pandas as pd

### Load first level data 

In [9]:
# Read the stage-2 frame stored under the 'stage2_df' key.
# The store is opened read-only: HDFStore defaults to append mode ('a'), which
# silently creates an empty file when the path is wrong and requires write access.
with pd.HDFStore('../data/feat/stage2_data.h5', mode='r') as stage2_h5:
    print(stage2_h5.keys())
    stage2_df = stage2_h5['stage2_df']

['/stage2_df']


In [13]:
stage2_df.head()

Unnamed: 0,rg,lasso,rf,lgbm,knn,knn_dist_0,knn_dist_1,knn_dist_2,knn_dist_3,knn_dist_4,...,knn_dist_14,kmean_dist_label,kmean_dist_0,kmean_dist_1,kmean_dist_2,kmean_dist_3,kmean_dist_4,kmean_dist_5,kmean_dist_6,kmean_dist_7
0,0.345505,0.299434,0.55938,0.687585,0.461946,15.0,15.0,15.0,16.0,16.0,...,17.0,3.0,16.171893,16.788638,13.484739,10.455496,163.121368,14.736764,17.483576,35.278573
1,1.083458,1.036473,2.350962,2.604097,1.886891,7097.084351,7367.291954,7391.268279,7754.255436,7779.592759,...,8422.03043,5.0,9.423429,10.257709,8.15073,6.497106,161.405323,7.958082,11.485983,32.846975
2,0.432379,0.398738,0.645353,0.502553,0.400254,8810.403742,8901.891784,8916.891784,8944.502339,8967.15634,...,9150.136168,6.0,8.785192,8.794748,6.493592,6.395639,161.372517,8.16227,10.156587,32.766569
3,0.495248,0.437356,3.162069,5.00124,0.075603,11.0,12.0,12.0,13.0,13.0,...,19.0,3.0,16.18651,16.802709,13.502333,10.478966,163.123577,14.752742,17.49604,35.280873
4,0.458267,0.401383,2.405929,3.038452,0.663293,9.0,10.0,10.0,11.0,11.0,...,14.0,3.0,16.180627,16.797045,13.495253,10.469525,163.122691,14.746311,17.491022,35.279936


### Load features 

In [5]:
# Read the feature matrices and targets of the train / cv / test splits.
# The store is opened read-only: HDFStore defaults to append mode ('a'), which
# silently creates an empty file when the path is wrong and requires write access.
with pd.HDFStore('../data/feat/data.h5', mode='r') as store:
    print(store.keys())
    X_cv = store['X_cv']
    X_train = store['X_train']
    X_test = store['X_test']
    y_cv = store['y_cv']
    y_train = store['y_train']

['/X_cv', '/X_test', '/X_train', '/y_cv', '/y_train']


In [16]:
def downcast_dtypes(df):
    '''
        Narrow the 64-bit numeric columns of the dataframe to 32 bits:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left alone. The dataframe is
        modified in place and is also returned.
    '''
    conversions = (("float64", np.float32), ("int64", np.int32))

    # Resolve both column lists first, before any column is converted
    plan = [
        ([name for name in df if df[name].dtype == wide], narrow)
        for wide, narrow in conversions
    ]

    # Apply the conversions: floats first, then ints
    for names, narrow in plan:
        df[names] = df[names].astype(narrow)

    return df

In [17]:
# Convert the 64-bit columns of every feature frame to 32 bits.
X_train, X_cv, X_test = (downcast_dtypes(frame) for frame in (X_train, X_cv, X_test))

# Store the targets as float32 as well.
y_train, y_cv = (target.astype(np.float32) for target in (y_train, y_cv))

# Reclaim the memory of the replaced 64-bit data right away.
gc.collect()


2146

In [18]:
# Replace missing feature values with 0 in every split, modifying the frames in place.
for split_frame in (X_train, X_cv, X_test):
    split_frame.fillna(0, inplace=True)

In [34]:
# Clamp both target vectors to the [0, 20] range.
y_train, y_cv = (target.clip(0, 20) for target in (y_train, y_cv))

#### memory issue
1. Reduce memory use by keeping only the training rows with `date_block_num >= 12`.  

In [101]:
# Month index (date_block_num) of every training row, taken BEFORE the filter below.
train_dates = X_train.date_block_num

# Drop the training rows from months before 12 (see the memory note above).
mask = train_dates >= 12
X_train = X_train[mask]
y_train = y_train[mask]

# Months whose rows are used to train the second level.
level2_date_block = [27, 28, 29, 30, 31, 32]
# NOTE: level2_mask is built from the unfiltered train_dates, while y_train has
# already been filtered. The selection on y_train therefore relies on pandas
# aligning the boolean mask by index -- assumes X_train and y_train share an
# index; TODO confirm.
level2_mask = train_dates.isin(level2_date_block)
train_dates_level2 = train_dates[level2_mask]
train_y_level2 = y_train[level2_mask]

# From here on train_dates matches the filtered X_train / y_train.
train_dates = train_dates[mask]
test_dates = X_test.date_block_num
cv_dates = X_cv.date_block_num

# Stack train, cv and test (in that order) into one frame, with matching month and
# target arrays; the test rows get a placeholder target of 0.
merge = pd.concat([X_train, X_cv, X_test], ignore_index=True)
merge_dates = np.concatenate([train_dates,cv_dates, test_dates], axis=0)
merge_y = np.concatenate([y_train,y_cv, np.zeros((len(test_dates),))], axis=0)

In [36]:
# Rows that take part in the second level: everything from the first level-2 month on.
first_level2_block, last_level2_block = level2_date_block[0], level2_date_block[-1]
in_stage2 = merge_dates >= first_level2_block
stage2_train_dates = merge_dates[in_stage2]
stage2_train_y = merge_y[in_stage2]

# Boolean masks over the stage-2 rows, selected by month.
stage2_train_mask = stage2_train_dates <= last_level2_block  # fit on 27-32
stage2_valid_mask = stage2_train_dates == 33                 # hold-out month
stage2_final_train_mask = stage2_train_dates <= 33           # final fit on 27-33
stage2_test_mask = stage2_train_dates == 34                  # month to predict

In [130]:
pd.Series(stage2_train_y).shape

(1828564,)

In [31]:
# print(pd.Series(stage2_train_mask).value_counts())
# print(pd.Series(stage2_valid_mask).value_counts())
# print(pd.Series(stage2_final_train_mask).value_counts())
# print(pd.Series(stage2_test_mask).value_counts())

True     1376192
False     452372
dtype: int64
False    1590392
True      238172
dtype: int64
True     1614364
False     214200
dtype: int64
False    1614364
True      214200
dtype: int64


In [46]:
# Columns of stage2_df that are combined into the ensemble statistics further down.
# NOTE(review): 'kmean_dist_label' shows values like 3.0 / 5.0 / 6.0 in the preview
# above -- presumably a cluster id rather than a prediction; confirm it belongs here.
model_cols = ['rg', 'lasso', 'rf', 'lgbm', 'knn', 'kmean_dist_label']
# stage2_df.loc[:,model_cols].head()
# Pairwise correlation between those columns.
stage2_df[model_cols].corr()

Unnamed: 0,rg,lasso,rf,lgbm,knn,kmean_dist_label
rg,1.0,0.991765,0.74204,0.697276,0.495558,0.1961
lasso,0.991765,1.0,0.748991,0.705493,0.493394,0.177352
rf,0.74204,0.748991,1.0,0.961878,0.506604,0.077598
lgbm,0.697276,0.705493,0.961878,1.0,0.467184,0.072676
knn,0.495558,0.493394,0.506604,0.467184,1.0,0.059852
kmean_dist_label,0.1961,0.177352,0.077598,0.072676,0.059852,1.0


In [51]:
from scipy.stats.mstats import hmean, gmean

In [52]:
model_cnt = stage2_df[model_cols].shape[1]

In [59]:
model_cols[5]

'kmean_dist_label'

In [91]:
# stage2_df[model_cols[1]].values
# gmean(stage2_df.loc[:,model_cols[1]].values)


  log_a = np.log(a)


array([ 0.40926415,  1.56099583,  0.50727376, ...,  0.04115166,
        0.0585324 ,  0.07933516])

In [92]:
def stage2_feat_engineer(df, model_cols=None):
    """Add row-wise ensemble statistics of the model columns to ``df`` in place.

    For every pair of columns in ``model_cols`` -- and, when more than three
    columns are given, every triple -- the geometric and arithmetic means are
    stored as ``gmean_<i>_<j>[_<k>]`` / ``mean_<i>_<j>[_<k>]``, where the numbers
    are positions in ``model_cols``. Over all model columns the function also
    adds ``gmean_all``, ``mean``, ``med``, ``max``, ``min`` and ``std``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the model columns; the new columns are appended to it.
    model_cols : sequence of str
        Names of the columns to combine. Must not be empty.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (mirrors ``downcast_dtypes``).

    Raises
    ------
    ValueError
        If ``model_cols`` is empty or omitted.

    Notes
    -----
    ``gmean`` works on logarithms, so rows containing zero or negative values
    emit a RuntimeWarning and may yield 0 or NaN geometric means.
    """
    model_cols = list(model_cols) if model_cols is not None else []
    if not model_cols:
        raise ValueError('model_cols must name at least one column')
    model_cnt = len(model_cols)

    # Pull the model columns out once; every statistic below is a view of this array.
    preds = df.loc[:, model_cols].values

    # Pairs are always built; triples only when they differ from the full set.
    subset_sizes = [2, 3] if model_cnt > 3 else [2]
    for size in subset_sizes:
        print('Calculating {} g\\h\\m-mean'.format(size))
        for positions in combinations(range(model_cnt), size):
            suffix = '_'.join(str(pos) for pos in positions)
            subset = preds[:, list(positions)]
            df['gmean_' + suffix] = gmean(subset, axis=1)
            df['mean_' + suffix] = np.mean(subset, axis=1)

    print('Calculating all g\\h\\m-mean')
    df['gmean_all'] = gmean(preds, axis=1)
    df['mean'] = np.mean(preds, axis=1)
    df['med'] = np.median(preds, axis=1)
    df['max'] = np.amax(preds, axis=1)
    df['min'] = np.amin(preds, axis=1)
    df['std'] = np.std(preds, axis=1)

    return df

In [93]:
stage2_feat_engineer(stage2_df,model_cols)

Calculating 2 g\h\m-mean


  log_a = np.log(a)


Calculating 3 g\h\m-mean
Calculating all g\h\m-mean


In [94]:
stage2_df.head()

Unnamed: 0,rg,lasso,rf,lgbm,knn,knn_dist_0,knn_dist_1,knn_dist_2,knn_dist_3,knn_dist_4,...,gmean_2_4_5,mean_2_4_5,gmean_3_4_5,mean_3_4_5,gmean_all,mean,med,max,min,std
0,0.345505,0.299434,0.55938,0.687585,0.461946,15.0,15.0,15.0,16.0,16.0,...,0.918628,1.340442,0.984041,1.383177,0.61695,0.892308,0.510663,3.0,0.299434,0.951387
1,1.083458,1.036473,2.350962,2.604097,1.886891,7097.084351,7367.291954,7391.268279,7754.255436,7779.592759,...,2.809662,3.079284,2.907087,3.163663,2.004464,2.32698,2.118926,5.0,1.036473,1.330904
2,0.432379,0.398738,0.645353,0.502553,0.400254,8810.403742,8901.891784,8916.891784,8944.502339,8967.15634,...,1.157253,2.348536,1.06469,2.300936,0.715599,1.396546,0.467466,6.0,0.398738,2.060467
3,0.495248,0.437356,3.162069,5.00124,0.075603,11.0,12.0,12.0,13.0,13.0,...,0.89511,2.079224,1.042906,2.692281,0.958799,2.028586,1.747624,5.00124,0.075603,1.814864
4,0.458267,0.401383,2.405929,3.038452,0.663293,9.0,10.0,10.0,11.0,11.0,...,1.685401,2.023074,1.821768,2.233915,1.178257,1.661221,1.534611,3.038452,0.401383,1.174297


In [95]:
gc.collect()

5120

In [105]:
stage2_df.shape

(1828564, 105)

In [107]:
# Raw features of the stage-2 rows (months >= level2_date_block[0]), renumbered from 0.
stage2_feats = merge[merge_dates>=level2_date_block[0]].reset_index(drop=True)
# pd.concat(axis=1) aligns on the index and pads with NaN instead of failing, so make
# sure both frames describe the same rows before putting them side by side.
assert stage2_df.index.equals(stage2_feats.index), 'stage2_df and the stage-2 feature rows are not aligned'
stage2_df = pd.concat([stage2_df, stage2_feats], axis=1)
# Drop the temporary name so it does not keep an extra reference to the features alive.
del stage2_feats

In [108]:
stage2_df.shape

(1828564, 159)

In [110]:
stage2_df.head(3).T

Unnamed: 0,0,1,2
rg,0.345505,1.083458,0.432379
lasso,0.299434,1.036473,0.398738
rf,0.559380,2.350962,0.645353
lgbm,0.687585,2.604097,0.502553
knn,0.461946,1.886891,0.400254
knn_dist_0,15.000000,7097.084351,8810.403742
knn_dist_1,15.000000,7367.291954,8901.891784
knn_dist_2,15.000000,7391.268279,8916.891784
knn_dist_3,16.000000,7754.255436,8944.502339
knn_dist_4,16.000000,7779.592759,8967.156340


In [125]:
y_cv.shape

(238172,)

(6186922,)

# Train on 2nd Level 
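- Train on `date_block_num` 27-32 (`stage2_train_mask`)  
- Validate on `date_block_num` 33 (`stage2_valid_mask`)
- Final training on 27-33 (`stage2_final_train_mask`), then predict on 34 (`stage2_test_mask`)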

In [118]:
# stage2_df[stage2_train_mask]
stage2_df[stage2_valid_mask].shape

(238172, 159)

In [123]:
pd.Series(stage2_final_train_mask).value_counts()

True     1614364
False     214200
dtype: int64

In [143]:
import lightgbm as lgb
import copy 
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_squared_error

In [141]:
def objective(params):
    """Hold-out loss for one hyperopt trial.

    ``params`` is a dict with two entries: ``'clf'``, the base estimator, and
    ``'params'``, the parameter overrides to try. A deep copy of the estimator
    is reconfigured with the overrides, fitted on the ``stage2_train_mask`` rows
    of ``stage2_df`` and scored on the ``stage2_valid_mask`` rows; predictions
    are clipped to [0, 20] before the RMSE is computed.

    Returns a dict with the RMSE under ``'loss'`` and ``STATUS_OK`` under
    ``'status'``.
    """
    trial_params = params.get('params')

    # Work on a private copy so the estimator that was passed in is never modified.
    model = copy.deepcopy(params.get('clf'))
    merged_params = model.get_params()
    merged_params.update(trial_params)
    model.set_params(**merged_params)

    # Simple hold-out: fit on the training rows, score on the validation rows.
    train_X = stage2_df[stage2_train_mask].values
    train_y = stage2_train_y[stage2_train_mask]
    model.fit(train_X, train_y)

    valid_pred = np.clip(model.predict(stage2_df[stage2_valid_mask].values), 0., 20.)
    rmse = mean_squared_error(stage2_train_y[stage2_valid_mask], valid_pred)**.5

    print('rmse: {:.5f} '.format(rmse), end='\t')
    print('params: {}'.format(trial_params))

    return {'loss': rmse, 'status': STATUS_OK}

In [145]:
lgb_clf

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       learning_rate=0.1, max_depth=-1, min_child_samples=20,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=8, num_leaves=31, objective=None, random_state=0,
       reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
       subsample_for_bin=200000, subsample_freq=1)

In [161]:
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8)

# Search space for the second-level LightGBM model: booster type, tree shape,
# column/row sampling and regularisation. The number of trees and the learning
# rate are left at the estimator defaults.
# NOTE(review): the estimator repr in this notebook lists `min_split_gain` but no
# `gamma` parameter -- confirm that 'gamma' has any effect on LightGBM.
lgb_params = {
    'clf' : lgb_clf,
    'params': {

        'boosting_type': hp.choice('boosting_type',['gbdt', 'dart']), ## gbdt
        'max_depth' : hp.choice("max_depth", np.arange(4, 17, dtype=int)),
        'num_leaves'   : hp.choice('num_leaves', np.arange(8,129,2,dtype=int)),

        'feature_fraction': hp.uniform('feature_fraction', 0.3, 1.0),
        'bagging_fraction': hp.uniform ('bagging_fraction', 0.7, 1),
        'reg_alpha' : hp.uniform('reg_alpha',0,1),
        'reg_lambda': hp.uniform('reg_lambda',0,1),
        'gamma' : hp.uniform('gamma', 0.1,0.5)
    }

}
# Minimise the hold-out RMSE returned by objective() with TPE over 100 trials.
best =fmin(fn = objective,
           space = lgb_params,
           algo = tpe.suggest,
           max_evals = 100
           )

# fmin reports every hp.choice dimension as an INDEX into its option list (e.g.
# boosting_type 1 means 'dart'), not as the chosen value. Decode the result so the
# real parameter values are available and nothing has to be translated by hand.
from hyperopt import space_eval
best_params = space_eval(lgb_params, best)['params']

rmse: 0.95168 	params: {'bagging_fraction': 0.8241366101364762, 'boosting_type': 'gbdt', 'feature_fraction': 0.8117256778270627, 'gamma': 0.4235898972332336, 'max_depth': 11, 'num_leaves': 86, 'reg_alpha': 0.2877542357666576, 'reg_lambda': 0.1052770711844655}
rmse: 0.91783 	params: {'bagging_fraction': 0.8398052878818728, 'boosting_type': 'dart', 'feature_fraction': 0.6690387216785224, 'gamma': 0.17047456661651433, 'max_depth': 15, 'num_leaves': 18, 'reg_alpha': 0.3204963721783003, 'reg_lambda': 0.7066185373452311}
rmse: 0.93673 	params: {'bagging_fraction': 0.7760534465744401, 'boosting_type': 'gbdt', 'feature_fraction': 0.6477871916992203, 'gamma': 0.4587998545467007, 'max_depth': 6, 'num_leaves': 34, 'reg_alpha': 0.47373316328453097, 'reg_lambda': 0.3324304334351037}
rmse: 0.93930 	params: {'bagging_fraction': 0.9093468968359684, 'boosting_type': 'gbdt', 'feature_fraction': 0.9301793912074237, 'gamma': 0.276494492899382, 'max_depth': 5, 'num_leaves': 48, 'reg_alpha': 0.3685192937526

rmse: 0.92175 	params: {'bagging_fraction': 0.8728097607242306, 'boosting_type': 'dart', 'feature_fraction': 0.34832760832578064, 'gamma': 0.13156555952296187, 'max_depth': 4, 'num_leaves': 90, 'reg_alpha': 0.5237915180486983, 'reg_lambda': 0.1991661012038736}
rmse: 0.92292 	params: {'bagging_fraction': 0.9603747486014677, 'boosting_type': 'dart', 'feature_fraction': 0.9186026229643607, 'gamma': 0.12911852472532856, 'max_depth': 4, 'num_leaves': 90, 'reg_alpha': 0.5488257769705249, 'reg_lambda': 0.1545807859377144}
rmse: 0.92123 	params: {'bagging_fraction': 0.9079073146052532, 'boosting_type': 'dart', 'feature_fraction': 0.35188332440998693, 'gamma': 0.15627684377726786, 'max_depth': 4, 'num_leaves': 106, 'reg_alpha': 0.36067095516320047, 'reg_lambda': 0.010975285483777186}
rmse: 0.91999 	params: {'bagging_fraction': 0.896465969298457, 'boosting_type': 'dart', 'feature_fraction': 0.6907704262597025, 'gamma': 0.44777445058000864, 'max_depth': 10, 'num_leaves': 10, 'reg_alpha': 0.374243

rmse: 0.92758 	params: {'bagging_fraction': 0.8781282725067074, 'boosting_type': 'dart', 'feature_fraction': 0.6755297520929628, 'gamma': 0.16997897623450886, 'max_depth': 13, 'num_leaves': 54, 'reg_alpha': 0.27152393554854404, 'reg_lambda': 0.7045091353336083}
rmse: 0.92106 	params: {'bagging_fraction': 0.8522014896301852, 'boosting_type': 'dart', 'feature_fraction': 0.6201475657972695, 'gamma': 0.1380989849404548, 'max_depth': 8, 'num_leaves': 12, 'reg_alpha': 0.9661616378375898, 'reg_lambda': 0.5732333322672629}
rmse: 0.92365 	params: {'bagging_fraction': 0.858657507574162, 'boosting_type': 'dart', 'feature_fraction': 0.43498443034629597, 'gamma': 0.10325046444077475, 'max_depth': 8, 'num_leaves': 68, 'reg_alpha': 0.38901239635586626, 'reg_lambda': 0.2962773625557014}
rmse: 0.92122 	params: {'bagging_fraction': 0.8100725020671454, 'boosting_type': 'dart', 'feature_fraction': 0.7570688984498198, 'gamma': 0.14544235353069057, 'max_depth': 14, 'num_leaves': 22, 'reg_alpha': 0.427472057

rmse: 0.92485 	params: {'bagging_fraction': 0.7170813678328304, 'boosting_type': 'dart', 'feature_fraction': 0.7256200706023888, 'gamma': 0.3671881991137781, 'max_depth': 16, 'num_leaves': 44, 'reg_alpha': 0.6949671928779473, 'reg_lambda': 0.7606328463334753}
rmse: 0.94709 	params: {'bagging_fraction': 0.7627412589895034, 'boosting_type': 'gbdt', 'feature_fraction': 0.3888910443208207, 'gamma': 0.25271928457533893, 'max_depth': 7, 'num_leaves': 66, 'reg_alpha': 0.1712090636093719, 'reg_lambda': 0.6694403758176385}
rmse: 0.92090 	params: {'bagging_fraction': 0.9097237679224196, 'boosting_type': 'dart', 'feature_fraction': 0.49803292318996983, 'gamma': 0.39216130117104864, 'max_depth': 12, 'num_leaves': 20, 'reg_alpha': 0.6372041574246663, 'reg_lambda': 0.25712404955228424}
rmse: 0.92874 	params: {'bagging_fraction': 0.8730111166549867, 'boosting_type': 'dart', 'feature_fraction': 0.5401827264671424, 'gamma': 0.40361335423813177, 'max_depth': 9, 'num_leaves': 110, 'reg_alpha': 0.08372208

In [162]:
best

{'bagging_fraction': 0.834750265236025,
 'boosting_type': 1,
 'feature_fraction': 0.7703258965795443,
 'gamma': 0.4687491515287674,
 'max_depth': 6,
 'num_leaves': 5,
 'reg_alpha': 0.45107691031250763,
 'reg_lambda': 0.8878342456557001}

In [166]:
# Re-score one fixed configuration on the hold-out month through objective().
# NOTE(review): `best` above reports 'max_depth': 6 and 'num_leaves': 5. If those are
# hp.choice indices (as 'boosting_type': 1 suggests), they map to
# np.arange(4, 17)[6] == 10 and np.arange(8, 129, 2)[5] == 18. max_depth below matches
# (10) but num_leaves is 13 -- confirm which value was intended.
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8)
lgb_params = {
    'clf' : lgb_clf,
    'params': {
        
        'boosting_type': 'dart',
        'max_depth' : 10, 
        'num_leaves'   : 13,         
        'feature_fraction': 0.7703258965795443,
        'bagging_fraction': 0.834750265236025,
        'reg_alpha' : 0.45107691031250763,
        'reg_lambda': 0.8878342456557001,
        'gamma' : 0.4687491515287674
    }    
}
# objective() returns a dict holding the RMSE under 'loss' and the trial status.
best_lgb_2nd_val_loss = objective(lgb_params)

rmse: 0.91792 	params: {'boosting_type': 'dart', 'max_depth': 10, 'num_leaves': 13, 'feature_fraction': 0.7703258965795443, 'bagging_fraction': 0.834750265236025, 'reg_alpha': 0.45107691031250763, 'reg_lambda': 0.8878342456557001, 'gamma': 0.4687491515287674}


> rmse: 0.91882 	
- params: `{'bagging_fraction': 0.8219608927918671, 'boosting_type': 'dart', 'feature_fraction': 0.8197533685642909, 'gamma': 0.4386326407529417, 'max_depth': 11, 'num_leaves': 10, 'reg_alpha': 0.7227955651016644, 'reg_lambda': 0.11999749478454569}`

# Submission

In [149]:
# lgb_train = lgb.Dataset(stage2_df[stage2_final_train_mask],stage2_train_y[stage2_final_train_mask])
# params_best = {
#         'boosting_type': 'dart',
#         'max_depth' : 11,
#         'num_leaves': 10,        
#         'feature_fraction': 0.81975,
#         'bagging_fraction': 0.82196,
#         'reg_alpha' : 0.7228,
#         'reg_lambda': 0.1200,
#         'gamma' : 0.4386
# }
# # lgb_cv = lgb.Dataset(X_cv,y_cv,reference=lgb_train)
# reg_sub = lgb.train(params_best,                    
#                     train_set = lgb_train
#                     )

In [168]:
# Parameters of the final second-level model, passed straight to LGBMRegressor below.
# Note that lgb_params is a flat dict here, not the {'clf', 'params'} wrapper used
# by objective().
# NOTE(review): num_leaves is 13, while index 5 reported by the search would map to
# np.arange(8, 129, 2)[5] == 18 if it is an hp.choice index -- confirm the intended value.
lgb_params = {                
    'boosting_type': 'dart',
    'max_depth' : 10, 
    'num_leaves'   : 13,         
    'feature_fraction': 0.7703258965795443,
    'bagging_fraction': 0.834750265236025,
    'reg_alpha' : 0.45107691031250763,
    'reg_lambda': 0.8878342456557001,
    'gamma' : 0.4687491515287674
}

In [170]:
# Sanity check of the chosen parameters: same hold-out procedure as objective(),
# written out inline with the flat lgb_params dict.
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8,**lgb_params)

# Simple hold-out: fit on the stage2_train_mask rows ...

lgb_clf.fit(stage2_df[stage2_train_mask].values, stage2_train_y[stage2_train_mask])    

# ... and score on the stage2_valid_mask rows.
pred_y = lgb_clf.predict(stage2_df[stage2_valid_mask].values)
pred_y = np.clip(pred_y, 0., 20.) # clip predictions to [0, 20]
rmse = mean_squared_error(stage2_train_y[stage2_valid_mask], pred_y)**.5
print('rmse: {:.5f} '.format(rmse), end='\t')


rmse: 0.91792 	

In [1]:
# Final model: refit on the stage2_final_train_mask rows, then predict the
# stage2_test_mask rows.
# NOTE: this cell needs `lgb`, `lgb_params`, `stage2_df` and the masks from the cells
# above; the NameError recorded below comes from running it before the import cell.
lgb_clf = lgb.LGBMRegressor(random_state=0, n_jobs=8,**lgb_params)
lgb_clf.fit(stage2_df[stage2_final_train_mask].values,stage2_train_y[stage2_final_train_mask])
pred_y = lgb_clf.predict(stage2_df[stage2_test_mask].values)
pred_y = np.clip(pred_y,0,20)
# Predictions are clipped to [0, 20].

NameError: name 'lgb' is not defined

In [172]:
# Build the submission frame from the clipped test predictions.
# Scores recorded by the author: validation (Oct 2015) 0.91732, leaderboard 0.94824.
# NOTE(review): 'ID' is simply the row position of each prediction -- assumes the test
# rows are still in the original test-ID order; TODO confirm.
y_submit = pd.DataFrame({'ID':range(len(pred_y)), 'item_cnt_month':pred_y})
# Write the stacking submission to disk without the index column.
y_submit.to_csv('../data/output/submission_lgbm5_stacking_finetune_params.csv',index=False)