In [2]:
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold

# Single seed for the notebook: it seeds NumPy's global RNG here and is also
# the base value for the KFold split seed in the training cell.
random_state = 13
np.random.seed(random_state)

In [3]:
print('read data')
# Read the raw train/test tables from the local input directory.
df_train = pd.read_csv('input/train.csv')
df_test = pd.read_csv('input/test.csv')

# Test identifiers are kept for the submission file; labels are cast to float32.
test_ID = df_test['ID_code'].values
Y = df_train['target'].values.astype(np.float32)

# Stack the train features on top of the test features (identifier and label
# columns removed) so later feature engineering is applied to both at once.
# The first len(Y) rows of df_all are the training rows.
df_all = pd.concat(
    [df_train.drop(columns=['ID_code', 'target']), df_test.drop(columns=['ID_code'])],
    axis=0, sort=False, ignore_index=True,
)
del df_train, df_test

# Column pairs that the feature-engineering cell combines into new columns.
enginering_feats = [('var_26','var_44'),('var_44','var_123'),('var_44','var_155')]

read data


In [4]:
print('start training of LightGBM...')

# lightgbm and KFold are imported once, in the notebook's import cell, so the
# mid-notebook (and duplicate) imports that used to live here are gone.

# Wall-clock start of training.
# NOTE(review): `start_tiem` (sic) is never read in the visible cells; the name
# is kept as-is in case a cell outside this view depends on it.
start_tiem = time.time()
# Number of fold models whose test predictions have been added to `valid`.
n_predict = 0
# Running sum of test-set predictions, one slot per test row.
valid = np.zeros(len(test_ID))

def lgb_roc_score(y_hat, data):
    """Custom evaluation function that reports ROC AUC under the name 'roc'.

    Args:
        y_hat: Predicted scores for the rows of ``data``.
        data: Object exposing ``get_label()``, which supplies the true labels.

    Returns:
        The tuple ``('roc', auc, True)``; the trailing ``True`` marks the
        score as higher-is-better for the caller.
    """
    auc = roc_auc_score(data.get_label(), y_hat)
    return ('roc', auc, True)

start training of LightGBM...


In [5]:
for fe_id, fe in enumerate(enginering_feats):
    # Magic feature engineering: replace the column pair (a, b) with the two
    # derived columns a + b and b - a, appended after the remaining columns.
    #
    # NOTE(review): X and Xt are overwritten on every pass, so once this loop
    # finishes only the matrices for the LAST pair (and the last fe_id) are
    # left for the training cell. If every pair is meant to be trained on, the
    # training loop has to run inside this one -- confirm the intent.
    #
    # Dropping the pair first yields a fresh frame, which avoids making a full
    # extra copy of df_all on each pass; the derived columns are computed from
    # the untouched df_all.
    df_e = df_all.drop(list(fe), axis=1)
    df_e['%s_plus_%s'%fe] = df_all[fe[0]]+df_all[fe[1]]
    # The label now matches the operand order (b - a). It used to read
    # "a_minus_b" while holding b - a. Only the label changed; the values, and
    # therefore X / Xt, are identical to before.
    df_e['%s_minus_%s'%(fe[1], fe[0])] = df_all[fe[1]]-df_all[fe[0]]
    Xp = df_e.values
    # The first len(Y) rows are the training rows, the rest are the test rows.
    X = Xp[:len(Y)]
    Xt = Xp[len(Y):]
    del df_e

In [7]:
# Sanity check: (rows, columns) of the engineered training matrix.
X.shape

(200000, 200)

In [8]:
# Train one LightGBM model per CV fold on the engineered matrix X and add each
# model's test-set predictions to the running sum `valid`.
#
# NOTE(review): fe_id, X and Xt are whatever the feature-engineering loop left
# behind after its final pass, so this cell trains on the last feature pair
# only and fails on a fresh kernel unless that cell ran first. If every pair
# should contribute to the average, nest this loop inside that one -- confirm.

# The parameters do not depend on the fold, so they are built once.
lgb_params = {
    "objective" : "binary",
    # "None" (the string) registers no built-in metric, so the only metric
    # evaluated -- and used for early stopping -- is the custom `feval` below.
    # The previous value "roc" is not a LightGBM metric name.
    "metric" : "None",
    "max_depth" : 2,
    "num_leaves" : 2,
    "learning_rate" : 0.055,
    "bagging_fraction" : 0.3,
    "feature_fraction" : 0.15,
    "lambda_l1" : 5,
    "lambda_l2" : 5,
    "bagging_seed" : fe_id,
    "verbosity" : 1,
    "seed": fe_id
}

cv_splitter = KFold(n_splits=6, random_state=fe_id+random_state, shuffle=True)
for fold_id, (IDX_train, IDX_test) in enumerate(cv_splitter.split(Y)):
    # IDX_test indexes the held-out validation fold of the TRAINING data;
    # the real test matrix is Xt.
    X_train = X[IDX_train]
    X_test = X[IDX_test]
    Y_train = Y[IDX_train]
    Y_test = Y[IDX_test]

    lgtrain = lgb.Dataset(X_train, label=Y_train)
    lgtest = lgb.Dataset(X_test, label=Y_test)
    # Per-fold evaluation history filled in by lgb.train.
    evals_result = {}
    lgb_clf = lgb.train(lgb_params, lgtrain, 35000,
                        valid_sets=[lgtrain, lgtest],
                        early_stopping_rounds=500,
                        verbose_eval=2000,
                        feval=lgb_roc_score,
                        evals_result=evals_result)
    # Accumulate this fold's test predictions; they are averaged after the loop.
    valid += lgb_clf.predict( Xt ).reshape((-1,))
    n_predict += 1


# Turn the summed fold predictions into their mean and bound it to [0, 1].
valid = np.clip(valid / n_predict, 0.0, 1.0)
print('save result.')
# One row per test ID with its averaged prediction; no index column is written.
submission = pd.DataFrame({'ID_code': test_ID, 'target': valid})
submission.to_csv('submission_2.csv', index=False)
print('done.')

Training until validation scores don't improve for 500 rounds.
[2000]	training's roc: 0.890832	valid_1's roc: 0.885666
[4000]	training's roc: 0.903794	valid_1's roc: 0.896343
[6000]	training's roc: 0.908305	valid_1's roc: 0.899578
[8000]	training's roc: 0.910497	valid_1's roc: 0.90072
[10000]	training's roc: 0.91187	valid_1's roc: 0.901186
[12000]	training's roc: 0.912828	valid_1's roc: 0.901345
[14000]	training's roc: 0.913604	valid_1's roc: 0.901467
Early stopping, best iteration is:
[14041]	training's roc: 0.913622	valid_1's roc: 0.901481
Training until validation scores don't improve for 500 rounds.
[2000]	training's roc: 0.890637	valid_1's roc: 0.88233
[4000]	training's roc: 0.903566	valid_1's roc: 0.894825
[6000]	training's roc: 0.9081	valid_1's roc: 0.898802
[8000]	training's roc: 0.910289	valid_1's roc: 0.900397
[10000]	training's roc: 0.911682	valid_1's roc: 0.901084
[12000]	training's roc: 0.912678	valid_1's roc: 0.901397
[14000]	training's roc: 0.913458	valid_1's roc: 0.9015

In [9]:
# Load the two submission files to blend. Only submission_2.csv is written by
# the cells shown here; submission.csv has to exist on disk already.
sub_1 = pd.read_csv('submission.csv')
sub_2 = pd.read_csv('submission_2.csv')

In [11]:
# Blend the two submissions with an equal-weight average of their targets.
# The rows must line up one-to-one, so verify the IDs before averaging.
assert sub_1['ID_code'].equals(sub_2['ID_code']), 'submissions are not row-aligned'
# Fix: the second operand used to be sub_1 again, which left sub_1 unchanged
# and ignored sub_2 entirely.
# Note: sub_1 is updated in place, so re-running this cell without reloading
# the CSVs would blend sub_2 in a second time.
sub_1['target'] = (sub_1['target'] + sub_2['target'])/2

In [12]:
# Write sub_1, with its updated target column, to the blend file; the index
# column is omitted.
sub_1.to_csv('submission_avg_1_2.csv',index=False)