In [1]:
fname = 'lgb_070'

In [2]:
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

from tqdm import tqdm_notebook

import gc
import datetime

from sklearn.metrics import f1_score, log_loss
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.decomposition import PCA
import xgboost as xgb
import lightgbm as lgb

from scipy.stats import hmean, gmean
from scipy.special import expit, logit

pd.options.display.max_rows = 100
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 100

import pickle as pkl

In [3]:
import numpy as np 
from numba import jit

@jit
def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc

def eval_auc (preds, dtrain):
    labels = dtrain.get_label()    
    return 'auc', fast_auc(labels, preds), True

In [28]:
def get_importances(clfs):
    importances = [clf.feature_importance('gain') for clf in clfs]
    importances = np.vstack(importances)
    mean_gain = np.mean(importances, axis=0)
    features = clfs[0].feature_name()
    data = pd.DataFrame({'gain':mean_gain, 'feature':features})
    plt.figure(figsize=(8, 30))
    sns.barplot(x='gain', y='feature', data=data.sort_values('gain', ascending=False))
    plt.tight_layout()
    return data

def get_ranks(y_pred):
    return pd.Series(y_pred).rank().values

def cross_validate_lgb(param, x_train, y_train, x_test, kf,  
                       num_boost_round, 
                       fname, nseed=1, verbose_eval=1,
                       target='target', featimp=False, save_pred=False,
                       use_auc=True,
                       early_stopping_rounds=150,
                       use_all_folds=True,
                       ):
    
    if save_pred:
        now = datetime.datetime.now()
        now = str(now.strftime('%Y-%m-%d-%H-%M-%S'))
        print('started at:', now)
        print('num bagging seeds:', nseed)
        fname = '../submissions/'+fname+'_'+now
        print(x_train.shape, x_test.shape)
        
    
    nfold = kf.n_splits
    print('nfold', nfold)
        
    val_pred = np.zeros((x_train.shape[0], nseed))
    test_pred = np.zeros((x_test.shape[0], nfold * nseed))
        
    print('num features:', x_train.shape[1])
    
    d_train = lgb.Dataset(x_train, label=y_train)
    if use_auc:
        metric = 'auc'
    else:
        metric = None
    history = lgb.cv(params, d_train, num_boost_round=num_boost_round, 
                     folds=kf, metrics=metric, fobj=None,
                     early_stopping_rounds=early_stopping_rounds, 
                    verbose_eval=verbose_eval, show_stdv=True, seed=0)
    if use_auc:
        best_round = np.argmax(history['auc-mean'])
    else:
        best_round = np.argmin(history['binary_logloss-mean'])
    print('best_round:', best_round)
    if best_round == 0:
        best_round = 1
    bsts = []
    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        print('train fold', i, end=' ')
        x_train_kf, x_test_kf = x_train.loc[train_index,:].copy(), x_train.loc[test_index,:].copy()
        y_train_kf, y_test_kf = y_train[train_index], y_train[test_index]

        d_train_kf = lgb.Dataset(x_train_kf, label=y_train_kf)
        d_test_kf = lgb.Dataset(x_test_kf, label=y_test_kf)
        
        train_fold_pred = 0
        
        feval = None
        if use_auc:
            feval=eval_auc
        for seed in range(nseed):
            param['seed'] = seed
            

            bst = lgb.train(param, d_train_kf, num_boost_round=best_round,
                                verbose_eval=None,
                                early_stopping_rounds=None, 
                               )
                
            bsts.append(bst)
            pred = bst.predict(x_test_kf)
            val_pred[test_index, seed] = (pred)
            pred = bst.predict(x_test)
            test_pred[:, i*nseed + seed] = (pred)
            train_fold_pred = train_fold_pred + bst.predict(x_train_kf)
            print('.', end='')
            
        train_fold_pred /= nseed
        print('train auc: %0.5f' % fast_auc(y_train_kf, train_fold_pred), end=' ')
        val_fold_pred = np.mean(val_pred[test_index,:], axis=1)  
        print('val auc: %0.5f' % fast_auc(y_test_kf, val_fold_pred))
        del d_train_kf
        del d_test_kf
        gc.collect()
        if featimp:
            importances = get_importances(bsts)
            plt.show()
        else:
            importances = None
        if use_all_folds == False:
            break
    
    
    print('saving raw train prediction to:', fname+'_train.npy')
    np.save(fname+'_train.npy', val_pred)
    print('saving raw test prediction to:', fname+'_test.npy')
    np.save(fname+'_test.npy', test_pred)
    
    epsilon = 1e-6

    train_expit = (val_pred + epsilon - val_pred.ravel().min())
    test_expit = (test_pred + epsilon - test_pred.ravel().min())
    train_expit /= (epsilon + train_expit.ravel().max())
    test_expit /= (epsilon + test_expit.ravel().max())
    
    train_pred_mean = np.mean(train_expit, axis=1)
    test_pred_mean = np.mean(test_expit, axis=1)        
    print('cv       mean auc:%0.5f' % fast_auc(y_train, train_pred_mean))

    if featimp:
        importances = get_importances(bsts)
    else:
        importances = None

    return importances, val_pred, test_pred, bsts, fname


In [5]:
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

features = train.columns[2:]

x_train = train[features].copy()
y_train = train.target.values
x_test = test[features].copy()


In [6]:
public_lb = np.load('../data/public_LB.npy')
private_lb = np.load('../data/private_LB.npy')

real_test = np.concatenate((public_lb, private_lb))
real_test.shape

real_test = x_test.iloc[real_test]

In [7]:
x_train.shape, y_train.shape, x_test.shape, real_test.shape

((200000, 200), (200000,), (200000, 200), (100000, 200))

In [8]:
count_cols = []

for c in tqdm_notebook(features[:200]):
    count_col = c+'_count'
    tmp = pd.concat((x_train[c], real_test[c])).value_counts()
    x_train[count_col] = x_train[c].map(tmp).clip(0, 6)
    x_test[count_col] = x_test[c].map(tmp).clip(0, 6).fillna(1)
    count_cols.append(count_col)
 

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [29]:
params =  {
    'boosting_type': 'gbdt',
  "objective"                  : "binary",
  "learning_rate"              : 0.01,
  "num_leaves"                 : 3,
    'feature_fraction':1,
  "bagging_fraction"           : 0.8,
  "bagging_freq"               : 1,
               #'min_data_in_leaf' : 80,
               #'min_sum_hessian_in_leaf' : 10.0,
     'nthread'                  : 20,
    'bin_construct_sample_cnt' : 1000000,
    'max_depth':2,
    'lambda_l2':2,
}

num_boost_round = 10000
verbose_eval=1000
early_stopping_rounds = 200

res = []

for c, count in zip(tqdm_notebook(features), count_cols): #[('var_28', 'var_28_count')]: #
    
    fname_var = '_'.join([fname, c])
    print(c)
    x_train_1 = x_train[[c, count]].copy()
    y_train = train.target.values
    x_test_1 = x_test[[c, count]].copy()


    for seed in range(1):
        print('***** seed %d *****' % seed)
        kf = StratifiedKFold(5, shuffle=True, random_state=seed)

        (importances_reglinear, train_pred_reglinear, test_pred_reglinear, bsts, fname_save
        ) = cross_validate_lgb(params, x_train_1, y_train, x_test_1, kf,  
                           num_boost_round, 
                           fname_var, nseed=1, verbose_eval=None,
                           target='target', featimp=False, save_pred=True,
                           use_auc=False,
                           early_stopping_rounds=early_stopping_rounds,
                               use_all_folds=True,
                           )
        res.append(fname_save)
    

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))

var_0
***** seed 0 *****
started at: 2019-03-27-09-04-50
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 800
train fold 0 .train auc: 0.55550 val auc: 0.54871
train fold 1 .train auc: 0.55639 val auc: 0.54207
train fold 2 .train auc: 0.55611 val auc: 0.55044
train fold 3 .train auc: 0.55640 val auc: 0.54868
train fold 4 .train auc: 0.55603 val auc: 0.54896
saving raw train prediction to: ../submissions/lgb_070_var_0_2019-03-27-09-04-50_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_0_2019-03-27-09-04-50_test.npy
cv       mean auc:0.54739
var_1
***** seed 0 *****
started at: 2019-03-27-09-05-09
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 1104
train fold 0 .train auc: 0.55262 val auc: 0.54938
train fold 1 .train auc: 0.55393 val auc: 0.54464
train fold 2 .train auc: 0.55444 val auc: 0.54258
train fold 3 .train auc: 0.55329 val auc: 0.54711
train fold 4 .train auc: 0.55310 val auc: 0.54757
saving ra

best_round: 112
train fold 0 .train auc: 0.51281 val auc: 0.50569
train fold 1 .train auc: 0.51529 val auc: 0.50765
train fold 2 .train auc: 0.51420 val auc: 0.50166
train fold 3 .train auc: 0.51456 val auc: 0.49848
train fold 4 .train auc: 0.51453 val auc: 0.50846
saving raw train prediction to: ../submissions/lgb_070_var_14_2019-03-27-09-08-04_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_14_2019-03-27-09-08-04_test.npy
cv       mean auc:0.50412
var_15
***** seed 0 *****
started at: 2019-03-27-09-08-09
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 258
train fold 0 .train auc: 0.52301 val auc: 0.51000
train fold 1 .train auc: 0.52021 val auc: 0.52074
train fold 2 .train auc: 0.52060 val auc: 0.51611
train fold 3 .train auc: 0.52160 val auc: 0.51314
train fold 4 .train auc: 0.52095 val auc: 0.51156
saving raw train prediction to: ../submissions/lgb_070_var_15_2019-03-27-09-08-09_train.npy
saving raw test prediction to: ../submiss

cv       mean auc:0.49857
var_28
***** seed 0 *****
started at: 2019-03-27-09-10-45
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 351
train fold 0 .train auc: 0.52904 val auc: 0.52314
train fold 1 .train auc: 0.53239 val auc: 0.51588
train fold 2 .train auc: 0.52947 val auc: 0.52791
train fold 3 .train auc: 0.53015 val auc: 0.51980
train fold 4 .train auc: 0.52975 val auc: 0.52267
saving raw train prediction to: ../submissions/lgb_070_var_28_2019-03-27-09-10-45_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_28_2019-03-27-09-10-45_test.npy
cv       mean auc:0.52083
var_29
***** seed 0 *****
started at: 2019-03-27-09-10-54
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 30
train fold 0 .train auc: 0.50858 val auc: 0.50116
train fold 1 .train auc: 0.50880 val auc: 0.49635
train fold 2 .train auc: 0.50782 val auc: 0.50033
train fold 3 .train auc: 0.50807 val auc: 0.50374
train fold 4 .train auc: 0.5106

best_round: 91
train fold 0 .train auc: 0.51397 val auc: 0.50565
train fold 1 .train auc: 0.51165 val auc: 0.50606
train fold 2 .train auc: 0.51499 val auc: 0.50380
train fold 3 .train auc: 0.51472 val auc: 0.50137
train fold 4 .train auc: 0.51316 val auc: 0.50665
saving raw train prediction to: ../submissions/lgb_070_var_42_2019-03-27-09-13-04_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_42_2019-03-27-09-13-04_test.npy
cv       mean auc:0.50347
var_43
***** seed 0 *****
started at: 2019-03-27-09-13-08
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 240
train fold 0 .train auc: 0.53173 val auc: 0.51621
train fold 1 .train auc: 0.52930 val auc: 0.52739
train fold 2 .train auc: 0.52965 val auc: 0.52270
train fold 3 .train auc: 0.52816 val auc: 0.53040
train fold 4 .train auc: 0.52955 val auc: 0.52585
saving raw train prediction to: ../submissions/lgb_070_var_43_2019-03-27-09-13-08_train.npy
saving raw test prediction to: ../submissi

best_round: 621
train fold 0 .train auc: 0.53528 val auc: 0.52202
train fold 1 .train auc: 0.53546 val auc: 0.52763
train fold 2 .train auc: 0.53396 val auc: 0.53112
train fold 3 .train auc: 0.53490 val auc: 0.52796
train fold 4 .train auc: 0.53337 val auc: 0.53482
saving raw train prediction to: ../submissions/lgb_070_var_56_2019-03-27-09-15-49_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_56_2019-03-27-09-15-49_test.npy
cv       mean auc:0.52816
var_57
***** seed 0 *****
started at: 2019-03-27-09-16-04
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 224
train fold 0 .train auc: 0.52041 val auc: 0.51235
train fold 1 .train auc: 0.51812 val auc: 0.51797
train fold 2 .train auc: 0.52162 val auc: 0.50517
train fold 3 .train auc: 0.52072 val auc: 0.51100
train fold 4 .train auc: 0.51896 val auc: 0.51742
saving raw train prediction to: ../submissions/lgb_070_var_57_2019-03-27-09-16-04_train.npy
saving raw test prediction to: ../submiss

best_round: 527
train fold 0 .train auc: 0.53358 val auc: 0.52582
train fold 1 .train auc: 0.53238 val auc: 0.53338
train fold 2 .train auc: 0.53426 val auc: 0.52210
train fold 3 .train auc: 0.53416 val auc: 0.52105
train fold 4 .train auc: 0.53397 val auc: 0.52664
saving raw train prediction to: ../submissions/lgb_070_var_70_2019-03-27-09-18-16_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_70_2019-03-27-09-18-16_test.npy
cv       mean auc:0.52479
var_71
***** seed 0 *****
started at: 2019-03-27-09-18-29
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 434
train fold 0 .train auc: 0.53548 val auc: 0.52933
train fold 1 .train auc: 0.53541 val auc: 0.53283
train fold 2 .train auc: 0.53519 val auc: 0.53126
train fold 3 .train auc: 0.53479 val auc: 0.53276
train fold 4 .train auc: 0.53653 val auc: 0.52688
saving raw train prediction to: ../submissions/lgb_070_var_71_2019-03-27-09-18-29_train.npy
saving raw test prediction to: ../submiss

best_round: 203
train fold 0 .train auc: 0.51785 val auc: 0.51181
train fold 1 .train auc: 0.51922 val auc: 0.50564
train fold 2 .train auc: 0.51660 val auc: 0.51756
train fold 3 .train auc: 0.51718 val auc: 0.50929
train fold 4 .train auc: 0.51777 val auc: 0.50296
saving raw train prediction to: ../submissions/lgb_070_var_84_2019-03-27-09-21-56_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_84_2019-03-27-09-21-56_test.npy
cv       mean auc:0.50871
var_85
***** seed 0 *****
started at: 2019-03-27-09-22-03
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 454
train fold 0 .train auc: 0.52808 val auc: 0.52467
train fold 1 .train auc: 0.53008 val auc: 0.52008
train fold 2 .train auc: 0.52994 val auc: 0.51867
train fold 3 .train auc: 0.52898 val auc: 0.52394
train fold 4 .train auc: 0.52802 val auc: 0.52457
saving raw train prediction to: ../submissions/lgb_070_var_85_2019-03-27-09-22-03_train.npy
saving raw test prediction to: ../submiss

best_round: 56
train fold 0 .train auc: 0.51471 val auc: 0.49663
train fold 1 .train auc: 0.51020 val auc: 0.49710
train fold 2 .train auc: 0.51222 val auc: 0.50549
train fold 3 .train auc: 0.51119 val auc: 0.50912
train fold 4 .train auc: 0.51338 val auc: 0.49552
saving raw train prediction to: ../submissions/lgb_070_var_98_2019-03-27-09-24-53_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_98_2019-03-27-09-24-53_test.npy
cv       mean auc:0.49984
var_99
***** seed 0 *****
started at: 2019-03-27-09-24-57
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 487
train fold 0 .train auc: 0.55768 val auc: 0.54648
train fold 1 .train auc: 0.55584 val auc: 0.55146
train fold 2 .train auc: 0.55501 val auc: 0.55756
train fold 3 .train auc: 0.55698 val auc: 0.54813
train fold 4 .train auc: 0.55520 val auc: 0.55518
saving raw train prediction to: ../submissions/lgb_070_var_99_2019-03-27-09-24-57_train.npy
saving raw test prediction to: ../submissi

best_round: 770
train fold 0 .train auc: 0.53528 val auc: 0.52944
train fold 1 .train auc: 0.53478 val auc: 0.52978
train fold 2 .train auc: 0.53644 val auc: 0.52794
train fold 3 .train auc: 0.53661 val auc: 0.52779
train fold 4 .train auc: 0.53574 val auc: 0.53224
saving raw train prediction to: ../submissions/lgb_070_var_112_2019-03-27-09-27-45_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_112_2019-03-27-09-27-45_test.npy
cv       mean auc:0.52894
var_113
***** seed 0 *****
started at: 2019-03-27-09-28-03
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 420
train fold 0 .train auc: 0.52049 val auc: 0.51754
train fold 1 .train auc: 0.52095 val auc: 0.51014
train fold 2 .train auc: 0.52197 val auc: 0.50747
train fold 3 .train auc: 0.52133 val auc: 0.51250
train fold 4 .train auc: 0.52025 val auc: 0.51251
saving raw train prediction to: ../submissions/lgb_070_var_113_2019-03-27-09-28-03_train.npy
saving raw test prediction to: ../sub

best_round: 16
train fold 0 .train auc: 0.50918 val auc: 0.50072
train fold 1 .train auc: 0.50631 val auc: 0.50165
train fold 2 .train auc: 0.50717 val auc: 0.50389
train fold 3 .train auc: 0.51021 val auc: 0.50189
train fold 4 .train auc: 0.50881 val auc: 0.50109
saving raw train prediction to: ../submissions/lgb_070_var_126_2019-03-27-09-31-01_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_126_2019-03-27-09-31-01_test.npy
cv       mean auc:0.50186
var_127
***** seed 0 *****
started at: 2019-03-27-09-31-05
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 897
train fold 0 .train auc: 0.54419 val auc: 0.54676
train fold 1 .train auc: 0.54716 val auc: 0.53678
train fold 2 .train auc: 0.54786 val auc: 0.53202
train fold 3 .train auc: 0.54476 val auc: 0.54189
train fold 4 .train auc: 0.54585 val auc: 0.54006
saving raw train prediction to: ../submissions/lgb_070_var_127_2019-03-27-09-31-05_train.npy
saving raw test prediction to: ../subm

best_round: 153
train fold 0 .train auc: 0.51774 val auc: 0.51545
train fold 1 .train auc: 0.51827 val auc: 0.50687
train fold 2 .train auc: 0.52034 val auc: 0.50640
train fold 3 .train auc: 0.51801 val auc: 0.51574
train fold 4 .train auc: 0.51872 val auc: 0.51255
saving raw train prediction to: ../submissions/lgb_070_var_140_2019-03-27-09-33-57_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_140_2019-03-27-09-33-57_test.npy
cv       mean auc:0.51096
var_141
***** seed 0 *****
started at: 2019-03-27-09-34-03
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 949
train fold 0 .train auc: 0.53751 val auc: 0.53410
train fold 1 .train auc: 0.53771 val auc: 0.52956
train fold 2 .train auc: 0.53851 val auc: 0.53013
train fold 3 .train auc: 0.53894 val auc: 0.52817
train fold 4 .train auc: 0.53763 val auc: 0.53002
saving raw train prediction to: ../submissions/lgb_070_var_141_2019-03-27-09-34-03_train.npy
saving raw test prediction to: ../sub

best_round: 786
train fold 0 .train auc: 0.55096 val auc: 0.54007
train fold 1 .train auc: 0.55056 val auc: 0.53977
train fold 2 .train auc: 0.54868 val auc: 0.54957
train fold 3 .train auc: 0.54808 val auc: 0.55160
train fold 4 .train auc: 0.55066 val auc: 0.53987
saving raw train prediction to: ../submissions/lgb_070_var_154_2019-03-27-09-38-02_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_154_2019-03-27-09-38-02_test.npy
cv       mean auc:0.54361
var_155
***** seed 0 *****
started at: 2019-03-27-09-38-27
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 584
train fold 0 .train auc: 0.53677 val auc: 0.53905
train fold 1 .train auc: 0.53721 val auc: 0.53617
train fold 2 .train auc: 0.54045 val auc: 0.52406
train fold 3 .train auc: 0.53966 val auc: 0.52612
train fold 4 .train auc: 0.53719 val auc: 0.53852
saving raw train prediction to: ../submissions/lgb_070_var_155_2019-03-27-09-38-27_train.npy
saving raw test prediction to: ../sub

best_round: 290
train fold 0 .train auc: 0.52305 val auc: 0.51608
train fold 1 .train auc: 0.52283 val auc: 0.51324
train fold 2 .train auc: 0.52106 val auc: 0.51404
train fold 3 .train auc: 0.52330 val auc: 0.50950
train fold 4 .train auc: 0.52419 val auc: 0.51096
saving raw train prediction to: ../submissions/lgb_070_var_168_2019-03-27-09-42-02_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_168_2019-03-27-09-42-02_test.npy
cv       mean auc:0.51217
var_169
***** seed 0 *****
started at: 2019-03-27-09-42-14
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 607
train fold 0 .train auc: 0.55418 val auc: 0.53678
train fold 1 .train auc: 0.55250 val auc: 0.54561
train fold 2 .train auc: 0.55120 val auc: 0.55069
train fold 3 .train auc: 0.55018 val auc: 0.55643
train fold 4 .train auc: 0.55334 val auc: 0.54565
saving raw train prediction to: ../submissions/lgb_070_var_169_2019-03-27-09-42-14_train.npy
saving raw test prediction to: ../sub

best_round: 316
train fold 0 .train auc: 0.51634 val auc: 0.49889
train fold 1 .train auc: 0.51483 val auc: 0.50751
train fold 2 .train auc: 0.51630 val auc: 0.50059
train fold 3 .train auc: 0.51563 val auc: 0.50608
train fold 4 .train auc: 0.51679 val auc: 0.50521
saving raw train prediction to: ../submissions/lgb_070_var_182_2019-03-27-09-46-41_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_182_2019-03-27-09-46-41_test.npy
cv       mean auc:0.50280
var_183
***** seed 0 *****
started at: 2019-03-27-09-46-53
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 69
train fold 0 .train auc: 0.51098 val auc: 0.49926
train fold 1 .train auc: 0.50944 val auc: 0.49821
train fold 2 .train auc: 0.51165 val auc: 0.50088
train fold 3 .train auc: 0.51406 val auc: 0.50251
train fold 4 .train auc: 0.51078 val auc: 0.50260
saving raw train prediction to: ../submissions/lgb_070_var_183_2019-03-27-09-46-53_train.npy
saving raw test prediction to: ../subm

best_round: 793
train fold 0 .train auc: 0.53221 val auc: 0.51372
train fold 1 .train auc: 0.53056 val auc: 0.52543
train fold 2 .train auc: 0.52900 val auc: 0.52710
train fold 3 .train auc: 0.53210 val auc: 0.51752
train fold 4 .train auc: 0.52999 val auc: 0.52296
saving raw train prediction to: ../submissions/lgb_070_var_196_2019-03-27-09-50-36_train.npy
saving raw test prediction to: ../submissions/lgb_070_var_196_2019-03-27-09-50-36_test.npy
cv       mean auc:0.52057
var_197
***** seed 0 *****
started at: 2019-03-27-09-51-01
num bagging seeds: 1
(200000, 2) (200000, 2)
nfold 5
num features: 2
best_round: 986
train fold 0 .train auc: 0.53910 val auc: 0.52691
train fold 1 .train auc: 0.53878 val auc: 0.53362
train fold 2 .train auc: 0.53832 val auc: 0.53145
train fold 3 .train auc: 0.53721 val auc: 0.53372
train fold 4 .train auc: 0.53960 val auc: 0.52371
saving raw train prediction to: ../submissions/lgb_070_var_197_2019-03-27-09-51-01_train.npy
saving raw test prediction to: ../sub

In [31]:
train_pred = pd.DataFrame()
for  col, f in zip(features, res):
    train_pred[col] = np.load('%s_train.npy' % f).ravel()

In [32]:
test_pred = pd.DataFrame()
for  col, f in zip(features, res):
    test_pred[col] = np.load('%s_test.npy' % f).mean(axis=1)

In [35]:
fast_auc(y_train, np.log(train_pred).sum(axis=1) -np.log(1 - train_pred).sum(axis=1))

0.9232452918532014

In [38]:
fname = 'lgb_070'

sub = train[['ID_code']].copy()
sub['target'] = np.log(train_pred).sum(axis=1) -np.log(1 - train_pred).sum(axis=1)
sub.target = sub.target.rank() / 200000
print('cv       mean auc:%0.5f' % fast_auc(y_train, sub.target))

print('saving train pred to', '../submissions/%s_train.csv' % fname)
sub.to_csv('../submissions/%s_train.csv' % fname, index=False, float_format='%.6f')

sub = test[['ID_code']].copy()
sub['target'] = np.log(test_pred).sum(axis=1) -np.log(1 - test_pred).sum(axis=1)
sub.target = sub.target.rank() / 200000
print('saving test pred to', '../submissions/%s_test.csv' % fname)
sub.to_csv('../submissions/%s_test.csv' % fname, index=False, float_format='%.6f')

sub.head()

cv       mean auc:0.92325
saving train pred to ../submissions/lgb_070_train.csv
saving test pred to ../submissions/lgb_070_test.csv


Unnamed: 0,ID_code,target
0,test_0,0.789285
1,test_1,0.895385
2,test_2,0.890305
3,test_3,0.8654
4,test_4,0.785815


In [16]:
(np.log(train_pred).sum(axis=1) -np.log(1 - train_pred).sum(axis=1)).head()

0   -440.586297
1   -436.475481
2   -441.254642
3   -437.622267
4   -438.854418
dtype: float64