In [1]:
import gzip
import os
import pickle

import numpy as np
import pandas as pd

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn import preprocessing

In [2]:
# Widen pandas' console rendering so wide frames are not wrapped or truncated.
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 160)

In [3]:
# Directory holding the prediction files to be combined.
PD = '../data/out'

# Collect every file in PD except those whose name contains '_cvnan' or starts
# with 'l2_', sorted so the column order of the combined frame is stable.
# NOTE(review): presumably '_cvnan' marks runs without a valid CV score and
# 'l2_' marks outputs of this stacking level — confirm against the producers.
predict_paths = sorted(
    os.path.join(PD, fname)
    for fname in os.listdir(PD)
    if '_cvnan' not in fname and not fname.startswith('l2_')
)
predicts = [pd.read_csv(path) for path in predict_paths]
predict_paths

['../data/out/l1_1_ho_xgb_1_cv0.538182_std0.00519003.csv.gz',
 '../data/out/l1_1_ho_xgb_2_cv0.538129_std0.00516314.csv.gz',
 '../data/out/l1_1_keras_2_cv0.541153_std0.csv.gz',
 '../data/out/l1_3_ho_xgb_1_cv0.538187_std6.6891e-05.csv.gz',
 '../data/out/l1_3_ho_xgb_2_cv0.538055_std7.72378e-05.csv.gz',
 '../data/out/l1_3_ho_xgb_3_cv0.538288_std0.000183969.csv.gz',
 '../data/out/l1_3_keras_1_cv0.540691_std0.csv.gz',
 '../data/out/l1_3_keras_2_cv0.54326_std0.csv.gz',
 '../data/out/l1_4_et_1_cv0.542129_std3.44189e-05.csv.gz',
 '../data/out/l1_4_et_2_cv0.54197_std0.000176882.csv.gz',
 '../data/out/l1_4_et_2_cv0.5419_std0.000188937.csv.gz',
 '../data/out/l1_4_keras_1_cv0.541952_std0.csv.gz',
 '../data/out/l1_4_keras_2_cv0.543698_std0.csv.gz',
 '../data/out/l1_4_keras_3_cv0.542354_std0.csv.gz',
 '../data/out/l1_4_keras_4_cv0.541854_std0.csv.gz',
 '../data/out/l1_4_lgb_1_cv0.538081_std2.33345e-05.csv.gz',
 '../data/out/l1_4_rf_1_cv0.545454_std0.000162543.csv.gz',
 '../data/out/l1_4_rf_2_cv0.5435

In [4]:
# Take the row identifier and the train/test flag from the last loaded file;
# the copy keeps `df` independent of later edits to that frame.
df = predicts[-1][['id', 'train']].copy()
df.describe()

Unnamed: 0,id,train
count,100000.0,100000.0
mean,49999.5,0.7
std,28867.657797,0.45826
min,0.0,0.0
25%,24999.75,0.0
50%,49999.5,1.0
75%,74999.25,1.0
max,99999.0,1.0


In [5]:
# Remove the bookkeeping columns from every prediction frame (in place) and
# prefix the remaining columns with the file name up to '_cv', so each column
# records which file it came from.
# NOTE(review): files that share the same name up to '_cv' (e.g. the two
# 'l1_4_et_2_cv...' files) end up with identical column labels — confirm that
# duplicate labels are acceptable downstream.
for frame, path in zip(predicts, predict_paths):
    frame.drop(['id', 'train'], axis=1, inplace=True)
    fname = os.path.basename(path)
    tag = fname[:fname.index('_cv')] + '_'
    frame.columns = [tag + col for col in frame.columns]
predicts[-1].columns

Index(['l1_7_lgb_c_y', 'l1_7_lgb_c_lgb0', 'l1_7_lgb_c_lgb1', 'l1_7_lgb_c_lgb2', 'l1_7_lgb_c_lgb3', 'l1_7_lgb_c_lgb4'], dtype='object')

In [6]:
# Place the id/train frame and every per-file prediction frame side by side;
# pd.concat with axis=1 aligns the rows on the index.
df2 = pd.concat([df] + predicts, axis=1)

# Rows flagged train == 1 form the training set, all others the test set.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0; selecting with a
# boolean mask is label-based, so `.loc` is the direct replacement.
idx = df2.train == 1
train = df2.loc[idx].drop(['train', 'id'], axis=1)
test = df2.loc[~idx].drop(['train', 'id'], axis=1)

In [7]:
# Columns whose name ends in '_y' are pulled out of the feature matrices.
# NOTE(review): presumably each input file repeats the same target column, so
# the first one is used as `y` — confirm they are all identical.
to_drop = [col for col in train.columns if col.endswith('_y')]
y = train[to_drop[0]]

# Drop the '_y' columns from both splits in place so only predictions remain.
for frame in (train, test):
    frame.drop(to_drop, axis=1, inplace=True)

y.describe()

count    70000.000000
mean         0.499700
std          0.500003
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: l1_1_ho_xgb_1_y, dtype: float64

In [8]:
# Both splits should report the same number of columns.
tuple(part.shape for part in (train, test))

((70000, 176), (30000, 176))

In [9]:
# Baseline blend: log loss of the per-row arithmetic mean over all columns of `train`.
metrics.log_loss(
    y_true=y,
    y_pred=train.mean(axis='columns'),
)

0.5379076630921803

In [10]:
# Alternative blend: log loss of the per-row median over all columns of `train`.
metrics.log_loss(
    y_true=y,
    y_pred=train.median(axis='columns'),
)

0.53756568632270996

In [11]:
def gmean(t, eps=1e-5):
    """Row-wise (axis=1) geometric mean of probabilities, computed in log space.

    Parameters
    ----------
    t : 2-D array-like (ndarray or DataFrame)
        Values to average along axis 1. Anything outside ``[eps, 1 - eps]``
        is clipped to that interval first.
    eps : float, default 1e-5
        Clipping margin. It is also added before taking the log and
        subtracted again after the exponential.

    Returns
    -------
    The 1-D result of ``np.mean(..., axis=1)`` mapped back through ``exp``
    and clipped to ``[eps, 1 - eps]``.
    """
    lo, hi = eps, 1 - eps
    # Clip before the log so it never sees 0; the +eps shift is undone after
    # exp, so a row of identical in-range values maps back to that value.
    shifted_logs = np.log(np.clip(t, lo, hi) + eps)
    blended = np.exp(np.mean(shifted_logs, axis=1)) - eps
    # Clip again so the result stays inside [eps, 1 - eps].
    return np.clip(blended, lo, hi)
# Log loss of the geometric-mean blend, for comparison with the mean and median blends.
metrics.log_loss(
    y_true=y,
    y_pred=gmean(train),
)

0.54583072301255953

In [12]:
train.min()

l1_1_ho_xgb_1_p0    0.019664
l1_1_ho_xgb_2_p0    0.027440
l1_1_keras_2_p      0.000000
l1_3_ho_xgb_1_p0    0.030272
l1_3_ho_xgb_1_p1    0.027619
l1_3_ho_xgb_1_p2    0.031279
l1_3_ho_xgb_2_p0    0.029898
l1_3_ho_xgb_2_p1    0.029202
l1_3_ho_xgb_2_p2    0.027126
l1_3_ho_xgb_3_p0    0.027876
l1_3_ho_xgb_3_p1    0.021997
l1_3_ho_xgb_3_p2    0.028107
l1_3_keras_1_p      0.000000
l1_3_keras_2_p      0.000000
l1_4_et_1_p0        0.012897
l1_4_et_1_p1        0.013616
l1_4_et_1_p2        0.009871
l1_4_et_2_p0        0.013670
l1_4_et_2_p1        0.017805
l1_4_et_2_p2        0.020341
l1_4_et_2_p0        0.014988
l1_4_et_2_p1        0.015837
l1_4_et_2_p2        0.018169
l1_4_keras_1_p      0.000239
l1_4_keras_2_p      0.002715
l1_4_keras_3_p      0.025872
l1_4_keras_4_p      0.013974
l1_4_lgb_1_p0       0.025083
l1_4_lgb_1_p1       0.027616
l1_4_lgb_1_p2       0.022892
                      ...   
l1_7_lgb_5_lgb0     0.025567
l1_7_lgb_5_lgb1     0.033775
l1_7_lgb_5_lgb2     0.035172
l1_7_lgb_6_lgb