In [49]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import percentileofscore
import sys
from tqdm import tqdm_notebook
sys.path.append('../')

%matplotlib inline

In [103]:
# Base models that feed the stacker, named after their ../workdir sub-directory.
# NOTE(review): the recorded outputs further down show eight OOF columns and do
# not include '6CV_Final_FEATURE_SET', so this list looks newer than the last
# full run -- re-run the notebook before trusting those numbers.
baseModelNames = [
    '0824_new_agg_Final',
    'CV10-0821',
    'CV10-0827',
    'aws_stacking_base_lgb_1',
    'aws_stacking_base_lgb_4',
    'aws_stacking_base_lgb_9',
    'aws_stacking_base_lgb_10',
    'XGB-5-CV',
    '6CV_Final_FEATURE_SET',
]

# Candidates currently switched off (move a name into the list to enable it):
#     'LB0.802', 'allParamsTuned', 'withAggFeatsIn', 'PrunedFeatureModel',
#     'WithPosScaleWeight_eq_5', '1150FEATS_LGB', '1050FEATS_LGB',
#     '1400FEATS_LGB', 'aws_stacking_base_lgb_7'

In [104]:
def get_data(modelNames):
    """Collect out-of-fold and test-set predictions for each named base model.

    For every name, ``../workdir/<name>/kfold_model.pkl`` is unpickled and its
    ``oof_preds_`` attribute is taken as the out-of-fold prediction, while the
    ``TARGET`` column of ``../workdir/<name>/preds.csv`` is taken as the
    test-set prediction.

    Parameters
    ----------
    modelNames : iterable of str
        Sub-directory names under ``../workdir``.

    Returns
    -------
    (dict, dict)
        ``oof`` maps ``<name>_oof`` to the unpickled object's ``oof_preds_``;
        ``preds`` maps ``<name>_preds`` to the flattened ``TARGET`` values.
    """
    oof, preds = {}, {}
    for name in tqdm_notebook(modelNames):
        model_dir = '../workdir/' + name

        # SECURITY: unpickling runs arbitrary code -- only load model files
        # that were produced by this project.
        with open(model_dir + '/kfold_model.pkl', 'rb') as fh:
            fitted = pickle.load(fh)
        oof_values = fitted.oof_preds_

        submission = pd.read_csv(model_dir + '/preds.csv', usecols=['TARGET'])
        test_values = submission.values.ravel()

        oof[name + '_oof'] = oof_values
        preds[name + '_preds'] = test_values

    return oof, preds

# Load OOF and test predictions for every name in baseModelNames, then turn
# each dict into a DataFrame with one column per base model.
oof, preds = get_data(baseModelNames)
df = pd.DataFrame(oof)
predDf = pd.DataFrame(preds)

A Jupyter Widget






In [105]:
# Attach SK_ID_CURR and TARGET from the training file. The join uses the
# default integer index, so it assumes the OOF rows are in the same order as
# train_df.csv -- TODO confirm.
# NOTE(review): this rebinds `df`; a second run of the cell would join columns
# that are already present (pandas rejects overlapping columns unless a suffix
# is given).
df = df.join(pd.read_csv('../input/train_df.csv', usecols=['SK_ID_CURR', 'TARGET']))
# Per-model OOF columns, recognised by the '_oof' suffix in their name.
oof_cols = [col for col in df.columns if '_oof' in col]
# df[oof_cols] = df[oof_cols].apply(lambda x: x.rank() / len(x))
# Row-wise standard deviation across the base models' OOF predictions.
df['std'] = df[oof_cols].std(axis=1)

In [106]:
# Attach SK_ID_CURR from the sample submission. The join uses the default
# integer index, so it assumes the prediction rows are in the same order as
# sample_submission.csv -- TODO confirm.
# NOTE(review): this rebinds `predDf`; a second run of the cell would join a
# column that is already present.
predDf = predDf.join(pd.read_csv('../input/sample_submission.csv', usecols=['SK_ID_CURR']))
# Per-model test prediction columns, recognised by the '_preds' suffix.
pred_cols = [col for col in predDf.columns if '_preds' in col]
# Row-wise standard deviation across the base models' test predictions.
predDf['std'] = predDf[pred_cols].std(axis=1)

<font size=5> OOF AUC </font>

In [107]:
# ROC AUC of each base model's OOF predictions against TARGET, one line per
# model (name left-aligned in 30 characters, score with six decimals).
for col in oof_cols:
    auc = roc_auc_score(df['TARGET'], df[col])
    print('%-30s    %.6f' % (col, auc))

0824_new_agg_Final_oof            0.798611
CV10-0821_oof                     0.799078
CV10-0827_oof                     0.799363
aws_stacking_base_lgb_1_oof       0.797604
aws_stacking_base_lgb_4_oof       0.797043
aws_stacking_base_lgb_9_oof       0.797162
aws_stacking_base_lgb_10_oof      0.797106
XGB-5-CV_oof                      0.797479


<font size=5> Correlation </font>

In [108]:
# Summary statistics of every base model's OOF column.
df[oof_cols].describe()

Unnamed: 0,0824_new_agg_Final_oof,CV10-0821_oof,CV10-0827_oof,aws_stacking_base_lgb_1_oof,aws_stacking_base_lgb_4_oof,aws_stacking_base_lgb_9_oof,aws_stacking_base_lgb_10_oof,XGB-5-CV_oof
count,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0
mean,0.078994,0.079124,0.078928,0.078976,0.078696,0.078761,0.078649,0.078925
std,0.09551,0.095313,0.095482,0.095241,0.094969,0.09514,0.094419,0.095507
min,0.001302,0.001485,0.001443,0.001327,0.001318,0.001326,0.001616,0.000883
25%,0.021344,0.02151,0.021448,0.021571,0.021387,0.021524,0.022033,0.021387
50%,0.043726,0.043933,0.043679,0.043812,0.043636,0.043784,0.044037,0.044113
75%,0.095522,0.095955,0.095516,0.095455,0.095264,0.095251,0.094804,0.095769
max,0.883344,0.900749,0.900269,0.894087,0.897669,0.89328,0.894229,0.913474


In [131]:
# Sum of TARGET -- the number of positive rows, assuming TARGET is 0/1.
df['TARGET'].sum()

24825.0

In [138]:
# 8th-percentile value of the CV10-0827 OOF predictions.
df['CV10-0827_oof'].quantile(.08)

0.010902984270474817

In [141]:
# Rows of `low_risk` whose TARGET is 1.
# NOTE(review): `low_risk` is assigned in a cell that appears further down
# (In [139]); on a fresh top-to-bottom run this cell raises NameError.
low_risk[low_risk['TARGET'] == 1]

Unnamed: 0,0824_new_agg_Final_oof,CV10-0821_oof,CV10-0827_oof,aws_stacking_base_lgb_1_oof,aws_stacking_base_lgb_4_oof,aws_stacking_base_lgb_9_oof,aws_stacking_base_lgb_10_oof,XGB-5-CV_oof,SK_ID_CURR,TARGET,std
246,0.038260,0.029567,0.035721,0.028824,0.041763,0.046673,0.037448,0.027764,100286,1.0,0.006704
550,0.026679,0.029402,0.026147,0.027660,0.028216,0.021893,0.024164,0.021548,100636,1.0,0.002906
624,0.008623,0.011717,0.012074,0.012445,0.011544,0.012011,0.015299,0.016005,100714,1.0,0.002302
788,0.040283,0.043899,0.034584,0.049222,0.032060,0.044386,0.048571,0.043703,100902,1.0,0.006144
1007,0.019165,0.019714,0.022201,0.019472,0.022979,0.018866,0.020505,0.018353,101165,1.0,0.001640
1137,0.022729,0.022225,0.021292,0.024974,0.021819,0.024503,0.021139,0.021494,101330,1.0,0.001466
1312,0.006111,0.006406,0.006592,0.008359,0.007362,0.005699,0.005950,0.006853,101541,1.0,0.000863
1507,0.007826,0.007464,0.006623,0.006939,0.007253,0.006641,0.007357,0.004772,101772,1.0,0.000939
1644,0.008984,0.016274,0.016658,0.012952,0.018164,0.018133,0.016900,0.015796,101929,1.0,0.003093
1699,0.041052,0.042687,0.037669,0.049756,0.063213,0.051235,0.040259,0.031162,101988,1.0,0.009862


In [139]:
# Rows whose CV10-0827 OOF prediction is below 0.04.
# NOTE(review): 0.04 is a hand-picked threshold; consider a named constant.
low_risk = df[df['CV10-0827_oof'] < 0.04]

In [147]:
# Rows whose CV10-0827 OOF prediction is above 0.22.
# NOTE(review): 0.22 is a hand-picked threshold; consider a named constant.
high_risk = df[df['CV10-0827_oof'] > 0.22]

In [148]:
# Number of high_risk rows per TARGET value.
high_risk.groupby(['TARGET']).size()

TARGET
0.0    16025
1.0     8323
dtype: int64

In [163]:
# NOTE(review): displays the whole frame; df.head() is enough for a preview.
# The recorded output is indexed by SK_ID_CURR, i.e. it was produced after the
# set_index cell that appears further down (In [162]).
df

Unnamed: 0_level_0,0824_new_agg_Final_oof,CV10-0821_oof,CV10-0827_oof,aws_stacking_base_lgb_1_oof,aws_stacking_base_lgb_4_oof,aws_stacking_base_lgb_9_oof,aws_stacking_base_lgb_10_oof,XGB-5-CV_oof,TARGET,std
SK_ID_CURR,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
100002,0.210298,0.205311,0.167970,0.176746,0.215443,0.174997,0.169135,0.133222,1.0,0.027464
100003,0.017592,0.014610,0.019369,0.013991,0.015109,0.013443,0.013278,0.020602,0.0,0.002823
100004,0.027966,0.024537,0.035159,0.030465,0.021895,0.027267,0.030108,0.032351,0.0,0.004249
100006,0.061338,0.087284,0.062734,0.074991,0.078352,0.083214,0.083166,0.069739,0.0,0.009718
100007,0.061688,0.043275,0.069017,0.076048,0.076391,0.070453,0.079255,0.065116,0.0,0.011510
100008,0.044475,0.038145,0.044428,0.045779,0.043143,0.043349,0.051762,0.052328,0.0,0.004665
100009,0.004582,0.004194,0.004712,0.004342,0.004491,0.003856,0.005056,0.003027,0.0,0.000619
100010,0.014206,0.019453,0.017051,0.015389,0.011798,0.015122,0.016749,0.011203,0.0,0.002737
100011,0.075978,0.056020,0.046456,0.061817,0.060444,0.057441,0.050031,0.063852,0.0,0.009031
100012,0.050442,0.035514,0.042210,0.040204,0.047586,0.045780,0.044687,0.047944,0.0,0.004818


In [170]:
# NOTE(review): `low_risk_bad_man` is assigned in a later cell (In [171]), so
# this display cell depends on out-of-order execution. The recorded output
# also lists ids such as 100003, which the `df` preview shows with TARGET 0.0,
# so it appears to predate the TARGET == 1 definition -- confirm and re-run.
low_risk_bad_man

1         100003
2         100004
6         100009
7         100010
10        100014
11        100015
12        100016
13        100017
14        100018
17        100021
18        100022
23        100027
24        100029
25        100030
29        100034
32        100037
35        100041
36        100043
37        100044
38        100045
39        100046
43        100050
47        100054
48        100055
50        100058
52        100060
53        100061
54        100062
58        100069
60        100071
           ...  
307440    456182
307441    456183
307443    456185
307448    456191
307452    456195
307454    456197
307456    456199
307458    456201
307459    456203
307460    456204
307461    456205
307462    456206
307463    456207
307464    456208
307473    456217
307478    456226
307479    456227
307481    456229
307482    456230
307484    456232
307487    456235
307489    456237
307490    456238
307491    456239
307495    456243
307496    456244
307498    456246
307500    4562

In [162]:
# Index the frame by SK_ID_CURR so later cells can select rows by applicant id
# with df.loc[...].
# NOTE(review): in-place and not idempotent -- once SK_ID_CURR is the index the
# column is gone, so a second run of this cell fails.
df.set_index('SK_ID_CURR', inplace=True)

In [173]:
# Per-row sample weights: 1 by default, 5 for the ids in low_risk_bad_man and
# high_risk_good_man (CV10-0827 score below 0.04 with TARGET 1, or above 0.22
# with TARGET 0, per the definitions of those two Series).
# Requires df to be indexed by SK_ID_CURR, because the ids are used as .loc
# row labels.
# NOTE(review): both id Series are assigned in cells that appear below this
# one (In [171], In [172]).
df['weight'] = 1
df.loc[low_risk_bad_man.values.ravel(), 'weight'] = 5
df.loc[high_risk_good_man.values.ravel(), 'weight'] = 5

In [176]:
# Persist the weights as a bare array. The ids are not stored, so consumers
# must use the same row order as df -- TODO confirm against the training code.
with open('../config/sample_weights_lst.pkl', 'wb') as f:
    pickle.dump(df['weight'].values, f)

In [181]:
# First 20000 rows of the training file, for a small debugging sample.
train_df_debug = pd.read_csv('../input/train_df.csv', nrows=20000)

In [None]:
# NOTE(review): never executed (In [None]) and the destination is an empty
# string, which is not a usable file path -- fill in the intended output file
# before running.
train_df_debug.to_csv('')

In [171]:
# SK_ID_CURR of low_risk rows whose TARGET is 1. Reads SK_ID_CURR as a column,
# so `low_risk` must have been sliced from df before df was re-indexed.
low_risk_bad_man = low_risk[low_risk['TARGET'] == 1]['SK_ID_CURR']

In [172]:
# SK_ID_CURR of high_risk rows whose TARGET is 0. Reads SK_ID_CURR as a column,
# so `high_risk` must have been sliced from df before df was re-indexed.
high_risk_good_man = high_risk[high_risk['TARGET'] == 0]['SK_ID_CURR']

In [117]:
# Distribution of the per-row disagreement (std) between base models.
df['std'].describe()

count    307507.000000
mean          0.009311
std           0.009642
min           0.000137
25%           0.002771
50%           0.005750
75%           0.012261
max           0.095233
Name: std, dtype: float64

In [109]:
# Pairwise correlation between the base models' OOF predictions.
df[oof_cols].corr()

Unnamed: 0,0824_new_agg_Final_oof,CV10-0821_oof,CV10-0827_oof,aws_stacking_base_lgb_1_oof,aws_stacking_base_lgb_4_oof,aws_stacking_base_lgb_9_oof,aws_stacking_base_lgb_10_oof,XGB-5-CV_oof
0824_new_agg_Final_oof,1.0,0.981866,0.980044,0.977545,0.972665,0.97412,0.974647,0.972598
CV10-0821_oof,0.981866,1.0,0.981068,0.981726,0.977862,0.978905,0.978856,0.973626
CV10-0827_oof,0.980044,0.981068,1.0,0.985328,0.981283,0.982709,0.982644,0.985936
aws_stacking_base_lgb_1_oof,0.977545,0.981726,0.985328,1.0,0.987072,0.98786,0.988867,0.977503
aws_stacking_base_lgb_4_oof,0.972665,0.977862,0.981283,0.987072,1.0,0.984955,0.984667,0.973275
aws_stacking_base_lgb_9_oof,0.97412,0.978905,0.982709,0.98786,0.984955,1.0,0.98637,0.975599
aws_stacking_base_lgb_10_oof,0.974647,0.978856,0.982644,0.988867,0.984667,0.98637,1.0,0.975993
XGB-5-CV_oof,0.972598,0.973626,0.985936,0.977503,0.973275,0.975599,0.975993,1.0


<font size=5> Start Stacking </font>

In [110]:
from lightgbm import LGBMClassifier

In [111]:
def train(X, y, model, cv=5, random_state=2018):
    """Fit one copy of `model` per fold of a stratified K-fold split.

    Each fold gets its own deep copy of `model`, which is fitted on the
    training part of the fold and then used to predict the positive-class
    probability of the held-out part. After the last fold the ROC AUC of the
    combined out-of-fold predictions is printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Target aligned row-for-row with ``X``; also selected with ``.iloc``.
    model : estimator
        Template exposing ``fit`` and ``predict_proba``. Only deep copies of it
        are fitted; the object passed in is left untouched.
    cv : int, default 5
        Number of folds.
    random_state : int, default 2018
        Seed of the shuffled fold split. The default reproduces the value that
        used to be hard-coded.

    Returns
    -------
    list
        The ``cv`` fitted copies, in fold order.
    """
    from copy import deepcopy

    fold_models = [deepcopy(model) for _ in range(cv)]
    splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    oof_preds = np.zeros((len(X),))

    for fold, (train_idx, dev_idx) in enumerate(splitter.split(X, y)):
        # Use a distinct name so the `model` argument is not rebound.
        fold_model = fold_models[fold]
        print('start train fold %s' % str(fold + 1))
        # LightGBM-specific fit kwargs (eval_set, eval_names, eval_metric='auc',
        # verbose=20, early_stopping_rounds=50) were commented out in the
        # original call and are therefore not passed here.
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        # Column 1 of predict_proba is taken as the positive-class probability.
        oof_preds[dev_idx] = fold_model.predict_proba(X.iloc[dev_idx])[:, 1]
        print('=' * 60)

    print("FULL AUC: ", roc_auc_score(y, oof_preds))
    return fold_models

# lgbClf = LGBMClassifier(n_estimators=250, learning_rate=0.01, reg_alpha=1)
# Stack with a default LogisticRegression over the base models' OOF columns;
# `models` holds the fitted per-fold copies returned by train().
models = train(df[oof_cols] , df['TARGET'], LogisticRegression())

start train fold 1
start train fold 2
start train fold 3
start train fold 4
start train fold 5
FULL AUC:  0.8002586939987644


In [112]:
# Coefficients of each fold's stacker, to see how the base models are weighted.
# Presumably one coefficient per column of oof_cols, in that order -- confirm.
for model in models:
    print(model.coef_)

[[ 2.26515779  2.57842904  1.90975781  0.48303102  0.31304016  0.284946
  -0.26249098  0.56277937]]
[[ 2.20675262  2.79784114  1.7581874   0.45571632  0.2504624   0.37178648
  -0.18541415  0.44971989]]
[[ 1.91377538  2.98586719  2.20742493  0.63714568  0.43199181  0.19910791
  -0.42061313  0.24338793]]
[[ 2.29983647  2.82105972  2.02093713  0.17550924  0.7359004   0.05534723
  -0.46840465  0.44056343]]
[[ 2.52130093  2.3003746   2.03512982  0.41161032  0.43621479  0.20947237
  -0.48564179  0.66807556]]


In [113]:
def pred(X, models):
    """Average the positive-class probability over a collection of models.

    Parameters
    ----------
    X : array-like
        Rows to score; must support ``len`` and be accepted by each model's
        ``predict_proba``.
    models : sequence
        Fitted models exposing ``predict_proba``; column 1 of its result is
        used.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(X)`` holding the mean of the models'
        column-1 probabilities.
    """
    averaged = np.zeros((len(X), ))
    n_models = len(models)
    for fitted in models:
        positive_proba = fitted.predict_proba(X)[:, 1]
        # Accumulate in place, exactly like ``averaged += positive_proba``.
        np.add(averaged, positive_proba, out=averaged)
    averaged /= n_models
    return averaged

# Score the test set with every fold's stacker and average the results.
# Assumes the columns of pred_cols line up with the oof_cols the stackers were
# fitted on (same models, same order) -- TODO confirm.
stackingPreds = pred(predDf[pred_cols], models)

<font size=5> Weighted average </font>

In [45]:
def avg_score(weights, df):
    """Blend out-of-fold columns of `df` as a weighted sum.

    Parameters
    ----------
    weights : dict
        Maps a base-model name to its weight. The column read for a name is
        ``name + '_oof'``.
    df : pandas.DataFrame
        Frame holding the ``*_oof`` columns.

    Returns
    -------
    pandas.Series or int
        Sum of ``df[name + '_oof'] * weight`` over the entries with a non-zero
        weight. The weights are used as given (no normalisation). If no entry
        has a non-zero weight the initial ``0`` is returned.

    Raises
    ------
    KeyError
        If a model with a non-zero weight has no ``*_oof`` column in `df`.
    """
    ret = 0
    for col, w in weights.items():
        if w == 0:
            # A zero weight contributes nothing, so the column is not looked
            # up; this lets the dict keep entries for models that are not
            # currently loaded without raising KeyError.
            continue
        ret += df[col + '_oof'] * w
    return ret
# Hand-picked blend weights, keyed by base-model name (avg_score appends
# '_oof' to find the column).
# NOTE(review): several keys ('LB0.802', 'allParamsTuned', 'withAggFeatsIn',
# '1150FEATS_LGB', 'aws_stacking_base_lgb_7') are commented out of
# baseModelNames, so their columns exist only if those models are loaded.
# The non-zero weights sum to 1.8, not 1 -- presumably harmless for AUC, which
# depends on ranking only; normalise before using the blend as a probability.
weights = {
    'LB0.802': 0, 
    'allParamsTuned': 0,
    'withAggFeatsIn': 0, 
    '0824_new_agg_Final':0, 
    'CV10-0821': 0.5,
    'CV10-0827': 0.4,
    '1150FEATS_LGB':0,
    'aws_stacking_base_lgb_1': 0.4,
    'aws_stacking_base_lgb_7': 0., 
    'aws_stacking_base_lgb_9': 0.,
    'aws_stacking_base_lgb_10': 0.5,
}
# OOF AUC of the weighted blend, for comparison with the stacker's FULL AUC.
avg_score_ = avg_score(weights, df)
roc_auc_score(df['TARGET'], avg_score_)

0.799630420065069

In [114]:
# Use the stacker's averaged test predictions as the submission TARGET.
predDf['TARGET'] = stackingPreds

In [115]:
# Write the submission (id + prediction, no index column) to the notebook's
# working directory.
predDf[['SK_ID_CURR', 'TARGET']].to_csv('logi_stacking_with_feature_selected.csv', index=False)