In [22]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import percentileofscore
import sys
from tqdm import tqdm_notebook
sys.path.append('../')

%matplotlib inline

In [30]:
# Base models whose out-of-fold / test predictions feed the stacker.
# Each active name is a sub-directory of '../workdir' read by get_data().
# Commented-out names are not loaded; uncomment one to include that model.
baseModelNames = [
#     'LB0.802',
#     'allParamsTuned',
    'withAggFeatsIn',
#     'PrunedFeatureModel',
#     '0824_new_agg_Final',
#     'WithPosScaleWeight_eq_5',
    'CV10-0821',
    'CV10-0827',
    '1150FEATS_LGB',
#     '1050FEATS_LGB',
    '1400FEATS_LGB',
#     'aws_stacking_base_lgb_1',
#     'aws_stacking_base_lgb_4',
#     'aws_stacking_base_lgb_7',
#     'aws_stacking_base_lgb_9',
#     'aws_stacking_base_lgb_10',
#     'XGB-5-CV',
    '6CV_Final_FEATURE_SET',
    '8CV_NO_POS_CC',
#     '8CV_NO_CC_POS_INSTAL',
    '8CV_NO_FINAL',
]

In [31]:
def get_data(modelNames, workdir='../workdir'):
    """Load out-of-fold and test-set predictions for each base model.

    For every name in ``modelNames`` this reads
    ``<workdir>/<name>/kfold_model.pkl`` (an object exposing ``oof_preds_``)
    and the ``TARGET`` column of ``<workdir>/<name>/preds.csv``.

    Parameters
    ----------
    modelNames : iterable of str
        Sub-directory names under ``workdir``, one per base model.
    workdir : str, default '../workdir'
        Directory that holds the per-model sub-directories.

    Returns
    -------
    oof : dict
        ``'<name>_oof'`` -> the model's ``oof_preds_``.
    preds : dict
        ``'<name>_preds'`` -> 1-D array of test-set predictions.
    """
    oof = {}
    preds = {}
    base = workdir.rstrip('/')
    for name in tqdm_notebook(modelNames):
        pathName = base + '/' + name
        # SECURITY: pickle.load runs arbitrary code from the file; only point
        # this at model directories you produced yourself.
        with open(pathName + '/kfold_model.pkl', 'rb') as f:
            Clf = pickle.load(f)
        oof_arr = Clf.oof_preds_
        pred_arr = pd.read_csv(pathName + '/preds.csv', usecols=['TARGET']).values.ravel()

        oof[name + '_oof'] = oof_arr
        preds[name + '_preds'] = pred_arr

    return oof, preds

# One column per base model: OOF predictions for train rows, plain
# predictions for test rows.
oof, preds = get_data(baseModelNames)
df, predDf = pd.DataFrame(oof), pd.DataFrame(preds)

A Jupyter Widget



In [32]:
# Attach customer ids and labels. The join is on the row index, so this
# presumably relies on train_df.csv being in the same row order as the OOF
# arrays — TODO confirm.
train_labels = pd.read_csv('../input/train_df.csv', usecols=['SK_ID_CURR', 'TARGET'])
df = df.join(train_labels)
oof_cols = [name for name in df.columns if '_oof' in name]
# df[oof_cols] = df[oof_cols].apply(lambda x: x.rank() / len(x))
# Row-wise spread of the base-model predictions.
df['std'] = df[oof_cols].std(axis=1)

In [33]:
# Attach the submission ids to the test-set predictions (index-aligned join).
submission_ids = pd.read_csv('../input/sample_submission.csv', usecols=['SK_ID_CURR'])
predDf = predDf.join(submission_ids)
pred_cols = [name for name in predDf.columns if '_preds' in name]
# Row-wise spread of the base-model predictions.
predDf['std'] = predDf[pred_cols].std(axis=1)

## OOF AUC

In [34]:
# ROC AUC of each base model's out-of-fold predictions against the labels.
y_true = df['TARGET']
for col in oof_cols:
    auc = roc_auc_score(y_true, df[col])
    print('%-30s    %.6f' % (col, auc))

withAggFeatsIn_oof                0.798296
CV10-0821_oof                     0.799078
CV10-0827_oof                     0.799363
1150FEATS_LGB_oof                 0.798455
1400FEATS_LGB_oof                 0.798209
6CV_Final_FEATURE_SET_oof         0.798551
8CV_NO_POS_CC_oof                 0.798237
8CV_NO_FINAL_oof                  0.799079


## Correlation

In [35]:
# Summary statistics of each base model's OOF predictions.
df[oof_cols].describe()

Unnamed: 0,withAggFeatsIn_oof,CV10-0821_oof,CV10-0827_oof,1150FEATS_LGB_oof,1400FEATS_LGB_oof,6CV_Final_FEATURE_SET_oof,8CV_NO_POS_CC_oof,8CV_NO_FINAL_oof
count,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0
mean,0.078945,0.079124,0.078928,0.078845,0.078797,0.078723,0.078611,0.078526
std,0.095678,0.095313,0.095482,0.095061,0.09529,0.095243,0.095268,0.095802
min,0.001279,0.001485,0.001443,0.001717,0.001523,0.001451,0.001516,0.001232
25%,0.021235,0.02151,0.021448,0.021517,0.021471,0.021356,0.021121,0.020928
50%,0.043577,0.043933,0.043679,0.043708,0.043566,0.043629,0.043493,0.04314
75%,0.095571,0.095955,0.095516,0.095422,0.095233,0.095335,0.095323,0.094901
max,0.879603,0.900749,0.900269,0.883544,0.889802,0.896685,0.901637,0.892403


In [60]:
# Persist the per-row sample weights as a pickled array.
# NOTE(review): df['weights'] is only assigned in a later cell (the one that
# sets weights to 1 and then 5 for the selected ids), so on a top-to-bottom
# run this cell raises KeyError — it should be moved below that cell.
with open('../config/sample_weights_lst.pkl', 'wb') as f:
    pickle.dump(df['weights'].values, f)

In [None]:
# Read only the first 20000 rows of the training file as a small debug sample.
train_df_debug = pd.read_csv('../input/train_df.csv', nrows=20000)

In [None]:
# Save the debug sample next to the full training file (row index not written).
train_df_debug.to_csv('../input/train_df_debug.csv', index=False)

In [50]:
# 8th percentile of the CV10-0827 OOF scores.
df['CV10-0827_oof'].quantile(.08)

0.010902984270474817

In [51]:
# Number of rows whose CV10-0827 OOF score is below 0.012.
int((df['CV10-0827_oof'] < 0.012).sum())

30097

In [52]:
# Rows scored as very low risk, and the ids among them with TARGET == 1.
low_risk = df.loc[df['CV10-0827_oof'] < 0.012]
low_risk_bad_man = low_risk.loc[low_risk['TARGET'] == 1, 'SK_ID_CURR']

In [53]:
# Rows scored as high risk, and the ids among them with TARGET == 0.
high_risk = df.loc[df['CV10-0827_oof'] > 0.25]
high_risk_good_man = high_risk.loc[high_risk['TARGET'] == 0, 'SK_ID_CURR']

In [55]:
# Index rows by customer id so id Series can be used as .loc row labels.
# Guarded so that re-running the cell is a no-op instead of a KeyError on the
# already-consumed 'SK_ID_CURR' column.
if df.index.name != 'SK_ID_CURR':
    df = df.set_index('SK_ID_CURR')

In [57]:
# Every row starts at weight 1; the two groups of ids selected above
# (low-score rows with TARGET == 1, high-score rows with TARGET == 0) get 5.
df['weights'] = 1
for hard_ids in (low_risk_bad_man, high_risk_good_man):
    df.loc[hard_ids, 'weights'] = 5

In [None]:
# The original cell was an unfinished `df[]`, which is a SyntaxError and
# stops "Run All"; show a preview of the frame instead.
df.head()

In [None]:
# Distribution of the per-row std across base-model OOF predictions.
df['std'].describe()

In [36]:
# Pairwise correlation between the base models' OOF predictions.
df[oof_cols].corr()

Unnamed: 0,withAggFeatsIn_oof,CV10-0821_oof,CV10-0827_oof,1150FEATS_LGB_oof,1400FEATS_LGB_oof,6CV_Final_FEATURE_SET_oof,8CV_NO_POS_CC_oof,8CV_NO_FINAL_oof
withAggFeatsIn_oof,1.0,0.979816,0.977416,0.973129,0.974331,0.973121,0.970711,0.97527
CV10-0821_oof,0.979816,1.0,0.981068,0.976567,0.978107,0.976504,0.97407,0.979624
CV10-0827_oof,0.977416,0.981068,1.0,0.989251,0.990371,0.988094,0.985924,0.988883
1150FEATS_LGB_oof,0.973129,0.976567,0.989251,1.0,0.992643,0.985742,0.982469,0.985055
1400FEATS_LGB_oof,0.974331,0.978107,0.990371,0.992643,1.0,0.986931,0.983792,0.986813
6CV_Final_FEATURE_SET_oof,0.973121,0.976504,0.988094,0.985742,0.986931,1.0,0.982601,0.985281
8CV_NO_POS_CC_oof,0.970711,0.97407,0.985924,0.982469,0.983792,0.982601,1.0,0.986432
8CV_NO_FINAL_oof,0.97527,0.979624,0.988883,0.985055,0.986813,0.985281,0.986432,1.0


## Stacking

In [None]:
from lightgbm import LGBMClassifier

In [37]:
def train(X, y, model, cv=5, random_state=2018):
    """Fit one copy of ``model`` per stratified fold and report the OOF AUC.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected with ``.iloc``.
    y : pandas.Series
        Binary target aligned with ``X``; rows are selected with ``.iloc``.
    model : estimator
        Unfitted template exposing ``fit`` and ``predict_proba``. It is
        deep-copied once per fold, so the object passed in is never fitted.
    cv : int, default 5
        Number of stratified folds.
    random_state : int, default 2018
        Seed for the shuffled fold split.

    Returns
    -------
    list
        The ``cv`` fitted fold models, in fold order.
    """
    from copy import deepcopy

    models = [deepcopy(model) for _ in range(cv)]
    split = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    oof_preds = np.zeros((len(X),))
    for i, (train_idx, dev_idx) in enumerate(split.split(X, y)):
        # Use a distinct name so the `model` argument is not shadowed.
        fold_model = models[i]
        print('start train fold %s' % str(i + 1))
        # Plain fit: no eval_set / early stopping is passed to the estimator.
        fold_model.fit(X.iloc[train_idx], y.iloc[train_idx])
        # Each row is scored only by the model that did not train on it.
        oof_preds[dev_idx] = fold_model.predict_proba(X.iloc[dev_idx])[:, 1]
        print('=' * 60)
    print("FULL AUC: ", roc_auc_score(y, oof_preds))
    return models

# Second-level model: logistic regression on the base models' OOF columns.
# lgbClf = LGBMClassifier(n_estimators=250, learning_rate=0.01, reg_alpha=1)
models = train(
    df[oof_cols],
    df['TARGET'],
    LogisticRegression(),
)

start train fold 1
start train fold 2
start train fold 3
start train fold 4
start train fold 5
FULL AUC:  0.8004624790453958


In [21]:
# Print the fitted coefficients of each fold's stacking model.
# NOTE(review): the saved output under this cell carries an older execution
# count and shows 18 coefficients per model, while oof_cols currently has 8
# entries — re-run the cell to refresh it.
for model in models:
    print(model.coef_)

[[-3.92414956 -0.16775436  0.93380102 -0.00735691  5.42258363  1.84648586
   0.49897467 -0.5465593  -0.12895278 -0.74268395 -0.48852687  0.07765628
  -0.11034734 -0.45792602  0.43418118  1.09403378  0.55797115  0.83444658]]
[[-3.93679824e+00  1.83209430e-01  6.70605011e-01  1.25637060e-03
   5.50935026e+00  1.73033062e+00  4.67749463e-01 -2.48383650e-01
  -4.06219900e-01 -8.94192865e-01 -2.51316522e-01 -3.28916755e-02
   9.04904325e-02 -4.93579357e-01  4.21613421e-01  7.59769703e-01
   2.95064102e-01  1.18049185e+00]]
[[-3.85101614 -0.16561281  0.52939657 -0.10080996  5.44260332  1.94080549
   0.73082419 -0.50067944 -0.22356099 -0.71676617  0.09602303 -0.20948457
  -0.01402301 -0.55037956  0.15356099  0.77570037  0.28661289  1.52985716]]
[[-3.86786811 -0.03733224  0.7504814   0.04263365  5.41861816  2.09467883
   0.72880684 -1.04453909 -0.11666455 -0.41562627 -0.58555416  0.07926958
   0.01411795 -0.82047432  0.39106391  0.9903012   0.45885007  0.99040887]]
[[-4.26953319 -0.39994696  0

In [38]:
def pred(X, models):
    """Average the positive-class probability over an ensemble of fitted models.

    Parameters
    ----------
    X : array-like with ``len()``
        Rows to score; passed unchanged to every model's ``predict_proba``.
    models : iterable
        Fitted estimators whose ``predict_proba(X)`` returns a 2-D array with
        the positive class in column 1.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(X)`` with the mean probability.

    Raises
    ------
    ValueError
        If ``models`` is empty (the mean would otherwise be a silent 0/0 NaN).
    """
    # Materialise once so generators work and the count is known.
    models = list(models)
    if not models:
        raise ValueError('pred() needs at least one fitted model')
    y_hat = np.zeros((len(X),))
    for model in models:
        y_hat += model.predict_proba(X)[:, 1]
    y_hat /= len(models)
    return y_hat

# The stacker was fitted on the '<name>_oof' columns. Present the test
# predictions under those same names, aligned by model name, because
# scikit-learn versions that validate feature names reject a frame whose
# column names differ from the ones seen at fit time.
stack_test_X = predDf[pred_cols].rename(
    columns=lambda c: c[:-len('_preds')] + '_oof'
)[oof_cols]
stackingPreds = pred(stack_test_X, models)

## Weighted average

In [None]:
def avg_score(weights, df):
    """Blend OOF columns of ``df`` as a weighted sum.

    ``weights`` maps a model name to its coefficient; each name is looked up
    as the column ``'<name>_oof'``. Coefficients are applied as given (no
    normalisation). With an empty mapping the result is the integer 0.
    """
    weighted_cols = (df[name + '_oof'] * coef for name, coef in weights.items())
    return sum(weighted_cols, 0)
# Blend coefficient per base model. They are not normalised; a positive
# rescaling of the blend does not change its AUC.
weights = {
    'LB0.802': 0, 
    'allParamsTuned': 0,
    'withAggFeatsIn': 0, 
    '0824_new_agg_Final':0, 
    'CV10-0821': 0.5,
    'CV10-0827': 0.4,
    '1150FEATS_LGB':0,
    'aws_stacking_base_lgb_1': 0.4,
    'aws_stacking_base_lgb_7': 0., 
    'aws_stacking_base_lgb_9': 0.,
    'aws_stacking_base_lgb_10': 0.5,
}
# Only models that were actually loaded have a '<name>_oof' column in df.
# Blending with the full dict raises KeyError for the others, so restrict the
# blend to the loaded ones and say which were left out.
loaded_weights = {name: w for name, w in weights.items() if name + '_oof' in df.columns}
skipped = [name for name in weights if name not in loaded_weights]
if skipped:
    print('no OOF column loaded, skipped: %s' % ', '.join(skipped))
avg_score_ = avg_score(loaded_weights, df)
roc_auc_score(df['TARGET'], avg_score_)

In [39]:
# Use the stacked predictions as the submission's TARGET column.
predDf['TARGET'] = stackingPreds

In [40]:
# Write the submission file (id and stacked prediction) to the current
# directory, without the row index.
predDf[['SK_ID_CURR', 'TARGET']].to_csv('logi_stacking_with_feature_selectedV@.csv', index=False)