In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from scipy.stats import percentileofscore
import sys
from tqdm import tqdm_notebook
sys.path.append('../')

%matplotlib inline

In [11]:
# Base models whose predictions feed the ensemble. get_data() treats each name
# as a sub-directory of ../workdir/ that holds kfold_model.pkl and preds.csv.
# The list order determines the column order of the OOF / prediction frames,
# and the positional weight lists passed to avg_score() rely on that order.
# Entries that are commented out are excluded from the ensemble.
baseModelNames = [
    'LB0.802',
    'allParamsTuned',
    'withAggFeatsIn',
#     'PrunedFeatureModel',
    '0824_new_agg_Final',
    'WithPosScaleWeight_eq_5',
    'CV10-0821',
    'CV10-0827',
    '1150FEATS_LGB',
#     'XGB-5-CV',
]

In [12]:
def get_data(modelNames):
    """Collect the stored predictions of every model in ``modelNames``.

    For each name the directory ``../workdir/<name>`` is read:

    * ``kfold_model.pkl`` is unpickled and its ``oof_preds_`` attribute taken;
    * the ``TARGET`` column of ``preds.csv`` is loaded as a flat array.

    Returns
    -------
    (dict, dict)
        ``oof`` keyed by ``'<name>_oof'`` and ``preds`` keyed by
        ``'<name>_preds'``, both in the iteration order of ``modelNames``.
    """
    oof, preds = {}, {}
    for model_name in tqdm_notebook(modelNames):
        model_dir = '../workdir/' + model_name

        # NOTE(review): unpickling runs arbitrary code from the file; only
        # load artefacts that were produced by this project.
        with open(model_dir + '/kfold_model.pkl', 'rb') as handle:
            fitted = pickle.load(handle)
        oof_values = fitted.oof_preds_

        submission = pd.read_csv(model_dir + '/preds.csv', usecols=['TARGET'])
        pred_values = submission.values.ravel()

        oof[model_name + '_oof'] = oof_values
        preds[model_name + '_preds'] = pred_values

    return oof, preds

# Load the out-of-fold predictions and the preds.csv predictions of every base model.
oof, preds = get_data(baseModelNames)
# One column per model: '<name>_oof' columns in df, '<name>_preds' columns in predDf.
df = pd.DataFrame(oof)
predDf = pd.DataFrame(preds)

A Jupyter Widget






In [13]:
# Rebuild the frame from `oof` instead of joining onto the previous `df`:
# `df = df.join(...)` made this cell fail on a second run, because SK_ID_CURR
# and TARGET were already present and DataFrame.join refuses overlapping columns.
# NOTE(review): the join is on the default positional index, so it assumes each
# model's oof_preds_ is in the same row order as train_df.csv — confirm.
df = pd.DataFrame(oof).join(pd.read_csv('../input/train_df.csv', usecols=['SK_ID_CURR', 'TARGET']))
oof_cols = [col for col in df.columns if '_oof' in col]
# Optional rank-normalisation of the OOF columns (disabled):
# df[oof_cols] = df[oof_cols].apply(lambda x: x.rank() / len(x))
# Row-wise standard deviation across the base models' OOF predictions.
df['std'] = df[oof_cols].std(axis=1)

In [14]:
# Rebuild the frame from `preds` instead of joining onto the previous `predDf`:
# `predDf = predDf.join(...)` made this cell fail on a second run, because
# SK_ID_CURR was already present and DataFrame.join refuses overlapping columns.
# NOTE(review): the join is on the default positional index, so it assumes every
# preds.csv is in the same row order as sample_submission.csv — confirm.
predDf = pd.DataFrame(preds).join(pd.read_csv('../input/sample_submission.csv', usecols=['SK_ID_CURR']))
pred_cols = [col for col in predDf.columns if '_preds' in col]
# Row-wise standard deviation across the base models' predictions.
predDf['std'] = predDf[pred_cols].std(axis=1)

<font size=5> OOF AUC </font>

In [15]:
# ROC AUC of each base model's out-of-fold predictions against the training TARGET.
for col in oof_cols:
    auc = roc_auc_score(df['TARGET'], df[col])
    # Left-align the column name in 30 characters, AUC to 6 decimals.
    print('%-30s    %.6f' % (col, auc))

LB0.802_oof                       0.798360
allParamsTuned_oof                0.797814
withAggFeatsIn_oof                0.798296
0824_new_agg_Final_oof            0.798611
WithPosScaleWeight_eq_5_oof       0.797380
CV10-0821_oof                     0.799078
CV10-0827_oof                     0.799363
1150FEATS_LGB_oof                 0.798455


<font size=5> Correlation </font>

In [16]:
# Summary statistics per base model. In the stored output the
# WithPosScaleWeight_eq_5 column has a mean of about 0.22 where the others have
# about 0.08, so its raw probabilities are on a different scale.
df[oof_cols].describe()

Unnamed: 0,LB0.802_oof,allParamsTuned_oof,withAggFeatsIn_oof,0824_new_agg_Final_oof,WithPosScaleWeight_eq_5_oof,CV10-0821_oof,CV10-0827_oof,1150FEATS_LGB_oof
count,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0,307507.0
mean,0.0789,0.079093,0.078945,0.078994,0.223849,0.079124,0.078928,0.078845
std,0.095012,0.094832,0.095678,0.09551,0.182805,0.095313,0.095482,0.095061
min,0.001466,0.001575,0.001279,0.001302,0.004251,0.001485,0.001443,0.001717
25%,0.021588,0.021691,0.021235,0.021344,0.083227,0.02151,0.021448,0.021517
50%,0.04387,0.044081,0.043577,0.043726,0.163316,0.043933,0.043679,0.043708
75%,0.095396,0.09597,0.095571,0.095522,0.316467,0.095955,0.095516,0.095422
max,0.890341,0.899455,0.879603,0.883344,0.955054,0.900749,0.900269,0.883544


In [17]:
# Pairwise correlation (DataFrame.corr default) between the base models' OOF predictions.
df[oof_cols].corr()

Unnamed: 0,LB0.802_oof,allParamsTuned_oof,withAggFeatsIn_oof,0824_new_agg_Final_oof,WithPosScaleWeight_eq_5_oof,CV10-0821_oof,CV10-0827_oof,1150FEATS_LGB_oof
LB0.802_oof,1.0,0.976576,0.984745,0.987605,0.939105,0.978451,0.986135,0.981768
allParamsTuned_oof,0.976576,1.0,0.97753,0.979595,0.927106,0.991211,0.978974,0.974787
withAggFeatsIn_oof,0.984745,0.97753,1.0,0.988,0.931213,0.979816,0.977416,0.973129
0824_new_agg_Final_oof,0.987605,0.979595,0.988,1.0,0.933923,0.981866,0.980044,0.975957
WithPosScaleWeight_eq_5_oof,0.939105,0.927106,0.931213,0.933923,1.0,0.927966,0.932281,0.930294
CV10-0821_oof,0.978451,0.991211,0.979816,0.981866,0.927966,1.0,0.981068,0.976567
CV10-0827_oof,0.986135,0.978974,0.977416,0.980044,0.932281,0.981068,1.0,0.989251
1150FEATS_LGB_oof,0.981768,0.974787,0.973129,0.975957,0.930294,0.976567,0.989251,1.0


<font size=5> Stacking </font>

In [18]:
def train(X, y, model, cv=5, random_state=42):
    """Fit one copy of ``model`` per stratified fold and print the OOF AUC.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Binary target aligned with ``X``; rows are selected with ``.iloc``.
    model : estimator
        Unfitted template exposing ``fit`` and ``predict_proba``. It is
        deep-copied once per fold; the template itself is never fitted.
    cv : int, default 5
        Number of stratified folds.
    random_state : int or None, default 42
        Seed for the shuffled fold assignment, so that the folds and the
        printed AUC are reproducible between runs. Pass ``None`` to get the
        previous unseeded behaviour.

    Returns
    -------
    list
        The ``cv`` fitted fold models, in fold order.
    """
    # Local import: `copy` is not among the notebook's top-level imports.
    from copy import deepcopy

    models = [deepcopy(model) for _ in range(cv)]
    split = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    oof_preds = np.zeros((len(X),))
    for i, (train_idx, dev_idx) in enumerate(split.split(X, y)):
        X_train, X_dev = X.iloc[train_idx], X.iloc[dev_idx]
        y_train = y.iloc[train_idx]

        # Use a separate name so the `model` argument is not rebound mid-loop.
        fold_model = models[i]
        print('start train fold %s' % str(i + 1))
        fold_model.fit(X_train, y_train)
        # Each row is scored only by the model that did not see it in training.
        oof_preds[dev_idx] = fold_model.predict_proba(X_dev)[:, 1]
        print('=' * 60)
    print("FULL AUC: ", roc_auc_score(y, oof_preds))
    return models

# Alternative second-level model that was tried (disabled):
# lgbClf = LGBMClassifier(n_estimators=250, learning_rate=0.01, reg_alpha=1)
# Second-level model: a default LogisticRegression fitted on the base models'
# raw OOF probabilities, using train()'s default number of folds.
models = train(df[oof_cols] , df['TARGET'], LogisticRegression())

start train fold 1
start train fold 2
start train fold 3
start train fold 4
start train fold 5
FULL AUC:  0.7979117247480441


In [19]:
# Print each fold model's coefficients; there is one coefficient per column of
# the df[oof_cols] matrix the models were fitted on.
for model in models:
    print(model.coef_)

[[-3.76846701 -0.021525    0.75606529 -0.10063246  5.3481299   1.84468434
   1.34679284 -0.21716767]]
[[-3.79135293 -0.31336149  0.95176052 -0.02499953  5.43792562  2.03355426
   1.24434643 -0.47765438]]
[[-4.16690477 -0.29776536  0.83115713  0.19917924  5.51607734  1.90354591
   1.25675307 -0.16482504]]
[[-3.97561532 -0.32234854  1.07575083 -0.08158333  5.41632182  1.90479184
   1.32119684 -0.23664279]]
[[-3.87296646 -0.22706361  0.5788576   0.63297307  5.41183406  1.99295277
   1.03754315 -0.41417601]]


In [None]:
def pred(X, models):
    """Average the positive-class probabilities of several fitted models.

    Parameters
    ----------
    X : array-like
        Samples to score; passed unchanged to every model's ``predict_proba``.
    models : sequence
        Fitted models whose ``predict_proba(X)`` returns a 2-D array with the
        positive class in column 1.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(X)`` holding the unweighted mean probability.

    Raises
    ------
    ValueError
        If ``models`` is empty. Previously this divided by zero and silently
        returned an array of NaNs.
    """
    n = len(models)
    if n == 0:
        raise ValueError('pred() needs at least one fitted model')

    y_hat = np.zeros((len(X), ))
    for model in models:
        y_hat += model.predict_proba(X)[:, 1]
    y_hat /= n
    return y_hat

# Score the base models' prediction columns with every fold stacker and average.
# NOTE(review): stackingPreds is not referenced again in the visible cells; the
# CSV written at the end of the notebook uses the weighted average instead.
stackingPreds = pred(predDf[pred_cols], models)

<font size=5> Weighted average </font>

In [37]:
def avg_score(weights, df, cols):
    """Return the weighted sum of ``df[col]`` over ``cols``.

    ``weights[i]`` multiplies ``df[cols[i]]``. The weights are applied as
    given (they are not normalised), and any weights beyond ``len(cols)`` are
    ignored. With no columns the result is the integer ``0``.
    """
    return sum(weights[pos] * df[name] for pos, name in enumerate(cols))
# Weights are positional over oof_cols. With the current baseModelNames order,
# positions 5 and 6 are CV10-0821 and CV10-0827, so this is the plain mean of
# those two models' OOF predictions; every other model gets weight 0.
avg_score_ = avg_score([0, 0, 0, 0, 0, 0.5, 0.5, 0.], df, oof_cols)
# OOF ROC AUC of the blend (displayed as the cell output).
roc_auc_score(df['TARGET'], avg_score_)

0.8002069367026086

In [39]:
# Apply the same positional weights as the OOF blend to the prediction columns.
# The list must be kept in sync with the OOF weights and with the order of pred_cols.
predDf['TARGET'] = avg_score([0, 0, 0, 0, 0, 0.5, 0.5, 0.], predDf, pred_cols)

In [41]:
# Write the blended predictions as a two-column CSV (SK_ID_CURR, TARGET) without
# the index; the relative path resolves against the current working directory.
predDf[['SK_ID_CURR', 'TARGET']].to_csv('weighted_avg2.csv', index=False)