In [1]:
import numpy as np
import pickle
import pandas as pd
from keras.layers import Dense, Activation, BatchNormalization, Dropout, Lambda, Input
from keras.models import Model, load_model, Sequential
from keras.optimizers import adam
from sklearn.naive_bayes import GaussianNB
from matplotlib import pyplot as plt
from lightgbm.basic import Booster
from sklearn.metrics import roc_auc_score, roc_curve
from keras.callbacks import ModelCheckpoint
from NaiveBayesPDF import NaiveBayesPDF

Using TensorFlow backend.


# Load train and validation data

In [2]:
# Read the pre-split training and validation sets.
df_train = pd.read_csv('data/train_splitted.csv')
df_valid = pd.read_csv('data/valid_splitted.csv')

# Labels come from the 'target' column of each split.
y_train, y_valid = df_train['target'], df_valid['target']

# Features are every remaining column once the identifier and the label are dropped.
X_train, X_valid = (
    frame.drop(columns=['ID_code', 'target']) for frame in (df_train, df_valid)
)

In [3]:
# Two-column label encoding for the softmax ensemble:
# column 0 is the target itself, column 1 is its complement (1 - target).
y_train_cat = np.column_stack((y_train, 1 - y_train))
y_valid_cat = np.column_stack((y_valid, 1 - y_valid))

In [4]:
# Sanity check: shape of the raw target vector next to its two-column encoding.
y_train.shape, y_train_cat.shape

((160000,), (160000, 2))

# Loading models

In [5]:
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('trained_models/GNB_train_only.pk', 'rb') as model_file:
    model_NB = pickle.load(model_file)

In [6]:
# Load the saved Keras MLP checkpoints; only checkpoints 0 and 1 are active.
model_MLP_Keras_0 = load_model('trained_models/MLP_keras_balanced_0.h5')
model_MLP_Keras_1 = load_model('trained_models/MLP_keras_balanced_1.h5')
# Checkpoints 2-8 are disabled; uncomment a line here (and the matching entry
# in the `models` list) to include that network.
# model_MLP_Keras_2 = load_model('trained_models/MLP_keras_balanced_2.h5')
# model_MLP_Keras_3 = load_model('trained_models/MLP_keras_balanced_3.h5')
# model_MLP_Keras_4 = load_model('trained_models/MLP_keras_balanced_4.h5')
# model_MLP_Keras_5 = load_model('trained_models/MLP_keras_balanced_5.h5')
# model_MLP_Keras_6 = load_model('trained_models/MLP_keras_balanced_6.h5')
# model_MLP_Keras_7 = load_model('trained_models/MLP_keras_balanced_7.h5')
# model_MLP_Keras_8 = load_model('trained_models/MLP_keras_balanced_8.h5')

In [7]:
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('trained_models/lgmb_model_train_only.pkl', 'rb') as model_file:
    model_LGBM = pickle.load(model_file)

In [8]:
# NOTE: unpickling can execute arbitrary code; only load model files you trust.
# The context manager closes the file handle as soon as the model is loaded.
with open('trained_models/naive_bayes_pdf_train_only.pk', 'rb') as model_file:
    model_NBpdf = pickle.load(model_file)

In [9]:
def predict(models, data, y=None):
    """Score `data` with every model and stack the results column-wise.

    Parameters
    ----------
    models : iterable
        Fitted models. Supported types are ``GaussianNB``, Keras
        ``Sequential``, LightGBM ``Booster`` and ``NaiveBayesPDF``.
    data : feature matrix
        Passed unchanged to every model except ``NaiveBayesPDF``, which
        receives ``data.values`` (so `data` must expose ``.values`` when such
        a model is present).
    y : array-like, optional
        True labels. When given, AUC and accuracy at a 0.5 threshold are
        computed for each model.

    Returns
    -------
    predictions : numpy.ndarray
        Transposed stack of the per-model score vectors (one column per model).
    aucs : list
        ``roc_auc_score(y, scores)`` per model; empty when `y` is None.
    accs : list
        Fraction of samples where ``scores > 0.5`` equals `y`, per model;
        empty when `y` is None.

    Raises
    ------
    TypeError
        If a model is not one of the supported types. Previously such a model
        either crashed with an unbound-variable error or silently reused the
        scores of the preceding model.
    """
    predictions = []
    aucs = []
    accs = []
    for model in models:
        if isinstance(model, GaussianNB):
            print('Predicting Naive Bayes...')
            # Second column of predict_proba.
            model_pred = model.predict_proba(data)[:,1]
        elif isinstance(model, Sequential):
            print('Predicting Keras MLP...')
            # First output unit of the network.
            # NOTE(review): presumably the positive class - confirm against
            # how the MLPs were trained.
            model_pred = model.predict(data, verbose=1)[:,0]
        elif isinstance(model, Booster):
            print('Predicting LGBM...')
            model_pred = model.predict(data)
        elif isinstance(model, NaiveBayesPDF):
            print('Predicting NBpdf...')
            # NaiveBayesPDF.predict returns a 7-tuple; only the fifth item is used.
            _, _, _, _, model_pred, _, _ = model.predict(data.values)
        else:
            raise TypeError(
                'Unsupported model type: {}'.format(type(model).__name__)
            )
        if y is not None:
            aucs.append(roc_auc_score(y, model_pred))
            accs.append(((model_pred>0.5)==y).sum()/len(y))
        predictions.append(model_pred)
    return np.array(predictions).T, aucs, accs

In [11]:
# Base models fed to the ensemble; the order here fixes the column order of
# the prediction matrices returned by `predict`.
models = [
    model_NB, model_NBpdf, model_LGBM,
    model_MLP_Keras_0,
    model_MLP_Keras_1,
#     model_MLP_Keras_2,
#     model_MLP_Keras_3,
#     model_MLP_Keras_4,
#     model_MLP_Keras_5,
#     model_MLP_Keras_6,
#     model_MLP_Keras_7,
#     model_MLP_Keras_8,
]

In [12]:
# Score the training split with every base model; also report per-model AUC
# and accuracy at a 0.5 threshold.
predictions_train, aucs_train, accs_train = predict(models, X_train, y_train)
print(aucs_train)
print(accs_train)

Predicting Naive Bayes...
Predicting NBpdf...
Predicting LGBM...
Predicting Keras MLP...
Predicting Keras MLP...
[0.8893228519335807, 0.9049951884982382, 0.9263432742809904, 0.875284793939197, 0.8756829014848423]
[0.92193125, 0.925125, 0.934875, 0.7746375, 0.78340625]


In [None]:
# Same scoring on the validation split.
predictions_valid, aucs_val, accs_val = predict(models, X_valid, y_valid)
print(aucs_val)
print(accs_val)

In [None]:
# Sanity check: shapes of the two prediction matrices.
predictions_train.shape, predictions_valid.shape

In [None]:
def plot_hists(predictions, N = 50, labels=None):
    """Plot the log-odds histogram of every prediction column as a line.

    Parameters
    ----------
    predictions : numpy.ndarray
        2-D array indexed as ``predictions[:, i]``; each column holds
        probabilities for one model.
    N : int
        Number of histogram bins.
    labels : sequence of str, optional
        Legend label per column. Defaults to ``str(type(m))`` for each entry
        of the notebook-level `models` list. Columns without a label fall
        back to ``'column <i>'``.

    Notes
    -----
    * Probabilities are clipped away from exactly 0 and 1 so the log-odds stay
      finite (``np.histogram`` rejects infinite values).
    * Colours cycle through the palette, so any number of columns can be
      drawn; the first four colours match the original palette.
    * Counts are divided by the bin width and drawn at the bin centres.
    """
    eps = 1e-12
    palette = ['r', 'b', 'y', 'k', 'g', 'c', 'm']
    if labels is None:
        labels = [str(type(m)) for m in models]
    plt.figure(figsize=(20,6))
    for i in range(predictions.shape[1]):
        probs = np.clip(np.asarray(predictions[:, i], dtype=float), eps, 1 - eps)
        counts, edges = np.histogram(np.log(probs / (1 - probs)), N)
        # Bin centres: one x value per count.
        centers = (edges[:-1] + edges[1:]) / 2
        label = labels[i] if i < len(labels) else 'column {}'.format(i)
        plt.plot(centers, counts / (edges[1] - edges[0]),
                 label=label, c=palette[i % len(palette)])
    plt.legend()
    plt.show()

In [None]:
# Log-odds distribution of each base model on the training split.
plot_hists(predictions_train)

In [None]:
# Log-odds distribution of each base model on the validation split.
plot_hists(predictions_valid)

# Ensemble

In [None]:
# Shape of the ensemble's training input.
predictions_train.shape

In [None]:
# Small stacking network: normalise the base-model scores, pass them through
# one hidden ReLU layer, then a two-unit softmax.
# (A Dropout(0.5) layer between the two Dense layers is currently disabled.)
hidden_units = 2
model = Sequential([
    BatchNormalization(input_shape=(predictions_valid.shape[1],)),
    Dense(hidden_units, activation='relu'),
    Dense(2, activation='softmax'),
])
model.summary()

In [None]:
# Training hyper-parameters for the ensemble network.
batch_size = 256
lr = 1e-4
optimizer = adam(lr=lr)
# The two-unit softmax output is trained against the two-column labels.
model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# Checkpoint file name encodes learning rate, batch size and hidden-unit count.
saved_best_model = 'ensamble_only_train_best_{}_bs_{}_{}.hdf5'.format(lr, batch_size, hidden_units)
print(saved_best_model)

In [None]:
# Checkpoint callback writing to `saved_best_model`, monitoring validation loss
# with save_best_only enabled.
checkpoint = ModelCheckpoint(saved_best_model, monitor='val_loss', verbose=1, save_best_only=True)

In [None]:
# Train the ensemble on the base-model scores of the training split, validating
# on the validation split after each epoch.
model.fit(predictions_train, y_train_cat, 
          batch_size=batch_size, 
          epochs=100, 
          validation_data=(predictions_valid, y_valid_cat), 
          callbacks = [checkpoint],
          verbose=1)

# Train evaluation

In [None]:
# Save the model's current weights to a separate file from the checkpoint above.
model.save_weights('ensamble_only_train_last.hdf5')

In [None]:
#model.load_weights('ensamble_only_train_last.hdf5')

In [None]:
# Restore the checkpoint written by the ModelCheckpoint callback before evaluating.
model.load_weights(saved_best_model)

In [None]:
# Output column 0 lines up with column 0 of y_train_cat, i.e. the target itself.
ensamble_prediction_train = model.predict(predictions_train, verbose=1)[:,0]

In [None]:
# Sanity check: shape of the ensemble scores next to the two-column labels.
ensamble_prediction_train.shape, y_train_cat.shape

In [None]:
# ROC AUC of the ensemble on the training split.
roc_auc_score(y_train, ensamble_prediction_train)

In [None]:
# Histogram (50 bins) of the ensemble's log-odds on the training split.
# NOTE(review): a score of exactly 0 or 1 would give an infinite log-odds here.
_ = plt.hist(np.log(ensamble_prediction_train/(1-ensamble_prediction_train)), 50)

# Validation Evaluation

In [None]:
# Output column 0 lines up with column 0 of y_valid_cat, i.e. the target itself.
ensamble_prediction_valid = model.predict(predictions_valid, verbose=1)[:,0]

In [None]:
# Histogram (50 bins) of the ensemble's log-odds on the validation split.
# NOTE(review): a score of exactly 0 or 1 would give an infinite log-odds here.
_ = plt.hist(np.log(ensamble_prediction_valid/(1-ensamble_prediction_valid)), 50)

In [None]:
# ROC AUC of the ensemble on the validation split.
roc_auc_score(y_valid, ensamble_prediction_valid)

lr=1e-4 bs=256 roc_auc=0.904051104282362 epochs=50
lr=1e-5 bs=256 roc_auc=0.904051104282362 epochs=300



In [None]:
# ROC curve of the ensemble on the validation split, labelled so the figure
# can be read on its own.
fpr, tpr, thres = roc_curve(y_valid, ensamble_prediction_valid)
plt.plot(fpr, tpr)
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('Ensemble ROC curve (validation)')
plt.show()