In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

from category_encoders.cat_boost import CatBoostEncoder

Using TensorFlow backend.


In [2]:
train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

In [3]:
X_train = train.drop(['id', 'target'], axis=1)
y_train = train['target']
X_test = test.drop('id', axis=1)

In [4]:
# create training data by stratified Kfold to add noises to the training set
en_cols = X_train.columns.tolist()
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
X_train_l = []
X_test_l = []
y_train_l = []
for train_index, test_index in kf.split(X_train, y_train):
    X_dev = X_train.iloc[train_index]
    y_dev = y_train.iloc[train_index]
    X_val = X_train.iloc[test_index]
    y_val = y_train.iloc[test_index]
    cbe_temp = CatBoostEncoder(cols=en_cols).fit(X_dev, y_dev)
    X_train_l.append(cbe_temp.transform(X_val))
    X_test_l.append(cbe_temp.transform(X_test))
    y_train_l.append(y_val)

In [5]:
# finalise the training data
X_train = pd.concat(X_train_l)
y_train = pd.concat(y_train_l)
X_test = sum(X_test_l) / 10

In [9]:
# logistic regression
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# xgboost 
xgb = XGBClassifier(max_depth=10, 
                    n_estimators=1000, 
                    learning_rate=0.01, 
                    n_jobs=7, 
                    random_state=42, 
                    scale_pos_weight=2.268)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=None, n_estimators=1000,
              n_jobs=7, nthread=None, objective='binary:logistic',
              random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=2.268, seed=None, silent=True, subsample=1)

In [11]:
# Adaboost
ada = AdaBoostClassifier(n_estimators=5000, learning_rate=0.01, random_state=42)
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.01,
                   n_estimators=5000, random_state=42)

In [15]:
# neural network
es = EarlyStopping(monitor='val_auroc', mode='max', verbose=1, min_delta=0.001, patience=100)
def auroc(y_true, y_pred):
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


nn = Sequential()
nn.add(Dense(23, input_dim=23, activation='relu'))
nn.add(Dense(12, activation='relu'))
nn.add(Dense(6, activation='relu'))
nn.add(Dense(1, activation='sigmoid'))
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auroc])
nn.fit(X_train, y_train, 
       batch_size=256, 
       epochs=1000, 
       verbose=1, 
       callbacks=[es], 
       validation_split=0.2)

Train on 240000 samples, validate on 60000 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000


Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000


Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 00119: early stopping


<keras.callbacks.History at 0x154087c35c0>

In [17]:
nn.fit(X_train, y_train, 
       batch_size=256, 
       epochs=1000, 
       verbose=1, 
       callbacks=[es], 
       validation_split=0.2)

Train on 240000 samples, validate on 60000 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000


Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 00101: early stopping


<keras.callbacks.History at 0x1546fe50dd8>

In [19]:
# get prediction results for all models
lr_out = pd.DataFrame({'id': test['id'], 'target': lr.predict_proba(X_test)[:,1]})
xgb_out = pd.DataFrame({'id': test['id'], 'target': xgb.predict_proba(X_test)[:,1]})
ada_out = pd.DataFrame({'id': test['id'], 'target': ada.predict_proba(X_test)[:,1]})
nn_out = pd.DataFrame({'id': test['id'], 'target':[i[0] for i in nn.predict_proba(X_test)]})

In [20]:
# ensemble results from these 4 models -> 0.80481
ens_out = (lr_out + xgb_out + nn_out) / 3 
ens_out['id'] = ens_out['id'].astype(int)

In [21]:
ens_out.to_csv("../submissions/enm-cat-all-noise-submission.csv", index=False)