In [37]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier

from keras.layers import Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping

from category_encoders.leave_one_out import LeaveOneOutEncoder

In [2]:
# Load both CSV inputs from the sibling data directory.
# `train` supplies the `id` and `target` columns used by the split below;
# `test` supplies the `id` column used when writing submissions.
train, test = pd.read_csv("../data/train.csv"), pd.read_csv("../data/test.csv")

In [3]:
# Hold out 20% of the labelled rows for validation.
# Despite the names, `X_test` / `y_test` are this hold-out slice of `train`,
# not the competition test set that lives in `test`.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['id', 'target']),  # features: everything except id and label
    train['target'],                       # label column
    test_size=0.2,
    random_state=42,                       # fixed seed so the split is repeatable
)

In [5]:
# Fit one leave-one-out target encoder on the whole training split, covering
# every feature column. This encoder is only used for the hold-out split and the
# competition test set; the training rows get their own per-fold encoders later.
en_cols = list(X_train.columns)
looe = LeaveOneOutEncoder(cols=en_cols).fit(X=X_train, y=y_train)

In [6]:
# Encode the hold-out split and the competition test set with the encoder that
# was fitted on the training split. No labels are passed to `transform`.
# NOTE(review): `X_test` is rebound to its encoded form, so running this cell a
# second time would feed already-encoded values back into the encoder.
X_test = looe.transform(X_test)
proc_test = looe.transform(test.drop(columns='id'))

In [7]:
# Build the encoded training set out-of-fold: for each of 5 stratified folds an
# encoder is fitted on the other four folds and applied to the held-out fold, so
# no row is encoded by an encoder that saw that row's own label.
# The encoded pieces (and their labels) are collected fold by fold.
kf = StratifiedKFold(n_splits=5)
X_train_l, y_train_l = [], []
for fit_idx, holdout_idx in kf.split(X_train, y_train):
    fold_encoder = LeaveOneOutEncoder(cols=en_cols).fit(
        X_train.iloc[fit_idx], y_train.iloc[fit_idx]
    )
    # transform() is called without labels on rows the fold encoder never saw
    X_train_l.append(fold_encoder.transform(X_train.iloc[holdout_idx]))
    y_train_l.append(y_train.iloc[holdout_idx])

In [8]:
# Stitch the per-fold pieces back together. Features and labels were appended in
# the same loop, so their rows stay aligned (in fold-by-fold order).
# NOTE(review): `X_train` / `y_train` are rebound to the encoded data here, so the
# fold-building cell above must not be re-run without first redoing the split.
X_train, y_train = pd.concat(X_train_l), pd.concat(y_train_l)

In [9]:
# Baseline model: logistic regression with library-default hyperparameters,
# trained on the encoded training features.
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Hold-out ROC AUC for the logistic regression, scored on column 1 of
# `predict_proba` (the probability of the second class).
lr_val_proba = lr.predict_proba(X_test)[:, 1]
print("ROCAUC score: {}".format(roc_auc_score(y_test, lr_val_proba)))

ROCAUC score: 0.7994153379667676


In [33]:
# Gradient-boosted trees: 1000 rounds of depth-10 trees at a small learning rate.
xgb_params = dict(
    max_depth=10,
    n_estimators=1000,
    learning_rate=0.01,
    n_jobs=7,                # worker threads; machine-specific
    random_state=42,
    scale_pos_weight=2.268,  # presumably the negative/positive class ratio - TODO confirm
)
xgb = XGBClassifier(**xgb_params)
xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
              max_depth=10, min_child_weight=1, missing=None, n_estimators=1000,
              n_jobs=7, nthread=None, objective='binary:logistic',
              random_state=42, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=2.268, seed=None, silent=True, subsample=1)

In [34]:
# Hold-out ROC AUC for the XGBoost model, scored on column 1 of `predict_proba`.
xgb_val_proba = xgb.predict_proba(X_test)[:, 1]
print("ROCAUC score: {}".format(roc_auc_score(y_test, xgb_val_proba)))

ROCAUC score: 0.7940303716319459


In [35]:
# AdaBoost with the library-default base estimator: 5000 boosting rounds at a
# small learning rate, seeded for repeatability.
ada = AdaBoostClassifier(
    n_estimators=5000,
    learning_rate=0.01,
    random_state=42,
)
ada.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=0.01,
                   n_estimators=5000, random_state=42)

In [36]:
# Hold-out ROC AUC for the AdaBoost model, scored on column 1 of `predict_proba`.
ada_val_proba = ada.predict_proba(X_test)[:, 1]
print("ROCAUC score: {}".format(roc_auc_score(y_test, ada_val_proba)))

ROCAUC score: 0.7975746003420995


In [40]:
# neural network
def auroc(y_true, y_pred):
    """Keras metric that delegates to scikit-learn's ``roc_auc_score``.

    Keras calls a metric function on each batch, so the value it reports is an
    aggregate of per-batch AUCs rather than one AUC over the whole epoch.

    Args:
        y_true: tensor of ground-truth labels for the batch.
        y_pred: tensor of model outputs for the batch.

    Returns:
        A ``tf.double`` tensor holding the ROC AUC of the batch.

    NOTE(review): ``tf.py_func`` is the TensorFlow 1.x spelling; confirm it is
    available in the installed version. ``roc_auc_score`` is also expected to
    raise when a batch contains a single class - verify for small batches.
    """
    return tf.py_func(roc_auc_score, (y_true, y_pred), tf.double)


# Stop once the validation metric has not improved by at least 0.001 for 50 epochs.
# The monitored name is the metric function's name with the `val_` prefix.
# NOTE(review): the weights kept are those of the last epoch, not the best one.
es = EarlyStopping(monitor='val_auroc', mode='max', verbose=1, min_delta=0.001, patience=50)

# Take the input width from the data instead of hard-coding the column count,
# so the model keeps working if the feature set changes.
n_features = X_train.shape[1]

nn = Sequential()
nn.add(Dense(23, input_dim=n_features, activation='relu'))
nn.add(Dense(12, activation='relu'))
nn.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: output is a probability
nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=[auroc])
nn.fit(X_train, y_train,
       batch_size=128,
       epochs=1000,           # upper bound; early stopping normally ends training sooner
       verbose=1,
       callbacks=[es],
       validation_split=0.2)  # fraction of the training rows held back for `val_auroc`

Train on 192000 samples, validate on 48000 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000


Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 00080: early stopping


<keras.callbacks.History at 0x1b0a578d208>

In [42]:
# Hold-out ROC AUC for the neural network. The single sigmoid output is already
# a probability, so `predict` is called directly: `Sequential.predict_proba` was a
# thin alias for it and is no longer available in recent Keras releases.
nn_val_proba = nn.predict(X_test).ravel()  # (n, 1) -> (n,)
print("ROCAUC score: {}".format(roc_auc_score(y_test, nn_val_proba)))

ROCAUC score: 0.7990699339145892


In [15]:
# Logistic-regression submission (recorded public score: 0.80288).
# `target` is column 1 of `predict_proba` on the encoded competition test set.
lr_test_proba = lr.predict_proba(proc_test)[:, 1]
lr_out = pd.DataFrame({'id': test['id'], 'target': lr_test_proba})
lr_out.to_csv("../submissions/lr-tar-noise-submission.csv", index=False)

In [49]:
# Score the encoded competition test set with the remaining models, one
# id/target frame per model.
xgb_out = pd.DataFrame({'id': test['id'], 'target': xgb.predict_proba(proc_test)[:, 1]})
ada_out = pd.DataFrame({'id': test['id'], 'target': ada.predict_proba(proc_test)[:, 1]})
# The network's sigmoid output is already a probability; `predict` replaces the
# `Sequential.predict_proba` alias, which recent Keras releases no longer provide.
# Each prediction row holds one value, which is unpacked into a flat list.
nn_out = pd.DataFrame({'id': test['id'], 'target': [row[0] for row in nn.predict(proc_test)]})

In [54]:
# Equal-weight ensemble of the four models (recorded score: 0.80295).
# Only the `target` columns are averaged. The ids are carried over unchanged
# instead of being summed, divided and cast back to int along with the
# predictions, which only works for small numeric ids.
ens_target = (lr_out['target'] + xgb_out['target'] + ada_out['target'] + nn_out['target']) / 4
ens_out = pd.DataFrame({'id': lr_out['id'], 'target': ens_target})
ens_out.to_csv("../submissions/ens-tar-noise-submission.csv", index=False)

In [55]:
# Neural-network-only submission (recorded score: 0.80254).
nn_out.to_csv(path_or_buf="../submissions/nn-tar-noise-submission.csv", index=False)