# Logistic Regression

## Import Libraries

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

from keras.callbacks import ModelCheckpoint
from keras.callbacks import EarlyStopping
from keras.callbacks import TensorBoard
from keras.regularizers import l1_l2
from keras.layers import Input
from keras.layers import Dense
from keras.models import Model
from keras import backend as K

import numpy as np
import pandas as pd
import pickle
import json
import sys
import os

## Load the data

In [None]:
# Read both PCA-transformed splits into plain NumPy arrays in one pass.
# NOTE(review): fillna('') would put empty strings into any column that has
# missing values, which would make the array non-numeric -- confirm that the
# CSV files contain no NaNs.
train, test = (
    pd.read_csv(csv_path).fillna('').values
    for csv_path in ('pca_train_2.csv', 'pca_test_2.csv')
)

## Split into X and Y

In [None]:
# The last column holds the label; every column before it is a feature.
# Slicing with -1: (rather than -1) keeps the labels as an (n, 1) column.
X_train, X_test = train[:, :-1], test[:, :-1]
Y_train, Y_test = train[:, -1:], test[:, -1:]

## Logistic Regression Model

In [None]:
def logistic(C, n_features=8):
    """Build an uncompiled logistic-regression model (one sigmoid unit).

    Args:
        C: Regularization factor. It is passed as both the l1 and the l2
            coefficient of the kernel regularizer, and it is embedded in
            the layer and model names.
        n_features: Number of input features per sample. Defaults to 8,
            the input width this function previously hard-coded.

    Returns:
        A Keras ``Model`` mapping a ``(batch, n_features)`` input to a single
        sigmoid output. The caller is responsible for compiling it.
    """
    tag = 'C_{}'.format(C)
    input_layer = Input(batch_shape=(None, n_features),
                        name='{}_input'.format(tag))
    output_layer = Dense(1, activation='sigmoid',
                         kernel_regularizer=l1_l2(l1=C, l2=C),
                         kernel_initializer='he_uniform',
                         bias_initializer='he_uniform',
                         name='{}_output'.format(tag))(input_layer)
    # The model name used to be the literal, unformatted string 'C_{}';
    # it now carries the actual value of C like the layer names do.
    return Model(inputs=input_layer, outputs=output_layer, name=tag)

## 5-fold Cross Validation

In [None]:
# Regularization factors to compare; each one gets its own 5-fold run.
C = [0.001, 0.01, 0.1, 1, 10, 100]

# ModelCheckpoint writes into 'models'; make sure it exists before training.
os.makedirs('models', exist_ok=True)

for c in C:
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=10)
    # One entry per fold: validation accuracy multiplied by the fold size,
    # so that the final score is a size-weighted average over all folds.
    val_acc = []
    print('Training for C={}\n'.format(c))
    # StratifiedKFold only needs X and y; the labels are not passed a second
    # time as `groups`.
    for cnt, (train_index, valid_index) in enumerate(skf.split(X_train, Y_train), start=1):
        # Drop the previous fold's graph so models do not accumulate.
        K.clear_session()
        print('Fold {}'.format(cnt))
        key = 'logistic_C{}_fold{}'.format(c, cnt)
        x_train, y_train = X_train[train_index], Y_train[train_index]
        x_valid, y_valid = X_train[valid_index], Y_train[valid_index]
        model = logistic(c)
        model.compile('rmsprop', 'binary_crossentropy', ['accuracy'])
        # makedirs also creates the parent 'logs' directory when it is
        # missing and does not fail if the directory is already there.
        log_dir = os.path.join('logs', '{}_log'.format(key))
        os.makedirs(log_dir, exist_ok=True)
        checkpoint = ModelCheckpoint(filepath=os.path.join('models', '{}.h5'.format(key)),
                                     monitor='val_acc', save_best_only=True)
        earlystop = EarlyStopping(monitor='val_acc', patience=5, mode='max', restore_best_weights=True)
        tensorboard = TensorBoard(log_dir=log_dir,
                                  histogram_freq=1, batch_size=64, write_graph=False,
                                  write_grads=False, write_images=True)
        model.fit(x=x_train, y=y_train, batch_size=64, epochs=20,
                  callbacks=[tensorboard, earlystop, checkpoint],
                  validation_data=(x_valid, y_valid), shuffle=True, verbose=2)
        eval_results = model.evaluate(x=x_valid, y=y_valid, batch_size=1123, verbose=0)
        # eval_results is [loss, accuracy] because 'accuracy' is the only metric.
        val_acc.append(eval_results[1]*x_valid.shape[0])
        print()
    # Merge this C's score into scores.json, keeping any earlier entries.
    # The context managers guarantee the file is closed (and flushed).
    d = {}
    if os.path.exists('scores.json'):
        with open('scores.json') as scores_file:
            d = json.load(scores_file)
    d['logistic_C{}'.format(c)] = sum(val_acc)/X_train.shape[0]
    with open('scores.json', 'w') as scores_file:
        json.dump(d, scores_file)
    print('\n')

## Full-Training with C=10

In [None]:
# Retrain on the complete training set with C=10. There is no validation
# split here, so the callbacks monitor training accuracy ('acc').
key = 'logistic_C{}'.format(10)
model = logistic(C=10)
model.compile('rmsprop', 'binary_crossentropy', ['accuracy'])
# makedirs creates missing parent directories and tolerates existing ones,
# so this cell also works when 'logs' or 'models' has not been created yet.
log_dir = os.path.join('logs', '{}_log'.format(key))
os.makedirs(log_dir, exist_ok=True)
os.makedirs('models', exist_ok=True)
checkpoint = ModelCheckpoint(filepath=os.path.join('models', '{}.h5'.format(key)),
                             monitor='acc', save_best_only=True)
earlystop = EarlyStopping(monitor='acc', patience=5, mode='max', restore_best_weights=True)
tensorboard = TensorBoard(log_dir=log_dir,
                          histogram_freq=0, batch_size=64, write_graph=False,
                          write_grads=False, write_images=True)
history = model.fit(x=X_train, y=Y_train, batch_size=64, epochs=20,
                    callbacks=[tensorboard, earlystop, checkpoint],
                    shuffle=True, verbose=1)

## Testing on Test Data

In [None]:
# Score the fully trained model on the held-out test split.
results = model.evaluate(
    x=X_test,
    y=Y_test,
    batch_size=64,
    verbose=1,
)
# results is [loss, accuracy]; report the accuracy to four decimal places.
test_accuracy = results[1]
print('{:.4f}'.format(test_accuracy))