In [1]:
import os
import time
import tempfile
import sklearn
import kerastuner
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from tensorflow import keras
from keras.wrappers.scikit_learn import KerasClassifier

from kerastuner.tuners import RandomSearch
from kerastuner.engine.hyperparameters import HyperParameters

from sklearn.metrics import confusion_matrix
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import f_regression
from sklearn.model_selection import KFold
from sklearn.feature_selection import RFECV

Using TensorFlow backend.


### Load data

In [2]:
X_train = pd.read_csv('./data/X_train.csv')
y_train = pd.read_csv('./data/y_train.csv')
X_train['y'] = y_train['y']
raw_df = X_train.drop(['id'], axis=1)

In [3]:
seed = 23
df = raw_df.copy()
all_labels = np.array(df.loc[:, 'y'])

# split and shuffle dataset
train_df, test_df = train_test_split(df, stratify=all_labels, random_state=seed, test_size=0.20)

# Form np arrays of labels and features.
y_train = np.array(train_df.pop('y'))
y_test = np.array(test_df.pop('y'))

# get features
X_train = np.array(train_df)
X_test = np.array(test_df)

# one-hot encode labels for multiclass model
y_train = tf.keras.utils.to_categorical(y_train, num_classes=3, dtype='int')
y_test = tf.keras.utils.to_categorical(y_test, num_classes=3, dtype='int')

### Transform data

In [4]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [5]:
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

(3840, 1000) (960, 1000)
(3840, 3) (960, 3)


### Define class weights

In [6]:
cls0, cls1, cls2 = np.bincount(raw_df['y'])
total = cls0 + cls1 + cls2
print('Samples:\n    Total: {}\n \
      Class 0: {} ({:.2f}% of total)\n \
      Class 1: {} ({:.2f}% of total)\n \
      Class 2: {} ({:.2f}% of total)'.format(total, cls0, 100*cls0/total, cls1, 100*cls1/total, cls2, 100*cls2/total))

Samples:
    Total: 4800
       Class 0: 600 (12.50% of total)
       Class 1: 3600 (75.00% of total)
       Class 2: 600 (12.50% of total)


In [7]:
# Scaling by total/2 helps keep the loss to a similar magnitude.
# The sum of the weights of all examples stays the same.
weight_for_0 = (1 / cls0)*(total)/3.0
weight_for_1 = (1 / cls1)*(total)/3.0
weight_for_2 = (1 / cls2)*(total)/3.0

class_weight = {0: weight_for_0, 1: weight_for_1, 2: weight_for_2}

print('Weight for class 0: {:.2f}'.format(weight_for_0))
print('Weight for class 1: {:.2f}'.format(weight_for_1))
print('Weight for class 2: {:.2f}'.format(weight_for_2))

Weight for class 0: 2.67
Weight for class 1: 0.44
Weight for class 2: 2.67


### Build Model

In [8]:
METRICS = [keras.metrics.CategoricalAccuracy(name="categorical_accuracy", dtype=None),
           keras.metrics.Precision(name='precision'),
           keras.metrics.Recall(name='recall'),
           keras.metrics.AUC(name='auc')]

n_features = X_train.shape[-1]
def make_model(hp):
    model = keras.Sequential()
    model.add(keras.layers.Dense(hp.Int('input_units', min_value=4, max_value=128, step=4, default=64),
                                 activation=hp.Choice('dense_activation', values=['relu', 'tanh', 'sigmoid'],
                                                      default='relu'),
                                 input_shape=(n_features,)))
    model.add(keras.layers.Dropout(hp.Float('dropout', 0, 0.5, step=0.1)))
    for i in range(hp.Int('n_layers', 1, 4)):
        model.add(keras.layers.Dense(hp.Int(f'dense_{i}_units', min_value=4, max_value=128, step=4, default=32),
                                     activation=hp.Choice('dense_activation', values=['relu', 'tanh', 'sigmoid'],
                                                      default='relu')))
    model.add(keras.layers.Dense(3, activation='softmax'))
    model.compile(optimizer=keras.optimizers.Adam(hp.Float('learning_rate', min_value=1e-4,
                                                           max_value=1e-2, sampling='LOG', default=1e-3)),
                  loss=keras.losses.CategoricalCrossentropy(),
                  metrics=METRICS,
                  weighted_metrics=['categorical_accuracy'])
    return model

### Search Parameters

In [None]:
tuner = RandomSearch(make_model,
                    objective=kerastuner.Objective('val_recall', direction='max'),
                    max_trials=20,
                    executions_per_trial=2,
                    directory='log_dir',
                    seed=seed)

tuner.search(x=X_train,
            y=y_train,
            epochs=40,
            batch_size=128,
            validation_data=(X_test, y_test),
            verbose=0,
            class_weight=class_weight)

In [15]:
print(tuner.get_best_hyperparameters()[0].values)
#print(tuner.results_summary())
print(tuner.get_best_models()[0].summary())

{'input_units': 120, 'dense_activation': 'relu', 'dropout': 0.1, 'n_layers': 3, 'dense_0_units': 60, 'learning_rate': 0.000164395039994333, 'dense_1_units': 40, 'dense_2_units': 116, 'dense_3_units': 48}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 120)               120120    
_________________________________________________________________
dropout (Dropout)            (None, 120)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 60)                7260      
_________________________________________________________________
dense_2 (Dense)              (None, 40)                2440      
_________________________________________________________________
dense_3 (Dense)              (None, 116)               4756      
__________________________________________________

In [16]:
model = tuner.get_best_models()[0]



### Cross validation

In [35]:
df = raw_df.copy()
y = np.array(df.pop('y'))
X = np.array(df)

In [19]:
EPOCHS = 200
BATCH_SIZE = 128

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall', 
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True)

In [37]:
# define kfold for training set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

score = list()

# loop through folds
for train_index, val_index in skf.split(X, y):

    # split into training and validation
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    # feature selection
    #X_train, X_val, cor = select_features(X_train, y_train, X_val, mutual_info_regression, n_features)

    # scale training, validation and testing set
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # oen-hot encode labels
    y_train = tf.keras.utils.to_categorical(y_train, num_classes=3, dtype='int')
    y_val = tf.keras.utils.to_categorical(y_val, num_classes=3, dtype='int')

    # train the model
    model = tuner.get_best_models()[0]
    history = model.fit(
        X_train,
        y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        callbacks = [early_stopping],
        validation_data=(X_val, y_val),
        verbose=0,
        class_weight=class_weight)

    # predict on test data
    test_predictions = model.predict(X_val, batch_size=BATCH_SIZE)

    # calculate average recall 
    prediction = np.argmax(test_predictions, -1)
    unhotted_test_labels = np.argmax(y_val, -1)
    BMAC = balanced_accuracy_score(unhotted_test_labels, prediction)

    # store bmac
    score.append(BMAC)
    print('BMAC: {:0.4f}'.format(BMAC))

print('CV complete.')

Restoring model weights from the end of the best epoch.
Epoch 00024: early stopping
BMAC: 0.9023
Restoring model weights from the end of the best epoch.
Epoch 00020: early stopping
BMAC: 0.9051
Restoring model weights from the end of the best epoch.
Epoch 00017: early stopping
BMAC: 0.9051
Restoring model weights from the end of the best epoch.
Epoch 00016: early stopping
BMAC: 0.9301
Restoring model weights from the end of the best epoch.
Epoch 00018: early stopping
BMAC: 0.9111
CV complete.


In [38]:
print("%0.4f (+/- %0.2f)" % (np.mean(score), np.std(score) * 2))

0.9107 (+/- 0.02)


### Predicting on test set

In [17]:
X_new = pd.read_csv('./data/X_test.csv')
X_new = X_new.drop(['id'], axis=1)
X_new = scaler.transform(X_new)

In [20]:
nn_prediction = model.predict(X_new, batch_size=BATCH_SIZE)
nn_prediction = np.argmax(nn_prediction, -1)

In [21]:
ID = np.array(range(len(X_new)))
df = pd.DataFrame({'id': ID,
                    'y': nn_prediction})
name = 'nn_test.csv'
path = os.path.join('.', name)
df.to_csv(path, index=False)