In [1]:
# Third-party / stdlib dependencies used by the notebook.
import pandas as pd
import numpy as np
import os
import sys
import datetime
# NOTE(review): ParameterGrid is imported but not used in the visible cells.
from sklearn.model_selection import StratifiedKFold, ParameterGrid
# The CMS_ROOT environment variable must be set (a KeyError is raised otherwise);
# its value is appended to the import path before cms_modules is imported.
sys.path.append(os.environ['CMS_ROOT'])
# NOTE(review): args_to_dict is imported but not used in the visible cells.
from cms_modules.utils import model_summary_to_string, args_to_dict
from cms_modules.logging import Logger

import tensorflow as tf
# Short aliases for the Keras callbacks used in the training loop.
EarlyStopping = tf.keras.callbacks.EarlyStopping
TensorBoard = tf.keras.callbacks.TensorBoard

# NOTE(review): absolute, user-specific path; the notebook only runs on a machine
# where this checkout exists. Consider reading it from an environment variable,
# as is done for CMS_ROOT above.
ecbdl14_root = '/home/jjohn273/git/ECBDL14-Classification/'
# The project root is appended to the import path before the two imports below.
sys.path.append(ecbdl14_root)
from model import create_model
from CustomCallbacks import KerasRocAucCallback

  from ._conv import register_converters as _register_converters


### Define DNN Config

In [2]:
# Hyper-parameters of the network; the dict is handed to create_model() and
# also summarised in the configuration label used for logs and result files.
config = {
    'hidden_layers': [32, 32],
    'learn_rate': 0.001,
    'batch_size': 128,
    'dropout_rate': 0.5,
    'batchnorm': True,
}
# Text form of the layer widths (e.g. '32+32'). Derived from the config so the
# label written to logs/results cannot drift from the layers actually used.
hidden_layers_markup = '+'.join(str(width) for width in config['hidden_layers'])
# Maximum number of training epochs per fold.
epochs = 50
# When True, only the first 10000 rows of the data set are used.
debug = True

### Define I/O Paths

In [3]:
# --- inputs ---
# Key to read from the HDF file, and the file's location under the project root.
data_key = 'train'
data_path = os.path.join(ecbdl14_root, 'data/ecbdl14.onehot.sample.hdf')

# --- outputs ---
# Both result files are prefixed with the start time (MMDDYY-HHMMSS), so runs
# started at different seconds write to distinct files.
now = datetime.datetime.today()
ts = now.strftime("%m%d%y-%H%M%S")
train_auc_outputs = '{}-train-auc-results.csv'.format(ts)
validation_auc_outputs = '{}-validation-auc-results.csv'.format(ts)

### Init Output Files

In [4]:
# Label identifying this hyper-parameter combination; it is used in the log
# file name, the TensorBoard directory and the first column of the result files.
config_value = f'layers:{hidden_layers_markup}-learn_rate:{config.get("learn_rate")}'
config_value += f'-batch_size:{config.get("batch_size")}-dropout_rate:{config.get("dropout_rate")}-batchnorm:{config.get("batchnorm")}'

# Epoch stride of the header columns. This name was referenced below without
# ever being defined in the notebook, which raises NameError on a fresh kernel.
# The training loop writes every entry of the per-epoch history lists, so one
# column per epoch is assumed here -- TODO confirm against KerasRocAucCallback.
callback_freq = 1

# Create both result files with a header row, unless the train file already exists.
if not os.path.isfile(train_auc_outputs):
    epoch_columns = [f'ep_{i}' for i in range(epochs) if i % callback_freq == 0]
    results_header = 'config,fold,' + ','.join(epoch_columns)
    output_files = [train_auc_outputs, validation_auc_outputs]
    for file in output_files:
        with open(file, 'w') as fout:
            fout.write(results_header + '\n')

def write_results(file, results):
    """Append one line to a results file.

    Args:
        file: path of the file to append to (created if it does not exist).
        results: text of the line, without the trailing newline.
    """
    with open(file, mode='a') as handle:
        line = results + '\n'
        handle.write(line)

### Init Logger

In [10]:
# Root directory under which the per-fold TensorBoard logs are written.
tensorboard_dir = 'tensorboard'
# The log file name combines the run timestamp with the configuration label.
log_file = 'logs/' + f'{ts}-{config_value}.txt'
logger = Logger(log_file)

# Record the start of the job and where the AUC results go, then write the log to file.
result_files = [validation_auc_outputs, train_auc_outputs]
logger.log_time('Starting grid search job')
logger.log_time(f'Outputs being written to {result_files}')
logger.write_to_file()

### Load Data

In [6]:
# Read the full table from the HDF file and log its shape.
df = pd.read_hdf(data_path, data_key)
logger.log_time(f'Loaded data with shape {df.shape}').write_to_file()

# In debug mode only the first 10000 rows are kept; otherwise the whole frame is used.
source = df[:10000] if debug else df

# Split into the label column and the remaining feature columns.
y = source['target']
x = source.drop(columns=['target'])

### Iterate Over K-Fold Validation

In [11]:
# Three stratified folds. NOTE: shuffle=True is used without a random_state, so
# fold membership differs from one run of the notebook to the next.
stratified_cv = StratifiedKFold(n_splits=3, shuffle=True)
logger.log_time('Starting cross-validation')
logger.log_time(f'Using config: {config_value}')

# iterate over cross-validation folds
for fold, (train_index, validate_index) in enumerate(stratified_cv.split(x, y)):
    logger.log_time(f'Starting fold {fold}').write_to_file()
    # prepare input data as plain arrays for Keras
    x_train, y_train = x.iloc[train_index].values, y.iloc[train_index].values
    x_valid, y_valid = x.iloc[validate_index].values, y.iloc[validate_index].values
    input_dim = x_train.shape[1]

    # setup callbacks for monitoring AUC and early stopping
    validation_auc_callback = KerasRocAucCallback(x_valid, y_valid, True, logger)
    train_auc_callback = KerasRocAucCallback(x_train, y_train)
    # fit() receives no validation data, so the monitored 'val_auc' value is
    # presumably published by validation_auc_callback -- TODO confirm against
    # CustomCallbacks. If so, that callback must stay ahead of EarlyStopping in
    # the list below.
    early_stopping = EarlyStopping(monitor='val_auc', min_delta=0.01, patience=10, mode='max')
    tensorboard = TensorBoard(log_dir=f'{tensorboard_dir}/{config_value}/fold-{fold}', write_graph=False)

    callbacks = [validation_auc_callback, train_auc_callback, early_stopping, tensorboard]

    # create model and log its description on 1st run
    dnn = create_model(input_dim, config)
    if fold == 0:
        logger.log_time(model_summary_to_string(dnn)).write_to_file()

    # train model
    logger.log_time('Starting training...').write_to_file()
    # Pass the configured batch size explicitly: without it fit() falls back to
    # the Keras default, while the logs and result files report config['batch_size'].
    history = dnn.fit(
        x_train, y_train,
        batch_size=config['batch_size'],
        epochs=epochs,
        callbacks=callbacks,
        verbose=0,
    )
    logger.log_time('Training complete!').write_to_file()
    # Log the recorded metrics rather than the History object's repr, and write
    # them to file like the other log calls in this loop.
    logger.log_time(f'History: {history.history}').write_to_file()

    # write results: one row per fold, one value per completed epoch
    # (early stopping can leave a row shorter than the header)
    prefix = f'{config_value},{fold}'
    validation_aucs = np.array(history.history['val_auc'], dtype=str)
    write_results(validation_auc_outputs, f'{prefix},{",".join(validation_aucs)}')
    # 'train_auc' is presumably published by train_auc_callback -- TODO confirm.
    train_aucs = np.array(history.history['train_auc'], dtype=str)
    write_results(train_auc_outputs, f'{prefix},{",".join(train_aucs)}')