In [3]:
import datetime
import os
import sys

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler

import tensorflow as tf
K = tf.keras.backend
EarlyStopping = tf.keras.callbacks.EarlyStopping
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau
TensorBoard = tf.keras.callbacks.TensorBoard
Sequential = tf.keras.models.Sequential
Dense = tf.keras.layers.Dense
Activation = tf.keras.layers.Activation
BatchNormalization = tf.keras.layers.BatchNormalization
Dropout = tf.keras.layers.Dropout
Adam = tf.keras.optimizers.Adam

proj_dir = os.environ['CMS_ROOT']
sys.path.append(proj_dir)
from utils.utils import model_summary_to_string, args_to_dict, write_dnn_perf_metrics
from utils.logging import Logger
from utils.keras_callbacks import KerasRocAucCallback
from utils.data import load_data, get_embedded_data
from utils.mlp import create_model

In [5]:
############################################
# Create DNN Config (values are hard-coded in this notebook)
############################################

config = {}
config['hidden_layers'] = [64, 64]
# Build the "w1+w2+..." label from the actual layer widths so the run label
# written to logs/CSVs cannot drift from the architecture that is trained.
hidden_layers_markup = '+'.join(str(width) for width in config['hidden_layers'])
config['learn_rate'] = 1e-3
config['batch_size'] = 128
config['dropout_rate'] = 0.5
config['batchnorm'] = True
epochs = 25

embedding_path=os.path.join(proj_dir, 'data', 'skipgram-e300-w5-i100.kv')
# NOTE(review): the label says 'skipgram150' while the file name above contains
# 'e300' -- confirm which one describes the embedding actually used.
embedding_type = 'skipgram150'
drop_columns = ['state_code']

sample_size = 200000

In [6]:
############################################
# Define I/O Paths
############################################

# Timestamp of this run; later cells format it into log/tensorboard names.
now = datetime.datetime.today()

validation_auc_outputs = 'validation-auc-results.csv'
train_auc_outputs = 'train-auc-results.csv'
results_file = 'results.csv'

# Label identifying this run's hyperparameters; used as the first column of
# each results row and inside log/tensorboard paths.
# NOTE(review): 'bathcnorm' is misspelled; it is left unchanged so new rows keep
# the same label format as rows already written by earlier runs.
config_value = f'embedding:{embedding_type}-layers:{hidden_layers_markup}-learn_rate:{config.get("learn_rate")}'
config_value += f'-batch_size:{config.get("batch_size")}-dropout_rate:{config.get("dropout_rate")}-bathcnorm:{config.get("batchnorm")}'

# Create each per-epoch AUC file with a header row only if that file is
# missing. Checking every file on its own avoids truncating an existing
# validation file (and avoids leaving one without a header) when only one of
# the two files is present.
results_header = 'config,' + ','.join(f'ep_{i}' for i in range(epochs))
for auc_path in (train_auc_outputs, validation_auc_outputs):
    if not os.path.isfile(auc_path):
        with open(auc_path, 'w') as fout:
            fout.write(results_header + '\n')

def write_results(file, results):
    """Append ``results`` followed by a newline to the file at path ``file``.

    The file is opened in append mode, so existing content is preserved and
    the file is created if it does not exist yet.
    """
    with open(file, mode='a') as sink:
        record = results + '\n'
        sink.write(record)


In [8]:
############################################
# Initialize Logger
############################################
# Run timestamp (MMDDYY-HHMMSS) used to make output names unique per run.
ts = now.strftime("%m%d%y-%H%M%S")
tensorboard_dir = f'tensorboard/{ts}-{config_value}/'
log_file = f'logs/{ts}-{config_value}.txt'
logger = Logger(log_file)
logger.log_time('Starting job')
# f-string so the actual timestamp is logged rather than the literal '{ts}'.
logger.log_time(f'Using ts: {ts}')
logger.log_time(f'Outputs being written to {[validation_auc_outputs,train_auc_outputs]}')
logger.write_to_file()

In [None]:
# Display the head of whatever `load_sampled_data` returns for 1,000,000.
# NOTE(review): `load_sampled_data` is neither imported nor defined in the cells
# visible here, and no visible cell assigns the `x` / `y` that the train/test
# split cell reads -- on a fresh kernel this would fail. Confirm where the
# data-loading cell went. The argument also differs from `sample_size`
# (200000) set in the config cell -- confirm which is intended.
load_sampled_data(1000000).head()

Loading data from path /Users/jujohnson/cms-data/raw/Medicare_PUF_PartB_2012to2017.csv.gz


In [8]:
############################################
# Train/Test Split & Normalize
############################################

# Fixed seed so the 80/20 partition is identical on every run; without it the
# reported AUCs are computed on a different test set each time.
# NOTE(review): consider `stratify=y` as well if the positive class is rare --
# confirm against the intended experimental protocol.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=SPLIT_SEED
)
y_train, y_test = np.array(y_train), np.array(y_test)

# Fit the scaler on the training rows only, then apply the same scaling to the
# test rows so no test-set statistics leak into training.
scaler = MaxAbsScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# Drop the references to the unsplit data to free memory.
del x, y

In [9]:
############################################
# Setup Training Callbacks
############################################

# ROC AUC callbacks for the held-out and the training data.
validation_auc_callback = KerasRocAucCallback(x_test, y_test, True, logger)
train_auc_callback = KerasRocAucCallback(x_train, y_train)

# Stop once 'val_auc' has not improved by at least 0.001 for 5 epochs and roll
# back to the best weights.
# NOTE(review): presumably 'val_auc' is added to the epoch logs by
# KerasRocAucCallback, which is why it precedes early stopping in the list --
# confirm against utils.keras_callbacks.
early_stopping = EarlyStopping(
    monitor='val_auc',
    mode='max',
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)
tensorboard = TensorBoard(log_dir=tensorboard_dir, write_graph=False)

callbacks = [
    validation_auc_callback,
    train_auc_callback,
    early_stopping,
    tensorboard,
]

In [12]:
############################################
# Build Model
############################################
# Number of input features (columns of the scaled training matrix).
input_dim = x_train.shape[1]

K.clear_session()
# Hyperparameters, with fallbacks for keys the config does not set.
learn_rate = config.get('learn_rate', 1e-3)
dropout_rate = config.get('dropout_rate')
batchnorm = config.get('batchnorm', False)
hidden_layers = config.get('hidden_layers', [32])
activation = config.get('activation', 'relu')
optimizer = config.get('optimizer', Adam)
gpu_count = config.get('gpus', 0)

model = Sequential()

# Hidden stack: Dense -> [BatchNorm] -> Activation -> [Dropout] per width.
for idx, width in enumerate(hidden_layers):
    if idx == 0:
        # Only the first layer declares the input size; `input_dim` itself is
        # left untouched so it still holds the feature count after the loop.
        model.add(Dense(width, input_dim=input_dim))
    else:
        model.add(Dense(width))
    if batchnorm:
        model.add(BatchNormalization())
    model.add(Activation(activation))
    if dropout_rate is not None:
        model.add(Dropout(dropout_rate))

# Output layer: single sigmoid unit for binary classification.
model.add(Dense(1, activation='sigmoid'))

# Replicate across GPUs only when more than one is requested. The helper is
# looked up lazily because newer TensorFlow releases no longer ship it
# (tf.distribute.MirroredStrategy is the replacement there).
if gpu_count > 1:
    model = tf.keras.utils.multi_gpu_model(model, gpu_count)

model.compile(loss='binary_crossentropy', optimizer=optimizer(learn_rate))


def write_model(model, path):
    """Write the string returned by ``model.to_json()`` to ``path``.

    The file is opened in write mode, so any existing content is replaced.
    """
    serialized = model.to_json()
    with open(path, mode='w') as sink:
        sink.write(serialized)

In [13]:
############################################
# Training
############################################

logger.log_time('Starting training...').write_to_file()
# Pass the configured batch size explicitly: it is part of the run label, and
# without it Keras would silently train with its default of 32.
history = model.fit(
    x_train,
    y_train,
    batch_size=config.get('batch_size', 32),
    epochs=epochs,
    callbacks=callbacks,
    verbose=1,
)
logger.log_time('Trainin complete!').write_to_file()

Train on 160000 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25


In [14]:
############################################
# Write Results
############################################

# One CSV row per run: the config label followed by the per-epoch AUC values
# recorded in the training history.
prefix = config_value
validation_aucs = np.array(history.history['val_auc'], dtype=str)
write_results(validation_auc_outputs, f'{prefix},{",".join(validation_aucs)}')
train_aucs = np.array(history.history['train_auc'], dtype=str)
write_results(train_auc_outputs, f'{prefix},{",".join(train_aucs)}')


# Share of positive labels in the training set, as a percentage and as a
# fraction; the fraction is used as the decision threshold below.
minority_size = (y_train == 1).sum() / len(y_train) * 100
threshold = minority_size / 100

y_prob = model.predict(x_test)
# `results_file` is the path defined in the I/O cell; the name previously used
# here (`final_results`) is not defined anywhere in the visible notebook.
# NOTE(review): the literals 2 and 64 look like the hidden-layer depth and
# width ([64, 64]) -- confirm against write_dnn_perf_metrics and derive them
# from config if so.
write_dnn_perf_metrics(y_test, y_prob, threshold, config_value, 2, 64, results_file)

# free some memory
del history, x_test, y_test, x_train, y_train
del model