In [1]:
import os
import sys
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MaxAbsScaler
import datetime

import tensorflow as tf
K = tf.keras.backend
EarlyStopping = tf.keras.callbacks.EarlyStopping
ReduceLROnPlateau = tf.keras.callbacks.ReduceLROnPlateau
TensorBoard = tf.keras.callbacks.TensorBoard

proj_dir = os.environ['CMS_ROOT']
sys.path.append(proj_dir)
from utils.utils import model_summary_to_string, args_to_dict, write_dnn_perf_metrics
from utils.logging import Logger
from utils.keras_callbacks import KerasRocAucCallback
from utils.data import load_data, load_sampled_data, get_embedded_data
from utils.mlp import create_model

In [2]:
############################################
# Parse CLI Args & Create DNN Config
############################################

config = {}
hidden_layers_markup = '128+64'
config['hidden_layers'] = [64, 64]
config['learn_rate'] = 1e-3
config['batch_size'] = 128
config['dropout_rate'] = 0.5
config['batchnorm'] = True
epochs = 25

# embedding_path=os.path.join(proj_dir, 'data', 'skipgram-e300-w5-i100.kv')
embedding_type = 'onehot'
drop_columns = ['state_code']

sample_size = 200000

In [3]:
############################################
# Define I/O Paths
############################################

now = datetime.datetime.today()

validation_auc_outputs = 'validation-auc-results.csv'
train_auc_outputs = 'train-auc-results.csv'
results_file = 'results.csv'

config_value = f'embedding:{embedding_type}-layers:{hidden_layers_markup}-learn_rate:{config.get("learn_rate")}'
config_value += f'-batch_size:{config.get("batch_size")}-dropout_rate:{config.get("dropout_rate")}-bathcnorm:{config.get("batchnorm")}'

if not os.path.isfile(train_auc_outputs):
    results_header = 'config,' + ','.join([f'ep_{i}' for i in range(epochs)])
    output_files = [train_auc_outputs, validation_auc_outputs]
    output_headers = [results_header,results_header]
    for file, header in zip(output_files, output_headers):
        with open(file, 'w') as fout:
            fout.write(header + '\n')

def write_results(file, results):
    with open(file, 'a') as fout:
        fout.write(results + '\n')



In [4]:
############################################
# Initialize Logger
############################################
ts = now.strftime("%m%d%y-%H%M%S")
log_file = f'logs/{ts}-{config_value}.txt'
logger = Logger(log_file)
logger.log_time('Using ts: {ts}')
logger.log_time(f'Outputs being written to {[validation_auc_outputs,train_auc_outputs]}')
logger.write_to_file()

In [5]:
data = load_sampled_data(sample_size)

# drop columns, onehot encode, or lookkup embeddings
x, y = get_embedded_data(data, embedding_type, None, drop_columns)

del data
logger.log_time(f'Loaded embedded data with shape {x.shape}')

Loading data from path /Users/jujohnson/cms-data/raw/Medicare_PUF_PartB_2012to2017.csv.gz
Loaded data with shape: (56818165, 12)
Dropped nan, updated shape: (56818165, 12)
Positive sample count: 36548 18.274%
Negative sample count: 163452 81.726%
Using columns Index(['gender', 'provider_type', 'hcpcs_code', 'line_srvc_cnt',
       'bene_unique_cnt', 'bene_day_srvc_cnt', 'average_submitted_chrg_amt',
       'average_medicare_payment_amt'],
      dtype='object')
Using onehot embedding


  df = df.to_sparse().to_coo().astype('float32')
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)


Embedded data shape: (200000, 7633)


<utils.logging.Logger at 0x7fe163c435f8>

In [13]:
############################################
# Train/Test Split & Normalize
############################################

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
# del x, y
scaler = MaxAbsScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [29]:
x_train.toarray()

<160000x7633 sparse matrix of type '<class 'numpy.float32'>'
	with 1279997 stored elements in Compressed Sparse Row format>

In [83]:
np.random.shuffle(x)

TypeError: sparse matrix length is ambiguous; use getnnz() or shape[0]

In [109]:
validation_auc_callback = KerasRocAucCallback(x_test, y_test, True, logger)
train_auc_callback = KerasRocAucCallback(x_train, y_train)
early_stopping = EarlyStopping(monitor='val_auc', min_delta=0.001, patience=10, mode='max')
tensorboard_dir = f'tensorboard/{ts}-{config_value}'
tensorboard = TensorBoard(log_dir=f'{tensorboard_dir}', write_graph=False)
callbacks = [validation_auc_callback, train_auc_callback, early_stopping, tensorboard]

In [None]:
############################################
# Training
############################################
K.clear_session()
input_dim = x_train.shape[1]
model = create_model(input_dim, config)

logger.log_time('Starting training...').write_to_file()

if 'onehot' in embedding_type:
  training_generator = DataGenerator(x_train, y_train, batch_size=batch_size)
  validation_generator = DataGenerator(x_test, y_test, batch_size=batch_size)
  history = model.fit_generator(
    epochs=epochs, generator=training_generator,
    validation_data=validation_generator,
    use_multiprocessing=True,
    callbacks=callbacks,
    workers=1)
else:
  history = model.fit(x_train, y_train, epochs=epochs, callbacks=callbacks, verbose=1)

logger.log_time('Trainin complete!').write_to_file()

Epoch 1/25
 192/1250 [===>..........................] - ETA: 57s - loss: 0.4978

In [None]:
############################################
# Write Results
############################################

prefix = f'{config_value}'
validation_aucs = np.array(history.history['val_auc'], dtype=str)
write_results(validation_auc_outputs, f'{prefix},{",".join(validation_aucs)}')
train_aucs = np.array(history.history['train_auc'], dtype=str)
write_results(train_auc_outputs, f'{prefix},{",".join(train_aucs)}')


minority_size = (y_train == 1).sum() / len(y_train) * 100
threshold = minority_size / 100

y_prob = model.predict(x_test)
write_dnn_perf_metrics(y_test, y_prob, threshold, config_value, 2, 64, final_results)

# free some memory
del history, x_test, y_test, x_train, y_train
del model

In [14]:
tf.__version__

'2.0.0'