In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import numpy as np
import pickle
from tensorflow.contrib import layers
import sklearn.metrics as Metrics
from sklearn.linear_model import Lasso, LassoCV
from sklearn.cross_validation import KFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
import os


import itertools
from collections import Counter

import matplotlib.pyplot as plt

In [None]:
class Options(object):
    """Container for the hyperparameters and bookkeeping fields of the MLP experiments.

    All fields are plain attributes set to their defaults in ``__init__``;
    callers are expected to overwrite the ones that depend on the data
    (e.g. ``num_dcode``) after construction.
    """

    def __init__(self):
        self.gpu_id = 1
        ###################### Data ##########################
        # Number of diagnosis codes (must be set by the caller before building a model)
        self.num_dcode = None
        #################### Embeddings #########################
        # Vector size for each word embeddings from GloVe
        self.emb_size = 300
        ###################### Model #########################
        # Training Batch Size
        self.batch_size = 40
        # Epoch
        self.epoch = 251
        # Learning rate
        self.lr_rate = 1e-3
        # keep_prob, dropout_rate = 1 - keep_prob, here is the keep_prob rate
        self.keep_prob = 0.8
        # Hidden Layer
        self.H_dis = 300
        # Optimizer
        self.optimizer = 'Adam'
        # Validation Frequency (in training iterations)
        self.valid_freq = 100
        # Early Stopping
        self.early_stop = False
        # Patience
        self.patience = None
        # These two were previously assigned on the global name `opt`, which does
        # not exist yet while the first Options() is being constructed (NameError).
        # NOTE(review): neither is read in the code visible here - confirm their use.
        self.cur_num = 0
        self.num_test = 0
        # get_model() reads `opt.num_cur` to name its checkpoint file; give it a
        # default so a freshly constructed Options works there too.
        self.num_cur = 0

In [None]:
# Instantiate the default options; the model-building functions below take this object as `opt`.
opt = Options()

In [1]:
def get_minibatches_idx(n, minibatch_size, shuffle=False):
    """ Split the indices 0..n-1 into consecutive minibatches

    Args:
        n: size of the dataset
        minibatch_size: number of indices in each full minibatch
        shuffle: when True the indices are permuted with np.random.shuffle
            before they are split. Default to False.

    Return:
        zipped iterable of (minibatch number, int32 index array) pairs; when n
        is not a multiple of minibatch_size the last pair holds the remainder
    """
    indices = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(indices)

    # Full-size batches first ...
    n_full = n // minibatch_size
    batches = [indices[k * minibatch_size:(k + 1) * minibatch_size]
               for k in range(n_full)]

    # ... then whatever is left over becomes one final, shorter batch.
    consumed = len(batches) * minibatch_size
    if consumed != n:
        batches.append(indices[consumed:])

    return zip(range(len(batches)), batches)

In [None]:
def discriminator_2layer(H, opt, dropout, is_training, prefix='', num_outputs=1, is_reuse=None):
    """ Two-layer fully connected head that maps its input to raw logits

    The first layer is a bias-free dense projection to opt.H_dis units followed
    by batch normalization and ReLU; the second layer is a bias-free linear
    projection to num_outputs units. Dropout is applied to the input of each
    dense layer.

    Args:
        H: encoder input
        opt: option class; only opt.H_dis (hidden width) is read here
        dropout: keep rate passed as keep_prob to both dropout ops
        is_training: training flag for batch normalization
        prefix: string prepended to the dense layer names 'dis_1' and 'dis_2'
        num_outputs: number of units of the last layer
        is_reuse: passed as `reuse` to both dense layers

    Return:
        logits: logits for one batch. Further calculate the probability by using sigmoid/softmax.
    """
    # First layer: dropout -> linear -> batch norm -> ReLU
    first_input = tf.nn.dropout(H, keep_prob=dropout)
    hidden_linear = tf.layers.dense(first_input,
                                    units=opt.H_dis,
                                    activation=None,
                                    use_bias=False,
                                    name=prefix + 'dis_1',
                                    reuse=is_reuse)
    # NOTE(review): unlike the dense layers, batch norm receives neither `prefix`
    # nor `is_reuse` - confirm whether its variables are meant to be shared too.
    hidden_normed = tf.layers.batch_normalization(hidden_linear, training=is_training)
    hidden_act = tf.nn.relu(hidden_normed, name='relu')

    # Second layer: dropout -> linear (no activation, these are the logits)
    second_input = tf.nn.dropout(hidden_act, keep_prob=dropout)
    return tf.layers.dense(second_input,
                           units=num_outputs,
                           use_bias=False,
                           name=prefix + 'dis_2',
                           reuse=is_reuse)

In [None]:
def emb_classifier(x, y, keep_prob, opt, is_training):
    """ Build the MLP classifier graph on top of discriminator_2layer

    Args:
        x: input tensor; cast to float32 before it is fed to the network
        y: labels passed to the sigmoid cross-entropy loss
        keep_prob: dropout keep rate forwarded to discriminator_2layer
        opt: option class; opt.lr_rate is the Adam learning rate
        is_training: training flag forwarded to discriminator_2layer

    Return:
        prob: sigmoid of the logits
        loss: mean sigmoid cross-entropy between y and the logits
        train_step: Adam minimization op for loss
        saver: tf.train.Saver created inside this function
    """
    features = tf.cast(x, tf.float32)
    logits = discriminator_2layer(features, opt, keep_prob, is_training)
    prob = tf.nn.sigmoid(logits)

    per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits)
    loss = tf.reduce_mean(per_example_loss)

    # The saver is created before the optimizer.
    # NOTE(review): presumably so that Adam's slot variables stay out of the
    # checkpoints - confirm before reordering.
    saver = tf.train.Saver()

    # Run the ops collected in UPDATE_OPS (batch-norm statistics) with every train step.
    pending_updates = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(pending_updates):
        optimizer = tf.train.AdamOptimizer(opt.lr_rate)
        train_step = optimizer.minimize(loss)

    return prob, loss, train_step, saver

In [None]:
def _evaluate_in_batches(sess, data, labels, n_samples, batch_size,
                         x_ph, y_ph, keep_prob_ph, prob_op, loss_op):
    """Score the first n_samples items of data in minibatches, with dropout disabled.

    Args:
        sess: active tf.Session
        data, labels: indexable collections of inputs and their labels
        n_samples: how many leading items of data/labels to score
        batch_size: minibatch size used for scoring
        x_ph, y_ph, keep_prob_ph: placeholders for inputs, labels and keep rate
        prob_op, loss_op: tensors producing the probabilities and the batch loss

    Return:
        (auc, mean_loss): ROC AUC over all scored items and the mean of the
        per-batch losses
    """
    prob_list = []
    true_list = []
    loss_list = []
    for _, batch_idx in get_minibatches_idx(n_samples, batch_size, shuffle=False):
        x_batch = [data[t] for t in batch_idx]
        y_batch = np.array([labels[t] for t in batch_idx]).reshape([-1, 1])

        # keep_prob = 1.0 turns dropout off; the training flag is not fed here.
        batch_prob, batch_loss = sess.run([prob_op, loss_op],
                                          feed_dict={x_ph: x_batch,
                                                     y_ph: y_batch,
                                                     keep_prob_ph: 1.0})
        prob_list += batch_prob.tolist()
        true_list += y_batch.tolist()
        loss_list.append(batch_loss)

    auc = Metrics.roc_auc_score(np.asarray(true_list), np.asarray(prob_list))
    return auc, np.mean(np.array(loss_list))


def get_model(train_data, train_label, val_data, val_label, opt):
    """Train the MLP classifier and track train/validation AUC and loss.

    Every opt.valid_freq training iterations the model is scored on the
    training data and on the validation data. Whenever the validation AUC
    improves on the best one so far, a checkpoint is written to
    './save/mlp_dcode/<opt.num_cur>'.

    Args:
        train_data, train_label: indexable training inputs and labels
        val_data, val_label: indexable validation inputs and labels
        opt: option object. Reads num_dcode, epoch, batch_size, keep_prob,
            valid_freq, early_stop, patience and lr_rate/H_dis (via
            emb_classifier). Optional attributes:
              - samples: number of leading training items to score at each
                validation step (defaults to, and is capped at, the training
                set size)
              - num_cur (or cur_num): suffix of the checkpoint file name
              - patience: number of non-improving validations tolerated
                before stopping; None disables early stopping
            A truthy opt.early_stop on entry is treated as an already-raised
            stop flag (no training happens), as before. The flag is tracked
            locally and opt is not modified, so the same opt can be reused
            for several calls.

    Return:
        [train_auc_list, val_auc_list, train_loss, val_loss], one entry per
        validation step.
    """
    n_train = len(train_data)
    n_val = len(val_data)

    # Reset first, then seed: the graph-level seed is stored on the current
    # default graph, so seeding before the reset was silently discarded.
    tf.reset_default_graph()
    tf.set_random_seed(123)

    x_ = tf.placeholder(tf.int32, shape=[None, opt.num_dcode])
    y_ = tf.placeholder(tf.float32, shape=[None, 1])
    is_training = tf.placeholder_with_default(False, shape=(), name="training")
    keep_prob = tf.placeholder(tf.float32)
    prob_, loss_, train_op, saver_ = emb_classifier(x_, y_, keep_prob, opt, is_training)

    # How many training items to score at validation time.
    n_train_eval = getattr(opt, 'samples', None)
    if n_train_eval is None:
        n_train_eval = n_train
    n_train_eval = min(n_train_eval, n_train)

    # Saver.save() fails when the parent directory is missing, so create it up front.
    ckpt_dir = './save/mlp_dcode/'
    if not os.path.isdir(ckpt_dir):
        os.makedirs(ckpt_dir)
    ckpt_path = ckpt_dir + str(getattr(opt, 'num_cur', getattr(opt, 'cur_num', 0)))

    patience = getattr(opt, 'patience', None)
    early_stop = bool(opt.early_stop)
    curr_patience = 0

    train_auc_list = []
    val_auc_list = []
    train_loss = []
    val_loss = []

    uidx = 0
    stop_uidx = 0
    max_val_auc = 0

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(opt.epoch):
            train_minibatch_idx = get_minibatches_idx(n_train, opt.batch_size, shuffle=True)
            for _, train_index in train_minibatch_idx:
                if early_stop:
                    break
                uidx += 1

                # One optimization step on the current minibatch
                x_batch = [train_data[t] for t in train_index]
                x_labels = np.array([train_label[t] for t in train_index]).reshape([-1, 1])
                sess.run(train_op, feed_dict={x_: x_batch, y_: x_labels,
                                              keep_prob: opt.keep_prob, is_training: True})

                if uidx % opt.valid_freq != 0:
                    continue

                # Train metrics at iteration uidx
                train_auc, train_loss_mean = _evaluate_in_batches(
                    sess, train_data, train_label, n_train_eval, opt.batch_size,
                    x_, y_, keep_prob, prob_, loss_)
                train_loss.append(train_loss_mean)
                train_auc_list.append(train_auc)

                # Validation metrics at iteration uidx
                val_auc, val_loss_mean = _evaluate_in_batches(
                    sess, val_data, val_label, n_val, opt.batch_size,
                    x_, y_, keep_prob, prob_, loss_)
                val_loss.append(val_loss_mean)
                val_auc_list.append(val_auc)

                if val_auc > max_val_auc:
                    # New best model: remember it and reset the patience counter
                    stop_uidx = uidx
                    curr_patience = 0
                    max_val_auc = val_auc
                    saver_.save(sess, ckpt_path)
                elif patience is not None and curr_patience < patience:
                    curr_patience += 1

                # Patience exhausted -> raise the stop flag (this used to assign
                # False, so early stopping could never trigger).
                if patience is not None and curr_patience >= patience:
                    early_stop = True

            if early_stop:
                print("Early stopping at epoch {0}: \t iteration: {1} \t max val AUC: {2}".format(epoch, stop_uidx, 
                                                                                                round(max_val_auc,4)))
                break
    return [train_auc_list, val_auc_list, train_loss, val_loss]