# Calculate and graph performance summary statistics #

**Author: Andrew Larkin**, Oregon State University College of Public Health and Human Sciences <br>
**Date created:** January 7, 2018

### Summary ###
Once hyperparameter values have been selected in the hyperparameter tuning script, ModelTraining_FixedParams reads performance dictionaries in pickled format, trains the model with the selected hyperparameters, and saves model weights and metadata from the epoch with the lowest dev cost after 50000 epochs. This script is also used to create the sensitivity models (i.e. those with one of the input features such as hashtags, emoticons, etc. or an output class label removed from the model structure).

### Import libraries and define global variables and constants ###

In [1]:
#import libraries
import numpy as np
import tensorflow as tf
import re, string
import pickle
import os
import math
import copy
import pandas as ps

  from ._conv import register_converters as _register_converters


In [12]:
# root project folder, the folder holding the preprocessed inputs, and the folder
# that receives the model training outputs
parentFolder = "C:/Users/larkinan/Desktop/DBTraining/"
dataset = parentFolder + "preprocessingOutput/"
performFolder = parentFolder + "modelTrainingPerformance/"

# full paths of the pickled datasets, keyed by the names used to look them up
datasetPickleParams = {
    pathKey: dataset + pickleFileName
    for pathKey, pickleFileName in (
        ("trainDictPicklePath", "TrainDict_Jan7_18.p"),
        ("devDictPicklePath", "DevDict_Jan7_18.p"),
        ("testDictPicklePath", "TestDict_Jan7_18.p"),
        ("allDictPicklePath", "allDict.p"),
        ("embeddingMatrixPicklePath", "embeddingMatrix.p"),
        ("word2IndexPicklePath", "word2Index.p"),
        ("NYCDictPicklePath", "NYCDict_Jan7_18.p"),
    )
}

In [3]:
# Hyperparameters are fixed at this stage, so they live in module-level variables.

# input and output dimensions
vecDim = 300                     # width of each word embedding vector
numOutcomes = 7                  # number of outcome labels (2 output units per outcome)

# network structure
numPostLSTMLayers = 2            # number of dense layers stacked after the LSTM
hiddenLayerSize = 256            # units in each post-LSTM dense layer
hiddenLayerActivation = 'tanh'   # activation name for the post-LSTM dense layers
preSoftmaxLayerSize = 14         # units in the last dense layer before the softmax

# regularization
keepRateLSTM = 0.9               # keep probability for the LSTM outputs
keepRate = 0.5                   # keep probability for the dense layers
l2Reg = 0.001                    # multiplier applied to the L2 weight penalty

# optimization
learningRate = 9e-5              # learning rate passed to the optimizer
batchSize = 64                   # records sampled per training step
numEpochs = 100000               # number of training steps to run

In [4]:
# load pickled preprocessed data
def loadDatasets(pickleParams):
    """Load the pickled datasets, embedding matrix, and word index map from disk.

    Inputs:
        pickleParams (dict) - maps 'trainDictPicklePath', 'devDictPicklePath',
            'testDictPicklePath', 'NYCDictPicklePath', 'embeddingMatrixPicklePath'
            and 'word2IndexPicklePath' to the pickle files to read
    Outputs:
        tuple (trainDict, devDict, testDict, NYC_Dict, embeddingMatrix, word2IndexMap)
        holding the unpickled objects, in that order
    """
    # NOTE: unpickling can execute arbitrary code, so only point this at trusted files

    def loadPickle(pathKey):
        # the context manager closes the file handle even when unpickling fails
        with open(pickleParams[pathKey],'rb') as pickleFile:
            return pickle.load(pickleFile)

    trainDict = loadPickle('trainDictPicklePath')
    devDict = loadPickle('devDictPicklePath')
    testDict = loadPickle('testDictPicklePath')
    NYC_Dict = loadPickle('NYCDictPicklePath')
    embeddingMatrix = loadPickle('embeddingMatrixPicklePath')
    word2IndexMap = loadPickle('word2IndexPicklePath')
    return(trainDict,devDict,testDict,NYC_Dict,embeddingMatrix,word2IndexMap)

In [5]:
# extract vectors from dataset dictionary.  
def extractDataFromDict(inputDict):
    """Pull the six per-record vectors out of a dataset dictionary.

    Inputs:
        inputDict (dict) - dataset dictionary holding the keys 'sent', 'labels',
            'seqLens', 'hash', 'emot' and 'loc_ind'
    Outputs:
        tuple with the values stored under those keys, in the order listed above
    """
    fieldOrder = ('sent', 'labels', 'seqLens', 'hash', 'emot', 'loc_ind')
    return tuple(inputDict[fieldName] for fieldName in fieldOrder)

In [6]:
#  randomly sample record indices 
def getSampledIndices(dataX,batchSize):
    """Randomly choose record indices without replacement.

    Inputs:
        dataX - collection of records; only its length is used
        batchSize (int) - number of indices to sample
    Outputs:
        list of at most batchSize distinct indices into dataX, in random order
    """
    shuffledOrder = list(range(len(dataX)))
    # shuffle every index in place, then keep only the leading batchSize entries
    np.random.shuffle(shuffledOrder)
    return(shuffledOrder[0:batchSize])

In [7]:
# get sampled text and convert to index values using the map
def getSampledXVals(sampledIndices,dataX,word2IndexMap):
    """Convert the text of the sampled records into lists of word indices.

    Inputs:
        sampledIndices (int array) - indices of the records to convert
        dataX (string array) - text for all records; words are split on whitespace
        word2IndexMap (dict) - dictionary of word:index keys
    Outputs:
        list with one list of indices per sampled record, in sampledIndices order
    """
    def toIndex(token):
        # datasets other than train, dev, and test can contain words that are missing
        # from the dictionary; those words all share the 'UNK' index
        if token not in word2IndexMap:
            return word2IndexMap['UNK']
        return word2IndexMap[token]

    return([[toIndex(token) for token in dataX[recordNum].split()]
            for recordNum in sampledIndices])

### get a random sample for a single epoch or evaluation ###
**Inputs**: <br>
- **batchSize** (int) - number of records to randomly sample <br>
- **dataX** (string array) - tweet text for all records <br>
- **dataY** (array of 1x7 int arrays) - each 1x7 int array corresponds to 7 labels for one record <br>
- **dataSeqLens** (int array) - number of words in each record
- **dataHash** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from a hashtag <br>
- **dataEmot** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from an emoticon <br>
- **dataLoc** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from a regional location description <br>
- **word2IndexMap** (dict) - dictionary of word:index keys <br>
- **numOutcomes** (int) - number of outcomes in the dataset <br>

**Outputs**: <br>
- **sampledX** (array of int arrays) - word2Index mapped numbers for the words in the sampled tweets <br>
- **sampledY** (array of int arrays) - outcome labels for sampled tweets <br>
- **sampledSeqLens** (int array) - lengths of the sampled tweets <br>
- **sampledHash** (array of int arrays) - indicator values of which words in the sampled tweets are hashtags <br>
- **sampledEmot** (array of int arrays) - indicator values of which words in the sampled tweets are emoticon descriptions <br>
- **sampledLoc** (array of int arrays) - indicator values of which words in the sampled tweets are regional descriptions that use nature-related vocabulary <br>
- **sampledIndices** (int array) - sampled record indices in the original dataset <br>

In [8]:
def getSentenceBatch(batchSize,dataX,dataY,
                       dataSeqlens,dataHash,dataEmot,dataLoc,
                       word2IndexMap,numOutcomes):
    """Draw one random batch of records and format it for the model placeholders.

    Inputs:
        batchSize (int) - number of records to randomly sample
        dataX (string array) - tweet text for all records
        dataY - label vectors for all records; the first numOutcomes*2 values are kept
        dataSeqlens (int array) - number of words in each record
        dataHash, dataEmot, dataLoc - per-word indicator vectors for all records
        word2IndexMap (dict) - dictionary of word:index keys
        numOutcomes (int) - number of outcomes in the dataset
    Outputs:
        tuple (sampledX, sampledY, sampledSeqlens, sampledHash, sampledEmot,
        sampledLoc, sampledIndices). sampledY has shape (n, numOutcomes*2) and each
        indicator array is float32 with shape (n, vector length, 1), where n is the
        number of records sampled.
    """
    sampledIndices = getSampledIndices(dataX,batchSize)

    # getSampledIndices returns fewer than batchSize indices when the dataset holds
    # fewer than batchSize records, so shape the arrays by the number actually sampled
    numSampled = len(sampledIndices)

    def sampleIndicators(indicators):
        # the trailing axis of size 1 lets each indicator act as one extra channel per word
        sampled = np.asarray([indicators[i] for i in sampledIndices],dtype=np.float32)
        return sampled.reshape((numSampled,len(indicators[0]),1))

    sampledX = getSampledXVals(sampledIndices,dataX,word2IndexMap)
    sampledY = np.asarray(
        [dataY[i][0:numOutcomes*2] for i in sampledIndices]
    ).reshape((numSampled, numOutcomes*2))
    sampledSeqlens = [dataSeqlens[i] for i in sampledIndices]
    sampledHash = sampleIndicators(dataHash)
    sampledEmot = sampleIndicators(dataEmot)
    sampledLoc = sampleIndicators(dataLoc)

    return(sampledX,sampledY,sampledSeqlens,sampledHash,sampledEmot,sampledLoc,sampledIndices)

In [9]:
def createModel(numFeatures,embeddingMatrix,trainDict,testDict,word2IndexMap):
    """Build the bidirectional LSTM classifier, train it, and save the best weights.

    The graph embeds each word index, appends per-word indicator channels, runs a
    bidirectional LSTM, and passes the final hidden states through dense layers. The
    pre-softmax output is split into one 2-unit group per outcome and scored with
    softmax cross entropy. Every 500 steps a 5000-record sample of testDict is
    evaluated, and the weights are saved whenever its cost is the lowest seen so far.

    Inputs:
        numFeatures (int) - number of word slots per record; also used as the LSTM size
        embeddingMatrix - word vectors, one row per vocabulary entry and vecDim columns
        trainDict (dict) - training records, read with extractDataFromDict
        testDict (dict) - held-out records used for the periodic evaluation
        word2IndexMap (dict) - dictionary of word:index keys used to encode the text
    Outputs:
        None. Writes model_io.meta and checkpoints with the prefix "model" to
        performFolder, and prints the evaluation cost.
    """
    # NOTE(review): tf.reset_default_graph() is called inside this device scope, i.e. after
    # the scope was entered on the graph that gets discarded, so the GPU pinning may not
    # apply to the ops created afterwards - confirm.
    with tf.device('/device:GPU:0'):
        
        vocabSize = len(embeddingMatrix)

        trainX,trainY,trainSeqlens,trainHash,trainEmot,trainLoc = extractDataFromDict(trainDict)
        testX,testY,testSeqlens,testHash,testEmot,testLoc = extractDataFromDict(testDict)

        tf.reset_default_graph() 

        _inputs = tf.placeholder(tf.int32,shape=[None,numFeatures],name="featurePlaceholder")
        embedding_placeholder = tf.placeholder(tf.float32, [vocabSize,vecDim],name="embeddPlaceholder")

        # two label columns per outcome
        _labels = tf.placeholder(tf.float32,shape=[None,numOutcomes*2],name="labelPlaceholder")
        _seqlens = tf.placeholder(tf.int32,shape=[None],name="sequenceLengthPlaceholder")

        # setup hashtag, emoticon, and regional indicators
        
        _hash_ind = tf.placeholder(tf.float32,shape=[None,numFeatures,1],name="hashtagPlaceholder")
        _emot_ind = tf.placeholder(tf.float32,shape=[None,numFeatures,1],name="emotPlaceholder")
        _loc_ind = tf.placeholder(tf.float32,shape=[None,numFeatures,1],name="locPlaceholder")

        # the embedding matrix is frozen (trainable=False) and loaded through a placeholder
        embeddings = tf.Variable(tf.constant(0.0,shape=[vocabSize,vecDim]), trainable=False)
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings,_inputs)

        # NOTE(review): _hash_ind is not part of this concat, so the hashtag indicator is
        # fed to the session but never reaches the network. That matches the "hashtags
        # removed" sensitivity configuration; confirm whether the full model should use
        # [embed,_hash_ind,_emot_ind,_loc_ind] instead.
        embed2 = tf.concat(values=[embed,_loc_ind,_emot_ind],axis=2)
        
        # for sensitivity analyses, investigating the impact of removing hash_ind, 
        # emot_ind, or loc_ind from the model
        
        #embed2 = tf.concat(values=[embed,_emot_ind,_loc_ind],axis=2)
        #embed2 = tf.concat(values=[embed,_hash_ind,_emot_ind],axis=2)
        #embed2 = tf.concat(values=[embed,_hash_ind,_loc_ind],axis=2)

        
        #setup LSTM layers (the scope name "biGRU" is kept because it is part of the op names)
        with tf.name_scope("biGRU"):
            
            with tf.variable_scope('forward'):
                gru_fw_cell = tf.contrib.rnn.LSTMCell(numFeatures,use_peepholes=False)
                gru_fw_cell = tf.contrib.rnn.DropoutWrapper(gru_fw_cell,output_keep_prob=keepRateLSTM)

            with tf.variable_scope('backward'):
                gru_bw_cell = tf.contrib.rnn.LSTMCell(numFeatures,use_peepholes=False)
                gru_bw_cell = tf.contrib.rnn.DropoutWrapper(gru_bw_cell,output_keep_prob=keepRateLSTM)

            (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn(
                cell_fw=gru_fw_cell,
                cell_bw=gru_bw_cell,
                inputs = embed2,
                sequence_length = _seqlens,
                dtype=tf.float32)

            # only the final hidden state of each direction is passed on
            hidden_input = tf.concat(values=[output_state_fw.h,output_state_bw.h],axis=1)

            # NOTE(review): tf.layers.dropout defaults to training=False, in which case it
            # returns its input unchanged; the same holds for the other tf.layers.dropout
            # calls below. Confirm whether dropout was meant to be active during training.
            fullLayerDropped = tf.layers.dropout(hidden_input,1-keepRateLSTM)
            
            # add weights for L2 regularization
            LSTM_weights = tf.Variable(tf.truncated_normal([numFeatures*2,numFeatures*2]))
            LSTM_bias = tf.Variable(tf.zeros([numFeatures*2]))
            LSTM_output = tf.matmul(fullLayerDropped,LSTM_weights) + LSTM_bias
            hiddenLayers = []
            hiddenWeights = []
            hiddenLayers.append(LSTM_output)
            hiddenWeights.append(LSTM_weights)

            
            # setup postLSTM layers; each one takes the most recent layer as its input
            for i in range(numPostLSTMLayers):
                tempLayer = tf.layers.dense(
                    hiddenLayers[-1],
                    hiddenLayerSize,
                    activation=tf.nn.tanh,
                    name = "hidden" + str(i)
                    )
                
                fullLayerDropped = tf.layers.dropout(tempLayer,1- keepRate)
                
                # weights for L2 regularization
                tempWeights = tf.Variable(tf.truncated_normal([hiddenLayerSize,hiddenLayerSize]))
                tempBias = tf.Variable(tf.zeros([hiddenLayerSize]))
                outLayer = tf.matmul(fullLayerDropped,tempWeights) + tempBias
                hiddenLayers.append(outLayer)
                hiddenWeights.append(tempWeights)
                
            
            # create a fully connected layer before the softmax layer
            tempLayer = tf.layers.dense(
                hiddenLayers[-1],
                preSoftmaxLayerSize,
                activation=tf.nn.relu,
                name='preSoftmax'
            )
            
            fullLayerDropped = tf.layers.dropout(tempLayer,rate = 1- keepRate)
            final_output = fullLayerDropped
            
            #greenspace is in index 1. Set to 1 for sensitivity analysis of removing greenspace from the set
            #of outcomes
            startIndex = 0
            
            
            # stack the 2-unit slice of every outcome along the batch axis so that one
            # softmax call scores all outcomes at once
            concatenatedOutput = tf.identity(final_output[:,startIndex*2:(startIndex+1)*2])
            concatenatedLabels = tf.identity(_labels[:,startIndex*2:(startIndex+1)*2])
            
            # loop over numOutcomes rather than a literal 7 so the slices always stay
            # inside the numOutcomes*2 label columns
            for i in range(startIndex+1,numOutcomes):
                concatenatedOutput = tf.concat(
                    [
                        concatenatedOutput,
                        tf.identity(final_output[:,(i*2):(i+1)*2])
                    ],
                    0
                )
                concatenatedLabels = tf.concat(
                    [
                        concatenatedLabels,
                        tf.identity(_labels[:,(i*2):(i+1)*2])
                    ],
                    0
                )

            softmax = tf.nn.softmax_cross_entropy_with_logits_v2(
                logits=concatenatedOutput,
                labels = concatenatedLabels
            )

            # define cost function including L2 regularization
            
            regularization = 0
            for layerWeights in hiddenWeights:
                regularization = regularization + tf.nn.l2_loss(layerWeights)

            # cost is the plain cross entropy used for reporting and model selection;
            # cost2 adds the L2 penalty and is what the optimizer minimizes
            cost = tf.reduce_mean(softmax)
            cost2 = tf.reduce_mean(cost + l2Reg *regularization)
            optimizer = tf.train.AdamOptimizer(learningRate).minimize(cost2)

            # generate model predictions for all labels
            prediction = tf.argmax(concatenatedOutput,1)

            # identify correct predictions and calculate accuracy
            # NOTE(review): this reshape assumes exactly batchSize records per run, and
            # accuracyVector is never evaluated in the session below.
            correct_prediction = tf.reshape(tf.equal(tf.argmax(concatenatedLabels,1),
                                                     prediction),[numOutcomes-startIndex,batchSize])
            accuracyVector = tf.reduce_mean(tf.cast(correct_prediction,tf.float16)*100,1)   

            
            # setup tf objects for saving model metadata and best model weights
            model_io_params = [_inputs,_labels,_seqlens,_hash_ind,_emot_ind,_loc_ind,prediction,cost]
            for save_param in model_io_params:
                tf.add_to_collection('model_io',save_param)
            model_saver = tf.train.Saver(max_to_keep = 5)
            model_saver.export_meta_graph(performFolder + "model_io.meta",
                                          collection_list = ['model_io'])


        # initialize and run training session
        with tf.Session() as sess:

            sess.run(tf.global_variables_initializer())
            sess.run(embedding_init, feed_dict= {embedding_placeholder:embeddingMatrix})

            # as training progresses, only save model weights which improve on dev cost.
            # cross entropy has no upper bound, so start from infinity to make sure the
            # first evaluation always counts as an improvement
            bestDevCost = float("inf")

                        
            for step in range(numEpochs):
                x_batch, y_batch, seqlen_batch, hashtag_batch, emot_batch,loc_batch,indexNums = getSentenceBatch(
                    batchSize,
                    trainX,
                    trainY,
                    trainSeqlens,
                    trainHash,
                    trainEmot,
                    trainLoc,
                    word2IndexMap,
                    numOutcomes
                )

                _, c = sess.run([optimizer, cost],feed_dict={
                    _inputs:x_batch,
                    _labels:y_batch,
                    _seqlens:seqlen_batch,
                    _hash_ind:hashtag_batch,
                    _emot_ind:emot_batch,
                    _loc_ind:loc_batch
                }
                               )

                # evaluate model performance on the held-out set every 500 steps.  If
                # performance is new best, then save model weights

                if step  % 500 == 0 and step > 0:
                    x_test,y_test,seqlen_test,hashtag_test,emot_test,loc_test,indexNums = getSentenceBatch(
                        5000,
                        testX,
                        testY,
                        testSeqlens,
                        testHash,
                        testEmot,
                        testLoc,
                        word2IndexMap,
                        numOutcomes
                    )    
                    # reuse the existing prediction op; building a new tf.argmax here
                    # would add another node to the graph at every evaluation
                    batch_pred,c = sess.run(
                        [prediction,cost],
                        feed_dict={_inputs:x_test,
                                   _labels:y_test,
                                   _seqlens:seqlen_test,
                                   _hash_ind:hashtag_test,
                                   _emot_ind:emot_test,
                                   _loc_ind:loc_test
                                  }
                    )
                    
                    if(c < bestDevCost):
                        bestDevCost = c
                        print("new best model: %f" % (c))
                        model_saver.save(sess,performFolder + "model",global_step=step)
                  
                    print("dev at step %i: %s " % (step,str(c)))

In [13]:
# main function
def main():
    """Load the pickled datasets and train the model, evaluating it on the dev set."""
    (trainRecords, devRecords, _testRecords,
     _nycRecords, wordVectors, wordIndexLookup) = loadDatasets(datasetPickleParams)

    # the length of the first hashtag indicator vector is used as the per-record input width
    inputWidth = len(trainRecords['hash'][0])
    createModel(inputWidth,wordVectors,trainRecords,devRecords,wordIndexLookup)
main()

new best model: %f 0.31942517
dev at step 500: 0.31942517 
new best model: %f 0.25873736
dev at step 1000: 0.25873736 
new best model: %f 0.23352158
dev at step 1500: 0.23352158 
new best model: %f 0.21689768
dev at step 2000: 0.21689768 
new best model: %f 0.20683508
dev at step 2500: 0.20683508 
new best model: %f 0.20473154
dev at step 3000: 0.20473154 
new best model: %f 0.1963965
dev at step 3500: 0.1963965 
dev at step 4000: 0.19952711 
new best model: %f 0.19196838
dev at step 4500: 0.19196838 
new best model: %f 0.19022456
dev at step 5000: 0.19022456 
dev at step 5500: 0.19192366 
new best model: %f 0.18621777
dev at step 6000: 0.18621777 
dev at step 6500: 0.18914944 
dev at step 7000: 0.1871446 
new best model: %f 0.18375832
dev at step 7500: 0.18375832 
new best model: %f 0.1824224
dev at step 8000: 0.1824224 
new best model: %f 0.17798187
dev at step 8500: 0.17798187 
new best model: %f 0.17243452
dev at step 9000: 0.17243452 
new best model: %f 0.16979073
dev at step 9500

KeyboardInterrupt: 