# Training the Multiple BiLSTM Model #

**Author** Andrew Larkin <br>
**Affiliation** Oregon State University <br>
**Date Created** October 22, 2018 <br>

**Summary**
Create multivariate LSTM models for classifying urban nature tweets.  This script includes functions to tune hyperpameters and visualize model performance via graphs <br>


### Setup ###

Import libraries, set global constants and define filepaths


In [1]:
#import libraries
import numpy as np
#import tensorflow as tf
import re, string
import pickle
import os

In [2]:
# define input and output filepaths
parentFolder = "C:/Users/larkinan/Desktop/DBTraining/"
dataset = parentFolder + "preprocessingOutput/"
performFolder = parentFolder + "modelTrainingPerformance/"


In [3]:
# hyperparameters and constants for the deep learning model.  
modelParams = {
    'word_vec_dim':300, # dimension of each word vector
    'mini_batch_size':128,#[64,128,256],
    'learning_rate':0.0001,#[0.0001,0.0005,0.001,0.005,0.01,0.05,0.1],
    'momentum':0.9,
    'num_outcomes':7,#[1,2,3,4,5,6,7], # whether testing for just 1 or multiple outcomes,
    'postLSTM_layer_size':128,#[7,16,32,64,128],
    'postLSTM_layers':2,#[1,2,3],
    'pre_softmax_layer_size':28,
    'num_epochs':120000,#[1000,5000,10000,25000,50000,100000,250000],
    'num_dev':5000,
    'num_test':5000,
    'keep_prob':0.8,#[0.1,0.2,0.3,0.4,0.5],
    'hidden_layer_activation':'relu',#['tanh','relu','leaky_relu'],
    'batch_num':0,
}


# pickled datasets to load
datasetPickleParams = { # where to store datasets for model training on hard disk
    "trainDictPicklePath":dataset + "trainDict.p",
    "devDictPicklePath":dataset + "devDict.p",
    "testDictPicklePath":dataset + "testDict.p",
    "allDictPicklePath":dataset + "allDict.p",
    "embeddingMatrixPicklePath":dataset + "embeddingMatrix.p",
    "word2IndexPicklePath":dataset + "word2Index.p"
}


performancePickleParams = {
    "parentFolder":performFolder,
    "learningRate":performFolder + "learningRate",
    "batchSizeFolder":performFolder + "batchSize",
    "postLSTMLayerSize":performFolder + "postLSTMLayerSize",
    "postLSTMLayers":performFolder + "postLSTMLayers",
    "keepProb":performFolder + "keepProb",
    "hiddenLayerActivation":performFolder + "activationType"
}

In [4]:
# load pickled preprocessed data
def loadPreprocessedDatasets(pickleParams,loadTestData = False,loadAllData=False):
    testDict, allDataDict = (None for i in range(2))
    trainDict = pickle.load(open(pickleParams['trainDictPicklePath'],'rb'))
    devDict = pickle.load(open(pickleParams['devDictPicklePath'],'rb'))
    embeddingMatrix = pickle.load(open(pickleParams['embeddingMatrixPicklePath'],'rb'))
    word2IndexMap = pickle.load(open(pickleParams['word2IndexPicklePath'],'rb'))
    if(loadTestData):
        testDict = pickle.load(open(pickleParams['testDictPicklePath'],'rb'))
    if(loadAllData):
        allDataDict = pickle.load(open(pickleParams['allDictPicklePath'],'rb'))
    return(trainDict,devDict,embeddingMatrix,word2IndexMap,testDict,allDataDict)

In [5]:
trainDict, devDict, embeddingMatrix, word2IndexMap,testDict,allDict = loadDatasets(datasetPickleParams)
modelParams['num_features'] = len(trainDict['x'][0].split(" "))

NameError: name 'loadDatasets' is not defined

In [6]:
def extractDataFromDict(inputDict):
    return(inputDict['x'], inputDict['y'], inputDict['seqlens'],
           inputDict['hash'], inputDict['emot'])

In [7]:
def get_sentence_batch(
    batch_size,
    data_x,
    data_y,
    data_seqlens,
    hash_data,
    emot_data,
    word2index_map,numOutcomes,outcomeIndex):
    instance_indices = list(range(len(data_x)))
    np.random.shuffle(instance_indices)
    batch = instance_indices[:batch_size]
    x = [[word2index_map[word] for word in data_x[i].split()]
        for i in batch]
    y = np.asarray([data_y[i][outcomeIndex*2:outcomeIndex+(numOutcomes*2)] for i in batch]).reshape((batch_size, numOutcomes*2))
    seqlens = [data_seqlens[i] for i in batch]
    try:
        hashVals = np.asarray([hash_data[i] for i in batch],dtype=np.float32).reshape((batch_size,len(hash_data[0]),1))
    except Exception as e:
        print("hashtag failed" + str(e)) 
    try:
        emotVals = np.asarray([emot_data[i] for i in batch],dtype=np.float32).reshape((batch_size,len(emot_data[0]),1))
    except Exception as e:
        print("couldn't get sentence batch: " + str(e))
    return x,y,seqlens,hashVals,emotVals,batch

In [8]:
def calcF1Score(prediction,labels,batchSize):
    #labels = np.array(labels)
    numOutcomes = int(len(labels[0])/2)
    precisionArray = np.zeros((numOutcomes,2))
    recallArray = np.zeros((numOutcomes,2))
    F1ScoreArray = np.zeros((numOutcomes,4))
    for i in range(numOutcomes):
        subsetLabels = labels[:,i*2]
        predictionSubset = prediction[batchSize*(i):batchSize*(i+1)]
        positivePredictionLabels = subsetLabels[np.where( predictionSubset == 0 )]
        if(len(positivePredictionLabels)>0):
            numerator = np.sum(np.equal(positivePredictionLabels,1))
            if(numerator > 0):
                precisionArray[i,0] = (numerator)/(len(positivePredictionLabels))*100
            precisionArray[i,1] = len(positivePredictionLabels)
        positiveLabelPredictions = predictionSubset[np.where(subsetLabels == 1)]
        if(len(positiveLabelPredictions) > 0):
            numerator = np.sum(np.equal(positiveLabelPredictions,0))
            if(numerator > 0):
                recallArray[i,0] = (numerator)/(len(positiveLabelPredictions))*100.0
            recallArray[i,1] = len(positiveLabelPredictions)
        if(precisionArray[i,1] > 0 and recallArray[i,1] >0):
            F1ScoreArray[i,0] = 2*(precisionArray[i,0]*recallArray[i,0])/(precisionArray[i,0] + recallArray[i,0])
            F1ScoreArray[i,1] = 1
            F1ScoreArray[i,2] = precisionArray[i,0]
            F1ScoreArray[i,3] = recallArray[i,0]
    return(F1ScoreArray)

In [5]:
def readPickledPerformanceDatasets(inFolder):
    filesToRead = os.listdir(inFolder)
    dictArray = []
    for filename in filesToRead:
        performDict = pickle.load(open(inFolder + "/" + filename,'rb'))
        dictArray.append(performDict)
    return(dictArray)

{'trainDict': {'F1Score': [array([[       nan, 0.71356784, 0.5546875 , 1.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ]]), array([[0.56536805, 0.80272109, 0.7195122 , 0.90769231],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ]]), array([[0.46549138, 0.75714286, 0.73611111, 0.77941176],
       [       nan, 0.        , 0.        , 0.        ],
       [       nan, 0.        , 0.        , 0.        ],


In [70]:
##### learningRateCandidates = [0.0001,0.0005]
modelParams['num_features'] = len(trainDict['hash'][0])
for candidateVal in learningRateCandidates:
    modelParams['learning_rate'] = candidateVal
    for batchNum in range(0,100):
        resultsFilepath = "%s%s%s%s%s%s" % (
            performancePickleParams['learningRate'], 
            "/v", 
            str(modelParams['learning_rate']).replace(".","p"), 
            "b", 
            str(batchNum), ---
            +-".p"
        )
        if not os.path.exists(resultsFilepath):
            performancePickleParams['paramToTunePath'] = resultsFilepath
            modelParams['batchNum'] = batchNum
            create_model(modelParams,embeddingMatrix,trainDict,devDict,word2IndexMap,performancePickleParams)
        else:
            print("%s already exists" % resultsFilepath)

C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b0.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b1.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b2.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b3.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b4.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b5.p already exists
C:/Users/larkinan/Desktop/DBTraining/modelTrainingPerformance/learningRate/v0p0001b6.p already exists
dev at step 2500: 0.1652579 
dev at step 5000: 0.1716033 
dev at step 7500: 0.1790022 




dev at step 10000: 0.16740176 
dev at step 12500: 0.15298422 
dev at step 15000: 0.15061896 
dev at step 17500: 0.12736495 
dev at step 20000: 0.13077444 
dev at step 22500: 0.1699023 
dev at step 25000: 0.121938825 
dev at step 27500: 0.1398831 
dev at step 30000: 0.13760196 
dev at step 32500: 0.13157256 
dev at step 35000: 0.13881102 
dev at step 37500: 0.16350578 
dev at step 40000: 0.15978968 
dev at step 42500: 0.16061893 
dev at step 45000: 0.14520316 
dev at step 47500: 0.12303587 
dev at step 50000: 0.11053735 
dev at step 52500: 0.14895359 
dev at step 55000: 0.14888212 
dev at step 57500: 0.13021477 
dev at step 60000: 0.118029356 
dev at step 62500: 0.20506215 
dev at step 65000: 0.15458143 
dev at step 67500: 0.14764442 
dev at step 70000: 0.14939848 
dev at step 72500: 0.19881943 
dev at step 75000: 0.16870901 
dev at step 77500: 0.18639113 
dev at step 80000: 0.15859458 
dev at step 82500: 0.23890693 
dev at step 85000: 0.2081624 
dev at step 87500: 0.20635532 
dev at st

dev at step 85000: 0.19107449 
dev at step 87500: 0.23108566 
dev at step 90000: 0.20971492 
dev at step 92500: 0.2556128 
dev at step 95000: 0.18451013 
dev at step 97500: 0.2541197 
dev at step 100000: 0.28781155 
dev at step 102500: 0.37580556 
dev at step 105000: 0.43182784 
dev at step 107500: 0.43690977 
dev at step 110000: 0.44084102 
dev at step 112500: 0.3218891 
dev at step 115000: 0.32613286 
dev at step 117500: 0.39484286 
dev at step 2500: 0.17572384 
dev at step 5000: 0.18789247 
dev at step 7500: 0.18532972 
dev at step 10000: 0.17416203 
dev at step 12500: 0.14189284 
dev at step 15000: 0.14667563 
dev at step 17500: 0.15888074 
dev at step 20000: 0.1798682 
dev at step 22500: 0.1320885 
dev at step 25000: 0.117047295 
dev at step 27500: 0.13280429 
dev at step 30000: 0.16503343 
dev at step 32500: 0.16635057 
dev at step 35000: 0.14061077 
dev at step 37500: 0.14557144 
dev at step 40000: 0.14574048 
dev at step 42500: 0.12160109 
dev at step 45000: 0.14338253 
dev at 

dev at step 42500: 0.15173712 
dev at step 45000: 0.09573268 
dev at step 47500: 0.19366825 
dev at step 50000: 0.154761 
dev at step 52500: 0.12467147 
dev at step 55000: 0.16153172 
dev at step 57500: 0.13867433 
dev at step 60000: 0.21954693 
dev at step 62500: 0.19154982 
dev at step 65000: 0.21733105 
dev at step 67500: 0.23085587 
dev at step 70000: 0.25722376 
dev at step 72500: 0.19253182 
dev at step 75000: 0.22324564 
dev at step 77500: 0.21117353 
dev at step 80000: 0.18115644 
dev at step 82500: 0.18483543 
dev at step 85000: 0.30326533 
dev at step 87500: 0.26404986 
dev at step 90000: 0.2933403 
dev at step 92500: 0.28865045 
dev at step 95000: 0.24671066 
dev at step 97500: 0.461253 
dev at step 100000: 0.3924623 
dev at step 102500: 0.47465453 
dev at step 105000: 0.39891848 
dev at step 107500: 0.5428129 
dev at step 110000: 0.5895712 
dev at step 112500: 0.40770862 
dev at step 115000: 0.7388182 
dev at step 117500: 0.44691667 
dev at step 2500: 0.20940952 
dev at ste

dev at step 117500: 0.49325994 
dev at step 2500: 0.17452453 
dev at step 5000: 0.16057687 
dev at step 7500: 0.14762546 
dev at step 10000: 0.16082397 
dev at step 12500: 0.12979594 
dev at step 15000: 0.14684403 
dev at step 17500: 0.1523089 
dev at step 20000: 0.09862499 
dev at step 22500: 0.15015788 
dev at step 25000: 0.11990834 
dev at step 27500: 0.12219082 
dev at step 30000: 0.16048697 
dev at step 32500: 0.13639235 
dev at step 35000: 0.15662667 
dev at step 37500: 0.12610683 
dev at step 40000: 0.11474292 
dev at step 42500: 0.13809289 
dev at step 45000: 0.1304717 
dev at step 47500: 0.1483592 
dev at step 50000: 0.15020859 
dev at step 52500: 0.14444877 
dev at step 55000: 0.14807059 
dev at step 57500: 0.14092116 
dev at step 60000: 0.18595223 
dev at step 62500: 0.1789453 
dev at step 65000: 0.19092853 
dev at step 67500: 0.19839846 
dev at step 70000: 0.18804769 
dev at step 72500: 0.28712896 
dev at step 75000: 0.21676469 
dev at step 77500: 0.21489926 
dev at step 80

KeyboardInterrupt: 

In [None]:
def calcAvg

In [13]:
def calcPerformanceMetrics(metricArrays,batch_pred,y_batch,batchSize,step,c):
    metricArrays[0].append(calcF1Score(batch_pred,y_batch,batchSize))
    metricArrays[1].append(batch_pred)
    metricArrays[2].append(y_batch)
    metricArrays[3].append(step)
    metricArrays[4].append(c)

In [67]:
def writePeformanceMetrics(trainArrays,devArrays,modelParams,pickleParams):
    trainDict = {
        'F1Score':trainArrays[0],
        'Predict':trainArrays[1],
        'Labels':trainArrays[2],
        'EpochNum':trainArrays[3],
        'Cost':trainArrays[4]
    }
            
    devDict = {
        'F1Score':devArrays[0],
        'Predict':devArrays[1],
        'Labels':devArrays[2],
        'EpochNum':devArrays[3],
        'Cost':devArrays[4]
    }
            
    batchDict = {
        'trainDict':trainDict,
        'devDict':devDict,
        'modelParams':modelParams,
    }
                    
    pickle.dump(batchDict,open(performancePickleParams['paramToTunePath'],"wb" )
    )

In [63]:
def create_model(modelParams,embeddingMatrix,trainDict,testDict,word2IndexMap,pickleParams):
    
    with tf.device('/device:GPU:0'):
        trainX,trainY,trainSeqlens,trainHash,trainEmot = extractDataFromDict(trainDict)
        testX,testY,testSeqlens,testHash,testEmot = extractDataFromDict(testDict)
    
        numFeatures = modelParams['num_features']
        vocabSize = len(embeddingMatrix)
        vecDim = modelParams['word_vec_dim']
        batchSize = modelParams['mini_batch_size']
        numOutcomes = modelParams['num_outcomes']
        numPostLSTMLayers = modelParams['postLSTM_layers']
        hiddenLayerActivation = modelParams['hidden_layer_activation']
        hiddenLayerSize = modelParams['postLSTM_layer_size']
        learningRate = modelParams['learning_rate']
        preSoftmaxLayerSize = modelParams['pre_softmax_layer_size']
        keepRate = modelParams['keep_prob']
        batchNum = modelParams['batchNum']
        numEpochs = modelParams['num_epochs']
    
        tf.reset_default_graph() 
        _inputs = tf.placeholder(tf.int32,shape=[batchSize,numFeatures],name="featurePlaceholder")
        embedding_placeholder = tf.placeholder(tf.float32, [vocabSize,vecDim],name="embeddPlaceholder")
    
        _labels = tf.placeholder(tf.float32,shape=[batchSize,numOutcomes*2],name="labelPlaceholder")
        _seqlens = tf.placeholder(tf.int32,shape=[batchSize],name="sequenceLengthPlaceholder")
                              
        _hash_ind = tf.placeholder(tf.float32,shape=[batchSize,numFeatures,1],name="hashtagPlaceholder")
        _emot_ind = tf.placeholder(tf.float32,shape=[batchSize,numFeatures,1],name="emotPlaceholder")
   
        embeddings = tf.Variable(tf.constant(0.0,shape=[vocabSize,vecDim]), trainable=False)
        embedding_init = embeddings.assign(embedding_placeholder)
        embed = tf.nn.embedding_lookup(embeddings,_inputs)
        embed2 = tf.concat(values=[embed,_hash_ind,_emot_ind],axis=2)
    
        with tf.name_scope("biGRU"):
            with tf.variable_scope('forward'):
                gru_fw_cell = tf.contrib.rnn.LSTMCell(numFeatures,use_peepholes=False)
                gru_fw_cell = tf.contrib.rnn.DropoutWrapper(gru_fw_cell,output_keep_prob=keepRate)
          
            with tf.variable_scope('backward'):
                gru_bw_cell = tf.contrib.rnn.LSTMCell(numFeatures,use_peepholes=False)
                gru_bw_cell = tf.contrib.rnn.DropoutWrapper(gru_bw_cell,output_keep_prob=keepRate)
          
            (output_fw, output_bw), (output_state_fw, output_state_bw) = tf.nn.bidirectional_dynamic_rnn(cell_fw=gru_fw_cell,
                                                              cell_bw=gru_bw_cell,
                                                              inputs = embed2,
                                                              sequence_length = _seqlens,
                                                              dtype=tf.float32,
                                                              scope="biga")
        
            hidden_input = tf.concat(values=[output_state_fw.h,output_state_bw.h],axis=1)
            hiddenLayers = []
            hiddenLayers.append(hidden_input)
            for i in range(numPostLSTMLayers):
                if(hiddenLayerActivation == 'tanh'):
                    tempLayer = tf.layers.dense(
                        hiddenLayers[len(hiddenLayers)-1],hiddenLayerSize,activation=tf.nn.tanh,name = "hidden" + str(i))
                elif(hiddenLayerActivation == 'relu'):
                      tempLayer = tf.layers.dense(
                        hiddenLayers[len(hiddenLayers)-1],hiddenLayerSize,activation=tf.nn.relu,name = "hidden" + str(i))
                fullLayerDropped = tf.layers.dropout(tempLayer,1- keepRate)
                hiddenLayers.append(fullLayerDropped)
                
            tempLayer = tf.layers.dense(
                hiddenLayers[len(hiddenLayers)-1],preSoftmaxLayerSize,activation=tf.nn.relu,name='preSoftmax')
            fullLayerDropped = tf.layers.dropout(tempLayer,rate = 1- keepRate)
            
            hiddenLayers.append(fullLayerDropped)
        
            weights = {'linear_layer':tf.Variable(tf.truncated_normal([preSoftmaxLayerSize,numOutcomes*2],mean=0,stddev=0.01))}
            biases = {'linear_layer':tf.Variable(tf.truncated_normal([numOutcomes*2],mean=0,stddev=0.01))}
        
            final_output = tf.matmul(
                hiddenLayers[len(hiddenLayers)-1],weights["linear_layer"]+ biases["linear_layer"])
            
            cost = 0
            #softmaxArray = {}
            concatenatedOutput = tf.identity(final_output[:,0:2])
            concatenatedLabels = tf.identity(_labels[:,0:2])
            for i in range(1,numOutcomes):
                concatenatedOutput = tf.concat(
                    [concatenatedOutput,tf.identity(final_output[:,(i*2):(i+1)*2])],
                    0
                )
                concatenatedLabels = tf.concat(
                    [concatenatedLabels,tf.identity(_labels[:,(i*2):(i+1)*2])],
                    0
                )
            softmax = tf.nn.softmax_cross_entropy_with_logits_v2(
                    logits=concatenatedOutput,labels = concatenatedLabels)
                
                
            cost = tf.reduce_mean(softmax)
            optimizer = tf.train.AdamOptimizer(learningRate).minimize(cost)

      
            correct_prediction = tf.reshape(tf.equal(tf.argmax(concatenatedLabels,1),
                                      tf.argmax(concatenatedOutput,1)),[numOutcomes,batchSize])
        
            accuracyVector = tf.reduce_mean(tf.cast(correct_prediction,tf.float16)*100,1)
            
        
        
        
        with tf.Session() as sess:
                     
         
            sess.run(tf.global_variables_initializer())
            sess.run(embedding_init, feed_dict= {embedding_placeholder:embeddingMatrix})
            
            trainArrays = [[],[],[],[],[]]
            devArrays = [[],[],[],[],[]]
        
            for step in range(numEpochs):
                x_batch, y_batch, seqlen_batch, hashtag_batch, emot_batch,indexNums = get_sentence_batch(
                    batchSize,trainX,trainY,trainSeqlens, trainHash,trainEmot,word2IndexMap,numOutcomes,0)
            
            
                _, c = sess.run([optimizer, cost],feed_dict={
                    _inputs:x_batch,_labels:y_batch,_seqlens:seqlen_batch,
                    _hash_ind:hashtag_batch,_emot_ind:emot_batch})
            
            
                if(step % 500 == 0 and step > 0):
                    batch_pred, c = sess.run(
                        [tf.argmax(concatenatedOutput,1),cost],
                        feed_dict={_inputs:x_batch,_labels:y_batch,_seqlens:seqlen_batch,
                                                      _hash_ind:hashtag_batch,_emot_ind:emot_batch}
                    )
                    
                    
                    calcPerformanceMetrics(trainArrays,batch_pred,y_batch,batchSize,step,c)
                   
                    
                    
                if step  % 500 == 0 and step > 0:
                    batch_total = []
                    y_test_total = []
                    x_test,y_test,seqlen_test,hashtag_test,emot_test,indexNums = get_sentence_batch(
                        batchSize,testX,testY,testSeqlens,testHash,testEmot,word2IndexMap,numOutcomes,0)    
                    batch_pred,c = sess.run([tf.argmax(concatenatedOutput,1),cost],
                                                    feed_dict={_inputs:x_test,
                                                            _labels:y_test,
                                                            _seqlens:seqlen_test,_hash_ind:hashtag_test,
                                                            _emot_ind:emot_test})
                    calcPerformanceMetrics(devArrays,batch_pred,y_test,batchSize,step,c)
                    
                if(step% 2500 == 0 and step > 0):
                        print("dev at step %i: %s " % (step,str(c)))
                    
                    
                
                    
                    
            writePeformanceMetrics(trainArrays,devArrays,modelParams,pickleParams)
                    
            
         
           
            
            
                    

In [None]:
create_model(64,10,57235,300,embedding_matrix,trainDict,testDict,word2Index)
#print(len(trainDict['train_x']))
#create_model(5,10,vocabulary_size,vec_dim,embedding_matrix,train_x,train_y,train_seqlens)