# Calculate and graph performance summary statistics #

** Author: Andrew Larkin **, Oregon State University College of Public Health and Human Sciences <br>
** Date created: ** January 5th, 2018

### Summary ###
For evaluating performance of candidate models in train, dev, test, and independent datasets.  Calculate confusion matrices of model-dataset combinations.  Graph precision and recall for each model-dataset combination and outcome.

This script is divided into two parts: <br>
1) Calculate performance metrics and output to CSV <br>
2) Graph performance metrics and save to .eps file (or print to screen)

### Import libraries and define global variables and constants ###

In [None]:
import copy, math, os, pickle, re, string
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import numpy as np
import pandas as ps
import tensorflow as tf

In [None]:
# define input and output filepaths
parentFolder = "C:/Users/larkinan/Desktop/DBTraining/"
performFolder = parentFolder + "modelTrainingPerformance/"
datasetFolder = performFolder + "preprocessingOutput/"
# intermediate file containing precision and recall estimates to graph
performCSV_Denorm = performFolder + "ModelPrecisionRecall_Jan6_18.csv"

# pickled datasets to load: where the preprocessed datasets for model training
# are stored on hard disk.  Every path lives under datasetFolder.
datasetPickleParams = {
    "trainDictPicklePath": datasetFolder + "trainDict.p",
    "devDictPicklePath": datasetFolder + "devDict.p",
    "testDictPicklePath": datasetFolder + "testDict.p",
    "allDictPicklePath": datasetFolder + "allDict.p",
    "embeddingMatrixPicklePath": datasetFolder + "embeddingMatrix.p",
    "word2IndexPicklePath": datasetFolder + "word2Index.p",
    "NYC_DictPicklePath": datasetFolder + "NYC_Dict.p",
}

## Part 1: Calculate performance metrics and output to CSV ##

In [None]:
# load pickled preprocessed data
def loadDatasets(pickleParams):
    """Load the pickled datasets, embedding matrix and word index map from disk.

    pickleParams (dict) - maps the keys 'trainDictPicklePath', 'devDictPicklePath',
        'testDictPicklePath', 'NYC_DictPicklePath', 'embeddingMatrixPicklePath' and
        'word2IndexPicklePath' to the filepaths of the pickled objects.

    Returns the tuple (trainDict, devDict, testDict, NYC_Dict, embeddingMatrix,
    word2IndexMap).

    NOTE: pickle.load executes arbitrary code from the file; only load pickles
    produced by a trusted preprocessing run.
    """
    def _loadPickle(pathKey):
        # the context manager closes the file handle even if unpickling fails
        with open(pickleParams[pathKey], 'rb') as pickleFile:
            return pickle.load(pickleFile)

    trainDict = _loadPickle('trainDictPicklePath')
    devDict = _loadPickle('devDictPicklePath')
    testDict = _loadPickle('testDictPicklePath')
    NYC_Dict = _loadPickle('NYC_DictPicklePath')
    embeddingMatrix = _loadPickle('embeddingMatrixPicklePath')
    word2IndexMap = _loadPickle('word2IndexPicklePath')
    return(trainDict,devDict,testDict,NYC_Dict,embeddingMatrix,word2IndexMap)

In [None]:
#  randomly sample record indices
def getSampledIndices(dataX, batchSize=None):
    """Return record indices of dataX in random order, truncated to batchSize.

    dataX - sequence of records; only its length is used.
    batchSize (int, optional) - maximum number of indices to return.  When
        omitted, every index is returned (in shuffled order).

    Returns a list of ints.  Uses numpy's global random state, so seed
    np.random beforehand for a reproducible sample.
    """
    instanceIndices = list(range(len(dataX)))
    np.random.shuffle(instanceIndices)
    # slicing with None keeps the whole list
    sampledIndices = instanceIndices[:batchSize]
    return(sampledIndices)

In [None]:
# get sampled text and convert to index values using the map
def getSampledXVals(sampledIndices,dataX,word2IndexMap):
    """Translate the sampled records from words to word2IndexMap index values.

    sampledIndices - indices of the records in dataX to convert
    dataX - text of all records; each record is split on whitespace
    word2IndexMap (dict) - word:index pairs, including an 'UNK' entry

    Returns one list of index values per sampled record, in sampling order.
    """
    def _toIndex(token):
        # when applying to datasets other than the train, dev, and test, some
        # words may not be in the dictionary; those map to the 'UNK' index
        if token not in word2IndexMap:
            return word2IndexMap['UNK']
        return word2IndexMap[token]

    return [
        [_toIndex(token) for token in dataX[recordIndex].split()]
        for recordIndex in sampledIndices
    ]

### get a random sample for a single epoch or evaluation###
**Inputs**: <br>
- **batchSize** (int) - number of records to randomly sample <br>
- **dataX** (string array) - tweet text for all records <br>
- **dataY** (array of 1x7 int arrays) - each 1x7 int array corresponds to 7 labels for one record <br>
- **dataSeqLens** (int array) - number of words in each record
- **dataHash** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from a hashtag <br>
- **dataEmot** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from an emoticon <br>
- **dataLoc** (array of binary numbers) - the nth digit in the ith binary number indicates whether the nth word in the ith dataX record is from a regional location description <br>
- **word2IndexMap** (dict) - dictionary of word:index keys <br>
- **numOutcomes** (int) - number of outcomes in the dataset <br>

**Outputs**: <br>
- **sampledX** (array of int arrays) - word2Index mapped numbers for the words in the sampled tweets <br>
- **sampledY** (array of int arrays) - outcome labels for sampled tweets <br>
- **sampledSeqLens** (int array) - length of the sampled tweets <br>
- **sampledHash** (array of int arrays) - indicator values of which words in the sampled tweets are hashtags <br>
- **sampledEmot** (array of int arrays) - indicator values of which words in the sampled tweets are emoticon descriptions <br>
- **sampledLoc** (array of int arrays) - indicator values of which words in the sampled tweets are regional descriptions that use nature-related vocabulary <br>
- **sampledIndices** (int array) - sampled record indices in the original dataset <br>

In [None]:
def getSentenceBatch(batchSize,dataX,dataY,
                       dataSeqlens,dataHash,dataEmot,dataLoc,
                       word2IndexMap,numOutcomes):
    """Draw a random sample of up to batchSize records and shape it for the model.

    Returns the tuple (sampledX, sampledY, sampledSeqlens, sampledHash,
    sampledEmot, sampledLoc, sampledIndices).  sampledY keeps the first
    numOutcomes*2 label columns of each record; the hashtag / emoticon /
    location indicators are float32 arrays with a trailing axis of size 1.
    """
    # draw the random sample here so the batch size comes from this function's argument
    instanceIndices = list(range(len(dataX)))
    np.random.shuffle(instanceIndices)
    sampledIndices = instanceIndices[:batchSize]

    # fewer than batchSize records are returned when the dataset is smaller than
    # the request, so reshape with the number actually sampled
    numSampled = len(sampledIndices)

    sampledX = getSampledXVals(sampledIndices,dataX,word2IndexMap)
    sampledY = np.asarray([dataY[i][0:numOutcomes*2] for i in sampledIndices]).reshape((numSampled, numOutcomes*2))
    sampledSeqlens = [dataSeqlens[i] for i in sampledIndices]
    sampledHash = np.asarray([dataHash[i] for i in sampledIndices],dtype=np.float32).reshape((numSampled,len(dataHash[0]),1))
    sampledEmot = np.asarray([dataEmot[i] for i in sampledIndices],dtype=np.float32).reshape((numSampled,len(dataEmot[0]),1))
    sampledLoc = np.asarray([dataLoc[i] for i in sampledIndices],dtype=np.float32).reshape((numSampled,len(dataLoc[0]),1))

    return(sampledX,sampledY,sampledSeqlens,sampledHash,sampledEmot,sampledLoc,sampledIndices)

In [None]:
# extract vectors from dataset dictionary.
def extractDataFromDict(inputDict):
    """Return the 'sent', 'labels', 'seqLens', 'hash', 'emot' and 'loc_ind'
    entries of inputDict as a tuple, in that order."""
    fieldOrder = ('sent', 'labels', 'seqLens', 'hash', 'emot', 'loc_ind')
    return tuple(inputDict[field] for field in fieldOrder)

In [None]:
# update performance mat once values are calculated
def fillInPerfMat(performanceMat,MCC,F1Score,Precision,Recall,TP,FP,TN,FN,perfMatIndex=None):
    """Write one outcome's performance metrics into its row of performanceMat.

    performanceMat - matrix with at least 8 columns; modified in place
    perfMatIndex (int) - row (outcome index) to fill.  It has a default only to
        keep the original positional signature; omitting it raises ValueError.

    Column order is MCC, F1Score, Precision, Recall, TP, TN, FP, FN.  Note that
    the arguments arrive as (TP, FP, TN, FN) but are stored as (TP, TN, FP, FN).
    """
    if perfMatIndex is None:
        raise ValueError("perfMatIndex (row of performanceMat to fill) must be provided")
    performanceMat[perfMatIndex,0] = MCC
    performanceMat[perfMatIndex,1] = F1Score
    performanceMat[perfMatIndex,2] = Precision
    performanceMat[perfMatIndex,3] = Recall
    performanceMat[perfMatIndex,4] = TP
    performanceMat[perfMatIndex,5] = TN
    performanceMat[perfMatIndex,6] = FP
    performanceMat[perfMatIndex,7] = FN

In [None]:
def calcMCC(TPperc,TNperc,FPperc,FNperc):
    """Return the Matthews correlation coefficient for one confusion matrix.

    The four arguments are the true/false positive/negative cells, given as
    counts or as percentages (the coefficient does not depend on the scale).
    Returns 0 when the denominator is zero, i.e. when any row or column of the
    confusion matrix is empty.
    """
    MCCNumerator = TPperc*TNperc-FPperc*FNperc
    MCCDenomenator = math.sqrt((TPperc+FPperc)*(TPperc+FNperc)*(TNperc+FPperc)*(TNperc+FNperc))

    # explicit check instead of catching the division error: numpy scalars do
    # not raise on division by zero, they return nan/inf
    if MCCDenomenator == 0:
        return 0
    return MCCNumerator/MCCDenomenator

### calculate statistics for one outcome of a subset of the entire epoch dataset ###
**Inputs**: <br>
- **subsetLabels** (int array) - actual labels (not predictions) for the outcome of interest and subset of interest.  A value of 1 corresponds to a positive label <br>
- **predictionSubset** (int array) - model predictions for the outcome of interest and subset of interest. Prediction array corresponds to the predicted column that the model predicts - column 0 holds the positive label, column 1 holds the negative label.  A value of 0 in predictionSubset therefore corresponds to predicting a positive label <br>
- **performanceMat** (float matrix) - a nx8 matrix, where n corresponds to the number of outcomes and 8 corresponds to the number of performance metrics the matrix holds <br>
- **perfMatIndex** (int) - index of the current outcome (and row of the performanceMat) to calculate performance values for <br>

In [None]:
def calcStatsForSubset(subsetLabels,predictionSubset,performanceMat,perfMatIndex):
    """Compute the confusion matrix and summary metrics for one outcome and
    store them in row perfMatIndex of performanceMat (modified in place).

    subsetLabels (int array) - actual labels; 1 is a positive label
    predictionSubset (int array) - model predictions; 0 is a positive
        prediction and 1 a negative prediction (inverse of the label coding)

    Metrics that cannot be computed (e.g. recall with no true positives) are
    left at 0.
    """
    TP, TN, FP, FN, TPperc,FPperc,TNperc,FNperc, Recall, Precision, F1Score = [0 for x in range(11)]

    numRecords = len(predictionSubset)

    # calc TP and FP: labels of the records the model predicted as positive
    positivePredictionLabels = subsetLabels[np.where( predictionSubset == 0 )]

    if(len(positivePredictionLabels)>0):
        TP = np.sum(np.equal(positivePredictionLabels,1))
        FP = len(positivePredictionLabels) - TP

    # calc TN and FN: labels of the records the model predicted as negative
    negativePredictionLabels = subsetLabels[np.where( predictionSubset == 1 )]

    if(len(negativePredictionLabels) > 0):
        TN = np.sum(np.equal(negativePredictionLabels,0))
        FN = len(negativePredictionLabels) - TN

    # any non-zero count implies numRecords > 0, so the divisions below are safe
    if(TP > 0):
        Recall = (TP*1.0)/((TP+FN)*1.0)
        Precision = (TP*1.0)/((TP+FP)*1.0)
        F1Score = 2*(Precision*Recall)/(Precision+Recall)
        TPperc = float(TP/numRecords)*100
    if(FP>0):
        FPperc = float(FP/numRecords)*100
    if(TN>0):
        TNperc = float(TN/numRecords)*100
    if(FN>0):
        FNperc = float(FN/numRecords)*100

    MCC = calcMCC(TPperc,TNperc,FPperc,FNperc)

    # record the metrics in this outcome's row; column order is
    # MCC, F1Score, Precision, Recall, TP, TN, FP, FN
    performanceMat[perfMatIndex,0:8] = (MCC,F1Score,Precision,Recall,TP,TN,FP,FN)

In [None]:
# convert float matrix to dictionary
def perfMatToDict(inMat):
    """Split the columns of a performance matrix into a metric:column dictionary.

    Columns 0-7 of inMat are returned under the keys 'MCC', 'F1Score',
    'Precision', 'Recall', 'TP', 'TN', 'FP' and 'FN', respectively.
    """
    metricNames = ('MCC', 'F1Score', 'Precision', 'Recall', 'TP', 'TN', 'FP', 'FN')
    return {metric: inMat[:, column] for column, metric in enumerate(metricNames)}

### calculate confusion matrix for an epoch or evaluation run, including subsets for greenspace and non greenspace outcomes ###
**Inputs**: <br>
- **prediction** (int array) - array of prediction values.  Values are flattened into a 1 x (n*m) array, where n is the number of records and m is the number of outcomes <br>
- **labels** (array of int arrays) - labels of each outcome.  Each int array contains nx2 ints, where n is the number of outcomes.  See the data preprocessing section for more details about the label structure <br>
- **batchSize** (int) - number of records <br>
- **numOutcomes** (int) - number of outcomes <br>

**Outputs**: <br>
- **allDict** (dict) - performance dictionary for all records and all outcomes <br>
- **greenDict** (dict) - performance dictionary for records with positive greenspace labels <br>
- **notGreenDict** (dict) - performance dictionary for records with negative greenspace labels <br>

In [None]:
def calcConfMatrix(prediction,labels,batchSize,numOutcomes):
    """Calculate performance metrics per outcome for all records and for the
    records split by the first (greenspace) outcome.

    prediction - flat array holding batchSize predictions per outcome, one
        outcome after another
    labels - 2-D array; the label of outcome k is read from column
        (k + 7 - numOutcomes)*2

    Returns [allDict, greenDict, notGreenDict] (see perfMatToDict).  Row 0 of
    the green / not-green results is left at zero, because the first outcome
    is only used to define the split.
    """
    # Create confusion matrices for all data, data with positive green labels, and data with
    # negative green labels.  8 performance metrics for each outcome
    performanceMatGreen = np.zeros((numOutcomes,8))
    performanceMatNotGreen = np.zeros((numOutcomes,8))
    performanceMatAll = np.zeros((numOutcomes,8))
    positiveGreenIndeces = []
    negativeGreenIndeces = []
    # skip the first (7 - numOutcomes) label pairs when the model has fewer than 7 outcomes
    labelOffset = 7 - numOutcomes

    # calculate statistics for one outcome at a time
    for predictionIndex in range(numOutcomes):
        subsetLabels = labels[:,(predictionIndex+labelOffset)*2]
        predictionSubset = prediction[batchSize*(predictionIndex):batchSize*(predictionIndex+1)]

        if(predictionIndex==0):
            # prediction index 0 corresponds to the first outcome, which is greenspace.
            # NOTE(review): the split below is taken from the model's greenspace
            # predictions (0 = positive), not from the labels - confirm this is intended
            positiveGreenIndeces = np.where(predictionSubset==0)
            negativeGreenIndeces = np.where(predictionSubset==1)
        else:
            positiveGreenSubset = subsetLabels[positiveGreenIndeces]
            negativeGreenSubset = subsetLabels[negativeGreenIndeces]
            positiveGreenPredictions = predictionSubset[positiveGreenIndeces]
            negativeGreenPredictions = predictionSubset[negativeGreenIndeces]

            calcStatsForSubset(positiveGreenSubset,positiveGreenPredictions,performanceMatGreen,predictionIndex)
            calcStatsForSubset(negativeGreenSubset,negativeGreenPredictions,performanceMatNotGreen,predictionIndex)
        calcStatsForSubset(subsetLabels,predictionSubset,performanceMatAll,predictionIndex)

    # convert performance matrices to dictionaries
    allDict = perfMatToDict(performanceMatAll)
    greenDict = perfMatToDict(performanceMatGreen)
    notGreenDict = perfMatToDict(performanceMatNotGreen)

    return([allDict,greenDict,notGreenDict])

### identify records where predictions don't match labels (false positives or false negatives) for all outcomes ###
**Inputs**: <br>
- **text** (string array) - text of each record <br>
- **yTest** (array of int arrays) - labels for each outcome.  NOTE: 1 corresponds to a positive label, while 0 corresponds to a negative label.  This differs from yPredict. One int array corresponds to the outcome labels for one record. <br>
- **yPredict** (array of int arrays) - predictions for each outcome.  NOTE: 1 corresponds to a negative label, while 0 corresponds to a positive label.  This differs from yTest.  One int array corresponds to the outcome labels for one record <br>
- **batchSize** (int) - number of records <br>
- **idNums** (int array) - unique id number of each record from the pickled database loaded from the hard drive <br>
- **falseNum** (int) - number that the yPredict and yTest will both hold if the record is false.  For example, if falseNum is 1 then the function is identifying false negatives, and yTest and yPredict will both contain 1 <br>

**Outputs**: <br>
- **masterDictArray** (array of dicts) - one dictionary for each outcome.  Each dictionary contains the following key:value pairs:

    1) FalseID (int array) - unique id numbers for each mismatch record.  Id numbers correspond to original pickled dataset loaded from secondary storage <br>
    2) FalseText (string array) - text for each mismatch record <br>
    3) Outcome (int) - index of the outcome.  0 for greenspace, 6 for air. See the data preprocessing section for more details <br>

In [None]:
def identifyFalseRecords(text,yTest,yPredict,batchSize,idNums,falseNum):
    """Collect, per outcome, the records whose label and prediction both equal falseNum.

    Labels and predictions use opposite codings (label 1 = positive, prediction
    1 = negative), so equal values mean the prediction disagrees with the label.

    text - text of every record, indexed by the original record id
    yTest - 2-D label array; the label of outcome i is read from column i*2
    yPredict - flat prediction array, batchSize predictions per outcome
    idNums - original record id for each position in the batch

    Returns a list with one dict per outcome, with keys 'FalseID', 'FalseText'
    and 'Outcome'.
    """
    numOutcomes = int(len(yTest[0])/2)
    masterDictArray = []

    for i in range(numOutcomes):
        subsetLabels = yTest[:,i*2]
        predSubset = yPredict[batchSize*(i):batchSize*(i+1)]

        # batch positions where the prediction / the label hold the false num value
        predPositions = np.where(predSubset == falseNum)[0].ravel().tolist()
        # a set keeps each membership test O(1) instead of scanning the array
        labelPositions = set(np.where(subsetLabels == falseNum)[0].tolist())

        FP_ID = []
        FP_X = []

        # a position present in both means there's a mismatch between
        # prediction and label
        for candVal in predPositions:
            if candVal in labelPositions:
                origId = idNums[candVal]
                FP_ID.append(origId)
                FP_X.append(text[origId])

        tempDict = {'FalseID':FP_ID,'FalseText':FP_X,'Outcome':i}
        masterDictArray.append(tempDict)

    return(masterDictArray)

In [None]:
# convert dictionary to pandas dataframe and save to csv file.  outFilepath is an absolute filepath
def saveDictToCSV(inDict,outFilepath):
    """Write inDict to outFilepath as a CSV file, one column per dictionary key."""
    ps.DataFrame.from_dict(inDict).to_csv(outFilepath)

### identify false predictions and save text to csv file ###
**Inputs**:
- **text** (string array) - text of each record <br>
- **yTest** (array of int arrays) - labels for each outcome.  NOTE: 1 corresponds to a positive label, while 0 corresponds to a negative label.  This differs from predVals. One int array corresponds to the outcome labels for one record. <br>
- **predVals** (array of int arrays) - predictions for each outcome.  NOTE: 1 corresponds to a negative label, while 0 corresponds to a positive label.  This differs from yTest.  One int array corresponds to the outcome labels for one record <br>
- **batchSize** (int) - number of records <br>
- **indexNums** (int array) - unique id number of each record from the pickled database loaded from the hard drive <br>
- **iterationNum** (int) - model run number, when multiple batches are needed for large datasets too big to fit into a single model <br>

In [None]:
def getFalsePredictionText(text,yTest,predVals,batchSize,indexNums,iterationNum):
    """Write the text of false negative and false positive records to CSV files.

    One file per outcome and error type is written to performFolder, named
    <outcome><FN|FP><iterationNum>.csv.  Files that already exist are left
    untouched.
    """
    # get the text for the false positive and false negatives
    falseRecordSets = (
        ('FN', identifyFalseRecords(text,yTest,predVals,batchSize,indexNums,1)),
        ('FP', identifyFalseRecords(text,yTest,predVals,batchSize,indexNums,0)),
    )

    outcomes = ['greenspace','safety','beauty','exercise','social','stress','air']
    # for each outcome and dictionary of false positives and negatives, write text and metadata to csv.
    # zip stops at the shorter sequence, so label sets with fewer than seven
    # outcomes no longer index past the end of the result list
    for label, outcomeDicts in falseRecordSets:
        for outcome, outcomeDict in zip(outcomes, outcomeDicts):
            outputFilename = performFolder + "%s%s%i.csv" %(outcome,label,iterationNum)
            if(not os.path.exists(outputFilename)):
                saveDictToCSV(outcomeDict,outputFilename)

### calculate model performance statistics, and get text of misclassified tweets ###
**Inputs**: <br>
- **text** (string array) - text of each record <br>
- **yTest** (array of int arrays) - labels for each outcome.  NOTE: 1 corresponds to a positive label, while 0 corresponds to a negative label.  This differs from predVals. One int array corresponds to the outcome labels for one record. <br>
- **predVals** (array of int arrays) - predictions for each outcome.  NOTE: 1 corresponds to a negative label, while 0 corresponds to a positive label.  This differs from yTest.  One int array corresponds to the outcome labels for one record <br>
- **batchSize** (int) - number of records <br>
- **indexNums** (int array) - unique id number of each record from the pickled database loaded from the hard drive <br>
- **iterationNum** (int) - model run number, when multiple batches are needed for large datasets too big to fit into a single model <br>
- **numOutcomes** (int) - number of outcome classes <br>

In [None]:
# calculate the confusion matrix, MCC, F1Score, Precision, and Recall for all outcomes
def getModelPerfStats(text,predVals,yTest,batchSize,indexNums,iterationNum,outType,numOutcomes):
    """Save performance metrics and the text of misclassified records to CSV.

    outType (string) - tag prepended to the performance CSV filenames (the
        callers pass the dataset name, e.g. "train").
    Note the argument order here is (text, predVals, yTest, ...), while the
    helpers called below take (text, yTest, predVals, ...).
    """
    # calculate confusion matrix for the entire dataset, as well as subset with positive and negative
    # greenspace labels
    [perfDictAll,perfDictGreen,perfDictNotGreen] = (calcConfMatrix(predVals,yTest,batchSize,numOutcomes))

    # NOTE(review): these filenames do not include iterationNum, so each call
    # with the same outType overwrites the files of the previous call
    saveDictToCSV(perfDictAll,performFolder + outType + "performAll.csv")
    saveDictToCSV(perfDictGreen,performFolder + outType + "performGreen.csv")
    saveDictToCSV(perfDictNotGreen,performFolder + outType + "performNotGreen.csv")

    getFalsePredictionText(text,yTest,predVals,batchSize,indexNums,iterationNum)

In [None]:
# load model properties from input tensorflow metadata object
def loadModelProperties(inputModelMeta):
    """Unpack the model's input/output handles from its metadata collection.

    inputModelMeta - indexable collection; positions 0-7 are read as inputs,
        labels, seqlens, hash_ind, emot_ind, loc_ind, prediction and cost.
        (presumably the result of tf.get_collection('model_io') - confirm
        against the training script)

    Returns (inputs, labels, seqlens, hash_ind, loc_ind, prediction, cost).
    Position 4 (emot_ind) is not part of the returned tuple; callers that need
    it must read inputModelMeta[4] themselves.
    """
    inputs = inputModelMeta[0]
    labels = inputModelMeta[1]
    seqlens = inputModelMeta[2]
    hash_ind = inputModelMeta[3]
    loc_ind = inputModelMeta[5]
    prediction = inputModelMeta[6]
    # read cost from the same collection as every other entry
    cost = inputModelMeta[7]

    return(inputs,labels,seqlens,hash_ind,loc_ind,prediction,cost)

### load trained model into memory and classify predictions ###
**Inputs**: <br>
- **inDict** (dict) - dictionary containing input data for model, and labels to test model performance <br>
- **batchSize** (int) - number of records in inDict <br>
- **outType** (string) - tag to add to saved records, indicating what type of data (e.g. train, dev) inDict corresponds to <br>
- **modelNum** (int) - unique id signifying which trained model to load into memory.  This is the original epoch number during the training process <br>
- **numOutcomes** (int) - number of outcomes in the model <br>

In [None]:
def runModel(inDict,batchSize,outType,modelNum,numOutcomes):
    """Restore a trained model, classify the records of inDict and save the
    performance statistics.

    Reads the module-level word2IndexMap and performFolder.  The records are
    evaluated in partitions, and getModelPerfStats is called once per
    partition.
    """
    # script is running on NVIDIA Titan V (12GB memory) and Titan X Pascal video cards.
    # Upper limit for number of records in a single run is approximately 10,000.
    maxRecordsPerRun = 10000

    X,Y,Seqlens,Hash,Emot,Loc = extractDataFromDict(inDict)
    # labels are sampled with all 7 outcome pairs regardless of numOutcomes
    x_test,y_test,seqlen_test,hashtag_test,emot_test,loc_test,indexNums = getSentenceBatch(
        batchSize,X,Y,Seqlens,Hash,Emot,Loc,word2IndexMap,7)
    tf.reset_default_graph()
    with tf.Session() as sess:
        saver = tf.train.import_meta_graph(performFolder + "model_io.meta")
        saver.restore(sess,performFolder + "model-" + str(modelNum))

        model_io = tf.get_collection('model_io')
        inputs, labels, seqlens, hash_ind,loc_ind,prediction,cost = loadModelProperties(model_io)
        # the emoticon indicator is position 4 of the collection and is not
        # part of the tuple returned by loadModelProperties
        emot_ind = model_io[4]

        # partition on the number of records actually sampled, which may be
        # smaller than the requested batchSize
        numModelRuns = math.ceil(len(indexNums)/maxRecordsPerRun)

        for modelRun in range(numModelRuns):
            runSlice = slice(modelRun*maxRecordsPerRun,(modelRun+1)*maxRecordsPerRun)

            x_test_partition = x_test[runSlice]
            y_test_partition = y_test[runSlice]
            seqlen_test_partition = seqlen_test[runSlice]
            hashtag_test_partition = hashtag_test[runSlice]
            emot_test_partition = emot_test[runSlice]
            loc_test_partition = loc_test[runSlice]
            indexNums_partition = indexNums[runSlice]

            pred_vals,c = sess.run([prediction,cost],
                                   feed_dict={inputs:x_test_partition,
                                              labels:y_test_partition,
                                              seqlens:seqlen_test_partition,
                                              hash_ind:hashtag_test_partition,
                                              emot_ind:emot_test_partition,
                                              loc_ind:loc_test_partition
                                              })

            # calculate model performance and write to stable storage
            getModelPerfStats(X,pred_vals,y_test_partition,len(indexNums_partition),indexNums_partition,modelRun,outType,numOutcomes)

### main script in part  1 ###

In [None]:
# load the preprocessed datasets, then evaluate model 21500 (7 outcomes) on each of them
trainDict, devDict, testDict, NYC_Dict, embeddingMatrix, word2IndexMap = loadDatasets(datasetPickleParams)
runModel(trainDict,60000,"train",21500,7)
runModel(devDict,5000,"dev",21500,7)
runModel(testDict,5000,"test",21500,7)
runModel(NYC_Dict,4850,"NYC",21500,7)

## Part 2: Graph Performance Metrics ##

In [None]:
# load performanceMeasuresFromCSV
def loadPerformanceData(dataFilepath,debug=False):
    """Read the performance measures CSV at dataFilepath into a dataframe.

    With debug set, the first rows and the number of records are printed
    before the dataframe is returned.
    """
    performanceData = ps.read_csv(dataFilepath)

    # no processing step yet; future versions may transform the data here

    if(not debug):
        return(performanceData)

    print(performanceData.head())
    firstColumn = performanceData.keys()[0]
    print("number of records: %i" %len(performanceData[firstColumn]))
    return(performanceData)

### For each subplot, setup graph properties including axis boundaries, titles, and reference lines ###
**Inputs:** <br>
- **tempAxis** (object) - matplotlib axis object for the current subplot <br>
- **xDim** (2 element float array) - min and max boundaries for the x axis
- **yDim** (2 element float array) - min and max boundaries for the y axis
- **yLabel** (string) - optional label for the yaxis
- **xLabel** (string) - optional label for the xaxis
- **title** (string) - optional subplot title

In [None]:
def setAxisProperties(tempAxis,xDim,yDim,yLabel=None,xLabel = None,title=None):
    """Set the limits, dashed reference lines and optional text of one subplot.

    A vertical reference line is drawn at x = 0.7 and a horizontal one at
    y = 0.7.  Labels and title are only set when a non-empty value is given.
    """
    lineStyle = {'color': 'black', 'linestyle': 'dashed'}
    referenceLines = (
        mlines.Line2D([0.7, 0.7], [0, 1], **lineStyle),   # vertical line
        mlines.Line2D([0,1],[0.7,0.7], **lineStyle),      # horizontal line
    )

    tempAxis.set_xlim(xDim)
    tempAxis.set_ylim(yDim)
    for referenceLine in referenceLines:
        tempAxis.add_line(referenceLine)

    if(yLabel):
        tempAxis.set_ylabel(yLabel)
    if(xLabel):
        tempAxis.set_xlabel(xLabel)
    if(title):
        tempAxis.set_title(title)

### plot subset for one outcome of interest  ### 
plot includes all models and datasets for the outcome.  Colors and markers correspond to dataset and model, respectively <br>

**Inputs** <br>
- **performData** (pandas dataframe) - contains denormalized records of the dataset.  Important keys in the dataset include: <br>
    1) outcome - which outcome the record corresponds to <br>
    2) dataset - which dataset the record corresponds to (e.g. train, dev, test, etc.) <br>
    3) model - which model the record corresponds to <br>
    4) precision <br>
    5) recall <br>
    
    
- **outcome** (string) - outcome of interest.  Used to screen dataset
- **outcomeVar** (string) - key for the column in the dataset that contains the outcome label
- **subplotMatrix** (3 element int array) - dimensions of the master plot and which index to push the subplot
- **xDim** (2 element float array) - min and max boundaries of the x-axis
- **yDim** (2 element float array) - min and max boundaries of the y-axis
- **colorVec** (string array) - colors to distinguish between origin datasets (e.g. train, dev,) in the subplot <br>
- **markerVec** (string array) - markers to distinguish between models in the subplot <br>

**Outputs** <br>
- **colorDict** (dict) - dataset:color pairs.  Mostly for debug purposes
- **markerDict** (dict) - model:marker pairs.  Mostly for debug purposes

In [None]:
def plotByOutcome(performData,outcome,outcomeVar,subplotMatrix = (4,2,1),
                  xDim = (0.4,1),yDim = (0.2,1),
                  colorVec = ('red','blue','green','#a05195','#d45087','#f95d6a','#ff7c43','#ffa600'),
                  markerVec = ('+','o','s','d','x','^'),debug=False):
    """Plot precision (x) vs. recall (y) of every dataset-model pair for one outcome.

    Colors distinguish datasets and markers distinguish models.  For each
    dataset a line joins the 'Whole' and 'NoEmbed' models when both are present.

    Returns (colorDict, markerDict): dataset:color and model:marker pairs.
    """
    # setup axis and graph properties
    tempAxis = plt.subplot(subplotMatrix[0],subplotMatrix[1],subplotMatrix[2])
    setAxisProperties(tempAxis,xDim,yDim,'Recall','Precision',outcome)

    # setup local variables
    outcomeSubset = performData.loc[performData[outcomeVar] == outcome]  # data subset for the outcome of interest
    dataOrigins = sorted(set(outcomeSubset['Dataset']))          # set of datasets
    models = sorted(set(outcomeSubset['Model']))                 # set of models
    modelsOfInterest = ['Whole','NoEmbed']                       # models to compare
    markerDict = {}                                              # store model:marker pairs
    colorDict = {}                                               # store dataset:color pairs

    # for each dataset and each model, plot the precision vs. recall value
    # with the appropriate (color,symbol) type
    for originIndex, origin in enumerate(dataOrigins):

        # subset records to the current 'origin' dataset of interest
        originSubset = outcomeSubset.loc[outcomeSubset['Dataset'] == origin]
        datasetColor = colorVec[originIndex]
        colorDict[origin] = datasetColor

        # endpoints of the comparison line, collected per dataset so that a
        # missing model cannot reuse coordinates from the previous dataset
        comparePoints = {}

        # for the current dataset, go through each model type
        for modelIndex, model in enumerate(models):

            # only one datapoint is expected for each model-dataset combination
            dataPoint = originSubset.loc[originSubset['Model'] == model]
            if dataPoint.empty:
                continue

            if model in modelsOfInterest:
                # .iloc[0] extracts the scalar; float() on a Series is deprecated
                comparePoints[model] = (float(dataPoint['Precision'].iloc[0]),
                                        float(dataPoint['Recall'].iloc[0]))

            modelMarker = markerVec[modelIndex]
            markerDict[model] = modelMarker
            plt.scatter(
                dataPoint['Precision'],
                dataPoint['Recall'],
                marker = modelMarker,
                color = datasetColor
            )

        # add line comparing the two models of interest for the current dataset
        if all(model in comparePoints for model in modelsOfInterest):
            compareLineX = [comparePoints[model][0] for model in modelsOfInterest]
            compareLineY = [comparePoints[model][1] for model in modelsOfInterest]
            compare_line = mlines.Line2D(compareLineX,compareLineY,color=datasetColor)
            tempAxis.add_line(compare_line)

    # when debugging, push graph to screen rather than saving to file
    if(debug):
        plt.show()

    return(colorDict,markerDict)

### plot precision vs. recall performance for all outcomes, datasets, and models ###
Graph consists of multiple subplots, with one subplot for each outcome.  Points are colored by originating dataset (e.g. train, dev, test), while markers correspond to model (e.g. whole model, model with no emoticons, etc.)

**Inputs** <br>
- **performData** (pandas dataframe) - contains all records and labels for all outcomes.  Keys include: <br>
    1) outcome - which outcome the record corresponds to <br>
    2) dataset - which dataset the record corresponds to (e.g. train, dev, test, etc.) <br>
    3) model - which model the record corresponds to <br>
    4) precision <br>
    5) recall <br>

- **xLims** (2 element float array) - x-axis boundaries <br>
- **yLims** (2 element float array) - y-axis boundaries <br>
- **title** (string) - optional graph title
- **outputFilename** (string) - absolute filepath where the plot will be saved.  If none, plot is pushed to notebook <br>

In [None]:
def plotPrecisionRecall(performData,xLims,yLims,title="Precision vs. Recall",outputFilename=None):
    """Plot precision vs. recall for every outcome, one subplot per outcome.

    Subplots are laid out on a 4 x 2 grid.  xLims[k] and yLims[k] are the axis
    boundaries of the k-th outcome.  The figure is saved to outputFilename when
    given; otherwise the color and marker assignments are printed and the
    figure is shown.
    """
    # local variable setup.
    fig = figure(num=None, figsize=(8, 16), dpi=160, facecolor='w', edgecolor='k')
    fig.suptitle(title, fontsize=16)

    # unique outcomes in order of first appearance.  A set has no stable order,
    # which made the pairing of outcomes with xLims / yLims vary between runs.
    # NOTE(review): confirm the limits passed by callers are listed in the
    # order the outcomes appear in performData
    uniqueOutcomes = list(dict.fromkeys(performData['Outcome']))

    colorDict, markerDict = {}, {}

    # subplot indices start at 1, not 0
    for index, outcome in enumerate(uniqueOutcomes, start=1):
        colorDict, markerDict = plotByOutcome(
            performData,
            outcome,
            'Outcome',
            [4,2,index],
            xLims[index-1],
            yLims[index-1]
        )

    if(not outputFilename):
        print(colorDict)
        print(markerDict)
        plt.show()

    else:
        plt.savefig(outputFilename,bbox_inches="tight")

In [None]:
# create performance graphs based on input CSV data.  performCSV corresponds to an absolute CSV filepath
def createPerformanceGraphs(performCSV):
    """Load the performance CSV and save two precision-recall graphs to
    performFolder: one with per-outcome axis boundaries and one with the same
    boundaries for every outcome."""
    performDataset = loadPerformanceData(performCSV,True)
    numPanels = 7

    graphSettings = [
        # different x- y-axis boundaries for each outcome
        ("precRecallVaryingBound.eps",
         "Precision vs. Recall with Varying Boundaries",
         [[0.4,1],[0.7,1],[0.7,1],[0.7,1],[0.7,1],[0.7,1],[0.4,1]],
         [[0.4,1],[0.4,1],[0.5,1],[0.2,1],[0.5,1],[0.7,1],[0.0,1]]),
        # the same x- and y-axis boundaries for each outcome
        ("precRecallFixedBound.eps",
         "Precision vs. Recall with Set Boundaries",
         [[0.4,1] for panel in range(numPanels)],
         [[0.0,1] for panel in range(numPanels)]),
    ]

    for filename, graphTitle, xLims, yLims in graphSettings:
        outputFilepath = performFolder + filename
        plotPrecisionRecall(performDataset,xLims,yLims,graphTitle,outputFilepath)

In [None]:
# run part 2: graph the precision and recall estimates stored in the performCSV_Denorm file
createPerformanceGraphs(performCSV_Denorm)