In [3]:
import csv
import os

RESULTS_DIR = '/Users/jarridr/repos/decagon/finals'
INIT_DATA_SET_PROPORTION = 0.2

DATA_SET_ID_IDX   = 0
EPOCH_IDX         = 1
LOSS_IDX          = 2
LATENCY_IDX       = 3
EVALUATED_ALL_IDX = 4
EDGE_TYPE_IDX     = 5
AUROC_IDX         = 6
AUPRC_IDX         = 7
APK_IDX           = 8

class TrainingJobResults:
    def __init__(
        self, 
        activeLearningPolicy, 
        edgeTypePredicted, 
        dataSetProportion, 
        aurocVals, 
        auprcVals
    ):
        self.activeLearningPolicy = activeLearningPolicy 
        self.edgeTypePredicted = edgeTypePredicted
        self.dataSetProportion = dataSetProportion
        self.aurocVals = aurocVals
        self.auprcVals = auprcVals
        
    def combine(self, other):
        self.aurocVals.extend(other.aurocVals)
        self.auprcVals.extend(other.auprcVals)
        
class DataSetInformation:
    OkayPolicies = set(['RandomMasking', 'Greedy'])
    OkayEdgeTypes = set(['Neutropenia', 'Hyperglycaemia', 'Anosmia'])
    
    def __init__(
        self,
        activeLearningPolicy, 
        edgeTypePredicted,
        dataSetProportion, 
    ):  
        if activeLearningPolicy not in DataSetInformation.OkayPolicies:
            raise ValueError
            
        if edgeTypePredicted not in DataSetInformation.OkayEdgeTypes:
            raise ValueError
        
        self.activeLearningPolicy = activeLearningPolicy 
        self.edgeTypePredicted = edgeTypePredicted
        self.dataSetProportion = dataSetProportion
        
    def __hash__(self):
        return hash((
            self.activeLearningPolicy,
            self.edgeTypePredicted,
            self.dataSetProportion
        ))

def parseAll():    
    result = {}
    
    for trainingJobResultsDict in map(parseFile, os.listdir(RESULTS_DIR)):
        for trainingJobResults in trainingJobResultsDict.values():
            activeLearnPol = trainingJobResults.activeLearningPolicy
            if activeLearnPol not in result:
                result[activeLearnPol] = {}

            edgeTypePredicted = trainingJobResults.edgeTypePredicted
            if edgeTypePredicted not in result[activeLearnPol]:
                result[activeLearnPol][edgeTypePredicted] = {}

            dataSetProportion = trainingJobResults.dataSetProportion
            if dataSetProportion not in result[activeLearnPol][edgeTypePredicted]:
                result[activeLearnPol][edgeTypePredicted][dataSetProportion] = trainingJobResults
            else:
                result[activeLearnPol][edgeTypePredicted][dataSetProportion].combine(trainingJobResults)

    return result
        

def parseFile(rawFilename: str) -> TrainingJobResults:
    result = {}
    f = open('%s/%s' % (RESULTS_DIR, rawFilename))
    reader = csv.reader(f)

    # Skip the header
    try:
        next(reader)
    except:
        import pdb; pdb.set_trace()

    for iteration in reader:
        # Only use data for the first epoch
        if iteration[EPOCH_IDX] != '1':
            continue
        
        try:
            dataSetInformation = parseDataSetId(iteration[DATA_SET_ID_IDX])
        
            if dataSetInformation not in result:
                result[dataSetInformation] = TrainingJobResults(
                    dataSetInformation.activeLearningPolicy,
                    dataSetInformation.edgeTypePredicted,
                    dataSetInformation.dataSetProportion,
                    aurocVals=[],
                    auprcVals=[]
                )

            import pdb; pdb.set_trace()
            result[dataSetInformation].aurocVals.append(float(iteration[AUROC_IDX]))
            result[dataSetInformation].auprcVals.append(float(iteration[AUPRC_IDX]))
            
        except:
            continue
    
    f.close()
    
    import pdb; pdb.set_trace()
    return result
        
def parseDataSetId(dataSetId: str) -> DataSetInformation:
    activeLearningPolicyStartIdx = 0
    activeLearningPolicyEndIdx = dataSetId.find('ActiveLearner')
    
    edgeTypeStartIdx = dataSetId.find('DataSet') + len('DataSet')
    edgeTypeEndIdx = dataSetId.find('AdjMtx')
    
    activeLearningPolicyIterNum = int(dataSetId[-1])
    dataSetProportion = (20 + min(80, 2 ** activeLearningPolicyIterNum)) / 100
    
    return DataSetInformation(
        dataSetId[activeLearningPolicyStartIdx:activeLearningPolicyEndIdx],
        dataSetId[edgeTypeStartIdx:edgeTypeEndIdx],
        dataSetProportion
    )
    

In [2]:
# Plot the data!
from typing import Dict, Iterable
import seaborn as sns
import pandas as pd
import numpy as np

TrainDataResults = Dict[str, Dict[str, Dict[float, TrainingJobResults]]]

def getResultsDict(
    policyName: str,
    proportionToRes: Dict[float, TrainingJobResults], 
    plotAuprc
) -> Dict[str, Iterable]:
    def metricExtractor(results: Iterable[TrainingJobResults], plotAuprc: bool):
        preAttrName = 'auprcVals' if plotAuprc else 'aurocVals' 
        metrics = list(map(lambda x: np.max(getattr(x, preAttrName)), results))
        
        # Slice at -4 to exclude the "Vals" substring from attrName
        return preAttrName[:-4], metrics

    import pdb; pdb.set_trace()
    attrName, metrics = metricExtractor(proportionToRes.values(), plotAuprc)
    
    return {
        'LearningPolicy': [policyName for _ in proportionToRes],
        'DataSetProportion': [dataSetProportion for dataSetProportion in proportionToRes.keys()],
        attrName: metrics
    }

def trainResultsAsDF(
    trainResults: TrainDataResults, 
    edgeType: str, 
    plotAuprc: bool
) -> pd.DataFrame:
    randomResultsDict = getResultsDict(
        'RandomMasking', 
        trainResults['RandomMasking'][edgeType], 
        plotAuprc
    )
    
    greedyResultsDict = getResultsDict(
        'Greedy', 
        trainResults['Greedy'][edgeType], 
        plotAuprc
    )
    
    randResultsDf   = pd.DataFrame(randomResultsDict)
    greedyResultsDf = pd.DataFrame(greedyResultsDict)
    
    return pd.concat([randResultsDf, greedyResultsDf])

def plotData(trainResults: TrainDataResults, edgeType: str, plotAuprc: bool) -> None:
    dataFrame = trainResultsAsDF(trainResults, edgeType, plotAuprc)
    
    yKey = 'auprc' if plotAuprc else 'auroc'
    sns.lineplot(
        x='DataSetProportion',
        y=yKey,
        hue='variable',
        data=pd.melt(dataFrame, ['LearningPolicy']),
        estimator=None,
        style='choice'
    )
    

NameError: name 'TrainingJobResults' is not defined

In [19]:
allData: TrainDataResults = parseAll()

KeyError: 'Random'

In [29]:
plotData(allData, 'Hyperglycaemia', plotAuprc=False)

> <ipython-input-28-29ec026d1178>(22)getResultsDict()
-> attrName, metrics = metricExtractor(proportionToRes.values(), plotAuprc)


(Pdb)  len(proportionToRes.values())


8


(Pdb)  proportionToRes.keys()


dict_keys([0.52, 0.21, 0.84, 0.24, 1.0, 0.22, 0.28, 0.36])


(Pdb)  proportionToRes[0.21]


<__main__.TrainingJobResults object at 0x7ff9c0fbb9d0>


(Pdb)  proportionToRes[0.21].aurocVals


[0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 4.0, 0.0, 2.0, 5.0, 3.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0, 0.0, 2.0, 5.0, 1.0,

(Pdb)  q


BdbQuit: 