HemoPheno4HF  
SCRIPT DESCRIPTION: Training process for developing MVDDs  
CODE DEVELOPED BY: Josephine Lamp  
ORGANIZATION: University of Virginia, Charlottesville, VA  
LAST UPDATED: 8/24/2020  

In [None]:
from MVDD.MVDD import MVDD
import MVDD.MVDD_Generator as mvGen
import networkx as nx
from networkx.drawing.nx_pydot import *
import Params as params
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
import random
from itertools import permutations
import warnings
warnings.filterwarnings('ignore')

# Hemo

In [None]:
# Load the hemodynamics cluster file, using the DEIDNUM column as the index
hemoData = pd.read_csv('Data/Preprocessed Data/Cluster_Hemo.csv', index_col='DEIDNUM')

# Keep each score column as its own Series before stripping them from the features
allScores = hemoData['Score']
death = hemoData['ScoreDeath']
rehosp = hemoData['ScoreRehosp']
readm = hemoData['ScoreReadmission']

# Preprocess and create training and testing sets
# Drop all four score columns in one call so none of them remains as a feature.
hemo = hemoData.drop(columns=['Score', 'ScoreDeath', 'ScoreRehosp', 'ScoreReadmission'])
# Zero out every non-finite entry: +inf AND -inf first, then NaN.
# (Replacing only np.inf would let -inf values through to the model.)
hemo = hemo.replace([np.inf, -np.inf], 0).fillna(0)
# NOTE: no random_state is passed, so the 80/20 split differs from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(hemo, allScores, test_size=.2)

print(xTrain.shape, xTest.shape)
xTrain

In [None]:
def getParamAccuracy(treeFilename, rootNode, xData, yData, paramRanges, relops):
    """Parameterize a stored tree and report its accuracy on the given data.

    The DOT file at ``treeFilename + '.dot'`` is read into a directed graph
    and wrapped in an MVDD rooted at ``rootNode``. ``mvGen.addGraphParams``
    then attaches parameters using ``paramRanges`` and ``relops``, and the
    result is saved under ``treeFilename + "Params"`` through both
    ``saveToFile`` and ``saveDotFile``.

    :param treeFilename: path of the tree file without the '.dot' extension
    :param rootNode: root node handed to the MVDD constructor
    :param xData: DataFrame whose rows are scored one at a time
    :param yData: true labels compared against the predictions
    :param paramRanges: forwarded unchanged to ``mvGen.addGraphParams``
    :param relops: forwarded unchanged to ``mvGen.addGraphParams``
    :return: the value of ``accuracy_score(yData, predictions)``
    """
    graph = nx.DiGraph(read_dot(treeFilename + '.dot'))
    baseTree = MVDD(params.hemo, graph, root=rootNode)
    baseTree.featureDict = params.hemoDict

    # Only the parameterized tree is needed; the other two results are discarded.
    paramTree, _, _ = mvGen.addGraphParams(baseTree, paramRanges, relops, inorder=True)

    outputStem = treeFilename + "Params"
    paramTree.saveToFile(outputStem)
    paramTree.saveDotFile(outputStem)

    # predictScore yields a (score, path) pair per row; keep the integer score only.
    samples = (sample for _, sample in xData.iterrows())
    predicted = [int(score) for score, _ in map(paramTree.predictScore, samples)]

    return accuracy_score(yData, predicted)

In [None]:
#Working here, find better parameter estimation method
featureRanges = params.hemoDict
treeAccDF = pd.read_csv("AccuracyDFTrees.csv")

# Build the MVDD explicitly: no other notebook-level statement binds `mvdd`
# (it only exists as a local inside getParamAccuracy), so reading it here
# would raise NameError on a fresh kernel.
# NOTE(review): assumes the tree to sample for is the same file/root that the
# optimization run passes to getParamAccuracy (treeRandom37, root 'PCWP') - confirm.
treeFile = 'TreeFiles/TreeTraining/treeRandom37'
mvdd = MVDD(params.hemo, nx.DiGraph(read_dot(treeFile + '.dot')), root='PCWP')
mvdd.featureDict = params.hemoDict

#get total number of branches for each node
branchDict = mvdd.getNumberBranchesPerNode(returnTerminals=False)

#based on number of branches per node, pick a split of values in the variable range
# Each node gets one uniformly drawn value per branch, taken between the first
# and second entries of that node's featureRanges item.
paramDict = {}
for node, numBranches in branchDict.items():
    low = featureRanges[node][0]
    high = featureRanges[node][1]
    paramDict[node] = [random.uniform(low, high) for _ in range(numBranches)]

In [None]:
# Run param optimization
filename = 'TreeFiles/TreeTraining/treeRandom37'

# Look up the previously recorded accuracy for this tree. Select the scalar
# explicitly: calling float() on a one-element Series is deprecated in recent
# pandas, and a missing or duplicated row would otherwise fail obscurely.
recorded = treeAccDF.loc[treeAccDF['Filename'] == filename + '.dot', 'Accuracy']
if len(recorded) != 1:
    raise ValueError(
        "Expected exactly one accuracy row for %s.dot, found %d" % (filename, len(recorded))
    )
origacc = float(recorded.iloc[0])
print("Original Accuracy was", origacc)

# Reuse `filename` so the lookup above and the evaluated tree cannot drift apart.
acc = getParamAccuracy(treeFilename=filename, rootNode='PCWP', xData=xTrain,
                       yData=yTrain, paramRanges=paramDict, relops=params.hemoRelopsV1)
print("Accuracy is", acc)

# All Data

In [None]:
# Load the all-data cluster file, using the DEIDNUM column as the index
allDataOrig = pd.read_csv('Data/Preprocessed Data/Cluster_AllData.csv', index_col='DEIDNUM')

# Keep each score column as its own Series before stripping them from the features
allScores = allDataOrig['Score']
death = allDataOrig['ScoreDeath']
rehosp = allDataOrig['ScoreRehosp']
readm = allDataOrig['ScoreReadmission']

# Preprocess and create training and testing sets
# Drop all four score columns in one call so none of them remains as a feature.
allData = allDataOrig.drop(columns=['Score', 'ScoreDeath', 'ScoreRehosp', 'ScoreReadmission'])
# Zero out every non-finite entry: +inf AND -inf first, then NaN.
# (Replacing only np.inf would let -inf values through to the model.)
allData = allData.replace([np.inf, -np.inf], 0).fillna(0)
# NOTE: no random_state is passed, so the 80/20 split differs from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(allData, allScores, test_size=.2)

print(xTrain.shape, xTest.shape)
xTrain

# Hemo

In [9]:
# Load the hemodynamics cluster file, using the DEIDNUM column as the index
hemoData = pd.read_csv('Data/Preprocessed Data/Cluster_Hemo.csv', index_col='DEIDNUM')

# Keep each score column as its own Series before stripping them from the features
allScores = hemoData['Score']
death = hemoData['ScoreDeath']
rehosp = hemoData['ScoreRehosp']
readm = hemoData['ScoreReadmission']

# Preprocess and create training and testing sets
# Drop all four score columns in one call so none of them remains as a feature.
hemo = hemoData.drop(columns=['Score', 'ScoreDeath', 'ScoreRehosp', 'ScoreReadmission'])
# Zero out every non-finite entry: +inf AND -inf first, then NaN.
# (Replacing only np.inf would let -inf values through to the model.)
hemo = hemo.replace([np.inf, -np.inf], 0).fillna(0)
# NOTE: no random_state is passed, so the 80/20 split differs from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(hemo, allScores, test_size=.2)

print(xTrain.shape, xTest.shape)
xTrain

(334, 27) (84, 27)


Unnamed: 0_level_0,RAP,PAS,PAD,PAMN,PCWP,PCWPMod,PCWPA,PCWPMN,CO,CI,...,MPAP,CPI,PP,PPP,PAPP,SVR,RAT,PPRatio,Age,EjF
DEIDNUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53623,6.0,28.0,17.0,20.0,16.0,16.0,16.0,16.0,2.92,2.30,...,39.333333,0.537177,47.0,0.573171,0.392857,2721.461187,0.375000,0.643836,34.0,20.0
55140,14.0,74.0,43.0,53.0,29.0,29.0,29.0,29.0,2.20,1.10,...,102.666667,0.336585,28.0,0.297872,0.418919,4509.090909,0.482759,0.294737,66.0,15.0
58594,3.0,46.0,18.0,32.0,17.0,17.0,17.0,0.0,3.00,1.94,...,58.000000,0.655270,39.0,0.364486,0.608696,3982.222222,0.176471,0.520000,69.0,10.0
1084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.00,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,49.0,14.0
60522,10.0,29.0,13.0,19.0,7.0,7.0,0.0,7.0,3.90,2.00,...,37.666667,0.702143,50.0,0.434783,0.551724,3042.735043,1.428571,0.833333,63.0,20.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52030,6.0,30.0,16.0,20.0,13.0,13.0,13.0,13.0,2.61,1.60,...,40.666667,0.463562,44.0,0.458333,0.466667,3821.200511,0.461538,0.505747,56.0,20.0
93759,0.0,58.0,20.0,38.0,0.0,20.0,0.0,0.0,5.20,2.30,...,71.333333,0.761567,36.0,0.346154,0.655172,0.000000,0.000000,0.537313,46.0,20.0
22684,13.0,61.0,29.0,43.0,0.0,29.0,0.0,0.0,4.60,2.10,...,80.333333,0.828825,68.0,0.507463,0.524590,2869.565217,0.000000,0.883117,64.0,25.0
91186,18.0,0.0,0.0,38.0,24.0,24.0,0.0,24.0,5.10,2.70,...,0.000000,0.810200,52.0,0.509804,0.000000,1840.522876,0.750000,0.753623,66.0,30.0


In [10]:
def getParamAccuracy(treeFilename, rootNode, xData, yData, paramRanges, relops):
    """Parameterize a stored tree and report its accuracy on the given data.

    The DOT file at ``treeFilename + '.dot'`` is read into a directed graph
    and wrapped in an MVDD rooted at ``rootNode``. ``mvGen.addGraphParams``
    then attaches parameters using ``paramRanges`` and ``relops``, and the
    result is saved under ``treeFilename + "Params"`` through both
    ``saveToFile`` and ``saveDotFile``.

    :param treeFilename: path of the tree file without the '.dot' extension
    :param rootNode: root node handed to the MVDD constructor
    :param xData: DataFrame whose rows are scored one at a time
    :param yData: true labels compared against the predictions
    :param paramRanges: forwarded unchanged to ``mvGen.addGraphParams``
    :param relops: forwarded unchanged to ``mvGen.addGraphParams``
    :return: the value of ``accuracy_score(yData, predictions)``
    """
    graph = nx.DiGraph(read_dot(treeFilename + '.dot'))
    baseTree = MVDD(params.hemo, graph, root=rootNode)
    baseTree.featureDict = params.hemoDict

    # Only the parameterized tree is needed; the other two results are discarded.
    paramTree, _, _ = mvGen.addGraphParams(baseTree, paramRanges, relops, inorder=True)

    outputStem = treeFilename + "Params"
    paramTree.saveToFile(outputStem)
    paramTree.saveDotFile(outputStem)

    # predictScore yields a (score, path) pair per row; keep the integer score only.
    samples = (sample for _, sample in xData.iterrows())
    predicted = [int(score) for score, _ in map(paramTree.predictScore, samples)]

    return accuracy_score(yData, predicted)

In [27]:
#Working here, find better parameter estimation method
featureRanges = params.hemoDict
treeAccDF = pd.read_csv("AccuracyDFTrees.csv")

# Build the MVDD explicitly: no other notebook-level statement binds `mvdd`
# (it only exists as a local inside getParamAccuracy), so reading it here
# would raise NameError on a fresh kernel.
# NOTE(review): assumes the tree to sample for is the same file/root that the
# optimization run passes to getParamAccuracy (treeRandom37, root 'PCWP') - confirm.
treeFile = 'TreeFiles/TreeTraining/treeRandom37'
mvdd = MVDD(params.hemo, nx.DiGraph(read_dot(treeFile + '.dot')), root='PCWP')
mvdd.featureDict = params.hemoDict

#get total number of branches for each node
branchDict = mvdd.getNumberBranchesPerNode(returnTerminals=False)

#based on number of branches per node, pick a split of values in the variable range
# Each node gets one uniformly drawn value per branch, taken between the first
# and second entries of that node's featureRanges item.
paramDict = {}
for node, numBranches in branchDict.items():
    low = featureRanges[node][0]
    high = featureRanges[node][1]
    paramDict[node] = [random.uniform(low, high) for _ in range(numBranches)]

In [28]:
# Run param optimization
filename = 'TreeFiles/TreeTraining/treeRandom37'

# Look up the previously recorded accuracy for this tree. Select the scalar
# explicitly: calling float() on a one-element Series is deprecated in recent
# pandas, and a missing or duplicated row would otherwise fail obscurely.
recorded = treeAccDF.loc[treeAccDF['Filename'] == filename + '.dot', 'Accuracy']
if len(recorded) != 1:
    raise ValueError(
        "Expected exactly one accuracy row for %s.dot, found %d" % (filename, len(recorded))
    )
origacc = float(recorded.iloc[0])
print("Original Accuracy was", origacc)

# Reuse `filename` so the lookup above and the evaluated tree cannot drift apart.
acc = getParamAccuracy(treeFilename=filename, rootNode='PCWP', xData=xTrain,
                       yData=yTrain, paramRanges=paramDict, relops=params.hemoRelopsV1)
print("Accuracy is", acc)

Original Accuracy was 0.4910179640718562
Accuracy is 0.15568862275449102


# All Data

In [28]:
# Load the all-data cluster file, using the DEIDNUM column as the index
allDataOrig = pd.read_csv('Data/Preprocessed Data/Cluster_AllData.csv', index_col='DEIDNUM')

# Keep each score column as its own Series before stripping them from the features
allScores = allDataOrig['Score']
death = allDataOrig['ScoreDeath']
rehosp = allDataOrig['ScoreRehosp']
readm = allDataOrig['ScoreReadmission']

# Preprocess and create training and testing sets
# Drop all four score columns in one call so none of them remains as a feature.
allData = allDataOrig.drop(columns=['Score', 'ScoreDeath', 'ScoreRehosp', 'ScoreReadmission'])
# Zero out every non-finite entry: +inf AND -inf first, then NaN.
# (Replacing only np.inf would let -inf values through to the model.)
allData = allData.replace([np.inf, -np.inf], 0).fillna(0)
# NOTE: no random_state is passed, so the 80/20 split differs from run to run.
xTrain, xTest, yTrain, yTest = train_test_split(allData, allScores, test_size=.2)

print(xTrain.shape, xTest.shape)
xTrain

(692, 118) (174, 118)


Unnamed: 0_level_0,Age,Gender,Race,Wt,BMI,InitialHospDays,TotalHospDays,NYHA,MLHFS,AF,...,VALSA,EjF,BPDIAS,BPSYS,HR,PV,MAP,PP,PPP,PPRatio
DEIDNUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
53357,81.0,1.0,1.0,78.727273,27.893733,11.0,11.0,3.0,58.0,1.0,...,0.0,29.0,56.0,82.0,80.0,10.931617,119.333333,26.0,0.317073,0.325000
12868,72.0,1.0,1.0,69.090909,25.377744,12.0,17.0,3.0,66.0,1.0,...,0.0,14.0,63.0,96.0,78.0,0.000000,138.000000,33.0,0.343750,0.423077
5401,61.0,1.0,1.0,103.500000,34.581844,12.0,11.0,3.0,75.0,1.0,...,0.0,20.0,76.0,92.0,96.0,3.022408,142.666667,16.0,0.173913,0.166667
85589,50.0,1.0,1.0,114.000000,35.980306,8.0,66.0,4.0,65.0,1.0,...,0.0,15.0,34.0,80.0,57.0,15.887063,102.666667,46.0,0.575000,0.807018
1059,79.0,1.0,1.0,77.100000,26.678201,13.0,11.0,4.0,54.0,1.0,...,0.0,15.0,61.0,90.0,98.0,0.000000,130.666667,29.0,0.322222,0.295918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84987,56.0,1.0,4.0,54.100000,21.132812,6.0,9.0,4.0,80.0,0.0,...,0.0,20.0,72.0,117.0,110.0,-12.705110,165.000000,45.0,0.384615,0.409091
17343,49.0,2.0,2.0,98.727273,34.161686,4.0,10.0,4.0,67.0,0.0,...,0.0,15.0,73.0,114.0,58.0,-4.419776,162.666667,41.0,0.359649,0.706897
17905,69.0,1.0,1.0,82.000000,25.880571,3.0,3.0,2.0,61.0,1.0,...,0.0,5.0,50.0,100.0,70.0,-13.067916,133.333333,50.0,0.500000,0.714286
30415,62.0,2.0,2.0,73.600000,39.213597,6.0,17.0,3.0,92.0,0.0,...,0.0,17.0,74.0,106.0,76.0,-4.836635,155.333333,32.0,0.301887,0.421053
