# Preliminaries

In [None]:
#Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import copy
import warnings
import pickle
from scipy import stats
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from kneed import KneeLocator
from sklearn.cluster import AgglomerativeClustering


# Use a 12pt default font for every matplotlib figure drawn in this notebook.
plt.rc('font', size=12)
# Silence all Python warnings for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [None]:
#Load ESCAPE training data
# Each ESCAPE frame is indexed by the DEIDNUM column.
escapeAllData = pd.read_csv("Data/Original DataFrames/AllDataSingleValue.csv", sep=",", index_col='DEIDNUM') #all feature dataset
escapeHemo = pd.read_csv("Data/Original DataFrames/HemoSingleValue.csv", sep=",", index_col='DEIDNUM') #dataset with only hemodynamics
escapeLabels  = pd.read_csv("Data/Original DataFrames/Labels.csv", sep=",", index_col='DEIDNUM') #labels for prediction classes 

#Load validation
# The ARIC cohort frames are indexed by the ID column.
aricCohortHemo = pd.read_csv("Data Validation/ARIC/Preprocessed Data/Clustered_HemoDF_ARIC_Cohort.csv", sep=",", index_col='ID')
aricCohortAllData = pd.read_csv("Data Validation/ARIC/Preprocessed Data/Clustered_AllData_ARIC_Cohort.csv", sep=",", index_col='ID')
cohortLabels = pd.read_csv("Data Validation/ARIC/Original DataFrames/LabelsAricCohort.csv", sep=",", index_col='ID')


# Cluster Labels

In [None]:
#HELPER METHODS

def plotPCAClusters(X, y_km, K, name):
    """Scatter-plot two-component scores coloured by cluster and save the figure.

    Parameters:
        X: two-column data (component 1, component 2), one row per sample.
        y_km: zero-based cluster labels, one per row of X.
        K: number of colours to draw from the 'hls' palette.
        name: suffix used in the saved PNG file name.

    Returns:
        X, unchanged.
    """
    scoreFrame = pd.DataFrame(X, columns=["Component_1", "Component_2"])
    # Labels are shifted by one so the legend starts at cluster 1.
    scoreFrame['Cluster'] = y_km + 1

    plt.figure(figsize=(16, 7))
    clusterPalette = sns.color_palette('hls', K)
    sns.scatterplot(
        data=scoreFrame,
        x='Component_1',
        y='Component_2',
        hue='Cluster',
        palette=clusterPalette,
        legend='full',
    )
    plt.title("Resulting Score Clusters")

    # Save before show() so the figure is written to disk first.
    figurePath = "Score Label Preprocessing/Figures/Score_Clusters_" + name + ".png"
    plt.savefig(figurePath)
    plt.show()

    return X

def plotClusters(X, y_km, columns, K):
    """Scatter-plot two original feature columns coloured by cluster label.

    The labels are attached to a copy of X, so the caller's DataFrame is not
    modified. Previously the 'cluster' column was written into the caller's
    raw dataset and could leak into later preprocessing as a feature.

    Parameters:
        X: DataFrame holding the original (unscaled) features.
        y_km: cluster labels, one per row of X.
        columns: sequence whose first two entries name the x and y columns.
        K: number of colours to draw from the 'hls' palette.

    Returns:
        A copy of X with an added 'cluster' column.
    """
    plotted = X.copy()
    plotted['cluster'] = y_km

    plt.figure(figsize=(16, 7))
    sns.scatterplot(x=columns[0], y=columns[1], hue='cluster',
                    palette=sns.color_palette('hls', K), data=plotted, legend='full')
    plt.show()

    return plotted

def compareClusters(dt, K):
    """Split a clustered frame by label and summarise each subset.

    Parameters:
        dt: DataFrame with a 'cluster' column of integer labels.
        K: number of clusters; labels 0 .. K-1 are looked up in order.

    Returns:
        (clusters, clusterDesc): the rows belonging to each label, and the
        matching ``describe()`` table for each of those subsets.
    """
    clusters = [dt[dt['cluster'] == label] for label in range(K)]
    clusterDesc = [subset.describe() for subset in clusters]
    return clusters, clusterDesc

def runClustering(clusterModel, data, origData, labels, modelName, columns, pca=False):
    """Fit a clustering model, save it, plot the result and summarise each cluster.

    Parameters:
        clusterModel: estimator exposing ``fit_predict``.
        data: the input passed to ``fit_predict`` (and to the PCA plot when pca=True).
        origData: DataFrame of original features; a copy of it receives the
            cluster labels and outcome columns.
        labels: DataFrame providing 'Death', 'Rehosp' and 'Readmission' columns.
        modelName: base name of the pickled model file (and of the PCA figure).
        columns: the two column names plotted when pca is False.
        pca: when True, plot ``data`` as two components instead of ``origData``.

    Returns:
        (hemoDF, descSummary): the labelled copy of origData, and the
        per-cluster ``describe()`` tables stacked on top of each other.

    Side effects:
        Writes 'Score Label Preprocessing/ClusterModels/<modelName>.sav' and
        shows a plot.
    """
    nClusters = 5  # number of clusters plotted and summarised (fixed in this notebook)

    #Run clustering
    y_model = clusterModel.fit_predict(data)

    #Save cluster model; the with-block guarantees the file handle is closed
    modelPath = 'Score Label Preprocessing/ClusterModels/' + modelName + '.sav'
    with open(modelPath, 'wb') as modelFile:
        pickle.dump(clusterModel, modelFile)

    #Plot Clusters
    if not pca:
        plotClusters(origData, y_model, columns, nClusters)
    else:
        plotPCAClusters(data, y_model, nClusters, modelName)

    #Get real data with clusters
    hemoDF = copy.deepcopy(origData)
    hemoDF['cluster'] = y_model

    #add class labels
    for outcome in ('Death', 'Rehosp', 'Readmission'):
        hemoDF[outcome] = labels[outcome]

    #get descriptions of each cluster
    _, clusterDesc = compareClusters(hemoDF, nClusters)

    #stack the per-cluster summaries (DataFrame.append was removed in pandas 2.0)
    descSummary = pd.concat(clusterDesc)

    return hemoDF, descSummary

def runValidationClustering(clusterModel, data, origData, labels, columns, pca=False, modelName='Validation'):
    """Cluster a validation cohort, plot the result and summarise each cluster.

    Parameters:
        clusterModel: estimator exposing ``fit_predict``.
        data: the input passed to ``fit_predict`` (and to the PCA plot when pca=True).
        origData: DataFrame of original features; a copy of it receives the
            cluster labels and any available outcome columns.
        labels: DataFrame that may provide 'Death', 'Rehosp' and 'Readmission';
            a missing column is reported and skipped.
        columns: the two column names plotted when pca is False.
        pca: when True, plot ``data`` as two components instead of ``origData``.
        modelName: suffix for the PCA figure file name. The PCA branch used to
            reference this name without it being defined, raising NameError.

    Returns:
        (hemoDF, descSummary): the labelled copy of origData, and the
        per-cluster ``describe()`` tables stacked on top of each other.
    """
    nClusters = 5  # number of clusters plotted and summarised (fixed in this notebook)

    #Run clustering
    # NOTE(review): fit_predict re-fits the supplied model on the validation
    # data rather than applying the previously fitted clusters.
    y_model = clusterModel.fit_predict(data) #TODO MIGHT NEED TO CHANGE THIS ONE

    #Plot Clusters
    if not pca:
        plotClusters(origData, y_model, columns, nClusters)
    else:
        plotPCAClusters(data, y_model, nClusters, modelName)

    #Get real data with clusters
    hemoDF = copy.deepcopy(origData)
    hemoDF['cluster'] = y_model

    #add class labels (best effort: a cohort may lack some outcomes)
    for outcome in ('Death', 'Rehosp', 'Readmission'):
        try:
            hemoDF[outcome] = labels[outcome]
        except (KeyError, TypeError):
            # KeyError: column absent; TypeError: labels is not subscriptable (e.g. None)
            print("No " + outcome + " Labels")

    #get descriptions of each cluster
    _, clusterDesc = compareClusters(hemoDF, nClusters)

    #stack the per-cluster summaries (DataFrame.append was removed in pandas 2.0)
    descSummary = pd.concat(clusterDesc)

    return hemoDF, descSummary

def _rankScores(means):
    """Return 1-based ranks for *means*: the smallest mean gets score 1."""
    ranks = np.empty(len(means), dtype=int)
    ranks[np.argsort(means)] = np.arange(1, len(means) + 1)
    return ranks.tolist()


def assignScoresFromClusters(descSummary):
    """Rank clusters on each outcome by their mean event rate.

    Parameters:
        descSummary: stacked per-cluster ``describe()`` tables, i.e. a frame
            with one 'mean' row per cluster and 'Death', 'Rehosp' and
            'Readmission' columns. Any number of clusters is accepted; the
            k-th 'mean' row is treated as cluster k.

    Returns:
        DataFrame indexed by 'Cluster' with, for each outcome, the cluster
        mean and its rank (1 = lowest mean, N = highest), plus 'AveScores',
        the average of the three ranks.
    """
    # A boolean mask always yields a frame, even when there is only one 'mean' row.
    meanRows = descSummary.loc[descSummary.index == 'mean']
    dthMn = meanRows['Death'].to_numpy()
    rehospMn = meanRows['Rehosp'].to_numpy()
    readmMn = meanRows['Readmission'].to_numpy()

    clsScDF = pd.DataFrame({
        'DeathMean': dthMn,
        'DeathScores': _rankScores(dthMn),
        'RehospMean': rehospMn,
        'RehospScores': _rankScores(rehospMn),
        'ReadmMean': readmMn,
        'ReadmScores': _rankScores(readmMn),
    })
    clsScDF.index.name = 'Cluster'

    clsScDF['AveScores'] = (
        clsScDF['DeathScores'] + clsScDF['RehospScores'] + clsScDF['ReadmScores']
    ) / 3
    return clsScDF
    

def saveClusteredData(hemoDF, scoreAsmts, saveName):
    """Translate cluster numbers into score columns and write the frame to CSV.

    Parameters:
        hemoDF: DataFrame with 'cluster', 'Death', 'Rehosp' and 'Readmission'
            columns; the four score columns are added to it in place.
        scoreAsmts: per-cluster score table, indexed by cluster number, with
            'FinalScores', 'DeathScores', 'RehospScores' and 'ReadmScores'.
        saveName: path of the CSV file to write.

    Returns:
        The frame as written: score columns added; 'cluster' and the three
        outcome columns removed.
    """
    clusNums = hemoDF['cluster']

    # (output column, source column in scoreAsmts), in the order they are added
    scoreColumns = (
        ('Score', 'FinalScores'),
        ('ScoreDeath', 'DeathScores'),
        ('ScoreRehosp', 'RehospScores'),
        ('ScoreReadmission', 'ReadmScores'),
    )
    for target, source in scoreColumns:
        hemoDF[target] = [int(scoreAsmts.loc[c][source]) for c in clusNums]

    # Remove the helper/outcome columns one at a time.
    for unwanted in ("cluster", "Death", "Rehosp", "Readmission"):
        hemoDF = hemoDF.drop(columns=[unwanted])

    hemoDF.to_csv(saveName)

    return hemoDF

# Cluster ESCAPE Hemo

In [None]:
# first - try other clustering methods

#try clustering to find similar groups using PCA
# Build the clustering input: infinities of either sign and missing values
# become 0 so the estimator only receives finite numbers. replace()/fillna()
# return new frames, so escapeHemo itself is left untouched.
hemo = escapeHemo.replace([np.inf, -np.inf], 0).fillna(0)
data = hemo
# Optional alternative input: scale the features and project them onto two
# principal components (then call runClustering with pca=True).
# scaler = MinMaxScaler()#scale data
# hemo.loc[:,:] = scaler.fit_transform(hemo)
# pca = PCA(n_components=2)
# data = pca.fit_transform(hemo)

In [None]:
#TODO HERE- Choose clustering method
# Ward linkage only supports Euclidean distance, which is already the default.
# The former `affinity='euclidean'` keyword is omitted because it was removed
# from AgglomerativeClustering in scikit-learn 1.4.
ac = AgglomerativeClustering(n_clusters=5, linkage='ward')
hemoDF, descSummary = runClustering(clusterModel=ac, data=data, origData=escapeHemo, labels=escapeLabels,
                                    modelName='AC_Hemo', columns=['MPAP', 'BPDIAS'], pca=False)
hemoDF

In [None]:
# Rank the ESCAPE clusters on each outcome from the per-cluster summary table.
scoreAsmts = assignScoresFromClusters(descSummary)
scoreAsmts

In [None]:
#TODO - MUST ALWAYS ADD ASMTS HERE
#Note - higher average scores mean worse outcomes
# Hand-entered final score per cluster, listed in cluster order (cluster 0 first).
# NOTE(review): presumably chosen by inspecting AveScores above - update whenever the clustering changes.
finalScores = [4, 5, 2, 3, 1]
scoreAsmts['FinalScores'] = finalScores
scoreAsmts

In [None]:
# Add the score columns, drop the cluster/outcome columns and write the ESCAPE result to CSV.
hemoDF = saveClusteredData(hemoDF, scoreAsmts,"Data/Preprocessed Data/ESCAPE_AC_Hemo.csv")
hemoDF

# Cluster ARIC Cohort Hemo

In [None]:
#Fix labels that only have death
# Add all-zero Rehosp/Readmission columns so the label frame carries all three outcome columns.
# NOTE(review): constant columns give every cluster the same mean, so any ranking derived from them is arbitrary.
cohortLabels['Rehosp']=0
cohortLabels['Readmission']=0
cohortLabels

In [None]:
# Build the clustering input for the ARIC cohort: infinities of either sign
# and missing values become 0 so the estimator only receives finite numbers.
# replace()/fillna() return new frames, so aricCohortHemo is left untouched.
hemo = aricCohortHemo.replace([np.inf, -np.inf], 0).fillna(0)
data = hemo

In [None]:
#TODO - update model here
# Load the cluster model saved by the ESCAPE run; the with-block closes the file handle.
# NOTE(review): pickle.load can execute arbitrary code - only load model files produced by this project.
with open('Score Label Preprocessing/ClusterModels/AC_Hemo.sav', 'rb') as modelFile:
    model = pickle.load(modelFile)
hemoDF, descSummary = runValidationClustering(clusterModel=model, data=data, origData=aricCohortHemo, labels=cohortLabels,
                                    columns=['MPAP', 'BPDIAS'], pca=False)
hemoDF

In [None]:
# Rank the ARIC cohort clusters on each outcome from the per-cluster summary table.
scoreAsmts = assignScoresFromClusters(descSummary)
scoreAsmts

In [None]:
#TODO - MUST ALWAYS ADD ASMTS HERE
#Note - higher average scores mean worse outcomes
# Hand-entered final score per cluster, listed in cluster order (cluster 0 first).
# NOTE(review): presumably chosen by inspecting the table above - update whenever the clustering changes.
finalScores = [2,1,4,5,3]
scoreAsmts['FinalScores'] = finalScores
scoreAsmts

In [None]:
# Add the score columns, drop the cluster/outcome columns and write the ARIC cohort result to CSV.
hemoDF = saveClusteredData(hemoDF, scoreAsmts,"Data Validation/ARIC/Preprocessed Data/ARIC_Cohort_AC_Hemo.csv")
hemoDF