In [None]:
import pandas as pd #make data-frames and summarise data
import numpy as np #manipulate multi-dimensional arrays
import os as os #file system read
import matplotlib.pyplot as plt #visualisation
import seaborn as sns
from scipy import stats #stats tests
from sklearn import preprocessing
from sklearn.cluster import KMeans
from scipy.stats import ttest_ind 
from scipy.stats import fisher_exact

In [None]:
# Read the ECMO dataset (first CSV column becomes the index) and drop the
# 'sx_v', 'neut' and 'cr' columns in a single chained expression.
ecmodf = pd.read_csv('data/ecmo.csv', index_col=0).drop(columns=['sx_v', 'neut', 'cr'])

# Per-column count of missing values before imputation.
missing_per_column = ecmodf.isnull().sum()
print(missing_per_column)

In [None]:
# Mean imputation: every missing value is replaced, in place, by the mean of
# its own column.
column_means = ecmodf.mean()
ecmodf.fillna(column_means, inplace=True)

# Recount the missing values after imputation.
print(ecmodf.isnull().sum())

In [None]:
# Store BMI as 64-bit integers.
# NOTE(review): if bmi holds fractional values this cast discards the
# fractional part - confirm that is intended.
bmi_as_int = ecmodf['bmi'].astype('int64')
ecmodf['bmi'] = bmi_as_int
ecmodf.head()

In [None]:
# One histogram per column of ecmodf, arranged in a 5x3 grid of subplots.
ecmodf.hist(layout=(5,3), figsize=(12,16)) #visual normality check

In [None]:
# D'Agostino-Pearson normality test applied to ecmodf; the result unpacks
# into the test statistics and the p-values.
normality_result = stats.normaltest(ecmodf)
sval = normality_result[0]
# Keep the p-values rounded to four decimal places.
pval = np.round(normality_result[1], 4)
pval

In [None]:
# Single-row table (row label 0) holding the rounded p-value of each column,
# labelled with the column names of ecmodf.
normality_df = pd.DataFrame(data=[pval], index=[0], columns=ecmodf.columns)
normality_df

In [None]:
# Label each variable from its p-value in row 0: below 0.05 is reported as
# "not normal", anything else as "normal".
norm = [
    "not normal" if normality_df.loc[0, column] < 0.05 else "normal"
    for column in normality_df
]
# Store the labels as row 1, underneath the p-values.
normality_df.loc[1] = norm
normality_df
            

In [None]:
# Work on a copy so the transformations applied to ecmodf_norm leave ecmodf unchanged.
ecmodf_norm = ecmodf.copy()

In [None]:
# Natural-log transform of the skewed variables. The day counts and the SOFA
# score are left out because they would not be expected to follow a power-law
# distribution.
# NOTE: this overwrites the columns of ecmodf_norm, so running the cell a
# second time without re-creating ecmodf_norm applies the log twice.
skewed_columns = ['ddim', 'ferritin', 'pct', 'nlrat', 'lymph', 'pplat', 'pco2', 'bmi']
for skewed_column in skewed_columns:
    ecmodf_norm[skewed_column] = np.log(ecmodf_norm[skewed_column])

In [None]:
# One histogram per column of ecmodf_norm (after the log transforms), in a 5x3 grid.
ecmodf_norm.hist(layout=(5,3), figsize=(12,16)) #visual normality check

In [None]:
# Min-max scale every column of ecmodf_norm. fit_transform returns a plain
# array, so it is wrapped in a new DataFrame (default integer index) that
# carries the original column names.
normalize = preprocessing.MinMaxScaler()
scaled_values = normalize.fit_transform(ecmodf_norm)
ecmodf_normalized = pd.DataFrame(scaled_values, columns=ecmodf_norm.columns)

# Histograms of the scaled columns.
ecmodf_normalized.hist(layout=(5,3), figsize=(12,16))

# Persist the scaled data, including the header row.
ecmodf_normalized.to_csv(r'ecmodf_norm.csv', header=True)

In [None]:
# Elbow method: fit k-means for k = 1..9 on the scaled data and record the
# inertia of each fit so the "elbow" in the curve can be read off the plot.
inertia = []
K = range(1,10)
for k in K:
    # random_state pins the centroid initialisation so the curve is identical
    # on every run; n_init is stated explicitly because its library default
    # differs between scikit-learn releases.
    kmeanModel = KMeans(n_clusters=k, init='k-means++', n_init=10, random_state=42)
    kmeanModel.fit(ecmodf_normalized)
    inertia.append(kmeanModel.inertia_)

plt.figure(figsize=(16,8))
plt.plot(K, inertia, 'bx-')
plt.xlabel('k')
plt.ylabel('inertia')
plt.title('The Elbow Method showing the optimal k')
plt.show()

In [None]:
# Final clustering with k = 3 on the scaled data.
# - n_jobs is no longer passed: the argument was removed from KMeans in
#   scikit-learn 1.0 and passing it raises a TypeError there.
# - random_state makes the fit (and therefore which cluster gets label 0/1/2)
#   reproducible, which the per-cluster analysis below depends on.
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=1000, random_state=42).fit(ecmodf_normalized)
centroids = kmeans.cluster_centers_
print(centroids)

# Columns of `centroids` follow the column order of ecmodf_normalized, so look
# up where ferritin and ddim sit instead of assuming they are columns 0 and 1.
ferritin_pos = ecmodf_normalized.columns.get_loc('ferritin')
ddim_pos = ecmodf_normalized.columns.get_loc('ddim')

plt.scatter(ecmodf_normalized['ferritin'], ecmodf_normalized['ddim'], c= kmeans.labels_.astype(float), s=50, alpha=0.5)
plt.scatter(centroids[:, ferritin_pos], centroids[:, ddim_pos], c='red', s=50)
plt.xlabel('ferritin')
plt.ylabel('ddim')
plt.show()

In [None]:
# Cluster label of every row, predicted from the scaled data.
pred = kmeans.predict(ecmodf_normalized)

# Attach the labels to a real copy of ecmodf. pd.DataFrame(ecmodf) does not
# copy: depending on the pandas version the new frame shares its data with
# ecmodf, so adding or renaming columns on it can silently change ecmodf too.
ecmodf_clustered = ecmodf.copy()
# pred is a plain array, so labels are assigned by row position.
ecmodf_clustered['cluster'] = pred
ecmodf_clustered

In [None]:
# Outcome data, indexed by the first CSV column.
# Coding: sex M=0, F=1; ethnic W=0, B=1, SA/EA=2; na = peak dose during admission
outcomesdf = pd.read_csv('outcomes.csv', index_col=0)

# Index-aligned left join: every row of the clustering frame is kept and the
# outcome columns are added alongside it.
ecmodf_analysis = ecmodf_clustered.join(outcomesdf, how='left')
ecmodf_analysis.head()

In [None]:
# Give the outcome columns their analysis dtypes: the three yes/no columns
# become booleans and the ethnicity code becomes a generic object column.
# NOTE(review): a missing value cast to bool becomes True - confirm the join
# left no gaps in these columns.
for flag_column in ('sex', 'death', 'rrt'):
    ecmodf_analysis[flag_column] = ecmodf_analysis[flag_column].astype('bool')
ecmodf_analysis['ethnic'] = ecmodf_analysis['ethnic'].astype('object')

# Save the combined analysis table, including the header row.
ecmodf_analysis.to_csv(r'ecmodf_analysis.csv', header=True)

In [None]:
# One DataFrame per cluster label (0, 1 and 2), selected with boolean masks
# on the 'cluster' column.
cluster_ids = ecmodf_analysis['cluster']
ecmoclus0 = ecmodf_analysis.loc[cluster_ids == 0]
ecmoclus1 = ecmodf_analysis.loc[cluster_ids == 1]
ecmoclus2 = ecmodf_analysis.loc[cluster_ids == 2]

In [None]:
# Summary statistics for cluster 0, using describe()'s default column selection.
ecmoclus0.describe()

In [None]:
# Summary for cluster 0 restricted to columns that are neither float64 nor int64.
ecmoclus0.describe(exclude=['float64', 'int64'])

In [None]:
# Summary statistics for cluster 1, using describe()'s default column selection.
ecmoclus1.describe()

In [None]:
# Summary for cluster 1 restricted to columns that are neither float64 nor int64.
ecmoclus1.describe(exclude=['float64', 'int64'])

In [None]:
# Summary statistics for cluster 2, using describe()'s default column selection.
ecmoclus2.describe()

In [None]:
# Summary for cluster 2 restricted to columns that are neither float64 nor int64.
ecmoclus2.describe(exclude=['float64', 'int64'])

In [None]:
# Number of rows assigned to each cluster label.
ecmodf_analysis['cluster'].value_counts() #counts numbers in each cluster

In [None]:
#steve stats code
# Table of per-cluster column means: one row per cluster (C0, C1, C2), one
# column per column of the analysis frame.
ecmodf_summ = pd.DataFrame(index = ['C0', 'C1', 'C2'] , columns = ecmodf_analysis.columns)

# Fill the table one cluster at a time, one column at a time.
frames_by_row = (('C0', ecmoclus0), ('C1', ecmoclus1), ('C2', ecmoclus2))
for row_label, cluster_frame in frames_by_row:
    for column in cluster_frame:
        ecmodf_summ.loc[row_label, column] = cluster_frame[column].mean(axis=0)

# Remove the categorical columns, whose mean is not a useful summary here.
for categorical_column in ('cluster', 'death', 'rrt', 'sex', 'ethnic'):
    del ecmodf_summ[categorical_column]

ecmodf_summ

In [None]:
# Mortality contingency tables: for each cluster, a 2x2 table of
# Alive/Dead counts in that cluster versus the other two clusters combined.

cluster_frames = (ecmoclus0, ecmoclus1, ecmoclus2)

# Survivors and deaths per cluster; 'death' is boolean, so ~ flips it to "alive".
alive_counts = [(~frame['death']).values.sum() for frame in cluster_frames]
dead_counts = [frame['death'].values.sum() for frame in cluster_frames]

mort_tables = []
for position, cluster_label in enumerate(('C0', 'C1', 'C2')):
    table = pd.DataFrame(index=['Alive', 'Dead'], columns=[cluster_label, 'Rest'])
    table.loc['Alive', cluster_label] = alive_counts[position]
    table.loc['Dead', cluster_label] = dead_counts[position]
    # "Rest" is the total over all clusters minus this cluster's own count.
    table.loc['Alive', 'Rest'] = sum(alive_counts) - alive_counts[position]
    table.loc['Dead', 'Rest'] = sum(dead_counts) - dead_counts[position]
    mort_tables.append(table)

mort0, mort1, mort2 = mort_tables

print(mort0)
print(mort1)
print(mort2)

In [None]:
# Fisher exact test on each mortality table (one cluster vs the rest).
# Each call returns the odds ratio and the p-value.

mort0OR, mort0PVAL = stats.fisher_exact(mort0)
mort1OR, mort1PVAL = stats.fisher_exact(mort1)
mort2OR, mort2PVAL = stats.fisher_exact(mort2)

# Report each comparison at the 0.05 level, followed by the p-value itself.
mortality_pvalues = (("C0", mort0PVAL), ("C1", mort1PVAL), ("C2", mort2PVAL))
for cluster_label, p_value in mortality_pvalues:
    if p_value < 0.05:
        print("there is a significant difference between mortality in " + cluster_label + " and all other patients")
    else:
        print("there is NOT a significant difference between mortality in " + cluster_label + " and all other patients")
    print(p_value)

In [None]:
# RRT contingency tables: for each cluster, a 2x2 table of RRT / No RRT
# counts in that cluster versus the other two clusters combined.
#
# All counts are taken from the 'rrt' column. The "Rest" column of the C2
# table previously mixed in cluster 0's 'death' counts (a copy-paste slip),
# which made that table - and the Fisher test run on it - wrong.

cluster_frames = {'C0': ecmoclus0, 'C1': ecmoclus1, 'C2': ecmoclus2}

# 'rrt' is boolean, so summing counts the True values and ~ flips to "no RRT".
rrt_counts = {label: frame['rrt'].values.sum() for label, frame in cluster_frames.items()}
no_rrt_counts = {label: (~frame['rrt']).values.sum() for label, frame in cluster_frames.items()}

rrt_tables = {}
for label in cluster_frames:
    table = pd.DataFrame(index=['RRT', 'No RRT'], columns=[label, 'Rest'])
    table.loc['No RRT', label] = no_rrt_counts[label]
    table.loc['RRT', label] = rrt_counts[label]
    # "Rest" adds up the same quantity over the two other clusters.
    table.loc['No RRT', 'Rest'] = sum(count for other, count in no_rrt_counts.items() if other != label)
    table.loc['RRT', 'Rest'] = sum(count for other, count in rrt_counts.items() if other != label)
    rrt_tables[label] = table

rrt0 = rrt_tables['C0']
rrt1 = rrt_tables['C1']
rrt2 = rrt_tables['C2']

print(rrt0)
print(rrt1)
print(rrt2)

In [None]:
# Fisher exact test on each RRT table (one cluster vs the rest).
# Each call returns the odds ratio and the p-value.

rrt0OR, rrt0PVAL = stats.fisher_exact(rrt0)
rrt1OR, rrt1PVAL = stats.fisher_exact(rrt1)
rrt2OR, rrt2PVAL = stats.fisher_exact(rrt2)

# Report each comparison at the 0.05 level, followed by the p-value itself.
rrt_pvalues = (("C0", rrt0PVAL), ("C1", rrt1PVAL), ("C2", rrt2PVAL))
for cluster_label, p_value in rrt_pvalues:
    if p_value < 0.05:
        print("there is a significant difference between RRT in " + cluster_label + " and all other patients")
    else:
        print("there is NOT a significant difference between RRT in " + cluster_label + " and all other patients")
    print(p_value)

In [None]:
# Partition the analysis table by outcome using the 'death' column.

survived_mask = ecmodf_analysis['death'] == False
died_mask = ecmodf_analysis['death'] == True
ecmodf_alive = ecmodf_analysis.loc[survived_mask]
ecmodf_dead = ecmodf_analysis.loc[died_mask]

In [None]:
# Two-sided Mann-Whitney U test for each variable, alive vs dead.
# This is a rank-based comparison of the two distributions, not a comparison
# of means.

print("""Compare variables between alive and dead for statistical significance
----------------------------------------------------------------------
      """)
for column in ecmodf_alive:
    # Skip 'death' (it is the grouping variable, so the two groups differ by
    # construction) and object-dtype columns (nominal codes such as 'ethnic',
    # for which ranks carry no meaning).
    if column == 'death' or ecmodf_alive[column].dtype == object:
        continue
    a = ecmodf_alive[column]
    b = ecmodf_dead[column]
    stat, pval = stats.mannwhitneyu(a, b, alternative='two-sided')
    if pval < 0.05:
        print(str(column) + " difference is statistically significant with a p-value of: " + str(pval))
    else:
        # Plain else so every tested column is reported: two separate
        # `< 0.05` / `> 0.05` checks print nothing when p is exactly 0.05.
        print(str(column) + " difference is NOT statistically significant with a p-value of: " + str(pval))

In [None]:
# Human-readable column labels for plotting, listed in the same order as the
# columns of ecmodf_clustered (the last one labels the cluster column).

nicenames = [
    'Age (years)',
    'BMI (kg/m2)',
    'Admission to IMV (days)',
    'Ventilation to ECMO (days)',
    'SOFA score',
    'PF Ratio',
    'pCO2 (kPa)',
    'Pplat (cmH2O)',
    'Lymphocyte count',
    'N/L Ratio',
    'Procalcitonin',
    'Ferritin',
    'CRP',
    'Fibrinogen',
    'D-Dimer',
    'Cluster ID',
]

# ecmodf_names is a second name for the same DataFrame object, not a copy, so
# assigning the labels also renames the columns seen through ecmodf_clustered.
ecmodf_names = ecmodf_clustered
ecmodf_names.columns = nicenames

In [None]:
# Box plot for each variable by cluster: one figure per variable, clusters on
# the x axis, white square marking the mean, outliers hidden.

# Apply the theme once, before any figure exists, so every figure (including
# the first) is styled the same way. The axis-label size is passed through
# `rc` because sns.set() with font_scale rewrites "axes.labelsize"; setting it
# on plt.rcParams beforehand is overwritten and has no effect.
sns.set(style='whitegrid', palette='pastel', font_scale=2, rc={"axes.labelsize": 30})

# Every column except the cluster label is a variable to plot.
value_columns = [name for name in ecmodf_names.columns if name != 'Cluster ID']

for column in value_columns:
        plt.figure(figsize=(10,10))
        # Read from ecmodf_names, the frame that carries the plot labels.
        sns.boxplot(data = ecmodf_names, x = 'Cluster ID', y=column, showmeans=True, showfliers=False, saturation=0.9, meanprops={"marker":"s","markerfacecolor":"white", "markeredgecolor":"grey"})

In [None]:
# Make DF suitable for strip plot: scaled values plus cluster label, with
# plot-friendly column names, reshaped to long format.

# Start from a real copy. pd.DataFrame(ecmodf_normalized) does not copy, so
# adding the label column and renaming could alter ecmodf_normalized itself;
# re-running this cell (or the clustering cells) would then see 16 renamed
# columns instead of the 15 scaled features.
ecmodf_normalized_clustered = ecmodf_normalized.copy()
ecmodf_normalized_clustered['cluster'] = pred

# Both names refer to the relabelled frame, as before.
ecmodf_normalized_names = ecmodf_normalized_clustered
ecmodf_normalized_names.columns = nicenames

# Long format: one row per (patient, variable) pair, keyed by 'Cluster ID'.
melted = pd.melt(ecmodf_normalized_names, id_vars='Cluster ID')

In [None]:
# Plot strip plot: every scaled value, grouped by variable on the x axis and
# coloured by cluster.

plt.figure(figsize=(10,15))
stripplot = sns.stripplot(data=melted, x='variable', y='value', hue='Cluster ID', jitter=0.2, alpha=0.6, dodge=False, palette='bright')
stripplot.set_xticklabels(stripplot.get_xticklabels(), rotation=90, horizontalalignment='right', fontweight='light', fontsize='large')
stripplot.set_ylabel("Normalised Scale", fontweight='light', fontsize='large')
stripplot.set_title('Distribution of data and clusters by variable', fontweight='light', fontsize='xx-large')

# Save into ./figures. The path must be one string: `figures/"..."` referred
# to an undefined name `figures` and raised NameError. The directory is
# created first so the save does not fail on a fresh checkout.
os.makedirs("figures", exist_ok=True)
plt.savefig("figures/ECMOClusterStripPlot3.png")