In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind 
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
from scipy.stats import f_oneway
from scipy.stats import normaltest

import matplotlib.pyplot as plt 
import seaborn as sns

pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 500)

In [None]:
# Load the per-patient cluster/outcome table (first CSV column is the index).
ecmodf_analysis = pd.read_csv('data/ecmodf_clusters_outcomes.csv', index_col=0)

# Binary outcome/flag columns are stored as booleans.
# NOTE(review): astype('bool') turns NaN into True - confirm these columns have no missing values.
for flag_col in ('sex', 'death', 'rrt', 'ptx', 'pe', 'bronchinf'):
    ecmodf_analysis[flag_col] = ecmodf_analysis[flag_col].astype('bool')

# Ethnicity is a category code, not a quantity, so keep it as object.
ecmodf_analysis['ethnic'] = ecmodf_analysis['ethnic'].astype('object')

# Columns not used by any later cell of this notebook.
for unused_col in ('admit_date', 'hosp'):
    del ecmodf_analysis[unused_col]

# Comorbidity and steroid tables are joined column-wise on the shared index.
ecmodf_comorb = pd.read_csv('data/ecmo_comorb.csv', index_col=0)
ecmodf_steroids = pd.read_csv('data/ecmo_steroids.csv', index_col=0)
for extra_df in (ecmodf_comorb, ecmodf_steroids):
    ecmodf_analysis = pd.concat([ecmodf_analysis, extra_df], axis=1)

In [None]:
ecmodf_analysis.head()

In [None]:
ecmodf_analysis['cluster'].value_counts() #counts numbers in each cluster

## Survivors vs non-survivors comparison

In [None]:
## DO NOT DROP the source columns: they are used for later subanalysis.
## NOTE: this cell is not idempotent - re-running it appends the dummy columns a second time.

ethnic_dummies = pd.get_dummies(ecmodf_analysis['ethnic'], prefix='ethnic')
cth_dummies = pd.get_dummies(ecmodf_analysis['cth'])            # column names are the raw 'cth' values
deathmod_dummies = pd.get_dummies(ecmodf_analysis['deathmod'])  # column names are the raw 'deathmod' values

ecmodf_analysis = pd.concat(
    [ecmodf_analysis, ethnic_dummies, cth_dummies, deathmod_dummies],
    axis=1,
)

ecmodf_analysis.head()

In [None]:
# Categorical (boolean / dummy) variables summarised as "count (percent%)".
cat = [
    'sex', 'death', 'bronchinf', 'rrt', 'pe', 'ptx',
    'ethnic_0', 'ethnic_1', 'ethnic_2',
    'cth_bleed', 'cth_stroke',
    'death_bleed', 'death_mof', 'death_stroke', 'death_tamponade', 'death_withdraw',
    'asthma', 'diabetes', 'hypertension',
    'presteroid', 'ecmosteroid',
]

# Continuous variables summarised as "median [IQR]".
con = [
    'age', 'bmi', 'ed_v', 'v_vv', 'sofa', 'pfr', 'pco2', 'pplat',
    'lymph', 'nlrat', 'pct', 'ferritin', 'crp', 'fib', 'ddim',
    'na', 'time', 'resp', 'ttsteroids',
]

# Naming is counter-intuitive: pop1 holds the rows with death == 0 (survivors),
# pop0 holds the rows with death == 1 (non-survivors).
pop1 = ecmodf_analysis.loc[ecmodf_analysis['death'] == 0]
pop0 = ecmodf_analysis.loc[ecmodf_analysis['death'] == 1]


In [None]:
def iqr(series):
    """Format the interquartile range of `series` as the string "[q25 - q75]".

    Missing values are dropped first; both bounds are rounded to one decimal place.
    """
    observed = series.dropna()  # np.percentile would otherwise propagate NaN
    upper, lower = np.percentile(observed, [75, 25])
    lower_txt = str(np.round(lower, decimals=1))
    upper_txt = str(np.round(upper, decimals=1))
    return "[" + lower_txt + " - " + upper_txt + "]"

In [None]:
# Empty results table: one column per variable (categorical first, then continuous);
# rows are the whole population, survivors, non-survivors and the comparison p-value.
cohort_summ = pd.DataFrame(index=['pop', 'alive', 'dead', 'p'], columns = cat + con)

In [None]:
# Descriptive rows of the cohort table. The 'p' row is filled by a later cell.
# pop1 holds the survivors (death == 0) and pop0 the non-survivors (death == 1).
cohort_groups = {'pop': ecmodf_analysis, 'alive': pop1, 'dead': pop0}

for column in cohort_summ.columns:
    for row, group_df in cohort_groups.items():
        values = group_df[column]

        if column in cat:
            # Count of positives and percentage of the non-missing observations.
            n_positive = values.sum()
            n_observed = len(values) - values.isnull().sum()
            pct_positive = np.round((n_positive / n_observed) * 100, decimals=1)
            cohort_summ.loc[row, column] = str(n_positive) + " (" + str(pct_positive) + "%)"

        if column in con:
            # Median followed by the bracketed interquartile range.
            mid = np.round(values.median(axis=0), decimals=1)
            cohort_summ.loc[row, column] = str(mid) + ' ' + iqr(values)
                

In [None]:
# p-value row of the cohort table: survivors (pop1) vs non-survivors (pop0).

# Continuous variables: two-sided Mann-Whitney U test on the non-missing values.
for i in con:
    sval, pval = mannwhitneyu(pop1[i].dropna(), pop0[i].dropna(), alternative = 'two-sided')
    cohort_summ.loc['p', i] = np.round(pval, decimals = 6)

# Categorical variables: Fisher's exact test when the contingency table is 2x2,
# chi-squared test otherwise. The table shape is checked explicitly instead of
# relying on a bare `except:`, which would also swallow KeyboardInterrupt and
# hide unrelated failures.
for i in cat:
    table = pd.crosstab(ecmodf_analysis['death'], ecmodf_analysis[i].dropna())
    if table.shape == (2, 2):
        OR, p = fisher_exact(table)
    else:
        stat, p, dof, expected = chi2_contingency(table)
    cohort_summ.loc['p', i] = np.round(p, decimals = 6)

In [None]:
cohort_summ.transpose()

## Clusters initial analysis

In [None]:
# Counts of each 'deathmod' value per cluster; the column labels of this table
# are reused by the next cell as column names of ecmodf_analysis.
deathmod = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis['deathmod'])
deathmod

In [None]:
# Chi-squared test of each 'deathmod' indicator column against cluster membership.
for i in deathmod.columns:
    cluster_table = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis[i])
    stat, pval, dof, expected = chi2_contingency(cluster_table)
    verdict = "a" if pval < 0.05 else "NOT a"
    print("There is " + verdict + " significant difference in " + str(i) + " between clusters - pval = " + str(pval))

In [None]:
# Counts of each 'cth' value per cluster; the column labels of this table
# are reused by the next cell as column names of ecmodf_analysis.
ctfinds = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis['cth'])
ctfinds

In [None]:
# Chi-squared test of each 'cth' indicator column against cluster membership.
for i in ctfinds.columns:
    cluster_table = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis[i])
    stat, pval, dof, expected = chi2_contingency(cluster_table)
    verdict = "a" if pval < 0.05 else "NOT a"
    print("There is " + verdict + " significant difference in " + str(i) + " between clusters - pval = " + str(pval))

In [None]:
# Counts of each 'bronch0' value per cluster.
# NOTE: must run before the next cell, which drops 'bronch0' from ecmodf_analysis.
firstbronch = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis['bronch0'])
firstbronch

In [None]:
# Replace 'bronch0' with one indicator column per observed value.
# NOTE: not re-runnable - a second execution fails because 'bronch0' is already gone.
bronch0_dummies = pd.get_dummies(ecmodf_analysis['bronch0'])
ecmodf_analysis = pd.concat([ecmodf_analysis, bronch0_dummies], axis=1).drop(columns='bronch0')

In [None]:
# Chi-squared test of each 'bronch0' indicator column against cluster membership.
for i in firstbronch.columns:
    cluster_table = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis[i])
    stat, pval, dof, expected = chi2_contingency(cluster_table)
    verdict = "a" if pval < 0.05 else "NOT a"
    print("There is " + verdict + " significant difference in " + str(i) + " between clusters - pval = " + str(pval))

In [None]:
# Cluster x bronchinf contingency table and its chi-squared p-value.
pathbronch = pd.crosstab(index=ecmodf_analysis['cluster'], columns=ecmodf_analysis['bronchinf'])
stat, pval, dof, expected = chi2_contingency(pathbronch)
print("p = {}".format(str(pval)))
pathbronch

## Generate three-way results table

In [None]:
ecmodf_analysis.head()

In [None]:
# One dataframe per cluster label (0, 1, 2), used by every later per-cluster summary.
ecmoclus0, ecmoclus1, ecmoclus2 = (
    ecmodf_analysis[ecmodf_analysis['cluster'] == cluster_id]
    for cluster_id in (0, 1, 2)
)

In [None]:
# Variable lists for the per-cluster tables. These REPLACE the lists defined for the
# survivor/non-survivor table: the order differs and 'death_ich' is added to `cat`.

# Continuous variables, summarised as "median [IQR]".
con = [
    'age', 'bmi', 'ed_v', 'v_vv', 'sofa', 'pfr', 'pco2', 'pplat',
    'lymph', 'nlrat', 'pct', 'ferritin', 'crp', 'fib', 'ddim',
    'time', 'na', 'resp', 'ttsteroids',
]

# Categorical variables, summarised as "count (percent%)".
cat = [
    'death', 'sex', 'rrt', 'pe', 'ptx',
    'diabetes', 'asthma', 'hypertension',
    'death_bleed',
    'ethnic_0', 'ethnic_1', 'ethnic_2',
    'death_ich', 'death_mof', 'death_stroke', 'death_tamponade', 'death_withdraw',
    'cth_bleed', 'cth_stroke',
    'bronchinf', 'presteroid', 'ecmosteroid',
]


In [None]:
def iqr(series):
    """Format the interquartile range of `series` as the string "[q25 - q75]".

    Missing values are dropped first; both bounds are rounded to one decimal place.
    This repeats the definition given earlier in the notebook.
    """
    observed = series.dropna()  # np.percentile would otherwise propagate NaN
    upper, lower = np.percentile(observed, [75, 25])
    bounds = [str(np.round(bound, decimals=1)) for bound in (lower, upper)]
    return "[" + " - ".join(bounds) + "]"

iqr(ecmodf_analysis['age']) #test

In [None]:
# Empty three-way results table: one column per variable (continuous first, then
# categorical); rows are the whole cohort, each cluster, and the across-cluster p-value.
ecmodf_final = pd.DataFrame(index=['all', 'C0', 'C1', 'C2', 'p'], columns = con + cat)
ecmodf_final

In [None]:
# Descriptive rows of the three-way table. The 'p' row is filled by a later cell.
final_groups = {
    'all': ecmodf_analysis,
    'C0': ecmoclus0,
    'C1': ecmoclus1,
    'C2': ecmoclus2,
}

for column in ecmodf_final.columns:
    for row, group_df in final_groups.items():
        values = group_df[column]

        if column in cat:
            # Count of positives and percentage of the non-missing observations.
            n_positive = values.sum()
            n_observed = len(values) - values.isnull().sum()
            pct_positive = np.round((n_positive / n_observed) * 100, decimals=1)
            ecmodf_final.loc[row, column] = str(n_positive) + " (" + str(pct_positive) + "%)"

        if column in con:
            # Median followed by the bracketed interquartile range.
            mid = np.round(values.median(axis=0), decimals=1)
            ecmodf_final.loc[row, column] = str(mid) + ' ' + iqr(values)


In [None]:
# p-value row of the three-way table.

# Continuous variables: Kruskal-Wallis test across the three clusters (non-missing values only).
for i in con:
    stat, pval = kruskal(ecmoclus0[i].dropna(), ecmoclus1[i].dropna(), ecmoclus2[i].dropna())
    ecmodf_final.loc['p', i] = np.round(pval, decimals = 6)

# Categorical variables: Fisher's exact test only when the cluster x variable table is
# 2x2, chi-squared test otherwise (the usual case with three clusters). The shape is
# checked explicitly instead of relying on a bare `except:`, which would also swallow
# KeyboardInterrupt and hide unrelated failures.
for i in cat:
    table = pd.crosstab(ecmodf_analysis['cluster'], ecmodf_analysis[i].dropna())
    if table.shape == (2, 2):
        OR, p = fisher_exact(table)
    else:
        stat, p, dof, expected = chi2_contingency(table)
    ecmodf_final.loc['p', i] = np.round(p, decimals = 6)

In [None]:
ecmodf_final.transpose()

## Cluster 0 vs Cluster 2

In [None]:
# Pairwise table for cluster 0 vs cluster 2; the 'p' row is filled further down this cell.
ecmodf_0v2 = pd.DataFrame(index=['C0', 'C2', 'p'], columns = con + cat)

pair_groups = {'C0': ecmoclus0, 'C2': ecmoclus2}

for column in ecmodf_0v2.columns:
    for row, group_df in pair_groups.items():
        values = group_df[column]

        if column in cat:
            # Count of positives and percentage of the non-missing observations.
            n_positive = values.sum()
            n_observed = len(values) - values.isnull().sum()
            pct_positive = np.round((n_positive / n_observed) * 100, decimals=1)
            ecmodf_0v2.loc[row, column] = str(n_positive) + " (" + str(pct_positive) + "%)"

        if column in con:
            # Median followed by the bracketed interquartile range.
            mid = np.round(values.median(axis=0), decimals=1)
            ecmodf_0v2.loc[row, column] = str(mid) + ' ' + iqr(values)
                
# Continuous variables: two-sided Mann-Whitney U test, cluster 0 vs cluster 2.
for i in con:
    sval, pval = mannwhitneyu(ecmoclus0[i].dropna(), ecmoclus2[i].dropna(), alternative = 'two-sided')
    ecmodf_0v2.loc['p', i] = np.round(pval, decimals = 6)

# Rows of both clusters stacked, so crosstab can pair 'cluster' with each variable.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
tempanalysis = pd.concat([ecmoclus0, ecmoclus2])

# Categorical variables: Fisher's exact test. There is deliberately no chi-squared
# fallback here, so a variable whose table is not 2x2 raises instead of being
# silently tested with a different method.
for i in cat:
    OR, p = fisher_exact(pd.crosstab(tempanalysis['cluster'], tempanalysis[i].dropna()))
    ecmodf_0v2.loc['p', i] = np.round(p, decimals = 6)

In [None]:
ecmodf_0v2.transpose()

## Positive bronch results

In [None]:
# Split the cohort by the 'bronchinf' flag: ecmobronch0 = flag False, ecmobronch1 = flag True.
bronchinf_flag = ecmodf_analysis['bronchinf']
ecmobronch0 = ecmodf_analysis[bronchinf_flag == False]
ecmobronch1 = ecmodf_analysis[bronchinf_flag == True]

In [None]:
# Categorical variables reported in the bronchinf comparison (a subset of `cat`).
cat2 = ['death', 'sex', 'rrt', 'pe', 'ptx', 'death_bleed', 'death_ich', 'death_mof', 'death_stroke', 'death_tamponade', 'death_withdraw']

# Empty results table: rows are bronchinf False, bronchinf True and the p-value.
ecmodf_bronch = pd.DataFrame(index=['bronch0', 'bronch1', 'p'], columns = con + cat2)
ecmodf_bronch

In [None]:
# Descriptive rows of the bronchinf table.
# Fixes relative to the previous version of this cell:
#   * the 'bronch1' median was taken from ecmoclus2 (copy-paste slip), so the median
#     and the IQR printed next to it described different patient groups;
#   * categorical membership is tested against cat2, the list this table was built from.
bronch_groups = {'bronch0': ecmobronch0, 'bronch1': ecmobronch1}

for column in ecmodf_bronch.columns:
    for row, group_df in bronch_groups.items():
        values = group_df[column]

        if column in cat2:
            # Count of positives and percentage of the non-missing observations.
            count = values.sum()
            percent = np.round((count / (len(values) - values.isnull().sum())) * 100, decimals=1)
            ecmodf_bronch.loc[row, column] = str(count) + " (" + str(percent) + "%)"

        if column in con:
            # Median followed by the bracketed interquartile range, both from the same group.
            median = np.round(values.median(axis=0), decimals=1)
            ecmodf_bronch.loc[row, column] = str(median) + ' ' + iqr(values)

# Continuous variables: two-sided Mann-Whitney U test between the two bronchinf groups.
for i in con:
    sval, pval = mannwhitneyu(ecmobronch0[i].dropna(), ecmobronch1[i].dropna(), alternative = 'two-sided')
    ecmodf_bronch.loc['p', i] = np.round(pval, decimals = 6)

# Categorical variables: Fisher's exact test for 2x2 tables, chi-squared otherwise.
# The shape is checked explicitly instead of relying on a bare `except:`.
for i in cat2:
    table = pd.crosstab(ecmodf_analysis['bronchinf'], ecmodf_analysis[i].dropna())
    if table.shape == (2, 2):
        OR, p = fisher_exact(table)
    else:
        stat, p, dof, expected = chi2_contingency(table)
    ecmodf_bronch.loc['p', i] = np.round(p, decimals = 6)
        

In [None]:
ecmodf_bronch.transpose()

## Survival analysis

In [None]:
# Time-to-event table (first CSV column is the index). Later cells read its
# 'death', 'time', 'time2', 'cluster', 'hosp' and 'hiferritin' columns.
ecmodf_survival = pd.read_csv('data/ecmo_time.csv', index_col=0)
ecmodf_survival.head()

In [None]:
from lifelines import KaplanMeierFitter

# Whole-cohort Kaplan-Meier curve: 'time2' is the duration, 'death' the event flag.
alldeath = ecmodf_survival[['death', 'time2']].copy()

# `kmf` is re-fitted and reused by the per-cluster plot in a later cell.
kmf = KaplanMeierFitter()
kmf.fit(alldeath['time2'], alldeath['death'], label = "death")
kmf.plot(ci_show = True)

In [None]:
# One survival dataframe per cluster label (0, 1, 2).
timeclus0, timeclus1, timeclus2 = (
    ecmodf_survival[ecmodf_survival['cluster'] == cluster_id]
    for cluster_id in (0, 1, 2)
)

In [None]:
# Kaplan-Meier inputs per cluster: tN = durations ('time2'), iN = event flags ('death').
t0, i0 = timeclus0['time2'], timeclus0['death']
t1, i1 = timeclus1['time2'], timeclus1['death']
t2, i2 = timeclus2['time2'], timeclus2['death']

In [None]:
sns.set(style='white')

plt.figure(figsize=(10,6))

# (durations, event flags, legend name) in plotting order. Cluster 1 is deliberately
# shown as "Phenotype 1" and cluster 0 as "Phenotype 2", as in the original figure.
km_groups = [
    (t1, i1, 'Phenotype 1'),
    (t0, i0, 'Phenotype 2'),
    (t2, i2, 'Phenotype 3'),
]

# The group size in each legend label is taken from the data rather than typed in,
# so the legend cannot drift out of sync if the cluster assignment changes.
for durations, events, phenotype in km_groups:
    kmf.fit(durations, events, label=phenotype + ' (n=' + str(len(durations)) + ')')
    a1 = kmf.plot(ci_show = False)  # every curve is drawn on the current axes

# NOTE(review): the y label says "(%)" - confirm the axis really is in percent and not 0-1.
a1.set_ylabel('Survival Probability (%)', fontsize=14, labelpad=12)
a1.set_xlabel('Days after ECMO initiation', fontsize=14, labelpad=12)


In [None]:
from lifelines import CoxPHFitter

# Build the Cox regression design matrix: start from the survival table and add
# covariates from ecmodf_analysis under human-readable names (these names become the
# row labels of the model summary and of the forest plot).
# NOTE(review): the column assignments below align on the index - assumes
# ecmodf_survival and ecmodf_analysis share the same patient index; confirm.
ecmodf_cox = ecmodf_survival.drop(columns = ['hosp', 'time2'])

ecmodf_cox['SOFA score'] = ecmodf_analysis['sofa']

#ecmodf_cox['Ventilation days'] = ecmodf_analysis['v_vv']

#ecmodf_cox['Diabetes'] = ecmodf_analysis['diabetes']
#ecmodf_cox['Hypertension'] = ecmodf_analysis['hypertension']
#ecmodf_cox['Asthma'] = ecmodf_analysis['asthma']

# log2 transform, so the hazard ratio is expressed per doubling of D-dimer.
ecmodf_cox['D-dimer (per doubling)'] = ecmodf_analysis['ddim']
ecmodf_cox['D-dimer (per doubling)'] = np.log2(ecmodf_cox['D-dimer (per doubling)'])

#ecmodf_cox['Ferritin (per doubling)'] = ecmodf_analysis['ferritin']
#ecmodf_cox['Ferritin (per doubling)'] = np.log2(ecmodf_cox['Ferritin (per doubling)'])

#ecmodf_cox['N:L ratio'] = ecmodf_analysis['nlrat']

#ecmodf_cox['HI group'] = ecmodf_cox['hiferritin']
ecmodf_cox.drop(['hiferritin'], inplace=True, axis=1)

ecmodf_cox['Body mass index'] = ecmodf_analysis['bmi']
ecmodf_cox['Age'] = ecmodf_analysis['age']
ecmodf_cox['RESP score'] = ecmodf_analysis['resp']

#ecmodf_cox['Procalcitonin (per doubling)'] = ecmodf_analysis['pct']
#ecmodf_cox['Procalcitonin (per doubling)'] = np.log2(ecmodf_cox['Procalcitonin (per doubling)'])

# Sex enters the model as a single "Male sex" indicator (the female_False dummy).
ecmodf_cox['sex'] = ecmodf_analysis['sex'] #1 = female
ecmodf_cox = pd.concat([ecmodf_cox, pd.get_dummies(ecmodf_cox['sex'], drop_first = False, prefix = 'female')], axis=1)
ecmodf_cox['Male sex'] = ecmodf_cox['female_False']
ecmodf_cox.drop(['sex', 'female_True', 'female_False'], inplace=True, axis=1)

# Ethnicity enters as two indicators; ethnic_0 is dropped and so acts as the reference level.
ecmodf_cox['ethnic'] = ecmodf_analysis['ethnic'] #0 = white, #1 = black, #2 = other
ecmodf_cox = pd.concat([ecmodf_cox, pd.get_dummies(ecmodf_cox['ethnic'], prefix = 'ethnic')], axis=1)
ecmodf_cox['Black ethnicity (vs white)'] = ecmodf_cox['ethnic_1']
# NOTE(review): code 2 is described as "other" in the comment above but labelled "Asian" here - confirm.
ecmodf_cox['Asian ethnicity (vs white)'] = ecmodf_cox['ethnic_2']
#ecmodf_cox['White ethnicity (vs other)'] = ecmodf_cox['ethnic_0']
ecmodf_cox.drop(['ethnic', 'ethnic_0', 'ethnic_2', 'ethnic_1'], inplace=True, axis=1)

#ecmodf_cox = pd.concat([ecmodf_cox, pd.get_dummies(ecmodf_cox['cluster'], prefix = 'cluster')], axis=1)
#ecmodf_cox.drop(['cluster', 'cluster_0', 'cluster_2'], inplace=True, axis=1)

# Cluster membership is not a covariate in this model.
ecmodf_cox.drop(['cluster'], inplace=True, axis=1)

ecmodf_cox


In [None]:
## Variance inflation factor (VIF) to detect collinearity.
## Linear relationships between the independent variables of a model make its
## coefficients (and hence the hazard ratios) unstable.
## Regressing each variable on all the other independent variables gives its
## tolerance (1 - R^2); the VIF is the reciprocal of the tolerance.
## High VIF = high multicollinearity.

from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# A constant column is added so the auxiliary regressions include an intercept.
X = add_constant(ecmodf_cox) #requires constants column as reference(?)
# Outcome columns (event flag and duration) are not predictors, so they are removed.
X.drop(['death', 'time'], inplace=True, axis=1)


In [None]:
## There is no formal criterion, but a VIF > 4 for any variable suggests action is required,
## as does a VIF that is markedly different from those of the other variables.

# One VIF per column of X (including the added constant), labelled by column name.
pd.Series([variance_inflation_factor(X.values, i) 
               for i in range(X.shape[1])], 
              index=X.columns)

In [None]:
# Cox proportional hazards model on every remaining column of ecmodf_cox,
# with 'time' as the duration and 'death' as the event indicator.
cph = CoxPHFitter()

cph.fit(ecmodf_cox, 'time', event_col='death')

cph.print_summary()

## exp(coef) gives hazard ratios

In [None]:
cph.log_likelihood_ratio_test()

In [None]:
# Forest plot of the hazard ratios of the fitted Cox model, saved as figure 1.
# NOTE: the rcParams change is global and affects every later figure in the notebook.
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Tahoma']

fig, ax = plt.subplots(figsize=(10, 7))

plt.xlim(-0.5, 4.2)

cph.plot(hazard_ratios=True, ax = ax, elinewidth = 2, capsize = 3, fmt='o', ecolor='lightgray')

#xlolims=True,

plt.tight_layout()

# NOTE(review): assumes the 'figures/' directory already exists - savefig does not create it.
plt.savefig('figures/fig1.png')


In [None]:
### For partial effects charts for clusters

# NOTE(review): this CSV is read from the working directory, unlike the other inputs
# which live under 'data/' - confirm the path.
ecmodf_survival2 = pd.read_csv('ecmodf_analysis_survival.csv', index_col=0)

# 'sofa' and 'resp' are dropped and re-added under display names, which also moves
# them to the end of the column order.
ecmodf_cox2 = ecmodf_survival2.drop(columns = ['hosp', 'time2', 'sofa', 'resp'])

ecmodf_cox2['SOFA score'] = ecmodf_survival2['sofa']

ecmodf_cox2['RESP score'] = ecmodf_survival2['resp']

# Cluster enters as indicator columns; cluster_2 is dropped and so acts as the reference level.
ecmodf_cox2 = pd.concat([ecmodf_cox2, pd.get_dummies(ecmodf_cox2['cluster'], prefix = 'cluster')], axis=1)
ecmodf_cox2.drop(['cluster', 'cluster_2'], inplace=True, axis=1)

#ecmodf_cox2['Hyperinflammatory Cluster'] = ecmodf_survival2['hicluster']
#ecmodf_cox2.drop(['cluster', 'hicluster'], inplace=True, axis=1)



In [None]:
# Second Cox model, this time including the cluster indicators.
# NOTE: this rebinds `cph`, so the earlier model is no longer available after this cell.
cph = CoxPHFitter()

cph.fit(ecmodf_cox2, 'time', event_col='death')

cph.print_summary()


In [None]:
# Forest plot of the hazard ratios of the cluster model (whichever model `cph` currently holds).
fig, ax = plt.subplots(figsize=(10, 10))

plt.xlim(0, 8)

cph.plot(hazard_ratios=True, ax = ax, elinewidth = 2, capsize = 3, fmt='o', ecolor='lightgray')

In [None]:
# Partial-effects curves for the cluster_1 indicator set to 0 and to 1 (baseline curve hidden).
fig, ax = plt.subplots(figsize=(10, 6))

plt.xlim(0, 55)

cph.plot_partial_effects_on_outcome(covariates='cluster_1', values=[0, 1], plot_baseline=False, ax=ax, cmap='coolwarm')




## Cluster vs Others

In [None]:
# Mortality contingency tables: each cluster compared against the other two pooled.

death_flags = {'C0': ecmoclus0['death'], 'C1': ecmoclus1['death'], 'C2': ecmoclus2['death']}


def _mortality_vs_rest(label):
    """Return the 2x2 Alive/Dead table for cluster `label` versus the remaining clusters."""
    own = death_flags[label]
    others = [flags for name, flags in death_flags.items() if name != label]
    table = pd.DataFrame(index=['Alive', 'Dead'], columns=[label, 'Rest'])
    table.loc['Alive', label] = (~own).values.sum()
    table.loc['Dead', label] = own.values.sum()
    table.loc['Alive', 'Rest'] = sum((~flags).values.sum() for flags in others)
    table.loc['Dead', 'Rest'] = sum(flags.values.sum() for flags in others)
    return table


mort0, mort1, mort2 = (_mortality_vs_rest(label) for label in ('C0', 'C1', 'C2'))

for mort_table in (mort0, mort1, mort2):
    print(mort_table)

# Fisher exact test to compare mortality of each cluster vs the rest

mort0OR, mort0PVAL = fisher_exact(mort0)
mort1OR, mort1PVAL = fisher_exact(mort1)
mort2OR, mort2PVAL = fisher_exact(mort2)

for label, mort_pval in (('C0', mort0PVAL), ('C1', mort1PVAL), ('C2', mort2PVAL)):
    if mort_pval < 0.05:
        print("there is a significant difference between mortality in " + label + " and all other patients")
    else:
        print("there is NOT a significant difference between mortality in " + label + " and all other patients")
    print(mort_pval)

In [None]:
# RRT contingency tables: each cluster compared against the other two pooled.
#
# The tables are built by one helper so every cell is filled from the 'rrt' column.
# The previous hand-written version filled the C2 "Rest" column from
# ecmoclus0['death'] (copy-paste slip), which made the C2 table and p-value wrong.

rrt_flags = {'C0': ecmoclus0['rrt'], 'C1': ecmoclus1['rrt'], 'C2': ecmoclus2['rrt']}


def _rrt_vs_rest(label):
    """Return the 2x2 RRT / No RRT table for cluster `label` versus the remaining clusters."""
    own = rrt_flags[label]
    others = [flags for name, flags in rrt_flags.items() if name != label]
    table = pd.DataFrame(index=['RRT', 'No RRT'], columns=[label, 'Rest'])
    table.loc['No RRT', label] = (~own).values.sum()
    table.loc['RRT', label] = own.values.sum()
    table.loc['No RRT', 'Rest'] = sum((~flags).values.sum() for flags in others)
    table.loc['RRT', 'Rest'] = sum(flags.values.sum() for flags in others)
    return table


rrt0 = _rrt_vs_rest('C0')
rrt1 = _rrt_vs_rest('C1')
rrt2 = _rrt_vs_rest('C2')

print(rrt0)
print(rrt1)
print(rrt2)

# Fisher exact test to compare RRT of each cluster vs the rest

rrt0OR, rrt0PVAL = fisher_exact(rrt0)
rrt1OR, rrt1PVAL = fisher_exact(rrt1)
rrt2OR, rrt2PVAL = fisher_exact(rrt2)

for label, rrt_pval in (('C0', rrt0PVAL), ('C1', rrt1PVAL), ('C2', rrt2PVAL)):
    if rrt_pval < 0.05:
        print("there is a significant difference between RRT in " + label + " and all other patients")
    else:
        print("there is NOT a significant difference between RRT in " + label + " and all other patients")
    print(rrt_pval)

In [None]:
# PE contingency tables: each cluster compared against the other two pooled.
#
# The tables are built by one helper so every cell is filled from the 'pe' column.
# The previous hand-written version filled the C2 "Rest" column from
# ecmoclus0['death'] (copy-paste slip), which made the C2 table and p-value wrong.

pe_flags = {'C0': ecmoclus0['pe'], 'C1': ecmoclus1['pe'], 'C2': ecmoclus2['pe']}


def _pe_vs_rest(label):
    """Return the 2x2 pe / No pe table for cluster `label` versus the remaining clusters."""
    own = pe_flags[label]
    others = [flags for name, flags in pe_flags.items() if name != label]
    table = pd.DataFrame(index=['pe', 'No pe'], columns=[label, 'Rest'])
    table.loc['No pe', label] = (~own).values.sum()
    table.loc['pe', label] = own.values.sum()
    table.loc['No pe', 'Rest'] = sum((~flags).values.sum() for flags in others)
    table.loc['pe', 'Rest'] = sum(flags.values.sum() for flags in others)
    return table


pe0 = _pe_vs_rest('C0')
pe1 = _pe_vs_rest('C1')
pe2 = _pe_vs_rest('C2')

print(pe0)
print(pe1)
print(pe2)

# Fisher exact test to compare PE of each cluster vs the rest

pe0OR, pe0PVAL = fisher_exact(pe0)
pe1OR, pe1PVAL = fisher_exact(pe1)
pe2OR, pe2PVAL = fisher_exact(pe2)

# (cluster label, wording of the outcome, p-value); the message wording is kept as it was.
pe_results = (('C0', 'PE', pe0PVAL), ('C1', 'PE', pe1PVAL), ('C2', 'pe', pe2PVAL))

for label, outcome_txt, pe_pval in pe_results:
    if pe_pval < 0.05:
        print("there is a significant difference between " + outcome_txt + " in " + label + " and all other patients")
    else:
        print("there is NOT a significant difference between " + outcome_txt + " in " + label + " and all other patients")
    print(pe_pval)

In [None]:
# PTX contingency tables: each cluster compared against the other two pooled.
#
# The tables are built by one helper so every cell is filled from the 'ptx' column.
# The previous hand-written version filled the C2 "Rest" column from
# ecmoclus0['death'] (copy-paste slip), which made the C2 table and p-value wrong.
# The C2 message also read "ptx in PTX" instead of naming the cluster.

ptx_flags = {'C0': ecmoclus0['ptx'], 'C1': ecmoclus1['ptx'], 'C2': ecmoclus2['ptx']}


def _ptx_vs_rest(label):
    """Return the 2x2 ptx / No ptx table for cluster `label` versus the remaining clusters."""
    own = ptx_flags[label]
    others = [flags for name, flags in ptx_flags.items() if name != label]
    table = pd.DataFrame(index=['ptx', 'No ptx'], columns=[label, 'Rest'])
    table.loc['No ptx', label] = (~own).values.sum()
    table.loc['ptx', label] = own.values.sum()
    table.loc['No ptx', 'Rest'] = sum((~flags).values.sum() for flags in others)
    table.loc['ptx', 'Rest'] = sum(flags.values.sum() for flags in others)
    return table


ptx0 = _ptx_vs_rest('C0')
ptx1 = _ptx_vs_rest('C1')
ptx2 = _ptx_vs_rest('C2')

print(ptx0)
print(ptx1)
print(ptx2)

# Fisher exact test to compare ptx of each cluster vs the rest

ptx0OR, ptx0PVAL = fisher_exact(ptx0)
ptx1OR, ptx1PVAL = fisher_exact(ptx1)
ptx2OR, ptx2PVAL = fisher_exact(ptx2)

for label, ptx_pval in (('C0', ptx0PVAL), ('C1', ptx1PVAL), ('C2', ptx2PVAL)):
    if ptx_pval < 0.05:
        print("there is a significant difference between PTX in " + label + " and all other patients")
    else:
        print("there is NOT a significant difference between PTX in " + label + " and all other patients")
    print(ptx_pval)

In [None]:
# Kruskal-Wallis test of every numeric column across the three clusters.
#
# Fixes relative to the previous version of this cell:
#   * `siglist += column` extended the list with the individual characters of the
#     column name; the name is now appended as one item;
#   * a p-value of exactly 0.05 (or NaN) matched neither branch and printed nothing;
#   * missing values are dropped per group, otherwise they propagate into the statistic;
#   * non-numeric columns are skipped, because the test ranks numeric values.

print("""Compare for variance in medians between clusters
-------------------------------------------------
      """)

siglist = [] # names of the columns that differ significantly between clusters

for column in ecmoclus0:
    if not pd.api.types.is_numeric_dtype(ecmoclus0[column]):
        continue  # text/object columns cannot be ranked meaningfully

    a = ecmoclus0[column].dropna()
    b = ecmoclus1[column].dropna()
    c = ecmoclus2[column].dropna()

    try:
        stat, pval = kruskal(a, b, c)
    except ValueError as err:
        # scipy raises when the test is undefined, e.g. every value is identical.
        print(str(column) + " could not be tested: " + str(err))
        continue

    if np.isnan(pval):
        print(str(column) + " could not be tested: p-value is undefined")
    elif pval < 0.05:
        print(str(column) + " differences are statistically significant with a p-value of: " + str(pval))
        siglist.append(column)
    else:
        print(str(column) + " difference are NOT statistically significant with a p-value of: " + str(pval))

In [None]:
# Pairwise Games-Howell post-hoc comparison of 'ddim' between clusters.
# NOTE: `read_dataset` is imported but not used in this cell.
from pingouin import pairwise_gameshowell, read_dataset
pairwise_gameshowell(data=ecmodf_analysis, dv='ddim',
                     between='cluster')  