In [None]:
import pandas as pd
import numpy as np
from scipy.stats import ttest_ind 
from scipy.stats import fisher_exact
from scipy.stats import chi2_contingency
from scipy.stats import mannwhitneyu
from scipy.stats import kruskal
from scipy.stats import f_oneway
from scipy.stats import normaltest

# Raise pandas' display limits so wide/long frames are shown without truncation.
for option_name, option_value in (("display.max_columns", 50), ("display.max_rows", 500)):
    pd.set_option(option_name, option_value)

In [None]:
# Read the cluster/outcome table; the first CSV column becomes the row index.
ecmodf_analysis = pd.read_csv('data/ecmodf_clusters_outcomes.csv', index_col=0)

# Cast the flag columns to bool in one pass; `ethnic` is kept as a generic object column.
column_dtypes = {name: 'bool' for name in ['sex', 'death', 'rrt', 'ptx', 'pe', 'bronchinf']}
column_dtypes['ethnic'] = 'object'
ecmodf_analysis = ecmodf_analysis.astype(column_dtypes)

# Remove the `admit_date` and `hosp` columns from the analysis frame.
ecmodf_analysis = ecmodf_analysis.drop(columns=['admit_date', 'hosp'])

In [None]:
# Preview the first rows of the loaded frame after the dtype casts and column removal.
ecmodf_analysis.head()

## Exploratory analysis of SOFA/RESP/age/pfr/pCO2 prediction of mortality

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# Outcome plus candidate predictors; `ddim` is replaced by its base-2 logarithm.
regdf = ecmodf_analysis[['death', 'sofa', 'age', 'resp', 'pfr', 'pco2', 'ddim']].assign(
    ddim=lambda frame: np.log2(frame['ddim'])
)
# Mean of each predictor within each `death` group.
regdf.groupby('death').mean()

In [None]:
# y = outcome to be predicted (the `death` column as a NumPy array)
y = regdf['death'].to_numpy()
y.shape

In [None]:
# X = array of independent variable(s)
ind = ['resp', 'sofa', 'ddim']  ## <------ modify chosen variables here

# Pull the chosen columns out as a float matrix so the models always receive
# numeric input (a non-numeric column fails here instead of deep inside a fit).
x0 = regdf.loc[:, ind].to_numpy(dtype=float)

# Let NumPy infer the number of rows (-1) instead of hard-coding the cohort
# size: a fixed row count raises ValueError as soon as the data set changes.
X = x0.reshape(-1, len(ind))
X.shape

In [None]:
# Show the first ten rows of the design matrix (columns follow the order of `ind`).
X[:10]

In [None]:
# Bar chart of how many rows fall in each `death` class.
sns.countplot(data=regdf, x='death')

# Most ML algorithms perform poorly on the minority class: when most samples
# belong to category A, the model tends to predict category A most of the time.
# SMOTE (synthetic minority oversampling technique) synthesises new
# minority-class samples from the existing data. It adds no new information
# but can improve model performance:
# from imblearn.over_sampling import SMOTE
# oversample = SMOTE()
# X, y = oversample.fit_resample(X, y)

#==> Currently does not work because of version dependencies.
#==> class_weight="balanced" is used instead, which balances the classes
#    in the model through artificial weighting.

plt.show()

In [None]:
import statsmodels.api as sm

# Fit a logistic regression of y on X with statsmodels (maximum likelihood).
# NOTE(review): X is passed exactly as built from `ind`. statsmodels' Logit does
# not add an intercept on its own, so this model has no constant term unless a
# constant column is added to X — confirm that this is intended.
logit_model=sm.Logit(y,X) 

result=logit_model.fit()
# Coefficient table on the log-odds scale, one row per column of X.
print(result.summary()) 

In [None]:
## coef = regression coefficient (b1): the estimated change in the log odds of
## the outcome per one-unit increase in the exposure, with the other exposures
## held constant in a multiple regression. It conveys effect size and direction.
## An effect is conventionally called statistically significant when p < 0.05.

# e^b1 converts each log-odds coefficient into an odds ratio per unit increment.
OR = np.exp(result.params)
# The confidence bounds are moved to the odds-ratio scale in the same way.
CI = np.exp(result.conf_int())

# One row per variable: its name, its odds ratio, then the two CI bounds
# (left unlabelled as columns 0 and 1).
odds_ratios = pd.concat(
    [
        pd.DataFrame({'var': pd.Series(ind), 'odds ratio': pd.Series(OR)}),
        pd.DataFrame(CI),
    ],
    axis=1,
)

odds_ratios

In [None]:
# Exponentiated confidence bounds of the fitted coefficients, i.e. the
# confidence interval for each odds ratio (same values as `CI`).
np.exp(result.conf_int()) #confidence interval for odds ratio

In [None]:
#weights = {0 : '0.28', 1 : '0.72'}

# L1-penalised logistic regression with balanced class weights.
# A low C helps prevent overfitting on a small data set.
model = LogisticRegression(
    penalty='l1',
    C=0.5,
    solver='liblinear',
    class_weight="balanced",
    random_state=0,
)
model.fit(X, y)

In [None]:
# Precision/recall/F1 per class, computed on the same X the report predicts from.
print(classification_report(y_true=y, y_pred=model.predict(X)))

In [None]:
# Confusion matrix of the model's predictions on X (rows: actual, columns: predicted).
cm = confusion_matrix(y, model.predict(X))

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Inverted y-limits put row 0 at the top and keep both rows fully visible.
ax.set_ylim(1.5, -0.5)
# Write each count at the centre of its cell.
for row, col in np.ndindex(2, 2):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='red')

plt.show()

In [None]:
# Predicted probability of the positive class for every row of X.
y_score = model.predict_proba(X)[:, 1]

# The AUC must come from the same continuous scores as the curve. Feeding hard
# 0/1 predictions into roc_auc_score gives the area of a single-threshold
# curve, which does not match the plotted line.
logit_roc_auc = roc_auc_score(y, y_score)
fpr, tpr, thresholds = roc_curve(y, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): the other ROC cell in this notebook saves to the same file
# name, so whichever runs last overwrites the image.
plt.savefig('Log_ROC')

plt.show()

## Adding membership of clusters into model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [None]:
# One-hot encode cluster membership: every distinct `cluster` value gets its
# own `cluster_<value>` indicator column and the original column is removed.
cluster_dummies = pd.get_dummies(ecmodf_analysis['cluster'], prefix='cluster')
temp = pd.concat([ecmodf_analysis.copy(), cluster_dummies], axis=1).drop(columns='cluster')

# Manual recodings of `cluster` kept for reference (not executed):
##hypoinflammatory
#temp.loc[temp['cluster'] == 2, 'cluster'] = 0

##septic
#temp.loc[temp['cluster'] == 1, 'cluster'] = 2
#temp.loc[temp['cluster'] == 0, 'cluster'] = 1
#temp.loc[temp['cluster'] == 2, 'cluster'] = 0

##fulminant
#temp.loc[temp['cluster'] == 1, 'cluster'] = 0
#temp.loc[temp['cluster'] == 2, 'cluster'] = 1

temp.head()




In [None]:
# Outcomes, scores and cluster indicators used in this section;
# `ddim` is replaced by its base-2 logarithm.
model_columns = ['death', 'sofa', 'age', 'resp', 'cluster_0', 'cluster_1', 'cluster_2', 'rrt', 'ddim']
regdf = temp[model_columns].assign(ddim=lambda frame: np.log2(frame['ddim']))
regdf.head()

In [None]:
# y = outcome to be predicted (here the `rrt` column as a NumPy array)
y = regdf['rrt'].to_numpy()
y.shape

In [None]:
# X = array of independent variable(s)
ind = ['cluster_1', 'cluster_0', 'sofa', 'resp']  ## modify chosen variables here

# Convert explicitly to float. On pandas >= 2.0 get_dummies yields bool
# indicator columns, and mixing those with numeric columns otherwise produces
# an object-dtype array that the model fits cannot consume.
x0 = regdf.loc[:, ind].to_numpy(dtype=float)

# Let NumPy infer the number of rows (-1) instead of hard-coding the cohort
# size: a fixed row count raises ValueError as soon as the data set changes.
X = x0.reshape(-1, len(ind))
X.shape

In [None]:
# Show the first ten rows of the design matrix (columns follow the order of `ind`).
X[:10]

In [None]:
import statsmodels.api as sm

# Fit a logistic regression of y on X with statsmodels (maximum likelihood).
# NOTE(review): X is passed exactly as built from `ind`. statsmodels' Logit does
# not add an intercept on its own, so this model has no constant term unless a
# constant column is added to X — confirm that this is intended.
logit_model=sm.Logit(y,X)

result=logit_model.fit()
# Coefficient table on the log-odds scale, one row per column of X.
print(result.summary())

In [None]:
# Exponentiate the log-odds coefficients and their confidence bounds to get
# odds ratios with confidence intervals.
OR = np.exp(result.params)
CI = np.exp(result.conf_int())

# One row per variable: its name, its odds ratio, then the two CI bounds
# (left unlabelled as columns 0 and 1).
odds_ratios = pd.concat(
    [
        pd.DataFrame({'var': pd.Series(ind), 'odds ratio': pd.Series(OR)}),
        pd.DataFrame(CI),
    ],
    axis=1,
)

odds_ratios

In [None]:
# L1-penalised logistic regression with balanced class weights, fitted on all rows.
model = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    class_weight="balanced",
    random_state=0,
)
model.fit(X, y)

In [None]:
# Precision/recall/F1 per class, computed on the same X the report predicts from.
print(classification_report(y_true=y, y_pred=model.predict(X)))

In [None]:
# Confusion matrix of the model's predictions on X (rows: actual, columns: predicted).
cm = confusion_matrix(y, model.predict(X))

fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(cm)
ax.grid(False)
ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted 0s', 'Predicted 1s'))
ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual 0s', 'Actual 1s'))
# Inverted y-limits put row 0 at the top and keep both rows fully visible.
ax.set_ylim(1.5, -0.5)
# Write each count at the centre of its cell.
for row, col in np.ndindex(2, 2):
    ax.text(col, row, cm[row, col], ha='center', va='center', color='red')

plt.show()

In [None]:
# Predicted probability of the positive class for every row of X.
y_score = model.predict_proba(X)[:, 1]

# The AUC must come from the same continuous scores as the curve. Feeding hard
# 0/1 predictions into roc_auc_score gives the area of a single-threshold
# curve, which does not match the plotted line.
logit_roc_auc = roc_auc_score(y, y_score)
fpr, tpr, thresholds = roc_curve(y, y_score)

plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
# NOTE(review): the other ROC cell in this notebook saves to the same file
# name, so whichever runs last overwrites the image.
plt.savefig('Log_ROC')

plt.show()

## k-fold cross-validation

In [None]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

from numpy import mean
from numpy import std

# Raise pandas' display limits so wide/long frames are shown without truncation.
for option_name, option_value in (("display.max_columns", 50), ("display.max_rows", 500)):
    pd.set_option(option_name, option_value)

In [None]:
# Read the manually edited analysis table; the first CSV column becomes the row index.
ecmodf_analysis = pd.read_csv('ecmodf_analysis_manualedits.csv', index_col=0)

# Cast the flag columns to bool in one pass; `ethnic` is kept as a generic object column.
column_dtypes = {name: 'bool' for name in ['sex', 'death', 'rrt', 'ptx', 'pe', 'bronchinf']}
column_dtypes['ethnic'] = 'object'
ecmodf_analysis = ecmodf_analysis.astype(column_dtypes)

# Remove the `admit_date` and `hosp` columns from the analysis frame.
ecmodf_analysis = ecmodf_analysis.drop(columns=['admit_date', 'hosp'])

# Append the comorbidity and steroid tables column-wise, aligned on the row index.
ecmodf_comorb = pd.read_csv('ecmodf_analysis_comorb.csv', index_col=0)
ecmodf_steroids = pd.read_csv('ecmodf_analysis_steroids.csv', index_col=0)
for extra_columns in (ecmodf_comorb, ecmodf_steroids):
    ecmodf_analysis = pd.concat([ecmodf_analysis, extra_columns], axis=1)

In [None]:
# One-hot encode cluster membership: every distinct `cluster` value gets its
# own `cluster_<value>` indicator column and the original column is removed.
cluster_dummies = pd.get_dummies(ecmodf_analysis['cluster'], prefix='cluster')
temp = pd.concat([ecmodf_analysis.copy(), cluster_dummies], axis=1).drop(columns='cluster')

temp

In [None]:
# Columns carried into the cross-validated models; `ddim` is replaced by its
# base-2 logarithm.
model_columns = [
    'death', 'sofa', 'age', 'resp',
    'cluster_0', 'cluster_1', 'cluster_2',
    'rrt', 'ddim', 'v_vv', 'bmi', 'ferritin',
    'presteroid', 'hypertension', 'diabetes',
]
regdf = temp[model_columns].assign(ddim=lambda frame: np.log2(frame['ddim']))
regdf

In [None]:
# y = outcome to be predicted (here the `rrt` column as a NumPy array)
y = regdf['rrt'].to_numpy()
y.shape

In [None]:
# X = array of independent variable(s)
ind = ['sofa', 'resp', 'cluster_1']  ## modify chosen variables here

# Convert explicitly to float. On pandas >= 2.0 get_dummies yields bool
# indicator columns, and mixing those with numeric columns otherwise produces
# an object-dtype array that the model fits cannot consume.
x0 = regdf.loc[:, ind].to_numpy(dtype=float)

# Let NumPy infer the number of rows (-1) instead of hard-coding the cohort
# size: a fixed row count raises ValueError as soon as the data set changes.
X = x0.reshape(-1, len(ind))
X.shape

In [None]:
import statsmodels.api as sm

# Fit a logistic regression of y on X with statsmodels (maximum likelihood).
# NOTE(review): X is passed exactly as built from `ind`. statsmodels' Logit does
# not add an intercept on its own, so this model has no constant term unless a
# constant column is added to X — confirm that this is intended.
logit_model=sm.Logit(y,X)

result=logit_model.fit()
# Coefficient table on the log-odds scale, one row per column of X.
print(result.summary())

In [None]:
# Exponentiate the log-odds coefficients and their confidence bounds to get
# odds ratios with confidence intervals.
OR = np.exp(result.params)
CI = np.exp(result.conf_int())

# One row per variable: its name, its odds ratio, then the two CI bounds
# (left unlabelled as columns 0 and 1).
odds_ratios = pd.concat(
    [
        pd.DataFrame({'var': pd.Series(ind), 'odds ratio': pd.Series(OR)}),
        pd.DataFrame(CI),
    ],
    axis=1,
)

odds_ratios

In [None]:
# Four-fold splitter with scikit-learn's defaults for everything else.
# NOTE(review): no shuffle or stratification is requested here; if the rows are
# ordered, or one class is rare, a fold may end up with a single class — confirm
# this is acceptable for ROC AUC scoring.
kfold = KFold(n_splits=4) # define the split
kfold.get_n_splits(X) # number of splitting iterations; the return value is not used here

print(kfold) 

In [None]:
# Print which row positions fall into the train and test part of each fold.
for train_index, test_index in kfold.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index)
    # Reassigned on every iteration, so only the last fold's split is left
    # in these variables once the loop ends.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

In [None]:
# L1-penalised logistic regression with balanced class weights, fitted on all rows.
model = LogisticRegression(
    penalty='l1',
    C=1.0,
    solver='liblinear',
    class_weight="balanced",
    random_state=0,
)
model.fit(X, y)

In [None]:
# Score the model on each fold of `kfold`, using all available cores.
scores = cross_val_score(
    estimator=model,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=kfold,
    n_jobs=-1,
)

## other options for `scoring`: accuracy, roc_auc, precision, f1

In [None]:
# Per-fold values returned by cross_val_score (ROC AUC, as requested via `scoring`).
print(scores)

In [None]:
# Mean and standard deviation of the per-fold scores, to three decimals.
auc_mean, auc_sd = mean(scores), std(scores)
print('ROC AUC: %.3f (%.3f)' % (auc_mean, auc_sd))