In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SMOTE
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.preprocessing import MaxAbsScaler
from sklearn.linear_model import SGDClassifier
import shap
from sklearn.metrics import accuracy_score
import pickle
import json

## Reading the data

In [6]:
# Load the seven input tables. Paths are relative to the notebook's working
# directory; each table is used below together with the matching *_data_<N>
# column lists (e.g. data_df_90 with cardiac_data_90).
data_df_100 = pd.read_csv('../data/data_100.csv')
data_df_90 = pd.read_csv('../data/data_90.csv')
data_df_80 = pd.read_csv('../data/data_80.csv')
data_df_70 = pd.read_csv('../data/data_70.csv')
data_df_60 = pd.read_csv('../data/data_60.csv')
data_df_50 = pd.read_csv('../data/data_50.csv')
data_df_40 = pd.read_csv('../data/data_40.csv')

## The functions

In [11]:
def evaluate_model(classifier, X, y, scaler, train_size=0.8, total_samples=100, smote=False, best_model=False, best_criteria='positives', random_state=None):
    """Repeatedly split, fit and score a binary classifier; optionally return the best run.

    The scaler is fitted on all of ``X`` once, then for each resample the scaled
    data is split with ``train_test_split``, the training part is optionally
    oversampled with SMOTE, a fresh copy of ``classifier`` is fitted, and AUC,
    accuracy, sensitivity, specificity and PPV are computed on the test part
    using ``np.round`` of the class-1 probability as the predicted label.
    Mean and standard deviation of every metric are printed.

    Parameters
    ----------
    classifier : estimator exposing ``fit`` and ``predict_proba``
        Template estimator. It is cloned for every resample, so the object
        passed in is left unfitted and each stored model is independent.
    X, y : array-like
        Feature matrix and binary labels.
    scaler : transformer exposing ``fit`` and ``transform``
        Fitted on the full ``X`` and returned to the caller.
        NOTE(review): fitting before the split lets test rows influence the
        statistics of stateful scalers; kept as-is because the fitted scaler
        is part of the return value.
    train_size : float
        Forwarded to ``train_test_split``.
    total_samples : int
        The loop runs ``total_samples - 1`` resamples (``range(1, total_samples)``),
        as in the original implementation.
    smote : bool
        When ``True``, oversample each training split with ``SMOTE``.
    best_model : bool
        When ``True``, select and return the best scored run.
    best_criteria : str
        ``'positives'``    -> argmax of ``ppv * 2.5 + sensitivity * 2``;
        ``'auc'``          -> argmax of AUC;
        ``'accuracy_ppv'`` -> argmax of ``ppv * 2.5 + auc * 2`` (uses AUC despite the name);
        anything else      -> argmax of accuracy.
    random_state : int or None
        Optional seed. When given, one ``RandomState`` stream drives the splits
        and SMOTE so a run can be reproduced; ``None`` keeps the previous
        unseeded behaviour.

    Returns
    -------
    tuple or None
        When ``best_model`` is ``True``:
        ``(model, proba_vals, test_vals, fpr, tpr, scaler)`` where ``model``,
        ``fpr`` and ``tpr`` belong to the selected run, and ``proba_vals`` /
        ``test_vals`` hold the class-1 probabilities and true labels of every
        resample (including resamples skipped for scoring). Otherwise ``None``.

    Raises
    ------
    ValueError
        If ``best_model`` is ``True`` but no resample could be scored.
    """
    # Function-scope import: ``clone`` is not among the notebook's top-level imports.
    from sklearn.base import clone

    rng = None if random_state is None else np.random.RandomState(random_state)
    oversample = SMOTE(random_state=rng) if smote is True else None

    scaler.fit(X)
    X_scaled = scaler.transform(X)

    # Per-run results. Every container below receives exactly one entry per
    # *scored* run, in the same order, so one index addresses the same run in all.
    auc_vals = []
    acc_vals = []
    specificity_vals = []
    sensitivity_vals = []
    ppv_vals = []
    model_list = []
    fpr_vals = []
    tpr_vals = []
    # These two keep every resample, scored or not.
    proba_vals = []
    test_vals = []

    for _ in range(1, total_samples):
        X_train, X_test, y_train, y_test = train_test_split(
            X_scaled, y, train_size=train_size, random_state=rng)
        if smote is True:
            X_train, y_train = oversample.fit_resample(X_train, y_train)

        # Fit a fresh copy; refitting the shared estimator would make every
        # stored "model" the same object holding only the last fit.
        # safe=False falls back to a deep copy for non-scikit-learn estimators.
        model = clone(classifier, safe=False).fit(X_train, y_train)

        positive_proba = model.predict_proba(X_test)[:, 1]
        proba_vals.append(positive_proba)
        test_vals.append(y_test)

        y_pred = np.round(positive_proba)
        if np.sum(y_pred) == 0:
            # No positive prediction: PPV is undefined, so the run is not scored.
            continue

        fpr, tpr, _thresholds = metrics.roc_curve(y_test, positive_proba)
        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

        auc_vals.append(metrics.auc(fpr, tpr))
        acc_vals.append(accuracy_score(y_test, y_pred))
        specificity_vals.append(tn / (tn + fp))
        sensitivity_vals.append(tp / (tp + fn))
        ppv_vals.append(tp / (tp + fp))
        model_list.append(model)
        fpr_vals.append(fpr)
        tpr_vals.append(tpr)

    auc_vals = np.asarray(auc_vals, dtype=float)
    acc_vals = np.asarray(acc_vals, dtype=float)
    specificity_vals = np.asarray(specificity_vals, dtype=float)
    sensitivity_vals = np.asarray(sensitivity_vals, dtype=float)
    ppv_vals = np.asarray(ppv_vals, dtype=float)

    print('AUC: {:.3f} +- {:.3f}'.format(np.mean(auc_vals), np.std(auc_vals)))
    print('Sensitivity: {:.3f} +- {:.3f}'.format(np.mean(sensitivity_vals), np.std(sensitivity_vals)))
    print('Specificity: {:.3f} +- {:.3f}'.format(np.mean(specificity_vals), np.std(specificity_vals)))
    print('Positive Predictive Values: {:.3f} +- {:.3f}'.format(np.mean(ppv_vals), np.std(ppv_vals)))
    print('Accuracy: {:.3f} +- {:.3f}'.format(np.mean(acc_vals), np.std(acc_vals)))

    if best_model is True:
        if len(model_list) == 0:
            raise ValueError('No resample produced a positive prediction; cannot select a best model.')

        if best_criteria == 'positives':
            best_index = np.argmax(ppv_vals * 2.5 + sensitivity_vals * 2)
        elif best_criteria == 'auc':
            best_index = np.argmax(auc_vals)
        elif best_criteria == 'accuracy_ppv':
            best_index = np.argmax(ppv_vals * 2.5 + auc_vals * 2)
        else:
            best_index = np.argmax(acc_vals)

        print(best_index)
        print('The highest ppv model has the following:')
        print('AUC: {:.3f}'.format(auc_vals[best_index]))
        print('Sensitivity: {:.3f}'.format(sensitivity_vals[best_index]))
        print('Specificity: {:.3f}'.format(specificity_vals[best_index]))
        print('Positive Predictive Values: {:.3f}'.format(ppv_vals[best_index]))
        print('Accuracy: {:.3f}'.format(acc_vals[best_index]))

        return model_list[best_index], proba_vals, test_vals, fpr_vals[best_index], tpr_vals[best_index], scaler
    return None

In [12]:
def interpret_model(model, data_df, features,label):
    """Refit ``model`` on ``data_df`` and draw a SHAP summary (dot) plot.

    ``model`` is fitted in place on ``data_df[features]`` / ``data_df[label]``,
    a ``shap.TreeExplainer`` is built with the same feature frame as background
    data, and the SHAP values at index 1 are plotted.
    NOTE(review): index 1 is presumably the positive class of a binary
    classifier in a shap version that returns one array per class - confirm.

    Returns nothing; the plot is the only output.
    """
    feature_frame = data_df[features]
    fitted = model.fit(feature_frame, data_df[label])
    tree_explainer = shap.TreeExplainer(fitted, data=feature_frame)
    per_class_values = tree_explainer.shap_values(feature_frame)
    # max_display is the column count of the whole frame, so no feature is truncated.
    shap.summary_plot(
        per_class_values[1],
        feature_frame,
        plot_type='dot',
        max_display=data_df.shape[1],
    )

In [13]:
def get_waterfall_plot(model, data_df, features,label, row=0):
    """Refit ``model`` on ``data_df`` and draw a SHAP waterfall plot for one row.

    The previous body called ``explainer.waterfall_plot(...)``, which is not a
    method of ``shap.TreeExplainer``, so the function raised ``AttributeError``
    before plotting anything. A waterfall plot explains a single prediction, so
    this version builds an ``Explanation`` for the data and plots one row of it.

    Parameters
    ----------
    model : tree-based estimator supported by ``shap.TreeExplainer``
        Fitted in place on ``data_df[features]`` / ``data_df[label]``.
    data_df : pandas.DataFrame
        Table holding the feature and label columns.
    features : list of str
        Feature column names.
    label : str
        Label column name.
    row : int
        Positional index of the sample to explain (defaults to the first row).

    Returns nothing; the plot is the only output.
    """
    feature_frame = data_df[features]
    fitted = model.fit(feature_frame, data_df[label])
    explainer = shap.TreeExplainer(fitted, data=feature_frame)
    explanation = explainer(feature_frame)
    if explanation.values.ndim == 3:
        # Classifier output carries a trailing class axis; keep class index 1,
        # matching the ``shap_values[1]`` convention used elsewhere in this notebook.
        explanation = explanation[:, :, 1]
    shap.plots.waterfall(explanation[row], max_display=len(features))

In [14]:
def explain_model(model, df_data, features, ravel=False):
    """Draw one SHAP dependence plot per feature for an already fitted tree model.

    Parameters
    ----------
    model : fitted tree-based estimator accepted by ``shap.TreeExplainer``
    df_data : pandas.DataFrame
        Table containing the ``features`` columns.
    features : list of str
        Columns to explain; one dependence plot is drawn for each.
    ravel : bool
        ``True``  -> each feature gets its own separate plot;
        otherwise -> all plots are placed on a 4-column grid of axes
        (figure size 20 x 14).

    SHAP values at index 1 are used.
    NOTE(review): presumably the positive class of a binary classifier - confirm.
    """
    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(df_data[features])

    if ravel is True:
        for name in features:
            shap.dependence_plot(name, shap_values[1], df_data[features], show=False, interaction_index=None)
        return

    # Enough rows of four panels to hold every feature; surplus panels stay empty.
    n_rows = int(np.ceil(len(features) / 4))
    _, grid = plt.subplots(nrows=n_rows, ncols=4, figsize=(20, 14))
    for name, panel in zip(features, grid.ravel()):
        shap.dependence_plot(name, shap_values[1], df_data[features], ax=panel, show=False, interaction_index=None)

In [15]:
def save_model(model, scaler, name):
    """Pickle a model and its scaler next to each other on disk.

    Writes ``<name>.sav`` (the model) and ``scaler_<name>.sav`` (the scaler).
    When ``name`` includes a directory, both files are written into that
    directory and the ``scaler_`` prefix is applied to the file name only.

    Parameters
    ----------
    model : picklable object
    scaler : picklable object
    name : str
        Base name (optionally with a directory part) without extension.

    Returns nothing. Files are opened with context managers so the handles are
    closed (and the data flushed) even if pickling fails.
    """
    # Function-scope import: ``os`` is not among the notebook's top-level imports.
    import os

    directory, stem = os.path.split(name)
    model_path = os.path.join(directory, stem + '.sav')
    scaler_path = os.path.join(directory, 'scaler_' + stem + '.sav')

    with open(model_path, 'wb') as model_file:
        pickle.dump(model, model_file)
    with open(scaler_path, 'wb') as scaler_file:
        pickle.dump(scaler, scaler_file)

## Working with the cardiac data

In [17]:
# Column lists for the cardiac models, one per data_df_<N> table.
# The first entry of every list is the label column ('CardiacLim'); the
# training cells slice it off with [1:] to obtain the feature columns.
cardiac_data_100 = ['CardiacLim','DiffPercentPeakVO2', 'DiffPeakVO2','75_to_100_VO2Slope','75_to_100_HRSlope','MinO2Pulse',
                      'PeakVE','VO2vsPeakVO2atVT','second_half_RRSlope','second_half_VO2Slope','75_to_100_VCO2Slope','MeanVE',
                      'second_half_VESlope','O2PulseDiff','50_to_75_O2Slope',
                        'O2PulsePercent','75_to_100_RERSlope','PeakRER','50_to_75_VO2Slope','PeakVO2Real']
cardiac_data_90 = ['CardiacLim','DiffPercentPeakVO2','DiffPeakVO2','MinO2Pulse','second_half_VEVO2Slope',
                '25_to_50_VCO2Slope','VO2vsPeakVO2atVT','PeakVE','15_to_85_VO2Slope','first_half_VEVCO2Slope',
                '25_to_50_VO2Slope','MeanVO2','25_to_50_VESlope','PeakVO2Real','PeakVO2', '15_to_85_VESlope',
                   'second_half_RRSlope','PeakRER','second_half_VCO2Slope','O2PulsePercent','15_to_85_VCO2Slope',
                   '75_to_100_VESlope','MeanVE','first_half_VO2Slope','second_half_VESlope','first_half_VEVO2Slope',
                   '75_to_100_HRSlope','DiffPercentPeakHR','15_to_85_HRSlope']
cardiac_data_80 = ['CardiacLim','DiffPercentPeakVO2','DiffPeakVO2','first_half_VEVCO2Slope','MinO2Pulse','15_to_85_VO2Slope',
                   '25_to_50_VEVCO2Slope','MeanVO2','PeakVO2Real','PeakVO2','MeanHeartRate','first_half_VO2Slope',
                  'first_half_VEVO2Slope','O2PulsePercent','StdHeartRate','second_half_VEVO2Slope','75_to_100_RRSlope',
                  'DiffPeakHR','PredictedMaxHR']
cardiac_data_70 = ['CardiacLim','DiffPercentPeakVO2','15_to_85_VO2Slope','PeakVO2Real','DiffPeakVO2','MinO2Pulse','PeakVO2',
                   'first_half_VEVCO2Slope','MeanVO2','15_to_85_VCO2Slope','PeakVCO2','StdO2Pulse','PredictedMaxHR',
                   '15_to_85_VESlope','MeanVCO2']
cardiac_data_60 = ['CardiacLim','50_to_75_VO2Slope','50_to_75_VCO2Slope','DiffPercentPeakVO2','15_to_85_VO2Slope','StdO2Pulse',
                  '50_to_75_VESlope','first_half_VEVCO2Slope','PeakVO2','MinO2Pulse','PeakVO2Real','O2PulsePercent',
                  'MeanVO2','StdHeartRate','LowestVE/VCO2','VEvsVCO2Slope','50_to_75_HRSlope',
                  '15_to_85_VEVO2Slope']
cardiac_data_50 = ['CardiacLim','StdO2Pulse','15_to_85_VEVCO2Slope','DiffPercentPeakVO2','second_half_VCO2Slope',
                'MeanVE/VCO2','second_half_VO2Slope','second_half_VESlope','PeakVO2Real','MinO2Pulse','PeakVO2','O2PulseDiff',
                'StdHeartRate','15_to_85_VEVO2Slope','VEvsVCO2Slope','second_half_HRSlope']
cardiac_data_40 = ['CardiacLim','StdO2Pulse','DiffPercentPeakVO2','second_half_VEVCO2Slope','MinO2Pulse','O2PulseDiff',
                   'PeakVO2','PeakVO2Real','O2PulsePercent','VEvsVCO2Slope','MaxO2Pulse','LowestVE/VCO2','MeanVE/VCO2',
                  'second_half_VEVO2Slope','second_half_HRSlope','second_half_VCO2Slope','PeakVCO2','StdVE/VCO2']

In [75]:
# Cardiac model on data_df_100: features are every cardiac_data_100 column after the label.
X = data_df_100[cardiac_data_100[1:]].values
y = data_df_100['CardiacLim'].values
# With both flags off, StandardScaler neither centers nor scales the data.
scaler_test = StandardScaler(with_mean=False, with_std=False)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=200,
    n_jobs=2,
)
# The result is stored as `clf_cardiac` here (no _100 suffix).
clf_cardiac, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_100_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=150,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy_ppv',
)

In [74]:
# Cardiac model on data_df_90: features are every cardiac_data_90 column after the label.
X = data_df_90[cardiac_data_90[1:]].values
y = data_df_90['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=100,
    n_jobs=2,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_90, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_90_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.872 +- 0.039
Sensitivity: 0.644 +- 0.126
Specificity: 0.884 +- 0.051
Positive Predictive Values: 0.652 +- 0.135
Accuracy: 0.823 +- 0.038
45
The highest ppv model has the following:
AUC: 0.916
Sensitivity: 0.684
Specificity: 0.979
Positive Predictive Values: 0.929
Accuracy: 0.894


In [73]:
# Cardiac model on data_df_80: features are every cardiac_data_80 column after the label.
X = data_df_80[cardiac_data_80[1:]].values
y = data_df_80['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=100,
    n_jobs=2,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_80, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_80_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.783 +- 0.049
Sensitivity: 0.599 +- 0.132
Specificity: 0.784 +- 0.067
Positive Predictive Values: 0.468 +- 0.109
Accuracy: 0.738 +- 0.057
48
The highest ppv model has the following:
AUC: 0.824
Sensitivity: 0.800
Specificity: 0.826
Positive Predictive Values: 0.667
Accuracy: 0.818


In [23]:
# Cardiac model on data_df_70: features are every cardiac_data_70 column after the label.
X = data_df_70[cardiac_data_70[1:]].values
y = data_df_70['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=100,
    n_jobs=2,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_70, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_70_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.780 +- 0.048
Sensitivity: 0.566 +- 0.135
Specificity: 0.789 +- 0.057
Positive Predictive Values: 0.472 +- 0.101
Accuracy: 0.733 +- 0.048
32
The highest ppv model has the following:
AUC: 0.920
Sensitivity: 0.842
Specificity: 0.915
Positive Predictive Values: 0.800
Accuracy: 0.894


In [27]:
# Cardiac model on data_df_60: features are every cardiac_data_60 column after the label.
X = data_df_60[cardiac_data_60[1:]].values
y = data_df_60['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=200,
    n_jobs=2,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_60, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_60_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.785 +- 0.046
Sensitivity: 0.516 +- 0.133
Specificity: 0.821 +- 0.067
Positive Predictive Values: 0.480 +- 0.117
Accuracy: 0.747 +- 0.051
6
The highest ppv model has the following:
AUC: 0.852
Sensitivity: 0.750
Specificity: 0.907
Positive Predictive Values: 0.643
Accuracy: 0.879


In [31]:
# Cardiac model on data_df_50: features are every cardiac_data_50 column after the label.
X = data_df_50[cardiac_data_50[1:]].values
y = data_df_50['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
# Earlier configuration, kept for reference:
#clf_selected = RandomForestClassifier(bootstrap=True, criterion="entropy",max_features=0.1,min_samples_leaf=0.01,
#                            min_samples_split=0.01,n_estimators=400,oob_score=True)
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=200,
    n_jobs=2,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_50, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_50_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.784 +- 0.061
Sensitivity: 0.569 +- 0.145
Specificity: 0.800 +- 0.076
Positive Predictive Values: 0.490 +- 0.128
Accuracy: 0.741 +- 0.057
32
The highest ppv model has the following:
AUC: 0.926
Sensitivity: 0.684
Specificity: 0.957
Positive Predictive Values: 0.867
Accuracy: 0.879


In [30]:
# Cardiac model on data_df_40: features are every cardiac_data_40 column after the label.
X = data_df_40[cardiac_data_40[1:]].values
y = data_df_40['CardiacLim'].values
scaler_test = StandardScaler(with_mean=False, with_std=False)
# Earlier configuration, kept for reference:
#clf_selected = RandomForestClassifier(bootstrap=True, criterion="entropy",max_features=0.1,min_samples_leaf=0.01,
#                            min_samples_split=0.01,n_estimators=400,oob_score=True)
# Unlike the other cardiac cells, this forest considers all features per split
# (max_features=None) and enables the out-of-bag score.
clf_selected = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
# best_criteria is not passed, so evaluate_model's default ('positives') applies.
clf_cardiac_40, proba_cardiac, y_test_cardiac, fpr_car, tpr_car, scaler_40_cardiac = evaluate_model(
    clf_selected,
    X,
    y,
    scaler_test,
    total_samples=50,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.749 +- 0.055
Sensitivity: 0.528 +- 0.140
Specificity: 0.767 +- 0.072
Positive Predictive Values: 0.428 +- 0.093
Accuracy: 0.706 +- 0.049
1
The highest ppv model has the following:
AUC: 0.828
Sensitivity: 0.750
Specificity: 0.820
Positive Predictive Values: 0.571
Accuracy: 0.803


## Pulmonary

In [32]:
# Column lists for the pulmonary models, one per data_df_<N> table.
# The first entry of every list is the label column ('PulmonaryLim'); the
# training cells slice it off with [1:] to obtain the feature columns.
pulmonary_data_100 = ['PulmonaryLim','O2PulsePercent', 'O2PulseDiff','first_half_VO2Slope','LowestVE/VCO2',
                      'first_half_VCO2Slope', '15_to_85_RRSlope','PeakRR','50_to_75_RRSlope','MeanO2Pulse','VEvsVCO2Slope',
                     '25_to_50_VCO2Slope','StdHeartRate']
pulmonary_data_90 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','second_half_RRSlope','LowestVE/VCO2',
                    'second_half_VESlope','PeakRR','75_to_100_VESlope','75_to_100_RRSlope','PeakVE','VEvsVCO2Slope',
                    'MaxO2Pulse','StdHeartRate','MeanVO2','MeanVE']
pulmonary_data_80 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','LowestVE/VCO2','first_half_VO2Slope',
                    'first_half_VCO2Slope','VEvsVCO2Slope','75_to_100_RRSlope','MeanO2Pulse']
pulmonary_data_70 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','LowestVE/VCO2','VEvsVCO2Slope','MaxO2Pulse',
                     'second_half_RRSlope','first_half_VO2Slope','MeanVCO2','MeanVO2']
pulmonary_data_60 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','LowestVE/VCO2','MeanVO2','MeanVCO2','second_half_RRSlope',
                    'MaxO2Pulse','15_to_85_VCO2Slope','second_half_VEVO2Slope','PeakVO2Real']
pulmonary_data_50 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','DiffPercentPeakVO2','15_to_85_VO2Slope','75_to_100_RRSlope',
                     'PeakVE','MeanVO2','MaxO2Pulse','MeanVCO2','PeakVO2','PeakVCO2','MaxO2_EST']
pulmonary_data_40 = ['PulmonaryLim','O2PulsePercent','O2PulseDiff','DiffPercentPeakVO2','PeakVE','PeakVO2',
                    'PeakVO2Real','MaxO2Pulse','StdO2Pulse','MeanRR','PeakVCO2','MeanVO2','MeanVE',
                    'DiffPercentPeakHR','MeanHeartRate','LowestRR','DiffPeakHR','StdHeartRate','LowestVE/VCO2']

In [33]:
# Pulmonary model on data_df_100: features are every pulmonary_data_100 column after the label.
X_pulm = data_df_100[pulmonary_data_100[1:]].values
y_pulm = data_df_100['PulmonaryLim'].values
# Earlier configuration, kept for reference:
#scaler_pulm = Normalizer(norm='max')
#clf_pulm = RandomForestClassifier(bootstrap=True, class_weight="balanced", criterion="entropy",max_features='sqrt',min_samples_leaf=0.2163157894736842,
#                            min_samples_split= 0.5252631578947369,n_estimators=600,oob_score=False)
# With both flags off, StandardScaler neither centers nor scales the data.
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
clf_pulmonary_100, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_100_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.851 +- 0.058
Sensitivity: 0.602 +- 0.201
Specificity: 0.869 +- 0.049
Positive Predictive Values: 0.380 +- 0.131
Accuracy: 0.836 +- 0.047
58
The highest ppv model has the following:
AUC: 0.982
Sensitivity: 1.000
Specificity: 0.947
Positive Predictive Values: 0.750
Accuracy: 0.955


In [34]:
# Pulmonary model on data_df_90: features are every pulmonary_data_90 column after the label.
X_pulm = data_df_90[pulmonary_data_90[1:]].values
y_pulm = data_df_90['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
clf_pulmonary_90, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_90_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.884 +- 0.055
Sensitivity: 0.607 +- 0.212
Specificity: 0.907 +- 0.036
Positive Predictive Values: 0.458 +- 0.137
Accuracy: 0.871 +- 0.036
78
The highest ppv model has the following:
AUC: 0.989
Sensitivity: 0.667
Specificity: 0.983
Positive Predictive Values: 0.800
Accuracy: 0.955


In [35]:
# Pulmonary model on data_df_80: features are every pulmonary_data_80 column after the label.
X_pulm = data_df_80[pulmonary_data_80[1:]].values
y_pulm = data_df_80['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
clf_pulmonary_80, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_80_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.832 +- 0.057
Sensitivity: 0.585 +- 0.194
Specificity: 0.841 +- 0.051
Positive Predictive Values: 0.325 +- 0.129
Accuracy: 0.811 +- 0.042
21
The highest ppv model has the following:
AUC: 0.839
Sensitivity: 0.600
Specificity: 0.918
Positive Predictive Values: 0.375
Accuracy: 0.894


In [36]:
# Pulmonary model on data_df_70: features are every pulmonary_data_70 column after the label.
X_pulm = data_df_70[pulmonary_data_70[1:]].values
y_pulm = data_df_70['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
clf_pulmonary_70, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_70_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.783 +- 0.065
Sensitivity: 0.512 +- 0.180
Specificity: 0.834 +- 0.057
Positive Predictive Values: 0.297 +- 0.126
Accuracy: 0.795 +- 0.050
9
The highest ppv model has the following:
AUC: 0.867
Sensitivity: 0.500
Specificity: 0.933
Positive Predictive Values: 0.429
Accuracy: 0.894


In [37]:
# Pulmonary model on data_df_60: features are every pulmonary_data_60 column after the label.
X_pulm = data_df_60[pulmonary_data_60[1:]].values
y_pulm = data_df_60['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
clf_pulmonary_60, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_60_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.790 +- 0.056
Sensitivity: 0.511 +- 0.189
Specificity: 0.817 +- 0.058
Positive Predictive Values: 0.294 +- 0.100
Accuracy: 0.775 +- 0.044
87
The highest ppv model has the following:
AUC: 0.889
Sensitivity: 0.778
Specificity: 0.877
Positive Predictive Values: 0.500
Accuracy: 0.864


In [43]:
# Pulmonary model on data_df_50: features are every pulmonary_data_50 column after the label.
X_pulm = data_df_50[pulmonary_data_50[1:]].values
y_pulm = data_df_50['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
# best_criteria is not passed here, so evaluate_model's default ('positives')
# applies, unlike the pulmonary cells that pass 'accuracy'.
clf_pulmonary_50, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_50_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.807 +- 0.066
Sensitivity: 0.548 +- 0.173
Specificity: 0.831 +- 0.055
Positive Predictive Values: 0.316 +- 0.119
Accuracy: 0.795 +- 0.045
81
The highest ppv model has the following:
AUC: 0.884
Sensitivity: 0.900
Specificity: 0.821
Positive Predictive Values: 0.474
Accuracy: 0.833


In [42]:
# Pulmonary model on data_df_40: features are every pulmonary_data_40 column after the label.
X_pulm = data_df_40[pulmonary_data_40[1:]].values
y_pulm = data_df_40['PulmonaryLim'].values
scaler_pulm = StandardScaler(with_mean=False, with_std=False)
clf_pulm = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=0.1,
    min_samples_leaf=0.01,
    min_samples_split=0.01,
    n_estimators=50,
    oob_score=False,
)
# best_criteria is not passed here, so evaluate_model's default ('positives')
# applies, unlike the pulmonary cells that pass 'accuracy'.
clf_pulmonary_40, proba_pulm, y_test_pulm, fpr_plm, tpr_plm, scaler_40_pulmonary = evaluate_model(
    clf_pulm,
    X_pulm,
    y_pulm,
    scaler_pulm,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.830 +- 0.061
Sensitivity: 0.559 +- 0.199
Specificity: 0.871 +- 0.042
Positive Predictive Values: 0.370 +- 0.125
Accuracy: 0.831 +- 0.036
94
The highest ppv model has the following:
AUC: 0.969
Sensitivity: 1.000
Specificity: 0.950
Positive Predictive Values: 0.667
Accuracy: 0.955


## Other limitation model (label: MuscleSkeletalLim)

In [44]:
# Column lists for the "other" models, one per data_df_<N> table.
# The first entry of every list is the label column ('MuscleSkeletalLim'); the
# training cells slice it off with [1:] to obtain the feature columns.
other_data_100 = ['MuscleSkeletalLim','PeakRR', 'PeakVE','PeakVCO2','MeanVCO2','PeakVO2','PeakVO2Real',
                  'LowestVE/VCO2','MeanRER','PeakRER','VO2vsPeakVO2atVT','DiffPercentPeakVO2','MeanRR',
                  '75_to_100_VEVCO2Slope','DiffPeakVO2','MeanVE','second_half_VESlope','first_half_VEVCO2Slope',
                  '0_to_25_O2Slope','VO2atVT', 'MeanVO2','second_half_VCO2Slope','DiffPeakHR','MeanVE/VCO2','75_to_100_RRSlope']
other_data_90 = ['MuscleSkeletalLim','PeakVE','PeakRR','PeakVCO2','DiffPercentPeakVO2','PeakVO2','PeakVO2Real',
                'MeanRER','MeanRR','0_to_25_O2Slope','DiffPeakVO2','LowestVE/VCO2','MeanVCO2','MeanO2Pulse',
                 '75_to_100_HRSlope','MeanVE','0_to_25_VESlope','MeanVO2','VEvsVCO2Slope','0_to_25_VO2Slope',
                'second_half_VEVCO2Slope']
other_data_80 = ['MuscleSkeletalLim','PeakVCO2','PeakRR','PeakVE','LowestVE/VCO2','DiffPercentPeakVO2','MeanVCO2',
                'PeakVO2','PeakVO2Real','O2PulseDiff','DiffPeakVO2','MeanVO2','O2PulsePercent','0_to_25_HRSlope',
                 'MeanVE','MeanO2Pulse','VEvsVCO2Slope','MeanRER','PeakRER','second_half_VCO2Slope','15_to_85_RRSlope',
                 '50_to_75_VEVCO2Slope']
other_data_70 = ['MuscleSkeletalLim','first_half_O2Slope','PeakVCO2','DiffPercentPeakVO2','LowestVE/VCO2','PeakVE',
                'PeakVO2','PeakVO2Real','PeakRR','O2PulseDiff','O2PulsePercent','MeanVCO2','DiffPeakVO2',
                '0_to_25_VO2Slope','MeanO2Pulse','50_to_75_VO2Slope','MeanVO2','0_to_25_O2Slope','DiffPercentPeakHR']
other_data_60 = ['MuscleSkeletalLim','first_half_O2Slope','LowestVE/VCO2','PeakVCO2','PeakVE','DiffPercentPeakVO2',
                'O2PulsePercent','PeakVO2','PeakVO2Real','15_to_85_VEVCO2Slope','O2PulseDiff',
                 '0_to_25_HRSlope','MeanVCO2','first_half_VO2Slope','MeanVO2','PeakRR','MeanO2Pulse','second_half_VO2Slope',
                'MeanVE/VCO2','VEvsVCO2Slope','0_to_25_VCO2Slope']
other_data_50 = ['MuscleSkeletalLim','first_half_O2Slope','LowestVE/VCO2','25_to_50_O2Slope','0_to_25_HRSlope',
                 'first_half_VESlope','75_to_100_VO2Slope','first_half_VO2Slope','PeakVCO2','MeanVCO2','MeanVE/VCO2',
                '75_to_100_O2Slope','75_to_100_VCO2Slope','VEvsVCO2Slope','DiffPercentPeakVO2','HRvsVO2Slope',
                'PeakVO2','50_to_75_VEVCO2Slope']
other_data_40 = ['MuscleSkeletalLim','LowestVE/VCO2','MeanVCO2','first_half_HRSlope','MeanVO2','first_half_VO2Slope',
                'PeakVCO2','O2PulseDiff','MeanVE/VCO2','O2PulsePercent','PeakVO2','first_half_O2Slope',
                'second_half_VCO2Slope','PeakVE','MeanO2Pulse']

In [47]:
# "Other" model on data_df_100: features are every other_data_100 column after the label.
X_other = data_df_100[other_data_100[1:]].values
y_other = data_df_100['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
clf_other = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
clf_other_100, proba_other, y_test_other, fpr_other, tpr_other, scaler_100_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.843 +- 0.052
Sensitivity: 0.608 +- 0.125
Specificity: 0.856 +- 0.051
Positive Predictive Values: 0.537 +- 0.111
Accuracy: 0.802 +- 0.040
19
The highest ppv model has the following:
AUC: 0.913
Sensitivity: 0.571
Specificity: 0.983
Positive Predictive Values: 0.800
Accuracy: 0.939


In [60]:
# "Other" model on data_df_90: features are every other_data_90 column after the label.
X_other = data_df_90[other_data_90[1:]].values
y_other = data_df_90['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
clf_other = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
clf_other_90, proba_other, y_test_other, fpr_other, tpr_other, scaler_90_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.828 +- 0.057
Sensitivity: 0.603 +- 0.150
Specificity: 0.839 +- 0.058
Positive Predictive Values: 0.495 +- 0.116
Accuracy: 0.788 +- 0.047
44
The highest ppv model has the following:
AUC: 0.944
Sensitivity: 1.000
Specificity: 0.881
Positive Predictive Values: 0.500
Accuracy: 0.894


In [49]:
# "Other" model on data_df_80: features are every other_data_80 column after the label.
X_other = data_df_80[other_data_80[1:]].values
y_other = data_df_80['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
# Earlier configuration, kept for reference:
#clf_other = RandomForestClassifier(bootstrap=True, criterion="entropy",max_features=None,min_samples_leaf=0.01,
#                            min_samples_split=0.02,n_estimators=50, n_jobs=5, oob_score=True)
clf_other = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.035789473684210524,
    min_samples_split=0.01,
    n_estimators=100,
    oob_score=True,
)
# best_criteria is not passed here, so evaluate_model's default ('positives')
# applies, unlike the "other" cells that pass 'accuracy'.
clf_other_80, proba_other, y_test_other, fpr_other, tpr_other, scaler_80_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
)

AUC: 0.823 +- 0.053
Sensitivity: 0.681 +- 0.148
Specificity: 0.782 +- 0.063
Positive Predictive Values: 0.453 +- 0.097
Accuracy: 0.761 +- 0.047
61
The highest ppv model has the following:
AUC: 0.905
Sensitivity: 0.895
Specificity: 0.809
Positive Predictive Values: 0.654
Accuracy: 0.833


In [50]:
# "Other" model on data_df_70: features are every other_data_70 column after the label.
X_other = data_df_70[other_data_70[1:]].values
y_other = data_df_70['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
clf_other = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
clf_other_70, proba_other, y_test_other, fpr_other, tpr_other, scaler_70_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.803 +- 0.060
Sensitivity: 0.591 +- 0.148
Specificity: 0.837 +- 0.059
Positive Predictive Values: 0.479 +- 0.128
Accuracy: 0.788 +- 0.053
12
The highest ppv model has the following:
AUC: 0.876
Sensitivity: 0.667
Specificity: 0.947
Positive Predictive Values: 0.667
Accuracy: 0.909


In [51]:
# "Other" model on data_df_60: features are every other_data_60 column after the label.
X_other = data_df_60[other_data_60[1:]].values
y_other = data_df_60['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
clf_other = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
clf_other_60, proba_other, y_test_other, fpr_other, tpr_other, scaler_60_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.827 +- 0.052
Sensitivity: 0.586 +- 0.157
Specificity: 0.821 +- 0.055
Positive Predictive Values: 0.449 +- 0.115
Accuracy: 0.774 +- 0.048
87
The highest ppv model has the following:
AUC: 0.938
Sensitivity: 0.700
Specificity: 0.946
Positive Predictive Values: 0.700
Accuracy: 0.909


In [52]:
# "Other" model on data_df_50: features are every other_data_50 column after the label.
X_other = data_df_50[other_data_50[1:]].values
y_other = data_df_50['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
clf_other = RandomForestClassifier(
    bootstrap=True,
    criterion="entropy",
    max_features=None,
    min_samples_leaf=0.01,
    min_samples_split=0.02,
    n_estimators=50,
    n_jobs=5,
    oob_score=True,
)
clf_other_50, proba_other, y_test_other, fpr_other, tpr_other, scaler_50_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=100,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.768 +- 0.065
Sensitivity: 0.506 +- 0.151
Specificity: 0.816 +- 0.057
Positive Predictive Values: 0.412 +- 0.108
Accuracy: 0.750 +- 0.051
82
The highest ppv model has the following:
AUC: 0.872
Sensitivity: 0.769
Specificity: 0.906
Positive Predictive Values: 0.667
Accuracy: 0.879


In [58]:
# "Other" model on data_df_40: features are every other_data_40 column after the label.
X_other = data_df_40[other_data_40[1:]].values
y_other = data_df_40['MuscleSkeletalLim'].values
scaler_other = Normalizer(norm='l1')
# This cell uses a class-weighted forest and 300 resamples instead of 100.
clf_other = RandomForestClassifier(
    bootstrap=True,
    class_weight='balanced',
    criterion="entropy",
    max_features='sqrt',
    min_samples_leaf=0.035789473684210524,
    min_samples_split=0.29105263157894734,
    n_estimators=100,
    oob_score=True,
)
clf_other_40, proba_other, y_test_other, fpr_other, tpr_other, scaler_40_other = evaluate_model(
    clf_other,
    X_other,
    y_other,
    scaler_other,
    total_samples=300,
    train_size=0.7,
    smote=True,
    best_model=True,
    best_criteria='accuracy',
)

AUC: 0.736 +- 0.059
Sensitivity: 0.642 +- 0.133
Specificity: 0.675 +- 0.077
Positive Predictive Values: 0.341 +- 0.086
Accuracy: 0.668 +- 0.057
152
The highest ppv model has the following:
AUC: 0.850
Sensitivity: 0.727
Specificity: 0.836
Positive Predictive Values: 0.471
Accuracy: 0.818


## DataFrame creation

In [63]:
# The data_df_100 cardiac training cell stores its model as `clf_cardiac`;
# alias it under the suffixed name that the model list and save calls use.
clf_cardiac_100 = clf_cardiac

In [64]:
# Parallel lists: position i of every list refers to the same dataset, in the
# order given by `percentage`.
percentage = [100, 90, 80, 70, 60, 50, 40]
df_list = [
    data_df_100,
    data_df_90,
    data_df_80,
    data_df_70,
    data_df_60,
    data_df_50,
    data_df_40,
]

# Cardiac: column lists and selected models.
cardiac_col_list = [
    cardiac_data_100,
    cardiac_data_90,
    cardiac_data_80,
    cardiac_data_70,
    cardiac_data_60,
    cardiac_data_50,
    cardiac_data_40,
]
cardiac_model_list = [
    clf_cardiac_100,
    clf_cardiac_90,
    clf_cardiac_80,
    clf_cardiac_70,
    clf_cardiac_60,
    clf_cardiac_50,
    clf_cardiac_40,
]

# Pulmonary: column lists and selected models.
pulmonary_col_list = [
    pulmonary_data_100,
    pulmonary_data_90,
    pulmonary_data_80,
    pulmonary_data_70,
    pulmonary_data_60,
    pulmonary_data_50,
    pulmonary_data_40,
]
pulmonary_model_list = [
    clf_pulmonary_100,
    clf_pulmonary_90,
    clf_pulmonary_80,
    clf_pulmonary_70,
    clf_pulmonary_60,
    clf_pulmonary_50,
    clf_pulmonary_40,
]

# Other (MuscleSkeletalLim): column lists and selected models.
other_col_list = [
    other_data_100,
    other_data_90,
    other_data_80,
    other_data_70,
    other_data_60,
    other_data_50,
    other_data_40,
]
other_model_list = [
    clf_other_100,
    clf_other_90,
    clf_other_80,
    clf_other_70,
    clf_other_60,
    clf_other_50,
    clf_other_40,
]

In [69]:
# Persist every selected model together with its scaler. Each entry is
# (model, scaler, base file name); save_model writes '<name>.sav' and
# 'scaler_<name>.sav'. The order matches the original one-call-per-line cell.
artifacts_to_save = [
    # cardiac
    (clf_cardiac_100, scaler_100_cardiac, 'clf_cardiac_100'),
    (clf_cardiac_90, scaler_90_cardiac, 'clf_cardiac_90'),
    (clf_cardiac_80, scaler_80_cardiac, 'clf_cardiac_80'),
    (clf_cardiac_70, scaler_70_cardiac, 'clf_cardiac_70'),
    (clf_cardiac_60, scaler_60_cardiac, 'clf_cardiac_60'),
    (clf_cardiac_50, scaler_50_cardiac, 'clf_cardiac_50'),
    (clf_cardiac_40, scaler_40_cardiac, 'clf_cardiac_40'),
    # pulmonary
    (clf_pulmonary_100, scaler_100_pulmonary, 'clf_pulmonary_100'),
    (clf_pulmonary_90, scaler_90_pulmonary, 'clf_pulmonary_90'),
    (clf_pulmonary_80, scaler_80_pulmonary, 'clf_pulmonary_80'),
    (clf_pulmonary_70, scaler_70_pulmonary, 'clf_pulmonary_70'),
    (clf_pulmonary_60, scaler_60_pulmonary, 'clf_pulmonary_60'),
    (clf_pulmonary_50, scaler_50_pulmonary, 'clf_pulmonary_50'),
    (clf_pulmonary_40, scaler_40_pulmonary, 'clf_pulmonary_40'),
    # other
    (clf_other_100, scaler_100_other, 'clf_other_100'),
    (clf_other_90, scaler_90_other, 'clf_other_90'),
    (clf_other_80, scaler_80_other, 'clf_other_80'),
    (clf_other_70, scaler_70_other, 'clf_other_70'),
    (clf_other_60, scaler_60_other, 'clf_other_60'),
    (clf_other_50, scaler_50_other, 'clf_other_50'),
    (clf_other_40, scaler_40_other, 'clf_other_40'),
]
for fitted_model, fitted_scaler, artifact_name in artifacts_to_save:
    save_model(fitted_model, fitted_scaler, artifact_name)