In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from supervised.automl import AutoML
import sklearn.metrics
from IPython.utils import io

In [None]:
if __name__ == '__main__':
    # Labels of the training-set variants; each label is also a key of `train`.
    train_names = ["real"]

    # Training frames keyed by variant label, plus the single test frame.
    # Separator and decimal mark are spelled out explicitly for both files.
    train = {
        "real": pd.read_csv('./split_train.csv', sep=',', decimal='.'),
    }
    test = pd.read_csv('./split_test.csv', sep=',', decimal='.')

In [None]:
%%capture
if __name__ == '__main__':
    # Input columns used by each model, keyed by model identifier.
    # NOTE(review): 'sNeurpilin' is kept exactly as written because it is used
    # as a column label on the loaded frames -- confirm the CSV header spelling.
    feature_cols = {
        "AllData": [
            'Age', 'BMI', 'sNeurpilin', 'sTie-2', 'IL-8', 'Follistatin', 'Leptin', 'G-CSF',
        ],
        "BMI-Biomarkers": [
            'BMI', 'sNeurpilin', 'sTie-2', 'IL-8', 'Follistatin', 'Leptin', 'G-CSF',
        ],
        "Biomarkers": [
            'sNeurpilin', 'sTie-2', 'IL-8', 'Follistatin', 'Leptin', 'G-CSF',
        ],
        "BMI-BiomarkersNoLeptin": [
            'BMI', 'sNeurpilin', 'sTie-2', 'IL-8', 'Follistatin', 'G-CSF',
        ],
        "BiomarkersNoBMINoLeptin": [
            'sNeurpilin', 'sTie-2', 'IL-8', 'Follistatin', 'G-CSF',
        ],
        "Selected": ['Age', 'IL-8', 'Leptin', 'G-CSF'],
        "sNeurpilin": ['sNeurpilin'],
        "sTie-2": ['sTie-2'],
        "IL-8": ['IL-8'],
        "Leptin": ['Leptin'],
        "Follistatin": ['Follistatin'],
        "G-CSF": ['G-CSF'],
        "BMI": ['BMI'],
    }

    # Human-readable label for each model identifier (used in plot legends).
    feature_names = {
        "AllData": "Age + BMI + AFs",
        "BMI-Biomarkers": "BMI + AFs",
        "Biomarkers": "AFs",
        "BMI-BiomarkersNoLeptin": "BMI + AFs w/o Leptin",
        "BiomarkersNoBMINoLeptin": "AFs w/o Leptin",
        "sNeurpilin": 'sNeurpilin',
        "sTie-2": 'sTie-2',
        "IL-8": 'IL-8',
        "Leptin": 'Leptin',
        "Selected": 'Age + IL-8 + Leptin + G-CSF',
        "Follistatin": 'Follistatin',
        "G-CSF": 'G-CSF',
        "BMI": "BMI",
    }

    # Result containers; all of them are keyed by "<model>_<train variant>".
    X_train, y_train = {}, {}
    X_test, y_test = {}, {}
    y_predicted, y_predicted_train = {}, {}
    fpr, tpr, thresholds = {}, {}, {}
    fpr_train, tpr_train, thresholds_train = {}, {}, {}

    # For every (model, training variant) pair: slice the inputs, obtain class
    # probabilities on the test and training frames, and derive ROC curves.
    for selectedModelName in feature_cols:
        for selectedTrainData in train_names:
            data_key = f"{selectedModelName}_{selectedTrainData}"

            # No fit() is called here; the AutoML object is only pointed at a
            # results directory -- presumably it loads a previously trained
            # model from there on predict (TODO confirm against mljar docs).
            model = AutoML(results_path=f"./{data_key}_compete_auc")

            selectedModel = feature_cols[selectedModelName]

            X_train[data_key] = train[selectedTrainData][selectedModel]
            y_train[data_key] = train[selectedTrainData]["Case"]
            X_test[data_key] = test[selectedModel]
            y_test[data_key] = test["Case"]

            y_predicted[data_key] = model.predict_proba(X_test[data_key])
            y_predicted_train[data_key] = model.predict_proba(X_train[data_key])

            # Column 1 of the probability output is used as the ROC score.
            (
                fpr[data_key],
                tpr[data_key],
                thresholds[data_key],
            ) = sklearn.metrics.roc_curve(y_test[data_key], y_predicted[data_key][:, 1])
            (
                fpr_train[data_key],
                tpr_train[data_key],
                thresholds_train[data_key],
            ) = sklearn.metrics.roc_curve(y_train[data_key], y_predicted_train[data_key][:, 1])
            

In [None]:
def _plot_roc_figure(data_keys, curve_labels, fpr_by_key, tpr_by_key, title):
    """Draw and show one ROC figure containing one curve per data key.

    Parameters
    ----------
    data_keys : sequence of str
        Keys into ``fpr_by_key`` / ``tpr_by_key``; one curve is drawn per key.
    curve_labels : sequence of str
        Legend name for each key, in the same order as ``data_keys``.
    fpr_by_key, tpr_by_key : dict
        False/true positive rate arrays indexed by data key.
    title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(8, 6), dpi=300)
    for data_key, curve_label in zip(data_keys, curve_labels):
        area = sklearn.metrics.auc(fpr_by_key[data_key], tpr_by_key[data_key])
        ax.plot(fpr_by_key[data_key], tpr_by_key[data_key],
                label='ROC curve for {0} (area = {1:0.2f})'.format(curve_label, area))

    # Dashed diagonal as the chance-level reference line.
    ax.plot([0, 1], [0, 1], 'k--')
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(title)
    ax.legend(loc="lower right")
    plt.show()
    # Release the figure explicitly so the many 300-dpi figures produced by
    # this cell do not pile up under backends where show() keeps them alive.
    plt.close(fig)


if __name__ == '__main__':
    # Models grouped by how many input features they use.
    stat_groups = {
        "Univariate": ["BMI", "sNeurpilin", "sTie-2", "IL-8", "Follistatin", "Leptin", "G-CSF" ],
        "Multivariate": ["AllData", "BMI-Biomarkers", "Biomarkers", "BMI-BiomarkersNoLeptin", "BiomarkersNoBMINoLeptin", "Selected" ],
    }

    for stat_group, group_models in stat_groups.items():
        print("Group " + stat_group)
        for selectedTrainData in train_names:
            print("Data for " + selectedTrainData)
            data_keys = [name + "_" + selectedTrainData for name in group_models]
            curve_labels = [feature_names[name] for name in group_models]

            # All models of the group overlaid: training ROCs, then test ROCs.
            _plot_roc_figure(data_keys, curve_labels, fpr_train, tpr_train,
                             'ROC Curve for training')
            _plot_roc_figure(data_keys, curve_labels, fpr, tpr,
                             'ROC Curve for tests')

            # One separate figure per model, showing its training ROC only.
            for data_key, curve_label in zip(data_keys, curve_labels):
                _plot_roc_figure([data_key], [curve_label], fpr_train, tpr_train,
                                 'ROC ')

In [None]:
import numpy as np
import scipy.stats as stats

def get_metric_result(cv_results):
    """Summarise the successful runs of a cross-validation result mapping.

    Parameters
    ----------
    cv_results : dict
        Column-oriented results. It must contain the keys ``status``,
        ``rank_test_scores``, ``param_classifier:__choice__`` and
        ``mean_test_score``; any key starting with ``metric_`` is included
        in the output as well.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``status`` equals ``"Success"``, restricted to the three
        fixed columns followed by every ``metric_*`` column.
    """
    frame = pd.DataFrame.from_dict(cv_results)
    succeeded = frame.loc[frame['status'] == "Success"]

    metric_columns = [name for name in cv_results.keys() if name.startswith('metric_')]
    wanted = ['rank_test_scores', 'param_classifier:__choice__', 'mean_test_score'] + metric_columns
    return succeeded[wanted]

# Per-model report: pick a decision threshold on the training ROC, binarise the
# predicted probabilities with it, and print classification metrics, confusion
# matrices and Fisher exact tests for the training and the test data.
for selectedTrainData in train_names:
    print("#" * 80)
    print("##### Data for " + selectedTrainData + " #####")

    for selectedModelName in feature_cols:
        data_key = selectedModelName + "_" + selectedTrainData
        print("Model: ", selectedModelName)
        print("-" * 80)

        # Class-1 probabilities: the same scores the ROC curves were built from.
        scores_train = y_predicted_train[data_key][:, 1]
        scores_test = y_predicted[data_key][:, 1]

        # Threshold maximising Youden's J (TPR - FPR) on the training ROC.
        optimal_thr_idx = np.argmax(tpr_train[data_key] - fpr_train[data_key])
        thr = thresholds_train[data_key][optimal_thr_idx]
        print("Threshold (train): ", thr)

        # roc_curve defines each point as "score >= threshold"; the comparison
        # must be inclusive, otherwise samples scoring exactly `thr` flip to
        # negative and the operating point differs from the one selected above.
        predictions_train = np.where(scores_train >= thr, 1, 0)
        predictions_test = np.where(scores_test >= thr, 1, 0)

        # The shared fpr/tpr/thresholds dictionaries are deliberately left
        # untouched: overwriting them with a ROC of the hard labels would
        # corrupt the test ROC plots whenever the plotting cell is re-run.

        print("Accuracy score (train)", sklearn.metrics.accuracy_score(y_train[data_key], predictions_train))
        print("Accuracy score (test)", sklearn.metrics.accuracy_score(y_test[data_key], predictions_test))
        print("Precision score (train)", sklearn.metrics.precision_score(y_train[data_key], predictions_train))
        print("Precision score (test)", sklearn.metrics.precision_score(y_test[data_key], predictions_test))
        print("F1 score (train)", sklearn.metrics.f1_score(y_train[data_key], predictions_train))
        print("F1 score (test)", sklearn.metrics.f1_score(y_test[data_key], predictions_test))
        # AUC is threshold-free and is computed from the probabilities; feeding
        # the 0/1 labels would report balanced accuracy at `thr` instead.
        print("AUC score (train)", sklearn.metrics.roc_auc_score(y_train[data_key], scores_train))
        print("AUC score (test)", sklearn.metrics.roc_auc_score(y_test[data_key], scores_test))
        print("-" * 80)
        print("Train Report\n", sklearn.metrics.classification_report(y_train[data_key], predictions_train))
        print("Test Report\n", sklearn.metrics.classification_report(y_test[data_key], predictions_test))
        print("-" * 80)

        # Confusion matrix and Fisher exact test: training data first, then test.
        for y_true, y_pred in (
            (y_train[data_key], predictions_train),
            (y_test[data_key], predictions_test),
        ):
            conmatrix = sklearn.metrics.confusion_matrix(y_true, y_pred)
            print(conmatrix)
            odd_ratio, p_value = stats.fisher_exact(conmatrix)
            print('odd ratio is : ' + str(odd_ratio))
            print('p_value is : ' + str(p_value))
            print("")