This notebook creates precision-recall curves and receiver operating characteristic (ROC) curves from the repeated five-fold cross-validation runs (Figure 3). You must first generate 'select_perf.csv' by running the notebook "Model Selection + Statistical Tests + Data Visualization.ipynb".

In [None]:
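# TODO: make the max precision the final point in the graph, not 1.0
#       (prc_values below already overwrites the final precision value with the
#       maximum of the others -- confirm that covers this and then drop the TODO)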

In [None]:
#Import packages
import numpy as np
import pandas as pd
from sklearn.metrics import (roc_auc_score,
                             roc_curve,
                             precision_recall_curve,
                             auc,
                             average_precision_score
                             )
from scipy import interp
from scipy.stats import sem, t
import matplotlib.pyplot as plt
%matplotlib inline

# Results directory of the model-selection run; it is also reused further down
# as the sub-folder name under figs/ when the figures are saved.
rdir = 'results_2020-09-14'
# Performance table written by the model-selection notebook.
perf_csv_path = rdir + '/select_perf.csv'
perf = pd.read_csv(perf_csv_path)


In [None]:
import pdb 

def parse(pred):
    """Convert the text form of a 1-D numeric array into a list of floats.

    Parameters
    ----------
    pred : str
        Whitespace-separated numbers, optionally wrapped in square brackets
        and optionally spread over several lines.

    Returns
    -------
    list of float
        The parsed values in their original order (empty list for ``"[]"``).

    Raises
    ------
    ValueError
        If a token left after removing the brackets is not a valid float.
    """
    # Only the brackets are stripped. Newlines are left in place so that
    # str.split() treats them as separators; deleting them would glue the last
    # number of one line onto the first number of the next line.
    cleaned = pred.replace('[', '').replace(']', '')
    return [float(token) for token in cleaned.split()]

def prc_values(y,y_pred_proba):
    """Precision-recall curve resampled onto a fixed 21-point recall grid.

    Parameters
    ----------
    y : array-like
        True binary labels; the positive class is 1.
    y_pred_proba : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    mean_recall : numpy.ndarray
        ``np.linspace(0.0, 1, 21)``, the common recall grid.
    precision : numpy.ndarray
        Precision linearly interpolated at every value of ``mean_recall``.
    """
    precision, recall, _ = precision_recall_curve(y, y_pred_proba, pos_label=1)
    # The last precision entry returned by sklearn is a convention value (1.0),
    # not a measurement; replace it with the best precision actually observed.
    precision[-1] = np.max(precision[:-1])
    # np.interp needs increasing x-coordinates, so order both arrays by recall.
    # NOTE(review): the relative order of points sharing the same recall is
    # whatever argsort's default algorithm produces -- confirm that is acceptable.
    s = np.argsort(recall)
    precision = precision[s]
    recall = recall[s]
    mean_recall = np.linspace(0.0, 1, 21)
    # numpy.interp is used directly: `scipy.interp` was only a deprecated alias
    # of it and is no longer provided by newer SciPy releases.
    precision = np.interp(mean_recall, recall, precision)
    return mean_recall, precision

def roc_values(y,y_pred_proba):
    """ROC curve resampled onto a fixed 21-point false-positive-rate grid.

    Parameters
    ----------
    y : array-like
        True binary labels; the positive class is 1.
    y_pred_proba : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    mean_fpr : numpy.ndarray
        ``np.linspace(0, 1, 21)``, the common false-positive-rate grid.
    tpr : numpy.ndarray
        True-positive rate linearly interpolated at every value of ``mean_fpr``.
    """
    fpr,tpr, rocthresholds = roc_curve(y, y_pred_proba, pos_label=1)
    roc = pd.DataFrame(list(zip(fpr,tpr, rocthresholds)), columns =['fpr','tpr','thresholds'])
    # np.interp needs increasing x-coordinates, so order the curve by fpr.
    roc = roc.sort_values(by='fpr')
    mean_fpr = np.linspace(0, 1, 21)
    # numpy.interp is used directly: `scipy.interp` was only a deprecated alias
    # of it and is no longer provided by newer SciPy releases.
    tpr = np.interp(mean_fpr, roc['fpr'], roc['tpr'])
    return mean_fpr, tpr
# ---- Target names -----------------------------------------------------------
# Label column in the test CSVs -> short target name used in `perf`.
targets = {
    'htn_dx_ia': 'Htndx',
    'res_htn_dx_ia': 'ResHtndx',
    'htn_hypok_dx_ia': 'HtnHypoKdx',
    'HTN_heuristic': 'HtnHeuri',
    'res_HTN_heuristic': 'ResHtnHeuri',
    'hypoK_heuristic_v4': 'HtnHypoKHeuri',
}
# Short diagnosis target name -> CSV column holding the matching heuristic.
heuristics = {
    'Htndx': 'HTN_heuristic',
    'ResHtndx': 'res_HTN_heuristic',
    'HtnHypoKdx': 'hypoK_heuristic_v4',
}
# Inverse of `targets`: short target name -> label column.
targets_rev = dict((short, column) for column, short in targets.items())

# Short target names and their display titles, listed in matching order.
dnames = ['Htndx', "HtnHypoKdx", "ResHtndx", 'HtnHeuri', 'HtnHypoKHeuri', "ResHtnHeuri"]
dnames_nice = [
    'HTN Diagnosis',
    "HTN-Hypokalemia Diagnosis",
    "Resistant HTN Diagnosis",
    'HTN Heuristic',
    'Htn-Hypokalemia Heuristic',
    "Resistant HTN Heuristic",
]
dnames_to_nice = dict(zip(dnames, dnames_nice))

folds = ['A', 'B', 'C', 'D', 'E']

# ---- Model names ------------------------------------------------------------
# Model identifiers as stored in `perf` and their short display labels,
# listed in matching order.
models = [
    'RandomForest',
    'DecisionTree',
    'Feat_boolean',
    'Feat_boolean_L1',
    'GaussianNaiveBayes',
    'LogisticRegression_L2',
    'LogisticRegression_L1',
]
model_nice = ['RF', 'DT', 'FEAT', 'FEAT L1', 'GNB', 'LR L2', 'LR L1']
nice_model_labels = dict(zip(models, model_nice))
nice_to_ugly = dict((label, name) for name, label in nice_model_labels.items())

# ---- Plot styling -----------------------------------------------------------
markers = ('^', 'o', 's', 'p', 'h', 'D', 'P', 'X', '*', 'v', '<', '>',)
# Order in which the models are drawn (display labels).
order = ['GNB', 'DT', 'LR L2', 'LR L1', 'RF', 'FEAT']
# Marker used for each display label.
marker_choice = {
    'GNB': 's',
    'DT': 'o',
    'LR L2': 'd',
    'LR L1': 'P',
    'RF': (4, 1, 0),  # previously 'D'
    'FEAT': 'X',
}


# Figure 3: for the selected target, average each model's precision-recall and
# ROC curves over the folds of a run and then over the runs, and draw all models
# side by side together with the heuristic's single operating point.
spacing = 5    # passed to markevery: one marker per 5 curve points
fontsize = 14  # font size of the axis labels and the figure title
for target_new, perf_t in perf.groupby('target'):
    # Only the 'ResHtndx' target is drawn; every other target is skipped.
    if target_new != 'ResHtndx':
        continue

    # A heuristic reference point exists only for targets listed in `heuristics`.
    has_heuristic = target_new in heuristics

    fig = plt.figure(figsize=(12,6))
    ax1 = fig.add_subplot(1,2,1)  # left panel: precision-recall curves
    ax2 = fig.add_subplot(1,2,2)  # right panel: ROC curves

    # Per-run means of the heuristic's metrics. They are (re)created for every
    # target so a target without a heuristic can neither hit an undefined name
    # nor reuse values left over from a previous target.
    mean_run_precision_h = []
    mean_run_recall_h = []
    mean_run_fpr_h = []
    mean_run_tpr_h = []
    heuristic_collected = False

    # `nice_name` deliberately differs from the module-level `model_nice` list
    # so that iterating here does not overwrite that list.
    for nice_name in order:
        model = nice_to_ugly[nice_name]
        perf_t_m = perf_t.loc[perf_t.model==model]
        if model == 'Feat_boolean_L1':
            continue
        if perf_t_m.empty:
            # Without rows there is nothing to average and no curve to draw.
            print('skipping ',target_new,model,'(no rows in perf)')
            continue
        print('graphing ',target_new,model)

        # The heuristic does not depend on the model, so it is evaluated only
        # while the first plotted model is being processed.
        collect_heuristic = has_heuristic and not heuristic_collected

        mean_run_precisions = []
        mean_run_tprs = []
        for RunID, perf_t_m_id in perf_t_m.groupby('RunID'):
            precisions = []
            tprs = []
            precisions_h = []
            recalls_h = []
            fprs_h = []
            tprs_h = []
            for fold, perf_t_m_id_f in perf_t_m_id.groupby('fold'):

                # True labels come from the test split of this run and fold.
                df = pd.read_csv('../Dataset' + str(RunID) + '/' + target_new + '/' + target_new + fold
                                 + 'Test.csv')
                y = df[targets_rev[target_new]].values

                if collect_heuristic:
                    # The heuristic is a binary prediction, so it yields one
                    # (recall, precision) point and one (fpr, tpr) point.
                    y_heuristic = df[heuristics[target_new]].values
                    true_pos_h = np.sum((y==1) & (y_heuristic==1))
                    precision_h = true_pos_h/np.sum(y_heuristic==1)
                    recall_h = true_pos_h/np.sum(y==1)
                    fpr_h = np.sum((y==0) & (y_heuristic==1))/np.sum(y==0)
                    precisions_h.append(precision_h)
                    recalls_h.append(recall_h)
                    fprs_h.append(fpr_h)
                    tprs_h.append(recall_h)  # tpr is the same quantity as recall

                # Predicted probabilities: exactly one row per (run, fold, model).
                assert(len(perf_t_m_id_f)==1)
                # NOTE(review): eval() executes whatever text the CSV cell holds.
                # If 'pred_proba' is always a plain list literal,
                # ast.literal_eval would be a safer drop-in -- confirm the
                # stored format before switching.
                y_pred_proba = eval(perf_t_m_id_f['pred_proba'].values[0])

                # Precision / Recall
                mean_recall, precision = prc_values(y,y_pred_proba)
                precisions.append(precision)

                # ROC
                mean_fpr, tpr = roc_values(y,y_pred_proba)
                tprs.append(tpr)

            # Average the fold curves of this run.
            mean_run_precisions.append(np.mean(precisions, axis=0))
            mean_run_tprs.append(np.mean(tprs, axis=0))
            if collect_heuristic:
                mean_run_precision_h.append(np.mean(precisions_h, axis=0))
                mean_run_recall_h.append(np.mean(recalls_h, axis=0))
                mean_run_fpr_h.append(np.mean(fprs_h, axis=0))
                mean_run_tpr_h.append(np.mean(tprs_h, axis=0))

        # Average the per-run curves over all runs.
        mean_precisions = np.mean(mean_run_precisions, axis=0)
        mean_tprs = np.mean(mean_run_tprs, axis=0)

        # Precision/Recall plot
        ax1.plot(mean_recall, mean_precisions,
                 alpha=1,
                 label=nice_model_labels[model],
                 marker = marker_choice[nice_name],
                 markevery=spacing)
        if model == 'Feat_boolean':
            # Two-sided 95% Student-t band across runs (1.95/2 = 0.975 quantile),
            # clipped to the valid [0, 1] range.
            std_err = sem(mean_run_precisions, axis=0)
            h = std_err * t.ppf(1.95/2, len(mean_run_precisions) - 1)
            precisions_upper = np.minimum(mean_precisions + h, 1)
            precisions_lower = np.maximum(mean_precisions - h, 0)
            ax1.fill_between(mean_recall, precisions_lower, precisions_upper,
                             color='grey', alpha=.2, label=r'95% Confidence Interval')

        # ROC plot
        ax2.plot(mean_fpr, mean_tprs,
                 alpha=1,
                 label=nice_model_labels[model],
                 marker = marker_choice[nice_name],
                 markevery=spacing)
        if model == 'Feat_boolean':
            std_err = sem(mean_run_tprs, axis=0)
            h = std_err * t.ppf(1.95/2, len(mean_run_tprs) - 1)
            tprs_upper = np.minimum(mean_tprs + h, 1)
            tprs_lower = np.maximum(mean_tprs - h, 0)
            ax2.fill_between(mean_fpr, tprs_lower, tprs_upper,
                             color='grey', alpha=.2, label=r'95% Confidence Interval')

        heuristic_collected = True

    # Chance diagonal, drawn once after the model curves instead of once per model.
    ax2.plot([0,1],[0,1],'--k',label=None)

    # Heuristic performance: a single black marker on each panel.
    if has_heuristic and mean_run_recall_h:
        mean_recall_h = np.mean(mean_run_recall_h, axis=0)
        mean_precision_h = np.mean(mean_run_precision_h, axis=0)
        mean_fpr_h = np.mean(mean_run_fpr_h, axis=0)
        mean_tpr_h = np.mean(mean_run_tpr_h, axis=0)
        print(mean_recall_h, mean_precision_h, mean_fpr_h, mean_tpr_h)
        ax1.plot(mean_recall_h,
                 mean_precision_h,
                 'Xk',
                 label='Heuristic',
                )
        ax2.plot(mean_fpr_h,
                 mean_tpr_h,
                 'Xk',
                 label='Heuristic',
                )

    fig.suptitle(dnames_to_nice[target_new], fontsize=fontsize)
    ax1.set_xlabel("Recall (Sensitivity)", fontsize=fontsize)
    ax1.set_ylabel("Precision", fontsize=fontsize)
    ax2.set_xlabel("1 - Specificity", fontsize=fontsize)
    ax2.set_ylabel("Sensitivity", fontsize=fontsize)

    # The single legend sits on the ROC panel, which is where plt.legend placed
    # it (ax2 is the most recently created axes).
    ax2.legend(loc='best')
    for filetype in ['.svg','.png','.pdf']:
        fig.savefig('figs/'+rdir + '/' + target_new + '_PRC_ROC'+ filetype, dpi=400)

# plt.show()        

        
