In [7]:
import numpy as np
import pandas as pd
from scipy.stats import sem
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

df = pd.read_excel("pathologistes_prediction_and_target.xlsx", engine="openpyxl", sheet_name="data")
df.head()

Unnamed: 0,pathologist_01,pathologist_02,pathologist_03,pathologist_04,target
0,1,1,1,1,1
1,0,0,0,0,0
2,1,0,0,1,0
3,0,0,0,0,0
4,0,0,0,0,0


In [21]:
def compute_metrics(y_true, y_pred):
    compute_metric_with_CI(accuracy_score, y_true, y_pred)
    compute_metric_with_CI(precision_score, y_true, y_pred)
    compute_metric_with_CI(recall_score, y_true, y_pred)
    compute_metric_with_CI(f1_score, y_true, y_pred)
    compute_metric_with_CI(roc_auc_score, y_true, y_pred)
    compute_metric_with_CI(average_precision_score, y_true, y_pred)


def compute_metric_with_CI(metric, y_true, y_pred):
    n_bootstraps = 2000
    rng_seed = 42
    bootstrapped_scores = []
    rng = np.random.RandomState(rng_seed)
    
    for i in range(n_bootstraps):
        # bootstrap by sampling with replacement on the prediction indices
        indices = rng.randint(0, len(y_pred), len(y_pred))
        if len(np.unique(y_true[indices])) < 2:
            # We need at least one positive and one negative sample for ROC AUC
            # to be defined: reject the sample
            continue

        score = metric(y_true[indices], y_pred[indices])
        bootstrapped_scores.append(score)

    sorted_scores = np.array(bootstrapped_scores)
    sorted_scores.sort()

    # Computing the lower and upper bound of the 90% confidence interval
    # You can change the bounds percentiles to 0.025 and 0.975 to get
    # a 95% confidence interval instead.
    score = metric(y_true, y_pred)
    confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
    confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]

    print(f"{metric} {round(score, 4)}({round(confidence_lower, 4)}-{round(confidence_upper, 4)})")

In [22]:
df_01 = df.loc[:, ["pathologist_01", "target"]]

compute_metrics(df_01['pathologist_01'], df_01['target'])

<function accuracy_score at 0x13c163ca0> 0.8466(0.8011-0.8864)
<function precision_score at 0x13c16c9d0> 0.7059(0.5217-0.8824)
<function recall_score at 0x13c16caf0> 0.3529(0.2195-0.4865)
<function f1_score at 0x13c16c4c0> 0.4706(0.3214-0.6071)
<function roc_auc_score at 0x13c160790> 0.6589(0.5929-0.7289)
<function average_precision_score at 0x13c160550> 0.3741(0.2598-0.5074)


In [23]:
df_02 = df.loc[:, ["pathologist_02", "target"]]

compute_metrics(df_02['pathologist_02'], df_01['target'])

<function accuracy_score at 0x13c163ca0> 0.9375(0.9091-0.9659)
<function precision_score at 0x13c16c9d0> 0.7059(0.5-0.8889)
<function recall_score at 0x13c16caf0> 0.6667(0.4706-0.84)
<function f1_score at 0x13c16c4c0> 0.6857(0.5161-0.8205)
<function roc_auc_score at 0x13c160790> 0.8175(0.7196-0.904)
<function average_precision_score at 0x13c160550> 0.5047(0.3108-0.7033)


In [24]:
df_03 = df.loc[:, ["pathologist_03", "target"]]

compute_metrics(df_03['pathologist_03'], df_01['target'])

<function accuracy_score at 0x13c163ca0> 0.9148(0.8807-0.9489)
<function precision_score at 0x13c16c9d0> 0.6471(0.4444-0.8421)
<function recall_score at 0x13c16caf0> 0.55(0.3684-0.7333)
<function f1_score at 0x13c16c4c0> 0.5946(0.4138-0.7391)
<function roc_auc_score at 0x13c160790> 0.7558(0.6619-0.8474)
<function average_precision_score at 0x13c160550> 0.407(0.237-0.59)


In [25]:
df_04 = df.loc[:, ["pathologist_04", "target"]]

compute_metrics(df_04['pathologist_04'], df_01['target'])

<function accuracy_score at 0x13c163ca0> 0.9148(0.8807-0.9489)
<function precision_score at 0x13c16c9d0> 0.5294(0.3333-0.7391)
<function recall_score at 0x13c16caf0> 0.5625(0.3529-0.7692)
<function f1_score at 0x13c16c4c0> 0.5455(0.3478-0.7027)
<function roc_auc_score at 0x13c160790> 0.7562(0.6518-0.8622)
<function average_precision_score at 0x13c160550> 0.3376(0.1735-0.5329)
