# Evaluation Notebook
- Calculate the evaluation scores for each model and test site, and format them into tables

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    confusion_matrix,
    f1_score,
    roc_auc_score,
)
from sklearn.utils import resample

## Specify parameters and import data

In [2]:
# --- Model parameters
# Name of this evaluation run; it is used as the name of the export sub-folder.
model_name = "Evaluation_Revised"

# --- Paths
# Base data path
base_data_path = Path("../00_Data/")
# Dataset path (the test-set CSVs are read from here)
dataset_path = base_data_path / "publication_ready"
# Model path (prediction CSVs are read from sub-folders of this directory)
model_path = base_data_path / "model_output"
# Export path (score tables produced by this notebook are written here)
export_path = model_path / model_name

# Fail early, with a readable message, if any of the input directories is missing
# instead of failing later inside read_csv / mkdir.
for required_dir in (base_data_path, dataset_path, model_path):
    assert required_dir.is_dir(), \
        f"{required_dir} either doesn't exist or is not a directory."
export_path.mkdir(parents=True, exist_ok=True)

# --- Misc settings
# Print dataframes and keys to debug
debug = True

In [3]:
# Raw label names; they are renamed to their display form when the CSVs are loaded.
labels = [
    'urinary', 'respiratory', 'abdominal', 'neurological', 'skin_soft_tissue', 'ent',
    'orthopaedic', 'other_specific', 'no_specific_source', 'prophylaxis', 'uncertainty',
    'not_informative',
]

# Display names: underscores become spaces and every word is capitalised,
# except the acronym "ent", which is written fully upper-case.
labels_pretty = [
    "ENT" if raw_name == "ent" else " ".join(map(str.capitalize, raw_name.split("_")))
    for raw_name in labels
]

# Lookup tables between positions, raw names and display names.
id2label = dict(enumerate(labels))
label2id = {raw_name: position for position, raw_name in enumerate(labels)}
labels2labels_pretty = dict(zip(labels, labels_pretty))

# Test-set CSV per site, keyed by the site name used in prediction/export filenames.
test_set_files = {
    "Oxford": 'testing_oxford_2023-08-23.csv',
    "Banbury": 'testing_banbury_2023-08-23.csv',
}

# Only the literal string "NA" is parsed as missing (pandas' default NA markers are
# switched off), "Indication" is forced to text, and the label columns are renamed
# to their display names.
test_set_raw = {
    site: pd.read_csv(
        dataset_path / csv_name,
        dtype={"Indication": str},
        keep_default_na=False,
        na_values=["NA"],
    ).rename(columns=labels2labels_pretty)
    for site, csv_name in test_set_files.items()
}

# Per-site aliases (the same DataFrame objects that are stored in test_set_raw).
test_oxford_df = test_set_raw["Oxford"]
test_banbury_df = test_set_raw["Banbury"]


In [4]:
# Models to evaluate, iterated in this order by the result cells below.
#   has_proba       -- True: scores are read from "predictions_proba_<descriptor>_<site>.csv";
#                      False: the binarised prediction file is used for the scores as well.
#   file_location   -- optional folder name under model_path (falls back to the dict key).
#   file_descriptor -- optional tag used in the CSV filename (falls back to the dict key).
predictions_dict = {
    "Regex": {"has_proba": False},
    "XGBoost": {"has_proba": True},
    "Base_BERT": {"has_proba": True},
    "Bio_ClinicalBERT": {"has_proba": True},
    "GPT3.5": {
        "has_proba": False,
        "file_location": "Gpt-3.5-turbo-0125-Finetuned-Json",
        "file_descriptor": "ft",
    },
    "GPT4": {
        "has_proba": False,
        "file_location": "Gpt-4-0125-preview-Zero_Shot-Json",
        "file_descriptor": "zs",
    },
}

## Evaluation methods

In [5]:
def per_class_accuracy(y_true, y_pred):
    """Return the accuracy of every label column separately.

    Rows are compared by position, so any pandas index on the inputs is ignored.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_classes)
        Ground-truth and predicted label indicators. DataFrames, numpy arrays
        and nested lists are all accepted.

    Returns
    -------
    list of float
        One entry per column: the fraction of samples whose prediction equals
        the ground truth for that column.

    Raises
    ------
    ValueError
        If the inputs are not two-dimensional or their shapes differ.
    """
    truth = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if truth.ndim != 2 or truth.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must both have shape (n_samples, n_classes); "
            f"got {truth.shape} and {predicted.shape}."
        )
    # Column-wise mean of the element-wise agreement mask == per-class accuracy.
    return (truth == predicted).mean(axis=0).tolist()

def weighted_accuracy(y_true, class_accuracies):
    """Average the per-class accuracies, weighting each class by its share of positives.

    The weight of a class is its column sum in ``y_true`` divided by the sum over
    all columns. The results agree with sklearn's "average=weighted" method.
    """
    # Number of positive entries per class (column sums)
    positives_per_class = np.sum(y_true, axis=0)
    # Share of all positive entries that belongs to each class
    class_weights = positives_per_class / np.sum(positives_per_class)
    # Weighted sum of the per-class accuracies
    return np.sum(class_weights * class_accuracies)


# Metrics function
def _summary_scores(y_true, predictions_probs, predictions_binarised, averaging_method):
    """Return the five label-averaged scores for one sample set as ``np.float64`` values."""
    class_accuracies = per_class_accuracy(y_true=y_true, y_pred=predictions_binarised)
    if averaging_method == "weighted":
        accuracy = weighted_accuracy(y_true=y_true, class_accuracies=class_accuracies)
    else:
        # Every label column has the same number of rows, so the micro and the macro
        # average of the per-class accuracies both reduce to their plain mean.
        accuracy = np.mean(class_accuracies)

    raw_scores = {
        "F1-Score": f1_score(y_true=y_true, y_pred=predictions_binarised, average=averaging_method),
        "ROC AUC": roc_auc_score(y_true=y_true, y_score=predictions_probs, average=averaging_method),
        "PR AUC": average_precision_score(y_true=y_true, y_score=predictions_probs, average=averaging_method),
        "Accuracy": accuracy,
        "Total Accuracy": accuracy_score(y_true=y_true, y_pred=predictions_binarised),
    }
    # sklearn may hand back plain Python floats; normalise so callers can rely on
    # numpy scalars (e.g. for rounding).
    return {name: np.float64(value) for name, value in raw_scores.items()}


def calculate_metrics(y_true,
                      predictions_probs,
                      predictions_binarised,
                      labels,
                      result_precision=2,
                      averaging_method = "weighted",
                      n_iter=1000,
                      ci=0.95,
                      random_state=None,
    ):
    """Score multi-label predictions per class, on average, and with bootstrap CIs.

    Parameters
    ----------
    y_true : pandas.DataFrame of shape (n_samples, n_classes)
        Ground-truth label indicators.
    predictions_probs : pandas.DataFrame of shape (n_samples, n_classes)
        Scores used for ROC AUC and PR AUC.
    predictions_binarised : pandas.DataFrame of shape (n_samples, n_classes)
        Hard predictions used for F1 and the accuracies.
    labels : list of str
        Column names given to the per-class score table.
    result_precision : int
        Number of decimals used in ``metrics_string`` (the numeric outputs are not rounded).
    averaging_method : str
        Passed as ``average=`` to the sklearn metrics. "Accuracy" is support-weighted
        for "weighted" and the plain mean of the per-class accuracies otherwise.
    n_iter : int
        Number of bootstrap resamples; ``0`` skips the bootstrap (CI bounds are NaN).
    ci : float
        Confidence level of the percentile interval, e.g. 0.95.
    random_state : int, numpy.random.RandomState or None
        Seed or generator for the bootstrap. ``None`` is handed to ``resample``
        unchanged, which then draws from numpy's global random state.

    Returns
    -------
    scores_per_class : pandas.DataFrame
        One row per metric, one column per label. The "Total Accuracy" row has no
        per-class values and is all NaN.
    scores_average : dict of str -> numpy.float64
        Label-averaged score per metric; "Total Accuracy" is ``accuracy_score``
        applied to the whole label matrix.
    scores_ci : dict of str -> numpy.ndarray
        ``[lower, upper]`` percentile bounds of the bootstrapped averaged scores.
    metrics_string : str
        One line per metric: ``"<name>: <average> (<class min>-<class max>)"``.

    NOTE(review): a bootstrap resample without any positive (or any negative) example
    of some label may make ``roc_auc_score`` fail or warn, depending on the sklearn
    version -- confirm for very rare labels.
    """
    # Calculate per class metrics
    scores_per_class_dict = {
        "F1-Score": f1_score(y_true=y_true, y_pred=predictions_binarised, average=None),
        "ROC AUC": roc_auc_score(y_true=y_true, y_score=predictions_probs, average=None),
        "PR AUC": average_precision_score(y_true=y_true, y_score=predictions_probs, average=None),
        "Accuracy": per_class_accuracy(y_true=y_true, y_pred=predictions_binarised),
        # No per-class equivalent: the empty list becomes an all-NaN row.
        "Total Accuracy": [],
    }
    scores_per_class = pd.DataFrame.from_dict(scores_per_class_dict, orient='index', columns=labels)

    # Calculate average metrics
    scores_average = _summary_scores(y_true, predictions_probs, predictions_binarised, averaging_method)

    # Build the generator once so that successive resamples differ while the whole
    # sequence stays reproducible for a given integer seed.
    if isinstance(random_state, (int, np.integer)):
        bootstrap_rng = np.random.RandomState(random_state)
    else:
        bootstrap_rng = random_state

    # Bootstrap the averaged metrics
    iter_scores = {score_name: [] for score_name in scores_average}
    for _ in range(n_iter):
        # Resample the evaluated samples with replacement (rows stay aligned across the three inputs)
        y_true_resampled, y_pred_probs_resampled, y_pred_bin_resampled = resample(
            y_true, predictions_probs, predictions_binarised,
            replace=True, random_state=bootstrap_rng,
        )
        resampled_scores = _summary_scores(
            y_true_resampled, y_pred_probs_resampled, y_pred_bin_resampled, averaging_method
        )
        for score_name, score_value in resampled_scores.items():
            iter_scores[score_name].append(score_value)

    # Calculate the percentile confidence interval
    tail = (1 - ci) / 2
    percentiles = [tail * 100, (1 - tail) * 100]
    scores_ci = {
        score_name: np.percentile(score_values, percentiles) if score_values else np.full(2, np.nan)
        for score_name, score_values in iter_scores.items()
    }

    # Format into printable string
    metrics_string = ""
    for score_name, avg_score_value in scores_average.items():
        class_scores = scores_per_class.loc[score_name]
        avg_score = np.round(avg_score_value, result_precision)
        min_score = np.round(class_scores.min(), result_precision)
        max_score = np.round(class_scores.max(), result_precision)
        metrics_string += f"{score_name}: {avg_score} ({min_score}-{max_score})\n"

    return scores_per_class, scores_average, scores_ci, metrics_string

# for location, y_test_true in test_set_raw.items():
#     y_test_true = test_banbury_df[labels_pretty]
#     y_pred = pd.read_csv("/home/kevin/DPhil/Projects/EHR-Indication-Processing/00_Data/model_output/Bio_ClinicalBERT/predictions_Bio_ClinicalBERT_Banbury.csv").rename(columns=labels2labels_pretty)[labels_pretty]\
#             .fillna(0)
#     d = calculate_metrics(y_test_true, y_pred, y_pred, labels_pretty)
#     print(d)
#     break

Full result table (main section)

In [6]:
# --- Reproducibility
# calculate_metrics bootstraps via sklearn's resample(); when no random_state is given
# it draws from numpy's global random state, so seeding that makes the exported
# confidence intervals repeatable on a fresh run of this cell.
BOOTSTRAP_SEED = 42
np.random.seed(BOOTSTRAP_SEED)

# --- Repeat for each location
for location, test_set_df in test_set_raw.items():
    y_test_true = test_set_df[labels_pretty]

    # --- Repeat for each model
    scores_full_list = []
    # NOTE: the loop variable is deliberately not called `model_name`, so the
    # run-level `model_name` from the configuration cell is left untouched.
    for predictor_name, metadata in predictions_dict.items():
        # --- Build filepaths and parse CSVs
        file_folder = metadata.get("file_location", predictor_name)
        file_descriptor = metadata.get("file_descriptor", predictor_name)
        file_proba_tag = "_proba" if metadata["has_proba"] else ""

        predictions_bin_path = model_path / file_folder / f"predictions_{file_descriptor}_{location}.csv"
        predictions_prob_path = model_path / file_folder / f"predictions{file_proba_tag}_{file_descriptor}_{location}.csv"

        # Missing predictions are counted as "label not predicted" (0).
        predictions_binarised = pd.read_csv(predictions_bin_path)\
            .rename(columns=labels2labels_pretty)[labels_pretty]\
            .fillna(0)
        if predictions_prob_path == predictions_bin_path:
            # No separate probability file: the hard predictions double as scores.
            predictions_proba = predictions_binarised
        else:
            predictions_proba = pd.read_csv(predictions_prob_path)\
                .rename(columns=labels2labels_pretty)[labels_pretty]\
                .fillna(0)

        # Rows are matched to the ground truth by position, so the lengths must agree.
        assert len(predictions_binarised) == len(y_test_true) == len(predictions_proba), \
            f"{predictor_name}/{location}: predictions and test set differ in length."

        if predictor_name == "Regex":
            print(f"Model: {predictor_name}, Location: {location}, Category: Not Informative")
            print(confusion_matrix(y_true=y_test_true["Not Informative"], y_pred=predictions_binarised["Not Informative"]))

        # Calculate the score
        scores_per_class, scores_average, scores_ci, _ = calculate_metrics(
            y_test_true, predictions_proba, predictions_binarised, labels_pretty, averaging_method="weighted"
        )
        # Both dict-like assignments are aligned on the metric names in the index.
        scores_per_class["Avg"] = pd.Series(scores_average)
        # The range spans the per-class scores and the average; including "Avg" gives
        # the "Total Accuracy" row (which has no per-class values) a defined range.
        range_columns = labels_pretty + ["Avg"]
        scores_per_class["Min"] = scores_per_class[range_columns].min(axis=1)
        scores_per_class["Max"] = scores_per_class[range_columns].max(axis=1)
        scores_per_class["Str"] = scores_per_class\
            .apply(lambda row: f"{row['Avg']:.2f} [{row['Min']:.2f}-{row['Max']:.2f}]", axis=1)
        scores_per_class["CI"] = pd.Series(
            {key: np.array2string(value, precision=2, separator=",") for key, value in scores_ci.items()}
        )

        # Add metadata back as columns
        scores_per_class = scores_per_class\
            .rename_axis('Metric')\
            .reset_index()
        scores_per_class\
            .insert(loc=0, column='Model', value=predictor_name)

        scores_full_list.append(scores_per_class)

    scores_full = pd.concat(scores_full_list)

    scores_full.to_csv(export_path / f"scores_full_{location}.csv", index=False, float_format='%.2f')

Model: Regex, Location: Oxford, Category: Not Informative
[[1947   19]
 [  34    0]]
Model: Regex, Location: Banbury, Category: Not Informative
[[1956    9]
 [  35    0]]


Table for the supplement (with weighted, micro and macro averages) [very ugly]

In [7]:
# Output column -> sklearn averaging method, in the order the columns are exported.
averaging_columns = {
    "Avg Weighted": "weighted",
    "Avg Micro": "micro",
    "Avg Macro": "macro",
}

# --- Repeat for each location
for location, test_set_df in test_set_raw.items():
    y_test_true = test_set_df[labels_pretty]

    # --- Repeat for each model
    scores_full_list = []
    # NOTE: the loop variable is deliberately not called `model_name`, so the
    # run-level `model_name` from the configuration cell is left untouched.
    for predictor_name, metadata in predictions_dict.items():
        # --- Build filepaths and parse CSVs
        file_folder = metadata.get("file_location", predictor_name)
        file_descriptor = metadata.get("file_descriptor", predictor_name)
        file_proba_tag = "_proba" if metadata["has_proba"] else ""

        predictions_bin_path = model_path / file_folder / f"predictions_{file_descriptor}_{location}.csv"
        predictions_prob_path = model_path / file_folder / f"predictions{file_proba_tag}_{file_descriptor}_{location}.csv"

        # Missing predictions are counted as "label not predicted" (0).
        predictions_binarised = pd.read_csv(predictions_bin_path)\
            .rename(columns=labels2labels_pretty)[labels_pretty]\
            .fillna(0)
        if predictions_prob_path == predictions_bin_path:
            # No separate probability file: the hard predictions double as scores.
            predictions_proba = predictions_binarised
        else:
            predictions_proba = pd.read_csv(predictions_prob_path)\
                .rename(columns=labels2labels_pretty)[labels_pretty]\
                .fillna(0)

        if predictor_name == "Regex":
            print(f"Model: {predictor_name}, Location: {location}, Category: Not Informative")
            print(confusion_matrix(y_true=y_test_true["Not Informative"], y_pred=predictions_binarised["Not Informative"]))

        # Calculate the score once per averaging method. The per-class table is taken
        # from the first call; confidence intervals are not exported here, so a
        # single bootstrap iteration keeps the calls cheap.
        scores_per_class = None
        for column_name, averaging_method in averaging_columns.items():
            per_class_table, scores_average, _, _ = calculate_metrics(
                y_test_true, predictions_proba, predictions_binarised, labels_pretty,
                averaging_method=averaging_method, n_iter=1,
            )
            if scores_per_class is None:
                scores_per_class = per_class_table
            # Aligned on the metric names in the index.
            scores_per_class[column_name] = pd.Series(scores_average)

        # Add metadata back as columns
        scores_per_class = scores_per_class\
            .rename_axis('Metric')\
            .reset_index()
        scores_per_class\
            .insert(loc=0, column='Model', value=predictor_name)

        scores_full_list.append(scores_per_class)

    scores_full = pd.concat(scores_full_list)

    scores_full.to_csv(export_path / f"scores_appendix_{location}.csv", index=False, float_format='%.2f')

Model: Regex, Location: Oxford, Category: Not Informative
[[1947   19]
 [  34    0]]
Model: Regex, Location: Banbury, Category: Not Informative
[[1956    9]
 [  35    0]]
