# Evaluate the model performance
Create metrics for the multi-label output

## Setup
Import libraries & set parameters, helper functions, etc...

In [1]:
import polars as pl
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

# Sklearn imports for model evaluation
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, balanced_accuracy_score

In [2]:
# Maps the raw snake_case label column names found in the CSV files to the
# human-readable display names used for tables and plots.
column_mapper = dict(
    urinary="Urinary",
    respiratory="Respiratory",
    abdominal="Abdominal",
    neurological="Neurological",
    skin_soft_tissue="Skin & Soft Tissue",
    ent="ENT",
    orthopaedic="Orthopaedic",
    other_specific="Other Specific",
    no_specific_source="No Specific Source",
    prophylaxis="Prophylaxis",
    uncertainty="Uncertainty",
    not_informative="Not Informative",
)

In [17]:
# Display names of all label categories, sorted alphabetically. This list is
# used to select and order the label columns, so its order defines the order
# of any per-class result computed from those columns.
categories = sorted([
    "Urinary",
    "Respiratory",
    "Abdominal",
    "Neurological",
    "Skin & Soft Tissue",
    "ENT",
    "Orthopaedic",
    "Other Specific",
    "No Specific Source",
    "Prophylaxis",
    "Uncertainty",
    "Not Informative",
])

# The same categories (same order) without the "Uncertainty" label.
categories_no_uncertainty = [name for name in categories if name != "Uncertainty"]

# Backward-compatible alias: the original name was misspelled ("categoires").
# Kept so that cells still using the old spelling continue to work.
categoires_no_uncertainty = categories_no_uncertainty

In [4]:
# Test-set locations
locations = [
    "oxford",
    "banbury",
]

# Model names mapped to whether the model is flagged as having probability
# outputs (True) or not (False). Only the two BERT variants are flagged True.
model_has_proba = {
    name: name in ("Base_Bert", "Bio_ClinicalBERT")
    for name in ("Regex", "Base_Bert", "Bio_ClinicalBERT", "GPT4", "GPT3.5")
}

In [5]:
# Root data folder (relative to the notebook's working directory) and the
# folder the performance plots are written to.
data_dir = Path("../00_Data")
plot_dir = data_dir.joinpath("export/performance/plots")

# Create the export folder, including missing parents; a no-op if it exists.
plot_dir.mkdir(parents=True, exist_ok=True)

print("Exporting to:", plot_dir)

Exporting to: ../00_Data/export/performance/plots


## Import Data

In [18]:
def import_data(data_dir, model_name, test_location, has_proba):
    """Load one model's predictions together with the matching true labels.

    Reads ``<data_dir>/export/<model_name>/<model_name>_<test_location>_predictions.csv``
    and ``<data_dir>/publication_ready/testing_<test_location>_2023-08-23.csv``,
    and renames the label columns of both via the module-level ``column_mapper``.

    Args:
        data_dir: Root data directory (must support the ``/`` path operator).
        model_name: Model identifier; used as export sub-folder and file prefix.
        test_location: Test-set location used in both file names.
        has_proba: Currently unused (see TODO below).

    Returns:
        A 4-tuple ``(binary predictions, probability predictions, true labels,
        full true frame)``. The first three are restricted to the module-level
        ``categories`` columns, in that order. The full true frame keeps every
        column of the test file except ``PrescriptionID``.

    TODO: Implement importing has_proba. Until then the "probability" frame is
    a copy of the same predictions file as the binary frame.
    """
    predictions_path = data_dir / 'export' / model_name / f'{model_name}_{test_location}_predictions.csv'
    truth_path = data_dir / "publication_ready" / f"testing_{test_location}_2023-08-23.csv"

    # Predictions: both returned frames currently originate from the same file.
    predictions = pl.read_csv(predictions_path).rename(column_mapper)
    pred_binary = predictions.clone()
    pred_proba = predictions.clone()

    # Ground truth: drop the identifier column, then rename the label columns.
    truth_full = (
        pl.read_csv(truth_path)
        .drop("PrescriptionID")
        .rename(column_mapper)
    )

    # Select the label columns in the shared `categories` order so that all
    # three label frames line up column-for-column.
    return (
        pred_binary[categories],
        pred_proba[categories],
        truth_full[categories],
        truth_full,
    )

# Choose which model / test location to evaluate, then load its data
model_name = "Bio_ClinicalBERT"
location = "banbury"

df_pred_binary, df_pred_proba, df_true, df_true_full = import_data(
    data_dir=data_dir,
    model_name=model_name,
    test_location=location,
    has_proba=model_has_proba[model_name],
)

## Calculate Metrics

Calculate per-class metrics (F1 Score and ROC AUC)

In [55]:
# Per-class scores: one row per (model, metric) and one value per category.
# `average=None` makes sklearn return one score per label column, in the
# column order of the inputs.
scores_per_class_list = [
    [model_name, "F1-Score"]
    + f1_score(y_true=df_true, y_pred=df_pred_binary, average=None).tolist(),
    [model_name, "ROC AUC"]
    + roc_auc_score(y_true=df_true, y_score=df_pred_proba, average=None).tolist(),
]

# Convert to dataframe. Each inner list is a ROW, so the orientation is given
# explicitly: without `orient`, polars has to guess it from the data shape,
# which emits a warning and picks column orientation if the number of rows
# ever equals the number of schema columns.
scores_per_class_df = pl.DataFrame(
    scores_per_class_list,
    schema=["Model Name", "Metric"] + categories,
    orient="row",
)

# Calculate mean, min & max per row (model and metric)
(scores_per_class_df
    .with_columns(
        pl.mean_horizontal(categories).alias("Average"),
        pl.min_horizontal(categories).alias("Min"),
        pl.max_horizontal(categories).alias("Max"),
    )
)

Model Name,Metric,Abdominal,ENT,Neurological,No Specific Source,Not Informative,Orthopaedic,Other Specific,Prophylaxis,Respiratory,Skin & Soft Tissue,Uncertainty,Urinary,Mean,Min,Max
str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Bio_ClinicalBE…","""F1-Score""",0.951965,0.962963,0.972973,0.988506,0.83871,0.883721,0.862745,0.984701,0.991453,0.958678,0.988506,0.981758,0.947223,0.83871,0.991453
"""Bio_ClinicalBE…","""ROC AUC""",0.961333,0.964286,0.999748,0.99245,0.871174,0.895833,0.938987,0.991208,0.992145,0.969168,0.996634,0.984656,0.963135,0.871174,0.999748


Calculate overall scores (macro-averaged F1 Score and ROC AUC, plus accuracy)

In [20]:
# Averaging strategy handed to sklearn for the multi-label F1 and ROC AUC.
averaging_method = "macro"

# Overall scores across all categories.
# NOTE(review): for multi-label indicator input, sklearn's accuracy_score is
# the subset (exact-match) accuracy, i.e. a sample counts as correct only if
# every label matches - confirm this is the intended "Accuracy".
scores_average = {
    "F1-Score": f1_score(y_true=df_true, y_pred=df_pred_binary, average=averaging_method),
    "ROC AUC": roc_auc_score(y_true=df_true, y_score=df_pred_proba, average=averaging_method),
    "Accuracy": accuracy_score(y_true=df_true, y_pred=df_pred_binary),
}

scores_average

{'F1-Score': 0.9472231441366833,
 'ROC AUC': 0.9631351966208479,
 'Accuracy': 0.9645}