# Antibiotic Indication Processing - Baseline Model

Pre-requisites:

1. Set Runtime --> Change Runtime type --> GPUs (or TPUs)
2. Copy the data file (indications, labeled) into `/content/`
3. Set the model type

Install and load libraries

In [1]:
# --- Check for Google Colab
# IN_COLAB is True when the `google.colab` module is already loaded in this
# interpreter; the later cells use it to decide whether to mount Google Drive.
import sys
IN_COLAB = 'google.colab' in sys.modules

# --- Install packages
# Placeholder: nothing extra is installed on Colab at the moment.
if IN_COLAB:
  pass

# --- Load libraries
# Standard libraries
import os

# DS libs
import numpy as np
import pandas as pd
import xgboost as xgb

from pathlib import Path
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, classification_report
from sklearn.model_selection import train_test_split

# Show the working directory: the data paths configured further down are
# relative, so they resolve against this location.
print(os.getcwd())

/home/kevin/DPhil/Projects/EHR-Indication-Processing/02_Models/01_Baseline/XGBoost


In [2]:
# Mounting Google Drive
# Only attempted on Colab; the import lives inside the guard so this cell is a
# no-op everywhere else.
if IN_COLAB:
  from google.colab import drive
  drive.mount('/content/drive')

# Specify Parameters

In [3]:
# --- Model parameters
# Used as the export sub-folder name and inside the prediction file names.
model_name = "XGBoost"

# --- Paths
# Base data path (relative to the notebook's working directory)
base_data_path = Path("../../../00_Data/")
# Dataset Path (training, testing, etc.)
dataset_path = base_data_path / "publication_ready"
# Export Path (model checkpoints, predictions, etc.)
export_path = base_data_path / "model_output" / model_name

# Fail early, with a readable message, if the input folders are missing.
assert base_data_path.is_dir(),\
  f"{base_data_path} either doesn't exist or is not a directory."
assert dataset_path.is_dir(),\
  f"{dataset_path} either doesn't exist or is not a directory."

# parents=True also creates the intermediate "model_output" folder; without it
# mkdir raises FileNotFoundError on a fresh checkout where that folder is absent.
export_path.mkdir(parents=True, exist_ok=True)

# --- Misc settings
# Random seed shared by the train/eval split and the model.
seed = 42

# Import and clean data

In [4]:
# --- Import data
# On Colab: upload the files via "Files" on the left-hand panel first.
def _read_indications(file_name, drop_columns=()):
    """Read one labelled indication CSV from `dataset_path`.

    The "Indication" column is forced to `str`, and only the literal string
    "NA" is parsed as missing: pandas' default NA markers are switched off via
    `keep_default_na=False`.

    Args:
        file_name: Name of the CSV file inside `dataset_path`.
        drop_columns: Column names to remove after loading (none by default).

    Returns:
        The loaded DataFrame without `drop_columns`.
    """
    frame = pd.read_csv(
        dataset_path / file_name,
        dtype={"Indication": str},
        keep_default_na=False,
        na_values=["NA"],
    )
    return frame.drop(columns=list(drop_columns))


train_eval_df = _read_indications('training_oxford_2023-08-23.csv')

# The test files carry a PrescriptionID column, which is dropped on load.
test_oxford_df = _read_indications(
    'testing_oxford_2023-08-23.csv', drop_columns=["PrescriptionID"]
)
test_banbury_df = _read_indications(
    'testing_banbury_2023-08-23.csv', drop_columns=["PrescriptionID"]
)

test_set_raw = {
    "Oxford": test_oxford_df,
    "Banbury": test_banbury_df,
}

# Hold out 20% of the training file as the evaluation split (seeded so the
# split is reproducible across runs).
train_df, eval_df = train_test_split(train_eval_df, test_size=0.2, random_state=seed)

print(f"No. of training examples: {train_df.shape[0]}")
print(f"No. of evaluation examples: {eval_df.shape[0]}")
print(f"No. of internal testing examples: {test_oxford_df.shape[0]}")
print(f"No. of external testing examples: {test_banbury_df.shape[0]}")

No. of training examples: 3200
No. of evaluation examples: 800
No. of internal testing examples: 2000
No. of external testing examples: 2000


# Define labels and mappers

In [5]:
# Label columns: every column of the training frame except the free-text input
labels = [column for column in train_df.columns if column != 'Indication']

# Human-readable variants, e.g. "skin_soft_tissue" -> "Skin Soft Tissue"
labels_pretty = [
    " ".join(map(str.capitalize, label.split("_")))
    for label in labels
]

labels

['urinary',
 'respiratory',
 'abdominal',
 'neurological',
 'skin_soft_tissue',
 'ent',
 'orthopaedic',
 'other_specific',
 'no_specific_source',
 'prophylaxis',
 'uncertainty',
 'not_informative']

# Preprocess data
Tokenise the indication text into character 4-grams and count their occurrences.

In [6]:
# Character 4-gram counts; the vocabulary is learnt from the training split only
cv = CountVectorizer(analyzer="char", ngram_range=(4, 4))

# Features: n-gram count matrices built from the free-text "Indication" column
X_train = cv.fit_transform(train_df["Indication"])
X_eval = cv.transform(eval_df["Indication"])

# Targets: all remaining (label) columns
y_train = train_df.drop(columns="Indication")
y_eval = eval_df.drop(columns="Indication")

# Same treatment for every test set, stored as name -> (features, targets)
test_set_split = {}

for name, test_set in test_set_raw.items():
    features = cv.transform(test_set["Indication"])
    targets = test_set.drop(columns="Indication")
    test_set_split[name] = (features, targets)

# Define & train model
Fit an XGBoost classifier on the multi-label targets (one binary output per class).

In [7]:
# Fit one XGBClassifier on the multi-column label frame.
# `random_state` is the documented scikit-learn-style seed argument of the
# XGBoost wrapper; `seed` is not part of the documented constructor signature.
xgb_clf = xgb.XGBClassifier(random_state=seed)\
    .fit(X_train, y_train)

## Evaluate the model on test sets

In [8]:
# Metrics function
def calculate_metrics(y_true, predictions_probs,
                      predictions_binarised,
                      labels,
                      result_precision=2,
                      averaging_method = "weighted",
    ):
    """Compute per-class and averaged F1, ROC AUC and PR AUC scores.

    Args:
        y_true: Ground-truth labels, passed unchanged to the scikit-learn
            metric functions (presumably one indicator column per class).
        predictions_probs: Predicted scores/probabilities, used for ROC AUC
            and PR AUC (average precision).
        predictions_binarised: Hard 0/1 predictions, used for the F1 score.
        labels: Class names; used as the column names of the per-class table,
            so their order has to match the column order of the predictions.
        result_precision: Number of decimal places used in the summary string.
        averaging_method: `average` argument forwarded to the scikit-learn
            metric functions for the aggregated scores.

    Returns:
        A tuple `(scores_per_class, scores_average, metrics_string)`:
        - scores_per_class: DataFrame with one row per metric and one column
          per label.
        - scores_average: dict mapping metric name to its averaged value.
        - metrics_string: one line per metric, formatted as
          "<metric>: <average> (<lowest class score>-<highest class score>)".
    """
    # Calculate per class metrics (average=None yields one value per class)
    scores_per_class = {
        "F1-Score": f1_score(y_true=y_true, y_pred=predictions_binarised, average=None),
        "ROC AUC": roc_auc_score(y_true=y_true, y_score=predictions_probs, average=None),
        "PR AUC": average_precision_score(y_true=y_true, y_score=predictions_probs, average=None),
    }
    scores_per_class = pd.DataFrame.from_dict(scores_per_class, orient='index', columns=labels)

    # Calculate average metrics
    scores_average = {
        "F1-Score": f1_score(y_true=y_true, y_pred=predictions_binarised, average=averaging_method),
        "ROC AUC": roc_auc_score(y_true=y_true, y_score=predictions_probs, average=averaging_method),
        "PR AUC": average_precision_score(y_true=y_true, y_score=predictions_probs, average=averaging_method),
    }

    # Format into printable string.
    # A format spec is used instead of `.round()`: it works for NumPy scalars
    # and plain Python floats alike (the latter have no `.round()` method), and
    # it always prints `result_precision` decimals (e.g. "0.90", not "0.9").
    metrics_lines = []
    for score_name, avg_score in scores_average.items():
        class_scores = scores_per_class.loc[score_name]
        min_score = class_scores.min()
        max_score = class_scores.max()
        metrics_lines.append(
            f"{score_name}: {avg_score:.{result_precision}f} "
            f"({min_score:.{result_precision}f}-{max_score:.{result_precision}f})\n"
        )
    metrics_string = "".join(metrics_lines)

    return scores_per_class, scores_average, metrics_string

Internal & External test set
- Run inference on the internal and external test sets
- Calculate per class and average metrics
- Print one line per metric in the format `<average> (<lowest-scoring class>-<highest-scoring class>)`

In [9]:
# The display precision does not depend on the test set, so set it once
# instead of on every loop iteration.
pd.set_option('display.precision', 2)

for test_location, (X_test, y_test) in test_set_split.items():
    print("Test set:", test_location)

    # Run inference on test set: hard predictions feed the F1 score,
    # probabilities feed ROC AUC / PR AUC
    predictions_binarised = xgb_clf.predict(X_test)
    predictions_probs = xgb_clf.predict_proba(X_test)

    # Calculate metrics
    scores_per_class, scores_average, metrics_string = \
        calculate_metrics(y_test, predictions_probs, predictions_binarised, labels, averaging_method="weighted")

    # Print metrics
    print(scores_per_class)
    print(metrics_string)

    # Save predictions for further analysis (one file pair per test set)
    predictions_binarised_df = pd.DataFrame(predictions_binarised, columns=labels)
    predictions_binarised_df.to_csv(export_path/f"predictions_{model_name}_{test_location}.csv", index=False)

    predictions_probs_df = pd.DataFrame(predictions_probs, columns=labels)
    predictions_probs_df.to_csv(export_path/f"predictions_proba_{model_name}_{test_location}.csv", index=False)


Test set: Oxford
          urinary  respiratory  abdominal  neurological  skin_soft_tissue  \
F1-Score     0.64         0.81       0.90          0.76              0.83   
ROC AUC      0.90         0.96       0.97          0.92              0.98   
PR AUC       0.62         0.87       0.91          0.68              0.89   

           ent  orthopaedic  other_specific  no_specific_source  prophylaxis  \
F1-Score  0.83         0.70            0.71                0.91         0.96   
ROC AUC   0.96         0.91            0.92                0.97         0.99   
PR AUC    0.82         0.65            0.72                0.97         0.99   

          uncertainty  not_informative  
F1-Score         0.66             0.90  
ROC AUC          0.87             0.94  
PR AUC           0.69             0.83  
F1-Score: 0.86 (0.64-0.96)
ROC AUC: 0.96 (0.87-0.99)
PR AUC: 0.9 (0.62-0.99)

Test set: Banbury
          urinary  respiratory  abdominal  neurological  skin_soft_tissue  \
F1-Score     0.6