# Benchmark: Training

How long it takes to train the models

## Setup

- Import libraries
- Import datasets
- Run some basic preprocessing

In [1]:
# Import libraries
import os
import pickle
import pandas as pd
import time

from functools import wraps
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score, classification_report

print(os.getcwd())

/home/kevinyuan/EHR-Indication-Processing/02_Models/04_Benchmarks


Set parameters & paths

In [2]:
# --- Paths
# Base data path
base_data_path = Path("../../00_Data/")
# Data Path (training, testing, etc.)
data_path =  base_data_path / "publication_ready"


assert base_data_path.is_dir(),\
  f"{base_data_path} either doesn't exist or is not a directory."

# --- Bechmark parameters
# How often to repeat the training and testing
n_repeat = 10
#  Testset size
n_test_sizes = [10, 100, 1000, 10000]

Import the data and preprocess

In [3]:
# Import data --> upload into "Files" on the left-hand panel
train_eval_df = pd.read_csv(
    data_path / 'training_oxford_2023-08-23.csv',
    dtype={"Indication": str},
    keep_default_na=False,
    na_values=["NA"],
)

test_oxford_df = pd.read_csv(
    data_path / 'testing_oxford_2023-08-23.csv',
    dtype={"Indication": str},
    keep_default_na=False,
    na_values=["NA"],
)

test_banbury_df = pd.read_csv(
    data_path / 'testing_banbury_2023-08-23.csv',
    dtype={"Indication": str},
    keep_default_na=False,
    na_values=["NA"],
)

# --- Split into train and eval
train_df, eval_df = train_test_split(
    train_eval_df, 
    test_size=0.15,
    random_state=42,
    shuffle=True)

print("Data set size overview:")
print(f"- Training set: {train_df.shape[0]}")
print(f"- Evaluation set: {eval_df.shape[0]}")
print(f"- Testing Oxford set: {test_oxford_df.shape[0]}")
print(f"- Testing Banbury set: {test_banbury_df.shape[0]}")
print()

Data set size overview:
- Training set: 3400
- Evaluation set: 600
- Testing Oxford set: 2000
- Testing Banbury set: 2000



Convert labels to numbers and get prettier labels

In [4]:
# labels
labels = [label for label in train_df.columns if label not in ["Indication"]]
labels_pretty = [" ".join(word.capitalize() for word in label.split("_")) for label in labels]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels2labels_pretty = {old:pretty for old, pretty in zip(labels, labels_pretty)}

labels_pretty

['Urinary',
 'Respiratory',
 'Abdominal',
 'Neurological',
 'Skin Soft Tissue',
 'Ent',
 'Orthopaedic',
 'Other Specific',
 'No Specific Source',
 'Prophylaxis',
 'Uncertainty',
 'Not Informative']

### Test Data generation

In [5]:
inference_data = pd.concat([test_oxford_df.Indication, test_banbury_df.Indication]).repeat(5)

inference_data.shape

(20000,)

### Bechmark functions

In [6]:
def timeit(func):
    @wraps(func)
    def timeit_wrapper(*args, **kwargs):
        duration_list = []
        for i in range(0, n_repeat):
            print("Iteration:", i+1)

            # Start timer & process
            start_time = time.process_time()
            result = func(*args, **kwargs)
            end_time = time.process_time()
            process_duration = end_time - start_time
            
            # Save to list and report
            print(f'Took {process_duration:.2f} seconds')
            duration_list.append(process_duration)

        # Calculate the average sample time
        duration_mean = pd.Series(duration_list).mean()
        print(f"The average execution time for 10k samples took {(duration_mean/len(inference_data)*10000):.2f}s")

        return result, duration_mean
    return timeit_wrapper

## Regex

In [7]:
import math
import regex

from pprint import pprint

### Build Regex Rules

In [8]:
debug = True

model_path = base_data_path / "Regex"

rule_sheet_annotated_path = model_path / "regex_rules_sheet_annotated.xlsx"
regex_rule_sheets = pd.read_excel(rule_sheet_annotated_path, sheet_name=None, index_col=None)

categories = ['urinary', 'respiratory', 'abdominal',
       'neurological', 'skin_soft_tissue', 'ent', 'orthopaedic',
       'other_specific', 'no_specific_source', 'prophylaxis']

if debug:
    display(regex_rule_sheets.keys())

dict_keys(['urinary', 'respiratory', 'abdominal', 'neurological', 'skin_soft_tissue', 'ent', 'orthopaedic', 'other_specific', 'no_specific_source', 'prophylaxis'])

Set some baseline parameters

In [9]:
# Global parameters
default_error = 0.1 # percent error allowed (0.1 = 10%)
default_error_max = 2 # max number of errors allowed
default_l_boundary = r"\b"
default_r_boundary = r""

Convert the rules to regex strings and the compile into patterns (allows for faster matching).

Create a dictionary with patterns for each category

In [10]:
regex_pattern_dict = dict()

for individual_category in regex_rule_sheets.keys():
    # Extract an individual sheet
    rule_sheet_individual = regex_rule_sheets[individual_category]

    regex_pattern_list = []
    for _, row in rule_sheet_individual.iterrows():
        # Skip the row if it is marked to be excluded
        if row['Exclude'] == 1:
            continue
        
        # Populate with individual regex patterns
        str_indication = row['Indication']
        num_error = (default_error * len(str_indication)) if pd.isna(row['Error']) else row['Error']
        num_error = min(math.ceil(num_error), default_error_max)

        pat_l_boundary = default_l_boundary if pd.isna(row['L_Boundary']) else r'\b'
        pat_r_boundary = default_r_boundary if pd.isna(row['R_Boundary']) else r'\b'

        regex_pattern_list += [fr'(?:{pat_l_boundary}{str_indication}{pat_r_boundary}){{e<={num_error}}}']

    regex_pattern_dict[individual_category] = regex.compile("|".join(regex_pattern_list))

if debug:
    pprint(regex_pattern_dict)

{'abdominal': regex.Regex('(?:\\bcholecystitis){e<=2}|(?:\\bdiverticulitis){e<=2}|(?:\\bappendicitis){e<=2}|(?:\\bpid\\b){e<=0}|(?:\\bbiliary sepsis){e<=2}|(?:\\bcholangitis){e<=2}|(?:\\bh pylori eradication){e<=1}|(?:\\babdo sepsis){e<=2}|(?:\\bperianal abscess){e<=2}|(?:\\babdominal sepsis){e<=2}|(?:\\bintra-abdominal sepsis){e<=2}|(?:\\bcolitis){e<=1}|(?:\\bsplenectomy){e<=2}|(?:\\bc diff){e<=1}|(?:\\bsuspected c. diff){e<=2}|(?:\\bintra-abdominal infection){e<=2}|(?:\\bliver abscess){e<=2}|(?:\\bpd peritonitis){e<=2}|(?:\\bintraabdominal sepsis){e<=2}|(?:\\bh pylori){e<=0}|(?:\\bsbp\\b){e<=0}|(?:\\bintrabdominal sepsis){e<=2}|(?:\\bpelvic collection){e<=2}|(?:\\babdominal collection){e<=2}|(?:\\bintra-abdo sepsis){e<=2}|(?:\\babdominal infection){e<=2}|(?:\\bc.diff){e<=1}|(?:\\bintra abdo sepsis){e<=2}', flags=regex.V0),
 'ent': regex.Regex('(?:\\btonsillitis){e<=2}|(?:\\bquinsy){e<=1}|(?:\\btonsilitis){e<=1}|(?:\\bsinusitis){e<=1}|(?:\\bsupraglottitis){e<=2}|(?:\\bpinna cellulitis

Additional rules

In [11]:
uncertainty_pattern = regex.compile(r"\?|/|suspected|possible|probable|likely")
uncertainty_pattern

regex.Regex('\\?|/|suspected|possible|probable|likely', flags=regex.V0)

### Inferrence

Prepare the data and run first uncertainty rule

In [12]:
# Helper function to extract the matched pattern
def match_pattern(x, regex_pattern):
    if match_obj := regex_pattern.search(x):
        return match_obj.group()
    else:
        return None

In [13]:
categories_2 = ['urinary',
 'respiratory',
 'abdominal',
 'neurological',
 'skin_soft_tissue',
 'ent',
 'orthopaedic',
 'other_specific',
 'prophylaxis']

@timeit
def run_regex(prediction_df):
        # Run uncertainty pattern check
        prediction_df["uncertainty"] = prediction_df.Indication.apply(lambda x: match_pattern(x, uncertainty_pattern))

        # Split words and remove uncertainty markers
        prediction_df["Indication"] = (prediction_df["Indication"]
                .apply(lambda x: x.replace('?', '').strip()) # Remove ? from all cells
                .apply(lambda x: x.replace('/', ' ').strip()) # Split words by "/"
        )

        # Run the regex rules on the columns
        for single_category in categories:
                # Get the pattern for the current category
                regex_pattern = regex_pattern_dict[single_category]
                # Applyt the regex and save back
                prediction_df[single_category] = prediction_df.Indication.apply(lambda x: match_pattern(x, regex_pattern))

        # Reorder columns
        prediction_df = prediction_df[['Indication'] + categories + ['uncertainty']]

        # Apply last `uncertainty` and `not informative` rule
        # Change value of "uncertainty" column to "multiple entries" if there are multiple entries
        prediction_df.loc[prediction_df[categories_2].notna().sum(axis=1) > 1, "uncertainty"] = "multiple entries"


        # Not informative indicator
        prediction_df["not_informative"] = prediction_df[categories].apply(
                lambda x: True if sum(~x.isna()) == 0 else None,  # If no entries, then True
                axis=1
        )

        return prediction_df

In [14]:
run_regex(pd.DataFrame(inference_data))

Iteration: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.89 seconds
Iteration: 2


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.83 seconds
Iteration: 3


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.83 seconds
Iteration: 4


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.82 seconds
Iteration: 5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.82 seconds
Iteration: 6


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.68 seconds
Iteration: 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.82 seconds
Iteration: 8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.82 seconds
Iteration: 9


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


Took 12.82 seconds
Iteration: 10
Took 12.81 seconds
The average execution time for 10k samples took 6.41s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df["not_informative"] = prediction_df[categories].apply(


(     Indication urinary respiratory abdominal neurological skin_soft_tissue  \
 0      h.pylori    None        None      None         None             None   
 0      h.pylori    None        None      None         None             None   
 0      h.pylori    None        None      None         None             None   
 0      h.pylori    None        None      None         None             None   
 0      h.pylori    None        None      None         None             None   
 ...         ...     ...         ...       ...          ...              ...   
 1999        cap    None         cap      None         None             None   
 1999        cap    None         cap      None         None             None   
 1999        cap    None         cap      None         None             None   
 1999        cap    None         cap      None         None             None   
 1999        cap    None         cap      None         None             None   
 
        ent orthopaedic other_specific

## XGBoost

In [15]:
import xgboost as xgb
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_extraction.text import CountVectorizer

### Preprocess the data

In [16]:
# Define X and y train and test
X_train, y_train = train_df["Indication"], train_df.drop("Indication", axis=1)

# Tokenize the text using n-grams
cv = CountVectorizer(ngram_range=(1, 3))
X_train = cv.fit_transform(X_train)
X_test = cv.transform(test_oxford_df["Indication"])
X_inference_xgb = cv.transform(inference_data)

y_test = test_oxford_df[labels]

### Train the model

In [17]:
clf = MultiOutputClassifier(xgb.XGBClassifier())\
    .fit(X_train, y_train)

Quick check whether the model is predicting correctly

In [18]:
predictions = clf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.88      0.93       160
           1       1.00      0.86      0.92       374
           2       0.96      0.67      0.79       239
           3       1.00      0.30      0.47        23
           4       0.93      0.65      0.77       131
           5       0.86      0.12      0.21        51
           6       0.89      0.44      0.59        55
           7       0.79      0.43      0.56        88
           8       0.96      0.83      0.89       906
           9       0.99      0.90      0.94       704
          10       0.54      0.36      0.43       174
          11       1.00      0.82      0.90        34

   micro avg       0.95      0.77      0.85      2939
   macro avg       0.91      0.61      0.70      2939
weighted avg       0.94      0.77      0.84      2939
 samples avg       0.77      0.75      0.75      2939



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Inference

In [19]:
@timeit
def run_xgboost(data):
    clf.predict(data)


run_xgboost(X_inference_xgb)

Iteration: 1
Took 3.26 seconds
Iteration: 2
Took 2.85 seconds
Iteration: 3
Took 2.87 seconds
Iteration: 4
Took 4.90 seconds
Iteration: 5
Took 6.14 seconds
Iteration: 6
Took 2.74 seconds
Iteration: 7
Took 2.87 seconds
Iteration: 8
Took 2.58 seconds
Iteration: 9
Took 3.02 seconds
Iteration: 10
Took 2.16 seconds
The average execution time for 10k samples took 1.67s


(None, 3.3403470788000105)

## Bio_ClinicalBERT

In [20]:
import pandas as pd
import torch

from pathlib import Path
from tqdm.auto import tqdm

# Transformers (Huggingface) and PyTorch Imports
from datasets import Dataset
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification
from transformers.pipelines.pt_utils import KeyDataset

In [21]:
# --- Configuration ---
model_location_path = base_data_path / "model_output"  # Path to the model & data
saved_model_name = "Bio_ClinicalBERT_5615.pth"  # Saved model name
model_hf_id = "emilyalsentzer/Bio_ClinicalBERT"  # Currently required for the correct tokenizer
cuda_device = "cpu"  # Change to "0" for the first GPU, or "cpu" for CPU

batch_size = 8  # Batch size for inference, we used 8 for the training of this model, keep it at 8
pred_threshold = 0.5  # Threshold for binarising the predictions, we choose 0.5 for training, can be changed

In [22]:
# --- Preprocess Data ---
# Convert to lower case and make unique
inference_df = pd.DataFrame({"Input_String": inference_data})
# Create a Huggingface Dataset
inference_dataset = Dataset.from_pandas(inference_df)

# --- Loading Model & Inference ---
@timeit
def bert_inference(input_data):
    # Load the model and tokeniser
    model = AutoModelForSequenceClassification.from_pretrained(model_location_path / saved_model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_hf_id)
    # Set up the pipeline
    inference_pipeline = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k=None, device=cuda_device)

    # Classify the infections, runs in batches
    pred_output = []
    for out in tqdm(
            inference_pipeline(
                KeyDataset(input_data, "Input_String"),
                batch_size=batch_size)
    ):
        pred_output.append(out)

bert_inference(inference_dataset)

Iteration: 1


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3384.22 seconds
Iteration: 2


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3320.49 seconds
Iteration: 3


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3351.15 seconds
Iteration: 4


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3694.18 seconds
Iteration: 5


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3551.72 seconds
Iteration: 6


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3512.71 seconds
Iteration: 7


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3376.81 seconds
Iteration: 8


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3785.58 seconds
Iteration: 9


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3511.79 seconds
Iteration: 10


  0%|          | 0/2500 [00:00<?, ?it/s]

Took 3437.05 seconds
The average execution time for 10k samples took 1746.28s


(None, 3492.5691460724)