# Antibiotic Indication Processig - Base BERT

Pre-requisites:

1. Set Runtime --> Change Runtime type --> GPUs (or TPUs)
2. Copy the data file (indications, labeled) into `/content/`
3. Set the model type

Install and load libraries

In [1]:
# Install packages
#!pip install -q transformers datasets

# Load libraries
import sys
import glob
import os

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch

from pathlib import Path
from datasets.dataset_dict import DatasetDict
from datasets import Dataset

from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
#os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

if torch.cuda.is_available():  
  cuda_device = "cuda:0" 
  !nvidia-smi
else:  
  cuda_device = "cpu"

print("Using Cuda Device: ", torch.cuda.get_device_name(cuda_device))

Sat Apr  1 17:27:56 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 530.41.03              Driver Version: 530.41.03    CUDA Version: 12.1     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                  Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf            Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro M2000                    Off| 00000000:65:00.0  On |                  N/A |
| 58%   49C    P0               29W /  75W|   1565MiB /  4096MiB |     29%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
|   1  Tesla P100-PCIE-16GB            Off| 00000000:B3:00.0 Off |  

# Specify Parameters

In [4]:
# Model parameters
model_selection = "GPT2"

model_dict = {
    "GPT2": "gpt2",
    "Bloom": "bigscience/bloom",
    "Alpaca Native": "chavinlo/alpaca-native",
    "Base BERT": "bert-base-uncased",
    "Bio_ClinicalBERT": "emilyalsentzer/Bio_ClinicalBERT"
}

model_name_display = model_selection
model_hf_id = model_dict[model_selection]  # Huggingface name/identifier

print("Current Working Directory:" , os.getcwd())
base_data_path = Path("../../00_Data")


assert base_data_path.is_dir(),\
  f"{base_data_path} either doesn't exist or is not a directory."

Current Working Directory: /home/kevin/DPhil/Projects/EHR-Indication-Processing/02_Models/Alpaca


# Import and clean data

In [5]:
# import data --> upload into "Files" on the left-hand panel
df = pd.read_csv(base_data_path/'sources_post_consensus_v20221102.csv')

# convert IndicationRaw into str
df['IndicationRaw'] = df['IndicationRaw'].astype(str)

# drop 'n' column
df = df.drop(columns = ['n'])

# drop the following columns (from multi-label labels)
labels_to_drop = ['prophylaxis', 'procedural', 'immunosuppression', 'viral']
df = df.drop(columns = labels_to_drop)

# fill NaN with 0
df = df.fillna(0)

# move ID to column
df['ID'] = df.index

# check data types
df.dtypes

# train/test split
train, test = train_test_split(df, test_size=0.2, random_state=25)

print(f"No. of training examples: {train.shape[0]}")
print(f"No. of testing examples: {test.shape[0]}")

# turn data into dictionary
train_set = Dataset.from_dict(train)
test_set = Dataset.from_dict(test)
data = datasets.DatasetDict({"train":train_set,"test":test_set})

# example datapoint - IndicationRaw and labels
data['train'][0]

No. of training examples: 3199
No. of testing examples: 800


{'IndicationRaw': 'open skull fracture',
 'urinary': 0.0,
 'respiratory': 0.0,
 'abdominal': 0.0,
 'neurological': 0.0,
 'skin_soft_tissue': 0.0,
 'ent': 0.0,
 'orthopaedic': 1.0,
 'other': 0.0,
 'no_specific_source': 0.0,
 'uncertainty': 0.0,
 'ID': 2546}

# Define labels and mappers

In [6]:
# labels
labels = [label for label in data['train'].features.keys() if label not in ['ID', 'IndicationRaw']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['urinary',
 'respiratory',
 'abdominal',
 'neurological',
 'skin_soft_tissue',
 'ent',
 'orthopaedic',
 'other',
 'no_specific_source',
 'uncertainty']

# Preprocess data
Using AutoTokenizer.

NB: labels needs to be *floats*, not integers for PyTorch to work.

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_hf_id)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

def preprocess_data(examples):
  # take a batch of texts
  text = examples["IndicationRaw"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [8]:
encoded_dataset = data.map(preprocess_data, batched=True, remove_columns=data['train'].column_names)

Map:   0%|          | 0/3199 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

In [9]:
example = encoded_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [10]:
tokenizer.decode(example['input_ids'])

'open skull fracture[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

In [11]:
example['labels']

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]

In [12]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label == 1.0]

['orthopaedic']

In [13]:
# format dataset for PyTorch compatibility 
encoded_dataset.set_format("torch")

# Define model
In this baseline case, using weights from bert-base-uncased & random initialised classification head (linear layer) on top. 

In [14]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(model_hf_id, 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Train model
Hugging face trainer API 

In [15]:
batch_size = 1
metric_name = "f1"

In [16]:
# hyperparams
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [17]:
# performance metrics functions
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [18]:
# training
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [19]:
trainer.train()



  0%|          | 0/15995 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [157,0,0], thread: [64,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [157,0,0], thread: [65,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [157,0,0], thread: [66,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [157,0,0], thread: [67,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [157,0,0], thread: [68,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/s

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

In [None]:
trainer.evaluate()

# Evaluate the model

Fetch predictions on the test set

In [None]:
predictions_raw = trainer.predict(encoded_dataset["test"]).predictions

In [None]:
pred_threshold = 0.5
predictions_binarised = (predictions_raw > pred_threshold) * 1

## Confusion matrix

In [None]:
# Import plotting libraries 
# ToDo: Follow PEP8 and put at the top of the file
import seaborn as sns

from itertools import zip_longest
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix

# Plot embedding
%matplotlib inline

In [None]:
labels_pretty = [' '.join(x.capitalize() for x in single_label.split('_')) \
                 for single_label in labels]
                 
y_test_true = encoded_dataset["test"]["labels"]

### Multiple one-vs-all confusion matrices

In [None]:
cm_multi = multilabel_confusion_matrix(y_test_true, predictions_binarised)

In [None]:
fig, axes = plt.subplots(4, 3, figsize=(15, 15))
    
for ax, cm_matrix, label in zip_longest(axes.flatten(), cm_multi, labels_pretty):
  # Remove empty axes
  if not label:
    plt.delaxes(ax)
    continue

  # Gather the data
  labels_cm = [label, "No " + label]
  cm_matrix = pd.DataFrame(cm_matrix, index=labels_cm, columns=labels_cm)

  # Create the plot
  sns.heatmap(cm_matrix, fmt=".3g", square=True, annot=True, ax=ax, cbar=False)
  ax.set(title=f"{label}")
  ax.set_yticklabels(ax.get_yticklabels(), rotation = 0)
  ax.set_xticklabels(ax.get_xticklabels(), rotation = 45)

plt.subplots_adjust(hspace=1)
fig.suptitle(f'Confusion Matrix one-vs-all - {model_name_display}')
fig.supxlabel("Predicted Labels")
fig.supylabel("True Labels")

fig.show()

## Export the prediction data
For further analysis outside this notebook

In [None]:
# Specify the path
export_path = base_data_path/"export"
export_path.mkdir(exist_ok=True)

# Convert the data to DFs with headers
predictions_binarised_df = pd.DataFrame(predictions_binarised, columns=labels)
true_binarised_df = test.drop(columns=["ID"])

predictions_binarised_df.to_csv(export_path/f"{model_selection}_predictions.csv", index=False)
true_binarised_df.to_csv(export_path/f"{model_selection}_true_labels.csv", index=False)

## Failure Case Analysis

In [None]:
bert_true = true_binarised_df.set_index('IndicationRaw')
bert_predictions = predictions_binarised_df.set_index(bert_true.index)

# Convert to booleans
bert_true = bert_true.astype(bool)
bert_predictions = bert_predictions.astype(bool)

# Requires pandas >=1.5
bert_compared = bert_true.compare(bert_predictions, 
                                  result_names=("True", "Predicted"), 
                                  keep_equal=True).dropna(how='all')

In [None]:
def highlight_differences(s):
    row_style = []
    highlight_format_true = "color:green; font-weight:bold"
    highlight_format_false = "color:red; font-weight:bold"

    # Iterate over the first index (true labels) and set colour if not identical
    for level_value in s.index.get_level_values(0).unique():
        level_slice = s.loc[level_value, :]
        if level_slice["True"] == level_slice["Predicted"]:
            row_style += [None, None]
        else:
            row_style += [highlight_format_true, highlight_format_false]
    
    return row_style

bert_compared_highlighted = bert_compared.style.apply(highlight_differences, axis=1)