# **Medical NER using BERT**

In [None]:
# List the current working directory of the runtime.
!ls

sample_data


## **Data preparation**

In [None]:
# Dataset (MACCROBAT2018): https://figshare.com/articles/dataset/MACCROBAT2018/9764942
# Annotation format (brat standoff): https://brat.nlplab.org/standoff.html

In [None]:
# -p keeps the cell idempotent: no error when the folder already exists on re-run.
!mkdir -p MACCROBAT2018

In [None]:
# MACCROBAT2018.zip must be uploaded to the working directory before running this cell
# (download it from the figshare page linked above).
# -o overwrites without prompting so a re-run does not block on input; -q hides the file listing.
!unzip -o -q ./MACCROBAT2018.zip -d ./MACCROBAT2018

unzip:  cannot find or open ./MACCROBAT2018.zip, ./MACCROBAT2018.zip.zip or ./MACCROBAT2018.zip.ZIP.


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel; -q trims the log.
%pip install -q transformers evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [None]:
import os
from typing import List, Dict, Tuple

class Preprocessing_Maccrobat:
    """Turn a folder of brat-standoff documents into token / BIO-label sequences.

    For every ``<id>.txt`` file in ``dataset_folder`` the matching ``<id>.ann``
    file is read and its text-bound annotations (lines starting with ``T``) are
    kept.  ``process()`` then cuts each document into labelled and unlabelled
    character spans, tokenizes every span with ``tokenizer.tokenize`` and emits
    ``O`` / ``B-<label>`` / ``I-<label>`` tags aligned with the tokens.

    Attributes:
        file_ids: document ids (file names without extension), sorted.
        text_files / anno_files: file names derived from ``file_ids``.
        num_samples: number of documents.
        texts: raw text of every document.
        tags: per document, a list of dicts with keys ``text``, ``label``,
            ``start`` and ``end`` (offsets are kept as strings).
        tokenizer: object exposing ``tokenize(str) -> list``.
        tokens / labels: scratch buffers for the document currently being
            processed by ``process()``.
    """

    def __init__(self, dataset_folder, tokenizer):
        """Read all ``.txt`` / ``.ann`` pairs found in ``dataset_folder``.

        Raises:
            FileNotFoundError: if a ``.txt`` file has no matching ``.ann`` file.
        """
        # Sorted so that document order (and therefore label ids and any seeded
        # split done later) does not depend on the file system's listing order.
        # splitext keeps dots that are part of the stem ("a.b.txt" -> "a.b").
        self.file_ids = sorted(
            os.path.splitext(f)[0] for f in os.listdir(dataset_folder) if f.endswith('.txt')
        )

        self.text_files = [f + ".txt" for f in self.file_ids]
        self.anno_files = [f + ".ann" for f in self.file_ids]

        self.num_samples = len(self.file_ids)

        self.texts: List[str] = []
        for file_name in self.text_files:
            file_path = os.path.join(dataset_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                self.texts.append(f.read())

        self.tags: List[List[Dict[str, str]]] = []
        for file_name in self.anno_files:
            file_path = os.path.join(dataset_folder, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                self.tags.append(self._parse_annotations(f.read()))

        self.tokenizer = tokenizer

    @staticmethod
    def _parse_annotations(ann_text: str) -> List[Dict[str, str]]:
        """Extract the contiguous text-bound annotations from one ``.ann`` file."""
        parsed = []
        for line in ann_text.split("\n"):
            if not line.startswith("T"):
                continue
            fields = line.split("\t")
            try:
                label, start, end = fields[1].split(" ")[:3]
                # Validation only: discontinuous spans ("10 20;30 40") make the
                # second offset non-numeric and are deliberately skipped.
                int(start)
                int(end)
            except (IndexError, ValueError):
                continue
            parsed.append({
                "text": fields[-1],
                "label": label,
                "start": start,
                "end": end,
            })
        return parsed

    def process(self) -> Tuple[List[List[str]], List[List[str]]]:
        """Tokenize every document and build its BIO label sequence.

        Returns:
            ``(input_texts, input_labels)``: two parallel lists with one entry per
            document; each entry is the list of tokens / labels of that document.

        Note:
            Overlapping annotations are each emitted in full, so text shared by
            two annotations appears once per annotation in the output.
        """
        input_texts = []
        input_labels = []

        for idx in range(self.num_samples):
            full_text = self.texts[idx]
            # Order annotations by position (the merge below walks the text left
            # to right) and drop empty spans, which carry no text to label.
            tags = sorted(
                (t for t in self.tags[idx] if int(t["start"]) < int(t["end"])),
                key=lambda t: (int(t["start"]), int(t["end"])),
            )

            # brat end offsets are exclusive: a span covers [start, end).
            label_offset = [list(range(int(t["start"]), int(t["end"]))) for t in tags]
            labelled_positions = set()
            for offset in label_offset:
                labelled_positions.update(offset)

            zero_offset = [pos for pos in range(len(full_text)) if pos not in labelled_positions]
            zero_offset = Preprocessing_Maccrobat.find_continuous_ranges(zero_offset)  # 012 67

            self.tokens = []
            self.labels = []
            self._merge_offset(full_text, tags, zero_offset, label_offset)
            assert len(self.tokens) == len(self.labels), f"Length of tokens and labels are not equal"

            input_texts.append(self.tokens)
            input_labels.append(self.labels)

        return input_texts, input_labels

    def _merge_offset(self, full_text, tags, zero_offset, label_offset):
        """Emit unlabelled and labelled spans interleaved by start position.

        ``zero_offset`` and ``label_offset`` are lists of position lists, e.g.
        zero: [[0,1,2], [6,7]]  label: [[3,4,5]]; ``label_offset[j]`` belongs to
        ``tags[j]``.
        """
        i = j = 0
        while i < len(zero_offset) and j < len(label_offset):
            if zero_offset[i][0] < label_offset[j][0]:
                self._add_zero(full_text, zero_offset, i)
                i += 1
            else:
                self._add_label(full_text, label_offset, j, tags)
                j += 1

        while i < len(zero_offset):
            self._add_zero(full_text, zero_offset, i)
            i += 1

        while j < len(label_offset):
            self._add_label(full_text, label_offset, j, tags)
            j += 1

    def _add_zero(self, full_text, offset, index):
        """Tokenize the unlabelled span ``offset[index]`` and tag its tokens ``O``."""
        span = offset[index]
        # span holds every covered position, so the slice must include span[-1].
        text = full_text[span[0]:span[-1] + 1]
        text_tokens = self.tokenizer.tokenize(text)

        self.tokens.extend(text_tokens)
        self.labels.extend(["O"] * len(text_tokens))

    def _add_label(self, full_text, offset, index, tags):
        """Tokenize the entity span ``offset[index]`` and tag it ``B-``/``I-<label>``."""
        span = offset[index]
        text = full_text[span[0]:span[-1] + 1]
        text_tokens = self.tokenizer.tokenize(text)
        if not text_tokens:
            # Nothing to label; adding a lone "B-" here would desynchronise
            # tokens and labels.
            return

        label = tags[index]['label']
        self.tokens.extend(text_tokens)
        self.labels.extend([f"B-{label}"] + [f"I-{label}"] * (len(text_tokens) - 1))

    @staticmethod
    def build_label2id(tokens: List[List[str]]):
        """Map each distinct label to an integer id, in order of first appearance.

        Args:
            tokens: list of label sequences (one per document).
        """
        label2id = {}
        for sequence in tokens:
            for label in sequence:
                if label not in label2id:
                    label2id[label] = len(label2id)
        return label2id

    @staticmethod
    def find_continuous_ranges(data: List[int]):  # [0, 1, 2, 6, 7]
        """Group an ascending list of ints into runs of consecutive values.

        Example: ``[0, 1, 2, 6, 7]`` -> ``[[0, 1, 2], [6, 7]]``; ``[]`` -> ``[]``.
        """
        if not data:
            return []
        ranges = []
        start = data[0]
        prev = data[0]
        for number in data[1:]:
            if number != prev + 1:
                ranges.append(list(range(start, prev + 1)))
                start = number
            prev = number
        ranges.append(list(range(start, prev + 1)))
        return ranges



In [None]:
from transformers import AutoTokenizer

# Corpus location and the tokenizer of the checkpoint that is fine-tuned below.
dataset_folder = "./MACCROBAT2018"
tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")

# Parse the corpus and turn every document into (tokens, BIO labels).
Maccrobat_builder = Preprocessing_Maccrobat(dataset_folder, tokenizer)
input_texts, input_labels = Maccrobat_builder.process()

# Label vocabulary in both directions.
label2id = Preprocessing_Maccrobat.build_label2id(input_labels)
id2label = {label_id: label_name for label_name, label_id in label2id.items()}

## **Dataloader**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation; the fixed random_state seeds the shuffle.
inputs_train, inputs_val, labels_train, labels_val = train_test_split(
    input_texts,
    input_labels,
    test_size=0.2,
    random_state=42
)

In [None]:
import torch
from torch.utils.data import Dataset

MAX_LEN = 512

class NER_Dataset(Dataset):
    """Fixed-length token-classification dataset built from pre-tokenized documents.

    Each item is a dict of three 1-D tensors of length ``max_len``:
    ``input_ids``, ``labels`` and ``attention_mask``.  Sequences longer than
    ``max_len`` are cut, shorter ones are right-padded.  No special tokens are
    added; the token lists are converted to ids as they are.

    Args:
        input_texts: list of token lists (one per document).
        input_labels: list of label-name lists, parallel to ``input_texts``.
        tokenizer: object exposing ``convert_tokens_to_ids`` and ``pad_token_id``.
        label2id: mapping from label name to integer id.
        max_len: length every returned sequence is padded / truncated to.
        label_pad_id: id written into padded label positions.  Defaults to 0
            (the historical behaviour).  Passing -100 makes
            ``torch.nn.CrossEntropyLoss`` skip those positions (its default
            ``ignore_index``); any metric that masks on label id 0 then has to
            be adapted accordingly.
    """

    def __init__(self, input_texts, input_labels, tokenizer, label2id, max_len=MAX_LEN, label_pad_id=0):
        super().__init__()
        self.tokens = input_texts
        self.labels = input_labels
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len
        self.label_pad_id = label_pad_id

    def __len__(self):
        """Number of documents."""
        return len(self.tokens)

    def __getitem__(self, idx):
        """Return the padded / truncated tensors of document ``idx``.

        Raises:
            KeyError: if a label of the document is missing from ``label2id``.
        """
        label_ids = self.encode_labels(self.labels[idx])
        token_ids = self.tokenizer.convert_tokens_to_ids(self.tokens[idx])
        attention_mask = [1] * len(token_ids)

        input_ids = self.pad_and_truncate(token_ids, pad_id=self.tokenizer.pad_token_id)
        labels = self.pad_and_truncate(label_ids, pad_id=self.label_pad_id)
        attention_mask = self.pad_and_truncate(attention_mask, pad_id=0)

        return {
            "input_ids": torch.as_tensor(input_ids),
            "labels": torch.as_tensor(labels),
            "attention_mask": torch.as_tensor(attention_mask)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Return a new list of exactly ``max_len`` items (right-padded or cut)."""
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]
        return padded_inputs

    def encode_labels(self, labels: List[str]):
        """Translate label names into ids using the ``label2id`` mapping."""
        return [self.label2id[label] for label in labels]

    def label2id(self, labels: List[str]):
        # Legacy helper kept for backward compatibility.  On instances the
        # ``label2id`` mapping assigned in __init__ shadows this method, so it
        # can only be reached through the class; use ``encode_labels`` instead.
        return [self.label2id[label] for label in labels]

In [None]:
# Wrap both splits; max_len keeps its default (MAX_LEN).
train_set = NER_Dataset(
    input_texts=inputs_train,
    input_labels=labels_train,
    tokenizer=tokenizer,
    label2id=label2id,
)
val_set = NER_Dataset(
    input_texts=inputs_val,
    input_labels=labels_val,
    tokenizer=tokenizer,
    label2id=label2id,
)

## **Model**

In [None]:
from transformers import AutoModelForTokenClassification

# Load the pretrained checkpoint with this notebook's label vocabulary.
# The checkpoint's classifier head has a different number of outputs than our
# label set (see the warning printed below), so ignore_mismatched_sizes=True
# lets the head be re-created with newly initialised weights instead of failing.
model = AutoModelForTokenClassification.from_pretrained(
    "d4data/biomedical-ner-all",
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True
)


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at d4data/biomedical-ner-all and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([84]) in the checkpoint and torch.Size([83]) in the model instantiated
- classifier.weight: found shape torch.Size([84, 768]) in the checkpoint and torch.Size([83, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## **Training**

In [None]:
import os
# Turn off Weights & Biases reporting for the Trainer.
# NOTE(review): transformers logs this variable as deprecated (removal announced
# for v5) and suggests the `report_to` setting as its replacement.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Accuracy over the positions whose reference label id is not 0.

    Args:
        eval_pred: pair ``(predictions, labels)``; ``predictions`` holds one
            score per class along the last axis, ``labels`` the reference ids.

    Returns:
        The result of the ``accuracy`` metric on the selected positions.
        Label id 0 is also what NER_Dataset writes into padded positions, so
        padding is excluded together with any real label that has id 0.
    """
    scores, gold_ids = eval_pred
    predicted_ids = np.argmax(scores, axis=-1)
    keep = gold_ids != 0
    return accuracy.compute(
        predictions=predicted_ids[keep],
        references=gold_ids[keep],
    )

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration: evaluate, log and checkpoint once per epoch and
# reload the best checkpoint when training finishes.
training_args = TrainingArguments(
    output_dir="ner-biomedical-maccrobat2018",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=20,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
    optim="adamw_torch",
    # Supported way to switch off external experiment trackers (replaces the
    # deprecated WANDB_DISABLED environment variable).
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    # `tokenizer=` is deprecated for Trainer; `processing_class` is its replacement.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,2.5437,1.600864,0.367116
2,1.3262,0.96597,0.625402
3,0.8675,0.743638,0.714485
4,0.6199,0.654387,0.738495
5,0.4707,0.613146,0.765985
6,0.3735,0.602733,0.770944
7,0.3049,0.605551,0.778599
8,0.2507,0.599216,0.779208
9,0.2144,0.611515,0.77799
10,0.1801,0.606163,0.786255


TrainOutput(global_step=200, training_loss=0.4107916533946991, metrics={'train_runtime': 347.5035, 'train_samples_per_second': 9.209, 'train_steps_per_second': 0.576, 'total_flos': 418702245888000.0, 'train_loss': 0.4107916533946991, 'epoch': 20.0})

In [None]:
import os
from getpass import getpass

# SECURITY: never commit an access token to a notebook. The token that used to
# be embedded in this cell must be treated as leaked and revoked on the Hub.
# The token is now read from the HF_TOKEN environment variable, falling back to
# an interactive prompt whose input is not echoed or stored in the notebook.
trainer.push_to_hub(
    commit_message="Training complete",
    token=os.environ.get("HF_TOKEN") or getpass("Hugging Face write token: "),
)

model.safetensors:   0%|          | 0.00/266M [00:00<?, ?B/s]

events.out.tfevents.1738547888.580211c92e0c.798.0:   0%|          | 0.00/9.27k [00:00<?, ?B/s]

Upload 4 LFS files:   0%|          | 0/4 [00:00<?, ?it/s]

events.out.tfevents.1738547908.580211c92e0c.798.1:   0%|          | 0.00/20.2k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/thainq107/ner-biomedical-maccrobat2018/commit/b2557ef3bfc8bcbeadbd33b82edf5fb92422678d', commit_message='Training complete', commit_description='', oid='b2557ef3bfc8bcbeadbd33b82edf5fb92422678d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/thainq107/ner-biomedical-maccrobat2018', endpoint='https://huggingface.co', repo_type='model', repo_id='thainq107/ner-biomedical-maccrobat2018'), pr_revision=None, pr_num=None)

## **Inference**

In [None]:
def inference(sentence, model):
    """Predict one label id per tokenizer token of ``sentence``.

    Uses the notebook-level ``tokenizer``.  The forward pass runs in evaluation
    mode (dropout disabled) and without gradient tracking; the model's previous
    train/eval mode is restored afterwards.

    Args:
        sentence: raw text to tag.
        model: token-classification model exposing ``device`` and returning an
            object with a ``logits`` attribute.

    Returns:
        1-D numpy array of predicted label ids, one per token produced by the
        tokenizer (this includes any special tokens it adds).
    """
    # Tokenize and move the tensors to the model's device
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    inputs = {key: value.to(model.device) for key, value in inputs.items()}

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    predictions = torch.argmax(outputs.logits, dim=-1)

    # Single sentence -> first (only) row of the batch
    return predictions[0].cpu().numpy()


In [None]:
def merge_entity(sentence, preds, model):
    """Group per-token predictions into word-level entity records.

    ``preds`` holds one label id per tokenizer token (sub-words and special
    tokens included), so it cannot be zipped with ``sentence.split()`` directly.
    The sentence is re-tokenized with the notebook-level ``tokenizer`` to obtain
    character offsets, and every whitespace-separated word takes the label of
    its first sub-token.  Consecutive ``B-X`` / ``I-X`` words are then merged.

    Args:
        sentence: the text that was passed to ``inference``.
        preds: label ids as returned by ``inference`` for ``sentence``.
        model: unused; kept so existing calls keep working.

    Returns:
        List of ``{"entity": <label or "O">, "text": <words joined by " ">}``
        in sentence order.  Words that received no prediction (e.g. cut off by
        truncation) are left out.
    """
    import re

    # (start, end, text) of every whitespace-separated word.
    word_spans = [(m.start(), m.end(), m.group()) for m in re.finditer(r"\S+", sentence)]

    # Requires a tokenizer that can report character offsets.
    encoding = tokenizer(sentence, truncation=True, return_offsets_mapping=True)
    offsets = encoding["offset_mapping"]

    # Label of each word = label of the first sub-token that falls inside it.
    word_labels = {}
    word_idx = 0
    for (tok_start, tok_end), pred in zip(offsets, preds):
        if tok_start == tok_end:
            continue  # special tokens carry an empty span
        while word_idx < len(word_spans) and word_spans[word_idx][1] <= tok_start:
            word_idx += 1
        if word_idx == len(word_spans):
            break
        if word_idx not in word_labels:
            word_labels[word_idx] = id2label.get(int(pred), "O")

    merged_list = []
    current_entity = None
    for idx, (_, _, word) in enumerate(word_spans):
        if idx not in word_labels:
            continue
        label = word_labels[idx]

        continues_entity = (
            label.startswith("I-")
            and current_entity is not None
            and current_entity["entity"] == label[2:]
        )
        if continues_entity:
            current_entity["text"] += " " + word
            continue

        # Anything else closes the entity that was being built.
        if current_entity:
            merged_list.append(current_entity)
            current_entity = None

        if label.startswith("B-") or label.startswith("I-"):
            # An "I-" without a matching open entity starts a new one instead
            # of being dropped from the output.
            current_entity = {"entity": label[2:], "text": word}
        elif label == "O":
            merged_list.append({"entity": "O", "text": word})

    # Append the last entity if any
    if current_entity:
        merged_list.append(current_entity)

    return merged_list


In [None]:
# Sample clinical note; punctuation is already separated from words by spaces.
sentence = """A 48 year - old female presented with vaginal bleeding and abnormal Pap smears .
Upon diagnosis of invasive non - keratinizing SCC of the cervix ,
she underwent a radical hysterectomy with salpingo - oophorectomy
which demonstrated positive spread to the pelvic lymph nodes and the parametrium .
Pathological examination revealed that the tumour also extensively involved the lower uterine segment .
"""
# Predict label ids with the in-memory model, then group them into entity records.
preds = inference(sentence, model)
results = merge_entity(sentence, preds, model)

In [None]:
# Display the entity records returned by merge_entity.
results

[{'entity': 'O', 'text': 'A'},
 {'entity': 'O', 'text': '48'},
 {'entity': 'Age', 'text': 'year - old female'},
 {'entity': 'Sex', 'text': 'presented'},
 {'entity': 'Clinical_event', 'text': 'with'},
 {'entity': 'O', 'text': 'vaginal'},
 {'entity': 'Biological_structure', 'text': 'bleeding and'},
 {'entity': 'Sign_symptom', 'text': 'abnormal'},
 {'entity': 'O', 'text': 'Pap'},
 {'entity': 'Lab_value', 'text': 'smears'},
 {'entity': 'Diagnostic_procedure', 'text': '. Upon'},
 {'entity': 'O', 'text': 'diagnosis'},
 {'entity': 'O', 'text': 'of'},
 {'entity': 'O', 'text': 'invasive'},
 {'entity': 'O', 'text': 'non'},
 {'entity': 'O', 'text': '-'},
 {'entity': 'O', 'text': 'keratinizing'},
 {'entity': 'Detailed_description', 'text': 'SCC'},
 {'entity': 'Detailed_description', 'text': 'of the cervix , she underwent'},
 {'entity': 'Disease_disorder', 'text': 'a radical'},
 {'entity': 'O', 'text': 'hysterectomy'},
 {'entity': 'O', 'text': 'with'},
 {'entity': 'Biological_structure', 'text': 's

In [None]:
from transformers import pipeline

# Tag the same sentence with the checkpoint published on the Hub, using the
# token-classification pipeline.
# NOTE(review): this rebinds `results`, replacing the merge_entity output above.
model_checkpoint = "thainq107/ner-biomedical-maccrobat2018"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
results = token_classifier(sentence)


Device set to use cuda:0


In [None]:
from transformers import AutoModelForTokenClassification

# Load a POS-tagging checkpoint only to inspect the size of its label set.
# NOTE(review): this rebinds the global `model`, so the fine-tuned NER model is
# no longer reachable under that name; re-running the inference cells after
# this one would use the POS model.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"
model = AutoModelForTokenClassification.from_pretrained(model_name)

# Get number of tags
num_tags = len(model.config.id2label)
print(f"Number of POS tags: {num_tags}")


Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Number of POS tags: 46


In [None]:
# Rebuild the NER label vocabulary from the processed labels and print its size.
label2id = Preprocessing_Maccrobat.build_label2id(input_labels)
print(len(label2id))

83
