Biomedical Named Entity Recognition

In [1]:
!git clone https://huggingface.co/datasets/masaenger/bc5cdr

Cloning into 'bc5cdr'...
remote: Enumerating objects: 43, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (40/40), done.[K
remote: Total 43 (delta 14), reused 0 (delta 0), pack-reused 3 (from 1)[K
Unpacking objects: 100% (43/43), 22.26 KiB | 1.39 MiB/s, done.


In [14]:
!pip install seqeval gradio transformers

Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Do

In [3]:
import pandas as pd

train_df = pd.read_parquet('/content/bc5cdr/bc5cdr_source/train-00000-of-00001.parquet')
val_df = pd.read_parquet('/content/bc5cdr/bc5cdr_source/validation-00000-of-00001.parquet')
test_df = pd.read_parquet('/content/bc5cdr/bc5cdr_source/test-00000-of-00001.parquet')

print(f"Train: {train_df.shape}")
print(f"Validation: {val_df.shape}")
print(f"Test: {test_df.shape}")

print(train_df.head())


Train: (500, 1)
Validation: (500, 1)
Test: (500, 1)
                                            passages
0  [{'document_id': '227508', 'type': 'title', 't...
1  [{'document_id': '354896', 'type': 'title', 't...
2  [{'document_id': '435349', 'type': 'title', 't...
3  [{'document_id': '603022', 'type': 'title', 't...
4  [{'document_id': '1378968', 'type': 'title', '...


In [4]:
#unpack first row
sample = train_df['passages'][0]
text_parts = []
entities = []

for section in sample:
    #collect text
    text_parts.append(section['text'])

    #collect entities
    for ent in section['entities']:
        entities.append({
            'text': ent['text'],
            'start': ent['offsets'][0][0],
            'end': ent['offsets'][0][1],
            'type': ent['type']
        })

#combine all text (title + abstract)
full_text = ' '.join(text_parts)
print("Full text:", full_text)
print("Entities:", entities)


Full text: Naloxone reverses the antihypertensive effect of clonidine. In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate receptors. As naloxone and clonidine do not appear to interact with the same receptor site, the observed functional antagonism suggests th

In [5]:
from transformers import AutoTokenizer

#load BioBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [10]:
def align_labels_with_tokens(text, entities, tokenizer):
    #create a char-level label list
    char_labels = ['O'] * len(text)
    for ent in entities:
        start = int(ent['start'])
        end = int(ent['end'])
        ent_type = ent['type']
        char_labels[start] = f'B-{ent_type}'
        for i in range(start + 1, end):
            char_labels[i] = f'I-{ent_type}'

    #tokenize text with offsets
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=True)
    labels = []

    for idx, (start, end) in enumerate(tokenized['offset_mapping']):
        if start == end:
            #special tokens like [CLS], [SEP]
            labels.append('O')
        else:
            token_label = char_labels[start]
            labels.append(token_label)

    return tokenized, labels

#example
tokenized_input, bio_labels = align_labels_with_tokens(full_text, entities, tokenizer)

print("Tokens:", tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))
print("BIO Labels:", bio_labels)

Tokens: ['[CLS]', 'na', '##lo', '##xon', '##e', 'reverse', '##s', 'the', 'anti', '##hy', '##pert', '##ens', '##ive', 'effect', 'of', 'c', '##lon', '##id', '##ine', '.', 'in', 'un', '##ane', '##st', '##he', '##tized', ',', 'spontaneous', '##ly', 'h', '##yper', '##tensive', 'rats', 'the', 'decrease', 'in', 'blood', 'pressure', 'and', 'heart', 'rate', 'produced', 'by', 'in', '##tra', '##ven', '##ous', 'c', '##lon', '##id', '##ine', ',', '5', 'to', '20', 'micro', '##gram', '##s', '/', 'kg', ',', 'was', 'in', '##hibit', '##ed', 'or', 'reversed', 'by', 'na', '##lo', '##zone', ',', '0', '.', '2', 'to', '2', 'mg', '/', 'kg', '.', 'the', 'h', '##y', '##pot', '##ens', '##ive', 'effect', 'of', '100', 'mg', '/', 'kg', 'alpha', '-', 'met', '##hyl', '##do', '##pa', 'was', 'also', 'partially', 'reversed', 'by', 'na', '##lo', '##xon', '##e', '.', 'na', '##lo', '##xon', '##e', 'alone', 'did', 'not', 'affect', 'either', 'blood', 'pressure', 'or', 'heart', 'rate', '.', 'in', 'brain', 'membrane', '##s', '

In [23]:
#define label list
label_list = ['O', 'B-Chemical', 'I-Chemical', 'B-Disease', 'I-Disease']
label_to_id = {label: idx for idx, label in enumerate(label_list)}
id2label = {idx: label for idx, label in enumerate(label_list)}

def encode_sample(text, entities, tokenizer, max_length=512):
    tokenized, bio_labels = align_labels_with_tokens(text, entities, tokenizer)
    label_ids = [label_to_id[label] if label in label_to_id else label_to_id['O'] for label in bio_labels]

    #pad or truncate labels to match max_length
    if len(label_ids) > max_length:
        label_ids = label_ids[:max_length]
    else:
        label_ids = label_ids + [label_to_id['O']] * (max_length - len(label_ids))

    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    return {
        'input_ids': encoded['input_ids'].squeeze(),  # remove batch dim
        'attention_mask': encoded['attention_mask'].squeeze(),
        'labels': torch.tensor(label_ids)
    }


In [12]:
#build dataset
from torch.utils.data import Dataset
import torch

class NERDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.samples = []
        for row in df['passages']:
            #flatten all sections
            text_parts = [p['text'] for p in row]
            full_text = ' '.join(text_parts)
            entities = []
            for p in row:
                for ent in p['entities']:
                    entities.append({
                        'text': ent['text'],
                        'start': ent['offsets'][0][0],
                        'end': ent['offsets'][0][1],
                        'type': ent['type']
                    })
            encoded = encode_sample(full_text, entities, tokenizer)
            self.samples.append(encoded)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        return self.samples[idx]

train_dataset = NERDataset(train_df, tokenizer)
val_dataset = NERDataset(val_df, tokenizer)

In [15]:
import numpy as np
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [
        [label_list[label_id] for (label_id, label_mask) in zip(label_row, label_row) if label_id != -100]
        for label_row in labels
    ]
    true_predictions = [
        [label_list[pred_id] for (pred_id, label_id) in zip(pred_row, label_row) if label_id != -100]
        for pred_row, label_row in zip(predictions, labels)
    ]

    precision = precision_score(true_labels, true_predictions)
    recall = recall_score(true_labels, true_predictions)
    f1 = f1_score(true_labels, true_predictions)

    print(classification_report(true_labels, true_predictions))

    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [16]:
#trainer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

#weights
class_counts = [100000, 5238, 5238, 4204, 4204]  #[O, B-Chemical, I-Chemical, B-Disease, I-Disease]
total = sum(class_counts)
class_weights = [total / c for c in class_counts]
class_weights = torch.tensor(class_weights)

model = AutoModelForTokenClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.1",
    num_labels=len(label_list),
)

#inject weights into loss function
model.config.problem_type = "single_label_classification"
model.loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


In [17]:
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33miacobiasmina[0m ([33miacobiasmina-uvt[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,0.075


TrainOutput(global_step=630, training_loss=0.06192224442012726, metrics={'train_runtime': 548.181, 'train_samples_per_second': 9.121, 'train_steps_per_second': 1.149, 'total_flos': 1306519219200000.0, 'train_loss': 0.06192224442012726, 'epoch': 10.0})

In [18]:
trainer.save_model("./bio_ner_model")
tokenizer.save_pretrained("./bio_ner_model")

('./bio_ner_model/tokenizer_config.json',
 './bio_ner_model/special_tokens_map.json',
 './bio_ner_model/vocab.txt',
 './bio_ner_model/added_tokens.json',
 './bio_ner_model/tokenizer.json')

In [19]:
#eval
metrics = trainer.evaluate()
print(metrics)

              precision    recall  f1-score   support

    Chemical       0.90      0.94      0.92      5238
     Disease       0.75      0.83      0.79      4204

   micro avg       0.83      0.89      0.86      9442
   macro avg       0.82      0.89      0.85      9442
weighted avg       0.83      0.89      0.86      9442

{'eval_loss': 0.08364984393119812, 'eval_precision': 0.8303245536154681, 'eval_recall': 0.89144249099767, 'eval_f1': 0.8597987639818173, 'eval_runtime': 17.6199, 'eval_samples_per_second': 28.377, 'eval_steps_per_second': 3.576, 'epoch': 10.0}


In [25]:
import gradio as gr
from transformers import pipeline

def clean_bio_predictions(entities):
    cleaned = []
    prev_label = 'O'
    for ent in entities:
        label = ent['entity']
        if label.startswith('I-') and prev_label == 'O':
            label = 'B-' + label[2:]
        ent['entity'] = label
        cleaned.append(ent)
        prev_label = label
    return cleaned

def merge_entities(entities):
    merged = []
    current_entity = None

    for ent in entities:
        if ent['entity'].startswith('B-'):
            #start of a new entity
            if current_entity:
                merged.append(current_entity)
            current_entity = {
                'entity': ent['entity'][2:],  #remove B-/I-
                'word': ent['word'].lstrip('##'),
                'score': ent['score'],
                'start': ent['start'],
                'end': ent['end']
            }
        elif ent['entity'].startswith('I-') and current_entity:
            #continuation of the current entity
            current_entity['word'] += ent['word'].lstrip('##')
            current_entity['score'] = max(current_entity['score'], ent['score'])
            current_entity['end'] = ent['end']
        else:
            #outside entity or unexpected I- without B-
            if current_entity:
                merged.append(current_entity)
                current_entity = None

    #append any remaining entity
    if current_entity:
        merged.append(current_entity)

    return merged

In [27]:
#interactive demo

model_path = "./bio_ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

def ner_predict(text):
    results = ner_pipeline(text)
    entities = []
    for r in results:
        entities.append({
            'entity': id2label[int(r['entity_group'].split('_')[-1])],
            'word': r['word'],
            'score': float(r['score']),
            'start': r['start'],
            'end': r['end']
        })

    entities = clean_bio_predictions(entities)

    merged_entities = merge_entities(entities)

    return {
        'merged_entities': merged_entities,
        'detailed_per_token': entities
    }

demo = gr.Interface(fn=ner_predict, inputs="text", outputs="json", title="Biomedical NER Demo")
demo.launch()


Some weights of the model checkpoint at ./bio_ner_model were not used when initializing BertForTokenClassification: ['loss_fct.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bd6ec4148d36cb01fc.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [21]:
#error check

#model predictions on validation set
outputs = trainer.predict(val_dataset)
predictions = np.argmax(outputs.predictions, axis=2)
labels = outputs.label_ids

#map label IDs to label names
true_labels = [
    [label_list[label_id] for label_id in label_row if label_id != -100]
    for label_row in labels
]
predicted_labels = [
    [label_list[pred_id] for pred_id, label_id in zip(pred_row, label_row) if label_id != -100]
    for pred_row, label_row in zip(predictions, labels)
]


              precision    recall  f1-score   support

    Chemical       0.90      0.94      0.92      5238
     Disease       0.75      0.83      0.79      4204

   micro avg       0.83      0.89      0.86      9442
   macro avg       0.82      0.89      0.85      9442
weighted avg       0.83      0.89      0.86      9442



In [22]:
for i in range(5):  # look at first 5 samples
    print(f"\nExample {i}")
    print("TRUE :", true_labels[i])
    print("PRED :", predicted_labels[i])

    if true_labels[i] != predicted_labels[i]:
        print(">> MISMATCH FOUND!")


Example 0
TRUE : ['O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'B-Chemical', 'I-Chemical', 'I-Chemical', 'I-Chemical', 'B-Disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'O', 'O', 'O', 'O', 'B-Chemical', 'I-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'I-Disease', 'O', 'B-Disease', 'I-Disease', 'I-Disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Disease', 'I-Disease', 'O', 'O', 'O', 'O', 'B-Chemical', 'I-Chemical', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Chemical', 'I-Chemical', 'I-C