<a href="https://colab.research.google.com/github/imoore2025/CISC488-Final-Project/blob/main/Clinical%20NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
%pip install -q sklearn-crfsuite
%pip install -q transformers datasets seqeval
%pip install -q torch
%pip install -q -U accelerate



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m806.5 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolv

### 1.1 — Load and Preprocess the Dataset

In [None]:
# Read the synthetic clinical NER annotations (one token per row) into a DataFrame.
data = pd.read_csv('synthetic_clinical_ner_dataset.csv')

# Sanity check: show the first five rows with their Token / NER_Tag values.
print(data.head(n=5))

       Token    NER_Tag
0          A  B-SYMPTOM
1      X-ray     B-TEST
2  confirmed          O
3        the          O
4   presence          O


In [None]:
# Step 1: Rebuild sentences from the flat token/tag table.
# A row whose Token is missing or whitespace-only marks a sentence boundary.
sentences = []
current_sentence = []

# Walk the two columns in lockstep; each non-boundary row becomes a (token, tag) pair.
for token, tag in zip(data['Token'], data['NER_Tag']):
    is_boundary = pd.isna(token) or token.strip() == ""
    if not is_boundary:
        current_sentence.append((token, tag))
        continue

    # Boundary row: close the sentence in progress (consecutive blanks are ignored).
    if current_sentence:
        sentences.append(current_sentence)
        current_sentence = []

# The file may not end with a blank row, so flush whatever is still pending.
if current_sentence:
    sentences.append(current_sentence)

print(f"Total sentences: {len(sentences)}")
print(f"Example sentence: {sentences[0]}")

Total sentences: 75
Example sentence: [('A', 'B-SYMPTOM'), ('X-ray', 'B-TEST'), ('confirmed', 'O'), ('the', 'O'), ('presence', 'O'), ('of', 'B-DRUG'), ('C', 'B-DISEASE'), ('.', 'B-DISEASE'), ('difficile', 'B-DISEASE'), ('.', 'B-DISEASE')]


### 1.2 — Training a CRF Model for Clinical NER

#### Imports

In [None]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
import random
from sklearn.model_selection import train_test_split

#### Helper Function

In [None]:
# --- Helper: per-token feature extractor for the CRF ---
def word2features(sent, i):
    """Build the CRF feature dict for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of ``(token, label)`` pairs; only the token text is read.
        i: Index of the token to describe.

    Returns:
        A dict with features of the token itself (lower-cased form, 3-character
        prefix/suffix, casing and digit flags, constant bias) plus the
        lower-cased form and casing flags of the previous and next tokens.
        ``'BOS'`` / ``'EOS'`` flags are set instead when there is no previous /
        next token.
    """
    def _neighbour_features(prefix, neighbour):
        # Same three features for either neighbour, keyed by its offset prefix.
        return {
            f'{prefix}:word.lower()': neighbour.lower(),
            f'{prefix}:word.istitle()': neighbour.istitle(),
            f'{prefix}:word.isupper()': neighbour.isupper(),
        }

    current = sent[i][0]
    last_index = len(sent) - 1

    # Features describing the token itself.
    features = {
        'bias': 1.0,
        'word.lower()': current.lower(),
        'word[-3:]': current[-3:],
        'word[:3]': current[:3],
        'word.isupper()': current.isupper(),
        'word.istitle()': current.istitle(),
        'word.isdigit()': current.isdigit(),
    }

    # Left context, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        features.update(_neighbour_features('-1', sent[i - 1][0]))

    # Right context, or an end-of-sentence marker.
    if i >= last_index:
        features['EOS'] = True
    else:
        features.update(_neighbour_features('+1', sent[i + 1][0]))

    return features

def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token text of every ``(token, label)`` pair in ``sent``, in order."""
    words = []
    for word, _tag in sent:
        words.append(word)
    return words


#### Code

In [None]:
# --- Turn every sentence into a feature sequence (X) and a label sequence (y) ---
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

# --- Hold out 20% of the sentences for evaluation; fixed seed for a repeatable split ---
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# --- Fit a linear-chain CRF with L-BFGS, capped at 100 iterations ---
crf = sklearn_crfsuite.CRF(
    all_possible_transitions=True,
    max_iterations=100,
    algorithm='lbfgs',
)
crf.fit(X_train, y_train)

# --- Tag the held-out sentences and print the per-label report ---
y_pred = crf.predict(X_test)

print(metrics.flat_classification_report(y_test, y_pred, digits=3))

              precision    recall  f1-score   support

   B-DISEASE      1.000     0.857     0.923        14
      B-DRUG      0.857     0.857     0.857         7
   B-SYMPTOM      0.889     0.941     0.914        17
      B-TEST      1.000     0.833     0.909         6
           O      0.971     0.990     0.980       101

    accuracy                          0.959       145
   macro avg      0.943     0.896     0.917       145
weighted avg      0.960     0.959     0.958       145



### 1.3 — Fine-Tune BERT for Clinical NER (HuggingFace)

#### Imports

In [None]:
import torch
import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
from sklearn.model_selection import train_test_split

  torch.utils._pytree._register_pytree_node(


#### Code

In [None]:
# Convert sentences into HuggingFace-compatible structure:
# two parallel lists with one entry per sentence (its tokens, and its tags).
token_list = [[token for token, tag in sent] for sent in sentences]
tag_list = [[tag for token, tag in sent] for sent in sentences]

# Get unique tags and make label mappings.
# Sorting keeps the tag -> id assignment stable from run to run.
unique_tags = sorted({tag for doc in tag_list for tag in doc})
tag2id = {tag: i for i, tag in enumerate(unique_tags)}
id2tag = {i: tag for tag, i in tag2id.items()}

# Map tags to IDs
tag_list_ids = [[tag2id[tag] for tag in seq] for seq in tag_list]

# Prepare dataset.
# The split is seeded so the BERT train/eval partition is reproducible across
# runs, in line with the CRF split above, which also fixes random_state=42.
all_data = Dataset.from_dict({'tokens': token_list, 'ner_tags': tag_list_ids})
train_test = all_data.train_test_split(test_size=0.2, seed=42)

# Load BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Tokenize pre-split words and align the word-level labels with the sub-tokens
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and attach per-sub-token labels.

    Args:
        examples: Batch mapping with ``"tokens"`` (a list of word lists) and
            ``"ner_tags"`` (the matching lists of label ids).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. In each
        sequence, only the first sub-token of a word carries that word's
        label id; positions with no word id and the remaining sub-tokens of a
        word are set to -100.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        aligned = []
        last_seen = None

        for current in tokenized_inputs.word_ids(batch_index=batch_idx):
            # A label is emitted only where a new word begins.
            starts_new_word = current is not None and current != last_seen
            aligned.append(word_labels[current] if starts_new_word else -100)
            last_seen = current

        labels.append(aligned)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenizer + label aligner to the split dataset.
# batched=True passes a batch of examples per call, which is the shape
# tokenize_and_align_labels works on (it looks up word ids by batch index).
tokenized_data = train_test.map(tokenize_and_align_labels, batched=True)


Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Map:   0%|          | 0/60 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

## Stage 2: BERT Fine-Tuning for Clinical NER

In [None]:
from transformers import AutoModelForTokenClassification

# Pre-trained cased BERT with a token-classification head sized to our tag set;
# the id <-> tag mappings are stored in the model config.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-cased",
    label2id=tag2id,
    id2label=id2tag,
    num_labels=len(tag2id),
)

# Collator that pads the inputs and labels of each batch
data_collator = DataCollatorForTokenClassification(tokenizer)

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./bert-clinical-ner",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation settings
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
)

# Wire model, data splits, tokenizer and collator into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
)

# Run fine-tuning
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`