<a href="https://colab.research.google.com/github/kristiewong/CS598_FinalProject/blob/main/CS598_Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries
!pip install transformers==4.28.0 datasets==3.4.1 seqeval==1.2.2 torch --quiet

In [None]:
import re
import os
import random
import numpy as np
import torch
from torch.utils.data import DataLoader
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import seqeval.metrics as seqeval_metrics

In [None]:
#########################
# 1) Set a random seed
#########################
SEED = 42

# Seed the CPU-side generators in a fixed order: Python `random`, NumPy, PyTorch.
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)

# Seed every CUDA device as well, but only when a GPU is actually available.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

In [None]:
#########################
# 2) Hyperparameters & Config
#########################
# Hugging Face checkpoint name used to load the tokenizer and the BERT weights.
model_checkpoint = "bert-base-cased"  # or "bert-base-uncased"
# Per-device batch size (the same value is re-assigned in the model-setup cell).
batch_size = 8
# Number of passes over the training split (also re-assigned in the model-setup cell).
num_epochs = 3
# NOTE(review): this flag is not read by the tokenize_and_align_labels functions
# in this notebook; they always copy the word label to every sub-token.
label_all_tokens = True  # Typically True in NER to label sub-tokens consistently

In [None]:
# Download the PhysioNet de-identified medical text corpus (deidentifiedmedicaltext 1.0); wget prompts for the account password
!wget -r -N -c -np --user kevinlc221 --ask-password https://physionet.org/files/deidentifiedmedicaltext/1.0/


Password for user ‘kevinlc221’: ^C


In [None]:
# Define dataset path
# id.res: the copy of the notes in which PHI has been replaced by [**...**] placeholders.
PHYSIONET_FILEPATH_RES = '/content/physionet.org/files/deidentifiedmedicaltext/1.0/id.res'
# id.text: the original note text; its whitespace tokens are the ones that get PHI/O labels.
PHYSIONET_FILEPATH = '/content/physionet.org/files/deidentifiedmedicaltext/1.0/id.text'

In [None]:
def read_entire_record(f):
    """
    Read the next complete record from an open text stream.

    Lines are consumed from the stream's current position. Everything before
    a line that starts with START_OF_RECORD is skipped. After that, every
    non-blank line is collected (stripped of surrounding whitespace) until a
    line containing END_OF_RECORD is reached.

    Only ``readline()`` is required of ``f``; the stream does not need to be
    seekable or support ``tell()``.

    Args:
        f: A text-mode file-like object positioned at or before a record.

    Returns:
        The record's non-blank lines joined with newlines, or None if EOF is
        reached before a complete record is read (this includes a record
        that was opened but never terminated).
    """
    lines = []
    inside_record = False

    while True:
        line = f.readline()
        if not line:
            # EOF: no complete record was found.
            return None

        line_stripped = line.strip()

        if not inside_record:
            # Outside a record everything is ignored except the start marker.
            if line_stripped.startswith("START_OF_RECORD"):
                inside_record = True
            continue

        if "END_OF_RECORD" in line_stripped:
            return "\n".join(lines)

        # Keep only non-empty lines of the record body.
        if line_stripped:
            lines.append(line_stripped)

In [None]:
import re

# Two alternatives, tried in order at each position:
#   1. optional non-whitespace prefix + "[**" + shortest possible body + "**]"
#      + optional non-whitespace suffix. The body may contain whitespace, and
#      re.DOTALL also lets it contain newlines.
#   2. otherwise, any run of non-whitespace characters.
token_pattern = re.compile(r'(\S*\[\*\*.*?\*\*\]\S*|\S+)', re.DOTALL)

def tokenize_text_record(record_text):
    """
    Tokenize an original-text (id.text) record.

    Splits on runs of whitespace; leading and trailing whitespace produce no
    empty tokens.
    """
    whitespace_tokens = record_text.split()
    return whitespace_tokens

def tokenize_res_record(record_text):
    """
    Tokenize a placeholder (id.res) record.

    Uses the module-level ``token_pattern``: any chunk containing a
    [**...**] placeholder, together with non-whitespace characters glued to
    it, becomes one token. Everything else is split on whitespace.
    Example: "([**Hospital 1**])" comes back as a single token.
    """
    # The pattern is one capturing group around the whole expression, so the
    # full match of each hit is exactly what findall() would return.
    return [match.group(0) for match in token_pattern.finditer(record_text)]

def is_placeholder(token):
    """
    Tell whether a token carries a de-identification placeholder.

    A token counts as a placeholder when it contains both the opening
    marker "[**" and the closing marker "**]" anywhere inside it.
    """
    has_opening = '[**' in token
    has_closing = '**]' in token
    return has_opening and has_closing

In [None]:
def align_record_tokens(text_tokens, res_tokens):
    """
    Produce one "PHI"/"O" label per original-text token.

    The placeholder stream (res_tokens) is walked in parallel with the
    original stream (text_tokens):

      - A non-placeholder res token is an anchor and must equal the current
        text token; that text token is labeled "O".
      - A run of one or more consecutive placeholders covers every text token
        up to (not including) the next anchor, or up to the end of the text
        when the run is the last thing in res_tokens. Those text tokens are
        labeled "PHI".
      - Text tokens left over once res_tokens is exhausted are labeled "O".

    Returns:
        A list of labels with the same length as text_tokens.

    Raises:
        ValueError: when an anchor does not equal the text token it is
            compared against.
    """
    n_text = len(text_tokens)
    n_res = len(res_tokens)
    tags = []
    t = 0  # cursor into text_tokens
    r = 0  # cursor into res_tokens

    while t < n_text and r < n_res:
        if not is_placeholder(res_tokens[r]):
            # Anchor token: both streams must agree verbatim.
            if text_tokens[t] != res_tokens[r]:
                raise ValueError(
                    f"Mismatch: text_tokens[{t}]='{text_tokens[t]}' "
                    f"vs res_tokens[{r}]='{res_tokens[r]}'"
                )
            tags.append("O")
            t += 1
            r += 1
            continue

        # Step past the whole run of adjacent placeholders.
        r += 1
        while r < n_res and is_placeholder(res_tokens[r]):
            r += 1

        if r == n_res:
            # Nothing follows the placeholders: the rest of the text is PHI.
            tags.extend(["PHI"] * (n_text - t))
            t = n_text
            continue

        # Everything before the next anchor is PHI. The anchor itself is left
        # for the next iteration, which labels it "O".
        anchor = res_tokens[r]
        while t < n_text and text_tokens[t] != anchor:
            tags.append("PHI")
            t += 1

    # Any text beyond the end of res_tokens is treated as non-PHI.
    tags.extend(["O"] * (n_text - t))
    return tags

In [None]:
def build_labeled_dataset(text_path, res_path):
    """
    Build token/label pairs from the original and placeholder files.

    Both files are read record by record, in lockstep. Each record pair is
    tokenized (whitespace split for the original text, placeholder-aware
    split for the .res text) and aligned to get a "PHI"/"O" label for every
    original token. Reading stops as soon as either file has no further
    complete record. A record pair whose alignment raises ValueError is
    reported with a printed warning and left out.

    Returns:
        A list of dicts of the form {"tokens": [...], "ner_tags": [...]}.
    """
    dataset = []

    with open(text_path, "r", encoding="utf-8") as text_file, \
            open(res_path, "r", encoding="utf-8") as res_file:
        while True:
            # Always advance both files together so records stay paired.
            raw_text = read_entire_record(text_file)
            raw_res = read_entire_record(res_file)
            if raw_text is None or raw_res is None:
                break

            words = tokenize_text_record(raw_text)
            masked_words = tokenize_res_record(raw_res)

            try:
                tags = align_record_tokens(words, masked_words)
            except ValueError as err:
                print(f"Warning: mismatch in record. Skipping.\n{err}")
            else:
                dataset.append({"tokens": words, "ner_tags": tags})

    return dataset

In [None]:
################################################################################
# 5) Parse the entire dataset
################################################################################

# Argument order: original text file first, placeholder (.res) file second.
all_records = build_labeled_dataset(PHYSIONET_FILEPATH, PHYSIONET_FILEPATH_RES)
print(f"Total parsed records: {len(all_records)}")
# Each element looks like: {"tokens": [...], "ner_tags": [...]}
# Sanity check on the first record; indexing [0] assumes at least one record was parsed.
print(all_records[0]["tokens"])
print(all_records[0]["ner_tags"])

# -- Use only a SUBSET for faster training, e.g. 200 records --
# subset_size = 200  # <-- Adjust if you want more or fewer
# all_records = all_records[:subset_size]
# print(f"Using only the first {subset_size} records to speed up training...")

Total parsed records: 2434
['O:', '58', 'YEAR', 'OLD', 'FEMALE', 'ADMITTED', 'IN', 'TRANSFER', 'FROM', 'CALVERT', 'HOSPITAL', 'FOR', 'MENTAL', 'STATUS', 'CHANGES', 'POST', 'FALL', 'AT', 'HOME', 'AND', 'CONTINUED', 'HYPOTENSION', 'AT', 'CALVERT', 'HOSPITAL', 'REQUIRING', 'DOPAMINE;', 'PMH:', 'CAD,', 'S/P', 'MI', '1992;', 'LCX', 'PTCA;', '3V', 'CABG', 'WITH', 'MVR;', 'CMP;', 'AFIB-', 'AV', 'NODE', 'ABLATION;', 'PERM', 'PACER-', 'DDD', 'MODE;', 'PULM', 'HTN;', 'PVD;', 'NIDDM;', 'HPI:', '2', 'WEEK', 'HISTORY', 'LEG', 'WEAKNESS;', '7/22', 'FOUND', 'BY', 'HUSBAND', 'ON', 'FLOOR-', 'AWAKE,', 'BUT', 'MENTAL', 'STATUS', 'CHANGES;', 'TO', 'CALVERT', 'HOSPITAL', 'ER-', 'TO', 'THEIR', 'ICU;', 'HEAD', 'CT-', 'NEG', 'FOR', 'BLEED;', 'VQ', 'SCAN-', 'NEG', 'FOR', 'PE;', 'ECHO-', 'GLOBAL', 'HYPOKINESIS;', 'EF', 'EST', '20%;', 'R/O', 'FOR', 'MI;', 'DIGOXIN', 'TOXIC', 'WITH', 'HYPERKALEMIA-', 'KAYEXALATE,', 'DEXTROSE,', 'INSULIN;', 'RENAL', 'INSUFFICIENCY-', 'BUN', '54,', 'CR', '2.8;', 'INR', '7', '(', '

In [None]:
################################################################################
# 6) Train/Test Split
#
#    The snippet below does a simple random 80/20 split for demonstration.
#    If you have an official split or want to do cross-validation, adapt this part.
################################################################################

# Shuffles all_records in place using the global `random` state, so the split
# is only reproducible if that state was seeded beforehand.
random.shuffle(all_records)
# First 80% of the shuffled records go to train, the remainder to test.
split_idx = int(0.8 * len(all_records))
train_records = all_records[:split_idx]
test_records  = all_records[split_idx:]

# Each record dict ({"tokens", "ner_tags"}) becomes one dataset row.
train_dataset = Dataset.from_list(train_records)
test_dataset  = Dataset.from_list(test_records)

physionet_dataset = DatasetDict({
    "train": train_dataset,
    "test":  test_dataset
})

print("Train size:", len(physionet_dataset["train"]))
print("Test size: ", len(physionet_dataset["test"]))

Train size: 1947
Test size:  487


In [None]:
################################################################################
# 7) Build label mappings
#
#    In this simple example, we have only two labels: ["O", "PHI"].
#    If you adapt your logic to detect multiple entity types, add them all here.
################################################################################

# The list position of each tag is its integer id.
unique_tags = ["O", "PHI"]
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = dict(enumerate(unique_tags))
print("tag2id:", tag2id)

tag2id: {'O': 0, 'PHI': 1}


In [None]:
################################################################################
# 8) Tokenize and Align Labels
#
#    Because the notes are already "tokenized" at the whitespace level,
#    we can use the tokenizer with `is_split_into_words=True`.
#    We'll replicate each label for subword tokens (common in NER).
################################################################################

from transformers import AutoTokenizer

# Re-declared here so this cell can run on its own.
model_checkpoint = "bert-base-cased"  # or "bert-base-uncased"
# The label-alignment code later calls word_ids() on the tokenizer output.
# NOTE(review): word_ids() is a fast-tokenizer feature per the HF docs, so keep use_fast=True.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split examples and expand word labels to sub-tokens.

    Reads the module-level ``tokenizer`` and ``tag2id``.

    Args:
        examples: A batch with "tokens" (list of word lists) and "ner_tags"
            (list of per-word tag-string lists).

    Returns:
        The tokenizer output with two extra entries:
          - "labels": for every sub-token, the id of its source word's tag,
            or -100 where the sub-token maps to no word (special tokens).
          - "word_ids": the sub-token -> word index mapping for each example.
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    label_rows = []
    word_id_rows = []
    for row, word_tags in enumerate(examples["ner_tags"]):
        sub_to_word = encoded.word_ids(batch_index=row)
        # Every sub-token inherits its word's tag; -100 marks "ignore".
        label_rows.append([
            -100 if word_idx is None else tag2id[word_tags[word_idx]]
            for word_idx in sub_to_word
        ])
        word_id_rows.append(sub_to_word)

    encoded["labels"] = label_rows
    # Kept so the original words can be reconstructed from sub-tokens later.
    encoded["word_ids"] = word_id_rows
    return encoded

# Tokenize every split of the DatasetDict. With batched=True the function
# receives a batch (dict of lists), which is how tokenize_and_align_labels indexes it.
processed_dataset = physionet_dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=["tokens", "ner_tags"],  # remove original columns post-processing
)

Map:   0%|          | 0/1947 [00:00<?, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

In [None]:
!pip install pytorch-crf  # for example




In [None]:
from torch import nn
from transformers import BertModel, BertPreTrainedModel, BertConfig

from torchcrf import CRF

class BertCRF(BertPreTrainedModel):
    """
    BERT encoder + linear projection + CRF (torchcrf) for token classification.

    forward() returns:
      (loss, decoded_tensor) if labels are provided
      (decoded_tensor,)      if labels is None

    ``decoded_tensor`` is a long tensor of shape (batch, longest decoded path)
    holding the Viterbi tag ids, right-padded with -1.
    """
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.hidden2label = nn.Linear(config.hidden_size, self.num_labels)
        self.crf = CRF(num_tags=self.num_labels, batch_first=True)
        self.init_weights()

    def forward(self, input_ids, attention_mask=None, token_type_ids=None, labels=None):
        """
        Run the encoder, score tags, decode the best path and optionally compute the loss.

        Args:
            input_ids: token ids, (batch, seq_len).
            attention_mask: optional 1/0 (or boolean) validity mask, (batch, seq_len).
            token_type_ids: optional segment ids forwarded to BERT.
            labels: optional gold tag ids, (batch, seq_len); negative values
                (the HF "-100" convention) mark positions to ignore.

        Returns:
            ``(loss, best_paths_tensor)`` when labels are given, otherwise
            ``(best_paths_tensor,)``. The loss is the negated CRF
            log-likelihood as returned by the CRF layer.
        """
        outputs = self.bert(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask
        )
        sequence_output = outputs[0]  # (batch_size, seq_len, hidden_size)

        # Project to tag space
        sequence_output = self.dropout(sequence_output)
        emissions = self.hidden2label(sequence_output)  # (batch_size, seq_len, num_labels)

        # Build the CRF mask. In both branches `mask` is a tensor owned by this
        # call, so the in-place write below cannot leak into caller tensors.
        if labels is not None:
            # Typical HF practice: -100 = "ignore"
            mask = (labels >= 0)  # fresh boolean tensor, (batch_size, seq_len)
            # Ignored positions get tag id 0 so the CRF never sees a negative index.
            labels = labels * mask
        elif attention_mask is not None:
            # .bool() returns the same tensor when attention_mask is already
            # boolean; clone so the caller's mask is never modified.
            mask = attention_mask.bool().clone()
        else:
            mask = None

        # torchcrf requires the first timestep of every sequence to be unmasked.
        if mask is not None and mask.shape[1] > 0:
            mask[:, 0] = True

        # Viterbi decoding yields plain Python lists and is not differentiable,
        # so run it without autograd tracking.
        with torch.no_grad():
            best_paths = self.crf.decode(emissions, mask=mask)

        # Convert each best_path list to a padded tensor of shape [B, max_len]
        max_len = max(len(p) for p in best_paths) if best_paths else 0
        batch_size = emissions.size(0)
        best_paths_tensor = torch.full(
            (batch_size, max_len),
            -1,
            dtype=torch.long,
            device=emissions.device
        )
        for i, path in enumerate(best_paths):
            best_paths_tensor[i, :len(path)] = torch.tensor(path, device=emissions.device)

        if labels is None:
            return (best_paths_tensor,)

        # The CRF layer returns a log-likelihood; negate it to get a loss.
        log_likelihood = self.crf(emissions, tags=labels, mask=mask)
        loss = -log_likelihood
        return (loss, best_paths_tensor)


In [None]:
################################################################################
# 9) Model and Trainer Setup
################################################################################

from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

num_labels = len(unique_tags)
# Plain token-classification head, kept for reference; the CRF model below is used instead.
# model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# num_labels is carried by the config; BertCRF reads config.num_labels.
config = BertConfig.from_pretrained(model_checkpoint, num_labels=num_labels)
model = BertCRF.from_pretrained(model_checkpoint, config=config)

batch_size = 8
num_epochs = 3

training_args = TrainingArguments(
    output_dir="bert-deid-physionet-demo",
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    logging_steps=50,
    report_to="none",  # Avoids default W&B
    # No metric_for_best_model is given.
    # NOTE(review): confirm which metric then selects the "best" checkpoint.
    load_best_model_at_end=True,
    save_total_limit=2,
    seed=SEED,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertCRF: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertCRF from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertCRF from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertCRF were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['hidden2label.bias', 'hidden2label.weight', 'crf.transitions', 'crf.end_transit

In [None]:
################################################################################
# 10) Metrics (Precision, Recall, F1, Accuracy via seqeval)
################################################################################

from seqeval.metrics import precision_score, recall_score, f1_score, accuracy_score

def compute_metrics(eval_preds):
    """
    Score decoded tag ids against gold labels with seqeval.

    Args:
        eval_preds: A (predictions, labels) pair of per-example id sequences;
            torch tensors are converted to numpy first.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy".

    Positions whose gold label is -100 are dropped from both sequences before
    ids are mapped to tag strings through the module-level ``id2tag``.
    """
    predictions, labels = eval_preds

    if isinstance(predictions, torch.Tensor):
        predictions = predictions.cpu().numpy()
    if isinstance(labels, torch.Tensor):
        labels = labels.cpu().numpy()

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predictions, labels):
        # Keep only the positions that carry a real gold label.
        scored = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]

        # Map integers to strings: e.g. 0->"O", 1->"PHI"
        true_preds.append([id2tag[p] for p, _ in scored])
        true_labels.append([id2tag[g] for _, g in scored])

    # seqeval works on lists of string-tag sequences.
    return {
        "precision": precision_score(true_labels, true_preds),
        "recall":    recall_score(true_labels, true_preds),
        "f1":        f1_score(true_labels, true_preds),
        "accuracy":  accuracy_score(true_labels, true_preds),
    }


In [None]:
################################################################################
# 11) Trainer
################################################################################

# Collator built from the tokenizer; it assembles the tokenized examples into batches.
data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    # The held-out "test" split doubles as the per-epoch evaluation set.
    eval_dataset=processed_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
################################################################################
# 12) Train
################################################################################

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,73.3741,81.73394,0.572072,0.395639,0.467772,0.992169




In [None]:
################################################################################
# 13) Evaluate
################################################################################

metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)

In [None]:
# Run the trained model on one raw sentence and print a tag per sub-word token.
sentence = "John Smith arrived at the New York Hospital."
encoding = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=128)

# The Trainer expects a Dataset object with lists (one entry per example),
# so the single encoded sentence is wrapped into lists of length 1.
sample_dataset = Dataset.from_dict({
    "input_ids": [encoding["input_ids"][0].tolist()],
    "attention_mask": [encoding["attention_mask"][0].tolist()]
    # If your model is trained with token_type_ids, add them similarly.
})

predictions, label_ids, metrics = trainer.predict(sample_dataset)
predictions = predictions[0]  # shape [seq_len]

# Convert input_ids (list of int) back to tokens
tokens = tokenizer.convert_ids_to_tokens(sample_dataset[0]["input_ids"])

# Pair each sub-word token with its predicted tag, skipping the special tokens.
pred_labels = []
for token, pred_id in zip(tokens, predictions):
    if token in ("[CLS]", "[SEP]"):
        continue
    # int() turns the numpy integer into a plain Python key for the id2tag lookup.
    pred_labels.append((token, id2tag[int(pred_id)]))

print("=== Single Sentence Prediction ===")
for tok, lab in pred_labels:
    print(f"{tok:15} => {lab}")


In [None]:
trainer.save_model("/content/final_ner_model")

In [None]:
!zip -r /content/model.zip /content/final_ner_model

In [None]:
# ✅ 1. Install dependencies
!pip install transformers datasets seqeval --quiet

# ✅ 2. Imports
import torch
from datasets import DatasetDict, Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from seqeval.metrics import accuracy_score, precision_score, recall_score, f1_score

# ✅ 3. Define label mappings
unique_tags = ["O", "PHI"]
# The list position of each tag is its integer id; id2tag is the inverse lookup.
tag2id = dict(zip(unique_tags, range(len(unique_tags))))
id2tag = {idx: name for name, idx in tag2id.items()}

# ✅ 4. Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

# ✅ 5. Tokenization function

def tokenize_and_align_labels(examples, tokenizer, tag2id):
    """
    Tokenize a batch of pre-split examples and expand word labels to sub-tokens.

    Args:
        examples: A batch with "tokens" (list of word lists) and "ner_tags"
            (list of per-word tag-string lists).
        tokenizer: Callable returning an encoding that offers
            ``word_ids(batch_index=...)`` and item assignment.
        tag2id: Mapping from tag string to integer id.

    Returns:
        The tokenizer output with a "labels" entry added: for every sub-token,
        the id of its source word's tag, or -100 where the sub-token maps to
        no word (special tokens).
    """
    encoded = tokenizer(
        examples["tokens"],
        truncation=True,
        is_split_into_words=True,
    )

    encoded["labels"] = [
        [
            -100 if word_idx is None else tag2id[word_tags[word_idx]]
            for word_idx in encoded.word_ids(batch_index=row)
        ]
        for row, word_tags in enumerate(examples["ner_tags"])
    ]
    return encoded


# ✅ 6. Load your processed dataset here
# Replace with actual load if not already present
# processed_dataset = DatasetDict(...)

# ✅ 7. Compute metrics
def compute_metrics(eval_preds):
    """
    Score argmax predictions against gold labels with seqeval.

    Args:
        eval_preds: A (logits, labels) pair. The highest-scoring class along
            the last axis of the logits is taken as the prediction.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy".

    Positions whose gold label is -100 are skipped; the remaining ids are
    mapped to tag strings through the module-level ``id2tag``.
    """
    logits, labels = eval_preds
    predicted_ids = torch.tensor(logits).argmax(dim=-1).numpy()
    gold_ids = torch.tensor(labels).numpy()

    true_preds, true_labels = [], []
    for pred_row, gold_row in zip(predicted_ids, gold_ids):
        # Translate each kept (prediction, gold) pair to tag strings together.
        tagged = [
            (id2tag[p], id2tag[g])
            for p, g in zip(pred_row, gold_row)
            if g != -100
        ]
        true_preds.append([pred_tag for pred_tag, _ in tagged])
        true_labels.append([gold_tag for _, gold_tag in tagged])

    return {
        "precision": precision_score(true_labels, true_preds),
        "recall": recall_score(true_labels, true_preds),
        "f1": f1_score(true_labels, true_preds),
        "accuracy": accuracy_score(true_labels, true_preds),
    }

# ✅ 8. Define model (Vanilla BERT)
model = AutoModelForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(tag2id))

# ✅ 9. Training Arguments
# load_best_model_at_end=True requires the save strategy to match the
# evaluation strategy. The default save strategy is step-based, so it is set
# to "epoch" explicitly; otherwise TrainingArguments raises ValueError.
training_args = TrainingArguments(
    output_dir="./bert-deid-vanilla",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    load_best_model_at_end=True,
    seed=42,
)

# ✅ 10. Trainer
# processed_dataset is not built in this cell; it must already exist in the
# session (see the "Load your processed dataset here" step above).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=processed_dataset["train"],
    # The held-out "test" split doubles as the per-epoch evaluation set.
    eval_dataset=processed_dataset["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer),
    compute_metrics=compute_metrics,
)

# ✅ 11. Train and Evaluate
trainer.train()
metrics = trainer.evaluate()
print("Evaluation metrics:", metrics)
