## Clinical Text De-identification 

This notebook implements three approaches for de-identifying Protected Health Information (PHI) in discharge reports:

- Method 1: Basic NER (general domain) + Regex rules
- Method 2: Medical domain NER (PHI de-identification) + Regex
- Method 3: 3B-class instruction-tuned LLM (Phi-3-mini, used as-is with no fine-tuning) prompted for de-identification

In [1]:
!pip install -q spacy==3.7.5 pandas==2.2.2 numpy==1.26.4 regex==2024.7.24 transformers==4.42.4 datasets==2.20.0 accelerate==0.33.0 torch --extra-index-url https://download.pytorch.org/whl/cpu
!python3 -m spacy download en_core_web_lg -q

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [1]:
import os
import re
import json
import pandas as pd
from typing import List, Tuple, Dict

# Input CSV and output directory. The defaults are the original author's local
# locations; set DEID_DATA_PATH / DEID_OUTPUT_DIR in the environment to run the
# notebook elsewhere without editing this cell.
DATA_PATH = os.environ.get(
    "DEID_DATA_PATH",
    "/Users/aryamanbahl/Desktop/IIITH/M25/NLP-H/Assignments/Discharge Reports Dataset.csv",
)
OUTPUT_DIR = os.environ.get(
    "DEID_OUTPUT_DIR",
    "/Users/aryamanbahl/Desktop/IIITH/M25/NLP-H/Assignments/part2/",
)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load dataset
notes_df = pd.read_csv(DATA_PATH)
print("Loaded rows:", len(notes_df))
print(notes_df.columns.tolist())

# Keep a subset for demo if dataset is huge
MAX_DOCS = 5
# .copy() so later edits to the sample can never write through to notes_df.
sample_df = notes_df.head(MAX_DOCS).copy()

def get_texts(df: pd.DataFrame) -> List[str]:
    """Return the ``report`` column of ``df`` as a list of strings, in row order.

    Missing reports (NaN/None) become empty strings. A plain ``astype(str)``
    would turn them into the literal text "nan"/"None", which would then be
    de-identified and saved as if it were a real note.

    Args:
        df: Frame containing a ``report`` column.

    Returns:
        One string per row of ``df``.

    Raises:
        KeyError: if ``df`` has no ``report`` column.
    """
    if "report" not in df.columns:
        raise KeyError(
            "Expected a 'report' column; found: " + ", ".join(map(str, df.columns))
        )
    reports = df["report"]
    # isna() is evaluated element-wise up front so every cell type is handled uniformly.
    return ["" if missing else str(value) for value, missing in zip(reports, reports.isna())]

# Materialise the sampled reports and report their combined length in characters.
raw_texts = get_texts(sample_df)
print("Sample chars:", sum(map(len, raw_texts)))


Loaded rows: 200
['note_id', 'subject_id', 'hadm_id', 'note_type', 'note_seq', 'charttime', 'storetime', 'report']
Sample chars: 36691


In [2]:
# Helper: masking utilities
from dataclasses import dataclass

@dataclass
class Span:
    """A labelled character range of a text that should be masked.

    ``start`` and ``end`` are used as slice bounds by ``apply_spans_mask``
    (text before ``start`` is kept, copying resumes at ``end``), so ``end`` is
    exclusive.
    """
    # Offset of the first character covered by the span.
    start: int
    # Offset one past the last character covered by the span.
    end: int
    # Entity label; looked up in PLACEHOLDER_MAP to pick the replacement text.
    label: str

# Replacement text for each internal label. PERSON is the only label whose
# placeholder differs from its own name; every other label maps to "[<LABEL>]".
PLACEHOLDER_MAP = {
    "PERSON": "[NAME]",
    **{
        label: f"[{label}]"
        for label in ("ORG", "DATE", "EMAIL", "PHONE", "ID", "HOSPITAL", "LOCATION")
    },
}


def apply_spans_mask(text: str, spans: "List[Span]") -> str:
    """Replace every span of ``text`` with the placeholder for its label.

    Overlapping spans are merged into one masked region that covers their
    union, so no character flagged by any detector is left in the output.
    (Dropping the later span of an overlapping pair would leave its tail
    unmasked, e.g. an ID number following a name span that ran too far.) The
    merged region keeps the label of the span that starts first; on equal
    starts the longest span wins. Spans that merely touch are kept separate.

    Args:
        text: Original text.
        spans: Objects with ``start``, ``end`` (exclusive) and ``label``
            attributes, in any order. Offsets are clamped to the text bounds
            and empty spans are ignored. The list is not modified.

    Returns:
        ``text`` with each merged region replaced by
        ``PLACEHOLDER_MAP[label]``, or ``"[<label>]"`` for unknown labels.
    """
    text_len = len(text)
    ordered = sorted(spans, key=lambda s: (s.start, -(s.end - s.start)))

    # Non-overlapping [start, end, label] regions in ascending order.
    merged = []
    for s in ordered:
        start = max(0, s.start)
        end = min(text_len, s.end)
        if end <= start:
            continue
        if merged and start < merged[-1][1]:
            # Overlap: widen the previous region instead of discarding this span.
            if end > merged[-1][1]:
                merged[-1][1] = end
            continue
        merged.append([start, end, s.label])

    # Build masked text
    out = []
    last = 0
    for start, end, label in merged:
        out.append(text[last:start])
        out.append(PLACEHOLDER_MAP.get(label, f"[{label}]"))
        last = end
    out.append(text[last:])
    return "".join(out)


def save_outputs(tag: str, texts: List[str]):
    """Write ``texts`` to ``OUTPUT_DIR/deid_<tag>.jsonl``, one JSON object per line.

    Each line has the form ``{"text": ...}``. The file is opened as UTF-8
    explicitly: ``ensure_ascii=False`` emits non-ASCII characters verbatim, and
    the platform default encoding is not guaranteed to be able to encode them.

    Args:
        tag: Suffix that identifies the method in the file name.
        texts: De-identified documents, written in order.
    """
    out_path = os.path.join(OUTPUT_DIR, f"deid_{tag}.jsonl")
    with open(out_path, "w", encoding="utf-8") as f:
        for t in texts:
            f.write(json.dumps({"text": t}, ensure_ascii=False) + "\n")
    print("Saved:", out_path)


In [3]:
# Method 1: spaCy NER + Regex rules
import spacy
import regex as re2

# Load the spaCy pipeline once at cell level; spacy_ner_regex_deid reuses this
# object for every document instead of reloading it per call.
nlp = spacy.load("en_core_web_lg")

# Regex patterns for structured identifiers
# The date patterns accept one- or two-digit day/month fields, two-digit years
# in slash dates, an optional "." after an abbreviated month and ordinal
# suffixes. Every string the stricter originals matched is still matched.
# Callers apply these with IGNORECASE.
_MONTH = r"(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]*\.?"
DATE_PATTERNS = [
    r"\b\d{4}-\d{1,2}-\d{1,2}\b",                 # 2025-08-06, 2025-8-6
    r"\b\d{1,2}/\d{1,2}/(?:\d{4}|\d{2})\b",       # 08/06/2025, 8/6/2025, 8/6/25
    # Hyphenated dates keep the 4-digit year so dose runs like "10-10-12" are not masked.
    r"\b\d{1,2}-\d{1,2}-\d{4}\b",                 # 08-06-2025, 8-6-2025
    r"\b" + _MONTH + r"\s+\d{1,2}(?:st|nd|rd|th)?(?:,\s*|\s+)\d{4}\b",   # Aug 6, 2025 / Aug. 6th 2025
    r"\b\d{1,2}(?:st|nd|rd|th)?\s+" + _MONTH + r",?\s+\d{4}\b",          # 6 Aug 2025 / 6th August, 2025
]
EMAIL_PATTERN = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
PHONE_PATTERN = r"\b(?:\+?\d{1,3}[\s.-]?)?(?:\(?\d{3}\)?[\s.-]?)?\d{3}[\s.-]?\d{4}\b"
ID_PATTERN = r"\b(?:MRN|ID|Unit No|Account)[#: ]*\s*\d+[\w-]*\b"


def spacy_ner_regex_deid(text: str) -> str:
    """De-identify ``text`` with spaCy entities plus the regex rules.

    PERSON and ORG entities keep their own label; GPE, FAC and LOC entities are
    all treated as LOCATION. Regex matches for dates (case-insensitive),
    e-mails, phone numbers and IDs are added afterwards, and the combined span
    list is handed to ``apply_spans_mask``.
    """
    ner_label_for = {
        "PERSON": "PERSON",
        "ORG": "ORG",
        "GPE": "LOCATION",
        "FAC": "LOCATION",
        "LOC": "LOCATION",
    }
    found: List[Span] = [
        Span(ent.start_char, ent.end_char, ner_label_for[ent.label_])
        for ent in nlp(text).ents
        if ent.label_ in ner_label_for
    ]

    # (pattern, label, flags) in the order the spans must be appended.
    rules = [(date_pat, "DATE", re2.IGNORECASE) for date_pat in DATE_PATTERNS]
    rules += [
        (EMAIL_PATTERN, "EMAIL", 0),
        (PHONE_PATTERN, "PHONE", 0),
        (ID_PATTERN, "ID", 0),
    ]
    for pattern, label, flags in rules:
        found.extend(
            Span(hit.start(), hit.end(), label)
            for hit in re2.finditer(pattern, text, flags=flags)
        )
    return apply_spans_mask(text, found)

# Run Method 1 over every sampled report, persist the results and preview the first one.
method1_outputs = list(map(spacy_ner_regex_deid, raw_texts))
save_outputs("method1_spacy_regex", method1_outputs)
print(method1_outputs[0][:500])


Saved: /Users/aryamanbahl/Desktop/IIITH/M25/NLP-H/Assignments/part2/deid_method1_spacy_regex.jsonl
Name: [NAME] No: 9295

Admission Date: [DATE] Discharge Date: [DATE]

Date of Birth: [DATE] Sex: F

Service: MEDICINE

Allergies:
[ORG],Other / Reglan

Attending: [NAME].

Chief Complaint:
Abdominal pain

Major [NAME] or Invasive Procedure:
None

History of Present Illness:
Patient is a 1984 yo woman with history of chronic pancreatitis
s/p cholecystectomy and sphincterotomy who presents with 1wk of
worsening abdominal pain. As per patient, the pain is
intermittent, sharp and 0/10 (no pain) in q


In [1]:
# Method 2: Medical domain NER (HuggingFace) + Regex
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Candidate checkpoints, tried in list order by the loading loop below; the
# first one that loads is used. The preferred entry is a PHI de-identification
# model, the others are biomedical/clinical fallbacks.
# NOTE(review): the fallbacks may not ship a trained token-classification head
# that AutoModelForTokenClassification can use -- confirm before relying on them.
MODEL_CANDIDATES = [
    "obi/deid_roberta_i2b2",        # PHI de-identification model (preferred), if available
    "gliner-biomed/biomed-ner-large",  # open biomedical NER
    "emilyalsentzer/Bio_ClinicalBERT"  # general clinical model (may need token-classification adapter)
]

# Token overlap between consecutive windows when a note is longer than the
# model's context. With `stride` set, the pipeline runs over the whole text in
# overlapping windows instead of only analysing the first window.
NER_STRIDE = 64

loaded = False
for name in MODEL_CANDIDATES:
    try:
        tok = AutoTokenizer.from_pretrained(name)
        mdl = AutoModelForTokenClassification.from_pretrained(name)

        # A checkpoint without a trained token-classification head still loads,
        # but with a freshly initialised classifier whose labels are the generic
        # LABEL_0, LABEL_1, ... Such predictions are noise and the labels cannot
        # be mapped to PHI categories, so treat it as a failed candidate.
        label_names = [str(v) for v in mdl.config.id2label.values()]
        if all(v.startswith("LABEL_") for v in label_names):
            raise ValueError("checkpoint has no trained token-classification labels")

        # Some tokenizers report a huge sentinel as model_max_length; cap it by
        # the model's position budget (minus 2 for offset position ids) so the
        # windows actually fit the model.
        max_positions = getattr(mdl.config, "max_position_embeddings", None)
        if max_positions and max_positions > 2:
            tok.model_max_length = min(tok.model_max_length, max_positions - 2)

        pipe_kwargs = {"aggregation_strategy": "simple"}
        # Windowing needs a fast tokenizer and a stride below the window size.
        if tok.is_fast and NER_STRIDE < tok.model_max_length:
            pipe_kwargs["stride"] = NER_STRIDE

        nlp_med = pipeline("token-classification", model=mdl, tokenizer=tok, **pipe_kwargs)
        model_name = name
        loaded = True
        print("Loaded:", name)
        break
    except Exception as e:
        # Best-effort fallback: report why this candidate was skipped and try the next one.
        print("Failed:", name, e)

if not loaded:
    raise RuntimeError("Could not load any medical NER model.")

# Maps the (upper-cased) entity tags emitted by the medical NER model to the
# internal labels used as keys of PLACEHOLDER_MAP. Tags without an entry are
# dropped by med_ner_regex_deid, so every PHI tag the model can emit must be
# listed here or that category is left unmasked.
MED_TO_PLACEHOLDER = {
    # Common PHI tags in de-id datasets
    "PATIENT": "PERSON",
    "DOCTOR": "PERSON",
    "HOSPITAL": "HOSPITAL",
    "ORGANIZATION": "ORG",
    "LOCATION": "LOCATION",
    "CITY": "LOCATION",
    "STATE": "LOCATION",
    "STREET": "LOCATION",
    "ZIP": "LOCATION",
    "DATE": "DATE",
    "PHONE": "PHONE",
    "FAX": "PHONE",
    "EMAIL": "EMAIL",
    "ID": "ID",
    # Abbreviated tag set used by the preferred checkpoint (obi/deid_roberta_i2b2).
    "STAFF": "PERSON",
    "HOSP": "HOSPITAL",
    "LOC": "LOCATION",
    "PATORG": "ORG",
    # Further i2b2-style identifier and location tags.
    "COUNTRY": "LOCATION",
    "USERNAME": "ID",
    "MEDICALRECORD": "ID",
    "IDNUM": "ID",
}


def med_ner_regex_deid(text: str) -> str:
    """De-identify ``text`` with the medical NER pipeline plus the regex rules.

    Each prediction's ``entity_group`` is upper-cased, stripped of a leading
    tagging-scheme prefix (``B-``/``I-``/``L-``/``U-``), mapped through
    ``MED_TO_PLACEHOLDER`` and kept only if the result is a key of
    ``PLACEHOLDER_MAP``. The prefix is stripped because the pipeline's
    aggregation only removes ``B-``/``I-``; a model tagged with ``L-``/``U-``
    would otherwise yield groups such as ``U-PATIENT`` that never match the
    mapping and are dropped. The same regex rules as Method 1 are then added
    and all spans are masked by ``apply_spans_mask``.
    """
    spans: List[Span] = []
    preds = nlp_med(text)
    for p in preds:
        # Predictions without character offsets cannot be masked.
        if p.get("start") is None or p.get("end") is None:
            continue
        label = p["entity_group"].upper()
        if len(label) > 2 and label[1] == "-" and label[0] in "BILU":
            label = label[2:]
        label = MED_TO_PLACEHOLDER.get(label, label)
        if label in PLACEHOLDER_MAP:
            spans.append(Span(int(p["start"]), int(p["end"]), label))
    # Reuse regex from method 1
    for pat in DATE_PATTERNS:
        for m in re2.finditer(pat, text, flags=re2.IGNORECASE):
            spans.append(Span(m.start(), m.end(), "DATE"))
    for m in re2.finditer(EMAIL_PATTERN, text):
        spans.append(Span(m.start(), m.end(), "EMAIL"))
    for m in re2.finditer(PHONE_PATTERN, text):
        spans.append(Span(m.start(), m.end(), "PHONE"))
    for m in re2.finditer(ID_PATTERN, text):
        spans.append(Span(m.start(), m.end(), "ID"))
    return apply_spans_mask(text, spans)

# Run Method 2 over every sampled report, persist the results and preview the
# first one together with the name of the model that was loaded.
method2_outputs = list(map(med_ner_regex_deid, raw_texts))
save_outputs("method2_medner_regex", method2_outputs)
print(model_name, method2_outputs[0][:500])


: 

In [None]:
# Method 3: 3B LLM (no fine-tuning) for de-identification
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

def run_llm_deid(texts, model_id: str = "microsoft/phi-3-mini-4k-instruct"):
    tok = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")

    system = (
        "You are a de-identification assistant. Replace all personal identifiers (names, organizations, "
        "hospitals, locations, phone numbers, dates, emails, account/ID numbers) with appropriate placeholders "
        "like [NAME], [ORG], [HOSPITAL], [LOCATION], [PHONE], [DATE], [EMAIL], [ID]. Keep clinical content.")

    outputs = []
    for txt in texts:
        prompt = f"<|system|>\n{system}\n<|user|>\nText:\n{txt}\n\nReturn only the de-identified text.\n<|assistant|>"
        inputs = tok(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        gen_ids = model.generate(**inputs, max_new_tokens=512, do_sample=False)
        out = tok.decode(gen_ids[0], skip_special_tokens=True)
        # Extract after assistant tag if present
        if "<|assistant|>" in out:
            out = out.split("<|assistant|>")[-1].strip()
        outputs.append(out)
    return outputs

# Start from blank outputs so the comparison cells still run when the LLM cannot
# be loaded or generation fails. Assigning the fallback up front (rather than in
# the except branch) means a later failure while saving or printing no longer
# throws away outputs that were generated successfully.
method3_outputs = [""] * len(raw_texts)
try:
    method3_outputs = run_llm_deid(raw_texts)
    save_outputs("method3_llm", method3_outputs)
    print(method3_outputs[0][:500])
except Exception as e:
    # Deliberately best-effort: report the problem and continue with what we have.
    print("LLM method failed:", e)


### Pipeline Descriptions

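- Method 1: spaCy NER (`en_core_web_lg`) detects PERSON, ORG, and location entities (GPE/FAC/LOC); regex rules detect DATE/EMAIL/PHONE/ID. All detected spans are replaced with placeholders.
- Method 2: Clinical/PHI NER via a `transformers` token-classification pipeline plus the same regex rules; detected spans are replaced with placeholders. The model IDs in `MODEL_CANDIDATES` are tried in order and the first one that loads is used.
- Method 3: A 3B-class instruction-tuned LLM (`microsoft/phi-3-mini-4k-instruct` by default) is prompted to replace PHI with placeholders; no training or fine-tuning.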

Outputs are written to `OUTPUT_DIR` as `deid_method1_spacy_regex.jsonl`, `deid_method2_medner_regex.jsonl`, and `deid_method3_llm.jsonl` (the last one only if the LLM step succeeds).


In [None]:
# Quick comparison and preview
from pprint import pprint

# First 600 characters of the first document for each method ("" when a method
# produced no outputs at all).
preview = {
    method: (outputs[0][:600] if outputs else "")
    for method, outputs in (
        ("method1", method1_outputs),
        ("method2", method2_outputs),
        ("method3", method3_outputs),
    )
}
pprint(preview)

# Optional: side-by-side CSV with one row per sampled document, holding the
# original text next to the output of each method.
rows = [
    {
        "note_id": sample_df.iloc[i]["note_id"],
        "original": original,
        "method1": method1_outputs[i],
        "method2": method2_outputs[i],
        "method3": method3_outputs[i],
    }
    for i, original in enumerate(raw_texts)
]
comparison_path = os.path.join(OUTPUT_DIR, "deid_comparison_preview.csv")
pd.DataFrame(rows).to_csv(comparison_path, index=False)
print("Saved:", comparison_path)


In [None]:
# Simple counts of placeholders per method
from collections import Counter

# Distinct placeholder strings (the values of PLACEHOLDER_MAP, e.g. "[NAME]")
# whose occurrences are counted per document below.
PH_PLACEHOLDERS = set(PLACEHOLDER_MAP.values())

def count_placeholders(text: str) -> Dict[str, int]:
    """Count how often each known placeholder occurs in ``text``.

    Placeholders are visited in sorted order. ``PH_PLACEHOLDERS`` is a set of
    strings, whose iteration order can differ between interpreter runs; sorting
    keeps the key order -- and therefore the column order of the summary CSV
    built from these dicts -- identical on every run.

    Args:
        text: De-identified document.

    Returns:
        Mapping from placeholder (e.g. ``"[NAME]"``) to its number of
        non-overlapping occurrences; placeholders that never occur map to 0.
    """
    return {ph: text.count(ph) for ph in sorted(PH_PLACEHOLDERS)}

# One row per sampled document: its note_id followed by the placeholder counts
# of each method, with columns prefixed m1_/m2_/m3_.
summary_rows = []
outputs_by_prefix = (
    ("m1", method1_outputs),
    ("m2", method2_outputs),
    ("m3", method3_outputs),
)
for i in range(len(raw_texts)):
    row = {"note_id": sample_df.iloc[i]["note_id"]}
    for prefix, outputs in outputs_by_prefix:
        for placeholder, count in count_placeholders(outputs[i]).items():
            row[f"{prefix}_{placeholder}"] = count
    summary_rows.append(row)

summary_df = pd.DataFrame(summary_rows)
summary_path = os.path.join(OUTPUT_DIR, "deid_placeholder_counts.csv")
summary_df.to_csv(summary_path, index=False)
print("Saved:", summary_path)
summary_df.head(3)
