# Update PII ds
> urchade/synthetic-pii-ner-mistral-v1

In [1]:
from datasets import load_dataset, Dataset
import json
from pathlib import Path

In [2]:
DP = Path.cwd()/'data'; DP

PosixPath('/teamspace/studios/this_studio/data')

In [3]:
with open(DP/"pii-mistral.json") as f: data = json.load(f)

In [4]:
print(data[0])

{'tokenized_text': ['Mamadou', 'Diop', ',', 'a', 'resident', 'of', 'Dakar', ',', 'is', 'a', 'regular', 'at', 'the', "'", 'Club', '54', "'", 'nightclub', 'located', 'at', '45', 'Rue', 'de', 'la', 'Liberté', '.', 'He', 'usually', 'arrives', 'around', 'midnight', 'and', 'often', 'leaves', 'around', '3', 'am', '.', 'Mamadou', "'", 's', 'ID', 'card', 'number', 'is', 'WS-123456789-1', ',', 'and', 'he', 'pays', 'for', 'his', 'drinks', 'with', 'a', 'credit', 'card', ':', '1234-5678-9012-3456', '.'], 'ner': [[0, 1, 'person'], [14, 15, 'nightclub'], [45, 45, 'ID card number'], [58, 58, 'credit card number']]}


In [5]:
len(data)

19635

Normalize each NER span into a dict for compatibility

In [6]:
for sample in data:
    sample["ner"] = [
        {"start": s, "end": e, "label": lbl}
        for s, e, lbl in sample["ner"]
    ]

In [7]:
from datasets import Features, Sequence, Value, Dataset

features = Features({
    "tokenized_text": Sequence(Value("string")),
    "ner": Sequence({
        "start": Value("int64"),
        "end": Value("int64"),
        "label": Value("string")
    })
})

ds = Dataset.from_list(data, features=features)

In [8]:
ds

Dataset({
    features: ['tokenized_text', 'ner'],
    num_rows: 19635
})

Okay, we've established that this ds has non-ENG languaged, we need to join tokenized text and check for English, filter out the rest

In [9]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
def is_probably_english(text): return detect(text) == "en"

In [10]:
ds = ds.filter(lambda x: is_probably_english(' '.join(x['tokenized_text'])))

Filter:   0%|          | 0/19635 [00:00<?, ? examples/s]

In [11]:
# Load span-based NER JSON into Hugging Face Dataset
# Original JSON format:
# {
#   "tokens": [...],
#   "ner": [[start_idx, end_idx, label], ...]
# }

# Step 1: Normalize each NER span into a dict for clarity and compatibility
# → [start, end, label] → {"start": ..., "end": ..., "label": ...}

# Step 2: Define Hugging Face `Features` schema with nested structure
# → tokens: list of strings
# → ner: list of {"start": int, "end": int, "label": str}

# Step 3: Use `Dataset.from_list(data, features=features)` to load it cleanly

Let's see what unique entities we have

In [12]:
%%time
unique_ents = list(set(label for sample in ds for label in sample['ner']['label']))

CPU times: user 702 ms, sys: 4.02 ms, total: 706 ms
Wall time: 705 ms


In [13]:
len(unique_ents)

1747

In [14]:
unique_ents[:3]

['vehicle', 'enrollment_date', 'student id number']

**Accessing ents faster via indexing**: Can we do faster?

indexing?

In [15]:
from collections import defaultdict

In [16]:
ent_idx = defaultdict(list)

In [17]:
ds[0]['ner']['label']

['person', 'nightclub', 'ID card number', 'credit card number']

In [18]:
%%time
for idx, s in enumerate(ds):
    for ent in s['ner']['label']: ent_idx[ent].append(idx)

CPU times: user 709 ms, sys: 4.01 ms, total: 713 ms
Wall time: 712 ms


Let's see how many example we have of `identifier`

In [19]:
[print(ds[i]) for i in ent_idx['identifier']]

{'tokenized_text': ['The', 'Ministry', 'of', 'Health', 'and', 'Population', 'in', 'Antananarivo', 'has', 'released', 'a', 'new', 'policy', 'regarding', 'the', 'collection', 'and', 'management', 'of', 'medical', 'records', '.', 'All', 'citizens', 'are', 'required', 'to', 'submit', 'their', 'personal', 'information', ',', 'including', 'name', ',', 'date', 'of', 'birth', ',', 'and', 'Social', 'Security', 'Number', ',', 'to', 'their', 'local', 'health', 'center', 'for', 'the', 'creation', 'of', 'an', 'electronic', 'medical', 'record', '.', 'Patients', 'can', 'access', 'their', 'records', 'using', 'a', 'unique', 'identifier', ',', 'which', 'is', 'a', 'combination', 'of', 'their', 'first', 'five', 'letters', 'of', 'their', 'last', 'name', 'and', 'the', 'last', 'four', 'digits', 'of', 'their', 'Social', 'Security', 'Number', '.', 'For', 'instance', ',', 'a', 'patient', 'named', 'Rasoanaivo', ',', 'born', 'on', '15th', 'October', '1985', ',', 'with', 'SSN', '501-02-1234', ',', 'will', 'have', 

[None, None, None, None, None, None]

Okay, only 10, that's weak and odd

In [20]:
def find_matching_ents(query, all_ents=unique_ents, case_insensitive=True):
    q = query.lower() if case_insensitive else query
    return [e for e in all_ents if q in (e.lower() if case_insensitive else e)]

In [21]:
find_matching_ents("financial", unique_ents)

['financial information',
 'financial record',
 'financial_institution',
 'financial aid',
 'financial institution']

That's more like it!

In [22]:
import random

In [23]:
matching_idxs = []

In [24]:
ent_idx['identifier']

[433, 1337, 1337, 1568, 1796, 3069]

In [25]:
def get_random_sample_for_ent(ent, ds, ent_idx, unique_ents):
    matching_idxs = []
    matches = find_matching_ents(ent, unique_ents)
    if not matches: return None
    # print("Matches:", matches)
    matching_idxs = [idx for a in matches for idx in ent_idx[a]]
    # print("Matching Idxs:", matching_idxs)
    idx = random.choice(matching_idxs)
    print("idx",idx)
    return ' '.join(ds[idx]['tokenized_text'])

In [26]:
f = get_random_sample_for_ent('identifier', ds, ent_idx, unique_ents) ; f

idx 1241


'Dear valued customers , As part of our commitment to providing exceptional service , we would like to inform you about the upcoming changes to our loyalty program . Effective January 1 , 2023 , all members will receive a new membership card with an updated membership number , valid for an additional 5 years . The new card will include your full name , date of birth , and contact details : address 456 Elm Street , San Francisco , CA 94123 , phone number 415-555-1234 , and email address john . doe @ example . com . In addition , we have updated our privacy policy to include biometric data collected through our facial recognition technology during check-in . This data will be used solely for the purpose of personalizing your travel experience and will not be shared with third parties . We take your privacy seriously and are committed to protecting your Personal Identifiable Information ( PII ) . For customers traveling to South Korea , we would like to remind you of the requirements for 

Okay, not bad. Two things. I'd like to add some context around what is 'financial' data. Can we get formats or report Types rather than just names? Archita has a resource we can take advantage of.

## Structured Data Generation

In [86]:
import random

We're going to convert each entitiy set into it's own class, this  allows us to randomly sample!

In [125]:
class Financial:
    # Account numbers or billing IDs, possibly insurance-related.
    def __init__(self, accounts): self.accounts = accounts
    def random(self): return random.choice(self.accounts)

class PatientDemo:
    # Patient's date of birth.
    def __init__(self, dobs): self.dobs = dobs
    def random(self): return random.choice(self.dobs)

class Practitioner:
    # Full name of the treating clinician.
    def __init__(self, names): self.names = names
    def random(self): return random.choice(self.names)

class Professional:
    # Initials of the treating clinician.
    def __init__(self, initials): self.initials = initials
    def random(self): return random.choice(self.initials)

class PatientLocation:
    # Residential or hospital-provided address, including postal codes and provinces.
    def __init__(self, addresses): self.addresses = addresses
    def random(self): return random.choice(self.addresses)

class Name:
    # Full name of the patient.
    def __init__(self, names): self.names = names
    def random(self): return random.choice(self.names)

class EncounterDate:
    # Timestamp of a clinical encounter, e.g. 2015-10-13T11:01:00-05:00.
    def __init__(self, timestamps): self.timestamps = timestamps
    def random(self): return random.choice(self.timestamps)

class PatientContact:
    # Phone numbers, but may include addresses or physical contact information.
    def __init__(self, contacts): self.contacts = contacts
    def random(self): return random.choice(self.contacts)

class PatientID:
    # Identifiers like OHIP numbers, MRNs, JHN IDs, or CMRs.
    def __init__(self, ids): self.ids = ids
    def random(self): return random.choice(self.ids)

class Organization:
    # Hospital departments, wings, or named units.
    def __init__(self, orgs): self.orgs = orgs
    def random(self): return random.choice(self.orgs)

In [126]:
accounts = Financial(["ACC-23456", "BILL-98123", "INS-77231"])
dobs = PatientDemo(["1985-03-14", "1992-07-01", "2000-12-25"])
clinicians = Practitioner(["Emily Tran", "Rajiv Kapoor", "Sarah Mendel"])
initials = Professional(["E.T.", "R.K.", "S.M."])
addresses = PatientLocation([
    "123 Main St, Toronto, ON M4B 1B3",
    "Unit 402, 88 King St W, Hamilton, ON L8P 1A1",
    "5th Floor, 999 Health Sciences Rd, Ottawa, ON K1A 0B2"
])
patient_names = Name(["John Doe", "Jane Smith", "Alice Johnson"])
encounter_times = EncounterDate([
    "2023-09-15T14:30:00-04:00",
    "2022-11-01T08:15:00-05:00",
    "2024-02-20T19:45:00-05:00"
])
contacts = PatientContact([
    "(416) 555-0198", 
    "905-321-7645", 
    "456 Lakeshore Dr, Mississauga, ON"
])
mrns = PatientID(["MRN1234", "OHIP9876543210", "JHN-A112233"])
departments = Organization(["Cardiology Ward", "Oncology Unit", "Emergency Department"])


Now onto 'realistic' Report Templates

In [127]:
REPORT_TEMPLATES = {
    "Admission Note": {
        "template": """Admission Note
Patient Name: <name>
MRN: <patient_id>
Account#: <financial>
Admission Date: <encounter_date>
Attending Physician: Dr. <practitioner>
Reason for Admission: Chest Pain""",
        "entities": ["name", "patient_id", "financial", "encounter_date", "practitioner"]
    },
    "Discharge Summary": {
        "template": """Discharge Summary
Patient Name: <name>
MRN: <patient_id>
Date of Admission: <encounter_date>
Date of Discharge: <encounter_date>
Attending Physician: Dr. <practitioner>

Discharge Instructions: Follow-up at <organization>.""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner", "organization"]
    },
    "Radiology Report": {
        "template": """Radiology Report
Patient: <name> | MRN: <patient_id> | DOB: <patient_demo>
Exam Date: <encounter_date>
Ordering Physician: Dr. <practitioner>

Electronically signed by Dr. <practitioner>, MD on <encounter_date>.""",
        "entities": ["name", "patient_id", "patient_demo", "encounter_date", "practitioner"]
    },
    "Psychiatry Evaluation": {
        "template": """Psychiatry Evaluation
Patient: <name>
MRN: <patient_id>
Date of Evaluation: <encounter_date>
Psychiatrist: <practitioner>

Chief Complaint: Depression and anxiety symptoms worsening.

Assessment:
- Mood: Depressed, anxious
- Thought process: Organized, coherent

Plan:
- Start sertraline 50 mg daily
- Follow-up appointment at <organization>""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner", "organization"]
    },
    "Pathology Report": {
        "template": """Pathology Report
Patient: <name>
MRN: <patient_id>
DOB: <patient_demo>
Accession#: <financial>
Date of Procedure: <encounter_date>

Specimen:
- Colon biopsy, ascending colon

Diagnosis:
- Hyperplastic polyp

Signed by: <practitioner>, Pathologist, <encounter_date>""",
        "entities": ["name", "patient_id", "patient_demo", "financial", "encounter_date", "practitioner"]
    },
    "Operative Note": {
        "template": """Operative Note
Patient Name: <name>
MRN: <patient_id>
Date of Surgery: <encounter_date>
Surgeon: <practitioner>
Assistant: <professional>

Procedure:
- Laparoscopic appendectomy

Findings:
- Acute appendicitis with mild inflammation

Electronically signed: <practitioner>, <encounter_date>""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner", "professional"]
    },
    "Emergency Department Note": {
        "template": """Emergency Department Note
Patient Name: <name>
MRN: <patient_id>
Encounter Time: <encounter_date>
Attending Physician: <practitioner>

Chief Complaint: Shortness of breath

Disposition:
- Discharged home with inhaler
- Follow-up at <organization> recommended within 72 hours""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner", "organization"]
    },
    "Consultation Note": {
        "template": """Consultation Note
Patient: <name>
MRN: <patient_id>
Consultation Date: <encounter_date>
Consultant: <practitioner>

Reason for Consultation:
- Evaluate for possible rheumatoid arthritis

Recommendations:
- Initiate laboratory tests, including RF and anti-CCP
- Schedule follow-up in 2 weeks at <organization>""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner", "organization"]
    },
    "Progress Note": {
        "template": """Progress Note
Patient Name: <name>
MRN: <patient_id>
Date: <encounter_date>
Physician: <practitioner>

Progress:
- Stable condition, responding well to current therapy
- Plan: Continue medications and monitor closely""",
        "entities": ["name", "patient_id", "encounter_date", "practitioner"]
    },
    "Signature and Footer": {
        "template": """Electronically signed by: <practitioner>, MD, <encounter_date>
Printed by user: <professional>, <encounter_date>
Location: <organization>, <patient_location>""",
        "entities": ["practitioner", "encounter_date", "professional", "organization", "patient_location"]
    },
    "Billing Statement": {
        "template": """Billing Statement
Patient Name: <name>
Account#: <financial>
Billing Date: <encounter_date>

Service Provided:
- Outpatient Consultation

Amount Due: $320.00
Due Date: <encounter_date>""",
        "entities": ["name", "financial", "encounter_date"]
    },
    "Family History": {
        "template": """Family History:
Father of patient <name>, deceased at age 62 due to cardiovascular disease.""",
        "entities": ["name"]
    },
    "Follow-up Instructions": {
        "template": """Follow-up:
Patient <name> advised to contact <organization> at <patient_contact> within one week to schedule follow-up.""",
        "entities": ["name", "organization", "patient_contact"]
    }
}

In [128]:
rt = random.choice(list(REPORT_TEMPLATES.keys())) ; print(rt)
info = REPORT_TEMPLATES[rt] ; print(info)

Progress Note
{'template': 'Progress Note\nPatient Name: <name>\nMRN: <patient_id>\nDate: <encounter_date>\nPhysician: <practitioner>\n\nProgress:\n- Stable condition, responding well to current therapy\n- Plan: Continue medications and monitor closely', 'entities': ['name', 'patient_id', 'encounter_date', 'practitioner']}


In [129]:
phi_classes = {
    "financial": accounts,
    "patient_demo": dobs,
    "practitioner": clinicians,
    "professional": initials,
    "patient_location": addresses,
    "name": patient_names,
    "encounter_date": encounter_times,
    "patient_contact": contacts,
    "patient_id": mrns,
    "organization": departments
}


In [130]:
phi_values = {phi_label: phi_classes[phi_label].random() for phi_label in info["entities"]} ; phi_values

{'name': 'Jane Smith',
 'patient_id': 'MRN1234',
 'encounter_date': '2023-09-15T14:30:00-04:00',
 'practitioner': 'Emily Tran'}

Randomly sample snippet from Pile Pii:

In [131]:
ti = get_random_sample_for_ent(random.choice(info['entities']), ds, ent_idx, unique_ents); ti

Create Prompt

In [132]:
def create_structured_prompt(template: str, inspiration: str) -> str:
    return f"""You are generating a realistic clinical report snippet.

Template:
{template}

Instructions:
1. Use the provided template exactly as shown.
2. Keep all placeholders exactly as provided (e.g., "<name>", "<patient_id>"). Do NOT replace placeholders with realistic or fake data.
3. Use the provided thematic inspiration ONLY for context and realistic phrasing ideas. Do NOT copy directly.
4. Ensure the text is medically plausible and structured realistically.
5. Do NOT add or remove any placeholders.

Return the output exactly as the template, with placeholders intact."""

In [133]:
rt

'Progress Note'

In [134]:
create_structured_prompt(info['template'], ti)

'You are generating a realistic clinical report snippet.\n\nTemplate:\nProgress Note\nPatient Name: <name>\nMRN: <patient_id>\nDate: <encounter_date>\nPhysician: <practitioner>\n\nProgress:\n- Stable condition, responding well to current therapy\n- Plan: Continue medications and monitor closely\n\nInstructions:\n1. Use the provided template exactly as shown.\n2. Keep all placeholders exactly as provided (e.g., "<name>", "<patient_id>"). Do NOT replace placeholders with realistic or fake data.\n3. Use the provided thematic inspiration ONLY for context and realistic phrasing ideas. Do NOT copy directly.\n4. Ensure the text is medically plausible and structured realistically.\n5. Do NOT add or remove any placeholders.\n\nReturn the output exactly as the template, with placeholders intact.'

In [135]:
break;

SyntaxError: 'break' outside loop (1693679151.py, line 1)

Replace placeholders with PII

In [114]:
import re

In [136]:
sample_o = """Progress Note  
Patient Name: <name>  
MRN: <patient_id>  
Date: <encounter_date>  
Physician: <practitioner>  

Progress:  
- Stable condition, responding well to current therapy  
- Plan: Continue medications and monitor closely  
"""

In [137]:
def sub_placeholders(text, phi_dict):
    for placeholder, real_value in phi_dict.items():
        text = re.sub(f"<{placeholder}>", real_value, text)
    return text

In [138]:
ft = sub_placeholders(sample_o, phi_values); ft

'Progress Note  \nPatient Name: Jane Smith  \nMRN: MRN1234  \nDate: 2023-09-15T14:30:00-04:00  \nPhysician: Emily Tran  \n\nProgress:  \n- Stable condition, responding well to current therapy  \n- Plan: Continue medications and monitor closely  \n'

Generate JSON NER format

In [139]:
def generate_ner_json(final_text, phi_dict):
    entities = []
    for phi_type, real_value in phi_dict.items():
        for match in re.finditer(re.escape(real_value), final_text):
            entities.append({
                "entity": real_value,
                "type": phi_type,
                "start": match.start(),
                "end": match.end()
            })
    return {"text": final_text, "entities": entities}

In [140]:
generate_ner_json(ft, phi_values)

{'text': 'Progress Note  \nPatient Name: Jane Smith  \nMRN: MRN1234  \nDate: 2023-09-15T14:30:00-04:00  \nPhysician: Emily Tran  \n\nProgress:  \n- Stable condition, responding well to current therapy  \n- Plan: Continue medications and monitor closely  \n',
 'entities': [{'entity': 'Jane Smith', 'type': 'name', 'start': 30, 'end': 40},
  {'entity': 'MRN1234', 'type': 'patient_id', 'start': 48, 'end': 55},
  {'entity': '2023-09-15T14:30:00-04:00',
   'type': 'encounter_date',
   'start': 64,
   'end': 89},
  {'entity': 'Emily Tran', 'type': 'practitioner', 'start': 103, 'end': 113}]}

## Nursing Notes; see Archita's suggestion + other notes

TBD