# Update PII ds
> urchade/synthetic-pii-ner-mistral-v1

In [1]:
from datasets import load_dataset, Dataset
import json
from pathlib import Path

In [2]:
DP = Path.cwd()/'data'; DP

PosixPath('/teamspace/studios/this_studio/data')

In [3]:
with open(DP/"pii-mistral.json") as f: data = json.load(f)

In [4]:
print(data[0])

{'tokenized_text': ['Mamadou', 'Diop', ',', 'a', 'resident', 'of', 'Dakar', ',', 'is', 'a', 'regular', 'at', 'the', "'", 'Club', '54', "'", 'nightclub', 'located', 'at', '45', 'Rue', 'de', 'la', 'Liberté', '.', 'He', 'usually', 'arrives', 'around', 'midnight', 'and', 'often', 'leaves', 'around', '3', 'am', '.', 'Mamadou', "'", 's', 'ID', 'card', 'number', 'is', 'WS-123456789-1', ',', 'and', 'he', 'pays', 'for', 'his', 'drinks', 'with', 'a', 'credit', 'card', ':', '1234-5678-9012-3456', '.'], 'ner': [[0, 1, 'person'], [14, 15, 'nightclub'], [45, 45, 'ID card number'], [58, 58, 'credit card number']]}


In [5]:
len(data)

19635

Normalize each NER span into a dict for compatibility

In [6]:
for sample in data:
    sample["ner"] = [
        {"start": s, "end": e, "label": lbl}
        for s, e, lbl in sample["ner"]
    ]

In [7]:
from datasets import Features, Sequence, Value, Dataset

features = Features({
    "tokenized_text": Sequence(Value("string")),
    "ner": Sequence({
        "start": Value("int64"),
        "end": Value("int64"),
        "label": Value("string")
    })
})

ds = Dataset.from_list(data, features=features)

In [8]:
ds

Dataset({
    features: ['tokenized_text', 'ner'],
    num_rows: 19635
})

Okay, we've established that this ds has non-ENG languaged, we need to join tokenized text and check for English, filter out the rest

In [9]:
from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0
def is_probably_english(text): return detect(text) == "en"

In [10]:
ds = ds.filter(lambda x: is_probably_english(' '.join(x['tokenized_text'])))

Filter:   0%|          | 0/19635 [00:00<?, ? examples/s]

In [11]:
# Load span-based NER JSON into Hugging Face Dataset
# Original JSON format:
# {
#   "tokens": [...],
#   "ner": [[start_idx, end_idx, label], ...]
# }

# Step 1: Normalize each NER span into a dict for clarity and compatibility
# → [start, end, label] → {"start": ..., "end": ..., "label": ...}

# Step 2: Define Hugging Face `Features` schema with nested structure
# → tokens: list of strings
# → ner: list of {"start": int, "end": int, "label": str}

# Step 3: Use `Dataset.from_list(data, features=features)` to load it cleanly

Let's see what unique entities we have

In [12]:
%%time
unique_ents = list(set(label for sample in ds for label in sample['ner']['label']))

CPU times: user 687 ms, sys: 0 ns, total: 687 ms
Wall time: 686 ms


In [13]:
len(unique_ents)

1747

In [14]:
unique_ents[:3]

['dna pattern', 'person_name', 'utility account number']

**Accessing ents faster via indexing**: Can we do faster?

indexing?

In [15]:
from collections import defaultdict

In [16]:
ent_idx = defaultdict(list)

In [17]:
ds[0]['ner']['label']

['person', 'nightclub', 'ID card number', 'credit card number']

In [18]:
%%time
for idx, s in enumerate(ds):
    for ent in s['ner']['label']: ent_idx[ent].append(idx)

CPU times: user 707 ms, sys: 3.98 ms, total: 711 ms
Wall time: 709 ms


Let's see how many example we have of `identifier`

In [19]:
[print(ds[i]) for i in ent_idx['identifier']]

{'tokenized_text': ['The', 'Ministry', 'of', 'Health', 'and', 'Population', 'in', 'Antananarivo', 'has', 'released', 'a', 'new', 'policy', 'regarding', 'the', 'collection', 'and', 'management', 'of', 'medical', 'records', '.', 'All', 'citizens', 'are', 'required', 'to', 'submit', 'their', 'personal', 'information', ',', 'including', 'name', ',', 'date', 'of', 'birth', ',', 'and', 'Social', 'Security', 'Number', ',', 'to', 'their', 'local', 'health', 'center', 'for', 'the', 'creation', 'of', 'an', 'electronic', 'medical', 'record', '.', 'Patients', 'can', 'access', 'their', 'records', 'using', 'a', 'unique', 'identifier', ',', 'which', 'is', 'a', 'combination', 'of', 'their', 'first', 'five', 'letters', 'of', 'their', 'last', 'name', 'and', 'the', 'last', 'four', 'digits', 'of', 'their', 'Social', 'Security', 'Number', '.', 'For', 'instance', ',', 'a', 'patient', 'named', 'Rasoanaivo', ',', 'born', 'on', '15th', 'October', '1985', ',', 'with', 'SSN', '501-02-1234', ',', 'will', 'have', 

[None, None, None, None, None, None]

Okay, only 10, that's weak and odd

In [20]:
def find_matching_ents(query, all_ents=unique_ents, case_insensitive=True):
    q = query.lower() if case_insensitive else query
    return [e for e in all_ents if q in (e.lower() if case_insensitive else e)]

In [61]:
find_matching_ents("financial", unique_ents)

['financial record',
 'financial_institution',
 'financial aid',
 'financial institution',
 'financial information']

That's more like it!

In [22]:
import random

In [23]:
matching_idxs = []

In [24]:
ent_idx['identifier']

[433, 1337, 1337, 1568, 1796, 3069]

In [44]:
def get_random_sample_for_ent(ent, ds, ent_idx, unique_ents):
    matching_idxs = []
    matches = find_matching_ents(ent, unique_ents)
    if not matches: return None
    # print("Matches:", matches)
    matching_idxs = [idx for a in matches for idx in ent_idx[a]]
    # print("Matching Idxs:", matching_idxs)
    idx = random.choice(matching_idxs)
    print("idx",idx)
    return ' '.join(ds[idx]['tokenized_text'])

In [45]:
f = get_random_sample_for_ent('identifier', ds, ent_idx, unique_ents) ; f

idx 1322


'Imam Muhammad Al-Hanafi , a religious leader at Masjid Agung Nusantara , Jakarta , Indonesia , has been diagnosed with a heart condition and has been prescribed medication for his treatment . His medical records state that he was born on February 21 , 1972 , and his Social Security Number is 410-11-1234 . He has provided the following details for insurance purposes : Health Insurance ID Number : 123456789 , IP Address : 192 . 168 . 1 . 1 , and Biometric Identifier : Fingerprint Scan - Right Hand . He can be contacted via email at imam . alhanafi @ masjidagungnusantara . id and via phone number + 62 21 345 6789 .'

In [56]:
def create_prompt(phi_label: str, report_type: str, inspiration: str) -> str:
    return f"""Generate a single, realistic sentence from a de-identified clinical report.

Instructions:
1. The sentence must include exactly one PHI placeholder of the following type: <{phi_label}>.
2. The sentence should be medically plausible and thematically related to: "{inspiration}".
3. Do not include any other PHI placeholders.
4. Do not use real personal information — only placeholders.
5. Avoid generic phrasing. Keep it natural and contextual.

Report type: {report_type}

Output:
{{
  "text": "The patient's insurance number <{phi_label}> was used to process the radiology claim.",
  "labels": [["<{phi_label}>", "{phi_label}"]]
}}"""


In [81]:
import random
from pathlib import Path

class ReportTypes:
    "Clinical report templates and single-snippet sampler."

    SNIPPETS = {
        "Radiology": [
            "PATIENT NAME: <Name>",
            "ID NUMBER: <Identifier>",
            "REFERRING PHYSICIAN: <Name>",
            "DATE OF SERVICE: <Date>",
            "EXAM: CT Abdomen and Pelvis with contrast",
            "INDICATION: Abdominal pain",
            "TECHNIQUE: Axial images obtained with IV contrast",
            "FINDINGS:",
            "IMPRESSION: No acute intra-abdominal findings."
        ],
        "Discharge Summary": [
            "RE: <Name>",
            "DATE: <Date>",
            "MR: <Identifier>",
            "DOB: <Date>",
            "DATE OF ADMISSION: <Date>",
            "DATE OF DISCHARGE: <Date>",
            "DISCHARGE DIAGNOSIS:\n  - Hypertension\n  - Acute kidney injury",
            "DISCHARGE INSTRUCTIONS: Resume home medications and follow up with nephrology in 1 week."
        ],
        "Pathology": [
            "SPECIMEN: Left breast biopsy",
            "CLINICAL HISTORY: 3 cm mass in left breast on mammogram",
            "DIAGNOSIS:\n  Invasive ductal carcinoma, grade 2 of 3"
        ],
        "Referral Note": [
            "REASON FOR REFERRAL: Neurology consultation for evaluation of seizure",
            "REFERRED BY: <Name>",
            "HISTORY OF PRESENT ILLNESS:\n  <Name> is a 59-year-old, right-handed woman with a history of hypertension and recent onset seizures.",
            "PLAN: EEG, MRI brain, start levetiracetam"
        ]
    }

    @classmethod
    def random_snippet(cls) -> str:
        "Return a single random snippet from any report type."
        report_type = random.choice(list(cls.SNIPPETS.keys()))
        return random.choice(cls.SNIPPETS[report_type])

In [83]:
q = 'financial' 
rt = ReportTypes.random_snippet() ; print(rt)
f = get_random_sample_for_ent('financial', ds, ent_idx, unique_ents); print(f)

REFERRING PHYSICIAN: <Name>
idx 1610
On March 12 , 2023 , Ahmed Bouhafs from Algiers initiated a wire transfer of $ 50 , 000 from his Citibank account ( account number : 123456789 ) to his wife ' s account at Wells Fargo ( account number : 234567890 ) . He used the routing numbers 021000021 and 121000024 for Citibank and Wells Fargo , respectively . The transaction was confirmed with a One-Time Password ( OTP ) sent to his mobile number + 1 234 567 8901 .


In [85]:
# create_prompt(q, rt, f, PHI)

In [86]:
{
  "text": "A billing review noted that payment for the cardiac procedure was completed using the patient’s wire transfer ID <financial>.",
  "labels": [["<financial>", "financial"]]
}


{'text': 'A billing review noted that payment for the cardiac procedure was completed using the patient’s wire transfer ID <financial>.',
 'labels': [['<financial>', 'financial']]}

Okay, not bad. Two things. I'd like to add some context around what is 'financial' data. Can we get formats or report Types rather than just names? Archita has a resource we can take advantage of.

In [87]:
PHI_HINTS = {
    "financial":        "Account numbers or billing IDs, possibly insurance-related.",
    "patient_demo":     "Patient's date of birth.",
    "practitioner":     "Full name of the treating clinician.",
    "professional":     "Initials of the treating clinician.",
    "patient_location": "Residential or hospital-provided address, including postal codes and provinces.",
    "name":             "Full name of the patient.",
    "encounter_date":   "Timestamp of a clinical encounter, e.g. 2015-10-13T11:01:00-05:00.",
    "patient_contact":  "Phone numbers, but may include addresses or physical contact information.",
    "patient_id":       "Identifiers like OHIP numbers, MRNs, JHN IDs, or CMRs.",
    "organization":     "Hospital departments, wings, or named units."
}

In [88]:
PHI_HINTS.get('financial')

'Account numbers or billing IDs, possibly insurance-related.'

In [89]:
def create_prompt(phi_label: str, report_type: str, inspiration: str, phi_hints: dict) -> str:
    hint = phi_hints.get(phi_label, "")
    return f"""Generate a single, realistic sentence from a de-identified clinical report.

Instructions:
1. The sentence must include exactly one PHI placeholder of the following type: <{phi_label}>.
2. The placeholder represents: {hint}
3. The sentence should be medically plausible and thematically related to: "{inspiration}".
4. Do not include any other PHI placeholders.
5. Do not use real personal information — only placeholders.
6. Avoid generic phrasing. Keep it natural and contextual.

Report type: {report_type}

Output:
[
  {{
    "text": "Patient <Name> was seen for follow-up.",
    "labels": [["<Name>", "Name"]]
  }},
  {{
    "text": "The appointment was scheduled for <Date>.",
    "labels": [["<Date>", "Date"]]
  }},
  {{
    "text": "The CT scan was reviewed at <Hospital>.",
    "labels": [["<Hospital>", "Hospital"]]
  }},
  {{
    "text": "Results were mailed to <Location>.",
    "labels": [["<Location>", "Location"]]
  }},
  {{
    "text": "Billing was completed using ID <Identifier>.",
    "labels": [["<Identifier>", "Identifier"]]
  }},
  {{
    "text": "Please confirm via <Contact> before arrival.",
    "labels": [["<Contact>", "Contact"]]
  }}
]"""


In [94]:
q = 'identifier' 
rt = ReportTypes.random_snippet() ; print(rt)
f = get_random_sample_for_ent('identifier', ds, ent_idx, unique_ents); print(f)

DIAGNOSIS:
  Invasive ductal carcinoma, grade 2 of 3
idx 1893
At the sunny beach of Stavanger , Anne Kristin Olsen from Oslo was sunbathing with her friends . She was sipping on her piña colada , while her children played nearby in the sand . Suddenly , Anne Kristin felt unwell and had to be taken to the hospital . Her medical records revealed that she had a rare heart condition . The doctors asked for her personal information to provide her with the best possible care . Anne Kristin provided her passport number XY765238 , her driver ' s license number AB123456 , and her health insurance ID number NO987654 . The hospital staff also took her fingerprints and asked for her IP address to ensure proper identification and treatment .


In [95]:
create_prompt(q, rt, f, PHI_HINTS)

'Generate a single, realistic sentence from a de-identified clinical report.\n\nInstructions:\n1. The sentence must include exactly one PHI placeholder of the following type: <identifier>.\n2. The placeholder represents: \n3. The sentence should be medically plausible and thematically related to: "At the sunny beach of Stavanger , Anne Kristin Olsen from Oslo was sunbathing with her friends . She was sipping on her piña colada , while her children played nearby in the sand . Suddenly , Anne Kristin felt unwell and had to be taken to the hospital . Her medical records revealed that she had a rare heart condition . The doctors asked for her personal information to provide her with the best possible care . Anne Kristin provided her passport number XY765238 , her driver \' s license number AB123456 , and her health insurance ID number NO987654 . The hospital staff also took her fingerprints and asked for her IP address to ensure proper identification and treatment .".\n4. Do not include an

## Nursing Notes; see Archita's suggestion + other notes

TBD