In [24]:
!pip install stanza



In [25]:
import os
import re
import random

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


In [26]:
# ==== File path ====
CSV_PATH = 'fake_notes.csv'

# ==== Column names in your CSV ====
ID_COL    = 'patient_id'
TEXT_COL  = 'patient_note'
TIME_COL  = 'timestamp'
SEX_COL   = 'sex'
ETHN_COL  = 'ethnicity'
LABEL_COL = 'anxiety'

# ==== Debiasing toggles ====
RUN_LEN_FILTER    = True   # keep top sentences by importance
RUN_LEN_RANDOM    = True   # random sentence selection with same ratio
RUN_GENDER_DEBIAS = True  # pronoun-only gender debiasing (no name heuristics)

# ==== Length filtering parameters ====
LEN_KEEP_RATIO     = 0.8
MIN_SENTENCES_KEEP = 1

# ==== Output path ====
OUT_PATH = os.path.splitext(CSV_PATH)[0] + '_debias.csv'


In [27]:
def split_into_sentences(text: str):
    """Split text into sentences using a simple regex rule.

    If no clear sentence boundary is found, return the whole text as one sentence.
    """
    if not isinstance(text, str) or not text.strip():
        return []
    t = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<=[.!?])\s+', t)
    return [s.strip() for s in sentences if s.strip()] or [t]

def build_tfidf_vectorizer(texts):
    """Fit a TF-IDF vectorizer over the full corpus."""
    vec = TfidfVectorizer(lowercase=True, token_pattern=r'\b\w+\b')
    vec.fit(texts)
    return vec

def sentence_importance(sentences, vec):
    """Compute a simple importance score per sentence based on mean TF-IDF.

    Each sentence is embedded via TF-IDF, and we take the mean weight over
    all tokens in the sentence as its importance.
    """
    if not sentences:
        return []
    X = vec.transform(sentences)
    # Mean TF-IDF value per sentence row
    scores = np.asarray(X.mean(axis=1)).ravel()
    return scores

def filter_by_importance(text, vec, keep_ratio: float = 0.8, min_keep: int = 1):
    """Keep the most important sentences according to TF-IDF scores.

    Sentences are ranked by importance and we keep enough top-ranked ones
    to satisfy `keep_ratio` of the original sentence count, but at least
    `min_keep` sentences.
    """
    if not isinstance(text, str) or not text.strip():
        return text
    sentences = split_into_sentences(text)
    if len(sentences) <= min_keep:
        return text
    scores = sentence_importance(sentences, vec)
    n_sent = len(sentences)
    k = max(min_keep, int(round(n_sent * keep_ratio)))
    k = min(k, n_sent)
    # Top-k sentence indices by score (descending importance)
    top_idx = np.argsort(-scores)[:k]
    top_idx = sorted(top_idx)
    kept = [sentences[i] for i in top_idx]
    return ' '.join(kept)

def filter_by_random(text, keep_ratio: float = 0.8, min_keep: int = 1, rng=None):
    """Randomly keep a subset of sentences with the same ratio as importance-based filtering.

    This acts as a random baseline for sentence dropping.
    """
    if not isinstance(text, str) or not text.strip():
        return text
    if rng is None:
        rng = random
    sentences = split_into_sentences(text)
    n_sent = len(sentences)
    if n_sent <= min_keep:
        return text
    k = max(min_keep, int(round(n_sent * keep_ratio)))
    k = min(k, n_sent)
    idx = list(range(n_sent))
    rng.shuffle(idx)
    kept_idx = sorted(idx[:k])
    kept = [sentences[i] for i in kept_idx]
    return ' '.join(kept)

import stanza

# Initialise Stanza English pipeline with NER
try:
    nlp = stanza.Pipeline(
        lang="en",
        processors="tokenize,ner",
        use_gpu=False
    )
    print("Stanza NER pipeline initialised.")
except Exception as e:
    nlp = None
    print("WARNING: Stanza pipeline could not be initialised:", e)

# Pronoun-only gender debiasing
PRONOUN_MAP = {
    'she': 'they',
    'he': 'they',
    'her': 'their',
    'him': 'them',
    'hers': 'theirs',
    'his': 'theirs',
    'herself': 'themselves',
    'himself': 'themselves',
}

def replace_names_and_pronouns(text: str) -> str:
    """
    Debias text by:
      1. Detecting PERSON names using Stanza NER.
      2. Assigning each distinct PERSON name a stable placeholder Person1, Person2, Person3...
      3. Replacing gendered pronouns using PRONOUN_MAP.

    No heuristic name detection is used — only Stanza NER.
    """
    if not isinstance(text, str) or not text.strip():
        return text

    out = text

    # ---- 1. Identify PERSON entities and assign placeholders ----
    name_map = {}       # original_name -> PersonX
    replacements = []   # list of (start, end, original_name)

    if nlp is not None:
        try:
            doc = nlp(text)
            for ent in doc.entities:
                if ent.type == "PERSON":
                    replacements.append((ent.start_char, ent.end_char, ent.text))
        except Exception as e:
            print("WARNING: Stanza NER failed; skipping name replacement.", e)

    # Build stable mapping: first name -> Person1, second -> Person2, ...
    next_id = 1
    for _, _, original_name in replacements:
        key = original_name.strip()
        if key not in name_map:
            name_map[key] = f"Person{next_id}"
            next_id += 1

    # Replace from end to start to avoid shifting character offsets
    for start, end, original_name in sorted(replacements, key=lambda x: -x[0]):
        placeholder = name_map[original_name.strip()]

        # Capitalization logic
        if original_name.isupper():
            placeholder = placeholder.upper()
        elif original_name[0].isupper():
            placeholder = placeholder.capitalize()
        else:
            placeholder = placeholder.lower()

        out = out[:start] + placeholder + out[end:]

    # ---- 2. Replace gendered pronouns ----
    def pronoun_repl(match):
        src = match.group(0)
        key = src.lower()
        tgt = PRONOUN_MAP.get(key)
        if not tgt:
            return src
        return tgt.capitalize() if src[0].isupper() else tgt

    pattern = r"\b(she|he|her|him|hers|his|herself|himself)\b"
    out = re.sub(pattern, pronoun_repl, out, flags=re.IGNORECASE)

    return out

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.11.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package                   |
-----------------------------------------
| tokenize  | combined                  |
| mwt       | combined                  |
| ner       | ontonotes-ww-multi_charlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Stanza NER pipeline initialised.


## Build corpus TF-IDF and run debiasing

In [28]:
# Load CSV
df = pd.read_csv(CSV_PATH)
assert TEXT_COL in df.columns, f"TEXT_COL '{TEXT_COL}' missing"
print('Rows:', len(df))

# Build TF-IDF vectorizer only if needed
if RUN_LEN_FILTER or RUN_LEN_RANDOM:
    corpus = df[TEXT_COL].fillna('').astype(str).tolist()
    tfidf_vec = build_tfidf_vectorizer(corpus)
else:
    tfidf_vec = None

# Length-based filtering
if RUN_LEN_FILTER and tfidf_vec is not None:
    df['note_len_filtered'] = df[TEXT_COL].fillna('').astype(str).apply(
        lambda t: filter_by_importance(
            t,
            vec=tfidf_vec,
            keep_ratio=LEN_KEEP_RATIO,
            min_keep=MIN_SENTENCES_KEEP,
        )
    )

if RUN_LEN_RANDOM:
    df['note_len_random'] = df[TEXT_COL].fillna('').astype(str).apply(
        lambda t: filter_by_random(
            t,
            keep_ratio=LEN_KEEP_RATIO,
            min_keep=MIN_SENTENCES_KEEP,
        )
    )

# Optional pronoun-only gender debiasing
if RUN_GENDER_DEBIAS:
    df['note_gender_debiased'] = df[TEXT_COL].fillna('').astype(str).apply(
        replace_names_and_pronouns
    )

df.to_csv(OUT_PATH, index=False)
print('Saved to:', OUT_PATH)

df.head(3)


Rows: 300
Saved to: fake_notes_debias.csv


Unnamed: 0,patient_id,timestamp,sex,ethinicity,ethnicity,anxiety,patient_note,note_len_filtered,note_len_random,note_gender_debiased
0,9208,2024-10-29 21:14:00,Female,,White,no,Physical exam unremarkable; vitals within norm...,Physical exam unremarkable; vitals within norm...,not currently anxious. Jordan Walker completed...,Physical exam unremarkable; vitals within norm...
1,9667,2023-12-10 07:06:00,Other,,Black,yes,Noted improved mood since last visit. Dr. Mart...,Noted improved mood since last visit. Martinez...,Noted improved mood since last visit. Dr. Mart...,Noted improved mood since last visit. Dr. Pers...
2,9240,2023-01-25 12:06:00,Male,Hispanic or Latino,,yes,Patient engaged in mindfulness and relaxation ...,Patient engaged in mindfulness and relaxation ...,Patient engaged in mindfulness and relaxation ...,Patient engaged in mindfulness and relaxation ...
