In [0]:
%pip install -q -U google-genai

In [0]:
%restart_python

In [0]:
# Task-level instructions for the extraction model. generate_content() sends this
# text together with each clinical note: either as a user-role part or as one
# segment of a single concatenated prompt, depending on which definition is active.
# NOTE(review): despite the name, this text reads like a role/persona preamble —
# confirm the user/system split is the intended one.
user_prompt = """You are a clinical information extraction engine.
Task: For each input clinical note, output ONE JSON object (NDJSON format) using the provided schema.
Rules:
- Ground everything in the note. Do NOT infer or convert units. Preserve units verbatim.
- Assign labels only if explicitly stated. Do not mark “suspected” unless the note literally says so.
- If a field is absent or unclear, set null or "unknown" as specified. No hallucinations.
- Output NDJSON only. No prose, no headings, no extra text.
"""

In [0]:
# Schema keys plus the detailed extraction rules. generate_content() supplies this
# either as the model's system instruction or as the leading segment of the
# concatenated prompt, depending on which definition is active.
# NOTE(review): the "{...}" / "[...]" placeholders below are not expanded in the
# visible cells, so the model has to guess those sub-fields — confirm intended.
system_prompt = """SCHEMA KEYS (must appear; use null/"unknown" as directed):
note_id, source{system, document_name, encounter_date, care_setting}, patient{age_years, sex_assigned_at_birth, gender_identity},
diagnosis_mentions[], training_label, label_rationale, label_confidence,
salt_wasting_features{...},
genetics{cyp21a2_confirmed, variants[...]},
key_labs_near_note{sodium, potassium, 17_ohp, acth, androstenedione, cortisol, dhea_s, pra, aldosterone, lh, fsh, testosterone_total, testosterone_free},
historical_diagnostic_highs{newborn_screen_17_ohp, max_17_ohp, max_acth},
imaging{tart_present, testicular_ultrasound_summary, adrenal_imaging_summary},
reproductive_endocrine{azoospermia, hypogonadism, semen_analysis_date},
treatment{glucocorticoid{name, dose_detail}, mineralocorticoid{name, dose_detail}, stress_dose_given, adherence_issue_noted},
comorbid_context{hypertension_or_bp_concern, adrenal_neoplasm_history, cushing_disease_history, meningioma_history, bone_density_issue},
evidence{confirmed_label_quote, suspected_label_quote, salt_wasting_quote, genetics_quote, imaging_quote, treatment_quote},
extraction_meta{model, prompt_version, processed_at, has_potential_typos_or_implausible_values}
IMPORTANT RULES:
1) Units: Preserve ground-truth units as written (e.g., "ng/dL", "µg/dL", "ng/L", "ng/mL/h"). Do not convert values or ranges.
   If unit is missing but clearly implied by a labeled table, copy the unit string as shown in that table; else set to null.
   Keep a short "verbatim" field (≤60 chars) with the surrounding snippet if available.
2) Key labs: Choose the result closest to encounter_date within ±30 days. If multiple are equidistant, pick the most recent prior to encounter_date.
   If no date is available, pick the single most prominently presented value and set date=null. If a value uses qualifiers (e.g., ">10000"), set numeric value if parseable and keep the original in "verbatim".
3) Labels:
   - diagnosis_mentions: capture each explicit phrase; classify status = confirmed/suspected/ruled_out; subtype = classic/non_classic/unspecified.
   - training_label:
       * "classic" only if a confirmed classic CAH mention exists and is not negated.
       * "non_classic" only if a confirmed non-classic CAH mention exists and is not negated.
       * Otherwise "none".
   - label_rationale: one brief sentence citing the explicit phrase and (if present) date.
   - label_confidence: 0.9–1.0 when an unambiguous explicit statement exists; otherwise 0.3–0.6.
4) Evidence quotes: Provide compact quotes (≤160 chars) directly copied from the note for audit (e.g., "confirms a diagnosis of classic CAH").
5) Safety & plausibility: If a value seems implausible or has obvious typos (e.g., glucose 897 mg/dL), still extract it literally and set has_potential_typos_or_implausible_values="yes".
6) Output: NDJSON only. Each note becomes exactly one JSON line. Do not emit arrays of objects; emit one object per line.
"""

In [0]:
import os

from google import genai
from google.genai.types import GenerateContentConfig

# The API key is read from the environment rather than embedded in the notebook:
# a key stored in source is visible to everyone who can read or export the
# notebook. Any key that was previously committed here should be revoked.
# Set GEMINI_API_KEY on the cluster (e.g. from a secret scope) before running;
# a missing variable fails fast with KeyError instead of an opaque auth error.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Input export of physician notes (presumably gzip-compressed, judging by the suffix).
gz_file_path = "/Volumes/workspace/default/physician_notes_cah/combined_json_zip.gz"

# Read the file with Spark's JSON reader with the "multiline" option enabled.
df = spark.read.option("multiline", "true").json(gz_file_path)

# Keep only the note text column and convert it to a pandas DataFrame.
data = df.select("note_text").toPandas()


# Call Gemini API
def generate_content(note_text, user_prompt, system_prompt, model="gemini-2.5-flash"):
    """Run the extraction prompt against a single clinical note.

    The task instructions and the note are sent as two text parts of one
    user-role message; ``system_prompt`` is passed as the system instruction.
    Uses the module-level ``client`` and ``GenerateContentConfig``.

    Args:
        note_text: Raw note text. Values that are not a ``str``, or that are
            empty or whitespace-only, are skipped without calling the API.
        user_prompt: Task instructions placed ahead of the note.
        system_prompt: Schema/rules text used as the system instruction.
        model: Model name to call. Defaults to ``"gemini-2.5-flash"``, the
            value that used to be hard-coded.

    Returns:
        ``response.text`` as reported by the SDK, or ``None`` when the note
        was skipped.
    """
    # A blank note carries nothing to extract; skip it rather than pay for a
    # call whose output could only be invented.
    if not isinstance(note_text, str) or not note_text.strip():
        return None

    contents = [
        {
            "role": "user",
            "parts": [
                {"text": user_prompt},
                {"text": note_text},
            ],
        }
    ]
    response = client.models.generate_content(
        model=model,
        contents=contents,
        config=GenerateContentConfig(system_instruction=system_prompt),
    )
    return response.text
# Call the model once per note; the two prompts are forwarded as the extra
# positional arguments after each note's text.
data["response"] = data["note_text"].apply(
    generate_content,
    args=(user_prompt, system_prompt),
)

display(data)


In [0]:
import os

from google import genai

# Take the API key from the environment so that no credential lives in the
# notebook source. A key that was ever committed here should be treated as
# leaked and revoked. GEMINI_API_KEY must be set before this cell runs;
# otherwise the lookup raises KeyError right away.
client = genai.Client(api_key=os.environ["GEMINI_API_KEY"])

# Source file holding the physician notes.
gz_file_path = "/Volumes/workspace/default/physician_notes_cah/combined_json_zip.gz"

# Spark JSON read with the "multiline" option switched on.
df = spark.read.option("multiline", "true").json(gz_file_path)

# Only "note_text" is used below, converted to pandas.
data = df.select("note_text").toPandas()

def generate_content(note_text, user_prompt, system_prompt, model="gemini-2.5-flash"):
    """Run the extraction prompt against a single clinical note.

    This variant sends one flat prompt string: ``system_prompt``,
    ``user_prompt`` and the note joined by newlines, with no separate system
    instruction. Uses the module-level ``client``.

    NOTE: when the cells run top to bottom, this definition replaces any
    earlier function of the same name.

    Args:
        note_text: Raw note text. Values that are not a ``str``, or that are
            empty or whitespace-only, are skipped without calling the API.
        user_prompt: Task instructions (second segment of the prompt).
        system_prompt: Schema/rules text (first segment of the prompt).
        model: Model name to call. Defaults to ``"gemini-2.5-flash"``, the
            value that used to be hard-coded.

    Returns:
        ``response.text`` as reported by the SDK, or ``None`` when the note
        was skipped.
    """
    # Nothing can be extracted from a blank note, so do not spend a call on it.
    if not isinstance(note_text, str) or not note_text.strip():
        return None

    prompt = f"{system_prompt}\n{user_prompt}\n{note_text}"
    response = client.models.generate_content(model=model, contents=prompt)
    return response.text

# One model call per note; both prompts are forwarded by keyword to
# generate_content alongside each note's text.
data["response"] = data["note_text"].apply(
    generate_content,
    user_prompt=user_prompt,
    system_prompt=system_prompt,
)

display(data)

In [0]:
output_path = "/Volumes/workspace/default/physician_notes_cah/gemini_ndjson_output.json"

# Write each response as a line in NDJSON format. Responses are written as
# returned by the model; no JSON validation happens in this cell.
# UTF-8 is requested explicitly because the prompts ask for units to be kept
# verbatim (e.g. "µg/dL") and the platform default encoding is not guaranteed.
with open(output_path, "w", encoding="utf-8") as f:
    for line in data["response"]:
        # Skip rows without a usable response (None or any non-string value),
        # which would otherwise fail on .strip().
        if not isinstance(line, str):
            continue
        record = line.strip()
        # A whitespace-only response would emit a blank line, which is not a
        # valid NDJSON record.
        if record:
            f.write(record + "\n")

In [0]:
import json

output_path = "/Volumes/workspace/default/physician_notes_cah/gemini_ndjson_output.json"


def _strip_code_fence(text):
    """Return ``text`` without a surrounding Markdown code fence, if it has one.

    Model output is often wrapped as a fenced block with an optional language
    tag on the opening line. That wrapper is not JSON, so it has to be removed
    before parsing.
    """
    stripped = text.strip()
    if not stripped.startswith("```"):
        return stripped
    # Drop the whole opening fence line, including any language tag such as "json".
    _, _, body = stripped.partition("\n")
    body = body.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body.strip()


# Re-serialise every valid response as one compact JSON object per line.
# Opening in "w" mode replaces whatever is already stored at output_path.
with open(output_path, "w", encoding="utf-8") as f:
    for line in data["response"]:
        # Rows with no usable response (None, non-string, blank) are skipped.
        if not isinstance(line, str) or not line.strip():
            continue
        try:
            obj = json.loads(_strip_code_fence(line))
        except json.JSONDecodeError:
            print("Invalid JSON:", line)
            continue
        # Only a JSON object can carry note_id. On a list the "in" test would
        # check membership instead, and on a number it would raise TypeError.
        if isinstance(obj, dict) and "note_id" in obj:
            f.write(json.dumps(obj) + "\n")
        else:
            print("Missing note_id:", line)