In [None]:
import pandas as pd
from openai import OpenAI
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from tqdm import tqdm

client = OpenAI(api_key = "")

df = pd.read_csv("500_patients.csv")
df = df.sample(300, random_state=42).reset_index(drop=True)
df.to_csv("300_patients.csv", index=False)

In [2]:
# RQ1 Bullet Prompt (Direct)

def patient_to_bullet_prompt(row):
    gender = "male" if row["is_male"] == 1 else "female"

    # chronic conditions
    conditions = []
    if row["diabetes"]: conditions.append("diabetes")
    if row["hypertension"]: conditions.append("hypertension")
    if row["hyperlipidemia"]: conditions.append("hyperlipidemia")
    if row["obesity"]: conditions.append("obesity")
    if row["cancer"]: conditions.append("cancer")
    if row["heart_failure"]: conditions.append("heart failure")
    cond_text = ", ".join(conditions) if conditions else "none"

    # medications
    meds = []
    if row["insulin"]: meds.append("insulin")
    if row["metformin"]: meds.append("metformin")
    if row["statin"]: meds.append("statins")
    if row["ace_inhibitor"]: meds.append("ACE inhibitor")
    if row["anticoagulant"]: meds.append("anticoagulant")
    med_text = ", ".join(meds) if meds else "none"

    prompt = f"""
Patient Summary:
- Age: {row['age']}
- Sex: {gender}
- Chronic Conditions: {cond_text}
- Total Encounters: {row['encounter_count']}
- Emergency Visits: {row['er_visits']}
- Inpatient Stays: {row['inpatient_visits']}
- Ambulatory Visits: {row['ambulatory_visits']}
- Total Medications: {row['medication_count']}
- Key Medications: {med_text}
- Lab Tests Performed: {row['lab_measurement_count']}
- Major Surgeries: {row['major_surgery_count']}

Task:
Based on this patient summary, assess overall mortality risk.
Answer with only one word: "yes" (patient will die) or "no" (patient will not die).
"""
    return prompt.strip()

# CoT Prompt (RQ2)

def patient_to_bullet_cot_prompt(row):
    base = patient_to_bullet_prompt(row)
    cot = """
Think step by step about the patient's clinical risk factors.
Explain your reasoning process clearly.
Then give the final answer in one word: yes or no.
"""
    return base + cot

In [3]:
# LLM Predict (single-run)
def llm_predict(prompt):
    completion = client.chat.completions.create(
        model="gpt-4.1",
        messages=[{"role": "user", "content": prompt}],
        temperature=0
    )
    content = completion.choices[0].message.content.lower()
    return 1 if "yes" in content else 0

# CoT stability: repeated prediction (RQ2)

def llm_predict_repeat(prompt, n=3):
    results = [llm_predict(prompt) for _ in range(n)]
    
    final_pred = 1 if sum(results) >= 2 else 0

    is_consistent = len(set(results)) == 1

    return final_pred, is_consistent


In [6]:
bullet_preds = []
for _, row in df.iterrows():
    prompt = patient_to_bullet_prompt(row)
    pred = llm_predict(prompt)
    bullet_preds.append(pred)

df["pred_bullet"] = bullet_preds

y_true = df["mortality"].values
y_pred = df["pred_bullet"].values

print("\n===== BULLET-STYLE SUMMARY RESULTS (Bullet Prompt → LLM) =====")
print("F1:", f1_score(y_true, y_pred))
print("AUROC:", roc_auc_score(y_true, y_pred))
print("AUPRC:", average_precision_score(y_true, y_pred))


===== BULLET-STYLE SUMMARY RESULTS (Bullet Prompt → LLM) =====
F1: 0.4
AUROC: 0.6826923076923077
AUPRC: 0.23285714285714287


In [8]:
cot_preds = []
cot_stable = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Running CoT"):
    prompt = patient_to_bullet_cot_prompt(row)

    final_pred, is_consistent = llm_predict_repeat(prompt, n=3)

    cot_preds.append(final_pred)
    cot_stable.append(is_consistent)

df["pred_cot"] = cot_preds
df["cot_stable"] = cot_stable


print("\n===== BULLET + COT =====")
print("F1:", f1_score(y_true, df["pred_cot"]))
print("AUROC:", roc_auc_score(y_true, df["pred_cot"]))
print("AUPRC:", average_precision_score(y_true, df["pred_cot"]))

print("\nCoT Stability Rate:", df["cot_stable"].mean())

Running CoT: 100%|██████████| 300/300 [2:10:38<00:00, 26.13s/it]  


===== BULLET + COT =====
F1: 0.2909090909090909
AUROC: 0.5865384615384615
AUPRC: 0.21333333333333335

CoT Stability Rate: 0.9766666666666667





In [4]:
def patient_to_bullet_cot_prompt2(row):
    base = patient_to_bullet_prompt(row)
    cot = """
Think step by step by evaluating:
- Age-related risk
- Comorbidity burden
- Recent surgeries
- Healthcare utilization patterns

Keep the reasoning short and factual.
Base your reasoning strictly on the provided patient summary.
Then give the final answer in one word: yes or no.
"""
    return base + cot

In [11]:
cot_preds = []
cot_stable = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="Running CoT"):
    prompt = patient_to_bullet_cot_prompt2(row)

    final_pred, is_consistent = llm_predict_repeat(prompt, n=3)

    cot_preds.append(final_pred)
    cot_stable.append(is_consistent)

df["pred_cot"] = cot_preds
df["cot_stable"] = cot_stable


print("\n===== BULLET + COT =====")
print("F1:", f1_score(y_true, df["pred_cot"]))
print("AUROC:", roc_auc_score(y_true, df["pred_cot"]))
print("AUPRC:", average_precision_score(y_true, df["pred_cot"]))

print("\nCoT Stability Rate:", df["cot_stable"].mean())

Running CoT: 100%|██████████| 300/300 [25:15<00:00,  5.05s/it]


===== BULLET + COT =====
F1: 0.43956043956043955
AUROC: 0.6903846153846154
AUPRC: 0.2627450980392157

CoT Stability Rate: 0.92





In [7]:
# RQ3 with guideline context

# prompt + knowledge
def patient_to_knowledge_prompt(row):
    base = patient_to_bullet_prompt(row)

    knowledge = """
Clinical Knowledge:
- Older age substantially increases mortality risk.
- Multiple chronic diseases (e.g., diabetes, hypertension, heart failure, cancer) indicate higher risk.
- Recent major surgeries raise short-term mortality risk.
- Frequent emergency or inpatient visits reflect clinical instability.
- Greater medication use may indicate more severe or complex illness.
"""

    task = """
Using the clinical knowledge above, assess the patient's overall mortality risk.
Answer with only one word: yes (patient will die) or no (patient will not die).
"""
    return base + knowledge + task



In [12]:
knowledge_only_preds = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="RQ3-1: Knowledge Only"):
    prompt = patient_to_knowledge_prompt(row)
    pred = llm_predict(prompt)
    knowledge_only_preds.append(pred)

df["pred_rq3_knowledge"] = knowledge_only_preds

y_true = df["mortality"].values

print("\n===== RQ3-1: Knowledge Only =====")
print("F1:", f1_score(y_true, df["pred_rq3_knowledge"]))
print("AUROC:", roc_auc_score(y_true, df["pred_rq3_knowledge"]))
print("AUPRC:", average_precision_score(y_true, df["pred_rq3_knowledge"]))

RQ3-1: Knowledge Only: 100%|██████████| 300/300 [02:14<00:00,  2.23it/s]


===== RQ3-1: Knowledge Only =====
F1: 0.4339622641509434
AUROC: 0.7048076923076924
AUPRC: 0.2570454545454546





In [13]:
# Knowledge + CoT 

def patient_to_knowledge_cot_prompt(row):
    base = patient_to_bullet_prompt(row)

    knowledge = """
Clinical Knowledge:
- Older age substantially increases mortality risk.
- Multiple chronic diseases (e.g., diabetes, hypertension, heart failure, cancer) indicate higher risk.
- Recent major surgeries raise short-term mortality risk.
- Frequent emergency or inpatient visits reflect clinical instability.
- Greater medication use may indicate more severe or complex illness.
"""

    cot = """
Using the clinical knowledge above, evaluate the patient's mortality risk step by step:
- Age risk:
- Comorbidity risk:
- Surgery risk:
- Healthcare utilization risk:

Provide a concise reasoning based strictly on the patient summary.
Final answer in one word: yes or no.
"""

    return base + knowledge + cot


In [14]:
knowledge_cot_preds = []
knowledge_cot_stable = []

for _, row in tqdm(df.iterrows(), total=len(df), desc="RQ3-2: Knowledge + CoT"):
    prompt = patient_to_knowledge_cot_prompt(row)
    final_pred, is_consistent = llm_predict_repeat(prompt, n=3)

    knowledge_cot_preds.append(final_pred)
    knowledge_cot_stable.append(is_consistent)

df["pred_rq3_knowledge_cot"] = knowledge_cot_preds
df["stable_rq3_knowledge_cot"] = knowledge_cot_stable

print("\n===== RQ3-2: Knowledge + CoT =====")
print("F1:", f1_score(y_true, df["pred_rq3_knowledge_cot"]))
print("AUROC:", roc_auc_score(y_true, df["pred_rq3_knowledge_cot"]))
print("AUPRC:", average_precision_score(y_true, df["pred_rq3_knowledge_cot"]))
print("CoT Stability:", df["stable_rq3_knowledge_cot"].mean())

RQ3-2: Knowledge + CoT: 100%|██████████| 300/300 [27:50<00:00,  5.57s/it]


===== RQ3-2: Knowledge + CoT =====
F1: 0.4523809523809524
AUROC: 0.6894230769230769
AUPRC: 0.2751136363636364
CoT Stability: 0.9533333333333334



