# 4) Zero Shot Prompt Engineering for Bias Mitigation

In [1]:
import time
import pandas as pd
import matplotlib.pyplot as plt
import json, re
from prompts import (
    prompt_template_no_race,
    prompt_template_with_race,
    prompt_engineered_no_race,               # with fairness phrasing no race
    prompt_engineered_with_race,             # with fairness phrasing and race
    prompt_template_with_race_reasoning,      # (with justification after - with race)
    prompt_template_no_race_reason_Inverse,    # (with justification before - no race)
    prompt_template_with_race_reason_Inverse,  # (with justification before - with race)
    prompt_template_no_race_reason_first,     # CoT no race
    prompt_template_with_race_reason_first,   # CoT with race
    prompt_engineered_with_race_reason_first, # engineered CoT with race
)
from IPython.display import display
from Key import gemini_model, openai_client, GPT5_NANO_MODEL
from openai import APIError, RateLimitError, APITimeoutError, APIConnectionError
from pathlib import Path
import time, random
from google.api_core import exceptions as gax_exc

In [6]:
# Load the dataset that every model cell below reads through `df_source`.
# The preview shows an "Unnamed: 0" column, i.e. an index that was written to the CSV
# and is read back here as an ordinary data column.
data_balanced_100 =  pd.read_csv("(100)Dataset_for_LLM_synthetic.csv", low_memory=False)
display(data_balanced_100.head())



Unnamed: 0,derived_race,action_taken,loan_amount,loan_to_value_ratio,property_value,income,debt_to_income_ratio,applicant_age
0,White,0,555000.0,94.99,585000,91000.0,55.0,39.5
1,White,0,225000.0,97.0,235000,27000.0,60.0,24.0
2,White,0,315000.0,100.0,325000,81000.0,38.0,29.5
3,White,1,225000.0,80.0,275000,50000.0,40.0,59.5
4,White,1,305000.0,90.0,335000,87000.0,47.0,49.5


# GEMINI 2.5 FLASH LITE

In [39]:
# NOTE: this cell was run 6 times. Each run's results were saved to a designated folder and
# can be viewed in the results-and-analysis notebook (#5).
# NOTE(review): the original note says "6 different seeds", but the Gemini request in this cell
# sets only temperature=0 and passes no seed - confirm how the runs were varied.

# Price in USD per single token, written as a per-1M-token price divided by 1,000,000.
INPUT_RATE  = 0.10 / 1_000_000
OUTPUT_RATE = 0.40 / 1_000_000
# Folder the result CSVs are written to. The run number is hard-coded, so re-running the cell
# without editing it overwrites the CSVs already in that folder.
OUT_DIR = Path("Dissertation") / "3b) Single LLM Prompt Engineering for Loan Approval Prediction" / "Gemini 2.5 Flash Lite" / "Run 6"
OUT_DIR.mkdir(parents=True, exist_ok=True)
df_source = data_balanced_100  

# Non-greedy pattern: matches from a "{" up to the nearest following "}".
json_block_pattern = re.compile(r"\{[\s\S]*?\}")
# google.api_core exception classes that are handled as retryable.
RETRYABLE_EXC = (gax_exc.InternalServerError, gax_exc.ServiceUnavailable, gax_exc.DeadlineExceeded)

def fmt(tmpl: str, row: pd.Series) -> str:
    """Fill a prompt template with one row's values, passing missing values as None."""
    fields = {}
    for column, value in row.to_dict().items():
        # NaN/NA cells are handed to str.format as None (rendered as the text "None").
        fields[column] = None if pd.isna(value) else value
    return tmpl.format(**fields)

def call_gemini_json(prompt: str, max_retries: int = 2):
    """Send one prompt to Gemini and extract a 0/1 decision from the JSON in its reply.

    Args:
        prompt: Fully formatted prompt text.
        max_retries: Total number of attempts allowed when an exception listed in
            RETRYABLE_EXC is raised. Values below 1 are treated as 1, so the function
            always returns a result dict (it used to fall through and return None).

    Returns:
        dict with keys "raw" (reply text), "json" (parsed object or {}), "decision"
        (0, 1 or None), "prompt_tokens", "output_tokens", "total_tokens" and "error"
        (None when a decision was parsed, otherwise a short description).
    """

    def _failure(exc):
        """Result record for a call that raised instead of returning a reply."""
        return {
            "raw": "", "json": {}, "decision": None,
            "prompt_tokens": 0, "output_tokens": 0, "total_tokens": 0,
            "error": f"{type(exc).__name__}: {exc}",
        }

    def _extract_decision(text):
        """Return (parsed_object, decision) for the first JSON object holding a 0/1 decision."""
        decoder = json.JSONDecoder()
        first_parsed = None
        pos = text.find("{")
        while pos != -1:
            try:
                # raw_decode consumes one complete JSON value, so nested objects and
                # braces inside string values no longer cut the match short.
                obj, end = decoder.raw_decode(text, pos)
            except ValueError:
                # This brace does not start valid JSON (e.g. braces in prose); try the next.
                pos = text.find("{", pos + 1)
                continue
            if isinstance(obj, dict):
                if first_parsed is None:
                    first_parsed = obj
                val = obj.get("decision")
                # Accepts 0, 1, "0", "1"; rejects booleans, floats and anything else.
                if str(val) in ("0", "1"):
                    return obj, int(val)
            pos = text.find("{", end)
        return (first_parsed if first_parsed is not None else {}), None

    attempts = max(1, max_retries)
    delay = 1.0
    for attempt in range(1, attempts + 1):
        try:
            resp = gemini_model.generate_content(
                prompt,
                generation_config={"temperature": 0},
                request_options={"timeout": 60},
            )
            text = (getattr(resp, "text", "") or "").strip()
            parsed, decision = _extract_decision(text)

            usage = getattr(resp, "usage_metadata", None)
            return {
                "raw": text,
                "json": parsed,
                "decision": decision,
                "prompt_tokens": getattr(usage, "prompt_token_count", 0) or 0,
                "output_tokens": getattr(usage, "candidates_token_count", 0) or 0,
                "total_tokens": getattr(usage, "total_token_count", 0) or 0,
                "error": None if decision in (0, 1) else "no_decision_or_invalid_json",
            }
        except RETRYABLE_EXC as e:
            if attempt == attempts:
                return _failure(e)
            # Exponential backoff with a little jitter before the next attempt.
            time.sleep(delay + random.uniform(0, 0.5))
            delay *= 2
        except Exception as e:
            # Anything not listed as retryable is reported immediately, without retrying.
            return _failure(e)

def run_over_df(df: pd.DataFrame, tmpl: str, batch_size: int = 50, tries_per_row: int = 3) -> pd.DataFrame:
    """Query Gemini once per row, in batches, retrying rows that yield no valid decision.

    Args:
        df: Source rows; each row's columns are substituted into `tmpl`.
        tmpl: Prompt template passed to `fmt`.
        batch_size: Rows per batch (a progress line is printed per batch). Must be >= 1.
        tries_per_row: Maximum calls per row; values below 1 are treated as 1.

    Returns:
        A new frame (fresh 0..n-1 index) with the source columns plus llm_raw, llm_json,
        llm_decision, prompt_tokens, output_tokens, total_tokens and llm_error. An empty
        `df` yields an empty frame that still has those columns.

    Raises:
        ValueError: if `batch_size` is below 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently drop every row.
        raise ValueError("batch_size must be >= 1")
    tries = max(1, tries_per_row)

    out = []
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size].copy()
        recs = []
        for _, r in batch.iterrows():
            prompt = fmt(tmpl, r)
            res = None
            for t in range(tries):
                res = call_gemini_json(prompt, max_retries=2)
                if res.get("decision") in (0, 1):
                    break
                if t < tries - 1:
                    # Growing pause before the next try; no pause after the final one.
                    time.sleep(0.3 * (t + 1))
            # Only the last attempt is kept, so token counts of earlier failed tries are not recorded.
            recs.append(res)
            time.sleep(0.02)
        batch["llm_raw"]        = [x["raw"] for x in recs]
        batch["llm_json"]       = [x["json"] for x in recs]
        batch["llm_decision"]   = pd.to_numeric([x["decision"] for x in recs], errors="coerce")
        batch["prompt_tokens"]  = [x["prompt_tokens"] for x in recs]
        batch["output_tokens"]  = [x["output_tokens"] for x in recs]
        batch["total_tokens"]   = [x["total_tokens"] for x in recs]
        batch["llm_error"]      = [x["error"] for x in recs]
        print(f"Processed rows {i}–{min(i+batch_size, len(df))}")
        out.append(batch)

    if not out:
        # pd.concat([]) raises, so build the empty result with the expected columns by hand.
        empty = df.copy().reset_index(drop=True)
        numeric_cols = {"llm_decision", "prompt_tokens", "output_tokens", "total_tokens"}
        for col in ("llm_raw", "llm_json", "llm_decision", "prompt_tokens",
                    "output_tokens", "total_tokens", "llm_error"):
            empty[col] = pd.Series(dtype="float64" if col in numeric_cols else "object")
        return empty
    return pd.concat(out, ignore_index=True)

def calc_cost(df: pd.DataFrame, input_rate=None, output_rate=None) -> float:
    """Total USD cost of a run, computed from its per-row token counts.

    Args:
        df: Frame with "prompt_tokens" and "output_tokens" columns; missing counts count as 0.
        input_rate: USD per prompt token. Defaults to the notebook-level INPUT_RATE.
        output_rate: USD per output token. Defaults to the notebook-level OUTPUT_RATE.

    Returns:
        Sum over rows of prompt_tokens * input_rate + output_tokens * output_rate.
    """
    # INPUT_RATE / OUTPUT_RATE are reassigned by each model cell; explicit arguments let a
    # caller price a saved run without depending on which cell happened to run last.
    if input_rate is None:
        input_rate = INPUT_RATE
    if output_rate is None:
        output_rate = OUTPUT_RATE
    # Computed on the side so the caller's frame is neither copied nor modified.
    per_row = df["prompt_tokens"].fillna(0) * input_rate + df["output_tokens"].fillna(0) * output_rate
    return float(per_row.sum())

def print_group_table(df: pd.DataFrame):
    """Print per-race decision counts and approval rates, plus the max-min rate gap.

    Prints a notice and returns early when the frame has no "derived_race" column.
    The gap line is printed only when at least two groups are present.
    """
    if "derived_race" not in df.columns:
        print("No 'derived_race' column found.")
        return

    decisions_by_race = df.groupby("derived_race", dropna=False)["llm_decision"]
    # "count" ignores unparsed (NaN) decisions, so Total_Evaluated is the parsed-row count.
    summary = decisions_by_race.agg(
        Total_Evaluated="count",
        Approved="sum",
        Approval_Rate="mean",
    )
    print(summary)

    if summary.shape[0] < 2:
        return
    rates = summary["Approval_Rate"]
    dp_gap = rates.max() - rates.min()
    print(f"Demographic Parity Gap: {dp_gap:.4f}")

# Prompt variants evaluated by this cell; each key is also the stem of the CSV it is saved to.
RUNS = dict(
    baseline_no_race=prompt_template_no_race,
    baseline_with_race=prompt_template_with_race,
    fairness_no_race=prompt_engineered_no_race,
    fairness_with_race=prompt_engineered_with_race,
    justify_after_with_race=prompt_template_with_race_reasoning,
    justify_before_no_race=prompt_template_no_race_reason_Inverse,
    justify_before_with_race=prompt_template_with_race_reason_Inverse,
    CoT_no_race=prompt_template_no_race_reason_first,
    CoT_with_race=prompt_template_with_race_reason_first,
    CoT_Engineered=prompt_engineered_with_race_reason_first,
)

for run_name, tmpl in RUNS.items():
    print(f"\n=== RUN: {run_name} ===")

    # Time the whole variant, then stamp that single wall-clock figure on every row.
    t0 = time.perf_counter()
    df_res = run_over_df(df_source, tmpl, batch_size=50, tries_per_row=3)
    elapsed = time.perf_counter() - t0
    df_res["elapsed_seconds"] = elapsed

    csv_path = OUT_DIR / f"{run_name}.csv"
    df_res.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

    # The mean skips rows without a parsed decision.
    overall_rate = df_res["llm_decision"].mean()
    print(f"Overall approval rate: {overall_rate:.4f}")
    print_group_table(df_res)

    cost_usd = calc_cost(df_res)
    total_prompt, total_output = (
        df_res[column].fillna(0).sum() for column in ("prompt_tokens", "output_tokens")
    )
    total_tokens = total_prompt + total_output
    if elapsed > 0:
        rows_per_sec = len(df_res) / elapsed
        toks_per_sec = total_tokens / elapsed
    else:
        rows_per_sec = float("nan")
        toks_per_sec = float("nan")

    parsed_ok = df_res["llm_decision"].notna().sum()
    print(f"Parsed decisions: {parsed_ok}/{len(df_res)}")
    print(f"Run cost: ${cost_usd:.6f}")
    print(f"Time: {elapsed:.2f}s | Rows/sec: {rows_per_sec:.2f} | Tokens/sec: {toks_per_sec:.2f}")


# Folder the comparison below reads the saved CSVs from.
base = OUT_DIR
def group_rates(csv_path: Path):
    """Read a saved run CSV and return the mean llm_decision per derived_race group."""
    saved_run = pd.read_csv(csv_path)
    decisions_by_race = saved_run.groupby("derived_race")["llm_decision"]
    return decisions_by_race.mean()

# Saved no-race CSV used as the reference for each prompt style.
# "justify_after" points at the baseline no-race file (RUNS has no justify_after_no_race variant).
no_ref = dict(
    baseline=base / "baseline_no_race.csv",
    justify_after=base / "baseline_no_race.csv",
    justify_before=base / "justify_before_no_race.csv",
    CoT=base / "CoT_no_race.csv",
)
# Saved with-race CSV for the same prompt styles.
yes_ref = dict(
    baseline=base / "baseline_with_race.csv",
    justify_after=base / "justify_after_with_race.csv",
    justify_before=base / "justify_before_with_race.csv",
    CoT=base / "CoT_with_race.csv",
)

for style in ("baseline", "justify_after", "justify_before", "CoT"):
    p_no = no_ref[style]
    p_yes = yes_ref[style]
    if p_no.exists() and p_yes.exists():
        no_rates = group_rates(p_no)
        with_rates = group_rates(p_yes)
        # Compare only the groups present in both files.
        common = with_rates.index.intersection(no_rates.index)
        tab = with_rates.loc[common].sub(no_rates.loc[common]).abs().sum()
        print(f"[{style}] Total Absolute Bias (sum |Δ| across groups): {tab:.4f}")
    else:
        print(f"[{style}] missing CSVs, skipping.")


=== RUN: baseline_no_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\Gemini 2.5 Flash Lite\Run 6\baseline_no_race.csv
Overall approval rate: 0.2600
                           Total_Evaluated  Approved  Approval_Rate
derived_race                                                       
Black or African American              100        26           0.26
White                                  100        26           0.26
Demographic Parity Gap: 0.0000
Parsed decisions: 200/200
Run cost: $0.005223
Time: 158.82s | Rows/sec: 1.26 | Tokens/sec: 289.59

=== RUN: baseline_with_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\Gemini 2.5 Flash Lite\Run 6\baseline_with_race.csv
Overall approval rate: 0.3550
                           Total_

# GPT 5 NANO

In [50]:
# NOTE: this cell was run 6 times, with 6 different seeds (17, 42, 13, 1447, 2003, 25). Each run's
# results were saved to a designated folder and can be viewed in the results-and-analysis notebook (#5).
# NOTE(review): the run folder below ("Run 3") and the seed used by call_gpt5nano_json (17) are
# both hard-coded - confirm they were changed together for every run.
# Price in USD per single token, written as a per-1M-token price divided by 1,000,000.
INPUT_RATE  = 0.05 / 1_000_000    
OUTPUT_RATE = 0.40 / 1_000_000    

# Folder the result CSVs are written to; re-running without editing it overwrites earlier CSVs.
OUT_DIR = Path("Dissertation") / "3b) Single LLM Prompt Engineering for Loan Approval Prediction" / "GPT-5 Nano" / "Minimal Level Reasoning" / "Run 3"
OUT_DIR.mkdir(parents=True, exist_ok=True)

df_source = data_balanced_100  

# Non-greedy pattern: matches from a "{" up to the nearest following "}".
json_block_pattern = re.compile(r"\{[\s\S]*?\}")

# OpenAI exception classes that are handled as retryable.
# NOTE(review): APIError is presumably the base class of the other three (and of non-transient
# errors such as bad requests), which would make the rest of the tuple redundant - verify.
RETRYABLE_EXC = (APIError, RateLimitError, APITimeoutError, APIConnectionError)

def fmt(tmpl: str, row: pd.Series) -> str:
    """Fill a prompt template with one row's values, passing missing values as None."""
    fields = {}
    for column, value in row.to_dict().items():
        # NaN/NA cells are handed to str.format as None (rendered as the text "None").
        fields[column] = None if pd.isna(value) else value
    return tmpl.format(**fields)

def call_gpt5nano_json(prompt: str, max_retries: int = 1, seed: int = 17):
    """Send one prompt to GPT-5 Nano and extract a 0/1 decision from the JSON in its reply.

    Args:
        prompt: Fully formatted prompt text, sent as a single user message.
        max_retries: Total number of attempts allowed when an exception listed in
            RETRYABLE_EXC is raised. Values below 1 are treated as 1, so the function
            always returns a result dict (it used to fall through and return None).
        seed: Value passed as the request's `seed`. Defaults to 17, the value that was
            previously hard-coded here.

    Returns:
        dict with keys "raw" (reply text), "json" (parsed object or {}), "decision"
        (0, 1 or None), "prompt_tokens", "output_tokens", "total_tokens" and "error"
        (None when a decision was parsed, otherwise a short description).
    """

    def _failure(exc):
        """Result record for a call that raised instead of returning a reply."""
        return {
            "raw": "", "json": {}, "decision": None,
            "prompt_tokens": 0, "output_tokens": 0, "total_tokens": 0,
            "error": f"{type(exc).__name__}: {exc}",
        }

    def _extract_decision(text):
        """Return (parsed_object, decision) for the first JSON object holding a 0/1 decision."""
        decoder = json.JSONDecoder()
        first_parsed = None
        pos = text.find("{")
        while pos != -1:
            try:
                # raw_decode consumes one complete JSON value, so nested objects and
                # braces inside string values no longer cut the match short.
                obj, end = decoder.raw_decode(text, pos)
            except ValueError:
                # This brace does not start valid JSON (e.g. braces in prose); try the next.
                pos = text.find("{", pos + 1)
                continue
            if isinstance(obj, dict):
                if first_parsed is None:
                    first_parsed = obj
                val = obj.get("decision")
                # Accepts 0, 1, "0", "1"; rejects booleans, floats and anything else.
                if str(val) in ("0", "1"):
                    return obj, int(val)
            pos = text.find("{", end)
        return (first_parsed if first_parsed is not None else {}), None

    attempts = max(1, max_retries)
    delay = 1.0
    for attempt in range(1, attempts + 1):
        try:
            resp = openai_client.chat.completions.create(
                model=GPT5_NANO_MODEL,
                messages=[{"role": "user", "content": prompt}],
                seed=seed,
                response_format={"type": "text"},
                reasoning_effort="minimal",
            )
            text = (resp.choices[0].message.content or "").strip()
            parsed, decision = _extract_decision(text)

            usage = getattr(resp, "usage", None)
            return {
                "raw": text,
                "json": parsed,
                "decision": decision,
                "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                "output_tokens": getattr(usage, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage, "total_tokens", 0) or 0,
                "error": None if decision in (0, 1) else "no_decision_or_invalid_json",
            }
        except RETRYABLE_EXC as e:
            if attempt == attempts:
                return _failure(e)
            # Exponential backoff with a little jitter before the next attempt
            # (only reached when the caller allows more than one attempt).
            time.sleep(delay + random.uniform(0, 0.5))
            delay *= 2
        except Exception as e:
            # Anything not listed as retryable is reported immediately, without retrying.
            return _failure(e)

def run_over_df(df: pd.DataFrame, tmpl: str, batch_size: int = 50, tries_per_row: int = 1) -> pd.DataFrame:
    """Query GPT-5 Nano once per row, in batches, retrying rows that yield no valid decision.

    Args:
        df: Source rows; each row's columns are substituted into `tmpl`.
        tmpl: Prompt template passed to `fmt`.
        batch_size: Rows per batch (a progress line is printed per batch). Must be >= 1.
        tries_per_row: Maximum calls per row; values below 1 are treated as 1.

    Returns:
        A new frame (fresh 0..n-1 index) with the source columns plus llm_raw, llm_json,
        llm_decision, prompt_tokens, output_tokens, total_tokens and llm_error. An empty
        `df` yields an empty frame that still has those columns.

    Raises:
        ValueError: if `batch_size` is below 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently drop every row.
        raise ValueError("batch_size must be >= 1")
    tries = max(1, tries_per_row)

    out = []
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size].copy()
        recs = []
        for _, r in batch.iterrows():
            prompt = fmt(tmpl, r)
            res = None
            for t in range(tries):
                res = call_gpt5nano_json(prompt, max_retries=1)
                if res.get("decision") in (0, 1):
                    break
            # Only the last attempt is kept, so token counts of earlier failed tries are not recorded.
            recs.append(res)
        batch["llm_raw"]        = [x["raw"] for x in recs]
        batch["llm_json"]       = [x["json"] for x in recs]
        batch["llm_decision"]   = pd.to_numeric([x["decision"] for x in recs], errors="coerce")
        batch["prompt_tokens"]  = [x["prompt_tokens"] for x in recs]
        batch["output_tokens"]  = [x["output_tokens"] for x in recs]
        batch["total_tokens"]   = [x["total_tokens"] for x in recs]
        batch["llm_error"]      = [x["error"] for x in recs]
        print(f"Processed rows {i}–{min(i+batch_size, len(df))}")
        out.append(batch)

    if not out:
        # pd.concat([]) raises, so build the empty result with the expected columns by hand.
        empty = df.copy().reset_index(drop=True)
        numeric_cols = {"llm_decision", "prompt_tokens", "output_tokens", "total_tokens"}
        for col in ("llm_raw", "llm_json", "llm_decision", "prompt_tokens",
                    "output_tokens", "total_tokens", "llm_error"):
            empty[col] = pd.Series(dtype="float64" if col in numeric_cols else "object")
        return empty
    return pd.concat(out, ignore_index=True)

def calc_cost(df: pd.DataFrame, input_rate=None, output_rate=None) -> float:
    """Total USD cost of a run, computed from its per-row token counts.

    Args:
        df: Frame with "prompt_tokens" and "output_tokens" columns; missing counts count as 0.
        input_rate: USD per prompt token. Defaults to the notebook-level INPUT_RATE.
        output_rate: USD per output token. Defaults to the notebook-level OUTPUT_RATE.

    Returns:
        Sum over rows of prompt_tokens * input_rate + output_tokens * output_rate.
    """
    # INPUT_RATE / OUTPUT_RATE are reassigned by each model cell; explicit arguments let a
    # caller price a saved run without depending on which cell happened to run last.
    if input_rate is None:
        input_rate = INPUT_RATE
    if output_rate is None:
        output_rate = OUTPUT_RATE
    # Computed on the side so the caller's frame is neither copied nor modified.
    per_row = df["prompt_tokens"].fillna(0) * input_rate + df["output_tokens"].fillna(0) * output_rate
    return float(per_row.sum())

def print_group_table(df: pd.DataFrame):
    """Print per-race decision counts and approval rates, plus the max-min rate gap.

    Prints a notice and returns early when the frame has no "derived_race" column.
    The gap line is printed only when at least two groups are present.
    """
    if "derived_race" not in df.columns:
        print("No 'derived_race' column found.")
        return

    decisions_by_race = df.groupby("derived_race", dropna=False)["llm_decision"]
    # "count" ignores unparsed (NaN) decisions, so Total_Evaluated is the parsed-row count.
    summary = decisions_by_race.agg(
        Total_Evaluated="count",
        Approved="sum",
        Approval_Rate="mean",
    )
    print(summary)

    if summary.shape[0] < 2:
        return
    rates = summary["Approval_Rate"]
    dp_gap = rates.max() - rates.min()
    print(f"Demographic Parity Gap: {dp_gap:.4f}")

# Prompt variants evaluated by this cell; each key is also the stem of the CSV it is saved to.
RUNS = dict(
    CoT_no_race=prompt_template_no_race_reason_first,
    CoT_with_race=prompt_template_with_race_reason_first,
    CoT_Engineered=prompt_engineered_with_race_reason_first,
)

for run_name, tmpl in RUNS.items():
    print(f"\n=== RUN: {run_name} ===")

    # Time the whole variant, then stamp that single wall-clock figure on every row.
    t0 = time.perf_counter()
    df_res = run_over_df(df_source, tmpl, batch_size=50, tries_per_row=1)
    elapsed = time.perf_counter() - t0
    df_res["elapsed_seconds"] = elapsed

    csv_path = OUT_DIR / f"{run_name}_nano.csv"
    df_res.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

    # The mean skips rows without a parsed decision.
    overall_rate = df_res["llm_decision"].mean()
    print(f"Overall approval rate: {overall_rate:.4f}")
    print_group_table(df_res)

    cost_usd = calc_cost(df_res)
    total_prompt, total_output = (
        df_res[column].fillna(0).sum() for column in ("prompt_tokens", "output_tokens")
    )
    total_tokens = total_prompt + total_output
    if elapsed > 0:
        rows_per_sec = len(df_res) / elapsed
        toks_per_sec = total_tokens / elapsed
    else:
        rows_per_sec = float("nan")
        toks_per_sec = float("nan")

    parsed_ok = df_res["llm_decision"].notna().sum()
    print(f"Parsed decisions: {parsed_ok}/{len(df_res)}")
    print(f"Run cost: ${cost_usd:.6f}")
    print(f"Time: {elapsed:.2f}s | Rows/sec: {rows_per_sec:.2f} | Tokens/sec: {toks_per_sec:.2f}")

# Folder the comparison below reads the saved CSVs from.
base = OUT_DIR
def group_rates(csv_path: Path):
    """Read a saved run CSV and return the mean llm_decision per derived_race group."""
    saved_run = pd.read_csv(csv_path)
    decisions_by_race = saved_run.groupby("derived_race")["llm_decision"]
    return decisions_by_race.mean()
# Saved no-race CSV used as the reference for each prompt style.
# "justify_after" points at the baseline no-race file.
# Styles whose CSVs are not present in this run folder are reported as missing and skipped.
no_ref = dict(
    baseline=base / "baseline_no_race_nano.csv",
    justify_after=base / "baseline_no_race_nano.csv",
    justify_before=base / "justify_before_no_race_nano.csv",
    CoT=base / "CoT_no_race_nano.csv",
)
# Saved with-race CSV for the same prompt styles.
yes_ref = dict(
    baseline=base / "baseline_with_race_nano.csv",
    justify_after=base / "justify_after_with_race_nano.csv",
    justify_before=base / "justify_before_with_race_nano.csv",
    CoT=base / "CoT_with_race_nano.csv",
)

for style in ("baseline", "justify_after", "justify_before", "CoT"):
    p_no = no_ref[style]
    p_yes = yes_ref[style]
    if p_no.exists() and p_yes.exists():
        no_rates = group_rates(p_no)
        with_rates = group_rates(p_yes)
        # Compare only the groups present in both files.
        common = with_rates.index.intersection(no_rates.index)
        tab = with_rates.loc[common].sub(no_rates.loc[common]).abs().sum()
        print(f"[{style}] Total Absolute Bias (sum |Δ| across groups): {tab:.4f}")
    else:
        print(f"[{style}] missing CSVs, skipping.")




=== RUN: CoT_no_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\GPT-5 Nano\Minimal Level Reasoning\Run 3\CoT_no_race_nano.csv
Overall approval rate: 0.1700
                           Total_Evaluated  Approved  Approval_Rate
derived_race                                                       
Black or African American              100        15           0.15
White                                  100        19           0.19
Demographic Parity Gap: 0.0400
Parsed decisions: 200/200
Run cost: $0.006417
Time: 344.18s | Rows/sec: 0.58 | Tokens/sec: 143.20

=== RUN: CoT_with_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\GPT-5 Nano\Minimal Level Reasoning\Run 3\CoT_with_race_nano.csv
Overall approval rate: 0.2100
                 

# GPT 5

In [44]:
# NOTE: this cell was run 6 times, with 6 different seeds (17, 42, 13, 1447, 2003, 25). Each run's
# results were saved to a designated folder and can be viewed in the results-and-analysis notebook (#5).
# NOTE(review): the seed used by call_gpt5_json in this cell is 43, which is not in the list
# above, and the run folder below is "Run 4" - confirm which seed produced this run.

# Price in USD per single token, written as a per-1M-token price divided by 1,000,000.
INPUT_RATE  = 1.25 / 1_000_000    
OUTPUT_RATE = 10.00 / 1_000_000   

# Folder the result CSVs are written to; re-running without editing it overwrites earlier CSVs.
OUT_DIR = Path("Dissertation") / "3b) Single LLM Prompt Engineering for Loan Approval Prediction" / "GPT-5" / "Minimal Reasoning" / "Run 4"
OUT_DIR.mkdir(parents=True, exist_ok=True)

df_source = data_balanced_100  

# Non-greedy pattern: matches from a "{" up to the nearest following "}".
json_block_pattern = re.compile(r"\{[\s\S]*?\}")

# OpenAI exception classes that are handled as retryable.
# NOTE(review): APIError is presumably the base class of the other three (and of non-transient
# errors such as bad requests), which would make the rest of the tuple redundant - verify.
RETRYABLE_EXC = (APIError, RateLimitError, APITimeoutError, APIConnectionError)

def fmt(tmpl: str, row: pd.Series) -> str:
    """Fill a prompt template with one row's values, passing missing values as None."""
    fields = {}
    for column, value in row.to_dict().items():
        # NaN/NA cells are handed to str.format as None (rendered as the text "None").
        fields[column] = None if pd.isna(value) else value
    return tmpl.format(**fields)

def call_gpt5_json(prompt: str, max_retries: int = 1, seed: int = 43):
    """Send one prompt to GPT-5 and extract a 0/1 decision from the JSON in its reply.

    Args:
        prompt: Fully formatted prompt text, sent as a single user message.
        max_retries: Total number of attempts allowed when an exception listed in
            RETRYABLE_EXC is raised. Values below 1 are treated as 1, so the function
            always returns a result dict (it used to fall through and return None).
        seed: Value passed as the request's `seed`. Defaults to 43, the value that was
            previously hard-coded here.

    Returns:
        dict with keys "raw" (reply text), "json" (parsed object or {}), "decision"
        (0, 1 or None), "prompt_tokens", "output_tokens", "total_tokens" and "error"
        (None when a decision was parsed, otherwise a short description).
    """

    def _failure(exc):
        """Result record for a call that raised instead of returning a reply."""
        return {
            "raw": "", "json": {}, "decision": None,
            "prompt_tokens": 0, "output_tokens": 0, "total_tokens": 0,
            "error": f"{type(exc).__name__}: {exc}",
        }

    def _extract_decision(text):
        """Return (parsed_object, decision) for the first JSON object holding a 0/1 decision."""
        decoder = json.JSONDecoder()
        first_parsed = None
        pos = text.find("{")
        while pos != -1:
            try:
                # raw_decode consumes one complete JSON value, so nested objects and
                # braces inside string values no longer cut the match short.
                obj, end = decoder.raw_decode(text, pos)
            except ValueError:
                # This brace does not start valid JSON (e.g. braces in prose); try the next.
                pos = text.find("{", pos + 1)
                continue
            if isinstance(obj, dict):
                if first_parsed is None:
                    first_parsed = obj
                val = obj.get("decision")
                # Accepts 0, 1, "0", "1"; rejects booleans, floats and anything else.
                if str(val) in ("0", "1"):
                    return obj, int(val)
            pos = text.find("{", end)
        return (first_parsed if first_parsed is not None else {}), None

    attempts = max(1, max_retries)
    delay = 1.0
    for attempt in range(1, attempts + 1):
        try:
            resp = openai_client.chat.completions.create(
                model="gpt-5",
                messages=[{"role": "user", "content": prompt}],
                seed=seed,
                response_format={"type": "text"},
                reasoning_effort="minimal"
            )
            text = (resp.choices[0].message.content or "").strip()
            parsed, decision = _extract_decision(text)

            usage = getattr(resp, "usage", None)
            return {
                "raw": text,
                "json": parsed,
                "decision": decision,
                "prompt_tokens": getattr(usage, "prompt_tokens", 0) or 0,
                "output_tokens": getattr(usage, "completion_tokens", 0) or 0,
                "total_tokens": getattr(usage, "total_tokens", 0) or 0,
                "error": None if decision in (0, 1) else "no_decision_or_invalid_json",
            }
        except RETRYABLE_EXC as e:
            if attempt == attempts:
                return _failure(e)
            # Exponential backoff with a little jitter before the next attempt
            # (only reached when the caller allows more than one attempt).
            time.sleep(delay + random.uniform(0, 0.5))
            delay *= 2
        except Exception as e:
            # Anything not listed as retryable is reported immediately, without retrying.
            return _failure(e)

def run_over_df(df: pd.DataFrame, tmpl: str, batch_size: int = 50, tries_per_row: int = 1) -> pd.DataFrame:
    """Query GPT-5 once per row, in batches, retrying rows that yield no valid decision.

    Args:
        df: Source rows; each row's columns are substituted into `tmpl`.
        tmpl: Prompt template passed to `fmt`.
        batch_size: Rows per batch (a progress line is printed per batch). Must be >= 1.
        tries_per_row: Maximum calls per row; values below 1 are treated as 1.

    Returns:
        A new frame (fresh 0..n-1 index) with the source columns plus llm_raw, llm_json,
        llm_decision, prompt_tokens, output_tokens, total_tokens and llm_error. An empty
        `df` yields an empty frame that still has those columns.

    Raises:
        ValueError: if `batch_size` is below 1.
    """
    if batch_size < 1:
        # A negative step would make range() yield nothing and silently drop every row.
        raise ValueError("batch_size must be >= 1")
    tries = max(1, tries_per_row)

    out = []
    for i in range(0, len(df), batch_size):
        batch = df.iloc[i:i+batch_size].copy()
        recs = []
        for _, r in batch.iterrows():
            prompt = fmt(tmpl, r)
            res = None
            for t in range(tries):
                res = call_gpt5_json(prompt, max_retries=1)
                if res.get("decision") in (0, 1):
                    break
            # Only the last attempt is kept, so token counts of earlier failed tries are not recorded.
            recs.append(res)
        batch["llm_raw"]        = [x["raw"] for x in recs]
        batch["llm_json"]       = [x["json"] for x in recs]
        batch["llm_decision"]   = pd.to_numeric([x["decision"] for x in recs], errors="coerce")
        batch["prompt_tokens"]  = [x["prompt_tokens"] for x in recs]
        batch["output_tokens"]  = [x["output_tokens"] for x in recs]
        batch["total_tokens"]   = [x["total_tokens"] for x in recs]
        batch["llm_error"]      = [x["error"] for x in recs]
        print(f"Processed rows {i}–{min(i+batch_size, len(df))}")
        out.append(batch)

    if not out:
        # pd.concat([]) raises, so build the empty result with the expected columns by hand.
        empty = df.copy().reset_index(drop=True)
        numeric_cols = {"llm_decision", "prompt_tokens", "output_tokens", "total_tokens"}
        for col in ("llm_raw", "llm_json", "llm_decision", "prompt_tokens",
                    "output_tokens", "total_tokens", "llm_error"):
            empty[col] = pd.Series(dtype="float64" if col in numeric_cols else "object")
        return empty
    return pd.concat(out, ignore_index=True)

def calc_cost(df: pd.DataFrame, input_rate=None, output_rate=None) -> float:
    """Total USD cost of a run, computed from its per-row token counts.

    Args:
        df: Frame with "prompt_tokens" and "output_tokens" columns; missing counts count as 0.
        input_rate: USD per prompt token. Defaults to the notebook-level INPUT_RATE.
        output_rate: USD per output token. Defaults to the notebook-level OUTPUT_RATE.

    Returns:
        Sum over rows of prompt_tokens * input_rate + output_tokens * output_rate.
    """
    # INPUT_RATE / OUTPUT_RATE are reassigned by each model cell; explicit arguments let a
    # caller price a saved run without depending on which cell happened to run last.
    if input_rate is None:
        input_rate = INPUT_RATE
    if output_rate is None:
        output_rate = OUTPUT_RATE
    # Computed on the side so the caller's frame is neither copied nor modified.
    per_row = df["prompt_tokens"].fillna(0) * input_rate + df["output_tokens"].fillna(0) * output_rate
    return float(per_row.sum())

def print_group_table(df: pd.DataFrame):
    """Print per-race decision counts and approval rates, plus the max-min rate gap.

    Prints a notice and returns early when the frame has no "derived_race" column.
    The gap line is printed only when at least two groups are present.
    """
    if "derived_race" not in df.columns:
        print("No 'derived_race' column found.")
        return

    decisions_by_race = df.groupby("derived_race", dropna=False)["llm_decision"]
    # "count" ignores unparsed (NaN) decisions, so Total_Evaluated is the parsed-row count.
    summary = decisions_by_race.agg(
        Total_Evaluated="count",
        Approved="sum",
        Approval_Rate="mean",
    )
    print(summary)

    if summary.shape[0] < 2:
        return
    rates = summary["Approval_Rate"]
    dp_gap = rates.max() - rates.min()
    print(f"Demographic Parity Gap: {dp_gap:.4f}")

# Prompt variants evaluated by this cell; each key is also the stem of the CSV it is saved to.
RUNS = dict(
    baseline_no_race=prompt_template_no_race,
    baseline_with_race=prompt_template_with_race,
)

for run_name, tmpl in RUNS.items():
    print(f"\n=== RUN: {run_name} ===")

    # Time the whole variant, then stamp that single wall-clock figure on every row.
    t0 = time.perf_counter()
    df_res = run_over_df(df_source, tmpl, batch_size=50, tries_per_row=1)
    elapsed = time.perf_counter() - t0
    df_res["elapsed_seconds"] = elapsed

    csv_path = OUT_DIR / f"{run_name}_gpt5.csv"
    df_res.to_csv(csv_path, index=False)
    print(f"Saved: {csv_path}")

    # The mean skips rows without a parsed decision.
    overall_rate = df_res["llm_decision"].mean()
    print(f"Overall approval rate: {overall_rate:.4f}")
    print_group_table(df_res)

    cost_usd = calc_cost(df_res)
    total_prompt, total_output = (
        df_res[column].fillna(0).sum() for column in ("prompt_tokens", "output_tokens")
    )
    total_tokens = total_prompt + total_output
    if elapsed > 0:
        rows_per_sec = len(df_res) / elapsed
        toks_per_sec = total_tokens / elapsed
    else:
        rows_per_sec = float("nan")
        toks_per_sec = float("nan")

    parsed_ok = df_res["llm_decision"].notna().sum()
    print(f"Parsed decisions: {parsed_ok}/{len(df_res)}")
    print(f"Run cost: ${cost_usd:.6f}")
    print(f"Time: {elapsed:.2f}s | Rows/sec: {rows_per_sec:.2f} | Tokens/sec: {toks_per_sec:.2f}")

# Folder the comparison below reads the saved CSVs from.
base = OUT_DIR
def group_rates(csv_path: Path):
    """Read a saved run CSV and return the mean llm_decision per derived_race group."""
    saved_run = pd.read_csv(csv_path)
    decisions_by_race = saved_run.groupby("derived_race")["llm_decision"]
    return decisions_by_race.mean()
# Saved no-race CSV used as the reference for each prompt style.
# "justify_after" points at the baseline no-race file.
# Styles whose CSVs are not present in this run folder are reported as missing and skipped.
no_ref = dict(
    baseline=base / "baseline_no_race_gpt5.csv",
    justify_after=base / "baseline_no_race_gpt5.csv",
    justify_before=base / "justify_before_no_race_gpt5.csv",
    CoT=base / "CoT_no_race_gpt5.csv",
)
# Saved with-race CSV for the same prompt styles.
yes_ref = dict(
    baseline=base / "baseline_with_race_gpt5.csv",
    justify_after=base / "justify_after_with_race_gpt5.csv",
    justify_before=base / "justify_before_with_race_gpt5.csv",
    CoT=base / "CoT_with_race_gpt5.csv",
)

for style in ("baseline", "justify_after", "justify_before", "CoT"):
    p_no = no_ref[style]
    p_yes = yes_ref[style]
    if p_no.exists() and p_yes.exists():
        no_rates = group_rates(p_no)
        with_rates = group_rates(p_yes)
        # Compare only the groups present in both files.
        common = with_rates.index.intersection(no_rates.index)
        tab = with_rates.loc[common].sub(no_rates.loc[common]).abs().sum()
        print(f"[{style}] Total Absolute Bias (sum |Δ| across groups): {tab:.4f}")
    else:
        print(f"[{style}] missing CSVs, skipping.")



=== RUN: baseline_no_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\GPT-5\Minimal Reasoning\Run 4\baseline_no_race_gpt5.csv
Overall approval rate: 0.3550
                           Total_Evaluated  Approved  Approval_Rate
derived_race                                                       
Black or African American              100        37           0.37
White                                  100        34           0.34
Demographic Parity Gap: 0.0300
Parsed decisions: 200/200
Run cost: $0.087593
Time: 544.34s | Rows/sec: 0.37 | Tokens/sec: 80.38

=== RUN: baseline_with_race ===
Processed rows 0–50
Processed rows 50–100
Processed rows 100–150
Processed rows 150–200
Saved: Dissertation\3b) Single LLM Prompt Engineering for Loan Approval Prediction\GPT-5\Minimal Reasoning\Run 4\baseline_with_race_gpt5.csv
Overall approval rate: 0.4100
                    