In [2]:
%pip install peft

Collecting peft
  Downloading peft-0.16.0-py3-none-any.whl (472 kB)
     ---------------------------------------- 0.0/472.3 kB ? eta -:--:--
     ------------------------------------- 472.3/472.3 kB 14.9 MB/s eta 0:00:00
Installing collected packages: peft
Successfully installed peft-0.16.0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 25.1.1
[notice] To update, run: C:\Users\a4293604\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [6]:
# ------------------------------------------------------------
# Install (uncomment if needed)
# !pip install -U "transformers>=4.40" "datasets>=3.0" "accelerate>=0.27" peft scikit-learn pandas matplotlib
# ------------------------------------------------------------

import os
import json
import math
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt

from collections import Counter, defaultdict
from statistics import mean
from typing import List, Dict

from datasets import load_dataset, Dataset
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer, set_seed
)

from peft import LoraConfig, get_peft_model

# =========================
# 0) CONFIG
# =========================
CFG = dict(
    DATASET="LabHC/bias_in_bios",
    TEXT_COL="hard_text",          # "bio" is easier; "hard_text" is harder
    Y_COL="profession",
    G_COL="gender",                # 0=male, 1=female (already in dataset)
    TOP_K_PROFESSIONS=20,          # reduce to top-K classes (None -> keep all)
    MAX_PER_GROUP_TRAIN=400,       # cap per (profession, gender) in train
    MAX_PER_GROUP_DEV=100,
    MAX_PER_GROUP_TEST=200,
    MAX_LEN=128,                   # shorten for speed
    MAX_STEPS=1500,                # HARD cap on updates (tune up/down)
    BATCH_TRAIN=32,
    BATCH_EVAL=64,
    LR=2e-5,
    WEIGHT_DECAY=0.01,
    SEEDS=[1, 2, 3],               # 3 for dev; 5 for final
    LAMBDAS=[0.0, 0.05, 0.1, 0.2], # sweep
    # LoRA
    LORA_R=8,
    LORA_ALPHA=16,
    LORA_DROPOUT=0.05,
    LORA_TARGET_MODULES=["query", "key", "value", "dense"],
    # misc
    OUT_DIR="runs_bias_in_bios",
    FP16=torch.cuda.is_available(),
)

# Ensure reproducibility for meta operations (dataset sampling)
set_seed(42)
print("CUDA:", torch.cuda.is_available(),
      torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

# =========================
# 1) DATA LOADING & DOWNSIZING
# =========================
def keep_topk(ds, label_col, k):
    counts = Counter(ds[label_col])
    topk = {lab for lab, _ in counts.most_common(k)}
    return ds.filter(lambda ex: ex[label_col] in topk)

def stratified_cap(ds, label_col, group_col, cap, seed=42):
    df = ds.to_pandas()
    df_small = (df.groupby([label_col, group_col], group_keys=False)
                  .apply(lambda x: x.sample(n=min(cap, len(x)), random_state=seed))
                  .reset_index(drop=True))
    return Dataset.from_pandas(df_small, preserve_index=False)

print("Loading dataset...")
train_ds = load_dataset(CFG["DATASET"], split="train")
dev_ds   = load_dataset(CFG["DATASET"], split="dev")
test_ds  = load_dataset(CFG["DATASET"], split="test")

print("Original sizes:", len(train_ds), len(dev_ds), len(test_ds))

if CFG["TOP_K_PROFESSIONS"] is not None:
    train_ds = keep_topk(train_ds, CFG["Y_COL"], CFG["TOP_K_PROFESSIONS"])
    dev_ds   = keep_topk(dev_ds,   CFG["Y_COL"], CFG["TOP_K_PROFESSIONS"])
    test_ds  = keep_topk(test_ds,  CFG["Y_COL"], CFG["TOP_K_PROFESSIONS"])

train_ds = stratified_cap(train_ds, CFG["Y_COL"], CFG["G_COL"], CFG["MAX_PER_GROUP_TRAIN"])
dev_ds   = stratified_cap(dev_ds,   CFG["Y_COL"], CFG["G_COL"], CFG["MAX_PER_GROUP_DEV"])
test_ds  = stratified_cap(test_ds,  CFG["Y_COL"], CFG["G_COL"], CFG["MAX_PER_GROUP_TEST"])

print("Downsized sizes:", len(train_ds), len(dev_ds), len(test_ds))

# Global remap labels to 0..K-1 across ALL splits (safety)
all_labels = sorted(set(train_ds[CFG["Y_COL"]]) | set(dev_ds[CFG["Y_COL"]]) | set(test_ds[CFG["Y_COL"]]))
need_remap = (all_labels != list(range(len(all_labels)))) or (min(all_labels) != 0)

if need_remap:
    label2id = {lab: i for i, lab in enumerate(all_labels)}
    def _remap(ex):
        ex[CFG["Y_COL"]] = label2id[ex[CFG["Y_COL"]]]
        return ex
    train_ds = train_ds.map(_remap)
    dev_ds   = dev_ds.map(_remap)
    test_ds  = test_ds.map(_remap)
else:
    label2id = {lab: lab for lab in all_labels}

num_labels = len(label2id)
print("num_labels =", num_labels)

# =========================
# 2) TOKENISATION
# =========================
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tok(batch):
    enc = tokenizer(batch[CFG["TEXT_COL"]], truncation=True, max_length=CFG["MAX_LEN"])
    enc["labels"]    = batch[CFG["Y_COL"]]
    enc["gender_id"] = batch[CFG["G_COL"]]
    return enc

train_tok = train_ds.map(tok, batched=True, remove_columns=train_ds.column_names)
dev_tok   = dev_ds.map(tok,   batched=True, remove_columns=dev_ds.column_names)
test_tok  = test_ds.map(tok,  batched=True, remove_columns=test_ds.column_names)

data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)

# =========================
# 3) METRICS / FAIRNESS
# =========================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "macro_f1": f1_score(labels, preds, average="macro"),
    }

def per_gender_scores(labels, preds, genders):
    out = {}
    for name, gid in [("male", 0), ("female", 1)]:
        mask = (genders == gid)
        if mask.sum() == 0:
            out[name] = {"n": 0, "acc": np.nan, "macro_f1": np.nan}
            continue
        out[name] = {
            "n": int(mask.sum()),
            "acc": accuracy_score(labels[mask], preds[mask]),
            "macro_f1": f1_score(labels[mask], preds[mask], average="macro")
        }
    out["Δacc"] = out["male"]["acc"] - out["female"]["acc"]
    out["Δmacro_f1"] = out["male"]["macro_f1"] - out["female"]["macro_f1"]
    return out

# Optional: ex‑post macro Equalized Odds gap (multi‑class 1-vs-rest)
def macro_equalized_odds_gap(labels, preds, genders, n_classes):
    gaps = []
    labels = np.array(labels)
    preds  = np.array(preds)
    genders = np.array(genders)
    male = genders == 0
    fem  = genders == 1
    eps = 1e-8
    for c in range(n_classes):
        y_bin     = (labels == c).astype(int)
        pred_bin  = (preds  == c).astype(int)
        def rates(mask):
            tn, fp, fn, tp = confusion_matrix(y_bin[mask], pred_bin[mask], labels=[0,1]).ravel()
            tpr = tp / (tp + fn + eps)
            fpr = fp / (fp + tn + eps)
            return tpr, fpr
        tpr_m, fpr_m = rates(male)
        tpr_f, fpr_f = rates(fem)
        gaps.append(abs(tpr_m - tpr_f) + abs(fpr_m - fpr_f))
    return float(np.mean(gaps))

# =========================
# 4) DIFFERENTIABLE EO SURROGATE + TRAINER
# =========================
def batch_equalized_odds_surrogate(logits, labels, genders, num_labels, eps=1e-8):
    """
    Differentiable approximate Equalized Odds loss.
    """
    probs = torch.softmax(logits, dim=-1)  # [B, K]
    labels_onehot = torch.nn.functional.one_hot(labels, num_classes=num_labels).float()

    male_mask = (genders == 0).float().unsqueeze(1)   # [B,1]
    fem_mask  = (genders == 1).float().unsqueeze(1)

    # per class counts
    y_pos_m = (labels_onehot * male_mask).sum(dim=0) + eps
    y_pos_f = (labels_onehot * fem_mask).sum(dim=0) + eps
    y_neg_m = ((1 - labels_onehot) * male_mask).sum(dim=0) + eps
    y_neg_f = ((1 - labels_onehot) * fem_mask).sum(dim=0) + eps

    # TPR proxy
    tpr_m = ((probs * labels_onehot * male_mask).sum(dim=0) / y_pos_m)
    tpr_f = ((probs * labels_onehot * fem_mask ).sum(dim=0) / y_pos_f)

    # FPR proxy
    fpr_m = ((probs * (1 - labels_onehot) * male_mask).sum(dim=0) / y_neg_m)
    fpr_f = ((probs * (1 - labels_onehot) * fem_mask ).sum(dim=0) / y_neg_f)

    eo_per_class = (tpr_m - tpr_f).abs() + (fpr_m - fpr_f).abs()
    return eo_per_class.mean()

from transformers import Trainer

class FairnessTrainer(Trainer):
    def __init__(self, num_labels, fairness_lambda=0.0, **kwargs):
        super().__init__(**kwargs)
        self.fairness_lambda = fairness_lambda
        self.num_labels = num_labels

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels  = inputs.pop("labels")
        genders = inputs.pop("gender_id")
        outputs = model(**inputs)
        logits  = outputs.logits

        ce = torch.nn.functional.cross_entropy(logits, labels)

        if self.fairness_lambda > 0:
            fair_loss = batch_equalized_odds_surrogate(
                logits, labels, genders, self.num_labels
            )
            loss = ce + self.fairness_lambda * fair_loss
        else:
            fair_loss = torch.tensor(0.0, device=logits.device)
            loss = ce

        if return_outputs:
            outputs.loss = loss
            outputs.fairness_loss = fair_loss.detach().item()
            return loss, outputs
        return loss

# =========================
# 5) LoRA model builder
# =========================
def build_lora_bert(num_labels: int):
    model = AutoModelForSequenceClassification.from_pretrained(
        "bert-base-uncased", num_labels=num_labels
    )
    lora_cfg = LoraConfig(
        r=CFG["LORA_R"],
        lora_alpha=CFG["LORA_ALPHA"],
        lora_dropout=CFG["LORA_DROPOUT"],
        bias="none",
        task_type="SEQ_CLS",
        target_modules=CFG["LORA_TARGET_MODULES"]
    )
    model = get_peft_model(model, lora_cfg)
    model.print_trainable_parameters()
    return model

# =========================
# 6) λ SWEEP
# =========================
def run_lambda_sweep(lambdas, seeds, out_dir):
    os.makedirs(out_dir, exist_ok=True)
    all_results = []

    for lam in lambdas:
        for seed in seeds:
            print(f"\n=== λ = {lam}, seed = {seed} ===")
            set_seed(seed)

            model = build_lora_bert(num_labels)

            args = TrainingArguments(
                output_dir=os.path.join(out_dir, f"lambda_{lam}_seed_{seed}"),
                max_steps=CFG["MAX_STEPS"],
                per_device_train_batch_size=CFG["BATCH_TRAIN"],
                per_device_eval_batch_size=CFG["BATCH_EVAL"],
                learning_rate=CFG["LR"],
                weight_decay=CFG["WEIGHT_DECAY"],
                eval_strategy="steps",
                eval_steps=200,
                logging_steps=200,
                save_strategy="no",
                load_best_model_at_end=False,
                seed=seed,
                fp16=CFG["FP16"],
                report_to="none",
                remove_unused_columns=False,
            )

            trainer = FairnessTrainer(
                model=model,
                args=args,
                train_dataset=train_tok,
                eval_dataset=dev_tok,
                tokenizer=tokenizer,
                data_collator=data_collator,
                compute_metrics=compute_metrics,
                fairness_lambda=lam,
                num_labels=num_labels,
            )

            trainer.train()

            # ---- TEST EVAL ----
            test_out = trainer.predict(test_tok)
            preds  = np.argmax(test_out.predictions, axis=1)
            labels = test_out.label_ids
            genders = np.array(test_tok["gender_id"])

            acc  = accuracy_score(labels, preds)
            f1m  = f1_score(labels, preds, average="macro")
            fair = per_gender_scores(labels, preds, genders)

            eo_gap = macro_equalized_odds_gap(labels, preds, genders, num_labels)

            result = {
                "lambda": lam,
                "seed": seed,
                "acc": acc,
                "macro_f1": f1m,
                "delta_acc": fair["Δacc"],
                "delta_macro_f1": fair["Δmacro_f1"],
                "eo_gap": eo_gap,
            }
            all_results.append(result)

            # save run result
            with open(os.path.join(out_dir, f"result_lambda_{lam}_seed_{seed}.json"), "w") as f:
                json.dump(result, f, indent=2)

    # save aggregated
    with open(os.path.join(out_dir, "all_results.json"), "w") as f:
        json.dump(all_results, f, indent=2)
    return all_results

# =========================
# 7) PARETO + AUF
# =========================
def pareto_and_auf(results, util_metric="macro_f1", fair_metric="delta_macro_f1", out_dir=None):
    # aggregate by lambda
    grouped = defaultdict(list)
    for r in results:
        grouped[r["lambda"]].append(r)

    rows = []
    for lam, rs in grouped.items():
        util_vals = [r[util_metric] for r in rs]
        fair_vals = [abs(r[fair_metric]) for r in rs]  # absolute gap
        eo_vals   = [r["eo_gap"] for r in rs]
        rows.append({
            "lambda": lam,
            f"{util_metric}_mean": np.mean(util_vals),
            f"{util_metric}_std":  np.std(util_vals),
            f"|{fair_metric}|_mean": np.mean(fair_vals),
            f"|{fair_metric}|_std":  np.std(fair_vals),
            "eo_gap_mean": np.mean(eo_vals),
            "eo_gap_std":  np.std(eo_vals),
        })
    df = pd.DataFrame(sorted(rows, key=lambda x: x["lambda"]))
    print("\n=== λ Summary ===")
    print(df)

    # Plot Pareto (utility vs |fair|)
    x = df[f"{util_metric}_mean"].values
    y = df[f"|{fair_metric}|_mean"].values
    plt.figure(figsize=(7,5))
    plt.scatter(x, y, c=df["lambda"], cmap="viridis", s=80)
    for i, lam in enumerate(df["lambda"]):
        plt.annotate(f"λ={lam}", (x[i]+1e-3, y[i]+1e-3), fontsize=8)
    plt.xlabel(util_metric)
    plt.ylabel(f"|{fair_metric}| (lower is better)")
    plt.title("Accuracy–Fairness Pareto")
    plt.grid(alpha=0.2)
    if out_dir:
        plt.savefig(os.path.join(out_dir, "pareto.png"), dpi=200, bbox_inches="tight")
    plt.show()

    # AUF: normalise axes to [0,1], integrate (1 - fairness) over utility
    util_norm = (x - x.min()) / (x.max() - x.min() + 1e-8)
    fair_norm = (y - y.min()) / (y.max() - y.min() + 1e-8)
    order = np.argsort(util_norm)
    auf = np.trapz(1 - fair_norm[order], util_norm[order])
    print(f"AUF (normalised): {auf:.3f}")

    if out_dir:
        df.to_csv(os.path.join(out_dir, "lambda_summary.csv"), index=False)
        with open(os.path.join(out_dir, "AUF.txt"), "w") as f:
            f.write(str(auf))

    return df, auf

# =========================
# 8) RUN EVERYTHING
# =========================
if __name__ == "__main__":
    os.makedirs(CFG["OUT_DIR"], exist_ok=True)

    # 1) λ sweep with LoRA + fairness loss
    results = run_lambda_sweep(CFG["LAMBDAS"], CFG["SEEDS"], CFG["OUT_DIR"])

    # 2) Pareto + AUF (you can switch util_metric to "accuracy")
    df_summary, auf = pareto_and_auf(
        results,
        util_metric="macro_f1",
        fair_metric="delta_macro_f1",
        out_dir=CFG["OUT_DIR"]
    )

    print("\nDone. Artifacts saved under:", CFG["OUT_DIR"])


CUDA: False CPU
Loading dataset...
Original sizes: 257478 39642 99069


  .apply(lambda x: x.sample(n=min(cap, len(x)), random_state=seed))
  .apply(lambda x: x.sample(n=min(cap, len(x)), random_state=seed))
  .apply(lambda x: x.sample(n=min(cap, len(x)), random_state=seed))


Downsized sizes: 15768 3881 7820


Map: 100%|██████████| 15768/15768 [00:00<00:00, 17331.87 examples/s]
Map: 100%|██████████| 3881/3881 [00:00<00:00, 17856.70 examples/s]
Map: 100%|██████████| 7820/7820 [00:00<00:00, 18039.32 examples/s]


num_labels = 20


Map: 100%|██████████| 15768/15768 [00:01<00:00, 9365.65 examples/s]
Map: 100%|██████████| 3881/3881 [00:00<00:00, 10354.28 examples/s]
Map: 100%|██████████| 7820/7820 [00:00<00:00, 9995.18 examples/s] 



=== λ = 0.0, seed = 1 ===


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(**kwargs)


trainable params: 1,354,772 || all params: 110,852,392 || trainable%: 1.2221




Step,Training Loss,Validation Loss


KeyboardInterrupt: 