In [12]:
import pandas as pd, numpy as np, math
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score
from scipy.special import expit

# ==== 0) Load ====
PATH = "/Users/harrisonmiller/Capstone/df_after_feature_engineering.csv"
df = pd.read_csv(PATH)


df.columns


  df = pd.read_csv(PATH)


Index(['cve_id', 'cve_year', 'published_date', 'base_score', 'nvd_present',
       'jvn_present', 'eu_present', 'kev_present', 'source_list', 'sources',
       ...
       'tfidf_90', 'tfidf_91', 'tfidf_92', 'tfidf_93', 'tfidf_94', 'tfidf_95',
       'tfidf_96', 'tfidf_97', 'tfidf_98', 'tfidf_99'],
      dtype='object', length=188)

In [13]:
print(df(1))

TypeError: 'DataFrame' object is not callable

In [None]:
# As a 1-row DataFrame (keeps table formatting)
print(df.iloc[[0]])


In [None]:
import pandas as pd

# If df is already loaded:
first5 = df.iloc[:, :5].copy()

# Optional: verify what you're saving
print("Saving columns:", list(first5.columns))

# Save (adjust path if you want a different location/name)
out_path = "/Users/harrisonmiller/Capstone/df_first5cols.csv"
first5.to_csv(out_path, index=False)
print(f"Saved: {out_path}")


In [14]:
# Save first 5 rows to a CSV (all columns)
first5_rows = df.iloc[:5, :].copy()

print("Saving first 5 rows with columns:", list(first5_rows.columns))
out_path = "/Users/harrisonmiller/Capstone/df_first5rows.csv"
first5_rows.to_csv(out_path, index=False)
print(f"Saved: {out_path}")


Saving first 5 rows with columns: ['cve_id', 'cve_year', 'published_date', 'base_score', 'nvd_present', 'jvn_present', 'eu_present', 'kev_present', 'source_list', 'sources', 'source_count', 'cvss_attackvector', 'nvd_base_score', 'cvss_baseseverity', 'cvss_vectorstring', 'cvss_version', 'cwe', 'description_nvd', 'id', 'lastmodified', 'nvd_published', 'references_count', 'sourceidentifier', 'cvss_AV', 'cvss_AC', 'cvss_PR', 'cvss_UI', 'cvss_S', 'cvss_C', 'cvss_I', 'cvss_A', 'year_nvd', 'nvd_id', 'year_jvn', 'jvndb_id', 'title', 'description_jvn', 'jvn_published', 'jvn_base_score', 'cvss_severity', 'affected_products', 'link', 'cve_count', 'missing_cvss', 'missing_cve', 'id_eu', 'enisaUuid', 'description', 'eu_published', 'dateUpdated', 'eu_base_score', 'baseScoreVersion', 'baseScoreVector', 'references', 'aliases', 'epss', 'exploitedSince', 'cveID', 'vendorProject', 'product', 'vulnerabilityName', 'kev_published', 'shortDescription', 'requiredAction', 'dueDate', 'knownRansomwareCampaignUs

In [17]:
# PU vs Logistic Regression (Elkan–Noto) using is_kev as PU label
# Reads: /Users/harrisonmiller/Capstone/combined_df_before_feature_engineering.csv
# Features:
#   ['base_score','repo_publication_lag','cross_listing_count','cross_listing_variance','cwe_risk_factor']

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score, average_precision_score, brier_score_loss, log_loss,
    precision_recall_curve, confusion_matrix, classification_report
)

# -----------------------------
# Config
# -----------------------------
CSV_PATH = "/Users/harrisonmiller/Capstone/df_after_feature_engineering.csv"
FEATURES = ["base_score","repo_publication_lag","cross_listing_count","cross_listing_variance","cwe_risk_factor"]
LABEL_COL = "is_kev"   # boolean -> 1/0
RANDOM_SEED = 42
TEST_SIZE = 0.25
C = 1.0
MAX_ITER = 2000
PRECISION_TARGETS = (0.90, 0.95, 0.98)   # for threshold table
PRIMARY_PRECISION_TARGET = 0.95          # used to build confusion matrices if possible

# -----------------------------
# Helpers
# -----------------------------
def build_logreg_factory(C=1.0, max_iter=2000, class_weight=None, random_state=0):
    return lambda: Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("lr", LogisticRegression(
            C=C, max_iter=max_iter, solver="lbfgs",
            class_weight=class_weight, random_state=random_state
        ))
    ])

def coerce_is_kev_to_int(series: pd.Series) -> np.ndarray:
    if series.dtype == bool:
        return series.astype(int).values
    s = series.astype(str).str.strip().str.lower()
    mapped = s.map({"true": 1, "false": 0, "1": 1, "0": 0, "yes": 1, "no": 0})
    return mapped.fillna(0).astype(int).values

def coerce_features_numeric(df: pd.DataFrame, feat_cols):
    X = df[feat_cols].copy()
    for c in feat_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan)
    return X.values

def estimate_c_elkan_noto(clf_factory, X_pos, X_unl, n_splits=5, random_state=0):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pos_labels = np.ones(len(X_pos))
    probs = []
    y_unl = np.zeros(len(X_unl))
    for tr_idx, te_idx in skf.split(X_pos, pos_labels):
        X_tr_pos, X_te_pos = X_pos[tr_idx], X_pos[te_idx]
        X_train = np.vstack([X_tr_pos, X_unl])
        y_train = np.concatenate([np.ones(len(X_tr_pos)), y_unl])
        clf = clf_factory()
        clf.fit(X_train, y_train)
        probs.append(clf.predict_proba(X_te_pos)[:, 1])
    probs = np.concatenate(probs)
    return float(np.clip(np.mean(probs), 1e-6, 1 - 1e-6))

def train_pu_elkan_noto(X_train, s_train, clf_factory, random_state=0):
    X_pos = X_train[s_train == 1]
    X_unl = X_train[s_train == 0]
    c_hat = estimate_c_elkan_noto(clf_factory, X_pos, X_unl, n_splits=5, random_state=random_state)
    clf_s = clf_factory()
    clf_s.fit(X_train, s_train.astype(int))
    return clf_s, c_hat

def pu_predict_proba(clf_s, c_hat, X):
    ps = clf_s.predict_proba(X)[:, 1]
    py = ps / c_hat
    return np.clip(py, 0.0, 1.0)

def summarize_metrics(y_target, y_score, label):
    """Compute metrics vs whatever target we pass (y_true if available; else s as proxy)."""
    out = {"model": label, "ROC-AUC": np.nan, "PR-AUC": np.nan, "Brier": np.nan, "LogLoss": np.nan}
    if y_target is None or len(np.unique(y_target)) < 2:
        return out
    out["ROC-AUC"] = roc_auc_score(y_target, y_score)
    out["PR-AUC"]  = average_precision_score(y_target, y_score)
    try: out["Brier"] = brier_score_loss(y_target, y_score)
    except: pass
    try: out["LogLoss"] = log_loss(y_target, np.column_stack([1 - y_score, y_score]))
    except: pass
    return out

def thresholds_at_precision(y_target, y_score, precision_targets=(0.90, 0.95, 0.98)):
    if y_target is None or len(np.unique(y_target)) < 2:
        return pd.DataFrame([{"precision_target": pt, "threshold": np.nan,
                              "precision": np.nan, "recall": np.nan, "coverage": np.nan}
                             for pt in precision_targets])
    p, r, t = precision_recall_curve(y_target, y_score)
    rows = []
    for pt in precision_targets:
        idx = np.where(p[:-1] >= pt)[0]
        if len(idx) == 0:
            rows.append({"precision_target": pt, "threshold": np.nan,
                         "precision": np.nan, "recall": np.nan, "coverage": np.nan})
        else:
            j = idx[0]
            thr = t[j]
            rows.append({
                "precision_target": float(pt),
                "threshold": float(thr),
                "precision": float(p[j]),
                "recall": float(r[j]),
                "coverage": float((y_score >= thr).mean()),
            })
    return pd.DataFrame(rows)

def pick_threshold(y_target, y_score, prefer_precision=0.95):
    """
    If we can hit prefer_precision, return that threshold.
    Otherwise, return the threshold that maximizes F1 on y_target.
    """
    if y_target is not None and len(np.unique(y_target)) > 1:
        p, r, t = precision_recall_curve(y_target, y_score)
        idx = np.where(p[:-1] >= prefer_precision)[0]
        if len(idx):
            return float(t[idx[0]]), f"precision≥{prefer_precision:.2f}"
        # F1 sweep
        eps = 1e-12
        f1 = 2*p[:-1]*r[:-1]/(p[:-1]+r[:-1]+eps)
        j = np.nanargmax(f1)
        return float(t[j]), "F1-max"
    # fallback generic median cutoff
    return float(np.median(y_score)), "median-score"

def print_confusion_and_report(name, y_target, y_score, thr, note):
    yhat = (y_score >= thr).astype(int)
    print(f"\n[{name}] threshold={thr:.4f} ({note})")
    if y_target is None or len(np.unique(y_target)) < 2:
        print("  (Target has a single class or is missing; confusion matrix not available.)")
        return
    cm = confusion_matrix(y_target, yhat, labels=[0,1])
    print("Confusion matrix (rows=true [0,1]; cols=pred [0,1]):")
    print(cm)
    print("Classification report:")
    print(classification_report(y_target, yhat, digits=4))

# -----------------------------
# Main
# -----------------------------
def main():
    csv_path = Path(CSV_PATH)
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV not found: {csv_path}")
    use_cols = list(set(FEATURES + [LABEL_COL] + ["y_true"]))
    df = pd.read_csv(csv_path, usecols=lambda c: c in use_cols, low_memory=False)

    # labels and features
    s = coerce_is_kev_to_int(df[LABEL_COL])
    X = coerce_features_numeric(df, FEATURES)
    y_true = df["y_true"].values if "y_true" in df.columns else None

    # split
    X_tr, X_te, s_tr, s_te, idx_tr, idx_te = train_test_split(
        X, s, np.arange(len(s)), test_size=TEST_SIZE, stratify=s, random_state=RANDOM_SEED
    )
    y_true_te = y_true[idx_te] if y_true is not None else None

    # outputs dir
    outdir = csv_path.parent / "pu_outputs"
    outdir.mkdir(parents=True, exist_ok=True)

    # baseline
    base_factory = build_logreg_factory(C=C, max_iter=MAX_ITER, class_weight=None, random_state=RANDOM_SEED)
    baseline = base_factory(); baseline.fit(X_tr, s_tr)
    base_prob = baseline.predict_proba(X_te)[:, 1]

    # PU (Elkan–Noto)
    pu_clf_s, c_hat = train_pu_elkan_noto(X_tr, s_tr, base_factory, random_state=RANDOM_SEED)
    pu_prob = pu_predict_proba(pu_clf_s, c_hat, X_te)

    # choose target for evaluation
    # prefer y_true if present & non-degenerate; else use s_te as a proxy (clearly labeled)
    if y_true_te is not None and len(np.unique(y_true_te)) > 1:
        eval_target = y_true_te
        eval_name   = "y_true (ground truth)"
    else:
        eval_target = s_te
        eval_name   = "s_te (PU proxy: labeled positives vs unlabeled)"

    # metrics
    rows = []
    rows.append(summarize_metrics(eval_target, base_prob, "Baseline LR (unlabeled=neg)"))
    rows.append(summarize_metrics(eval_target, pu_prob, "PU Elkan–Noto (LR base)"))
    metrics_df = pd.DataFrame(rows)
    metrics_df.to_csv(outdir / "pu_vs_baseline_metrics.csv", index=False)

    # thresholds table vs chosen eval target
    thr_base_tbl = thresholds_at_precision(eval_target, base_prob, PRECISION_TARGETS)
    thr_pu_tbl   = thresholds_at_precision(eval_target, pu_prob, PRECISION_TARGETS)
    thr_base_tbl.to_csv(outdir / "thresholds_baseline_vs_eval.csv", index=False)
    thr_pu_tbl.to_csv(outdir / "thresholds_pu_vs_eval.csv", index=False)

    # print summary
    print("\n=== PU vs Baseline Summary (evaluated against:", eval_name, ") ===")
    print(metrics_df.to_string(index=False))
    print(f"\nEstimated c (P(s=1|y=1)) = {c_hat:.6f}")

    # Confusion matrices at useful thresholds (vs chosen eval target)
    thr_b, note_b = pick_threshold(eval_target, base_prob, prefer_precision=PRIMARY_PRECISION_TARGET)
    thr_p, note_p = pick_threshold(eval_target, pu_prob,   prefer_precision=PRIMARY_PRECISION_TARGET)
    print_confusion_and_report("Baseline", eval_target, base_prob, thr_b, note_b)
    print_confusion_and_report("PU Elkan–Noto", eval_target, pu_prob, thr_p, note_p)

    # save predictions
    preds = pd.DataFrame({
        "idx": idx_te,
        "s_test": s_te,
        "baseline_prob": base_prob,
        "pu_prob": pu_prob
    })
    if y_true_te is not None:
        preds["y_true"] = y_true_te
    preds.to_csv(outdir / "predictions_test.csv", index=False)

    # save c
    (outdir / "c_hat.txt").write_text(f"{c_hat:.6f}")

    print(f"\nOutputs written to: {outdir}\n"
          f"- pu_vs_baseline_metrics.csv (vs {eval_name})\n"
          f"- thresholds_baseline_vs_eval.csv\n- thresholds_pu_vs_eval.csv\n"
          f"- predictions_test.csv\n- c_hat.txt\n")

if __name__ == "__main__":
    main()



=== PU vs Baseline Summary (evaluated against: s_te (PU proxy: labeled positives vs unlabeled) ) ===
                      model  ROC-AUC   PR-AUC    Brier  LogLoss
Baseline LR (unlabeled=neg) 0.915866 0.094176 0.004815 0.023981
    PU Elkan–Noto (LR base) 0.914249 0.047759 0.071208 1.084223

Estimated c (P(s=1|y=1)) = 0.033315

[Baseline] threshold=0.9982 (precision≥0.95)
Confusion matrix (rows=true [0,1]; cols=pred [0,1]):
[[37165     0]
 [  185     3]]
Classification report:
              precision    recall  f1-score   support

           0     0.9950    1.0000    0.9975     37165
           1     1.0000    0.0160    0.0314       188

    accuracy                         0.9950     37353
   macro avg     0.9975    0.5080    0.5145     37353
weighted avg     0.9951    0.9950    0.9927     37353


[PU Elkan–Noto] threshold=0.9996 (F1-max)
Confusion matrix (rows=true [0,1]; cols=pred [0,1]):
[[36193   972]
 [  114    74]]
Classification report:
              precision    recall  f1-s

In [18]:
def expected_cost(y, score, thr, c_fp=1.0, c_fn=10.0):
    yhat = (score >= thr).astype(int)
    fp = ((y==0) & (yhat==1)).sum()
    fn = ((y==1) & (yhat==0)).sum()
    return c_fp*fp + c_fn*fn

# sweep thresholds for each model; plot cost vs threshold or report min-cost threshold & cost.


In [None]:
def precision_true_est(pi, tpr, fpr):
    num = pi * tpr
    den = pi * tpr + (1-pi) * fpr
    return num / max(den, 1e-12)


In [19]:
# effective_pu_metrics.py
# Train Baseline LR vs PU (Elkan–Noto) and emit decision-focused metrics/CSVs.
# Path/columns are customized to your dataset.

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    precision_recall_curve, roc_auc_score, average_precision_score,
    confusion_matrix, classification_report, brier_score_loss, log_loss
)

# -----------------------------
# Config
# -----------------------------
CSV_PATH = "/Users/harrisonmiller/Capstone/df_after_feature_engineering.csv"
FEATURES = ["base_score","repo_publication_lag","cross_listing_count","cross_listing_variance","cwe_risk_factor"]
LABEL_COL = "is_kev"   # boolean in your data
RANDOM_SEED = 42
TEST_SIZE = 0.25
C = 1.0
MAX_ITER = 2000

PRECISION_TARGETS = (0.90, 0.95, 0.98)
TOPK_LIST = [50, 100, 200, 500, 1000]        # adjust to your review capacity
COST_GRID = [(1, r) for r in [2,5,10,20,50,100]] + [(5,1),(10,1)]  # (C_FP, C_FN)

# -----------------------------
# Core helpers
# -----------------------------
def build_logreg_factory(C=1.0, max_iter=2000, class_weight=None, random_state=0):
    return lambda: Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler",  StandardScaler()),
        ("lr",      LogisticRegression(
            C=C, max_iter=max_iter, solver="lbfgs",
            class_weight=class_weight, random_state=random_state
        ))
    ])

def coerce_is_kev_to_int(series: pd.Series) -> np.ndarray:
    if series.dtype == bool:
        return series.astype(int).values
    s = series.astype(str).str.strip().str.lower()
    return s.map({"true":1,"false":0,"1":1,"0":0,"yes":1,"no":0}).fillna(0).astype(int).values

def coerce_features_numeric(df: pd.DataFrame, feat_cols):
    X = df[feat_cols].copy()
    for c in feat_cols:
        X[c] = pd.to_numeric(X[c], errors="coerce")
    X = X.replace([np.inf, -np.inf], np.nan)
    return X.values

def estimate_c_elkan_noto(clf_factory, X_pos, X_unl, n_splits=5, random_state=0):
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    pos_labels = np.ones(len(X_pos))
    probs = []
    y_unl = np.zeros(len(X_unl))
    for tr_idx, te_idx in skf.split(X_pos, pos_labels):
        X_tr_pos, X_te_pos = X_pos[tr_idx], X_pos[te_idx]
        X_train = np.vstack([X_tr_pos, X_unl])
        y_train = np.concatenate([np.ones(len(X_tr_pos)), y_unl])
        clf = clf_factory(); clf.fit(X_train, y_train)
        probs.append(clf.predict_proba(X_te_pos)[:, 1])
    c_hat = float(np.clip(np.mean(np.concatenate(probs)), 1e-6, 1-1e-6))
    return c_hat

def train_pu_elkan_noto(X_train, s_train, clf_factory, random_state=0):
    X_pos = X_train[s_train==1]; X_unl = X_train[s_train==0]
    c_hat = estimate_c_elkan_noto(clf_factory, X_pos, X_unl, random_state=random_state)
    clf_s = clf_factory(); clf_s.fit(X_train, s_train.astype(int))
    return clf_s, c_hat

def pu_predict_proba(clf_s, c_hat, X):
    ps = clf_s.predict_proba(X)[:,1]
    return np.clip(ps / c_hat, 0.0, 1.0)

# -----------------------------
# Metric helpers
# -----------------------------
def precision_matched_table(y, score, precision_targets):
    if len(np.unique(y))<2:
        return pd.DataFrame([{"precision_target":pt,"threshold":np.nan,"precision":np.nan,"recall":np.nan,"coverage":np.nan} for pt in precision_targets])
    p, r, t = precision_recall_curve(y, score)
    rows=[]
    for pt in precision_targets:
        idx = np.where(p[:-1] >= pt)[0]
        if len(idx)==0:
            rows.append({"precision_target":pt,"threshold":np.nan,"precision":np.nan,"recall":np.nan,"coverage":np.nan})
        else:
            j = idx[0]
            thr = t[j]
            rows.append({"precision_target":float(pt),"threshold":float(thr),"precision":float(p[j]),"recall":float(r[j]),"coverage":float((score>=thr).mean())})
    return pd.DataFrame(rows)

def topk_yield(y, score, K_list):
    order = np.argsort(-score)
    rows=[]
    for K in K_list:
        idx = order[:K]
        pos_at_k = int(y[idx].sum())
        prec_at_k = pos_at_k / max(K,1)
        rows.append({"K":K,"positives_at_K":pos_at_k,"precision_at_K":prec_at_k})
    return pd.DataFrame(rows)

def expected_cost(y, yhat, c_fp=1.0, c_fn=10.0):
    cm = confusion_matrix(y, yhat, labels=[0,1])
    tn, fp = cm[0,0], cm[0,1]
    fn, tp = cm[1,0], cm[1,1]
    return c_fp*fp + c_fn*fn

def cost_sweep(y, score, cost_grid):
    thr_grid = np.unique(np.round(score, 6))
    rows=[]
    for c_fp, c_fn in cost_grid:
        best = (None, float("inf"))
        for thr in thr_grid:
            yhat = (score>=thr).astype(int)
            cost = expected_cost(y, yhat, c_fp, c_fn)
            if cost < best[1]:
                best = (thr, cost)
        rows.append({"C_FP":c_fp,"C_FN":c_fn,"best_threshold":float(best[0]),"min_cost":float(best[1])})
    return pd.DataFrame(rows)

def risk_coverage(y, score, n=30):
    # confidence = max(p, 1-p); reject lowest confidence to reduce risk (error rate)
    conf = np.maximum(score, 1.0 - score)
    rows=[]
    for cov in np.linspace(0.1, 1.0, n):  # keep top-cov by confidence
        k = int(np.ceil(cov*len(score)))
        idx = np.argsort(-conf)[:k]
        yhat = (score[idx]>=0.5).astype(int)   # fixed 0.5 for a simple curve
        acc = (yhat == y[idx]).mean() if len(idx)>0 else np.nan
        rows.append({"coverage":float(cov),"accuracy":float(acc),"risk":float(1-acc)})
    return pd.DataFrame(rows)

def scalar_metrics(y, score, label):
    out = {"model": label, "ROC-AUC": np.nan, "PR-AUC": np.nan, "Brier": np.nan, "LogLoss": np.nan}
    if len(np.unique(y))<2: return out
    out["ROC-AUC"] = roc_auc_score(y, score)
    out["PR-AUC"]  = average_precision_score(y, score)
    try: out["Brier"] = brier_score_loss(y, score)
    except: pass
    try: out["LogLoss"] = log_loss(y, np.column_stack([1-score, score]))
    except: pass
    return out

def prior_corrected_table(y, score, c_hat, precision_targets):
    # pi_hat ~ P(y=1) ≈ P(s=1)/c_hat (clip to [0,1])
    pi_hat = float(np.clip(y.mean()/max(c_hat,1e-6), 0, 1))
    if len(np.unique(y))<2:
        return pd.DataFrame([{"precision_target":pt,"threshold":np.nan,"TPR":np.nan,"FPR":np.nan,"est_precision_true":np.nan,"pi_hat":pi_hat} for pt in precision_targets])
    p, r, t = precision_recall_curve(y, score)
    rows=[]
    # compute confusion-derived TPR/FPR at the chosen threshold for each precision target vs proxy y
    for pt in precision_targets:
        idx = np.where(p[:-1] >= pt)[0]
        if len(idx)==0:
            rows.append({"precision_target":pt,"threshold":np.nan,"TPR":np.nan,"FPR":np.nan,"est_precision_true":np.nan,"pi_hat":pi_hat})
        else:
            j = idx[0]; thr = t[j]
            yhat = (score>=thr).astype(int)
            cm = confusion_matrix(y, yhat, labels=[0,1])
            tn, fp = cm[0,0], cm[0,1]; fn, tp = cm[1,0], cm[1,1]
            TPR = tp / max(tp+fn,1)   # recall on labeled positives (unbiased for TPR under SCAR)
            FPR = fp / max(fp+tn,1)
            # Precision_true ≈ (pi*TPR) / (pi*TPR + (1-pi)*FPR)
            denom = pi_hat*TPR + (1-pi_hat)*FPR
            prec_true = (pi_hat*TPR) / max(denom, 1e-12)
            rows.append({"precision_target":float(pt),"threshold":float(thr),"TPR":float(TPR),"FPR":float(FPR),
                         "est_precision_true":float(prec_true),"pi_hat":pi_hat})
    return pd.DataFrame(rows)

# -----------------------------
# Main
# -----------------------------
def main():
    csv_path = Path(CSV_PATH)
    outdir = csv_path.parent / "pu_outputs"
    outdir.mkdir(parents=True, exist_ok=True)

    use_cols = list(set(FEATURES + [LABEL_COL] + ["y_true"]))
    df = pd.read_csv(csv_path, usecols=lambda c: c in use_cols, low_memory=False)

    s = coerce_is_kev_to_int(df[LABEL_COL])
    X = coerce_features_numeric(df, FEATURES)
    y_true = df["y_true"].values if "y_true" in df.columns else None

    X_tr, X_te, s_tr, s_te, idx_tr, idx_te = train_test_split(
        X, s, np.arange(len(s)), test_size=TEST_SIZE, stratify=s, random_state=RANDOM_SEED
    )
    y_true_te = y_true[idx_te] if y_true is not None else None

    # prefer y_true if it exists with both classes; else use s_te (PU proxy)
    if y_true_te is not None and len(np.unique(y_true_te))>1:
        eval_y = y_true_te; eval_name = "y_true"
    else:
        eval_y = s_te;      eval_name = "s_te (PU proxy)"

    # Train models
    factory = build_logreg_factory(C=C, max_iter=MAX_ITER, random_state=RANDOM_SEED)
    baseline = factory(); baseline.fit(X_tr, s_tr)
    base_prob = baseline.predict_proba(X_te)[:,1]

    pu_clf_s, c_hat = train_pu_elkan_noto(X_tr, s_tr, factory, random_state=RANDOM_SEED)
    pu_prob = pu_predict_proba(pu_clf_s, c_hat, X_te)

    # 1) Scalar metrics
    scalar_df = pd.DataFrame([
        scalar_metrics(eval_y, base_prob, "Baseline"),
        scalar_metrics(eval_y, pu_prob,   "PU Elkan–Noto")
    ])
    scalar_df.to_csv(outdir/"metrics_scalar.csv", index=False)

    # 2) Precision-matched recall/coverage
    pm_base = precision_matched_table(eval_y, base_prob, PRECISION_TARGETS); pm_base["model"]="Baseline"
    pm_pu   = precision_matched_table(eval_y, pu_prob,   PRECISION_TARGETS); pm_pu["model"]="PU"
    pm_df = pd.concat([pm_base, pm_pu], ignore_index=True)
    pm_df.to_csv(outdir/"metrics_precision_matched.csv", index=False)

    # 3) Top-K yield
    topk_base = topk_yield(eval_y, base_prob, TOPK_LIST); topk_base["model"]="Baseline"
    topk_pu   = topk_yield(eval_y, pu_prob,   TOPK_LIST); topk_pu["model"]="PU"
    topk_df = pd.concat([topk_base, topk_pu], ignore_index=True)
    topk_df.to_csv(outdir/"metrics_topk_yield.csv", index=False)

    # 4) Cost sweep (optimal threshold per cost ratio)
    cost_base = cost_sweep(eval_y, base_prob, COST_GRID); cost_base["model"]="Baseline"
    cost_pu   = cost_sweep(eval_y, pu_prob,   COST_GRID); cost_pu["model"]="PU"
    cost_df = pd.concat([cost_base, cost_pu], ignore_index=True)
    cost_df.to_csv(outdir/"metrics_cost_sweep.csv", index=False)

    # 5) Risk–coverage (accuracy vs keep-rate using confidence)
    rc_base = risk_coverage(eval_y, base_prob); rc_base["model"]="Baseline"
    rc_pu   = risk_coverage(eval_y, pu_prob);   rc_pu["model"]="PU"
    rc_df = pd.concat([rc_base, rc_pu], ignore_index=True)
    rc_df.to_csv(outdir/"metrics_risk_coverage.csv", index=False)

    # 6) Prior-corrected (estimate pi from s and c_hat), report est true precision at targets
    pc_base = prior_corrected_table(eval_y, base_prob, c_hat, PRECISION_TARGETS); pc_base["model"]="Baseline"
    pc_pu   = prior_corrected_table(eval_y, pu_prob,   c_hat, PRECISION_TARGETS); pc_pu["model"]="PU"
    pc_df = pd.concat([pc_base, pc_pu], ignore_index=True)
    pc_df.to_csv(outdir/"metrics_prior_corrected.csv", index=False)

    # Quick console summary
    print(f"\nEvaluated against: {eval_name}")
    print("Scalar metrics:\n", scalar_df.to_string(index=False))
    print("\nPrecision-matched (recall/coverage at targets):\n", pm_df.to_string(index=False))
    print("\nTop-K yield:\n", topk_df.to_string(index=False))
    print("\nCost sweep (min expected cost):\n", cost_df.to_string(index=False))
    print("\nRisk–coverage (first 5 rows):\n", rc_df.head().to_string(index=False))
    print("\nPrior-corrected precision (est.):\n", pc_df.to_string(index=False))
    print(f"\nEstimated c_hat (P(s=1|y=1)) = {c_hat:.6f}")
    print(f"\nOutputs written to: {outdir}")

if __name__ == "__main__":
    main()



Evaluated against: s_te (PU proxy)
Scalar metrics:
         model  ROC-AUC   PR-AUC    Brier  LogLoss
     Baseline 0.915866 0.094176 0.004815 0.023981
PU Elkan–Noto 0.914249 0.047759 0.071208 1.084223

Precision-matched (recall/coverage at targets):
  precision_target  threshold  precision   recall  coverage    model
             0.90   0.998203        1.0 0.015957   0.00008 Baseline
             0.95   0.998203        1.0 0.015957   0.00008 Baseline
             0.98   0.998203        1.0 0.015957   0.00008 Baseline
             0.90        NaN        NaN      NaN       NaN       PU
             0.95        NaN        NaN      NaN       NaN       PU
             0.98        NaN        NaN      NaN       NaN       PU

Top-K yield:
    K  positives_at_K  precision_at_K    model
  50              14           0.280 Baseline
 100              18           0.180 Baseline
 200              27           0.135 Baseline
 500              51           0.102 Baseline
1000              70      