- `conda activate mri`
  - (created in `0_setup.ipynb`)

---

- `jupyter lab` => open this file

---

- Selected Jupyter kernel (`ms_classification`)
  - (created in `0_setup.ipynb`)

---

---

# Experimental Setup Overview

We evaluate two conformal prediction approaches—**Standard (Marginal)** and **Class-Conditional**—across:

- **8 dataset variants**  
- **4 calibration/test distribution pairings**  
  - 2 expected to satisfy CP assumptions  
  - 2 designed to violate CP assumptions  
- **100 Monte Carlo splits** per configuration  

> **Total runs across all configurations:**  
> 2 approaches × 8 variants × 4 cal/test pairings × 100 splits = **6,400 runs**


---

---

# Run Experiments

In [4]:
from dataclasses import dataclass
from typing   import Callable, Optional, List
import numpy as np, pandas as pd, conformal, util

# Prediction tables consumed by the experiment loop below (3T and 1.5T scans).
# NOTE(review): pickle files execute code on load - only read files produced
# by this project's own pipeline.
PREDS_3T_PKL = 'all_unseen_3T_variant_scans_preds_for_baseline_model.pkl'
PREDS_15T_PKL = 'all_unseen_15T_variant_scans_preds_for_baseline_model.pkl'

df3 = pd.read_pickle(PREDS_3T_PKL)
df15 = pd.read_pickle(PREDS_15T_PKL)


# --------------------------------------------------------------------
# 1.  describe each experiment once, declaratively
# --------------------------------------------------------------------
@dataclass
class Setup:
    """Declarative description of one calibration/test pairing.

    Instances are built positionally, so the field order below is part of
    the interface.
    """

    # Identifier stored with every result row, e.g. 'baseline3T-cal_dv3T-test'.
    label: str
    # Variant used for calibration rows; None means "same variant as the test slice".
    cal_variant: Optional[str]
    # Dataframe the calibration scans are drawn from.
    cal_df: pd.DataFrame
    # Dataframe the test scans are drawn from.
    test_df: pd.DataFrame
    # Predicate over a calibration scan id; truthy when the scan counts as MS.
    is_ms_cal: Callable[[str], bool]
    # Same predicate, applied to test scan ids.
    is_ms_test: Callable[[str], bool]

def _ms_if_underscore(scan_id):
    """Flag a scan id as MS when it contains an underscore (rule used for df3)."""
    return '_' in scan_id


def _ms_if_short(scan_id):
    """Flag a scan id as MS when it is at most two characters long (rule used for df15)."""
    return len(scan_id) <= 2


# The four calibration/test pairings, in the order they are run.
setups: List[Setup] = [
    Setup(label='dv3T-cal_dv3T-test',
          cal_variant=None,
          cal_df=df3, test_df=df3,
          is_ms_cal=_ms_if_underscore, is_ms_test=_ms_if_underscore),
    Setup(label='baseline3T-cal_dv3T-test',
          cal_variant='baseline',
          cal_df=df3, test_df=df3,
          is_ms_cal=_ms_if_underscore, is_ms_test=_ms_if_underscore),
    Setup(label='baseline3T-cal_dv1.5T-test',
          cal_variant='baseline',
          cal_df=df3, test_df=df15,
          is_ms_cal=_ms_if_underscore, is_ms_test=_ms_if_short),
    Setup(label='dv1.5T-cal_dv1.5T-test',
          cal_variant=None,
          cal_df=df15, test_df=df15,
          is_ms_cal=_ms_if_short, is_ms_test=_ms_if_short),
]

# Number of scan ids drawn for calibration in every run.
NUM_SELECT = 42
util.set_seeds()

# Accumulators filled by the experiment loop: per-run scan counts and CP result frames.
all_counts = []
all_cp = []

# --------------------------------------------------------------------
# 2.  shared inner routine – run CP once for one (cal,test) pair
# --------------------------------------------------------------------
def run_cp(cal: pd.DataFrame, test: pd.DataFrame):
    """Run conformal prediction twice on one (cal, test) pair and stack the results.

    The first pass uses ``class_conditional=False`` and the second
    ``class_conditional=True``; both use ``alpha=0.10`` with verbose output
    disabled. The two result frames are concatenated in that order under a
    fresh 0..n-1 index.
    """
    passes = [
        conformal.conformal_prediction(cal, test, verbose=False,
                                       alpha=0.10, class_conditional=flag)
        for flag in (False, True)
    ]
    return pd.concat(passes, ignore_index=True)


# --------------------------------------------------------------------
# 3.  main experiment loop
# --------------------------------------------------------------------
N_RUNS = 100  # Monte Carlo splits per calibration/test pairing


def count(ids, fn):
    """Return how many entries of `ids` the predicate `fn` flags as truthy."""
    return sum(fn(x) for x in ids)


# For every pairing: draw N_RUNS disjoint calibration/test id splits, record the
# MS/healthy composition of each split, and run CP once per test variant.
for st in setups:
    ids_cal  = st.cal_df['scan_id'].unique()
    ids_test = st.test_df['scan_id'].unique()
    # The variant list does not depend on the split, so compute it once per setup.
    test_variants = st.test_df['variant_test_data'].unique()

    for run in range(N_RUNS):
        # calibration needs to contain both classes -> sample IDs with retry logic
        cal_ids, final_seed = util.select_calibration_ids_with_class_check(ids_cal, st, NUM_SELECT, run)
        # keep calibration and test disjoint
        ids_test_no_intersect = np.setdiff1d(ids_test, cal_ids, assume_unique=True)
        rng = np.random.default_rng(final_seed)  # Ensure test selection uses the final seed
        test_ids = rng.choice(ids_test_no_intersect, len(ids_test) - NUM_SELECT, replace=False)

        # bookkeeping ------------------------------------------------
        cal_ms, test_ms = count(cal_ids, st.is_ms_cal), count(test_ids, st.is_ms_test)
        all_counts.append({
            "cal_test": st.label, "run": run,
            "cal_num_ms_scans": cal_ms,
            "cal_num_healthy_scans": NUM_SELECT - cal_ms,
            "cal_num_total_scans": NUM_SELECT,
            "test_num_ms_scans": test_ms,
            "test_num_healthy_scans": len(test_ids) - test_ms,
            "test_num_total_scans": len(test_ids)
        })

        # CP for every variant ---------------------------------------
        for vtd in test_variants:
            # calibrate on the fixed variant if one is configured, else on the test variant
            cvtd = vtd if st.cal_variant is None else st.cal_variant

            cal_slice  = st.cal_df.query(
                "variant_test_data == @cvtd and scan_id in @cal_ids")
            test_slice = st.test_df.query(
                "variant_test_data == @vtd and  scan_id in @test_ids")

            cp_res = run_cp(cal_slice, test_slice)
            cp_res["run"] = run
            cp_res["cal_test"] = st.label
            all_cp.append(cp_res)

# final dataframes
counts_df   = pd.DataFrame(all_counts)
df_combined = pd.concat(all_cp, ignore_index=True)


---

---

# Run Overall Baseline Experiment

Minimal script:  

100 Monte-Carlo runs of conformal prediction for the *baseline* `variant_test_data`,  
pulling calibration **and** test from the union of the 3 T and 1.5 T prediction tables.  

- cal ∩ test = ∅  
- calibration sample (NUM_SELECT) must contain both classes  

In [1]:
#
#
# Illustration code producing only 'baseline' result subset of above code cell (makes it easier to see what's going on)
#
#


# from __future__ import annotations
# import numpy as np, pandas as pd, conformal, util

# # ────────────────────────────────────────────────────────────────────────────────
# # 0.  load & combine prediction tables
# # ────────────────────────────────────────────────────────────────────────────────
# df3  = pd.read_pickle("all_unseen_3T_variant_scans_preds_for_baseline_model.pkl")
# df15 = pd.read_pickle("all_unseen_15T_variant_scans_preds_for_baseline_model.pkl")
# all_df = pd.concat([df3, df15], ignore_index=True)

# BASELINE    = "baseline"   # the only variant we evaluate
# NUM_SELECT  = 42
# N_RUNS      = 100

# util.set_seeds()


# # ────────────────────────────────────────────────────────────────────────────────
# # 1.  helper: draw a calibration id set that contains both classes
# # ────────────────────────────────────────────────────────────────────────────────
# def pick_calibration_ids(ids: np.ndarray,
#                          df: pd.DataFrame,
#                          k: int,
#                          run: int,
#                          max_attempts: int = 1_000) -> tuple[np.ndarray, int]:
#     """
#     Re-sample until the chosen `k` IDs contain at least one 0 and one 1 in `class`.
#     Seed escalates by 1000 each retry so test sampling remains reproducible.
#     """
#     for attempt in range(max_attempts):
#         seed = run + attempt * 1000
#         rng  = np.random.default_rng(seed)
#         cal_ids = rng.choice(ids, k, replace=False)

#         if {0, 1}.issubset(df.loc[df.scan_id.isin(cal_ids), "class"].unique()):
#             return cal_ids, seed

#     raise RuntimeError(
#         f"Could not obtain both classes after {max_attempts} attempts (run={run})."
#     )


# # ────────────────────────────────────────────────────────────────────────────────
# # 2.  core CP wrapper (ordinary + class-conditional)
# # ────────────────────────────────────────────────────────────────────────────────
# def run_cp(cal: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
#     cp  = conformal.conformal_prediction(cal, test, alpha=0.10,
#                                          class_conditional=False, verbose=False)
#     ccp = conformal.conformal_prediction(cal, test, alpha=0.10,
#                                          class_conditional=True,  verbose=False)
#     return pd.concat([cp, ccp], ignore_index=True)


# # ────────────────────────────────────────────────────────────────────────────────
# # 3.  Monte-Carlo loop (= 100 disjoint cal/test splits)
# # ────────────────────────────────────────────────────────────────────────────────
# ids_pool   = all_df["scan_id"].unique()
# results    = []
# split_meta = []

# for run in range(N_RUNS):
#     cal_ids, seed = pick_calibration_ids(ids_pool, all_df, NUM_SELECT, run)
#     test_ids      = np.setdiff1d(ids_pool, cal_ids, assume_unique=True)

#     cal_slice  = all_df.loc[(all_df.scan_id.isin(cal_ids))  &
#                             (all_df.variant_test_data == BASELINE)]
#     test_slice = all_df.loc[(all_df.scan_id.isin(test_ids)) &
#                             (all_df.variant_test_data == BASELINE)]

#     # If the baseline subset is too small, skip the run (rare but explicit)
#     if cal_slice.empty or test_slice.empty:
#         continue

#     res = run_cp(cal_slice, test_slice)
#     res["run"] = run
#     results.append(res)

#     split_meta.append({
#         "run": run,
#         "seed": seed,
#         "num_cal": len(cal_slice),
#         "num_test": len(test_slice),
#         "cal_ids": cal_ids.tolist(),   # keep for reproducibility / debugging
#         "test_ids": test_ids.tolist()
#     })

# # make dfs
# cp_df     = pd.concat(results, ignore_index=True)
# splits_df = pd.DataFrame(split_meta)

# # save dfs
# cp_df.to_pickle("baseline_cp_results.pkl")
# splits_df.to_pickle("baseline_split_metadata.pkl")


2025-06-26 16:42:03.642106: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-06-26 16:42:03.672681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750977723.692495 1050975 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750977723.698551 1050975 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-06-26 16:42:03.719074: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

---

---

# Aggregate Conformal Measures

In [24]:
# Grouping keys identifying one CP run, and one configuration across runs.
RUN_KEYS = ["class_conditional", "variant_test_data", "cal_test", "run"]
CONFIG_KEYS = ["class_conditional", "variant_test_data", "cal_test"]

# Within a run: mean for the first three measures, median for the rest.
PER_RUN_STATS = {
    "is_correct": "mean",
    "verdict":    "mean",
    "ps_size":    "mean",
    "confidence": "median",
    "credibility":"median",
    "margin":     "median",
    "actual_class_pred_prob": "median"
}

# --- 1) Per-run measures, split by class ---
runs_by_class = (
    df_combined
    .groupby(RUN_KEYS + ["class"], as_index=False)
    .agg(PER_RUN_STATS)
)

# --- 2) Per-run measures pooled over both classes, tagged class == "all" ---
runs_overall = df_combined.groupby(RUN_KEYS, as_index=False).agg(PER_RUN_STATS)
runs_overall["class"] = "all"

# --- 3) Stack the by-class and pooled rows ---
runs = pd.concat([runs_by_class, runs_overall], ignore_index=True)

# --- 4) Share of class-1 rows in each run, attached to every row of that run ---
prop1 = (
    df_combined
    .assign(is1 = lambda df: df["class"] == 1)
    .groupby(RUN_KEYS, as_index=False)
    .agg(prop_class1 = ("is1", "mean"))
)
runs = runs.merge(prop1, on=RUN_KEYS)

# --- 5) Across-runs summary: median of every per-run value, plus the run count ---
MEDIAN_COLUMNS = [
    "is_correct", "verdict", "ps_size",                                # per-run means
    "confidence", "credibility", "margin", "actual_class_pred_prob",  # per-run medians
    "prop_class1",                                                     # proportion of class 1
]
across_runs_stats = {column: "median" for column in MEDIAN_COLUMNS}
across_runs_stats["run"] = "nunique"  # count of runs

summary = (
    runs
    .groupby(CONFIG_KEYS + ["class"], as_index=False)
    .agg(across_runs_stats)
    .rename(columns={"run": "runs_count"})
)

---

---

# Write Instance-Level Files

In [5]:
# Destinations for the instance-level outputs.
instance_pkl_path = '___4x_cal-test_combos__100x_cp__per_variant_test_data__cp_instance_col.pkl'
instance_csv_path = '___4x_cal-test_combos__100x_cp__per_variant_test_data.csv'
counts_csv_path = '___4x_cal-test_combos__100x_cp__per_variant_test_data__ms_vs_healthy_scan_cnt_per_config_run.csv'

# The pickle keeps every column; the CSV is written without the 'cp' column.
df_combined.to_pickle(instance_pkl_path)
df_combined.drop(columns=['cp']).to_csv(instance_csv_path, index=False)

# Per-run MS vs. healthy scan counts.
counts_df.to_csv(counts_csv_path, index=False)

---

# Write Aggregate Files

In [28]:
# Remove leading/trailing underscores from the variant names in both tables.
for frame in (runs, summary):
    frame['variant_test_data'] = frame['variant_test_data'].apply(lambda name: name.strip('_'))

# One CSV per (table, CP flavour, pooled-vs-by-class) combination.
# Columns: table, class_conditional value, pooled rows (class == 'all')?, destination.
exports = [
    # per-run files
    (runs,    False, True,  '___conformal_measures__runs__marginal.csv'),
    (runs,    False, False, '___conformal_measures__runs__marginal__by_class.csv'),
    (runs,    True,  True,  '___conformal_measures__runs__class_conditional.csv'),
    (runs,    True,  False, '___conformal_measures__runs__class_conditional__by_class.csv'),
    # across-runs files
    (summary, False, True,  '___conformal_measures__summary__marginal.csv'),
    (summary, False, False, '___conformal_measures__summary__marginal__by_class.csv'),
    (summary, True,  True,  '___conformal_measures__summary__class_conditional.csv'),
    (summary, True,  False, '___conformal_measures__summary__class_conditional__by_class.csv'),
]

for frame, cc_flag, pooled, path in exports:
    cc_mask = frame.class_conditional == cc_flag
    if pooled:
        class_mask = frame['class'] == 'all'
    else:
        class_mask = frame['class'] != 'all'
    frame[cc_mask & class_mask].to_csv(path, index=False)

---

# Read Aggregate Files

In [28]:
# Marginal CP, pooled over classes.
# Note: this rebinds `runs` and `summary` to the marginal, pooled subset only.
runs, summary = (
    pd.read_csv('___conformal_measures__runs__marginal.csv'),
    pd.read_csv('___conformal_measures__summary__marginal.csv'),
)

# Marginal CP, broken down by class.
runs_by_class, summary_by_class = (
    pd.read_csv('___conformal_measures__runs__marginal__by_class.csv'),
    pd.read_csv('___conformal_measures__summary__marginal__by_class.csv'),
)

# Class-conditional CP, pooled over classes.
runs_cc, summary_cc = (
    pd.read_csv('___conformal_measures__runs__class_conditional.csv'),
    pd.read_csv('___conformal_measures__summary__class_conditional.csv'),
)

# Class-conditional CP, broken down by class.
runs_cc_by_class, summary_cc_by_class = (
    pd.read_csv('___conformal_measures__runs__class_conditional__by_class.csv'),
    pd.read_csv('___conformal_measures__summary__class_conditional__by_class.csv'),
)

---

# Read Instance-Level Files

In [6]:
# Reload the instance-level outputs written by the "Write Instance-Level Files" cell.
# NOTE(review): pickle files execute code on load - only read files this notebook produced.
instance_pkl_path = '___4x_cal-test_combos__100x_cp__per_variant_test_data__cp_instance_col.pkl'
counts_csv_path = '___4x_cal-test_combos__100x_cp__per_variant_test_data__ms_vs_healthy_scan_cnt_per_config_run.csv'

df_combined = pd.read_pickle(instance_pkl_path)
counts_df = pd.read_csv(counts_csv_path)

---

---

---

---