In [2]:
pip install mlxtend

Collecting mlxtend
  Downloading mlxtend-0.23.4-py3-none-any.whl.metadata (7.3 kB)
Downloading mlxtend-0.23.4-py3-none-any.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.4
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix
from scipy.stats import norm # For DeLong p-value
from mlxtend.evaluate import mcnemar_table, mcnemar
import warnings

# Silence every UserWarning for the rest of the session (added for warnings such as those from McNemar with small counts)
warnings.filterwarnings("ignore", category=UserWarning)

#---------------------------------------------------------------
# DeLong Test Implementation (Based on common adaptations)
#---------------------------------------------------------------
# Helpers defined below: _compute_midrank, _fast_delong_auc_variance, delong_test.
# delong_test compares the AUCs of two models scored on the same samples.
def _compute_midrank(x):
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float64)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5 * (i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float64)
    T2[J] = T + 1
    return T2

def _fast_delong_auc_variance(y_true, y_scores):
    """Calculates variance of AUC using Delong's method"""
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    pos_idx = np.where(y_true == 1)[0]
    neg_idx = np.where(y_true == 0)[0]

    m = len(pos_idx)
    n = len(neg_idx)

    if m == 0 or n == 0:
        # Return NaN variance components if AUC is undefined
        nan_array_m = np.full(m, np.nan) if m > 0 else np.array([])
        nan_array_n = np.full(n, np.nan) if n > 0 else np.array([])
        return np.nan, nan_array_m, nan_array_n

    pos_scores = y_scores[pos_idx]
    neg_scores = y_scores[neg_idx]

    ranks = _compute_midrank(np.concatenate((pos_scores, neg_scores)))
    pos_ranks = ranks[:m]
    neg_ranks = ranks[m:]

    #auc = (np.sum(pos_ranks) / m - (m + 1) / 2) / n # AUC calculation not needed here

    v_10 = (pos_ranks / m - (m + 1) / (2 * m)) / n
    v_01 = 1 - (neg_ranks / n - (n + 1) / (2 * n)) / m

    var_auc = (np.var(v_10, ddof=1) / m + np.var(v_01, ddof=1) / n) if m > 1 and n > 1 else np.nan
    return var_auc, v_10, v_01

def delong_test(y_true, y_scores1, y_scores2):
    """
    Performs DeLong's test for comparing two correlated ROC curves.

    Both score vectors must refer to the same samples (same order as y_true).

    Args:
        y_true: True binary labels (0 or 1).
        y_scores1: Predicted probabilities/scores from model 1.
        y_scores2: Predicted probabilities/scores from model 2.
    Returns:
        z_stat: Z-statistic for the difference in AUCs (AUC1 - AUC2).
        p_value: Two-tailed p-value. Returns np.nan if test cannot be performed.
    """
    y_true = np.asarray(y_true)
    y_scores1 = np.asarray(y_scores1)
    y_scores2 = np.asarray(y_scores2)

    try:
        auc1 = roc_auc_score(y_true, y_scores1)
        auc2 = roc_auc_score(y_true, y_scores2)
    except ValueError:
        print("Warning: AUC calculation failed (likely only one class present). Skipping DeLong test.")
        return np.nan, np.nan

    var_auc1, v1_10, v1_01 = _fast_delong_auc_variance(y_true, y_scores1)
    var_auc2, v2_10, v2_01 = _fast_delong_auc_variance(y_true, y_scores2)

    # The helper yields NaN when a class has fewer than two samples, so past
    # this point both component vectors hold at least two entries.
    if np.isnan(var_auc1) or np.isnan(var_auc2):
        print("Warning: Variance calculation failed (each class needs at least two samples). Skipping DeLong test.")
        return np.nan, np.nan

    m = len(v1_10)  # number of positive samples
    n = len(v1_01)  # number of negative samples

    # Covariance between the two AUC estimates, built from the paired components.
    cov_10 = np.cov(v1_10, v2_10, ddof=1)[0, 1]
    cov_01 = np.cov(v1_01, v2_01, ddof=1)[0, 1]
    cov_auc = cov_10 / m + cov_01 / n

    var_diff = var_auc1 + var_auc2 - 2 * cov_auc

    if var_diff <= 1e-8:
        # Degenerate variance: the z-statistic cannot be formed by division.
        if np.isclose(auc1, auc2):
            # Identical AUCs -> no evidence of a difference.
            z_stat = 0.0
            p_value = 1.0
        else:
            # AUCs differ but the variance vanished (numerical issue?); report
            # the limiting values and warn that they are unreliable.
            z_stat = np.inf if auc1 > auc2 else -np.inf
            p_value = 0.0
            print(f"Warning: Non-zero AUC difference but near-zero variance ({var_diff}). Result may be unreliable.")
    else:
        z_stat = (auc1 - auc2) / np.sqrt(var_diff)
        # Survival function instead of 1 - cdf: avoids cancellation to exactly
        # 0.0 when |z| is large.
        p_value = 2.0 * norm.sf(np.abs(z_stat))

    return z_stat, p_value


#---------------------------------------------------------------
# McNemar Test Helper for Specific Metrics
#---------------------------------------------------------------
def _paired_outcome_table(hit1, hit2):
    """Cross-tabulate two boolean success vectors into a 2x2 McNemar table.

    Layout: [[both succeed, only model 1 succeeds],
             [only model 2 succeeds, both fail]].
    """
    return np.array([
        [np.sum(hit1 & hit2), np.sum(hit1 & ~hit2)],
        [np.sum(~hit1 & hit2), np.sum(~hit1 & ~hit2)],
    ])


def perform_mcnemar_metric(metric_name, y_true, y_pred1, y_pred2, alpha=0.05, exact_threshold=25):
    """Performs McNemar test tailored for Accuracy, Sensitivity, or Specificity.

    Args:
        metric_name: 'Accuracy' (all samples, success = prediction equals label),
            'Sensitivity' (samples with label 1, success = predicted 1) or
            'Specificity' (samples with label 0, success = predicted 0).
        y_true: True binary labels (0 or 1).
        y_pred1: Hard predictions of model 1, aligned with y_true.
        y_pred2: Hard predictions of model 2, aligned with y_true.
        alpha: Significance level used only for the wording of the result string.
        exact_threshold: The exact binomial test is used when the number of
            discordant pairs is below this value, otherwise the
            continuity-corrected chi-squared test.
    Returns:
        (result_str, p_value, test_type): a printable summary, the p-value
        (np.nan when the test is not applicable or failed) and the name of the
        test variant that was used.
    Raises:
        ValueError: If metric_name is unknown or the three inputs differ in shape.
    """
    y_true = np.asarray(y_true)
    y_pred1 = np.asarray(y_pred1)
    y_pred2 = np.asarray(y_pred2)

    if not (y_true.shape == y_pred1.shape == y_pred2.shape):
        raise ValueError(
            f"Shape mismatch: y_true {y_true.shape}, y_pred1 {y_pred1.shape}, y_pred2 {y_pred2.shape}"
        )

    if metric_name == 'Accuracy':
        # Standard McNemar: success means the prediction equals the label.
        hit1 = (y_pred1 == y_true)
        hit2 = (y_pred2 == y_true)
        label = "overall accuracy"

    elif metric_name == 'Sensitivity':
        # Restrict to samples whose true label is positive.
        pos_mask = (y_true == 1)
        if np.sum(pos_mask) == 0:
            return "N/A (No true positives)", np.nan, "N/A"
        hit1 = (y_pred1[pos_mask] == 1)
        hit2 = (y_pred2[pos_mask] == 1)
        label = "sensitivity (recall)"

    elif metric_name == 'Specificity':
        # Restrict to samples whose true label is negative.
        neg_mask = (y_true == 0)
        if np.sum(neg_mask) == 0:
            return "N/A (No true negatives)", np.nan, "N/A"
        hit1 = (y_pred1[neg_mask] == 0)
        hit2 = (y_pred2[neg_mask] == 0)
        label = "specificity"

    else:
        raise ValueError(f"Unknown metric for McNemar: {metric_name}")

    tb = _paired_outcome_table(hit1, hit2)

    # Discordant counts: b = only model 1 succeeds, c = only model 2 succeeds.
    b = tb[0, 1]
    c = tb[1, 0]

    if b + c == 0:
        # The models never disagree on this subset, so there is nothing to test.
        p_value = 1.0
        test_type = "N/A (No discordant pairs)"
        chi2_stat = np.nan
    else:
        # Decide the variant *before* calling mcnemar so that test_type is
        # always defined when the error handler below needs it.
        if (b + c) < exact_threshold:
            test_type = "Exact Binomial"
            test_kwargs = {'exact': True}  # mcnemar then returns chi2=None
        else:
            test_type = "Chi-squared (corrected)"
            test_kwargs = {'corrected': True}

        try:
            chi2_stat, p_value = mcnemar(ary=tb, **test_kwargs)
        except ValueError as e:
            # Handle potential errors from mcnemar if table is degenerate
            print(f"  McNemar Error for {label} ({test_type}): {e}. Table:\n{tb}")
            return f"Error ({e})", np.nan, test_type

    # Format result string
    if p_value < alpha:
        significance_str = " (Significant difference)"
    else:
        significance_str = " (No significant difference)"

    if chi2_stat is not None and not np.isnan(chi2_stat):
        result_str = f"Chi2 = {chi2_stat:.4f}, p = {p_value:.4f}{significance_str}"
    else:
        result_str = f"p = {p_value:.4f}{significance_str}"

    return result_str, p_value, test_type


#---------------------------------------------------------------
# Data Loading and Preparation
#---------------------------------------------------------------
# Each dataset dictionary holds the ground-truth labels ('y_true') and, per model, a decision 'threshold' and predicted probabilities ('y_prob').
# --- Internal Test Data ---
data_internal = {
    'y_true': np.array([1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
    'models': {
        'ZeroShot': {'threshold': 0.5740, 'y_prob': np.array([0.52617, 0.47898, 0.33645, 0.64043, 0.43387, 0.99309, 0.20735, 0.61164, 0.85515, 0.58967, 0.47475, 0.48622, 0.58209, 0.71559, 0.44502, 0.58001, 0.56346, 0.48188, 0.38914, 0.76931, 0.59518, 0.45631, 0.58164, 0.72502, 0.87099, 0.49761, 0.54549, 0.41043, 0.34987, 0.46863, 0.34911, 0.94183, 0.61221, 0.56701, 0.20342, 0.6074, 0.51668, 0.47116, 0.51788, 0.69543, 0.51853, 0.57219, 0.47093, 0.48471, 0.44707, 0.85313, 0.75932, 0.51529, 0.58618, 0.40117, 0.44163, 0.58032, 0.64692, 0.36164, 0.16962, 0.33814, 0.5563, 0.42715, 0.5307, 0.37632, 0.49706, 0.39208, 0.37356, 0.33651, 0.42872, 0.47362, 0.59769, 0.34611, 0.65954, 0.3079, 0.42854, 0.57598, 0.44944, 0.39925, 0.48022, 0.31961, 0.34979, 0.48372, 0.49588, 0.54845, 0.48849, 0.22842, 0.7523, 0.40747, 0.52556, 0.39936, 0.55616, 0.47905, 0.52519, 0.42925, 0.31144, 0.30299, 0.56418, 0.42066, 0.45876, 0.34374, 0.22824, 0.38112, 0.39529, 0.35431, 0.61914, 0.39853, 0.45214, 0.38914, 0.35996, 0.53829, 0.46812, 0.35099, 0.30468, 0.31287, 0.34071, 0.33637])},
        'FullViT':  {'threshold': 0.1420, 'y_prob': np.array([0.2192, 0.16266, 0.18904, 0.20166, 0.87417, 0.92504, 0.92511, 0.9271, 0.83369, 0.8235, 0.05416, 0.31417, 0.95837, 0.95701, 0.89981, 0.12477, 0.6112, 0.96219, 0.95605, 0.95877, 0.60991, 0.61259, 0.48937, 0.24332, 0.96211, 0.95268, 0.95786, 0.07384, 0.06768, 0.10564, 0.18248, 0.14968, 0.95509, 0.92354, 0.26097, 0.74518, 0.12066, 0.95648, 0.9355, 0.0781, 0.10163, 0.19167, 0.06948, 0.96469, 0.94219, 0.95774, 0.78081, 0.06712, 0.15365, 0.37505, 0.18522, 0.05536, 0.07287, 0.10791, 0.93738, 0.95436, 0.89235, 0.93557, 0.78139, 0.11477, 0.06343, 0.06432, 0.07773, 0.07609, 0.09047, 0.96213, 0.93813, 0.86297, 0.28637, 0.05359, 0.96094, 0.06805, 0.07044, 0.11531, 0.05312, 0.05494, 0.05783, 0.07351, 0.14081, 0.13365, 0.07006, 0.15859, 0.06069, 0.0846, 0.05175, 0.06222, 0.07038, 0.08178, 0.04561, 0.06992, 0.08613, 0.09393, 0.06196, 0.0651, 0.05763, 0.04713, 0.05645, 0.05049, 0.0479, 0.06074, 0.07366, 0.05771, 0.10964, 0.90971, 0.04996, 0.93999, 0.65352, 0.095, 0.18472, 0.06433, 0.05329, 0.06643])},
        'LoRAViT':  {'threshold': 0.1970, 'y_prob': np.array([0.01613, 0.01935, 0.36524, 0.00636, 0.97316, 0.99978, 0.99802, 0.99192, 0.98843, 0.18841, 0.01478, 0.09324, 0.99949, 0.99938, 0.98531, 0.32218, 0.93372, 0.99976, 0.99971, 0.99836, 0.97499, 0.30376, 0.38575, 0.1296, 0.99953, 0.99399, 0.99921, 0.00317, 0.08429, 0.02155, 0.071, 0.01526, 0.99434, 0.99517, 0.0635, 0.99351, 0.25384, 0.99982, 0.99979, 0.01637, 0.00456, 0.06334, 0.00871, 0.99867, 0.23485, 0.99899, 0.98433, 0.44371, 0.29501, 0.99916, 0.08149, 0.00802, 0.00522, 0.0608, 0.97996, 0.99881, 0.99538, 0.99228, 0.74916, 0.05503, 0.01974, 0.00232, 0.05114, 0.01115, 0.00629, 0.99898, 0.99938, 0.9937, 0.06682, 0.23106, 0.99731, 0.12343, 0.07078, 0.02656, 0.01274, 0.00275, 0.00989, 0.0042, 0.0233, 0.13212, 0.00229, 0.01576, 0.00609, 0.00294, 0.00537, 0.00427, 0.00132, 0.04712, 0.0002, 0.00234, 0.00483, 0.00624, 0.00079, 0.01528, 0.00035, 0.00055, 0.00662, 0.00069, 0.00071, 0.00151, 0.00093, 0.00046, 0.00891, 0.99948, 0.00044, 0.99893, 0.9976, 0.03384, 0.03142, 0.00099, 0.00175, 0.02231])},
        'BestCNN':  {'threshold': 0.5094, 'y_prob': np.array([0.09006, 0.91896, 0.96663, 0.00013, 0.87287, 0.98939, 0.99495, 0.99946, 0.95363, 0.98151, 0.00069, 0.00016, 0.99212, 0.99264, 0.99782, 0.96062, 0.99788, 0.99613, 0.99757, 0.95301, 0.99891, 0.30981, 0.37277, 0.0099, 0.99991, 0.99862, 0.98444, 0.02715, 0.92369, 0.97706, 0.99849, 0.00146, 0.9995, 0.99215, 0.88026, 0.99692, 0.92634, 0.99825, 0.99972, 0.90989, 0.00786, 0.99063, 0.00047, 0.99432, 0.99985, 0.997, 0.9979, 0.64822, 0.98378, 0.99615, 0.02189, 0.00043, 0.00523, 0.00085, 0.94326, 0.99717, 0.99832, 0.98859, 0.95163, 0.23519, 0.00063, 0.00802, 0.00137, 0.00448, 0.01402, 0.99934, 0.99741, 0.99962, 0.02694, 0.00143, 0.99882, 0.0038, 0.27196, 0.01143, 0.00083, 0.00997, 0.00035, 0.00073, 0.00629, 0.0036, 0.0136, 0.00422, 0.00449, 0.00051, 0.01083, 0.0015, 0.00108, 0.00991, 7e-05, 0.01546, 0.00038, 0.01545, 0.01759, 0.00119, 3e-05, 2e-05, 5e-05, 1e-05, 0.00024, 0.00014, 0.00015, 0.00109, 3e-05, 0.9917, 1e-05, 0.99872, 0.98342, 0.00106, 0.00536, 0.0044, 0.00031, 0.00016])}
    }
}

# --- External Test Data ---
data_external = {
    'y_true': np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0]),
    'models': {
        'ZeroShot': {'threshold': 0.5740, 'y_prob': np.array([0.46126, 0.53647, 0.75116, 0.35425, 0.50483, 0.60889, 0.85738, 0.48357, 0.55024, 0.43463, 0.44873, 0.3591, 0.63808, 0.64984, 0.3644, 0.83021, 0.44398, 0.64485, 0.58173, 0.42758, 0.39882, 0.53893, 0.5503, 0.70008, 0.543, 0.55256, 0.71665, 0.48571, 0.48571, 0.73148, 0.29672, 0.48655, 0.35402, 0.56987, 0.76196, 0.4739, 0.38737, 0.57607, 0.37606, 0.74428, 0.39777, 0.22691, 0.52507, 0.80487, 0.39856, 0.44952, 0.38899, 0.40976, 0.51784, 0.50628, 0.5631, 0.45881, 0.29865, 0.3927, 0.75268, 0.55401, 0.46005, 0.44801, 0.48979, 0.44797, 0.41668, 0.37455, 0.65211, 0.46053, 0.52294, 0.61139, 0.7412, 0.53912, 0.62691, 0.54261, 0.44435, 0.52805, 0.2731, 0.55506, 0.41671, 0.59025, 0.6236, 0.67886, 0.62787, 0.41762, 0.37674, 0.35792, 0.28901, 0.28014, 0.39939, 0.50142, 0.68325, 0.48539, 0.50212, 0.68782, 0.20937, 0.58188, 0.64608, 0.25785, 0.76126, 0.57327, 0.46997, 0.2159, 0.69007, 0.49108, 0.84593, 0.4801, 0.5129, 0.49672, 0.48208, 0.37435, 0.33985, 0.61064, 0.34381, 0.42031, 0.45941, 0.31865, 0.36787, 0.45928, 0.52216, 0.61653, 0.74639, 0.90123, 0.45976, 0.3117, 0.71005, 0.45315, 0.49577, 0.53895, 0.48646])},
        'FullViT':  {'threshold': 0.1420, 'y_prob': np.array([0.12993, 0.06029, 0.08849, 0.07669, 0.18217, 0.18292, 0.09118, 0.09349, 0.53345, 0.13521, 0.09492, 0.11266, 0.17494, 0.07645, 0.24027, 0.11192, 0.76683, 0.8815, 0.46597, 0.06365, 0.10668, 0.06433, 0.04836, 0.05007, 0.0643, 0.12301, 0.16264, 0.07391, 0.07391, 0.13283, 0.04652, 0.05908, 0.07345, 0.06804, 0.08377, 0.07163, 0.4714, 0.28529, 0.06529, 0.07259, 0.9526, 0.08651, 0.05435, 0.06902, 0.0636, 0.13761, 0.19204, 0.16133, 0.21335, 0.20295, 0.07201, 0.056, 0.16248, 0.55405, 0.12749, 0.0996, 0.20071, 0.17054, 0.06949, 0.10388, 0.09203, 0.05874, 0.06631, 0.05903, 0.33729, 0.58027, 0.07784, 0.41318, 0.42406, 0.08889, 0.05551, 0.12906, 0.07493, 0.16788, 0.06758, 0.3221, 0.35872, 0.19079, 0.13782, 0.1519, 0.11924, 0.14911, 0.42992, 0.06059, 0.06449, 0.89474, 0.56318, 0.93889, 0.92236, 0.06635, 0.92992, 0.86005, 0.09734, 0.06619, 0.89093, 0.94419, 0.9534, 0.87489, 0.96373, 0.9628, 0.15671, 0.06098, 0.13262, 0.94153, 0.11984, 0.70053, 0.94692, 0.96072, 0.10373, 0.10697, 0.45704, 0.18328, 0.16085, 0.08045, 0.83311, 0.08182, 0.73762, 0.05585, 0.45673, 0.11337, 0.10041, 0.91667, 0.06341, 0.93277, 0.06805])},
        'LoRAViT':  {'threshold': 0.1970, 'y_prob': np.array([0.03835, 0.00967, 0.08349, 0.00199, 0.04442, 0.03993, 0.0268, 0.01891, 0.04987, 0.02345, 0.0986, 0.18338, 0.34956, 0.00172, 0.53077, 0.00269, 0.86514, 0.75101, 0.60674, 0.00082, 0.00382, 0.05953, 0.01399, 0.00142, 0.02257, 0.00334, 0.01051, 0.0006, 0.0006, 0.00282, 0.00363, 0.04027, 0.00213, 0.00136, 0.00345, 0.03285, 0.01786, 0.16296, 0.05907, 0.0714, 0.99979, 0.04229, 0.04338, 0.00436, 0.01195, 0.02157, 0.12856, 0.0152, 0.1181, 0.15919, 0.03191, 0.0066, 0.01031, 0.14538, 0.02136, 0.02292, 0.00981, 0.03245, 0.00817, 0.03757, 0.09825, 0.10824, 0.05985, 0.01106, 0.91454, 0.98618, 0.02998, 0.01476, 0.9136, 0.3109, 0.01101, 0.01085, 0.00202, 0.02842, 0.00401, 0.53773, 0.00665, 0.01291, 0.00634, 0.07092, 0.00959, 0.01394, 0.94477, 0.00292, 0.0105, 0.99885, 0.86445, 0.99898, 0.99965, 0.02182, 0.99755, 0.98525, 0.00846, 0.03103, 0.96131, 0.93501, 0.99883, 0.63109, 0.99975, 0.99969, 0.85113, 0.03847, 0.06566, 0.99962, 0.00077, 0.99958, 0.99927, 0.99974, 0.06481, 0.03007, 0.66292, 0.94876, 0.01486, 0.01312, 0.92788, 0.01456, 0.98749, 0.00485, 0.99727, 0.01677, 0.04623, 0.99946, 0.00986, 0.99821, 0.03102])},
        'BestCNN':  {'threshold': 0.5094, 'y_prob': np.array([0.06159, 0.00248, 0.18062, 0.01029, 0.31783, 0.00691, 0.01298, 0.00299, 0.21005, 0.00016, 0.20303, 0.12074, 0.00635, 0.00572, 0.92286, 0.69077, 0.96375, 0.05518, 0.77013, 0.00229, 0.00662, 0.03712, 0.06506, 0.00201, 0.18582, 0.00537, 0.00031, 5e-05, 5e-05, 0.00198, 0.00047, 0.0, 0.0, 0.0, 0.00131, 0.00555, 0.17975, 0.01065, 0.0005, 0.82026, 0.99986, 0.01226, 0.02067, 0.00297, 0.00051, 0.00215, 0.01109, 6e-05, 0.00934, 0.02712, 0.00058, 5e-05, 0.10246, 0.09837, 0.33431, 0.39951, 0.0047, 0.00481, 0.01954, 0.23256, 0.20693, 0.16582, 0.00135, 0.01976, 0.96436, 0.98824, 0.0, 0.11957, 0.94823, 0.69148, 0.00056, 0.00589, 0.00208, 0.00097, 0.00088, 0.12651, 0.06032, 0.11947, 0.06804, 0.0089, 0.00038, 0.99621, 0.99738, 0.15834, 0.17902, 0.98906, 0.99743, 0.97723, 0.62407, 0.00634, 0.72643, 0.98247, 0.00201, 0.00756, 0.99323, 0.99191, 0.97475, 0.9905, 0.99875, 0.98852, 0.90683, 0.00708, 0.46382, 0.97762, 0.00041, 0.97713, 0.99393, 0.98325, 0.3328, 0.01181, 0.95926, 0.97821, 0.01385, 0.00058, 0.99239, 0.00079, 0.98596, 0.0, 0.8759, 6e-05, 0.06255, 0.99949, 1e-05, 0.99777, 0.30581])}
    }
}

# Derive hard class predictions by cutting each model's probabilities at its own threshold
for dataset_name, data in (('Internal', data_internal), ('External', data_external)):
    for model_name, model_data in data['models'].items():
        threshold, y_prob = model_data['threshold'], model_data['y_prob']
        # model_data is the same dict object stored under data['models'][model_name]
        model_data['y_pred'] = (y_prob >= threshold).astype(int)

#---------------------------------------------------------------
# Define Model Pairs for Comparison
#---------------------------------------------------------------
# The LoRA ViT is the reference model; each pair sets it against one baseline
# (essential comparisons 1-3, in this order).
model_pairs = [('LoRAViT', baseline) for baseline in ('FullViT', 'ZeroShot', 'BestCNN')]

# Internal model key -> label used in the printed report
model_names_display = dict(
    LoRAViT='RD+LoRA (Best Config)',
    FullViT='RD fFT',
    ZeroShot='RD Zero-Shot',
    BestCNN='Best CNN (ResNeXt50)',
)

#---------------------------------------------------------------
# Perform and Print Comparisons
#---------------------------------------------------------------
alpha = 0.05 # Significance level

for dataset_name, data in [('Internal', data_internal), ('External', data_external)]:
    print(f"\n===== Comparisons for {dataset_name} Test Data =====")
    y_true = data['y_true']
    models = data['models']

    # --- Print individual model metrics first for context ---
    print("\n--- Individual Model Metrics ---")
    metrics_results = {}
    for name, model_data in models.items():
        try:
            y_prob = model_data['y_prob']
            y_pred = model_data['y_pred']
            auc = roc_auc_score(y_true, y_prob)
            acc = accuracy_score(y_true, y_pred)
            sens = recall_score(y_true, y_pred, pos_label=1, zero_division=0) # Recall
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0,1]).ravel()
            spec = tn / (tn + fp) if (tn + fp) > 0 else 0 # Specificity
            metrics_results[name] = {'AUC': auc, 'Accuracy': acc, 'Sensitivity': sens, 'Specificity': spec}
            print(f"{model_names_display[name]:<25}: AUC={auc:.4f}, Acc={acc:.4f}, Sens={sens:.4f}, Spec={spec:.4f}")
        except Exception as e:
            # Keep going with NaN placeholders so the pairwise section can skip this model.
            print(f"Could not calculate metrics for {model_names_display[name]}: {e}")
            metrics_results[name] = {'AUC': np.nan, 'Accuracy': np.nan, 'Sensitivity': np.nan, 'Specificity': np.nan}
    print("-" * 30)


    # --- Perform Pairwise Comparisons ---
    for model1_key, model2_key in model_pairs:
        if model1_key not in models or model2_key not in models:
            print(f"\nSkipping comparison ({model1_key} vs {model2_key}): Model data missing.")
            continue

        model1_data = models[model1_key]
        model2_data = models[model2_key]
        model1_name = model_names_display[model1_key]
        model2_name = model_names_display[model2_key]

        print(f"\n--- Comparing {model1_name} vs. {model2_name} ---")

        # --- AUC Comparison (DeLong Test) ---
        try:
            auc1 = metrics_results[model1_key]['AUC']
            auc2 = metrics_results[model2_key]['AUC']
            # Only run test if AUCs could be calculated
            if not (np.isnan(auc1) or np.isnan(auc2)):
                z_stat, p_value_delong = delong_test(y_true, model1_data['y_prob'], model2_data['y_prob'])
                if np.isnan(p_value_delong):
                    # delong_test signals "could not be performed" with NaN; a NaN must
                    # not be reported as "No significant difference".
                    print(f"  AUC Comparison (DeLong):         Skipped (test could not be performed)")
                else:
                    sig_str = " (Significant difference)" if p_value_delong < alpha else " (No significant difference)"
                    print(f"  AUC Comparison (DeLong):         z={z_stat:7.4f}, p={p_value_delong:7.4f}{sig_str}")
            else:
                print(f"  AUC Comparison (DeLong):         Skipped (AUC calculation failed earlier)")
        except Exception as e:
            print(f"  AUC Comparison (DeLong):         Error - {e}")

        # Get predictions
        y_pred1 = model1_data['y_pred']
        y_pred2 = model2_data['y_pred']

        # --- Accuracy / Sensitivity / Specificity Comparisons (McNemar Test) ---
        # Each entry: metric passed to perform_mcnemar_metric, and the exact
        # prefix printed when that comparison raises.
        for metric_name, error_prefix in (
            ('Accuracy', "  Accuracy Comparison (McNemar):   Error - "),
            ('Sensitivity', "  Sensitivity Comparison (McNemar):Error - "),
            ('Specificity', "  Specificity Comparison (McNemar):Error - "),
        ):
            try:
                result_str, p_val, test_type = perform_mcnemar_metric(metric_name, y_true, y_pred1, y_pred2, alpha)
                print(f"  {metric_name} Comparison (McNemar {test_type}): {result_str}")
            except Exception as e:
                print(f"{error_prefix}{e}")


    print("=" * 60)


===== Comparisons for Internal Test Data =====

--- Individual Model Metrics ---
RD Zero-Shot             : AUC=0.6473, Acc=0.6161, Sens=0.3333, Spec=0.8281
RD fFT                   : AUC=0.9443, Acc=0.8393, Sens=0.8958, Spec=0.7969
RD+LoRA (Best Config)    : AUC=0.9385, Acc=0.8750, Sens=0.8333, Spec=0.9062
Best CNN (ResNeXt50)     : AUC=0.9613, Acc=0.9196, Sens=0.9375, Spec=0.9062
------------------------------

--- Comparing RD+LoRA (Best Config) vs. RD fFT ---
  AUC Comparison (DeLong):         z=-6.1344, p= 0.0000 (Significant difference)
  Accuracy Comparison (McNemar Exact Binomial): p = 0.4807 (No significant difference)
  Sensitivity Comparison (McNemar Exact Binomial): p = 0.4531 (No significant difference)
  Specificity Comparison (McNemar Exact Binomial): p = 0.0654 (No significant difference)

--- Comparing RD+LoRA (Best Config) vs. RD Zero-Shot ---
  AUC Comparison (DeLong):         z=133.7489, p= 0.0000 (Significant difference)
  Accuracy Comparison (McNemar Chi-squared 

## Adding a comparison of LoRA configurations on top of the existing comparisons of LoRA(C+,M+) with RD_fFT, RD_zeroshot, and CNN

In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score, recall_score, precision_score, confusion_matrix
from scipy.stats import norm # For DeLong p-value
from mlxtend.evaluate import mcnemar_table, mcnemar
import warnings

# Silence every UserWarning for the rest of the session (added for warnings such as those from McNemar with small counts)
warnings.filterwarnings("ignore", category=UserWarning)

#---------------------------------------------------------------
# DeLong Test Implementation (Based on common adaptations)
#---------------------------------------------------------------
# Helpers defined below: _compute_midrank, _fast_delong_auc_variance, delong_test.
# delong_test compares the AUCs of two models scored on the same samples.
def _compute_midrank(x):
    J = np.argsort(x)
    Z = x[J]
    N = len(x)
    T = np.zeros(N, dtype=np.float64)
    i = 0
    while i < N:
        j = i
        while j < N and Z[j] == Z[i]:
            j += 1
        T[i:j] = 0.5 * (i + j - 1)
        i = j
    T2 = np.empty(N, dtype=np.float64)
    T2[J] = T + 1
    return T2

def _fast_delong_auc_variance(y_true, y_scores):
    """Calculates variance of AUC using Delong's method"""
    y_true = np.asarray(y_true)
    y_scores = np.asarray(y_scores)

    pos_idx = np.where(y_true == 1)[0]
    neg_idx = np.where(y_true == 0)[0]

    m = len(pos_idx)
    n = len(neg_idx)

    if m == 0 or n == 0:
        # Return NaN variance components if AUC is undefined
        nan_array_m = np.full(m, np.nan) if m > 0 else np.array([])
        nan_array_n = np.full(n, np.nan) if n > 0 else np.array([])
        return np.nan, nan_array_m, nan_array_n

    pos_scores = y_scores[pos_idx]
    neg_scores = y_scores[neg_idx]

    ranks = _compute_midrank(np.concatenate((pos_scores, neg_scores)))
    pos_ranks = ranks[:m]
    neg_ranks = ranks[m:]

    #auc = (np.sum(pos_ranks) / m - (m + 1) / 2) / n # AUC calculation not needed here

    v_10 = (pos_ranks / m - (m + 1) / (2 * m)) / n
    v_01 = 1 - (neg_ranks / n - (n + 1) / (2 * n)) / m

    var_auc = (np.var(v_10, ddof=1) / m + np.var(v_01, ddof=1) / n) if m > 1 and n > 1 else np.nan
    return var_auc, v_10, v_01

def delong_test(y_true, y_scores1, y_scores2):
    """
    Performs DeLong's test for comparing two correlated ROC curves.

    Both score vectors must refer to the same samples (same order as y_true).

    Args:
        y_true: True binary labels (0 or 1).
        y_scores1: Predicted probabilities/scores from model 1.
        y_scores2: Predicted probabilities/scores from model 2.
    Returns:
        z_stat: Z-statistic for the difference in AUCs (AUC1 - AUC2).
        p_value: Two-tailed p-value. Returns np.nan if test cannot be performed.
    """
    y_true = np.asarray(y_true)
    y_scores1 = np.asarray(y_scores1)
    y_scores2 = np.asarray(y_scores2)

    try:
        auc1 = roc_auc_score(y_true, y_scores1)
        auc2 = roc_auc_score(y_true, y_scores2)
    except ValueError:
        print("Warning: AUC calculation failed (likely only one class present). Skipping DeLong test.")
        return np.nan, np.nan

    var_auc1, v1_10, v1_01 = _fast_delong_auc_variance(y_true, y_scores1)
    var_auc2, v2_10, v2_01 = _fast_delong_auc_variance(y_true, y_scores2)

    # The helper yields NaN when a class has fewer than two samples, so past
    # this point both component vectors hold at least two entries.
    if np.isnan(var_auc1) or np.isnan(var_auc2):
        print("Warning: Variance calculation failed (each class needs at least two samples). Skipping DeLong test.")
        return np.nan, np.nan

    m = len(v1_10)  # number of positive samples
    n = len(v1_01)  # number of negative samples

    # Covariance between the two AUC estimates, built from the paired components.
    cov_10 = np.cov(v1_10, v2_10, ddof=1)[0, 1]
    cov_01 = np.cov(v1_01, v2_01, ddof=1)[0, 1]
    cov_auc = cov_10 / m + cov_01 / n

    var_diff = var_auc1 + var_auc2 - 2 * cov_auc

    if var_diff <= 1e-8:
        # Degenerate variance: the z-statistic cannot be formed by division.
        if np.isclose(auc1, auc2):
            # Identical AUCs -> no evidence of a difference.
            z_stat = 0.0
            p_value = 1.0
        else:
            # AUCs differ but the variance vanished (numerical issue?); report
            # the limiting values and warn that they are unreliable.
            z_stat = np.inf if auc1 > auc2 else -np.inf
            p_value = 0.0
            print(f"Warning: Non-zero AUC difference but near-zero variance ({var_diff}). Result may be unreliable.")
    else:
        z_stat = (auc1 - auc2) / np.sqrt(var_diff)
        # Survival function instead of 1 - cdf: avoids cancellation to exactly
        # 0.0 when |z| is large.
        p_value = 2.0 * norm.sf(np.abs(z_stat))

    return z_stat, p_value


#---------------------------------------------------------------
# McNemar Test Helper for Specific Metrics
#---------------------------------------------------------------
def perform_mcnemar_metric(metric_name, y_true, y_pred1, y_pred2, alpha=0.05):
    """Run a McNemar test comparing two classifiers on one paired metric.

    Each model's per-sample outcome is reduced to "correct" / "incorrect",
    the paired outcomes are tabulated in a 2x2 table and the table is handed
    to ``mcnemar``. Which samples are used and what "correct" means depends
    on ``metric_name``:

    * ``'Accuracy'``    - all samples; correct means ``y_pred == y_true``.
    * ``'Sensitivity'`` - samples with ``y_true == 1``; correct means ``y_pred == 1``.
    * ``'Specificity'`` - samples with ``y_true == 0``; correct means ``y_pred == 0``.

    Args:
        metric_name: One of ``'Accuracy'``, ``'Sensitivity'``, ``'Specificity'``.
        y_true: Ground-truth binary labels (array-like, 0/1).
        y_pred1: Hard predictions of model 1 (array-like, same shape as ``y_true``).
        y_pred2: Hard predictions of model 2 (array-like, same shape as ``y_true``).
        alpha: Significance level used only for the wording of the result string.

    Returns:
        Tuple ``(result_str, p_value, test_type)``:

        * normal case: formatted statistic/p-value text, the p-value, and
          ``"Exact Binomial"`` or ``"Chi-squared (corrected)"``;
        * no discordant pairs: p-value ``1.0`` and ``"N/A (No discordant pairs)"``;
        * empty subset: ``("N/A (No true positives|negatives)", nan, "N/A")``;
        * ``mcnemar`` raised ``ValueError``: ``("Error (...)", nan, test_type)``.

    Raises:
        ValueError: If ``metric_name`` is unknown or the three inputs do not
            share the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred1 = np.asarray(y_pred1)
    y_pred2 = np.asarray(y_pred2)

    # target_class is the label that both selects the subset and counts as a
    # "correct" prediction; None means "use every sample, compare to y_true".
    if metric_name == 'Accuracy':
        target_class, label, empty_msg = None, "overall accuracy", None
    elif metric_name == 'Sensitivity':
        target_class, label, empty_msg = 1, "sensitivity (recall)", "N/A (No true positives)"
    elif metric_name == 'Specificity':
        target_class, label, empty_msg = 0, "specificity", "N/A (No true negatives)"
    else:
        raise ValueError(f"Unknown metric for McNemar: {metric_name}")

    # McNemar is a paired test: every sample needs a label and both predictions.
    if not (y_true.shape == y_pred1.shape == y_pred2.shape):
        raise ValueError(
            "y_true, y_pred1 and y_pred2 must have the same shape; "
            f"got {y_true.shape}, {y_pred1.shape}, {y_pred2.shape}"
        )

    if target_class is None:
        correct1 = (y_pred1 == y_true)
        correct2 = (y_pred2 == y_true)
    else:
        subset = (y_true == target_class)
        if not subset.any():
            return empty_msg, np.nan, "N/A"
        correct1 = (y_pred1[subset] == target_class)
        correct2 = (y_pred2[subset] == target_class)

    # 2x2 table: rows = model 1 correct / incorrect, columns = model 2 correct / incorrect
    n11 = np.sum(correct1 & correct2)    # Both correct
    n10 = np.sum(correct1 & ~correct2)   # M1 correct, M2 incorrect (b)
    n01 = np.sum(~correct1 & correct2)   # M1 incorrect, M2 correct (c)
    n00 = np.sum(~correct1 & ~correct2)  # Both incorrect
    tb = np.array([[n11, n10], [n01, n00]])

    # Only the discordant cells (b and c) drive the test
    b = tb[0, 1]
    c = tb[1, 0]

    chi2_stat = np.nan
    if b + c == 0:
        # The two models never disagree on this subset, so there is nothing to test
        p_value = 1.0
        test_type = "N/A (No discordant pairs)"
    else:
        # Exact binomial test for few discordant pairs, corrected chi-square otherwise
        use_exact = (b + c) < 25  # Common threshold recommendation
        # Assigned BEFORE calling mcnemar so the except handler below can always
        # report which variant failed (previously this raised UnboundLocalError).
        test_type = "Exact Binomial" if use_exact else "Chi-squared (corrected)"

        try:
            if use_exact:
                chi2_stat, p_value = mcnemar(ary=tb, exact=True)  # Returns chi2=None
            else:
                chi2_stat, p_value = mcnemar(ary=tb, corrected=True)
        except ValueError as e:
            # Handle potential errors from mcnemar if table is degenerate
            print(f"  McNemar Error for {label} ({test_type}): {e}. Table:\n{tb}")
            return f"Error ({e})", np.nan, test_type

    # Format result string; the chi-square statistic is shown only when one exists
    significance_str = " (Significant difference)" if p_value < alpha else " (No significant difference)"
    if chi2_stat is not None and not np.isnan(chi2_stat):
        result_str = f"Chi2 = {chi2_stat:.4f}, p = {p_value:.4f}{significance_str}"
    else:
        result_str = f"p = {p_value:.4f}{significance_str}"

    return result_str, p_value, test_type


#---------------------------------------------------------------
# Data Loading and Preparation
#---------------------------------------------------------------
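# Hard-coded ground-truth labels plus, for every model, its decision threshold and predicted scores.
# NOTE: the scores are not guaranteed to lie in [0, 1]; the internal ZeroShot array contains 1.07304.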
# --- Internal Test Data ---
data_internal = {
    'y_true': np.array([1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]),
    'models': {
        # Per-model entries: 'threshold' is the cutoff applied to 'y_prob' to derive the hard predictions.
        'LoRA_Cminus_Mminus': {'threshold': 0.4057, 'y_prob': np.array([0.28943, 0.05165, 0.88538, 0.89714, 0.67821, 0.90539, 0.08662, 0.03572, 0.09329, 0.02601, 0.91961, 0.92807, 0.89002, 0.62212, 0.05364, 0.73571, 0.31696, 0.97008, 0.9448, 0.94597, 0.73171, 0.82896, 0.76156, 0.24105, 0.02875, 0.93487, 0.96988, 0.96246, 0.98589, 0.97309, 0.98111, 0.98153, 0.96271, 0.90394, 0.94475, 0.53165, 0.60794, 0.91284, 0.68404, 0.75507, 0.83889, 0.71291, 0.3127, 0.47997, 0.29263, 0.5248, 0.90684, 0.86228, 0.16945, 0.70941, 0.82669, 0.73329, 0.00785, 0.02893, 0.01082, 0.18994, 0.01227, 0.03158, 0.01151, 0.04131, 0.01133, 0.01211, 0.0161, 0.11824, 0.06536, 0.10414, 0.12191, 0.03084, 0.1304, 0.00943, 0.03835, 0.03814, 0.01137, 0.01424, 0.01046, 0.03049, 0.01049, 0.15534, 0.03218, 0.02372, 0.01847, 0.02937, 0.04215, 0.73981, 0.90166, 0.17665, 0.16859, 0.01066, 0.17217, 0.19161, 0.01204, 0.02, 0.02309, 0.05749, 0.05523, 0.09775, 0.11945, 0.15899, 0.18368, 0.05502, 0.06824, 0.24318, 0.04618, 0.09385, 0.03673, 0.04479, 0.01397, 0.02148, 0.04238, 0.17483, 0.30947, 0.0174, 0.02824, 0.04192])},
        'LoRA_Cplus_Mminus': {'threshold': 0.0807, 'y_prob': np.array([0.02127, 0.00706, 0.91849, 0.84588, 0.91548, 0.99638, 0.01081, 0.00848, 0.0124, 0.0059, 0.96042, 0.93105, 0.91033, 0.13811, 0.01852, 0.87781, 0.42998, 0.99849, 0.92537, 0.98387, 0.93191, 0.64124, 0.73293, 0.01598, 0.01392, 0.98359, 0.98074, 0.99434, 0.99873, 0.97591, 0.99589, 0.9957, 0.94921, 0.88432, 0.36437, 0.10656, 0.10422, 0.99305, 0.50275, 0.99263, 0.97579, 0.97162, 0.07599, 0.26307, 0.30215, 0.15989, 0.53003, 0.36029, 0.01593, 0.96022, 0.9943, 0.95203, 0.00352, 0.01031, 0.00815, 0.02211, 0.00812, 0.00141, 0.00684, 0.01201, 0.01074, 0.01153, 0.0047, 0.04683, 0.02316, 0.69202, 0.04455, 0.00601, 0.15362, 0.00797, 0.00944, 0.01402, 0.01177, 0.00769, 0.00784, 0.01248, 0.01191, 0.0208, 0.0105, 0.00994, 0.01039, 0.02835, 0.00781, 0.12734, 0.90272, 0.75354, 0.58625, 0.00931, 0.03051, 0.06865, 0.00803, 0.01635, 0.00666, 0.03697, 0.03888, 0.05699, 0.04831, 0.1007, 0.06115, 0.02977, 0.0587, 0.0868, 0.02678, 0.01633, 0.01861, 0.0162, 0.02011, 0.00896, 0.03187, 0.03931, 0.04821, 0.00966, 0.02189, 0.01339])},
        'LoRA_Cminus_Mplus': {'threshold': 0.7721, 'y_prob': np.array([0.21135, 0.10994, 0.92705, 0.98311, 0.98281, 0.99598, 0.08638, 0.07292, 0.13916, 0.22528, 0.97787, 0.97943, 0.98021, 0.46002, 0.37159, 0.8885, 0.87462, 0.99519, 0.99604, 0.98288, 0.94831, 0.97663, 0.96493, 0.83817, 0.08853, 0.99139, 0.99514, 0.99633, 0.99691, 0.9944, 0.9982, 0.99749, 0.99431, 0.98507, 0.97941, 0.63345, 0.96607, 0.99504, 0.93339, 0.99122, 0.99426, 0.97734, 0.43502, 0.8068, 0.7927, 0.75522, 0.76517, 0.54884, 0.20625, 0.98544, 0.98929, 0.98007, 0.01443, 0.04787, 0.02631, 0.20867, 0.0484, 0.02413, 0.02882, 0.0543, 0.03244, 0.04624, 0.03159, 0.22794, 0.08405, 0.85492, 0.16421, 0.02942, 0.25168, 0.02483, 0.05095, 0.13623, 0.02128, 0.04404, 0.03221, 0.06037, 0.05981, 0.18884, 0.10058, 0.07873, 0.03517, 0.06095, 0.0642, 0.22031, 0.94701, 0.62453, 0.88636, 0.05171, 0.14657, 0.32751, 0.0114, 0.02755, 0.02648, 0.23039, 0.04956, 0.15373, 0.19525, 0.34602, 0.25116, 0.17898, 0.48034, 0.66569, 0.07286, 0.23326, 0.06347, 0.0804, 0.01522, 0.02688, 0.12935, 0.1275, 0.32546, 0.12225, 0.10678, 0.12753])},
        'LoRA_Cplus_Mplus':  {'threshold': 0.3724, 'y_prob': np.array([0.03179, 0.01359, 0.96684, 0.99161, 0.97803, 0.99849, 0.01544, 0.00969, 0.01388, 0.03431, 0.98535, 0.99481, 0.99835, 0.96436, 0.03782, 0.97665, 0.95629, 0.99819, 0.99651, 0.99435, 0.98338, 0.97084, 0.9792, 0.62817, 0.01846, 0.99863, 0.99975, 0.99974, 0.99966, 0.99765, 0.99974, 0.99899, 0.99939, 0.99702, 0.99852, 0.97002, 0.64767, 0.99914, 0.85723, 0.99958, 0.9859, 0.99488, 0.16115, 0.53885, 0.99802, 0.77392, 0.96989, 0.8981, 0.06555, 0.96455, 0.99467, 0.71381, 0.00283, 0.00748, 0.0157, 0.02343, 0.00837, 0.00515, 0.00334, 0.00761, 0.00906, 0.00394, 0.02615, 0.03664, 0.00885, 0.89718, 0.13134, 0.00752, 0.22281, 0.00576, 0.10222, 0.00534, 0.00549, 0.00881, 0.00481, 0.00545, 0.00501, 0.14541, 0.01491, 0.05465, 0.00546, 0.02015, 0.0089, 0.92852, 0.98904, 0.49722, 0.89534, 0.00488, 0.0168, 0.01722, 0.00764, 0.02432, 0.01049, 0.04213, 0.03334, 0.02706, 0.16299, 0.07227, 0.04315, 0.03884, 0.02639, 0.08717, 0.01399, 0.02974, 0.01203, 0.01445, 0.00926, 0.00822, 0.0307, 0.10428, 0.03447, 0.01096, 0.01409, 0.00782])},
        
        'FullViT':  {'threshold': 0.2407, 'y_prob': np.array([0.18957, 0.1009, 0.88869, 0.66357, 0.80584, 0.8986, 0.08222, 0.08284, 0.11787, 0.08678, 0.8262, 0.88687, 0.91814, 0.81035, 0.11385, 0.26048, 0.3364, 0.92615, 0.86699, 0.86133, 0.26125, 0.31779, 0.84215, 0.13428, 0.19418, 0.86355, 0.94557, 0.93204, 0.94435, 0.92426, 0.94614, 0.94671, 0.91301, 0.94475, 0.94355, 0.14372, 0.27271, 0.94596, 0.22406, 0.93349, 0.83972, 0.47095, 0.10599, 0.81843, 0.89904, 0.51672, 0.69812, 0.23299, 0.0912, 0.73806, 0.89091, 0.90147, 0.06843, 0.07577, 0.06564, 0.08788, 0.06443, 0.09487, 0.07574, 0.07112, 0.08019, 0.0692, 0.06718, 0.09685, 0.08468, 0.25048, 0.07965, 0.0721, 0.07114, 0.05856, 0.07024, 0.1334, 0.07103, 0.09667, 0.09249, 0.07647, 0.07644, 0.10514, 0.07275, 0.09363, 0.08625, 0.09475, 0.08637, 0.3733, 0.65284, 0.18158, 0.81428, 0.07425, 0.11966, 0.08439, 0.08634, 0.07087, 0.10005, 0.11574, 0.12901, 0.11072, 0.08654, 0.09947, 0.16011, 0.09142, 0.43161, 0.12091, 0.07208, 0.0972, 0.06394, 0.07854, 0.06721, 0.05947, 0.07301, 0.09243, 0.09311, 0.06703, 0.07039, 0.07356])},
        'ZeroShot': {'threshold': 0.4681, 'y_prob': np.array([0.37193, 0.5513, 0.20357, 0.58973, 0.51485, 0.35936, 0.47041, 0.73009, 0.38454, 0.74099, 0.57401, 0.79839, 0.54692, 0.48542, 0.7644, 0.24875, 0.39424, 0.53298, 0.48618, 0.3839, 0.83502, 0.49433, 0.25595, 0.60274, 0.50413, 0.57592, 0.12062, 0.32676, 0.69016, 0.6048, 0.24091, 0.99967, 1.07304, 0.9935, 0.71226, 0.33887, 0.81826, 0.64358, 0.39828, 0.30082, 0.44524, 0.30213, 0.72094, 0.52377, 0.48799, 0.51961, 0.47577, 0.41267, 0.47874, 0.51813, 0.5876, 0.30259, 0.40317, 0.42042, 0.25179, 0.50942, 0.45253, 0.359, 0.43575, 0.34784, 0.23392, 0.36576, 0.57884, 0.43723, 0.33927, 0.39525, 0.47837, 0.38537, 0.26891, 0.38187, 0.38421, 0.4134, 0.5985, 0.45778, 0.49939, 0.33658, 0.52263, 0.33978, 0.48412, 0.32804, 0.46344, 0.51351, 0.33375, 0.78252, 0.50399, 0.52321, 0.62595, 0.50897, 0.67302, 0.60402, 0.63244, 0.46496, 0.53484, 0.28145, 0.40764, 0.3619, 0.32341, 0.28275, 0.5171, 0.41151, 0.46657, 0.58783, 0.56488, 0.52529, 0.5362, 0.54863, 0.4475, 0.4953, 0.42606, 0.44005, 0.51754, 0.56229, 0.53657, 0.50167])},
        'BestCNN':  {'threshold': 0.2128, 'y_prob': np.array([0.9895, 0.02907, 0.99715, 0.99738, 0.99912, 0.99883, 0.98794, 0.00021, 0.00164, 0.4184, 0.99691, 0.99509, 0.99343, 0.97639, 1e-04, 0.98171, 0.35199, 0.9982, 0.99947, 0.99402, 0.99428, 0.96959, 0.98527, 0.13061, 1e-05, 0.9909, 0.98288, 0.99995, 0.99975, 0.99837, 0.99984, 0.99844, 0.99874, 0.8504, 0.99163, 0.08799, 0.97438, 0.99959, 0.006, 0.9987, 0.9972, 0.98457, 0.51196, 0.1955, 0.87514, 0.941, 0.90358, 0.9989, 0.0, 0.97564, 0.99988, 0.99518, 4e-05, 5e-05, 0.0, 0.00108, 0.00025, 0.00067, 2e-05, 2e-05, 1e-05, 5e-05, 0.0003, 5e-05, 5e-05, 0.89086, 2e-05, 0.00052, 0.00149, 0.0, 1e-05, 0.0, 1e-05, 0.0, 0.0, 0.0, 8e-05, 0.00038, 2e-05, 3e-05, 5e-05, 0.0, 0.00123, 0.93978, 0.99933, 0.57134, 0.99888, 7e-05, 0.03104, 0.00081, 0.01016, 0.00146, 0.01001, 0.0056, 0.04145, 0.07386, 0.00155, 0.00847, 0.00292, 0.00167, 0.00312, 0.00061, 0.00039, 0.00139, 0.00088, 8e-05, 0.00104, 0.00492, 3e-05, 0.00049, 0.00262, 3e-05, 5e-05, 0.0])}
    }
}
data_external = {
    'y_true': np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0]),
    'models': {
        # Same model keys and thresholds as the internal set; 'y_prob' holds the external-set scores.
        'LoRA_Cminus_Mminus': {'threshold': 0.4057, 'y_prob': np.array([0.14953, 0.0603, 0.26759, 0.05697, 0.43834, 0.21079, 0.27369, 0.22665, 0.28478, 0.37719, 0.5774, 0.29764, 0.83379, 0.0107, 0.22801, 0.23476, 0.37978, 0.73498, 0.57103, 0.01258, 0.0356, 0.03196, 0.03034, 0.01115, 0.07946, 0.03368, 0.08643, 0.01382, 0.01382, 0.02996, 0.01507, 0.0217, 0.01463, 0.01724, 0.38342, 0.61335, 0.51944, 0.61916, 0.14135, 0.45385, 0.93356, 0.15349, 0.07893, 0.02722, 0.02846, 0.042, 0.31779, 0.07914, 0.22975, 0.38909, 0.09527, 0.07418, 0.10757, 0.46413, 0.13752, 0.62517, 0.58055, 0.25652, 0.01993, 0.39408, 0.36194, 0.12894, 0.07027, 0.12617, 0.63141, 0.75443, 0.1606, 0.22637, 0.95271, 0.65024, 0.04426, 0.04645, 0.0305, 0.16701, 0.01332, 0.48123, 0.17778, 0.11128, 0.11337, 0.28432, 0.08401, 0.17276, 0.33599, 0.0475, 0.07236, 0.90341, 0.6791, 0.90013, 0.82321, 0.03293, 0.75819, 0.54201, 0.15784, 0.25506, 0.75748, 0.89603, 0.93655, 0.47173, 0.94921, 0.98423, 0.33511, 0.0504, 0.07509, 0.95983, 0.03502, 0.87989, 0.96566, 0.93558, 0.37822, 0.28799, 0.5431, 0.3632, 0.07051, 0.09177, 0.50469, 0.14304, 0.26155, 0.06088, 0.33792, 0.01767, 0.0626, 0.79223, 0.04179, 0.76044, 0.18945])},
        'LoRA_Cplus_Mminus': {'threshold': 0.0807, 'y_prob': np.array([0.02127, 0.01426, 0.05951, 0.00744, 0.04312, 0.02286, 0.05039, 0.05733, 0.1348, 0.02642, 0.07329, 0.14484, 0.72258, 0.00349, 0.03521, 0.0226, 0.07507, 0.88829, 0.17005, 0.00512, 0.0048, 0.01602, 0.0115, 0.00977, 0.07445, 0.00539, 0.01037, 0.0043, 0.0043, 0.01808, 0.0073, 0.0108, 0.01068, 0.01197, 0.0052, 0.01509, 0.09526, 0.27032, 0.02626, 0.0501, 0.99881, 0.02398, 0.04952, 0.03231, 0.00322, 0.01324, 0.14183, 0.01646, 0.1221, 0.08307, 0.02173, 0.0057, 0.00894, 0.47503, 0.06991, 0.25106, 0.05472, 0.01766, 0.01097, 0.0387, 0.03346, 0.01566, 0.0087, 0.01138, 0.67194, 0.83719, 0.06114, 0.02347, 0.86214, 0.07698, 0.03851, 0.00864, 0.00632, 0.05097, 0.01229, 0.0946, 0.02274, 0.03531, 0.09372, 0.23004, 0.20769, 0.0416, 0.20823, 0.01024, 0.01061, 0.72251, 0.09961, 0.96375, 0.99369, 0.02628, 0.99432, 0.92544, 0.00634, 0.01981, 0.9347, 0.99368, 0.99076, 0.48932, 0.99595, 0.99578, 0.35726, 0.06092, 0.03859, 0.99792, 0.00288, 0.96147, 0.9953, 0.99676, 0.04565, 0.01394, 0.56526, 0.624, 0.01333, 0.01913, 0.97908, 0.22216, 0.5608, 0.00584, 0.86742, 0.00388, 0.01236, 0.99597, 0.02167, 0.98106, 0.03025])},
        'LoRA_Cminus_Mplus': {'threshold': 0.7721, 'y_prob': np.array([0.15885, 0.27301, 0.65814, 0.15528, 0.27356, 0.18845, 0.2636, 0.17401, 0.31956, 0.20893, 0.41738, 0.19802, 0.67897, 0.03266, 0.83517, 0.14818, 0.8951, 0.75667, 0.61317, 0.02599, 0.04553, 0.11677, 0.11262, 0.02673, 0.05911, 0.07578, 0.2023, 0.0299, 0.0299, 0.25053, 0.03461, 0.07017, 0.10271, 0.08272, 0.15286, 0.48489, 0.27739, 0.43685, 0.09366, 0.27501, 0.99508, 0.23367, 0.2768, 0.05947, 0.09709, 0.11413, 0.41218, 0.41334, 0.41056, 0.47038, 0.12497, 0.10743, 0.05277, 0.32845, 0.21054, 0.55789, 0.24086, 0.16787, 0.18919, 0.32815, 0.42545, 0.18297, 0.19292, 0.13078, 0.94869, 0.96602, 0.25598, 0.34722, 0.98619, 0.60632, 0.06564, 0.07737, 0.14594, 0.10445, 0.0248, 0.47979, 0.13992, 0.19377, 0.27565, 0.41755, 0.28844, 0.09497, 0.9281, 0.0862, 0.05798, 0.98506, 0.47353, 0.98962, 0.98647, 0.08814, 0.98436, 0.75879, 0.06216, 0.2316, 0.97549, 0.93648, 0.97998, 0.24489, 0.99415, 0.99786, 0.62308, 0.0679, 0.17319, 0.99547, 0.06625, 0.90418, 0.99152, 0.99434, 0.12669, 0.0915, 0.75638, 0.7243, 0.11852, 0.05045, 0.94731, 0.32845, 0.95546, 0.08967, 0.88849, 0.04049, 0.17335, 0.97895, 0.08427, 0.95263, 0.12619])},
        'LoRA_Cplus_Mplus':  {'threshold': 0.3724, 'y_prob': np.array([0.03434, 0.00959, 0.15749, 0.02232, 0.09559, 0.05969, 0.02916, 0.01516, 0.25188, 0.0546, 0.26537, 0.08547, 0.48939, 0.00496, 0.37601, 0.01906, 0.61691, 0.90985, 0.39261, 0.00253, 0.01208, 0.01805, 0.00976, 0.0091, 0.01728, 0.01003, 0.01698, 0.00804, 0.00804, 0.03356, 0.00521, 0.00787, 0.00593, 0.00562, 0.02386, 0.05674, 0.13154, 0.2377, 0.03891, 0.16077, 0.99869, 0.0239, 0.02553, 0.01224, 0.00739, 0.03401, 0.06967, 0.01835, 0.11816, 0.11086, 0.01719, 0.00444, 0.06133, 0.38011, 0.07073, 0.49741, 0.18246, 0.36338, 0.02449, 0.28469, 0.12515, 0.01197, 0.0313, 0.01748, 0.93029, 0.98257, 0.03068, 0.81478, 0.99879, 0.99253, 0.01247, 0.00551, 0.00896, 0.02917, 0.00725, 0.03765, 0.04259, 0.19015, 0.2212, 0.19263, 0.12805, 0.18668, 0.65452, 0.0236, 0.01992, 0.99716, 0.99123, 0.99777, 0.9995, 0.06375, 0.99284, 0.99023, 0.01427, 0.02471, 0.97817, 0.92073, 0.99826, 0.49731, 0.99892, 0.99973, 0.14224, 0.05567, 0.88097, 0.99969, 0.01571, 0.99908, 0.9994, 0.99909, 0.56375, 0.20318, 0.98352, 0.46712, 0.02593, 0.01491, 0.22221, 0.09441, 0.99714, 0.00574, 0.70502, 0.01915, 0.05119, 0.99338, 0.01217, 0.99852, 0.03577])},
        
        'FullViT':  {'threshold': 0.2407, 'y_prob': np.array([0.10286, 0.09393, 0.12861, 0.11827, 0.14531, 0.21072, 0.10866, 0.11883, 0.5201, 0.11908, 0.10469, 0.1276, 0.43939, 0.08736, 0.28971, 0.1179, 0.46235, 0.7952, 0.25777, 0.08506, 0.07121, 0.0765, 0.08391, 0.07224, 0.07434, 0.21013, 0.30655, 0.0726, 0.0726, 0.07994, 0.0706, 0.09018, 0.10422, 0.10542, 0.09201, 0.09962, 0.35377, 0.27274, 0.08806, 0.08543, 0.93212, 0.1313, 0.08378, 0.08336, 0.08947, 0.09625, 0.18976, 0.21228, 0.24454, 0.46392, 0.12475, 0.06343, 0.30339, 0.41773, 0.14639, 0.17083, 0.15823, 0.16557, 0.10916, 0.14689, 0.12782, 0.07468, 0.09999, 0.07975, 0.54118, 0.35648, 0.09907, 0.44147, 0.21182, 0.11432, 0.08784, 0.109, 0.08491, 0.20886, 0.08552, 0.36141, 0.42598, 0.2206, 0.26073, 0.1753, 0.16327, 0.16116, 0.18617, 0.08149, 0.08669, 0.80266, 0.31158, 0.9199, 0.90693, 0.07333, 0.62368, 0.78499, 0.13996, 0.12391, 0.79991, 0.89173, 0.92875, 0.86367, 0.94705, 0.9404, 0.16495, 0.07286, 0.25842, 0.86427, 0.12862, 0.4562, 0.93725, 0.93491, 0.1046, 0.13045, 0.32635, 0.15789, 0.13393, 0.09708, 0.76762, 0.13752, 0.67994, 0.09066, 0.61388, 0.13332, 0.14304, 0.91869, 0.10293, 0.89859, 0.10768])},
        'ZeroShot': {'threshold': 0.4681, 'y_prob': np.array([0.46126, 0.53647, 0.75116, 0.35425, 0.50483, 0.60889, 0.85738, 0.48357, 0.55024, 0.43463, 0.44873, 0.3591, 0.63808, 0.64984, 0.3644, 0.83021, 0.44398, 0.64485, 0.58173, 0.42758, 0.39882, 0.53893, 0.5503, 0.70008, 0.543, 0.55256, 0.71665, 0.48571, 0.48571, 0.73148, 0.29672, 0.48655, 0.35402, 0.56987, 0.76196, 0.4739, 0.38737, 0.57607, 0.37606, 0.74428, 0.39777, 0.22691, 0.52507, 0.80487, 0.39856, 0.44952, 0.38899, 0.40976, 0.51784, 0.50628, 0.5631, 0.45881, 0.29865, 0.3927, 0.75268, 0.55401, 0.46005, 0.44801, 0.48979, 0.44797, 0.41668, 0.37455, 0.65211, 0.46053, 0.52294, 0.61139, 0.7412, 0.53912, 0.62691, 0.54261, 0.44435, 0.52805, 0.2731, 0.55506, 0.41671, 0.59025, 0.6236, 0.67886, 0.62787, 0.41762, 0.37674, 0.35792, 0.28901, 0.28014, 0.39939, 0.50142, 0.68325, 0.48539, 0.50212, 0.68782, 0.20937, 0.58188, 0.64608, 0.25785, 0.76126, 0.57327, 0.46997, 0.2159, 0.69007, 0.49108, 0.84593, 0.4801, 0.5129, 0.49672, 0.48208, 0.37435, 0.33985, 0.61064, 0.34381, 0.42031, 0.45941, 0.31865, 0.36787, 0.45928, 0.52216, 0.61653, 0.74639, 0.90123, 0.45976, 0.3117, 0.71005, 0.45315, 0.49577, 0.53895, 0.48646])},
        'BestCNN':  {'threshold': 0.2128, 'y_prob': np.array([0.10316, 0.00044, 0.0011, 0.00077, 0.01477, 0.057, 0.00097, 0.00166, 0.44152, 0.00131, 0.0435, 0.01123, 0.06445, 0.00167, 0.09452, 0.00498, 0.95533, 0.43639, 0.4062, 0.00015, 7e-05, 0.91323, 0.00178, 0.01196, 0.6265, 0.0012, 4e-05, 0.00014, 0.00014, 0.00018, 1e-05, 0.0, 0.0, 0.0, 0.00678, 0.00446, 0.00476, 0.01403, 0.00174, 0.00012, 0.99993, 0.00031, 0.00069, 0.0035, 0.00051, 0.00101, 0.00399, 0.00247, 0.0053, 0.00425, 6e-05, 7e-05, 0.01001, 0.04687, 0.00077, 0.15662, 0.00427, 0.00603, 0.00299, 0.00298, 0.09952, 0.00167, 0.00081, 0.00074, 0.09069, 0.99849, 0.00011, 0.01908, 0.8852, 0.93992, 1e-05, 6e-05, 0.00031, 6e-05, 0.00012, 0.99939, 0.00597, 0.02314, 0.07447, 1e-04, 0.00039, 0.99493, 0.99912, 0.74217, 0.38573, 0.99745, 0.98499, 0.99234, 0.98749, 0.12524, 0.87089, 0.99654, 0.07623, 0.60851, 0.99457, 0.9961, 0.99684, 0.99352, 0.99987, 0.98822, 0.88722, 0.00481, 0.00279, 0.99937, 0.00301, 0.97548, 0.99947, 0.99817, 0.59263, 0.0014, 0.99895, 0.99907, 0.07725, 0.00227, 0.99941, 0.00222, 0.97781, 1e-04, 0.99979, 0.01384, 0.00737, 0.99745, 5e-05, 0.99982, 0.00148])}
    }
}

# Derive hard 0/1 predictions for every model in both datasets by
# thresholding its scores at that model's own cutoff (score >= cutoff -> 1).
for dataset in (data_internal, data_external):
    for entry in dataset['models'].values():
        entry['y_pred'] = (entry['y_prob'] >= entry['threshold']).astype(int)

#---------------------------------------------------------------
# Define Model Keys and Display Names
#---------------------------------------------------------------
# Dictionary keys of the four RD+LoRA variants (C+/C- and M+/M- as encoded in the key names)
(
    KEY_LORA_CP_MP,  # C+, M+ (labelled "Best" in the display names)
    KEY_LORA_CM_MP,  # C-, M+
    KEY_LORA_CP_MM,  # C+, M-
    KEY_LORA_CM_MM,  # C-, M-
) = (
    'LoRA_Cplus_Mplus',
    'LoRA_Cminus_Mplus',
    'LoRA_Cplus_Mminus',
    'LoRA_Cminus_Mminus',
)

# Model key -> human-readable name used in the printed report
model_names_display = dict([
    (KEY_LORA_CP_MP, 'RD+LoRA (C+, M+; Best)'),
    (KEY_LORA_CM_MP, 'RD+LoRA (C-, M+)'),
    (KEY_LORA_CP_MM, 'RD+LoRA (C+, M-)'),
    (KEY_LORA_CM_MM, 'RD+LoRA (C-, M-)'),
    ('FullViT', 'RD Full FT'),
    ('ZeroShot', 'RD Zero-Shot'),
    ('BestCNN', 'Best CNN (e.g., ResNet152)'),
])

# (model 1, model 2) pairs to compare: first the C+/M+ variant against each
# non-LoRA model, then the LoRA-vs-LoRA configuration comparisons.
model_pairs = [(KEY_LORA_CP_MP, other) for other in ('FullViT', 'ZeroShot', 'BestCNN')]
model_pairs += [
    (KEY_LORA_CP_MP, KEY_LORA_CM_MP),   # (C+, M+) vs. (C-, M+)
    (KEY_LORA_CP_MM, KEY_LORA_CM_MM),   # (C+, M-) vs. (C-, M-)
    (KEY_LORA_CP_MP, KEY_LORA_CP_MM),   # (C+, M+) vs. (C+, M-)
]

#---------------------------------------------------------------
# Perform and Print Comparisons
#---------------------------------------------------------------
alpha = 0.05  # Significance level used to word every test result below

# McNemar comparisons run for each model pair, in print order
mcnemar_metrics = ('Accuracy', 'Sensitivity', 'Specificity')

# NOTE(review): p-values are reported per test; no multiple-comparison
# correction is applied across pairs, metrics, or datasets.
for dataset_name, data in [('Internal', data_internal), ('External', data_external)]:
    print(f"\n===== Comparisons for {dataset_name} Test Data =====")
    y_true = data['y_true']
    models = data['models']

    # Width of the name column: at least 25, widened to the longest display
    # name so the metrics stay aligned (a fixed 25 was too narrow for
    # 'Best CNN (e.g., ResNet152)').
    name_width = max([25] + [len(model_names_display.get(key, key)) for key in models])

    # --- Print individual model metrics first for context ---
    print("\n--- Individual Model Metrics ---")
    metrics_results = {}
    for name, model_data in models.items():
        # Fall back to the raw key so a missing display name cannot raise
        # a second time inside the except handler below.
        display_name = model_names_display.get(name, name)
        try:
            y_prob = model_data['y_prob']
            y_pred = model_data['y_pred']
            auc = roc_auc_score(y_true, y_prob)
            acc = accuracy_score(y_true, y_pred)
            sens = recall_score(y_true, y_pred, pos_label=1, zero_division=0)  # Recall
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
            spec = tn / (tn + fp) if (tn + fp) > 0 else 0  # Specificity
            metrics_results[name] = {'AUC': auc, 'Accuracy': acc, 'Sensitivity': sens, 'Specificity': spec}
            print(f"{display_name:<{name_width}}: AUC={auc:.4f}, Acc={acc:.4f}, Sens={sens:.4f}, Spec={spec:.4f}")
        except Exception as e:
            # Deliberate best-effort: report the failure and keep going with NaN placeholders
            print(f"Could not calculate metrics for {display_name}: {e}")
            metrics_results[name] = {'AUC': np.nan, 'Accuracy': np.nan, 'Sensitivity': np.nan, 'Specificity': np.nan}
    print("-" * 30)

    # --- Perform Pairwise Comparisons ---
    for model1_key, model2_key in model_pairs:
        if model1_key not in models or model2_key not in models:
            print(f"\nSkipping comparison ({model1_key} vs {model2_key}): Model data missing.")
            continue

        model1_data = models[model1_key]
        model2_data = models[model2_key]
        model1_name = model_names_display.get(model1_key, model1_key)
        model2_name = model_names_display.get(model2_key, model2_key)

        print(f"\n--- Comparing {model1_name} vs. {model2_name} ---")

        # --- AUC Comparison (DeLong Test) ---
        try:
            auc1 = metrics_results[model1_key]['AUC']
            auc2 = metrics_results[model2_key]['AUC']
            # Only run test if AUCs could be calculated
            if not (np.isnan(auc1) or np.isnan(auc2)):
                z_stat, p_value_delong = delong_test(y_true, model1_data['y_prob'], model2_data['y_prob'])
                if np.isnan(p_value_delong):
                    # delong_test signals "could not run" with NaNs; do not
                    # present that as a non-significant result.
                    print(f"  AUC Comparison (DeLong):         Skipped (test returned NaN)")
                else:
                    sig_str = " (Significant difference)" if p_value_delong < alpha else " (No significant difference)"
                    print(f"  AUC Comparison (DeLong):         z={z_stat:7.4f}, p={p_value_delong:7.4f}{sig_str}")
            else:
                print(f"  AUC Comparison (DeLong):         Skipped (AUC calculation failed earlier)")
        except Exception as e:
            print(f"  AUC Comparison (DeLong):         Error - {e}")

        # Get predictions
        y_pred1 = model1_data['y_pred']
        y_pred2 = model2_data['y_pred']

        # --- Accuracy / Sensitivity / Specificity Comparisons (McNemar Test) ---
        for metric in mcnemar_metrics:
            try:
                result_str, p_val, test_type = perform_mcnemar_metric(metric, y_true, y_pred1, y_pred2, alpha)
                print(f"  {metric} Comparison (McNemar {test_type}): {result_str}")
            except Exception as e:
                # Pad the label to 33 columns so "Error" lines up with the DeLong rows
                error_label = f"{metric} Comparison (McNemar):"
                print(f"  {error_label:<33}Error - {e}")

    print("=" * 60)


===== Comparisons for Internal Test Data =====

--- Individual Model Metrics ---
RD+LoRA (C-, M-)         : AUC=0.9548, Acc=0.8947, Sens=0.8372, Spec=0.9296
RD+LoRA (C+, M-)         : AUC=0.9414, Acc=0.8772, Sens=0.9070, Spec=0.8592
RD+LoRA (C-, M+)         : AUC=0.9672, Acc=0.9035, Sens=0.8372, Spec=0.9437
RD+LoRA (C+, M+; Best)   : AUC=0.9541, Acc=0.9123, Sens=0.9302, Spec=0.9014
RD Full FT               : AUC=0.9473, Acc=0.8772, Sens=0.8372, Spec=0.9014
RD Zero-Shot             : AUC=0.5732, Acc=0.5526, Sens=0.6279, Spec=0.5070
Best CNN (e.g., ResNet152): AUC=0.9758, Acc=0.9123, Sens=0.9302, Spec=0.9014
------------------------------

--- Comparing RD+LoRA (C+, M+; Best) vs. RD Full FT ---
  AUC Comparison (DeLong):         z= 7.3035, p= 0.0000 (Significant difference)
  Accuracy Comparison (McNemar Exact Binomial): p = 0.2188 (No significant difference)
  Sensitivity Comparison (McNemar Exact Binomial): p = 0.1250 (No significant difference)
  Specificity Comparison (McNemar Exact