### **Metrics calculation rough work, discovering motivations, and creating additional data for formal metrics analysis later**

In [None]:
import pandas as pd
import os

In [None]:
# Load the four result tables used throughout this notebook. Paths are relative
# to the notebook's working directory.
# Baseline MCQ results (presumably the num_dot label style; confirm against the
# `label_style` column).
df_baseline = pd.read_csv('baseline_mcq.csv')
# MCQ results for the alternative option-label styles.
df_label_change_mcq = pd.read_csv('label_change_mcq.csv')
# True/False "structured" probe results.
df_tf = pd.read_csv('tf_structured.csv')
# True/False probe results for additional label styles (see the note on df_mix below).
df_mix = pd.read_csv('mixed_tf_label.csv')

Accuracy

In [None]:
df_baseline.is_correct.value_counts()

Unnamed: 0_level_0,count
is_correct,Unnamed: 1_level_1
True,8427
False,1744


In [None]:
df_baseline.is_correct.mean()

np.float64(0.8285321010716744)

In [None]:
df_tf.is_correct.value_counts()

Unnamed: 0_level_0,count
is_correct,Unnamed: 1_level_1
True,18332
False,2024


In [None]:
df_tf.is_correct.mean()

np.float64(0.9005698565533503)

df_mix contains 2 claims per MCQ for each of the alpha_dot and roman_dot label styles (it can be compared with tf_structured, which used the num_dot style)

In [None]:
df_mix.is_correct.value_counts()

Unnamed: 0_level_0,count
is_correct,Unnamed: 1_level_1
True,36586
False,4126


In [None]:
df_mix.is_correct.mean()

np.float64(0.8986539595205345)

### **Metrics analysis formally**

Calculating correctness metrics (Accuracy, macro Precision, Recall, F1)

In [None]:
from itertools import groupby
from typing_extensions import final
import pandas as pd
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def calculate_mcq_metrics(df, group_col=None, gold_col='answer_idx', pred_col='pred_mcq_idx'):
    """
    Compute macro precision / recall / F1 and accuracy, per group and overall.

    Rows with a missing gold label or a missing prediction are dropped before
    any metric is computed.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per answered question.
    group_col : str or None, default None
        Column to break the report down by. When None, only a single overall
        row is returned and no grouping column is added.
    gold_col : str, default 'answer_idx'
        Column holding the gold answer.
    pred_col : str, default 'pred_mcq_idx'
        Column holding the predicted answer.

    Returns
    -------
    pandas.DataFrame
        1-indexed report with columns ``precision``, ``recall``, ``f1_score``,
        ``accuracy`` (rounded to 4 decimals) and ``sample_count`` (integer).
        When ``group_col`` is given, the report starts with that column and
        ends with a row labelled ``'OVERALL'``.
    """
    scored = df.dropna(subset=[gold_col, pred_col])

    def compute_stats(y_true, y_pred):
        # 'macro' averaging treats every MCQ choice equally, regardless of how
        # often it is the gold answer.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='macro', zero_division=0
        )
        return pd.Series({
            'precision': precision,
            'recall': recall,
            'f1_score': f1,
            'accuracy': accuracy_score(y_true, y_pred),
            'sample_count': len(y_true)
        })

    overall_stats = compute_stats(scored[gold_col], scored[pred_col])

    if group_col is not None:
        # Select only the label and prediction columns so the grouping column
        # is not passed into the per-group computation.
        category_results = scored.groupby(group_col)[[gold_col, pred_col]].apply(
            lambda x: compute_stats(x[gold_col], x[pred_col])
        ).reset_index()
        # Only label the overall row when there is a grouping column; doing it
        # unconditionally used to create a column literally named `None`.
        overall_stats[group_col] = 'OVERALL'
        final_report = pd.concat(
            [category_results, pd.DataFrame([overall_stats])], ignore_index=True
        )
    else:
        final_report = pd.DataFrame([overall_stats])

    # 1-based row numbers for display.
    final_report.index = final_report.index + 1

    # The Series round-trip promotes the count to float; restore the integer.
    final_report['sample_count'] = final_report['sample_count'].astype(int)

    return final_report.round(4)

Baseline (MCQ and number dot label style)

In [None]:
# This is the first cell that writes into "metrics/", so make sure the folder
# exists; otherwise a fresh Restart & Run All fails on the to_csv call below.
os.makedirs("metrics", exist_ok=True)

temp = calculate_mcq_metrics(df_baseline)
temp.to_csv("metrics/baseline_accuracy_and_more.csv", index=False)
temp

Unnamed: 0,precision,recall,f1_score,accuracy,sample_count,None
1,0.8371,0.8267,0.8291,0.8285,10171.0,OVERALL


Label change (MCQ; label styles: number dot, alphabet parenthesis, number parenthesis, roman numeral parenthesis)

In [None]:
# Score the baseline rows together with the relabelled MCQ rows, one report
# line per label style plus an OVERALL line.
temp = calculate_mcq_metrics(
    pd.concat([df_baseline, df_label_change_mcq], ignore_index=True),
    group_col="label_style",
)
temp.to_csv("metrics/label_change_accuracy_and_more.csv", index=False)
temp

Unnamed: 0,label_style,precision,recall,f1_score,accuracy,sample_count
1,alpha_paren,0.8437,0.8424,0.8424,0.8424,10174.0
2,num_dot,0.8371,0.8267,0.8291,0.8285,10171.0
3,num_paren,0.8348,0.8226,0.8254,0.8248,10172.0
4,roman_paren,0.828,0.8143,0.8169,0.817,10178.0
5,OVERALL,0.834,0.8265,0.8284,0.8282,40695.0


Computing pair-wise robustness and coherence metrics for True/False-style questions, separately for each option-label style: PC (Prediction Consistency), LFR (Label Flip Rate), AR (Accuracy Retention), and WCA (Worst-Case Accuracy per seed)

In [None]:
import pandas as pd
import numpy as np

def tf_structured_metrics_per_category(df: pd.DataFrame, label_name):
    """
    Compute pair-level robustness metrics for one option-label style.

    Despite the name, the report currently contains a single ``OVERALL`` row;
    no per-category breakdown is produced.

    Metrics (pair-level ones use only ids whose expected-True and
    expected-False rows are both present among the valid rows):
    - PC:  share of pairs whose two predictions differ.
    - LFR: ``1 - PC``.
    - WCA: share of pairs where both rows are correct.
    - AR:  pairs with both rows correct divided by pairs whose expected-True
           row is correct (NaN when that denominator is 0).
    - Accuracy: mean of ``is_correct`` over all valid rows.

    Assumptions:
    - Columns: id, label_style, expected_tf, pred_tf, is_valid, is_correct
    - Each id should have two rows: one expected_tf=True, one expected_tf=False

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level results for one or more label styles.
    label_name
        Value of ``label_style`` to report on.

    Returns
    -------
    pandas.DataFrame
        One row (index 1) with columns ``label_style``, ``n_valid_rows``,
        ``n_pair_ids``, ``PC``, ``LFR``, ``AR``, ``WCA``, ``Accuracy``,
        ``category``; floats are rounded to 4 decimals. When no complete pair
        exists the pair metrics are NaN and ``n_pair_ids`` is 0, but the
        columns are still all present.
    """
    # Filter first; every later step returns a new frame, so `df` is never
    # modified and no full-size copy is needed.
    d = df[df["label_style"] == label_name]

    # 1. Drop rows with missing values in the critical columns, then cast to bool
    bool_cols = [
        c for c in ("expected_tf", "pred_tf", "is_valid", "is_correct")
        if c in d.columns
    ]
    if bool_cols:
        d = d.dropna(subset=bool_cols).astype({c: bool for c in bool_cols})

    # 2. Keep only valid rows
    d_valid = d[d["is_valid"]]

    def calculate_group_metrics(subset):
        # Start from the "no usable pairs" answer so every exit path returns
        # the same set of keys.
        metrics = {
            "label_style": label_name,
            "n_valid_rows": int(len(subset)),
            "n_pair_ids": 0,
            "PC": np.nan,
            "LFR": np.nan,
            "AR": np.nan,
            "WCA": np.nan,
            "Accuracy": subset["is_correct"].mean() if len(subset) else np.nan,
        }
        if subset.empty:
            return metrics

        # One row per id, one column per expected_tf value (False / True).
        pivot_pred = subset.pivot_table(index="id", columns="expected_tf", values="pred_tf", aggfunc="first")
        pivot_corr = subset.pivot_table(index="id", columns="expected_tf", values="is_correct", aggfunc="first")

        # Without both an expected-True and an expected-False column there are no pairs.
        if True not in pivot_pred.columns or False not in pivot_pred.columns:
            return metrics

        # Keep only ids that have both rows.
        complete_pred = pivot_pred.dropna(subset=[True, False])
        if complete_pred.empty:
            return metrics
        complete_corr = pivot_corr.loc[complete_pred.index]

        # The pivot yields object dtype when some ids were incomplete; cast
        # back to bool so the comparisons below are plain boolean operations.
        pred_T = complete_pred[True].astype(bool)
        pred_F = complete_pred[False].astype(bool)
        corr_T = complete_corr[True].astype(bool)
        corr_F = complete_corr[False].astype(bool)

        both_correct = corr_T & corr_F
        PC = (pred_T != pred_F).mean()
        denom = corr_T.sum()

        metrics.update({
            "n_pair_ids": int(len(complete_pred)),
            "PC": PC,
            "LFR": 1.0 - PC,
            "AR": (both_correct.sum() / denom) if denom > 0 else np.nan,
            "WCA": both_correct.mean(),
        })
        return metrics

    # 3. Overall metrics for the selected label style
    overall_metrics = calculate_group_metrics(d_valid)
    overall_metrics["category"] = "OVERALL"

    # 4. Format as a 1-indexed, rounded one-row report
    report = pd.DataFrame([overall_metrics])
    report.index = report.index + 1

    return report.round(4)

# Usage:
# results = tf_structured_metrics_per_category(df, label_name="num_dot")
# print(results)

# Usage:
# results = tf_structured_metrics_per_category(df)
# print(results)

In [None]:
# Combining metrics from the original tf_structured csv and
# mix_tf csv (the csv that had the tf_structured questions except
# with different option-label styles)
df_tf_multiple_styles = pd.concat([df_tf, df_mix], ignore_index=True)

In [None]:
# Model inference metrics on T/F style questions and num_dot option-label style
temp = tf_structured_metrics_per_category(df_tf_multiple_styles, label_name="num_dot")
temp.to_csv("metrics/tf_metrics_num_dot.csv", index=False)
temp

Unnamed: 0,label_style,n_valid_rows,n_pair_ids,PC,LFR,AR,WCA,Accuracy,category
1,num_dot,20356,10178,0.8503,0.1497,0.9079,0.8257,0.9006,OVERALL


In [None]:
# Model inference metrics on T/F style questions and alpha_dot option-label style
temp = tf_structured_metrics_per_category(df_tf_multiple_styles, label_name="alpha_dot")
temp.to_csv("metrics/tf_metrics_alpha_dot.csv", index=False)
temp

Unnamed: 0,label_style,n_valid_rows,n_pair_ids,PC,LFR,AR,WCA,Accuracy,category
1,alpha_dot,20356,10178,0.8488,0.1512,0.9145,0.8235,0.8991,OVERALL


In [None]:
# Model inference metrics on T/F style questions and roman_dot option-label style
temp = tf_structured_metrics_per_category(df_tf_multiple_styles, label_name="roman_dot")
temp.to_csv("metrics/tf_metrics_roman_dot.csv", index=False)
temp

Unnamed: 0,label_style,n_valid_rows,n_pair_ids,PC,LFR,AR,WCA,Accuracy,category
1,roman_dot,20356,10178,0.8478,0.1522,0.9119,0.8221,0.8982,OVERALL


Obtaining pair-outcome distributions (TT, TF, FT, FF)

In [None]:
import pandas as pd

def calculate_tf_outcome_distributions(df: pd.DataFrame, label_name: str, group_col=None, expected_col='expected_tf', pred_col='pred_tf'):
    """
    Count expected/predicted True-False combinations for one label style.

    Only rows whose ``label_style`` equals ``label_name`` and that have both an
    expected value and a prediction are counted. In each two-letter outcome the
    first letter is the expected value and the second the predicted one, e.g.
    ``TF`` = expected True, predicted False.

    Returns a 1-indexed DataFrame with ``label_style``, optionally
    ``group_col`` (one row per group plus a final 'OVERALL' row), and the
    counts ``TT``, ``FF``, ``TF``, ``FT`` and ``Total``.
    """
    rows = (
        df[df.label_style == label_name]
        .dropna(subset=[expected_col, pred_col])
        .astype({expected_col: bool, pred_col: bool})
    )

    def tally(frame):
        # Both columns are boolean here, so masks combine directly.
        expected = frame[expected_col]
        predicted = frame[pred_col]
        return pd.Series({
            'TT': (expected & predicted).sum(),
            'FF': (~expected & ~predicted).sum(),
            'TF': (expected & ~predicted).sum(),
            'FT': (~expected & predicted).sum(),
            'Total': len(frame)
        })

    overall_row = tally(rows)
    report_columns = ['label_style', 'TT', 'FF', 'TF', 'FT', 'Total']

    if group_col is None:
        # Ungrouped: the report is just the overall counts.
        overall_row['label_style'] = label_name
        report = pd.DataFrame([overall_row])
    else:
        # Grouped: one row per group value, followed by the overall counts.
        per_group = rows.groupby(group_col).apply(tally, include_groups=False).reset_index()
        per_group['label_style'] = label_name
        overall_row[group_col] = 'OVERALL'
        overall_row['label_style'] = label_name
        report = pd.concat([per_group, pd.DataFrame([overall_row])], ignore_index=True)
        report_columns.insert(1, group_col)

    # 1-based row numbers for display.
    report.index = report.index + 1
    return report[report_columns]

In [None]:
# Stack the num_dot T/F results and the other-label-style T/F results into one frame.
df_tf_multiple_styles = pd.concat([df_tf, df_mix], ignore_index=True)

# exist_ok=True makes this a no-op when the folder is already there and avoids
# the check-then-create race of `if not os.path.exists(...)`.
os.makedirs("metrics", exist_ok=True)

In [None]:
# Apply the function to your combined True/False dataframe
temp = calculate_tf_outcome_distributions(df_tf_multiple_styles, label_name="num_dot")
temp.to_csv("metrics/tf_outcome_distributions_num_dot.csv", index=False)
temp

Unnamed: 0,label_style,TT,FF,TF,FT,Total
1,num_dot,9257,9075,921,1103,20356


In [None]:
# Apply the function to your combined True/False dataframe
temp = calculate_tf_outcome_distributions(df_tf_multiple_styles, label_name="alpha_dot")
temp.to_csv("metrics/tf_outcome_distributions_alpha_dot.csv", index=False)
temp

Unnamed: 0,label_style,TT,FF,TF,FT,Total
1,alpha_dot,9166,9137,1012,1041,20356


In [None]:
# Apply the function to your combined True/False dataframe
temp = calculate_tf_outcome_distributions(df_tf_multiple_styles, label_name="roman_dot")
temp.to_csv("metrics/tf_outcome_distributions_roman_dot.csv", index=False)
temp

Unnamed: 0,label_style,TT,FF,TF,FT,Total
1,roman_dot,9175,9108,1003,1070,20356


In [None]:
import pandas as pd

def calculate_tf_outcome_distributions_overall(df: pd.DataFrame, group_col='label_style', expected_col='expected_tf', pred_col='pred_tf'):
    """
    Count expected/predicted True-False combinations per group and overall.

    Unlike ``calculate_tf_outcome_distributions`` this does not filter by label
    style: every row that has both an expected value and a prediction is
    counted. In each two-letter outcome the first letter is the expected value
    and the second the predicted one, e.g. ``FT`` = expected False, predicted
    True.

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level True/False results.
    group_col : str or None, default 'label_style'
        Single column to break the counts down by. With None only the overall
        row is returned, labelled in a ``label_style`` column so the output
        schema stays the same as the default call.
    expected_col, pred_col : str
        Columns holding the expected and the predicted truth value.

    Returns
    -------
    pandas.DataFrame
        1-indexed table with the grouping column followed by ``TT``, ``FF``,
        ``TF``, ``FT`` and ``Total``; the last row is labelled ``'OVERALL'``.
    """
    d = df.dropna(subset=[expected_col, pred_col])
    expected = d[expected_col].astype(bool).to_numpy()
    predicted = d[pred_col].astype(bool).to_numpy()

    # One indicator column per outcome; summing them (per group or overall)
    # gives the counts without a Python-level apply.
    outcomes = pd.DataFrame({
        'TT': expected & predicted,
        'FF': ~expected & ~predicted,
        'TF': expected & ~predicted,
        'FT': ~expected & predicted,
    }).astype('int64')
    outcomes['Total'] = 1

    # The OVERALL label lives in the grouping column itself (previously it was
    # hard-coded to 'label_style', which broke any other group_col).
    label_col = group_col if group_col is not None else 'label_style'

    frames = []
    if group_col is not None:
        per_group = (
            outcomes.groupby(d[group_col].to_numpy())
            .sum()
            .rename_axis(group_col)
            .reset_index()
        )
        if not per_group.empty:
            frames.append(per_group)

    overall_row = {label_col: 'OVERALL', **outcomes.sum().to_dict()}
    frames.append(pd.DataFrame([overall_row]))

    final_report = pd.concat(frames, ignore_index=True)
    # 1-based row numbers for display.
    final_report.index = final_report.index + 1
    return final_report[[label_col, 'TT', 'FF', 'TF', 'FT', 'Total']]

In [None]:
# Apply the function to your combined True/False dataframe
temp = calculate_tf_outcome_distributions_overall(df_tf_multiple_styles)
temp.to_csv("metrics/tf_outcome_distributions_all_label_styles.csv", index=False)
temp

Unnamed: 0,label_style,TT,FF,TF,FT,Total
1,alpha_dot,9166,9137,1012,1041,20356
2,num_dot,9257,9075,921,1103,20356
3,roman_dot,9175,9108,1003,1070,20356
4,OVERALL,27598,27320,2936,3214,61068


True/False prediction-pair outcomes per seed

In [None]:
df_tf_multiple_styles = pd.concat([df_tf, df_mix], ignore_index=True)

In [None]:
# Create the output folder if needed; exist_ok=True makes re-running this cell safe.
os.makedirs("metrics", exist_ok=True)

In [None]:
import pandas as pd

# Build one "pair type" per (label_style, id): the two predictions made for the
# same id, written as two letters (T/F).
#
# Rows without a prediction are removed *before* the boolean cast, because
# astype(bool) turns NaN into True and would silently count a missing answer
# as a "T" prediction. dropna/assign return new frames, so the combined frame
# itself is left untouched.
df = (
    df_tf_multiple_styles
    .dropna(subset=["pred_tf"])
    .assign(pred_tf=lambda frame: frame["pred_tf"].astype(bool))
)

# Use expected_tf as the within-id ordering column (expected-True row first),
# so the first letter of the pair type is always the prediction for the
# expected-True row instead of depending on the row order of the input files.
df = df.sort_values(["label_style", "id", "expected_tf"], ascending=[True, True, False])

# position within each (label_style, id)
df["pos"] = df.groupby(["label_style", "id"]).cumcount()
df2 = df[df["pos"] < 2].copy()

# wide pairs (keep only complete pairs); reindex guarantees both position
# columns exist even when the input is empty or no id has a second row.
wide = (
    df2.pivot(index=["label_style", "id"], columns="pos", values="pred_tf")
       .reindex(columns=[0, 1])
       .dropna(subset=[0, 1])
)

# internal pair label: first letter = prediction at position 0, second = position 1
pair_type = (
    wide[0].map({True: "T", False: "F"}) +
    wide[1].map({True: "T", False: "F"})
)

pairs = pd.DataFrame({
    "label_style": wide.index.get_level_values(0),
    "pair_type": pair_type.values
})

# Column order used in the per-style reports.
pair_order = ["TF", "FF", "TT", "FT"]

def report_for_style(style: str) -> pd.DataFrame:
    """
    Return a one-row table of pair-type counts for a single label style.

    Reads the module-level ``pairs`` frame and ``pair_order`` list. The result
    has the columns ``label_style``, one count column per entry of
    ``pair_order`` (0 when a pair type never occurs) and ``n_seeds``, the sum
    of those counts.
    """
    style_pair_types = pairs.loc[pairs["label_style"] == style, "pair_type"]
    counts = style_pair_types.value_counts().reindex(pair_order, fill_value=0)

    # Assemble the row directly instead of transposing the counts.
    row = {"label_style": style}
    row.update(counts.to_dict())
    row["n_seeds"] = int(counts.sum())

    return pd.DataFrame([row], columns=["label_style"] + pair_order + ["n_seeds"])

In [None]:
label = "num_dot"
temp = report_for_style(label)
temp.to_csv(f"metrics/tf_outcomes_per_seed_{label}.csv", index = False)
temp

Unnamed: 0,label_style,TF,FF,TT,FT,n_seeds
0,num_dot,8404,671,853,250,10178


In [None]:
label = "alpha_dot"
temp = report_for_style(label)
temp.to_csv(f"metrics/tf_outcomes_per_seed_{label}.csv", index = False)
temp

Unnamed: 0,label_style,TF,FF,TT,FT,n_seeds
0,alpha_dot,8382,755,784,257,10178


In [None]:
label = "roman_dot"
temp = report_for_style(label)
temp.to_csv(f"metrics/tf_outcomes_per_seed_{label}.csv", index = False)
temp

Unnamed: 0,label_style,TF,FF,TT,FT,n_seeds
0,roman_dot,8367,741,808,262,10178
