In [4]:
from __future__ import annotations
from typing import Dict, List, Optional, Tuple
import os, re, csv
import numpy as np
import pandas as pd

# ---------- Canonical columns & Bloom sequence ----------
# Column names assigned, in this order, to the fields of every header-less CSV
# row (read_csv_no_header pads or merges each row to exactly len(COLS) fields).
COLS = [
    "Understandable",
    "TopicRelated",
    "Grammatical",
    "Clear",
    "Rephrase",
    "Answerable",
    "Central",
    "WouldYouUseIt",
    # NOTE: the apostrophe below is the typographic one (U+2019), not ASCII "'";
    # every lookup of this column must use the identical string.
    "Bloom’sLevel",
]
# Bloom labels in the order rows are assumed to cycle through; consumed by
# infer_intended_series to derive the intended level of each row.
BLOOM_SEQ = ["remember", "understand", "apply", "analyze", "evaluate", "create"]

# ---------- Robust CSV loader (no header), preserves blank lines ----------
def read_csv_no_header(path: str, delimiter: str = ",", encoding: Optional[str] = None) -> pd.DataFrame:
    """
    Load a header-less CSV into a DataFrame whose columns are exactly COLS.

    Every physical record becomes one row, so row positions stay aligned
    with the file:
    - a blank line turns into a row of empty strings;
    - a short record is right-padded with empty strings;
    - a long record keeps its first len(COLS) - 1 fields and folds the
      remainder, re-joined with `delimiter`, into the last column.

    When `encoding` is not given the file is opened as "utf-8-sig".
    """
    width = len(COLS)
    normalized: List[List[str]] = []

    with open(path, "r", encoding=encoding or "utf-8-sig", newline="") as handle:
        for fields in csv.reader(handle, delimiter=delimiter):
            if not fields:
                # csv.reader yields an empty list for a blank line
                normalized.append([""] * width)
                continue

            shortfall = width - len(fields)
            if shortfall > 0:
                fields = list(fields) + [""] * shortfall
            elif shortfall < 0:
                # fold the overflow into the final column so the row count is unchanged
                fields = fields[: width - 1] + [delimiter.join(fields[width - 1 :])]
            normalized.append(fields)

    return pd.DataFrame(normalized, columns=COLS)

# ---------- Helpers: normalization & layout checks ----------
def _normalize_yes_no(s: pd.Series) -> pd.Series:
    return s.astype(str).str.strip().str.lower()

def infer_intended_series(n_rows: int) -> pd.Series:
    """
    Build the intended Bloom level for each of `n_rows` rows by repeating BLOOM_SEQ.

    The returned Series is indexed 0..n_rows-1 and named "IntendedBloom_Inferred".

    Raises:
        ValueError: if `n_rows` is not a multiple of 6.
    """
    full_cycles, leftover = divmod(n_rows, 6)
    if leftover:
        raise ValueError(
            "Row count must be a multiple of 6 to infer intended Bloom levels "
            "(sequence: remember→understand→apply→analyze→evaluate→create)."
        )
    return pd.Series(BLOOM_SEQ * full_cycles, index=range(n_rows), name="IntendedBloom_Inferred")

def _row_is_all_empty(row: pd.Series) -> bool:
    return all(str(x).strip() == "" for x in row.values.tolist())

def enforce_expected_length(filename: str, df: pd.DataFrame, expected: int) -> Tuple[pd.DataFrame, str]:
    """
    Reconcile a surplus of rows by peeling fully empty rows off the edges.

    Trailing empty rows are removed first, then leading ones, and only while
    the frame is still longer than `expected`. Interior rows are never
    touched, so positional mapping is preserved.

    Returns the (possibly trimmed) frame and a status string:
    "OK", "TRIMMED_EMPTY_EDGES (...)" or "MISMATCH (...)".
    `filename` is accepted for the caller's convenience and is not used here.
    """
    starting_len = len(df)
    if starting_len == expected:
        return df, "OK"

    status = f"got={starting_len}, expected={expected}"

    # (row to inspect, slice that discards it): the tail first, then the head
    edges = ((-1, slice(None, -1)), (0, slice(1, None)))
    for probe, remainder in edges:
        while len(df) > expected and _row_is_all_empty(df.iloc[probe]):
            df = df.iloc[remainder].reset_index(drop=True)

    if len(df) != expected:
        # Not fixable automatically (interior extras or genuinely missing rows)
        return df, f"MISMATCH ({status})"
    return df, f"TRIMMED_EMPTY_EDGES ({status} → {len(df)})"

# ---------- Core metric on a DF SLICE ----------
def compute_quality_and_skill(df: pd.DataFrame, verbose: bool = False, export_with_flag: Optional[str] = None) -> Dict[str, float]:
    """
    Score one slice of evaluation rows.

    A row is high-quality when it is understandable, grammatical and
    answerable, and additionally either
      (A) Clear is "yes" and WouldYouUseIt is "yes" or "maybe", or
      (B) Clear is "yes" or "more_or_less" and Rephrase is "yes".
    Skill match is measured among high-quality rows only: the evaluator's
    Bloom level must equal the level inferred from the row's position.

    Args:
        df: rows carrying the COLS columns; the input is not modified.
        verbose: accepted but not used by this function.
        export_with_flag: when truthy, the normalized rows plus the
            HighQuality flag are written to "<export_with_flag>.csv".

    Returns:
        Dict with the total row count, high-quality count and percentage,
        and skill-match count and percentage (percentages rounded to 2 dp).

    Raises:
        ValueError: propagated from infer_intended_series when the row
            count is not a multiple of 6.
    """
    frame = df.copy()

    # Lower-case/trim every column that takes part in a rule
    for column in ("Understandable", "Grammatical", "Clear", "Rephrase", "Answerable", "WouldYouUseIt", "Bloom’sLevel"):
        frame[column] = _normalize_yes_no(frame[column])

    # Requirements shared by both high-quality definitions
    baseline = (
        (frame["Understandable"] == "yes")
        & (frame["Grammatical"] == "yes")
        & (frame["Answerable"] == "yes")
    )
    strict_rule = baseline & (frame["Clear"] == "yes") & frame["WouldYouUseIt"].isin(["yes", "maybe"])
    lenient_rule = baseline & frame["Clear"].isin(["yes", "more_or_less"]) & (frame["Rephrase"] == "yes")
    frame["HighQuality"] = strict_rule | lenient_rule

    total = len(frame)
    hq_count = int(frame["HighQuality"].sum())
    quality_pct = (hq_count / total * 100.0) if total else 0.0

    # Inferred even when nothing is high-quality so layout errors still surface
    intended = infer_intended_series(total)

    skill_match_count = 0
    skill_match_pct = 0.0
    if hq_count > 0:
        rated = frame.loc[frame["HighQuality"], "Bloom’sLevel"]
        hits = rated == intended.loc[rated.index]
        skill_match_count = int(hits.sum())
        skill_match_pct = float(hits.mean() * 100.0)

    if export_with_flag:
        frame.to_csv(f"{export_with_flag}.csv", index=False)

    return {
        "Total Questions": total,
        "High-quality (#)": hq_count,
        "High-quality (%)": round(quality_pct, 2),
        "Skill Match (# among High-quality)": skill_match_count,
        "Skill Match (% among High-quality)": round(skill_match_pct, 2),
    }

# ---------- Aggregate per ACTUAL model across ALL topics within ONE CSV ----------
def compute_aggregate_per_model(
    df: pd.DataFrame,
    model_labels_in_topic_order: List[str],
    num_topics: int,
    rows_per_model: int = 6,
) -> Dict[str, Dict[str, float]]:
    """
    Gather each model's rows from every topic block of one CSV and score them.

    The frame is laid out topic by topic; inside a topic the models appear in
    `model_labels_in_topic_order`, each contributing `rows_per_model`
    consecutive rows. The scattered slices of a model are concatenated and
    passed to compute_quality_and_skill.

    Returns:
        Mapping of model label to its metrics dict.

    Raises:
        ValueError: if len(df) differs from models * rows_per_model * num_topics.
    """
    n_models = len(model_labels_in_topic_order)
    rows_per_topic = n_models * rows_per_model
    expected_total = rows_per_topic * num_topics

    if len(df) != expected_total:
        raise ValueError(
            f"CSV row count {len(df)} doesn't match expected {expected_total} "
            f"(models per topic={n_models}, topics={num_topics}, rows_per_model={rows_per_model})."
        )

    per_model: Dict[str, Dict[str, float]] = {}
    for position, label in enumerate(model_labels_in_topic_order):
        offset_in_topic = position * rows_per_model
        starts = (topic * rows_per_topic + offset_in_topic for topic in range(num_topics))
        pieces = [df.iloc[begin : begin + rows_per_model] for begin in starts]
        stitched = pd.concat(pieces, axis=0).reset_index(drop=True)
        per_model[label] = compute_quality_and_skill(stitched, verbose=False)
    return per_model

# ---------- Public: compute tidy DF over many file tags, PER PROMPT ----------
def run_eval_to_dataframe_per_prompt(
    prompts: List[str],
    file_name_tags: List[str],
    data_model_order: List[str],
    num_topics_per_prompt: int,
    rows_per_model: int = 6,
    base_dir: Optional[str] = None,
    delimiter: str = ",",
) -> pd.DataFrame:
    """
    Compute per-model quality/skill metrics for every "<prompt>_<tag>.csv" file.

    Args:
        prompts: prompt identifiers forming the first part of each filename.
        file_name_tags: evaluator tags forming the second part of each
            filename; a trailing "_clean" is dropped for the "Eval-Model" column.
        data_model_order: model labels in the order they appear inside a topic.
        num_topics_per_prompt: number of topic blocks expected in each CSV.
        rows_per_model: rows each model contributes per topic.
        base_dir: directory holding the CSVs (current directory when falsy).
        delimiter: CSV field delimiter.

    Returns:
        One row per (Prompt, Eval-Model, Model), sorted by those three keys.
        When nothing was processed an empty frame with the same columns is
        returned, so callers can rely on the schema.

    Raises:
        ValueError: if a file's row count cannot be reconciled with
            models * rows_per_model * num_topics_per_prompt.
    """
    cols = [
        "Prompt",
        "Eval-Model",
        "Model",
        "Total Questions",
        "High-quality (#)",
        "High-quality (%)",
        "Skill Match (# among High-quality)",
        "Skill Match (% among High-quality)",
    ]
    clean_suffix = "_clean"

    all_rows: List[Dict[str, object]] = []
    expected_rows_per_csv = len(data_model_order) * rows_per_model * num_topics_per_prompt

    for prompt in prompts:
        for tag in file_name_tags:
            # str.rstrip("_clean") would strip any trailing run of the characters
            # "_", "c", "l", "e", "a", "n" (e.g. "gemma_clean" -> "gemm"), so the
            # suffix is removed explicitly instead.
            eval_model = tag[: -len(clean_suffix)] if tag.endswith(clean_suffix) else tag

            filename = f"{prompt}_{tag}.csv"
            path = os.path.join(base_dir, filename) if base_dir else filename

            df = read_csv_no_header(path, delimiter=delimiter)
            df, status = enforce_expected_length(filename, df, expected_rows_per_csv)
            if status != "OK" and not status.startswith("TRIMMED_EMPTY_EDGES"):
                # Keep the error strict here to avoid mis-aggregation
                raise ValueError(f"[{filename}] Row count issue: {status}")

            per_model = compute_aggregate_per_model(
                df,
                model_labels_in_topic_order=data_model_order,
                num_topics=num_topics_per_prompt,
                rows_per_model=rows_per_model,
            )

            for model_label, metrics in per_model.items():
                all_rows.append({
                    "Prompt": prompt,
                    "Eval-Model": eval_model,
                    "Model": model_label,
                    **metrics
                })

    if not all_rows:
        # Preserve the column schema even when there is nothing to report
        return pd.DataFrame(columns=cols)

    out = pd.DataFrame(all_rows)
    return out[cols].sort_values(["Prompt", "Eval-Model", "Model"]).reset_index(drop=True)

# ===================== AGREEMENT CHECK (labels workbook with 5 sheets) =====================

def _to_bool(series: pd.Series) -> pd.Series:
    truthy = {"true", "yes", "y", "1", 1, True}
    falsy = {"false", "no", "n", "0", 0, False}
    def cast(v):
        if pd.isna(v):
            return None
        if isinstance(v, str):
            vv = v.strip().lower()
        else:
            vv = v
        if vv in truthy: return True
        if vv in falsy: return False
        try: return bool(int(v))
        except Exception: return None
    return series.map(cast)

def compute_highquality_per_row(df: pd.DataFrame) -> pd.Series:
    """
    Return a boolean Series flagging the high-quality rows of `df`.

    After normalizing the rating columns, a row qualifies when it is
    understandable, grammatical and answerable, and additionally either
      (A) Clear is "yes" and WouldYouUseIt is "yes" or "maybe", or
      (B) Clear is "yes" or "more_or_less" and Rephrase is "yes".
    The input frame is not modified.
    """
    frame = df.copy()
    for column in ("Understandable", "Grammatical", "Clear", "Rephrase", "Answerable", "WouldYouUseIt"):
        frame[column] = _normalize_yes_no(frame[column])

    # Requirements shared by both definitions
    baseline = (
        (frame["Understandable"] == "yes")
        & (frame["Grammatical"] == "yes")
        & (frame["Answerable"] == "yes")
    )
    strict_rule = baseline & (frame["Clear"] == "yes") & frame["WouldYouUseIt"].isin(["yes", "maybe"])
    lenient_rule = baseline & frame["Clear"].isin(["yes", "more_or_less"]) & (frame["Rephrase"] == "yes")
    return strict_rule | lenient_rule

def load_labels_workbook_sheet(labels_workbook_path: str, prompt: str) -> pd.DataFrame:
    """
    Read the expert labels for one prompt from the workbook sheet named `prompt`.

    The sheet must provide the columns "Index" and "IsTrueHighQuality".
    "Index" is coerced to numeric (rows whose Index cannot be parsed are
    dropped) and returned as int; "IsTrueHighQuality" is passed through _to_bool.

    Returns:
        A two-column frame (Index, IsTrueHighQuality) with a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if the workbook path does not exist.
        ValueError: if a required column is absent from the sheet.
    """
    if not os.path.exists(labels_workbook_path):
        raise FileNotFoundError(f"Labels workbook not found: {labels_workbook_path}")

    sheet = pd.read_excel(labels_workbook_path, sheet_name=prompt)

    needed = {"Index", "IsTrueHighQuality"}
    missing = needed - set(sheet.columns)
    if missing:
        raise ValueError(f"Sheet '{prompt}' missing columns: {missing}. Need exactly: {needed}")

    labels = sheet[["Index", "IsTrueHighQuality"]].copy()
    # Nullable integer keeps unparsable entries as NA until they are dropped below
    labels["Index"] = pd.to_numeric(labels["Index"], errors="coerce").astype("Int64")
    labels["IsTrueHighQuality"] = _to_bool(labels["IsTrueHighQuality"])

    labels = labels.dropna(subset=["Index"])
    labels = labels.astype({"Index": int})
    return labels.reset_index(drop=True)

# ---------- NEW: auto-detect best index shift to maximize coverage ----------
def _best_index_shift(lab_idx: pd.Series, n_rows: int, shifts=range(-2, 3)) -> Tuple[int, int]:
    """
    Try shifts (e.g., -2..+2) and return (best_shift, matched_count) maximizing overlap with [0, n_rows-1].
    """
    comp_idx_set = set(range(n_rows))
    best = (0, -1)  # (shift, matches)
    for s in shifts:
        shifted = (lab_idx + s)
        matches = int(shifted.isin(comp_idx_set).sum())
        if matches > best[1]:
            best = (s, matches)
    return best

def _apply_index_shift(lab_df: pd.DataFrame, shift: int, n_rows: int) -> pd.DataFrame:
    shifted = lab_df.copy()
    shifted["Index"] = shifted["Index"] + shift
    # Keep only valid range
    shifted = shifted[(shifted["Index"] >= 0) & (shifted["Index"] < n_rows)].copy()
    # Drop duplicate indices keeping the first occurrence
    shifted = shifted.drop_duplicates(subset=["Index"], keep="first").reset_index(drop=True)
    return shifted

def compute_agreement_metrics(merged: pd.DataFrame) -> dict:
    """
    Compare "ComputedHighQuality" against "LabelIsTrueHighQuality".

    Rows with a missing label are excluded from the confusion counts but are
    reported under "Unlabeled (#)". Accuracy, precision, recall and F1 are
    percentages rounded to two decimals; each is 0.0 when its denominator is zero.
    """
    n_all = len(merged)
    labeled = merged.dropna(subset=["LabelIsTrueHighQuality"]).copy()
    n_labeled = len(labeled)

    predicted = labeled["ComputedHighQuality"]
    actual = labeled["LabelIsTrueHighQuality"]
    # Explicit comparisons (not negation) because the label column may be object dtype
    predicted_yes, predicted_no = predicted == True, predicted == False
    actual_yes, actual_no = actual == True, actual == False

    tp = int((predicted_yes & actual_yes).sum())
    tn = int((predicted_no & actual_no).sum())
    fp = int((predicted_yes & actual_no).sum())
    fn = int((predicted_no & actual_yes).sum())

    def _pct(numerator, denominator):
        return numerator / denominator * 100.0 if denominator else 0.0

    acc = _pct(tp + tn, n_labeled)
    prec = _pct(tp, tp + fp)
    rec = _pct(tp, tp + fn)
    # prec and rec are already percentages, so the harmonic mean is one too
    f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) else 0.0

    return {
        "Total Rows": int(n_all),
        "Total Labeled": int(n_labeled),
        "Unlabeled (#)": int(n_all - n_labeled),
        "Accuracy (%)": round(acc, 2),
        "Precision (%)": round(prec, 2),
        "Recall (%)": round(rec, 2),
        "F1 (%)": round(f1, 2),
        "TP": tp, "FP": fp, "FN": fn, "TN": tn,
    }

# ---------- Assign per-row ACTUAL model label based on layout ----------
def assign_model_labels(n_rows: int, data_model_order: List[str], rows_per_model: int) -> pd.Series:
    """
    Label each of `n_rows` rows with the model that produced it.

    Rows are laid out in repeating topic blocks; within a block the models
    follow `data_model_order`, each occupying `rows_per_model` consecutive rows.

    Returns:
        A Series named "Model", indexed 0..n_rows-1.

    Raises:
        ValueError: if the block size (models * rows_per_model) is zero.
    """
    rows_per_topic = len(data_model_order) * rows_per_model
    if rows_per_topic == 0:
        raise ValueError("rows_per_model or number of models is zero.")

    labels = []
    for position in range(n_rows):
        slot = (position % rows_per_topic) // rows_per_model
        labels.append(data_model_order[slot])
    return pd.Series(labels, index=range(n_rows), name="Model")

# ---------- Excel sheet-name sanitizer ----------
def make_safe_sheet_name(name: str, used: set) -> str:
    """
    Derive a unique, Excel-legal worksheet name from a file name.

    The extension is dropped, the characters [ ] : * ? / \\ are replaced
    with "·", surrounding whitespace and apostrophes are removed, and the
    result is cut to 31 characters ("Sheet" if nothing is left).
    On a collision a "~<n>" suffix is appended, shortening the stem as needed.

    Uniqueness is checked case-insensitively because Excel treats
    "Summary" and "SUMMARY" as the same sheet.

    Args:
        name: source name, typically a file name.
        used: names already taken; the chosen name is added to this set.

    Returns:
        The chosen sheet name.
    """
    MAXLEN = 31
    base = os.path.splitext(name)[0]
    safe = re.sub(r"[\[\]\:\*\?\/\\]", "·", base).strip().strip("'")
    # Truncation can expose an apostrophe at the end again; sheet names may
    # not start or end with one, so strip once more after cutting.
    safe = safe[:MAXLEN].strip("'") or "Sheet"

    taken = {str(existing).casefold() for existing in used}
    candidate = safe
    i = 1
    while candidate.casefold() in taken:
        suffix = f"~{i}"
        candidate = (safe[:MAXLEN - len(suffix)]) + suffix
        i += 1
    used.add(candidate)
    return candidate

# ---------- Safe recomputation of metrics from sums (no warnings) ----------
def _recompute_metrics_from_sums(df: pd.DataFrame, group_cols: List[str]) -> pd.DataFrame:
    """
    Given a df with columns TP, FP, FN, TN, Total Rows, Total Labeled, Unlabeled (#),
    compute Accuracy/Precision/Recall/F1 (masked division).
    """
    out = df.copy()

    # Ensure numeric dtype
    for c in ["TP","FP","FN","TN","Total Rows","Total Labeled","Unlabeled (#)"]:
        out[c] = pd.to_numeric(out[c], errors="coerce").fillna(0).astype(int)

    tp = out["TP"].to_numpy(dtype=float)
    fp = out["FP"].to_numpy(dtype=float)
    fn = out["FN"].to_numpy(dtype=float)
    tn = out["TN"].to_numpy(dtype=float)
    tot_lab = out["Total Labeled"].to_numpy(dtype=float)

    def safe_div(num: np.ndarray, den: np.ndarray) -> np.ndarray:
        res = np.zeros_like(num, dtype=float)
        np.divide(num, den, out=res, where=(den > 0))
        return res

    acc_r  = safe_div(tp + tn, tot_lab)
    prec_r = safe_div(tp, tp + fp)
    rec_r  = safe_div(tp, tp + fn)
    f1_r   = safe_div(2.0 * prec_r * rec_r, (prec_r + rec_r))

    out["Accuracy (%)"]  = np.round(acc_r * 100.0, 2)
    out["Precision (%)"] = np.round(prec_r * 100.0, 2)
    out["Recall (%)"]    = np.round(rec_r * 100.0, 2)
    out["F1 (%)"]        = np.round(f1_r * 100.0, 2)

    ordered = group_cols + ["Total Rows","Total Labeled","Unlabeled (#)","TP","FP","FN","TN",
                            "Accuracy (%)","Precision (%)","Recall (%)","F1 (%)"]
    return out[ordered]

def run_quality_agreement_check_with_workbook(
    prompts: List[str],
    file_name_tags: List[str],
    base_dir: Optional[str],
    delimiter: str,
    labels_workbook_path: str,
    data_model_order: List[str],
    rows_per_model: int,
    num_topics_per_prompt: int,
    export_excel_path: Optional[str] = "AEQG_quality_agreement.xlsx",
) -> Tuple[pd.DataFrame, dict, pd.DataFrame, pd.DataFrame]:
    """
    Measure agreement between the computed high-quality flag and expert labels.

    For each prompt P in prompts:
      - Load labels from workbook sheet named P (Index, IsTrueHighQuality).
      - For each file <P>_<tag>.csv:
          * Robustly read rows (preserve blanks), check expected rows = M*rows_per_model*num_topics
            (a mismatch only prints a warning here; processing continues)
          * Compute ComputedHighQuality per row
          * Attach per-row ACTUAL Model via layout
          * Auto-detect label Index shift (-2..+2) to maximize coverage
          * Left-merge labels on Index (rows without a label stay unlabeled)

    A trailing "_clean" on a tag is dropped for the "Eval-Model" column.

    Side effects:
      - When export_excel_path is truthy, a workbook is written with the sheets
        SUMMARY, AGREE_BY_EVAL_MODEL, AGREE_BY_MODEL and one detail sheet per file.
      - "AEQG_agreement_by_eval_model.csv" and "AEQG_agreement_by_model.csv"
        are always written to the current working directory.

    Returns:
        (summary_df, detail_per_file, agree_by_eval_model_df, agree_by_model_df)
          1) summary_df: one row per file
          2) detail_per_file: filename -> merged per-row frame with a "Match" column
          3) agree_by_eval_model_df: Prompt x Eval-Model
          4) agree_by_model_df: Prompt x Eval-Model x Model
        All three frames keep their column schema even when no file was processed.
    """
    count_keys = ["Total Rows", "Total Labeled", "Unlabeled (#)", "TP", "FP", "FN", "TN"]
    metric_cols = ["Accuracy (%)", "Precision (%)", "Recall (%)", "F1 (%)"]
    clean_suffix = "_clean"

    summary_rows = []
    detail_per_file: Dict[str, pd.DataFrame] = {}
    by_eval_rows = []
    by_model_rows = []

    M = len(data_model_order)
    expected_rows_per_csv = M * rows_per_model * num_topics_per_prompt

    # Cache labels per prompt so we read each sheet once
    labels_cache: Dict[str, pd.DataFrame] = {}

    for prompt in prompts:
        if prompt not in labels_cache:
            labels_cache[prompt] = load_labels_workbook_sheet(labels_workbook_path, prompt)
        lab_df_base = labels_cache[prompt]

        for tag in file_name_tags:
            # str.rstrip("_clean") strips any trailing run of the characters
            # "_", "c", "l", "e", "a", "n" (e.g. "gemma_clean" -> "gemm"), so the
            # suffix is removed explicitly instead.
            eval_model = tag[: -len(clean_suffix)] if tag.endswith(clean_suffix) else tag

            filename = f"{prompt}_{tag}.csv"
            path = os.path.join(base_dir, filename) if base_dir else filename

            df = read_csv_no_header(path, delimiter=delimiter)
            df, status = enforce_expected_length(filename, df, expected_rows_per_csv)
            if status != "OK" and not status.startswith("TRIMMED_EMPTY_EDGES"):
                print(f"[WARN] {filename}: {status} — proceeding with agreement (mapping still index-based).")

            n = len(df)

            # Per-row flags & model label
            comp_hq = compute_highquality_per_row(df).astype(bool)
            model_series = assign_model_labels(n, data_model_order=data_model_order, rows_per_model=rows_per_model)
            comp_df = pd.DataFrame({
                "Index": range(n),
                "ComputedHighQuality": comp_hq.values,
                "Model": model_series.values,
                "Prompt": prompt,
                "Eval-Model": eval_model,
                "File": filename,
            })

            # ----- Auto-detect best index shift -----
            shift, matched = _best_index_shift(lab_df_base["Index"], n_rows=n, shifts=range(-2, 3))
            lab_df_prompt = _apply_index_shift(lab_df_base, shift=shift, n_rows=n)

            merged = comp_df.merge(
                lab_df_prompt.rename(columns={"IsTrueHighQuality": "LabelIsTrueHighQuality"}),
                on="Index",
                how="left",
            )

            # ---------- (1) Per-file summary ----------
            size_note = f"labels={len(lab_df_prompt)}/orig={len(lab_df_base)}, data={n}, shift={shift}, matched={matched}"
            metrics_file = compute_agreement_metrics(merged)
            summary_rows.append({
                "Prompt": prompt,
                "Eval-Model": eval_model,
                "File": filename,
                "RowCountCheck": f"{status}; {size_note}" if status != "OK" else size_note,
                **metrics_file
            })

            # ---------- (2) By Eval-Model: store counts; aggregate later ----------
            # Same frame as the per-file summary, so its counts are reused.
            by_eval_rows.append({
                "Prompt": prompt,
                "Eval-Model": eval_model,
                **{k: metrics_file[k] for k in count_keys}
            })

            # ---------- (3) By Model within this file ----------
            for model_label, g in merged.groupby("Model", sort=False):
                mcounts = compute_agreement_metrics(g)
                by_model_rows.append({
                    "Prompt": prompt,
                    "Eval-Model": eval_model,
                    "Model": model_label,
                    **{k: mcounts[k] for k in count_keys}
                })

            # Added after the metrics so it cannot influence them; unlabeled rows compare as False
            merged["Match"] = (merged["ComputedHighQuality"] == merged["LabelIsTrueHighQuality"])
            detail_per_file[filename] = merged

    # Build DataFrames
    if summary_rows:
        summary_df = pd.DataFrame(summary_rows).sort_values(["Prompt", "Eval-Model", "File"]).reset_index(drop=True)
    else:
        # Sorting a column-less frame raises KeyError; return the schema instead
        summary_df = pd.DataFrame(columns=[
            "Prompt", "Eval-Model", "File", "RowCountCheck",
            "Total Rows", "Total Labeled", "Unlabeled (#)",
            *metric_cols,
            "TP", "FP", "FN", "TN",
        ])

    # Aggregate Eval-Model rows (sums), then recompute metrics from sums (masked division)
    if by_eval_rows:
        tmp = pd.DataFrame(by_eval_rows)
        grouped = tmp.groupby(["Prompt", "Eval-Model"], as_index=False)[count_keys].sum()
        agree_by_eval_model_df = _recompute_metrics_from_sums(grouped, ["Prompt", "Eval-Model"])
    else:
        agree_by_eval_model_df = pd.DataFrame(columns=["Prompt", "Eval-Model", *count_keys, *metric_cols])

    # Aggregate by model (Prompt × Eval-Model × Model)
    if by_model_rows:
        tmpm = pd.DataFrame(by_model_rows)
        groupedm = tmpm.groupby(["Prompt", "Eval-Model", "Model"], as_index=False)[count_keys].sum()
        agree_by_model_df = _recompute_metrics_from_sums(groupedm, ["Prompt", "Eval-Model", "Model"]).sort_values(
            ["Prompt", "Eval-Model", "Model"]
        ).reset_index(drop=True)
    else:
        agree_by_model_df = pd.DataFrame(columns=["Prompt", "Eval-Model", "Model", *count_keys, *metric_cols])

    # Export
    if export_excel_path:
        with pd.ExcelWriter(export_excel_path, engine="xlsxwriter") as writer:
            summary_df.to_excel(writer, index=False, sheet_name="SUMMARY")
            agree_by_eval_model_df.to_excel(writer, index=False, sheet_name="AGREE_BY_EVAL_MODEL")
            agree_by_model_df.to_excel(writer, index=False, sheet_name="AGREE_BY_MODEL")

            used_names: set = {"SUMMARY", "AGREE_BY_EVAL_MODEL", "AGREE_BY_MODEL"}
            for fname, detail_df in detail_per_file.items():
                sheet = make_safe_sheet_name(fname, used_names)
                detail_df.to_excel(writer, index=False, sheet_name=sheet)

    # Also quick CSVs
    agree_by_eval_model_df.to_csv("AEQG_agreement_by_eval_model.csv", index=False)
    agree_by_model_df.to_csv("AEQG_agreement_by_model.csv", index=False)

    return summary_df, detail_per_file, agree_by_eval_model_df, agree_by_model_df

In [5]:
# ===================== DEFAULTS / EXAMPLE MAIN =====================

# Prompt identifiers; each is both the CSV filename prefix and the name of the
# labels-workbook sheet read for that prompt.
PROMPTS = ['PS1', 'PS2', 'PS3', 'PS4', 'PS5']
# Evaluator tags (second part of each CSV filename) before the "_clean" suffix is added.
FILE_TAGS = [
    'deepseek-r1:14b',
    'phi4:latest',
    'gemma3:latest',
    'mistral-small3.2:latest',
    'phi4-mini:latest',
    'granite4:latest',
    'llama3.2:latest',
    'gpt-oss:latest',
]
# Files are looked up as "<prompt>_<tag>_clean.csv", so every tag gets the suffix appended.
FILE_TAGS = [f"{item}_clean" for item in FILE_TAGS]

# Order in which the models' row blocks appear inside each topic of a CSV.
DATA_MODEL_ORDER = ["GPT4", "GPT3.5", "Palm2", "Llama2_70B", "Mistral_7B"]
# With 5 models and 6 rows per model, each CSV is expected to hold 5 * 6 * 17 = 510 rows.
NUM_TOPICS_PER_PROMPT = 17
ROWS_PER_MODEL = 6  # Bloom cycle length

BASE_DIR = "clean_output_full"  # directory containing the CSV files
DELIM = ","
LABELS_WORKBOOK = "All_PS_ExpertEvaluation_withExists.xlsx"  # sheets: PS1..PS5 with Index, IsTrueHighQuality

In [6]:

# 1) Original per-model metrics (unchanged)
# Reads every "<prompt>_<tag>.csv" under BASE_DIR; raises ValueError if a file's
# row count cannot be reconciled with the expected layout.
df_metrics = run_eval_to_dataframe_per_prompt(
    prompts=PROMPTS,
    file_name_tags=FILE_TAGS,
    data_model_order=DATA_MODEL_ORDER,
    num_topics_per_prompt=NUM_TOPICS_PER_PROMPT,
    rows_per_model=ROWS_PER_MODEL,
    base_dir=BASE_DIR,
    delimiter=DELIM,
)
print("\n=== AEQG Metrics (Per Prompt × Eval-Model × Model) ===")
print(df_metrics)
# Persist the metrics table to the current working directory
df_metrics.to_excel('AEQG_temp_evals.xlsx', index=False)

# 2) Agreement with Eval-Model–wise and Model–wise splits + auto index shift + unlabeled reporting
# Besides the workbook named below, this call also writes
# AEQG_agreement_by_eval_model.csv and AEQG_agreement_by_model.csv to the working directory.
summary_df, detail, agree_by_eval_model_df, agree_by_model_df = run_quality_agreement_check_with_workbook(
    prompts=PROMPTS,
    file_name_tags=FILE_TAGS,
    base_dir=BASE_DIR,
    delimiter=DELIM,
    labels_workbook_path=LABELS_WORKBOOK,
    data_model_order=DATA_MODEL_ORDER,
    rows_per_model=ROWS_PER_MODEL,
    num_topics_per_prompt=NUM_TOPICS_PER_PROMPT,
    export_excel_path="AEQG_quality_agreement.xlsx",
)
print("\n=== Agreement Summary (Per File) ===")
print(summary_df)

print("\n=== Agreement by Eval-Model ===")
print(agree_by_eval_model_df)

print("\n=== Agreement by Model (Prompt × Eval-Model × Model) ===")
print(agree_by_model_df)



=== AEQG Metrics (Per Prompt × Eval-Model × Model) ===
    Prompt       Eval-Model       Model  Total Questions  High-quality (#)  \
0      PS1  deepseek-r1:14b      GPT3.5              102                95   
1      PS1  deepseek-r1:14b        GPT4              102                93   
2      PS1  deepseek-r1:14b  Llama2_70B              102                95   
3      PS1  deepseek-r1:14b  Mistral_7B              102                90   
4      PS1  deepseek-r1:14b       Palm2              102                94   
..     ...              ...         ...              ...               ...   
195    PS5      phi4:latest      GPT3.5              102                62   
196    PS5      phi4:latest        GPT4              102                82   
197    PS5      phi4:latest  Llama2_70B              102                44   
198    PS5      phi4:latest  Mistral_7B              102                47   
199    PS5      phi4:latest       Palm2              102                58   

     Hi