In [3]:
# scripts/generate_table_5_1.py

import pandas as pd
from pathlib import Path

# ---------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------

RESULTS_DIR = Path("results")          # where analyze_batch.py outputs CSV
INPUT_FILE = RESULTS_DIR / "summary_all.csv"
OUTPUT_FILE = RESULTS_DIR / "table_5_1_final_knowledge.csv"

CONDITIONS = ["fixed", "adaptive"]     # baseline vs adaptive

# ---------------------------------------------------------------------
# Load aggregated results
# ---------------------------------------------------------------------

df = pd.read_csv(INPUT_FILE)

# Ensure required columns exist
required_cols = {"condition", "disability_profile", "final_knowledge"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Missing columns in input CSV: {missing}")

# ---------------------------------------------------------------------
# Filter to baseline and adaptive conditions
# ---------------------------------------------------------------------

df = df[df["condition"].isin(CONDITIONS)]

# ---------------------------------------------------------------------
# Aggregate mean ± SD of final knowledge
# ---------------------------------------------------------------------

table = (
    df.groupby(["disability_profile", "condition"])["final_knowledge"]
      .agg(["mean", "std"])
      .reset_index()
)

# Format as "mean ± sd"
table["Final Knowledge (mean ± SD)"] = (
    table["mean"].round(3).astype(str)
    + " ± "
    + table["std"].round(3).astype(str)
)

# Pivot for readability
table = table.pivot(
    index="disability_profile",
    columns="condition",
    values="Final Knowledge (mean ± SD)"
).reset_index()

# Rename columns for publication
table.columns = [
    "Learner Profile",
    "Baseline Tutor",
    "Adaptive Tutor"
]

# ---------------------------------------------------------------------
# Save Table 5.1
# ---------------------------------------------------------------------

table.to_csv(OUTPUT_FILE, index=False)

print("\nTable 5.1 generated successfully:")
print(table)
print(f"\nSaved to: {OUTPUT_FILE.resolve()}")

FileNotFoundError: [Errno 2] No such file or directory: 'results/summary_all.csv'

In [4]:
# scripts/generate_table_5_1_from_ch5_run_metrics_selected.py

import pandas as pd
from pathlib import Path

# -----------------------------
# CONFIG: set your input/output
# -----------------------------
INPUT_FILE = Path("/home/alton/alc_logs/final/ch5_run_metrics_selected.csv")
OUTPUT_FILE = Path("/home/alton/alc_logs/final/table_5_1_final_knowledge_baseline_vs_adaptive.csv")

BASELINE_NAMES = {"baseline", "fixed"}   # some pipelines label baseline as "fixed"
ADAPTIVE_NAMES = {"adaptive"}

# -----------------------------
# Load
# -----------------------------
df = pd.read_csv(INPUT_FILE)

# -----------------------------
# Helper: pick column by candidates
# -----------------------------
def pick_col(candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

# Detect key columns (handles slight naming differences)
col_condition = pick_col(["condition", "group", "policy_mode"])
col_profile   = pick_col(["disability_profile", "disability_profile_param", "profile"])
col_final_k   = pick_col(["final_knowledge", "k_final", "knowledge_final", "final_k"])

if col_condition is None:
    raise ValueError(f"Could not find a condition column. Columns: {list(df.columns)}")
if col_profile is None:
    raise ValueError(f"Could not find a disability/profile column. Columns: {list(df.columns)}")

# If final knowledge is not present, try to reconstruct it from initial + gain
if col_final_k is None:
    col_k0 = pick_col(["k_initial", "initial_knowledge", "knowledge_initial", "initial_k"])
    col_gain = pick_col(["knowledge_gain", "learning_gain", "gain_k"])
    if col_k0 is None or col_gain is None:
        raise ValueError(
            "Could not find final knowledge OR a pair of (initial knowledge + knowledge gain) to reconstruct it.\n"
            f"Columns: {list(df.columns)}"
        )
    df["_final_knowledge"] = df[col_k0] + df[col_gain]
    col_final_k = "_final_knowledge"

# -----------------------------
# Filter to baseline vs adaptive only
# -----------------------------
df = df.copy()
df[col_condition] = df[col_condition].astype(str).str.strip().str.lower()

df = df[df[col_condition].isin(BASELINE_NAMES.union(ADAPTIVE_NAMES))].copy()

# Normalize names to exactly "baseline" and "adaptive"
df["_cond_norm"] = df[col_condition].apply(lambda x: "baseline" if x in BASELINE_NAMES else "adaptive")

# -----------------------------
# Aggregate mean ± SD of final knowledge by profile × condition
# -----------------------------
agg = (
    df.groupby([col_profile, "_cond_norm"])[col_final_k]
      .agg(["count", "mean", "std"])
      .reset_index()
)

# Format "mean ± SD" (handle std NaN when count=1)
def fmt(mean, std, n):
    if pd.isna(std):
        return f"{mean:.3f} ± N/A (n={n})"
    return f"{mean:.3f} ± {std:.3f} (n={n})"

agg["Final Knowledge (mean ± SD)"] = [
    fmt(m, s, n) for m, s, n in zip(agg["mean"], agg["std"], agg["count"])
]

# Pivot into publication table
table = agg.pivot(index=col_profile, columns="_cond_norm", values="Final Knowledge (mean ± SD)").reset_index()

# Ensure both columns exist even if one is missing in data
if "baseline" not in table.columns:
    table["baseline"] = ""
if "adaptive" not in table.columns:
    table["adaptive"] = ""

# Rename columns for thesis
table = table.rename(columns={
    col_profile: "Learner Profile",
    "baseline": "Baseline Tutor (Final knowledge: mean ± SD)",
    "adaptive": "Adaptive Tutor (Final knowledge: mean ± SD)"
})

# Optional: sort profiles nicely if present
preferred_order = ["none", "dyslexia", "hearing_impairment", "low_vision"]
table["__order"] = table["Learner Profile"].astype(str).apply(lambda x: preferred_order.index(x) if x in preferred_order else 999)
table = table.sort_values("__order").drop(columns="__order")

# Save
table.to_csv(OUTPUT_FILE, index=False)-------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[4], line 37
     35     raise ValueError(f"Could not find a condition column. Columns: {list(df.columns)}")
     36 if col_profile is None:
---> 37     raise ValueError(f"Could not find a disability/profile column. Columns: {list(df.columns)}")
     39 # If final knowledge is not present, try to reconstruct it from initial + gain
     40 if col_final_k is None:

ValueError: Could not find a disability/profile column. Columns: ['source_file', 'condition', 'n_steps', 'knowledge_start', 'knowledge_end', 'knowledge_gain', 'success_rate', 'mean_error_rate', 'mean_cognitive_load', 'mean_response_time', 'cum_reward', 'mean_reward']

print("\nTable 5.1 generated successfully.\n")
print(table.to_string(index=False))
print(f"\nSaved to: {OUTPUT_FILE}")


ValueError: Could not find a disability/profile column. Columns: ['source_file', 'condition', 'n_steps', 'knowledge_start', 'knowledge_end', 'knowledge_gain', 'success_rate', 'mean_error_rate', 'mean_cognitive_load', 'mean_response_time', 'cum_reward', 'mean_reward']

In [5]:
import pandas as pd
df = pd.read_csv("/home/alton/alc_logs/final/ch5_run_metrics_selected.csv")
print(df["source_file"].head(10).tolist())


['20260115_080831.csv', '20260115_081028.csv', '20260115_082025.csv', '4b8e83db-5c81-452c-9792-ce02ad5580a7.csv']


In [6]:
import json
from pathlib import Path
import pandas as pd

# -----------------------------
# CONFIG
# -----------------------------
RUN_METRICS = Path("/home/alton/alc_logs/final/ch5_run_metrics_selected.csv")
LOG_ROOT = Path("/home/alton/alc_logs")  # where metadata lives
OUTPUT = Path("/home/alton/alc_logs/final/table_5_1_final_knowledge_by_profile.csv")

BASELINE_NAMES = {"baseline", "fixed"}
ADAPTIVE_NAMES = {"adaptive"}

# -----------------------------
# Load run metrics
# -----------------------------
df = pd.read_csv(RUN_METRICS)

# Normalize condition
df["condition"] = df["condition"].astype(str).str.strip().str.lower()
df = df[df["condition"].isin(BASELINE_NAMES.union(ADAPTIVE_NAMES))].copy()
df["condition_norm"] = df["condition"].apply(lambda x: "baseline" if x in BASELINE_NAMES else "adaptive")

# We'll use knowledge_end as final knowledge (your file has it)
if "knowledge_end" not in df.columns:
    raise ValueError(f"'knowledge_end' not found in {RUN_METRICS}. Columns: {list(df.columns)}")

# -----------------------------
# Build a lookup: run_name -> disability_profile from metadata JSON
# metadata naming convention: <run_name>_metadata.json
# run_name corresponds to CSV filename stem (e.g., 20260115_080831.csv -> 20260115_080831_metadata.json)
# -----------------------------
# Index metadata by run_name (stem)
meta_by_run = {}
for meta_path in LOG_ROOT.rglob("*_metadata.json"):
    stem = meta_path.name.replace("_metadata.json", "")
    try:
        with meta_path.open("r", encoding="utf-8") as f:
            meta = json.load(f)
        # keys per documentation: disability_profile, condition, etc.
        disability = meta.get("disability_profile") or meta.get("disability_profile_param") or meta.get("profile")
        if disability is not None:
            meta_by_run[stem] = disability
    except Exception:
        # ignore unreadable metadata files
        pass

# -----------------------------
# Attach disability_profile to each row using source_file stem
# source_file values: "20260115_080831.csv" -> stem "20260115_080831"
# -----------------------------
def infer_run_stem(source_file: str) -> str:
    return Path(str(source_file)).stem

df["run_stem"] = df["source_file"].astype(str).apply(infer_run_stem)
df["disability_profile"] = df["run_stem"].map(meta_by_run)

missing = df["disability_profile"].isna().sum()
if missing > 0:
    # show a few missing stems to help debug if needed
    missing_examples = df.loc[df["disability_profile"].isna(), "run_stem"].head(10).tolist()
    raise ValueError(
        f"Could not find disability_profile for {missing} run(s). "
        f"Example missing run_stem values: {missing_examples}\n"
        f"Checked metadata under: {LOG_ROOT}"
    )

# -----------------------------
# Aggregate: mean ± SD final knowledge by disability_profile × condition
# -----------------------------
agg = (
    df.groupby(["disability_profile", "condition_norm"])["knowledge_end"]
      .agg(["count", "mean", "std"])
      .reset_index()
)

def fmt(mean, std, n):
    if pd.isna(std):
        return f"{mean:.3f} ± N/A (n={int(n)})"
    return f"{mean:.3f} ± {std:.3f} (n={int(n)})"

agg["Final Knowledge (mean ± SD)"] = [
    fmt(m, s, n) for m, s, n in zip(agg["mean"], agg["std"], agg["count"])
]

table = agg.pivot(
    index="disability_profile",
    columns="condition_norm",
    values="Final Knowledge (mean ± SD)"
).reset_index()

# Ensure both columns exist
if "baseline" not in table.columns:
    table["baseline"] = ""
if "adaptive" not in table.columns:
    table["adaptive"] = ""

table = table.rename(columns={
    "disability_profile": "Learner Profile",
    "baseline": "Baseline Tutor (Final knowledge: mean ± SD)",
    "adaptive": "Adaptive Tutor (Final knowledge: mean ± SD)"
})

# Optional ordering
preferred = ["none", "dyslexia", "hearing_impairment", "low_vision"]
table["__order"] = table["Learner Profile"].astype(str).apply(lambda x: preferred.index(x) if x in preferred else 999)
table = table.sort_values("__order").drop(columns="__order")

# Save
table.to_csv(OUTPUT, index=False)

print("\nTable 5.1 generated successfully:\n")
print(table.to_string(index=False))
print(f"\nSaved to: {OUTPUT}")


ValueError: Could not find disability_profile for 1 run(s). Example missing run_stem values: ['4b8e83db-5c81-452c-9792-ce02ad5580a7']
Checked metadata under: /home/alton/alc_logs

In [7]:
import json
from pathlib import Path
import pandas as pd

RUN_METRICS = Path("/home/alton/alc_logs/final/ch5_run_metrics_selected.csv")
LOG_ROOTS = [Path("/home/alton/alc_logs"), Path("/home/alton/alc_logs/final")]  # search both
OUTPUT = Path("/home/alton/alc_logs/final/table_5_1_final_knowledge_by_profile.csv")

BASELINE_NAMES = {"baseline", "fixed"}
ADAPTIVE_NAMES = {"adaptive"}

df = pd.read_csv(RUN_METRICS)

df["condition"] = df["condition"].astype(str).str.strip().str.lower()
df = df[df["condition"].isin(BASELINE_NAMES.union(ADAPTIVE_NAMES))].copy()
df["condition_norm"] = df["condition"].apply(lambda x: "baseline" if x in BASELINE_NAMES else "adaptive")

if "knowledge_end" not in df.columns:
    raise ValueError(f"'knowledge_end' not found. Columns: {list(df.columns)}")

# --- Build metadata lookup across possible roots ---
meta_by_run = {}
for root in LOG_ROOTS:
    for meta_path in root.rglob("*_metadata.json"):
        stem = meta_path.name.replace("_metadata.json", "")
        try:
            with meta_path.open("r", encoding="utf-8") as f:
                meta = json.load(f)
            disability = meta.get("disability_profile") or meta.get("disability_profile_param") or meta.get("profile")
            if disability is not None:
                meta_by_run[stem] = disability
        except Exception:
            pass

df["run_stem"] = df["source_file"].astype(str).apply(lambda x: Path(x).stem)
df["disability_profile"] = df["run_stem"].map(meta_by_run)

# --- Drop unmatched rows (instead of failing) ---
missing_mask = df["disability_profile"].isna()
n_missing = int(missing_mask.sum())
if n_missing > 0:
    missing_examples = df.loc[missing_mask, "run_stem"].unique().tolist()[:10]
    print(f"WARNING: Dropping {n_missing} run(s) with no metadata match.")
    print(f"Example missing run_stem values: {missing_examples}")
    df = df[~missing_mask].copy()

# --- Aggregate ---
agg = (
    df.groupby(["disability_profile", "condition_norm"])["knowledge_end"]
      .agg(["count", "mean", "std"])
      .reset_index()
)

def fmt(mean, std, n):
    if pd.isna(std):
        return f"{mean:.3f} ± N/A (n={int(n)})"
    return f"{mean:.3f} ± {std:.3f} (n={int(n)})"

agg["Final Knowledge (mean ± SD)"] = [fmt(m, s, n) for m, s, n in zip(agg["mean"], agg["std"], agg["count"])]

table = agg.pivot(index="disability_profile", columns="condition_norm", values="Final Knowledge (mean ± SD)").reset_index()

if "baseline" not in table.columns:
    table["baseline"] = ""
if "adaptive" not in table.columns:
    table["adaptive"] = ""

table = table.rename(columns={
    "disability_profile": "Learner Profile",
    "baseline": "Baseline Tutor (Final knowledge: mean ± SD)",
    "adaptive": "Adaptive Tutor (Final knowledge: mean ± SD)"
})

preferred = ["none", "dyslexia", "hearing_impairment", "low_vision"]
table["__order"] = table["Learner Profile"].astype(str).apply(lambda x: preferred.index(x) if x in preferred else 999)
table = table.sort_values("__order").drop(columns="__order")

table.to_csv(OUTPUT, index=False)

print("\nTable 5.1 generated successfully:\n")
print(table.to_string(index=False))
print(f"\nSaved to: {OUTPUT}")


Example missing run_stem values: ['4b8e83db-5c81-452c-9792-ce02ad5580a7']

Table 5.1 generated successfully:

Empty DataFrame
Columns: [Learner Profile, Baseline Tutor (Final knowledge: mean ± SD), Adaptive Tutor (Final knowledge: mean ± SD)]
Index: []

Saved to: /home/alton/alc_logs/final/table_5_1_final_knowledge_by_profile.csv


In [8]:
!pwd

/home/alton/dev/inclusive-alc-sim/results


In [9]:
import pandas as pd
from pathlib import Path

LOG_ROOT = Path("/home/alton/alc_logs/final")  # <-- your "final" folder
OUTPUT = Path("/home/alton/alc_logs/final/table_5_1_final_knowledge_by_profile.csv")

# Collect candidate run CSVs (exclude summary/table/metrics files)
csvs = []
for p in LOG_ROOT.rglob("*.csv"):
    name = p.name.lower()
    if name.endswith("_summary.csv"):
        continue
    if name.startswith("table_") or "condition_summary" in name or "run_metrics" in name:
        continue
    csvs.append(p)

if not csvs:
    raise ValueError(f"No run CSVs found under {LOG_ROOT}")

rows = []
for p in csvs:
    try:
        df = pd.read_csv(p)

        # Required columns from logger outputs (per documentation)
        # condition + disability profile + knowledge time series
        cond_col = "condition" if "condition" in df.columns else None
        prof_col = "disability_profile_param" if "disability_profile_param" in df.columns else None
        know_col = "knowledge" if "knowledge" in df.columns else None

        if cond_col is None or prof_col is None or know_col is None:
            continue

        condition = str(df[cond_col].iloc[0]).strip().lower()
        profile = str(df[prof_col].iloc[0]).strip().lower()

        # Keep only baseline vs adaptive (baseline may appear as fixed/baseline)
        if condition not in {"fixed", "baseline", "adaptive"}:
            continue
        condition = "baseline" if condition in {"fixed", "baseline"} else "adaptive"

        # Final knowledge = last knowledge value in the run
        final_k = float(df[know_col].iloc[-1])

        rows.append({"condition": condition, "profile": profile, "final_knowledge": final_k})

    except Exception:
        # Skip unreadable files
        continue

data = pd.DataFrame(rows)
if data.empty:
    raise ValueError(
        "No usable run logs found. "
        "Check that LOG_ROOT contains the logger output CSVs with columns "
        "condition, disability_profile_param, knowledge."
    )

# Aggregate mean ± SD by profile × condition
agg = (
    data.groupby(["profile", "condition"])["final_knowledge"]
        .agg(["count", "mean", "std"])
        .reset_index()
)

# Format mean ± SD
def fmt(mean, std, n):
    if pd.isna(std):
        return f"{mean:.3f} ± N/A (n={int(n)})"
    return f"{mean:.3f} ± {std:.3f} (n={int(n)})"

agg["Final Knowledge (mean ± SD)"] = [
    fmt(m, s, n) for m, s, n in zip(agg["mean"], agg["std"], agg["count"])
]

table = agg.pivot(index="profile", columns="condition", values="Final Knowledge (mean ± SD)").reset_index()

# Ensure both columns exist
if "baseline" not in table.columns:
    table["baseline"] = ""
if "adaptive" not in table.columns:
    table["adaptive"] = ""

table = table.rename(columns={
    "profile": "Learner Profile",
    "baseline": "Baseline Tutor (Final knowledge: mean ± SD)",
    "adaptive": "Adaptive Tutor (Final knowledge: mean ± SD)",
})

# Optional nice ordering
preferred = ["none", "dyslexia", "hearing_impairment", "low_vision"]
table["__order"] = table["Learner Profile"].apply(lambda x: preferred.index(x) if x in preferred else 999)
table = table.sort_values("__order").drop(columns="__order")

table.to_csv(OUTPUT, index=False)

print("\nTable 5.1 generated successfully:\n")
print(table.to_string(index=False))
print(f"\nSaved to: {OUTPUT}")



Table 5.1 generated successfully:

Learner Profile Adaptive Tutor (Final knowledge: mean ± SD) Baseline Tutor (Final knowledge: mean ± SD)
       dyslexia                         0.029 ± 0.032 (n=5)                         0.020 ± 0.027 (n=5)

Saved to: /home/alton/alc_logs/final/table_5_1_final_knowledge_by_profile.csv
