In [2]:
from pathlib import Path
import pandas as pd
from typing import Optional

BASE_DIR = Path("/home/jupyter-yin10/EEG_HAR/NEW/data/2_EEG_window_matched")
SUBJECTS = ["s1","s2","s3","s4","s5","s6"]

# Activity slugs and the desired output order
ACTIVITIES = [
    "chair_squats",
    "light_stationary_cycling",
    "marching_in_place",
    "seated_boxing_hooks",
    "seated_leg_extensions",
    "seated_medicine_ball_twists",
    "seated_side_bends",
    "side_stepping",
    "standing_heel_to_toe_walk",
    "wall_push_ups",
]

OUT_DIR = BASE_DIR / "stacked_by_activity"
OUT_DIR.mkdir(parents=True, exist_ok=True)

def find_activity_file(folder: Path, activity_slug: str) -> Optional[Path]:
    """Return the CSV in *folder* whose file name contains *activity_slug*.

    Both the slug and the file names are lower-cased before comparing, so the
    match is case-insensitive. Only files matched by ``*.csv`` directly inside
    *folder* are considered. When several files qualify, the smallest path in
    sort order is returned; when none qualifies, the result is ``None``.

    A file is ignored when its lower-cased name contains
    ``stacked_by_activity``, starts with ``a1_``, or contains ``a2_`` anywhere.
    """
    wanted = activity_slug.lower()

    def is_excluded(lowered: str) -> bool:
        # NOTE(review): intended to skip earlier stacked outputs, but only the
        # a1_/a2_ patterns are covered (not a3_ .. a10_), and "a2_" matches
        # anywhere in the name -- confirm whether this is deliberate.
        return (
            "stacked_by_activity" in lowered
            or lowered.startswith("a1_")
            or "a2_" in lowered
        )

    lowered_names = ((path, path.name.lower()) for path in folder.glob("*.csv"))
    matches = [
        path
        for path, lowered in lowered_names
        if not is_excluded(lowered) and wanted in lowered
    ]
    return min(matches) if matches else None

# For every activity: collect each subject's CSV, align the columns to one
# schema, write a single stacked CSV, and record what happened in `summary`.
summary = []
for position, activity in enumerate(ACTIVITIES, start=1):
    out_path = OUT_DIR / f"a{position}_{activity}.csv"

    frames = []     # per-subject tables, in SUBJECTS order
    found = []      # "<subject>:<n> rows" for each file that was read
    missing = []    # "<subject>:<reason>" for each subject that was skipped
    cols_s1 = None  # s1's header after the drop; the reference schema

    for subj in SUBJECTS:
        subj_dir = BASE_DIR / subj
        if not subj_dir.exists():
            missing.append(f"{subj}:folder_missing")
            continue

        csv_path = find_activity_file(subj_dir, activity)
        if csv_path is None:
            missing.append(f"{subj}:file_missing")
            continue

        # Any failure while loading or preparing the table is recorded as a
        # read error for this subject instead of aborting the whole run.
        try:
            table = pd.read_csv(csv_path)
            table = table.drop(columns=["matched_window_id"], errors="ignore")

            # Remember s1's header before the subject column is added
            if subj == "s1":
                cols_s1 = table.columns.tolist()

            table.insert(0, "subject", subj)
            frames.append(table)
            found.append(f"{subj}:{len(table)} rows")
        except Exception as err:
            missing.append(f"{subj}:read_error({err})")

    rows_total = 0
    if frames:
        # Without an s1 table, the first table that was read defines the schema
        if cols_s1 is None:
            cols_s1 = frames[0].columns.tolist()

        # "subject" always leads, followed by the reference columns in order
        cols_order = ["subject"] + [col for col in cols_s1 if col != "subject"]

        aligned = []
        for table in frames:
            # Drop columns outside the schema, then fill absent ones with NA
            present = [col for col in table.columns if col in cols_order]
            subset = table[present].copy()
            absent = [col for col in cols_order if col not in subset.columns]
            for col in absent:
                subset[col] = pd.NA
            aligned.append(subset[cols_order])

        stacked = pd.concat(aligned, ignore_index=True)

        # One header line, no index column
        stacked.to_csv(out_path, index=False)
        rows_total = len(stacked)

    # Recorded for every activity, including those with nothing to write
    summary.append({
        "activity": activity,
        "output": str(out_path),
        "rows_total": rows_total,
        "found": found,
        "missing": missing,
    })

# Report, per activity, the output path, row count, and which subjects were
# found or missing; then the grand total and the output folder.
print("=== Stacking summary ===")
for number, entry in enumerate(summary, start=1):
    print(f"{number:02d}. {entry['activity']}")
    print(f"    output, {entry['output']}")
    print(f"    rows_total, {entry['rows_total']}")
    # Empty lists are omitted so the report only shows lines with content
    if entry["found"]:
        print(f"    found, {', '.join(entry['found'])}")
    if entry["missing"]:
        print(f"    missing, {', '.join(entry['missing'])}")

total_rows = sum(entry["rows_total"] for entry in summary)
print(f"\nTotal rows across all outputs, {total_rows:,}")
print(f"Files written to, {OUT_DIR}")


=== Stacking summary ===
01. chair_squats
    output, /home/jupyter-yin10/EEG_HAR/NEW/data/2_EEG_window_matched/stacked_by_activity/a1_chair_squats.csv
    rows_total, 42853
    found, s1:8771 rows, s2:11722 rows, s3:7428 rows, s4:7115 rows, s6:7817 rows
    missing, s5:file_missing
02. light_stationary_cycling
    output, /home/jupyter-yin10/EEG_HAR/NEW/data/2_EEG_window_matched/stacked_by_activity/a2_light_stationary_cycling.csv
    rows_total, 37859
    found, s1:7641 rows, s2:5011 rows, s3:11404 rows, s4:11478 rows, s6:2325 rows
    missing, s5:file_missing
03. marching_in_place
    output, /home/jupyter-yin10/EEG_HAR/NEW/data/2_EEG_window_matched/stacked_by_activity/a3_marching_in_place.csv
    rows_total, 55246
    found, s1:10247 rows, s2:9755 rows, s3:7497 rows, s4:7730 rows, s5:11702 rows, s6:8315 rows
04. seated_boxing_hooks
    output, /home/jupyter-yin10/EEG_HAR/NEW/data/2_EEG_window_matched/stacked_by_activity/a4_seated_boxing_hooks.csv
    rows_total, 51247
    found, s1: