In [1]:
#!/usr/bin/env python3
"""
Step 0: Reproducible Environment (Colab/Jupyter adapted - top-conf/journal grade)
Generate a complete reproducible environment configuration
"""

# ===== Set environment variables directly (Colab/Jupyter env) =====
# These are assigned before numpy/torch are imported further down.
import os
import sys

os.environ["PYTHONHASHSEED"] = "0"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

print("⚠️ Note: In Jupyter/Colab, PYTHONHASHSEED must be set before the kernel starts")
print("   Suggestion: After setting environment variables, restart the runtime, then run the main code\n")

# ===== Environment variables set; continue normal flow =====
import json
import hashlib
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from contextlib import redirect_stdout
import io


def _query_nvidia_driver_version():
    """Return the first driver version reported by `nvidia-smi`, or None.

    None is returned when the tool is missing, cannot be executed, exits with
    a non-zero status, or prints nothing.
    """
    try:
        out = subprocess.run(
            ["nvidia-smi", "--query-gpu=driver_version", "--format=csv,noheader"],
            capture_output=True, text=True
        )
    except (OSError, subprocess.SubprocessError):
        return None
    if out.returncode == 0 and out.stdout.strip():
        return out.stdout.strip().splitlines()[0]
    return None


# Check Python version
assert sys.version_info >= (3, 10), f"Require Python ≥ 3.10, current: {sys.version}"

# Create output directory
output_dir = Path("artifacts/env")
output_dir.mkdir(parents=True, exist_ok=True)

# 1. Multiple random seeds (0–9)
SEEDS = list(range(10))
print(f"Configured random seeds: {SEEDS}")

# Import and configure
import random
import numpy as np
import torch

# Initialize with the first seed
random.seed(SEEDS[0])
np.random.seed(SEEDS[0])
torch.manual_seed(SEEDS[0])
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEEDS[0])
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Disable TF32
    torch.backends.cuda.matmul.allow_tf32 = False
    torch.backends.cudnn.allow_tf32 = False

# Enable strict deterministic algorithms (not using warn_only)
torch.use_deterministic_algorithms(True)

# Set matmul precision.
# "highest" keeps float32 matmuls in full float32. The previous value "high"
# permits TF32 for float32 matmuls and therefore undid the allow_tf32 = False
# setting made just above.
if hasattr(torch, 'set_float32_matmul_precision'):
    torch.set_float32_matmul_precision("highest")

# 2. Generate SEEDS.yaml (with fallback)
print("Generating SEEDS.yaml...")
seeds_config = {
    "seeds": SEEDS,
    "default_seed": SEEDS[0],
    "description": "Random seeds for python, numpy, torch, sklearn"
}
try:
    import yaml
    with open(output_dir / "SEEDS.yaml", "w") as f:
        yaml.dump(seeds_config, f, default_flow_style=False)
except ImportError:
    # Fallback if PyYAML is not installed
    yaml_content = f"""seeds: {SEEDS}
default_seed: {SEEDS[0]}
description: Random seeds for python, numpy, torch, sklearn
"""
    with open(output_dir / "SEEDS.yaml", "w") as f:
        f.write(yaml_content)

# 3. Generate requirements.txt (frozen versions)
print("Generating requirements.txt...")
result = subprocess.run(
    [sys.executable, "-m", "pip", "freeze"],
    capture_output=True, text=True
)
if result.returncode != 0:
    # Keep going (best effort), but do not hide that the snapshot is incomplete.
    print(f"  Warning: 'pip freeze' exited with code {result.returncode}; requirements.txt may be incomplete")
requirements = result.stdout
with open(output_dir / "requirements.txt", "w") as f:
    f.write(requirements)

# 4. Collect system info (for env.txt header)
import platform
system_info = []
system_info.append("="*60)
system_info.append("Environment Snapshot - System Overview")
system_info.append("="*60)
system_info.append(f"Time (UTC): {datetime.now(timezone.utc).isoformat()}")
system_info.append(f"Python: {sys.version}")
system_info.append(f"Platform: {platform.system()} {platform.release()} ({platform.machine()})")

try:
    import psutil
    system_info.append(f"CPU: {psutil.cpu_count(logical=False)} cores / {psutil.cpu_count(logical=True)} threads")
    system_info.append(f"Memory: {round(psutil.virtual_memory().total / (1024**3), 2)} GB")
except ImportError:
    pass

# Queried once and reused for both env.txt and hardware_log.json
nvidia_driver_version = _query_nvidia_driver_version() if torch.cuda.is_available() else None

system_info.append(f"PyTorch: {torch.__version__}")
if torch.cuda.is_available():
    system_info.append(f"CUDA: {torch.version.cuda}")
    system_info.append(f"cuDNN: {torch.backends.cudnn.version()}")
    if nvidia_driver_version is not None:
        system_info.append(f"NVIDIA driver: {nvidia_driver_version}")

system_info.append("\nEnvironment variables:")
for key in ["PYTHONHASHSEED", "CUBLAS_WORKSPACE_CONFIG", "OMP_NUM_THREADS",
            "MKL_NUM_THREADS", "OPENBLAS_NUM_THREADS", "NUMEXPR_NUM_THREADS"]:
    system_info.append(f"  {key}={os.environ.get(key, 'N/A')}")

system_info.append("\n" + "="*60)
system_info.append("Installed packages list")
system_info.append("="*60 + "\n")

# 5. Generate env.txt (human-readable + system summary)
print("Generating env.txt...")
result = subprocess.run(
    [sys.executable, "-m", "pip", "list"],
    capture_output=True, text=True
)
with open(output_dir / "env.txt", "w") as f:
    f.write("\n".join(system_info))
    f.write(result.stdout)

# 6. Generate environment.yml
print("Generating environment.yml...")
env_yml = f"""name: har_lara
channels:
  - defaults
  - conda-forge
dependencies:
  - python={sys.version_info.major}.{sys.version_info.minor}
  - pip
  - pip:
"""
for line in requirements.strip().split("\n"):
    if line and not line.startswith("#"):
        env_yml += f"      - {line}\n"

with open(output_dir / "environment.yml", "w") as f:
    f.write(env_yml)

# 7. Collect complete hardware information
print("Collecting hardware information...")
hardware_info = {
    "timestamp_utc": datetime.now(timezone.utc).isoformat(),
    "python_version": sys.version,
    "python_executable": sys.executable,
    "platform": sys.platform,
    "os": platform.system(),
    "os_release": platform.release(),
    "os_version": platform.version(),
    "machine": platform.machine(),
    "processor": platform.processor(),
}

try:
    import psutil
    hardware_info["cpu_count_physical"] = psutil.cpu_count(logical=False)
    hardware_info["cpu_count_logical"] = psutil.cpu_count(logical=True)
    hardware_info["memory_total_gb"] = round(psutil.virtual_memory().total / (1024**3), 2)
except ImportError:
    pass

hardware_info["torch_version"] = torch.__version__

if torch.cuda.is_available():
    hardware_info["gpu_available"] = True
    hardware_info["gpu_count"] = torch.cuda.device_count()
    hardware_info["gpu_names"] = [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())]
    hardware_info["cuda_version"] = torch.version.cuda
    hardware_info["cudnn_version"] = torch.backends.cudnn.version()

    gpu_details = []
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        gpu_details.append({
            "id": i,
            "name": props.name,
            "compute_capability": f"{props.major}.{props.minor}",
            "total_memory_gb": round(props.total_memory / (1024**3), 2),
            "multi_processor_count": props.multi_processor_count
        })
    hardware_info["gpu_details"] = gpu_details

    if nvidia_driver_version is not None:
        hardware_info["nvidia_driver_version"] = nvidia_driver_version
else:
    hardware_info["gpu_available"] = False

hardware_info["deterministic_config"] = {
    "cudnn_deterministic": torch.backends.cudnn.deterministic,
    "cudnn_benchmark": torch.backends.cudnn.benchmark,
    "use_deterministic_algorithms": True,
    "warn_only": False,
    "tf32_disabled": not torch.backends.cuda.matmul.allow_tf32 if torch.cuda.is_available() else "N/A",
    # Record the value PyTorch actually reports rather than a hard-coded string
    "float32_matmul_precision": (
        torch.get_float32_matmul_precision()
        if hasattr(torch, 'get_float32_matmul_precision') else "N/A"
    ),
    "PYTHONHASHSEED": os.environ.get("PYTHONHASHSEED"),
    "CUBLAS_WORKSPACE_CONFIG": os.environ.get("CUBLAS_WORKSPACE_CONFIG"),
    "OMP_NUM_THREADS": os.environ.get("OMP_NUM_THREADS"),
    "MKL_NUM_THREADS": os.environ.get("MKL_NUM_THREADS"),
    "OPENBLAS_NUM_THREADS": os.environ.get("OPENBLAS_NUM_THREADS"),
    "NUMEXPR_NUM_THREADS": os.environ.get("NUMEXPR_NUM_THREADS"),
}

with open(output_dir / "hardware_log.json", "w") as f:
    json.dump(hardware_info, f, indent=2)

# 8. Git commit + dirty flag
# Records the current commit hash, branch name and whether the working tree
# has uncommitted changes into git_info.json.
print("Collecting Git information...")
git_info = {}
try:
    git_commit = subprocess.run(
        ["git", "rev-parse", "HEAD"],
        capture_output=True, text=True, check=True
    ).stdout.strip()
    git_info["commit"] = git_commit

    git_branch = subprocess.run(
        ["git", "rev-parse", "--abbrev-ref", "HEAD"],
        capture_output=True, text=True, check=True
    ).stdout.strip()
    git_info["branch"] = git_branch

    # Any output from `git status --porcelain` means the tree is dirty
    dirty = subprocess.run(
        ["git", "status", "--porcelain"],
        capture_output=True, text=True
    ).stdout.strip()
    git_info["dirty"] = bool(dirty)
except (OSError, subprocess.SubprocessError):
    # OSError: the git executable could not be started.
    # SubprocessError: a command run with check=True exited non-zero.
    # A bare `except:` here would also have swallowed KeyboardInterrupt.
    git_info["commit"] = "N/A (not a git repo)"
    git_info["dirty"] = False

with open(output_dir / "git_info.json", "w") as f:
    json.dump(git_info, f, indent=2)

# 9. PyTorch build information
print("Saving PyTorch build information...")
try:
    # torch.__config__.show() returns the description as a string; stdout is
    # still captured in case a given build prints instead of returning.
    buf = io.StringIO()
    with redirect_stdout(buf):
        shown = torch.__config__.show()
    build_text = shown if isinstance(shown, str) and shown else buf.getvalue()
    (output_dir / "torch_build.txt").write_text(build_text, encoding="utf-8")
except Exception as exc:
    # Best effort: the snapshot continues without this file, but say so.
    print(f"  Warning: could not save PyTorch build information: {exc}")

# 10. Data checksums (only original archives)
# Writes one "<sha256>  <path relative to data/>" line per archive found under data/.
print("Generating data checksums...")
data_dir = Path("data")
if not data_dir.exists():
    print("  data/ directory does not exist; skipping checksums")
else:
    sha256sums = []
    archive_exts = {'.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar'}
    for file_path in sorted(data_dir.rglob("*")):
        if not file_path.is_file():
            continue
        if file_path.suffix.lower() not in archive_exts:
            continue

        # Hash in 64 KiB blocks so large archives are never fully in memory
        sha256 = hashlib.sha256()
        with open(file_path, "rb") as f:
            while True:
                block = f.read(65536)
                if block == b"":
                    break
                sha256.update(block)

        rel_path = file_path.relative_to(data_dir)
        sha256sums.append(f"{sha256.hexdigest()}  {rel_path}")

    if not sha256sums:
        print("  No archives in data/ directory; skipping checksums")
    else:
        with open(output_dir / "data_SHA256SUMS.txt", "w") as f:
            f.write("\n".join(sha256sums))
        print(f"  Generated checksums for {len(sha256sums)} archives")

# 11. Compute environment hashes of all key files
# ENV.SHA256 gets one "<sha256>  <filename>" line for each listed artifact
# that exists in the output directory; missing artifacts are skipped.
print("Computing environment hashes...")
env_files = [
    "requirements.txt",
    "environment.yml",
    "env.txt",
    "SEEDS.yaml",
    "hardware_log.json",
    "git_info.json"
]
sha256_lines = []
for filename in env_files:
    filepath = output_dir / filename
    if not filepath.exists():
        continue
    with open(filepath, "rb") as f:
        sha256 = hashlib.sha256(f.read())
    sha256_lines.append(f"{sha256.hexdigest()}  {filename}")

with open(output_dir / "ENV.SHA256", "w") as f:
    f.write("\n".join(sha256_lines))

# Output summary
# Human-readable recap of the generated artifacts and the determinism settings.
print("\n" + "="*60)
print("Step 0 complete - Reproducible environment configuration (top-conf/journal grade)")
print("="*60)
print(f"Output directory: {output_dir}/")

artifact_lines = [
    f"  ✓ SEEDS.yaml (seeds: {SEEDS})",
    "  ✓ requirements.txt",
    "  ✓ env.txt (with system summary)",
    "  ✓ environment.yml",
    "  ✓ hardware_log.json",
    f"  ✓ git_info.json (dirty={git_info.get('dirty', False)})",
    "  ✓ torch_build.txt",
    "  ✓ ENV.SHA256 (covers all key files)",
]
# The data checksum file is only listed when it was actually written
if (output_dir / "data_SHA256SUMS.txt").exists():
    artifact_lines.append("  ✓ data_SHA256SUMS.txt (archives only)")
for artifact_line in artifact_lines:
    print(artifact_line)

determinism_lines = [
    "\nStrict determinism configuration:",
    "  - torch.use_deterministic_algorithms: True (warn_only=False)",
    f"  - cudnn.deterministic: {torch.backends.cudnn.deterministic}",
    f"  - cudnn.benchmark: {torch.backends.cudnn.benchmark}",
]
if torch.cuda.is_available():
    determinism_lines.append(f"  - TF32 disabled: {not torch.backends.cuda.matmul.allow_tf32}")
determinism_lines.extend([
    "  - Environment variables set:",
    f"    PYTHONHASHSEED: {os.environ.get('PYTHONHASHSEED')}",
    f"    CUBLAS_WORKSPACE_CONFIG: {os.environ.get('CUBLAS_WORKSPACE_CONFIG')}",
    "    Thread control: OMP/MKL/OPENBLAS/NUMEXPR=1",
])
for determinism_line in determinism_lines:
    print(determinism_line)
print("="*60)

⚠️ Note: In Jupyter/Colab, PYTHONHASHSEED must be set before the kernel starts
   Suggestion: After setting environment variables, restart the runtime, then run the main code

Configured random seeds: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
Generating SEEDS.yaml...
Generating requirements.txt...
Generating env.txt...
Generating environment.yml...
Collecting hardware information...
Collecting Git information...
Saving PyTorch build information...
Generating data checksums...
  data/ directory does not exist; skipping checksums
Computing environment hashes...

Step 0 complete - Reproducible environment configuration (top-conf/journal grade)
Output directory: artifacts/env/
  ✓ SEEDS.yaml (seeds: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
  ✓ requirements.txt
  ✓ env.txt (with system summary)
  ✓ environment.yml
  ✓ hardware_log.json
  ✓ git_info.json (dirty=False)
  ✓ torch_build.txt
  ✓ ENV.SHA256 (covers all key files)

Strict determinism configuration:
  - torch.use_deterministic_algorithms: True (warn_o

In [1]:
#!/usr/bin/env python3
"""
Steps 1–2: Data Acquisition & Unpack Standardization (top-conf/journal grade)
Process the uploaded LARa MbientLab IMU archive
"""

import os
import hashlib
import zipfile
import shutil
import json
import re
import numpy as np
from pathlib import Path
from datetime import datetime, timezone
import pandas as pd

# ========== Helper functions ==========
def read_any_csv(path, nrows=None):
    """Read a delimited text file, letting pandas sniff the delimiter.

    The first attempt uses the python engine with ``sep=None``; if that
    raises, the file is read again with the default ``pd.read_csv`` settings.
    At most *nrows* data rows are read when *nrows* is given.
    """
    sniffing_options = {"sep": None, "engine": "python"}
    try:
        frame = pd.read_csv(path, nrows=nrows, **sniffing_options)
    except Exception:
        frame = pd.read_csv(path, nrows=nrows)
    return frame

def infer_sampling_rate(df):
    """Estimate the sampling rate (Hz) from the first time-like column of *df*.

    A column is time-like when its lower-cased name contains "time",
    "timestamp" or "epoch". Its values are coerced to numbers, and the unit is
    guessed from the largest absolute value among the first 1000 samples:

        >= 1e9  -> nanoseconds
        >= 1e6  -> microseconds
        >= 1e3  -> milliseconds
        else    -> seconds

    Returns:
        ``1 / median(positive time step)`` rounded to 3 decimals, or ``None``
        when there is no time-like column, fewer than 3 numeric samples, or
        no strictly positive time step.

    NOTE(review): the unit guess is purely magnitude based, so absolute epoch
    timestamps expressed in seconds (~1.7e9) or milliseconds (~1.7e12) fall
    into the nanosecond bucket — confirm this matches the data being read.
    """
    time_cols = [c for c in df.columns if re.search(r"(time|timestamp|epoch)", c.lower())]
    if not time_cols:
        return None

    t = pd.to_numeric(df[time_cols[0]], errors="coerce").dropna().to_numpy()
    if t.size < 3:
        return None

    # Infer time unit by magnitude (the former separate ">= 1e12" branch was
    # identical to the ">= 1e9" one and has been merged into it)
    max_val = np.nanmax(np.abs(t[:1000]))
    if max_val >= 1e9:       # nanoseconds
        scale = 1e-9
    elif max_val >= 1e6:     # microseconds
        scale = 1e-6
    elif max_val >= 1e3:     # milliseconds
        scale = 1e-3
    else:                    # seconds
        scale = 1.0

    dt = np.diff(t * scale)
    dt = dt[dt > 0]  # ignore repeated or backwards timestamps
    if dt.size == 0:
        return None

    # Use median for robustness against occasional gaps
    return float(np.round(1.0 / np.median(dt), 3))

def infer_sensor_type(cols_lower, filename):
    """Classify a file as "labels" or by the sensor kinds its columns mention.

    A file whose lower-cased name contains "label" or "activity" is reported
    as "labels". Otherwise the (already lower-cased) column names are scanned
    for accelerometer, gyroscope and magnetometer markers, and the matching
    tags are joined with "+" in the fixed order acc, gyro, mag. "unknown" is
    returned when nothing matches.
    """
    lowered_name = filename.lower()
    if 'label' in lowered_name or 'activity' in lowered_name:
        return "labels"

    def mentions_acc(col):
        return ("acc" in col) or ("accelerom" in col)

    def mentions_gyro(col):
        return ("gyro" in col) or re.search(r"\bgyr", col)

    def mentions_mag(col):
        return ("mag" in col) or ("magnetom" in col)

    detected = []
    for tag, matcher in (("acc", mentions_acc), ("gyro", mentions_gyro), ("mag", mentions_mag)):
        if any(matcher(col) for col in cols_lower):
            detected.append(tag)

    if not detected:
        return "unknown"
    return "+".join(detected)

# LARa placement mapping (per official docs)
# Maps the zero-padded `Lxx` token parsed from a file name to a short
# placement name. The Step 2 scan looks tokens up with
# PLACEMENT_MAP.get(token, token), so unlisted tokens pass through unchanged.
# NOTE(review): in the LARa file naming scheme (`Lxx_Sxx_Rxx`) the `L` token
# may denote the logistic scenario rather than a sensor position — confirm
# against the dataset documentation before relying on these names.
PLACEMENT_MAP = {
    "L01": "lwrist",      # Left wrist
    "L02": "rwrist",      # Right wrist
    "L03": "chest",       # Chest
    "L04": "belt",        # Belt
    "L05": "lankle",      # Left ankle
    "L06": "pocket",      # Pocket
    "L07": "lforearm",    # Left forearm
    "L08": "lupperarm",   # Left upper arm
}

# ========== Step 1: Acquire & verify ==========
print("="*60)
print("Step 1: Data acquisition & verification")
print("="*60)

# Create directory structure
raw_dir = Path("data/lara/mbientlab/raw")
raw_dir.mkdir(parents=True, exist_ok=True)

# Find uploaded zip files, trying the most specific pattern first (annotated
# MbientLab archives), then any MbientLab archive, then any zip.
# Candidates are sorted because glob order depends on the filesystem; without
# sorting, "use the first" could pick a different archive from run to run.
uploaded_files = []
for pattern in ("*annotated*MbientLab*.zip", "*MbientLab*.zip", "*.zip"):
    uploaded_files = sorted(Path(".").glob(pattern))
    if uploaded_files:
        break

if not uploaded_files:
    raise FileNotFoundError("No MbientLab data archive found; please upload a zip file first")

if len(uploaded_files) > 1:
    print(f"Warning: found multiple candidate files: {[f.name for f in uploaded_files]}")
    print(f"Using the first: {uploaded_files[0].name}")

zip_file = uploaded_files[0]
print(f"Found archive: {zip_file}")

# Copy into the raw data directory (an existing copy is left untouched)
target_zip = raw_dir / zip_file.name
if not target_zip.exists():
    shutil.copy2(zip_file, target_zip)
    print(f"Copied to: {target_zip}")
else:
    print(f"File already exists: {target_zip}")

# Compute SHA256 checksum in 64 KiB blocks
print("Computing SHA256 checksum...")
sha256_hash = hashlib.sha256()
with open(target_zip, "rb") as f:
    for chunk in iter(lambda: f.read(65536), b""):
        sha256_hash.update(chunk)

checksum = sha256_hash.hexdigest()
print(f"SHA256: {checksum}")

# Save checksum as "<sha256>  <archive name>"
sha256_file = raw_dir / "SHA256SUMS.txt"
with open(sha256_file, "w") as f:
    f.write(f"{checksum}  {target_zip.name}\n")
print(f"Saved checksum: {sha256_file}")

# Record provenance (traceability)
provenance = {
    "dataset": "LARa IMU-only / MbientLab",
    "origin": "manual-upload",
    "official_url": "https://sensor.informatik.uni-mannheim.de/#dataset_lara",
    "retrieved_at_utc": datetime.now(timezone.utc).isoformat(),
    "archive": target_zip.name,
    "sha256": checksum
}
(raw_dir / "PROVENANCE.json").write_text(
    json.dumps(provenance, indent=2, ensure_ascii=False),
    encoding="utf-8"
)
print(f"Recorded provenance info: {raw_dir / 'PROVENANCE.json'}")

# Set raw archive to read-only
os.chmod(target_zip, 0o444)
print(f"Set read-only permission: {target_zip}")

# ========== Step 2: Unpack & directory standardization ==========
print("\n" + "="*60)
print("Step 2: Unpack & directory standardization")
print("="*60)

# Extract to temp directory
temp_extract = raw_dir / "temp_extract"
temp_extract.mkdir(exist_ok=True)

print(f"Extracting {target_zip.name}...")
with zipfile.ZipFile(target_zip, 'r') as zip_ref:
    zip_ref.extractall(temp_extract)

# Scan extracted files and normalize
file_records = []
problems = []  # files whose subject/session could not be parsed from the name

# Recursively scan all CSV/TSV/TXT files (sorted so the scan order is stable)
for file_path in sorted(temp_extract.rglob("*")):
    if not file_path.is_file():
        continue

    # Process only data files
    if file_path.suffix.lower() not in ['.csv', '.tsv', '.txt']:
        continue

    # Parse filename: LARa pattern L01_S07_R01.csv
    filename = file_path.stem

    # Extract L01/L02/L03 (placement); "L00" when absent
    placement_match = re.search(r'L(\d+)', filename)
    placement_raw = f"L{placement_match.group(1).zfill(2)}" if placement_match else "L00"
    placement = PLACEMENT_MAP.get(placement_raw, placement_raw)

    # Extract S07 (subject); "S00" when absent
    subject_match = re.search(r'S(\d+)', filename)
    subject_id = f"S{subject_match.group(1).zfill(2)}" if subject_match else "S00"

    # Extract R01 (session); "R01" when absent
    session_match = re.search(r'R(\d+)', filename)
    session_id = f"R{session_match.group(1).zfill(2)}" if session_match else "R01"

    # Detect parse failures (avoid LOSO leakage).
    # Decide from the match objects rather than from the fallback values: the
    # previous value-based check missed an unparsed subject whenever the name
    # contained "R01", and wrongly flagged a genuinely parsed unpadded "R1".
    if subject_match is None or session_match is None:
        problems.append(str(file_path.relative_to(temp_extract)))

    # Create standardized directory structure
    std_dir = raw_dir / subject_id / session_id / placement
    std_dir.mkdir(parents=True, exist_ok=True)

    # Standardized filename (lowercase, underscores)
    std_filename = file_path.name.lower().replace(' ', '_').replace('-', '_')
    std_path = std_dir / std_filename

    # Copy to standardized location (existing files are not overwritten)
    if not std_path.exists():
        shutil.copy2(file_path, std_path)

    # Get file info; the defaults below are kept if the file cannot be inspected
    file_size = file_path.stat().st_size
    num_rows = 0
    sampling_rate = None
    duration = None
    sensor_type = "unknown"

    try:
        # Read sample
        df_sample = read_any_csv(file_path, nrows=2000)
        columns_lower = [c.lower() for c in df_sample.columns]

        # Infer sensor type
        sensor_type = infer_sensor_type(columns_lower, filename)

        # Infer sampling rate (skip for labels)
        if sensor_type != "labels":
            sampling_rate = infer_sampling_rate(df_sample)

        # Count total rows (streaming to avoid loading big files);
        # minus header, clamped so an empty file reports 0 rather than -1
        with open(file_path, "rb") as fh:
            num_rows = max(0, sum(1 for _ in fh) - 1)

        # Compute duration
        if sampling_rate and num_rows > 0:
            duration = round(num_rows / sampling_rate, 2)

    except Exception as exc:
        # Best effort: the file is still indexed with whatever was gathered,
        # but the failure is reported instead of being hidden.
        print(f"  Warning: could not inspect {file_path.relative_to(temp_extract)}: {exc}")

    # Record file info
    file_records.append({
        "subject_id": subject_id,
        "session_id": session_id,
        "placement": placement,
        "placement_raw": placement_raw,
        "sensor_type": sensor_type,
        "original_path": str(file_path.relative_to(temp_extract)),
        "standardized_path": str(std_path.relative_to(raw_dir)),
        "filename": std_filename,
        "file_size_bytes": file_size,
        "num_rows": num_rows,
        "sampling_rate_hz": sampling_rate,
        "duration_sec": duration,
    })

print(f"Processed {len(file_records)} files")

# Check parse failures
if problems:
    problems_file = raw_dir / "PROBLEMS.log"
    problems_file.write_text(
        "The following files could not parse subject/session (would break LOSO):\n" +
        "\n".join(problems) + "\n",
        encoding="utf-8"
    )
    raise RuntimeError(
        f"Found {len(problems)} files with unparsed subject/session; "
        f"please check {problems_file} and fix"
    )

# Remove temp extraction directory
shutil.rmtree(temp_extract)
print("Removed temporary files")

# Generate file_index (Parquet preferred; fallback to CSV)
# Initialised up front so the final summary does not raise NameError when no
# data files were found and no index was written.
saved_index = None
if file_records:
    file_index = pd.DataFrame(file_records)

    # Sort
    file_index = file_index.sort_values(
        ['subject_id', 'session_id', 'placement', 'sensor_type']
    )

    # Save index
    index_file = raw_dir / "file_index.parquet"
    try:
        file_index.to_parquet(index_file, index=False)
        saved_index = index_file
        print(f"\nGenerated file index: {saved_index}")
    except Exception as e:
        # Any Parquet write failure falls back to a CSV index
        print(f"Warning: Parquet write failed ({e}); falling back to CSV")
        index_file_csv = raw_dir / "file_index.csv"
        file_index.to_csv(index_file_csv, index=False)
        saved_index = index_file_csv
        print(f"Generated file index: {saved_index}")

    # Show dataset statistics
    print("\nDataset statistics:")
    print(f"  Number of subjects: {file_index['subject_id'].nunique()}")
    print(f"  Number of sessions: {file_index.groupby('subject_id')['session_id'].nunique().sum()}")
    print(f"  Placements: {sorted(file_index['placement'].unique().tolist())}")
    print(f"  Sensor types: {sorted(file_index['sensor_type'].unique().tolist())}")
    print(f"  Total files: {len(file_index)}")

    # Sampling rate stats (label files carry no sampling rate)
    sensor_files = file_index[file_index['sensor_type'] != 'labels']
    if not sensor_files.empty:
        rates = sensor_files['sampling_rate_hz'].dropna()
        if not rates.empty:
            print(f"  Sampling rate range: {rates.min():.1f} - {rates.max():.1f} Hz")
            print(f"  Median sampling rate: {rates.median():.1f} Hz")

    # Preview first records
    print("\nFile index preview:")
    print(file_index.head(10).to_string())
else:
    print("Warning: No data files found")

print("\n" + "="*60)
print("Steps 1–2 complete (top-conf/journal grade)")
print("="*60)
print(f"Raw data: {raw_dir}/")
print(f"Checksum: {sha256_file}")
print(f"Provenance record: {raw_dir / 'PROVENANCE.json'}")
print(f"File index: {saved_index}")
print("="*60)

Step 1: Data acquisition & verification
Found archive: IMU data (annotated) _ MbientLab.zip
Copied to: data/lara/mbientlab/raw/IMU data (annotated) _ MbientLab.zip
Computing SHA256 checksum...
SHA256: 70968b6b8874375e96671af67e31c27ccb63793f31191f86e732d40f24ac3106
Saved checksum: data/lara/mbientlab/raw/SHA256SUMS.txt
Recorded provenance info: data/lara/mbientlab/raw/PROVENANCE.json
Set read-only permission: data/lara/mbientlab/raw/IMU data (annotated) _ MbientLab.zip

Step 2: Unpack & directory standardization
Extracting IMU data (annotated) _ MbientLab.zip...
Processed 386 files
Removed temporary files

Generated file index: data/lara/mbientlab/raw/file_index.parquet

Dataset statistics:
  Number of subjects: 8
  Number of sessions: 193
  Placements: ['chest', 'lwrist', 'rwrist']
  Sensor types: ['acc+gyro', 'labels']
  Total files: 386

File index preview:
   subject_id session_id placement placement_raw sensor_type                                                original_path      

In [2]:
#!/usr/bin/env python3
"""
Step 3: Metadata & Quality Audit (top-conf/journal grade - final)
Parse subjects, activity set, sampling rate, placement, session time; empty-window cleanup
"""

import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime, timezone
import json
import re

# ========== Config ==========
MISSING_THRESHOLD = 0.05      # Missing-rate threshold 5%
GAP_THRESHOLD = 2.0           # Single-gap absolute threshold (seconds)
GAP_RATIO_THRESHOLD = 0.05    # Gap ratio threshold 5%

print("="*60, "Step 3: Metadata & Quality Audit", "="*60, sep="\n")

# Load file index: use the Parquet index when present, otherwise the CSV one
raw_dir = Path("data/lara/mbientlab/raw")
parquet_index = raw_dir / "file_index.parquet"
index_file = parquet_index if parquet_index.exists() else raw_dir / "file_index.csv"

print(f"Loading file index: {index_file}")
if index_file.suffix == '.parquet':
    file_index = pd.read_parquet(index_file)
else:
    file_index = pd.read_csv(index_file)

# Initialize variables (avoid undefined in edge cases)
subject_agg = pd.DataFrame()
meta_subjects_file = meta_sessions_file = keep_sessions_file = None

# ========== Helper functions ==========
def pick_scale(med_raw, sr_hint=None):
    """Choose the factor that converts raw time steps to seconds.

    Candidates are 1.0 (s), 1e-3 (ms), 1e-6 (µs) and 1e-9 (ns).

    With a positive *sr_hint* (Hz), the candidate whose scaled median step
    ``med_raw * s`` is closest to ``1 / sr_hint`` is returned.

    Without a hint, each candidate is scored 0 when the implied rate lies in
    5–400 Hz and ``abs(rate - 50) * 10`` otherwise; the first candidate with
    the strictly lowest score wins. Candidates giving a non-positive step are
    skipped, and 1.0 is returned when none qualifies.
    """
    candidates = [1.0, 1e-3, 1e-6, 1e-9]

    if sr_hint and sr_hint > 0:
        wanted_step = 1.0 / sr_hint

        def distance(factor):
            return abs(med_raw * factor - wanted_step)

        return min(candidates, key=distance)

    # Without hint: prefer median interval mapping into 5-400 Hz, bias toward ~50 Hz
    chosen = 1.0
    lowest = float("inf")
    for factor in candidates:
        step = med_raw * factor
        if step <= 0:
            continue
        rate = 1.0 / step
        if 5 <= rate <= 400:
            penalty = 0
        else:
            penalty = abs(rate - 50) * 10
        if penalty < lowest:
            chosen = factor
            lowest = penalty
    return chosen

def extract_time_range_and_gaps(file_path, sampling_rate_hint=None, head_rows=20000, chunksize=200000):
    """Read time column in chunks; extract range and gaps (incl. inter-chunk gaps, memory-friendly)

    The first `head_rows` rows are read to locate the time column (the first
    column whose name matches time/timestamp/epoch/ts, case-insensitive) and
    to estimate the median time step. The whole file is then streamed in
    chunks of `chunksize` rows, reading only that column.

    A step between consecutive timestamps counts as a gap when it exceeds
    either 10x the expected interval (relative threshold) or the module-level
    GAP_THRESHOLD (absolute threshold). Only the part of the step above the
    applied threshold is accumulated: the larger threshold when both are
    exceeded, otherwise the one that was exceeded.

    Args:
        file_path: path of the delimited text file to analyse.
        sampling_rate_hint: optional sampling rate in Hz; when positive it
            sets the expected interval and, for numeric timestamps, is passed
            to pick_scale to choose the time unit.
        head_rows: number of rows sampled for column/unit detection.
        chunksize: number of rows per streamed chunk.

    Returns:
        Tuple ``(start, end, gap_seconds, gap_ratio, max_gap)``:
        start/end are the first/last timestamps in seconds (scaled raw value
        for numeric columns, POSIX timestamp for datetime strings),
        gap_seconds is rounded to 2 decimals, gap_ratio (gap_seconds divided
        by end - start, 0.0 when that span is not positive) to 4 decimals and
        max_gap to 2 decimals. ``(None, None, 0.0, 0.0, 0.0)`` is returned
        when no time column is found, when neither branch can process the
        column, or when any exception is raised.

    NOTE(review): the bare `ts` alternative in the column pattern matches any
    column name containing "ts" (e.g. "counts") — confirm this is intended.
    """
    try:
        # Infer time column & unit from a small sample
        df_head = pd.read_csv(file_path, nrows=head_rows, sep=None, engine="python")
        time_cols = [c for c in df_head.columns if re.search(r"(time|timestamp|epoch|ts)", c, re.I)]
        if not time_cols:
            return None, None, 0.0, 0.0, 0.0

        c = time_cols[0]
        s = pd.to_numeric(df_head[c], errors="coerce").dropna().to_numpy()

        # Numeric timestamp branch (needs at least 3 numeric samples)
        if s.size >= 3:
            diffs = np.diff(s)
            diffs = diffs[np.isfinite(diffs) & (diffs > 0)]
            if diffs.size > 0:
                med = float(np.median(diffs))
                scale = pick_scale(med, sampling_rate_hint)
                # Expected interval: from the hint when positive, else the scaled median step
                expected = (1.0 / sampling_rate_hint) if (sampling_rate_hint and sampling_rate_hint > 0) else (med * scale)

                # OR logic: two independent thresholds
                rel_threshold = 10.0 * expected  # Relative threshold: 10× expected interval
                abs_threshold = GAP_THRESHOLD    # Absolute threshold: 2 s

                first = None   # first raw timestamp seen in the file
                last = None    # last raw timestamp seen so far
                prev = None    # last raw timestamp of the previous chunk
                gap_sec = 0.0  # accumulated gap time above threshold (seconds)
                max_gap = 0.0  # largest single above-threshold excess (seconds)

                for chunk in pd.read_csv(file_path, usecols=[c], sep=None, engine="python", chunksize=chunksize):
                    v = pd.to_numeric(chunk[c], errors="coerce").dropna().to_numpy()
                    if v.size == 0:
                        continue

                    if first is None:
                        first = v[0]

                    # Inter-chunk gaps: step from the previous chunk's last
                    # sample to this chunk's first sample
                    if prev is not None:
                        delta = (v[0] - prev) * scale
                        cond_rel = delta > rel_threshold
                        cond_abs = delta > abs_threshold

                        if cond_rel or cond_abs:
                            # If both trigger, use max (more lenient); if only one, use that one
                            if cond_rel and cond_abs:
                                base = max(rel_threshold, abs_threshold)
                            elif cond_rel:
                                base = rel_threshold
                            else:
                                base = abs_threshold

                            gap_this = delta - base
                            gap_sec += gap_this
                            max_gap = max(max_gap, gap_this)

                    # Intra-chunk gaps: same rule applied to every step in the chunk
                    d = np.diff(v) * scale
                    mask_rel = d > rel_threshold
                    mask_abs = d > abs_threshold
                    mask = mask_rel | mask_abs

                    if mask.any():
                        # Vectorized: choose the threshold triggered by each gap (use max if both)
                        both_triggered = mask_rel & mask_abs
                        thr_used = np.where(
                            both_triggered,
                            max(rel_threshold, abs_threshold),
                            np.where(mask_rel, rel_threshold, abs_threshold)
                        )
                        # thr_used has one entry per step, so it is masked like d
                        gaps = d[mask] - thr_used[mask]
                        gap_sec += float(gaps.sum())
                        max_gap = max(max_gap, float(gaps.max()))

                    prev = v[-1]
                    last = v[-1]

                if first is not None and last is not None:
                    start_sec = float(first * scale)
                    end_sec = float(last * scale)
                    total = end_sec - start_sec
                    ratio = float(gap_sec / total) if total > 0 else 0.0
                    return start_sec, end_sec, float(round(gap_sec, 2)), float(round(ratio, 4)), float(round(max_gap, 2))

        # Fallback branch: datetime strings (reached when the numeric branch
        # above did not return)
        t_head = pd.to_datetime(df_head[c], utc=True, errors="coerce").dropna()
        if t_head.size >= 3:
            med = float(t_head.diff().dt.total_seconds().dropna().median())
            if med > 0:
                # Expected interval: from the hint when positive, else the median step
                expected = (1.0 / sampling_rate_hint) if (sampling_rate_hint and sampling_rate_hint > 0) else med

                # OR logic: two independent thresholds
                rel_threshold = 10.0 * expected
                abs_threshold = GAP_THRESHOLD

                first = None   # first parsed timestamp seen in the file
                last = None    # last parsed timestamp seen so far
                prev = None    # last parsed timestamp of the previous chunk
                gap_sec = 0.0  # accumulated gap time above threshold (seconds)
                max_gap = 0.0  # largest single above-threshold excess (seconds)

                for chunk in pd.read_csv(file_path, usecols=[c], sep=None, engine="python", chunksize=chunksize):
                    tt = pd.to_datetime(chunk[c], utc=True, errors="coerce").dropna()
                    if tt.empty:
                        continue

                    if first is None:
                        first = tt.iloc[0]

                    # Inter-chunk gaps: step from the previous chunk's last
                    # sample to this chunk's first sample
                    if prev is not None:
                        delta = (tt.iloc[0] - prev).total_seconds()
                        cond_rel = delta > rel_threshold
                        cond_abs = delta > abs_threshold

                        if cond_rel or cond_abs:
                            # If both trigger, use max; if only one, use that one
                            if cond_rel and cond_abs:
                                base = max(rel_threshold, abs_threshold)
                            elif cond_rel:
                                base = rel_threshold
                            else:
                                base = abs_threshold

                            gap_this = delta - base
                            gap_sec += gap_this
                            max_gap = max(max_gap, gap_this)

                    # Intra-chunk gaps: same rule applied to every step in the chunk
                    d = tt.diff().dt.total_seconds().dropna()
                    mask_rel = d > rel_threshold
                    mask_abs = d > abs_threshold
                    mask = mask_rel | mask_abs

                    if not mask.empty and mask.any():
                        both_triggered = mask_rel & mask_abs
                        thr_used = np.where(
                            both_triggered,
                            max(rel_threshold, abs_threshold),
                            np.where(mask_rel, rel_threshold, abs_threshold)
                        )
                        gaps = d[mask].values - thr_used[mask]
                        gap_sec += float(gaps.sum())
                        max_gap = max(max_gap, float(gaps.max()))

                    prev = tt.iloc[-1]
                    last = tt.iloc[-1]

                if first is not None and last is not None:
                    total = (last - first).total_seconds()
                    ratio = float(gap_sec / total) if total > 0 else 0.0
                    return first.timestamp(), last.timestamp(), float(round(gap_sec, 2)), float(round(ratio, 4)), float(round(max_gap, 2))

        # Neither branch produced a result
        return None, None, 0.0, 0.0, 0.0

    except Exception:
        # Any read/parse error is reported as "no time information";
        # the exception itself is not logged.
        return None, None, 0.0, 0.0, 0.0

def safe_float(x, default=0.0):
    """Safely cast ``x`` to a finite float.

    Args:
        x: Value to convert (number, numeric string, NumPy scalar, ``None``, ...).
        default: Value returned when ``x`` cannot be converted or is NaN/Inf.

    Returns:
        ``float(x)`` when that is a finite number, otherwise ``default``.
    """
    try:
        value = float(x)
    except (TypeError, ValueError, OverflowError):
        # None, unparsable strings, multi-element containers, oversized ints, ...
        return default
    # Finiteness is checked AFTER the conversion so that inputs which are not
    # instances of the built-in float (np.float32('nan'), the string "inf")
    # are also replaced by the default.
    return value if np.isfinite(value) else default

# ========== 1. Parse sensor data metadata ==========
print("\n" + "=" * 60)
print("1. Parse sensor data metadata")
print("=" * 60)

# A file is a label file iff its name contains "label" (case-insensitive);
# every other indexed file is treated as a sensor recording. Rows with a
# missing filename count as "not a label" (na=False).
label_name_mask = file_index['filename'].str.contains('label', case=False, na=False)
label_files = file_index[label_name_mask].copy()
sensor_files = file_index[~label_name_mask].copy()

print(f"Sensor files: {len(sensor_files)}")
print(f"Label files: {len(label_files)}")

# Per sensor file: time span plus gap statistics. The extractor returns a
# 5-tuple (start, end, gap seconds, gap ratio, longest gap).
print("Extracting time spans and gap statistics (chunked)...")
time_records = []
for _, sensor_row in sensor_files.iterrows():
    span_start, span_end, gap_total, gap_share, gap_longest = extract_time_range_and_gaps(
        raw_dir / sensor_row['standardized_path'],
        sensor_row['sampling_rate_hz'],
    )
    time_records.append(dict(
        subject_id=sensor_row['subject_id'],
        session_id=sensor_row['session_id'],
        placement=sensor_row['placement'],
        start_time=span_start,
        end_time=span_end,
        gap_seconds=gap_total,
        gap_ratio=gap_share,
        max_gap_seconds=gap_longest,
    ))

# One row per sensor file
df_time_ranges = pd.DataFrame(time_records)

# Aggregate the per-file time records into one row per (subject_id, session_id):
# earliest start, latest end, summed gap seconds and the largest single gap.
session_time_agg = df_time_ranges.groupby(['subject_id', 'session_id']).agg({
    'start_time': 'min',
    'end_time': 'max',
    'gap_seconds': 'sum',
    'max_gap_seconds': 'max',
}).reset_index()

# Session span = latest end minus earliest start over all files of the session.
session_time_agg['session_duration_sec'] = (
    session_time_agg['end_time'] - session_time_agg['start_time']
)
# Session-level gap ratio = summed gap seconds / session span. NaN results
# (e.g. 0/0, or a span that is NaN because times are missing) become 0.0.
# NOTE(review): gap seconds are summed over every file of the session while
# the span is counted once, so with several files per session this ratio can
# exceed the per-file ratios — confirm this is intended.
session_time_agg['gap_ratio'] = (
    session_time_agg['gap_seconds'] / session_time_agg['session_duration_sec']
).fillna(0.0).infer_objects(copy=False)

# Rename in place so the columns are unambiguous once merged into the
# label-derived session table.
session_time_agg.rename(columns={
    'start_time': 'session_start_time',
    'end_time': 'session_end_time'
}, inplace=True)

# Add ISO8601 (human-readable) times
def to_iso(x):
    """Convert a POSIX timestamp (seconds) to an ISO 8601 string in UTC.

    Args:
        x: Epoch seconds (anything ``float()`` accepts); may be ``None``/NaN.

    Returns:
        ``datetime.isoformat()`` of the UTC-aware datetime, or ``None`` when
        ``x`` is missing or cannot be converted.
    """
    try:
        if pd.notna(x):
            return datetime.fromtimestamp(float(x), tz=timezone.utc).isoformat()
    except (TypeError, ValueError, OverflowError, OSError):
        # Non-numeric input, or a value outside the range the platform can
        # represent as a datetime. Anything else (e.g. KeyboardInterrupt) is
        # deliberately allowed to propagate.
        pass
    return None

# Human-readable UTC twins of the epoch-second columns (start first, then end)
for epoch_col, iso_col in (
    ('session_start_time', 'session_start_utc'),
    ('session_end_time', 'session_end_utc'),
):
    session_time_agg[iso_col] = session_time_agg[epoch_col].apply(to_iso)

print(f"Extracted time spans for {len(session_time_agg)} sessions")

# ========== 2. Parse labels & activity statistics ==========
print("\n" + "=" * 60)
print("2. Parse labels & activity statistics")
print("=" * 60)

activity_stats = []    # one dict per (label file, activity)
session_records = []   # one dict per successfully parsed label file

for _, label_row in label_files.iterrows():
    label_path = raw_dir / label_row['standardized_path']
    if not label_path.exists():
        continue

    try:
        # Delimiter is sniffed by the python engine
        df_label = pd.read_csv(label_path, sep=None, engine='python')

        # Label column priority: exact 'Class' (LARa convention), exact
        # 'class', then the first header containing "label" or "activity".
        label_col = None
        for exact_name in ('Class', 'class'):
            if exact_name in df_label.columns:
                label_col = exact_name
                break
        if label_col is None:
            label_cols = [
                c for c in df_label.columns
                if 'label' in c.lower() or 'activity' in c.lower()
            ]
            if not label_cols:
                print(f"  No label column ({df_label.columns.tolist()}): {label_path.name}")
                continue
            label_col = label_cols[0]

        labels = df_label[label_col]

        # Activity distribution (value_counts leaves missing labels out)
        activity_counts = labels.value_counts()
        total_samples = len(df_label)

        # Missing labels
        missing_count = labels.isna().sum()
        missing_rate = missing_count / total_samples if total_samples > 0 else 0

        # Identifying keys shared by the session row and its activity rows
        session_key = {
            'subject_id': label_row['subject_id'],
            'session_id': label_row['session_id'],
            'placement': label_row['placement'],
        }
        session_info = {
            **session_key,
            'total_samples': total_samples,
            'missing_samples': missing_count,
            'missing_rate': round(missing_rate, 4),
            'num_activities': len(activity_counts),
        }

        # Per-activity rows; percentage is relative to all rows of the file
        for activity, count in activity_counts.items():
            activity_stats.append({
                **session_key,
                'activity': str(activity),
                'count': int(count),
                'percentage': round(count / total_samples * 100, 2),
            })

        # Only registered once everything above succeeded
        session_records.append(session_info)

    except Exception as e:
        # Best effort: report the file and move on to the next one
        print(f"  Warning: failed to parse {label_path.name}: {e}")

print(f"Parsed {len(session_records)} sessions")

# ========== 2.1 Orphan session check ==========
# An orphan is a (subject, session) pair that has sensor files but no label file.
print("\nChecking orphan sessions...")
sess_from_sensors = set(zip(sensor_files['subject_id'], sensor_files['session_id']))
sess_from_labels = set(zip(label_files['subject_id'], label_files['session_id']))
orphans = sess_from_sensors.difference(sess_from_labels)

if orphans:
    orphan_file = raw_dir / "QA_ISSUES.log"
    orphan_lines = [f"  {s}-{r}\n" for s, r in sorted(orphans)]
    # Append mode: the log is shared with the other QA checks
    with open(orphan_file, "a", encoding="utf-8") as f:
        f.write("\nSessions with sensors but no labels (orphan sessions):\n")
        f.writelines(orphan_lines)
    print(f"⚠️  Found {len(orphans)} orphan sessions; logged to QA_ISSUES.log")

# ========== 3. Merge session metadata ==========
print("\n" + "=" * 60)
print("3. Merge session metadata")
print("=" * 60)

df_sessions = pd.DataFrame(session_records)
df_activities = pd.DataFrame(activity_stats)

# Attach the sensor-derived time span / gap columns to the label-derived
# session rows. Left join: a session without time info is kept with NaNs.
if not (df_sessions.empty or session_time_agg.empty):
    df_sessions = pd.merge(
        df_sessions,
        session_time_agg,
        how='left',
        on=['subject_id', 'session_id'],
    )
    print("Merged time span info")

# ========== 4. Data quality checks & empty-window cleanup ==========
print("\n" + "="*60)
print("4. Data quality checks & empty-window cleanup")
print("="*60)

# Bound up front so the closing "Output files" listing can always test it,
# even when no session was parsed and nothing is written here.
keep_sessions_file = None

if not df_sessions.empty:
    # Every session starts as kept. Each failed check sets keep=False and
    # records its tag in reject_reason; several tags are joined with '+'.
    df_sessions['keep'] = True
    df_sessions['reject_reason'] = ''

    # Check 1: share of missing labels above the threshold
    high_missing_mask = df_sessions['missing_rate'] > MISSING_THRESHOLD
    if high_missing_mask.any():
        df_sessions.loc[high_missing_mask, 'keep'] = False
        df_sessions.loc[high_missing_mask, 'reject_reason'] = 'high_missing_rate'
        print(f"⚠️  {high_missing_mask.sum()} sessions marked not kept due to high missing rate")

    # Check 2: time-gap ratio above the threshold. The column only exists
    # when the time-span merge ran; NaN ratios compare False and pass.
    if 'gap_ratio' in df_sessions.columns:
        high_gap_mask = df_sessions['gap_ratio'] > GAP_RATIO_THRESHOLD
        if high_gap_mask.any():
            # Sessions still kept get the tag alone; sessions already rejected
            # by an earlier check get it appended to the existing reason.
            newly_rejected = high_gap_mask & df_sessions['keep']
            also_rejected = high_gap_mask & ~df_sessions['keep']
            df_sessions.loc[newly_rejected, 'reject_reason'] = 'high_gap_ratio'
            df_sessions.loc[also_rejected, 'reject_reason'] = (
                df_sessions.loc[also_rejected, 'reject_reason'] + '+high_gap_ratio'
            )
            df_sessions.loc[high_gap_mask, 'keep'] = False
            print(f"⚠️  {high_gap_mask.sum()} sessions marked not kept due to high gap ratio")

    # Summary
    keep_count = df_sessions['keep'].sum()
    reject_count = (~df_sessions['keep']).sum()
    print(f"✓ QC result: keep {keep_count} sessions, reject {reject_count} sessions")

    # Export only the columns that exist: 'gap_ratio' is absent when the
    # time-span merge was skipped, and selecting it blindly raises KeyError.
    qc_columns = [
        c for c in ('subject_id', 'session_id', 'placement', 'keep', 'reject_reason',
                    'missing_rate', 'gap_ratio')
        if c in df_sessions.columns
    ]

    # Save keep list
    keep_sessions_file = raw_dir / "qa_keep_sessions.csv"
    df_sessions[qc_columns].to_csv(keep_sessions_file, index=False)
    print(f"✓ Saved: {keep_sessions_file}")

    # Log rejection details (append mode; utf-8 like the other writers of this log)
    if reject_count > 0:
        rejected = df_sessions[~df_sessions['keep']]
        qa_issues = raw_dir / "QA_ISSUES.log"
        with open(qa_issues, "a", encoding="utf-8") as f:
            f.write(f"\nSessions rejected by QC (total {reject_count}):\n\n")
            f.write(rejected[[c for c in qc_columns if c != 'keep']].to_string(index=False))
        print(f"  Details logged to: {qa_issues}")

# ========== 4.1 Generate file-level empty-window list ==========
# Lists every sensor FILE (not session) whose own gap ratio exceeds the threshold.
print("\nGenerating file-level empty-window list...")
if not df_time_ranges.empty:
    file_gap_ratio = df_time_ranges['gap_ratio']
    over_threshold = file_gap_ratio.notna() & (file_gap_ratio > GAP_RATIO_THRESHOLD)
    empty_segments = df_time_ranges[over_threshold].copy()

    if not empty_segments.empty:
        empty_todo_file = raw_dir / "EMPTY_SEGMENTS_TODO.csv"
        todo_columns = ['subject_id', 'session_id', 'placement',
                        'gap_seconds', 'gap_ratio', 'max_gap_seconds']
        empty_segments[todo_columns].to_csv(empty_todo_file, index=False)
        print(f"⚠️  Generated empty-segment list: {empty_todo_file} ({len(empty_segments)} files)")

# ========== 5. Generate subject-level metadata ==========
print("\n" + "="*60)
print("5. Generate subject-level metadata")
print("="*60)

# Defaults so the report, summary and output listing further down can always
# test `subject_agg.empty` / `meta_subjects_file`. Without them a run with no
# parsed sessions (or with every session rejected) fails with NameError, or,
# in a notebook, silently reuses values left over from a previous run.
subject_agg = pd.DataFrame()
meta_subjects_file = None

if not df_sessions.empty:
    # Only count kept sessions
    df_keep = df_sessions[df_sessions['keep']]

    if not df_keep.empty:
        # Aggregate by subject. Named aggregation ties each output column to
        # its source column and function instead of relying on column order.
        # total_activities is the SUM of the per-row activity counts, not the
        # number of distinct activities of the subject.
        subject_agg = df_keep.groupby('subject_id').agg(
            num_sessions=('session_id', 'nunique'),
            total_samples=('total_samples', 'sum'),
            total_missing=('missing_samples', 'sum'),
            total_duration_sec=('session_duration_sec', 'sum'),
            total_activities=('num_activities', 'sum'),
        ).reset_index()

        # Overall missing rate across all kept samples of the subject
        subject_agg['overall_missing_rate'] = (
            subject_agg['total_missing'] / subject_agg['total_samples']
        ).round(4)

        # Placement coverage: sorted, comma-joined distinct placements
        placement_coverage = df_keep.groupby('subject_id')['placement'].apply(
            lambda x: ','.join(sorted(set(x)))
        ).reset_index()
        placement_coverage.columns = ['subject_id', 'placements']

        subject_agg = subject_agg.merge(placement_coverage, on='subject_id')

        # Save subject metadata
        meta_subjects_file = raw_dir / "meta_subjects.csv"
        subject_agg.to_csv(meta_subjects_file, index=False)
        print(f"✓ Saved: {meta_subjects_file}")
        print(f"  Number of subjects: {len(subject_agg)}")

# ========== 6. Generate session-level metadata ==========
print("\n" + "="*60)
print("6. Generate session-level metadata")
print("="*60)

# Bound up front so the closing "Output files" listing can test it even when
# no session was parsed (otherwise that listing raises NameError).
meta_sessions_file = None

if not df_sessions.empty:
    # Add the sorted, comma-joined list of distinct activities per session
    if not df_activities.empty:
        activity_list = df_activities.groupby(['subject_id', 'session_id'])['activity'].apply(
            lambda x: ','.join(sorted(set(x)))
        ).reset_index()
        activity_list.columns = ['subject_id', 'session_id', 'activities']

        # Left join keeps sessions that have no activity rows
        df_sessions_full = df_sessions.merge(
            activity_list,
            on=['subject_id', 'session_id'],
            how='left'
        )
    else:
        df_sessions_full = df_sessions

    # Save session metadata (all sessions, including those rejected by QC)
    meta_sessions_file = raw_dir / "meta_sessions.csv"
    df_sessions_full.to_csv(meta_sessions_file, index=False)
    print(f"✓ Saved: {meta_sessions_file}")
    print(f"  Number of sessions: {len(df_sessions_full)}")

# ========== 7. Generate quality audit report ==========
print("\n" + "=" * 60)
print("7. Generate quality audit report")
print("=" * 60)

# The report is accumulated as a list of lines and joined when written.
qa_report = [
    "=" * 70,
    "LARa MbientLab IMU Dataset - Quality Audit Report",
    "=" * 70,
    f"Generated at: {datetime.now(timezone.utc).isoformat()}",
    f"Data path: {raw_dir}",
    "",
]

# [1] Totals over the per-subject aggregate (kept sessions only)
qa_report += ["[1. Dataset overview]", "-" * 70]
if not subject_agg.empty:
    total_hours = safe_float(subject_agg['total_duration_sec'].sum() / 3600)
    qa_report += [
        f"Number of subjects: {len(subject_agg)}",
        f"Total sessions: {subject_agg['num_sessions'].sum()}",
        f"Total duration: {total_hours:.2f} hours",
        f"Total samples: {subject_agg['total_samples'].sum():,}",
    ]
qa_report.append("")

# [2] Sampling-rate spread over all sensor files with a known rate
qa_report += ["[2. Sampling rate statistics]", "-" * 70]
if not sensor_files.empty:
    rates = sensor_files['sampling_rate_hz'].dropna()
    if not rates.empty:
        qa_report += [
            f"Sampling rate range: {rates.min():.2f} - {rates.max():.2f} Hz",
            f"Median sampling rate: {rates.median():.2f} Hz",
            f"Mode sampling rate: {rates.mode().values[0]:.2f} Hz",
        ]
qa_report.append("")

# [3] Share of kept sessions per sensor placement
qa_report += ["[3. Sensor placement coverage]", "-" * 70]
if not df_sessions.empty:
    df_keep = df_sessions[df_sessions['keep']]
    if not df_keep.empty:
        placement_dist = df_keep['placement'].value_counts()
        for placement, count in placement_dist.items():
            percentage = count / len(df_keep) * 100
            qa_report.append(f"  {placement:15s}: {count:3d} sessions ({percentage:5.1f}%)")
qa_report.append("")

# [4] Label counts per activity over ALL parsed label files, largest first
qa_report += ["[4. Activity distribution]", "-" * 70]
if not df_activities.empty:
    activity_total = (
        df_activities.groupby('activity')
        .agg({'count': 'sum'})
        .sort_values('count', ascending=False)
    )

    total_count = activity_total['count'].sum()
    qa_report += [
        f"Number of activity classes: {len(activity_total)}",
        f"Total samples: {total_count:,}",
        "",
        "Per-activity share:",
    ]
    for activity, row in activity_total.iterrows():
        percentage = row['count'] / total_count * 100
        qa_report.append(f"  {str(activity):30s}: {row['count']:8,} ({percentage:5.2f}%)")
qa_report.append("")

# [5] Thresholds in force plus missing-rate / gap statistics over all sessions
qa_report += ["[5. Data quality assessment]", "-" * 70]
if not df_sessions.empty:
    qa_report += [
        f"Missing-rate threshold: {MISSING_THRESHOLD*100}%",
        f"Gap absolute threshold: {GAP_THRESHOLD} s",
        "Gap relative threshold: 10× expected interval",
        f"Gap ratio threshold: {GAP_RATIO_THRESHOLD*100}%",
    ]

    session_missing = df_sessions['missing_rate']
    avg_miss = safe_float(session_missing.mean())
    max_miss = safe_float(session_missing.max())
    med_miss = safe_float(session_missing.median())
    qa_report += [
        f"Overall average missing rate: {avg_miss*100:.2f}%",
        f"Max missing rate: {max_miss*100:.2f}%",
        f"Median missing rate: {med_miss*100:.2f}%",
    ]

    # The gap columns are only present when the time-span merge ran
    if 'gap_ratio' in df_sessions.columns:
        avg_gap = safe_float(df_sessions['gap_ratio'].mean())
        max_gap_ratio = safe_float(df_sessions['gap_ratio'].max())
        qa_report += [
            f"Average gap ratio: {avg_gap*100:.2f}%",
            f"Max gap ratio: {max_gap_ratio*100:.2f}%",
        ]

    if 'max_gap_seconds' in df_sessions.columns:
        max_single_gap = safe_float(df_sessions['max_gap_seconds'].max())
        qa_report.append(f"Max single gap: {max_single_gap:.2f} s")

    keep_count = df_sessions['keep'].sum()
    total_count = len(df_sessions)
    pass_rate = keep_count / total_count * 100 if total_count > 0 else 0
    qa_report += [
        "",
        f"Sessions passing QC: {keep_count}/{total_count} ({pass_rate:.1f}%)",
    ]

# Point the reader at the file-level list whenever that file is on disk
if (raw_dir / "EMPTY_SEGMENTS_TODO.csv").exists():
    qa_report += [
        "",
        "[Note] Empty/abnormal segments found; see: EMPTY_SEGMENTS_TODO.csv (exclude during later sliding-window segmentation)",
    ]

qa_report.append("")

# [6] One paragraph per subject
qa_report += ["[6. Subject-level details]", "-" * 70]
if not subject_agg.empty:
    for _, subj in subject_agg.iterrows():
        qa_report += [
            f"Subject {subj['subject_id']}:",
            f"  # sessions: {subj['num_sessions']}",
            f"  Total duration: {subj['total_duration_sec']/60:.1f} minutes",
            f"  Total samples: {subj['total_samples']:,}",
            f"  Missing rate: {subj['overall_missing_rate']*100:.2f}%",
            f"  Placements: {subj['placements']}",
            "",
        ]

qa_report += ["=" * 70, "End of report", "=" * 70]

# Write the report to disk (overwriting any previous one), then echo it
qa_report_file = raw_dir / "QA_REPORT.txt"
report_text = "\n".join(qa_report)
with open(qa_report_file, "w", encoding="utf-8") as f:
    f.write(report_text)

print(f"✓ Saved quality report: {qa_report_file}")

# Also print to console
print("\n" + report_text)

# ========== 8. Generate summary JSON ==========
# Machine-readable digest of the audit; every value falls back to 0 / 0.0 / []
# when the table it is derived from is empty.
has_subjects = not subject_agg.empty
has_sessions = not df_sessions.empty

if has_sessions and df_sessions['keep'].any():
    kept_placements = sorted(df_sessions[df_sessions['keep']]['placement'].unique().tolist())
else:
    kept_placements = []

summary = {
    "generated_at_utc": datetime.now(timezone.utc).isoformat(),
    "num_subjects": int(len(subject_agg)) if has_subjects else 0,
    "num_sessions_total": len(df_sessions) if has_sessions else 0,
    "num_sessions_keep": int(df_sessions['keep'].sum()) if has_sessions else 0,
    "total_duration_hours": (
        safe_float(subject_agg['total_duration_sec'].sum() / 3600) if has_subjects else 0.0
    ),
    "missing_threshold": MISSING_THRESHOLD,
    "gap_threshold_sec": GAP_THRESHOLD,
    "gap_ratio_threshold": GAP_RATIO_THRESHOLD,
    "avg_missing_rate": safe_float(df_sessions['missing_rate'].mean()) if has_sessions else 0.0,
    "avg_gap_ratio": (
        safe_float(df_sessions['gap_ratio'].mean())
        if has_sessions and 'gap_ratio' in df_sessions.columns else 0.0
    ),
    "max_single_gap_seconds": (
        safe_float(df_sessions['max_gap_seconds'].max())
        if has_sessions and 'max_gap_seconds' in df_sessions.columns else 0.0
    ),
    "num_activities": int(len(activity_total)) if not df_activities.empty else 0,
    "placements": kept_placements,
}

summary_file = raw_dir / "qa_summary.json"
with open(summary_file, "w") as f:
    json.dump(summary, f, indent=2)

print(f"\n✓ Saved summary: {summary_file}")

print("\n" + "=" * 60)
print("Step 3 complete - Metadata & Quality Audit (top-conf/journal grade)")
print("=" * 60)
print("Output files:")
# Files that are only listed when their path variable is set (truthy)
for optional_output in (meta_subjects_file, meta_sessions_file, keep_sessions_file):
    if optional_output:
        print(f"  - {optional_output}")
# Always written by this step
print(f"  - {qa_report_file}")
print(f"  - {summary_file}")
# Listed whenever the file is present on disk
for side_name, side_note in (
    ("EMPTY_SEGMENTS_TODO.csv", "file-level empty-window list"),
    ("QA_ISSUES.log", "quality issue details"),
):
    side_path = raw_dir / side_name
    if side_path.exists():
        print(f"  - {side_path} ({side_note})")
print("=" * 60)

Step 3: Metadata & Quality Audit
Loading file index: data/lara/mbientlab/raw/file_index.parquet

1. Parse sensor data metadata
Sensor files: 193
Label files: 193
Extracting time spans and gap statistics (chunked)...
Extracted time spans for 193 sessions

2. Parse labels & activity statistics
Parsed 193 sessions

Checking orphan sessions...

3. Merge session metadata
Merged time span info

4. Data quality checks & empty-window cleanup
✓ QC result: keep 193 sessions, reject 0 sessions
✓ Saved: data/lara/mbientlab/raw/qa_keep_sessions.csv

Generating file-level empty-window list...

5. Generate subject-level metadata
✓ Saved: data/lara/mbientlab/raw/meta_subjects.csv
  Number of subjects: 8

6. Generate session-level metadata
✓ Saved: data/lara/mbientlab/raw/meta_sessions.csv
  Number of sessions: 193

7. Generate quality audit report
✓ Saved quality report: data/lara/mbientlab/raw/QA_REPORT.txt

LARa MbientLab IMU Dataset - Quality Audit Report
Generated at: 2025-11-14T18:19:25.436256+00

In [3]:
#!/usr/bin/env python3
"""
Step 4: Channel & Placement Strategy Selection (top-conf/journal grade)
Select placement, raw channels, derived channels; generate config file
"""

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import re

print("="*60)
print("Step 4: Channel & Placement Strategy Selection")
print("="*60)

# ========== Placement → Prefix allowlist (eradicate cross-placement leakage) ==========
# Maps a placement name to the column-name prefixes that may be used for it.
# Channel columns are looked up as "<prefix><suffix>" (see REQ_SUFFIX), so a
# placement can only ever be mapped to columns carrying one of its own prefixes.
PREFIX_ALLOWLIST = {
    "rwrist": ["RA_"],
    "lwrist": ["LA_"],
    "chest":  ["N_"],
    # Extensible: "rleg": ["RL_"], "lleg": ["LL_"]
}

# Standard channel name -> column-name suffix that is appended to an allowed
# prefix to form the raw column name (e.g. "RA_" + "AccelerometerX").
REQ_SUFFIX = {
    "ax": "AccelerometerX", "ay": "AccelerometerY", "az": "AccelerometerZ",
    "gx": "GyroscopeX",     "gy": "GyroscopeY",     "gz": "GyroscopeZ",
}

# Coverage threshold: required column presence ratio across files (1.0=100%, 0.95=95%)
MIN_COVERAGE = 1.0

# Input / output locations
raw_dir = Path("data/lara/mbientlab/raw")
configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

# Subject metadata written by the metadata & quality audit step
meta_subjects = pd.read_csv(raw_dir / "meta_subjects.csv")
print(f"\nLoaded subject metadata: {len(meta_subjects)} subjects")

# File index: the parquet version is preferred, CSV is the fallback
index_file = raw_dir / "file_index.parquet"
if not index_file.exists():
    index_file = raw_dir / "file_index.csv"
if index_file.suffix == '.parquet':
    file_index = pd.read_parquet(index_file)
else:
    file_index = pd.read_csv(index_file)

# Sensor files = not a label file by name, and (when the index carries a
# sensor_type column) one of the accelerometer/gyroscope types.
not_label_name = ~file_index['filename'].str.contains('label', case=False, na=False)
if 'sensor_type' in file_index.columns:
    is_imu_type = file_index['sensor_type'].isin(['acc+gyro', 'acc', 'gyro'])
    sensor_files = file_index[is_imu_type & not_label_name].copy()
else:
    sensor_files = file_index[not_label_name].copy()

print(f"Number of sensor files: {len(sensor_files)}")

# ========== 1. Analyze placement coverage ==========
print("\n" + "=" * 60)
print("1. Analyze placement coverage")
print("=" * 60)

# Data volume per placement, largest sample count first
placement_stats = (
    sensor_files.groupby('placement')
    .agg(
        num_subjects=('subject_id', 'nunique'),
        num_sessions=('session_id', 'nunique'),
        total_bytes=('file_size_bytes', 'sum'),
        total_samples=('num_rows', 'sum'),
    )
    .reset_index()
    .sort_values('total_samples', ascending=False)
)

print("\nPlacement statistics (sorted by sample count):")
print(placement_stats.to_string(index=False))

# The placement is fixed by hand for this round (right wrist)
selected_placement = "rwrist"
print(f"\nFixed placement for this round: {selected_placement}")

# Abort early if the data has no file for that placement
if not placement_stats['placement'].eq(selected_placement).any():
    raise ValueError(f"Specified placement '{selected_placement}' does not exist in the data")

# Subjects that have at least one file for the selected placement
subjects_with_selected = sensor_files.loc[
    sensor_files['placement'] == selected_placement, 'subject_id'
].unique()
print(f"Subjects with {selected_placement} data: {len(subjects_with_selected)}/{len(meta_subjects)}")

# ========== 2. Allowlist validation & channel check ==========
print("\n" + "=" * 60)
print("2. Allowlist validation & channel check")
print("=" * 60)

# Headers are only ever read from files of the selected placement
placement_files = sensor_files.loc[sensor_files['placement'].eq(selected_placement)]
print(f"Number of files for selected placement '{selected_placement}': {len(placement_files)}")

# Column prefixes permitted for this placement; an unconfigured placement
# yields an empty list and trips the assertion.
allowed_prefixes = PREFIX_ALLOWLIST.get(selected_placement, [])
assert allowed_prefixes, f"Prefix allowlist for '{selected_placement}' not configured; please add it in PREFIX_ALLOWLIST"
print(f"\nUsing placement→prefix allowlist: {selected_placement} → {allowed_prefixes}")

# Robust header-reading function
def read_cols(fp):
    """Return the header names of a delimited text file as a list.

    The delimiter is auto-detected first (``sep=None`` with the python
    engine); if that attempt raises, the file is re-read as comma-separated.
    At most the first five data rows are parsed in either case.
    """
    sniffing_options = {'nrows': 5, 'sep': None, 'engine': 'python'}
    try:
        header = pd.read_csv(fp, **sniffing_options).columns
    except Exception:
        # Fallback: assume a plain comma-separated file
        header = pd.read_csv(fp, nrows=5, sep=",").columns
    return header.tolist()

# Read headers of all files
print(f"\nRead headers of all {len(placement_files)} files to check consistency...")
all_columns_by_file = []

# Column names containing any of these tokens (case-insensitive substring
# match) are treated as time/index/label columns, not as sensor channels.
# Compiled once instead of on every column of every file.
non_data_pattern = re.compile(r'(time|timestamp|epoch|index|id|class|label)', re.I)

for _, row in placement_files.iterrows():
    fp = raw_dir / row['standardized_path']
    try:
        cols = read_cols(fp)
    except Exception as e:
        # Keep going so that every unreadable file gets listed; the assertion
        # below then stops the run with the total number of failures. Without
        # this, the first error aborted the loop and the assertion could
        # never fail.
        print(f"  ✗ Failed to read header: {fp} ({e})")
        continue
    data_cols = [c for c in cols if not non_data_pattern.search(c)]
    all_columns_by_file.append(data_cols)

# Assert all files were read successfully
assert len(all_columns_by_file) == len(placement_files), \
    f"{len(placement_files)-len(all_columns_by_file)} '{selected_placement}' files failed header reading; fix or exclude these files first"

print(f"✓ Successfully read {len(all_columns_by_file)} files")

# Show columns of the first file as a reference
if all_columns_by_file:
    print(f"\nData columns of the first file:")
    for col in all_columns_by_file[0]:
        print(f"  {col}")

# ========== 3. Build strict channel mapping (allowlist + consistency assertions) ==========
print("\n" + "="*60)
print("3. Build strict channel mapping (allowlist + consistency assertions)")
print("="*60)

def extract_prefix(col):
    """Return the leading uppercase prefix of a column name, underscore included.

    A prefix is one or more ASCII capital letters immediately followed by
    ``_`` at the very start of ``col`` (e.g. ``"RA_"``). ``None`` is returned
    when the name does not start that way.
    """
    found = re.match(r'[A-Z]+_', col)
    if found is None:
        return None
    return found.group(0)

def build_mapping_from_allowlist(allowed_prefixes, all_cols_by_file, min_coverage=1.0):
    """Map each standard channel to a raw column built as prefix + suffix.

    For every entry of the module-level ``REQ_SUFFIX`` the allowed prefixes
    are tried in order; the first candidate column present in at least
    ``min_coverage`` of the files is chosen.

    Args:
        allowed_prefixes: Column prefixes that may be used, in priority order.
        all_cols_by_file: One list of column names per file.
        min_coverage: Minimum fraction of files (0..1) that must contain a
            candidate column for it to be accepted.

    Returns:
        Tuple ``(mapping, used_prefixes, missing_files)``: standard name ->
        chosen column, the set of prefixes of the chosen columns, and
        standard name -> indices (into ``all_cols_by_file``) of the files
        lacking the chosen column (only for channels with coverage < 100%).

    Raises:
        ValueError: If ``all_cols_by_file`` is empty (coverage is undefined).
        RuntimeError: If a channel has no candidate meeting the coverage, or
            a chosen column's prefix is not in ``allowed_prefixes``.
    """
    n_files = len(all_cols_by_file)
    if n_files == 0:
        # Previously this surfaced as a bare ZeroDivisionError.
        raise ValueError("No file headers were provided; cannot compute column coverage")

    mapping = {}
    missing_files = {}

    for std, suf in REQ_SUFFIX.items():
        chosen = None
        for pfx in allowed_prefixes:
            cand = f"{pfx}{suf}"
            # Indices of the files lacking this column; coverage follows from
            # their count, so no second membership scan is needed.
            missing_idx = [i for i, cols in enumerate(all_cols_by_file) if cand not in cols]
            coverage = (n_files - len(missing_idx)) / n_files

            if coverage >= min_coverage:
                chosen = cand
                if missing_idx:
                    # Accepted below 100%: remember which files lack it
                    missing_files[std] = missing_idx
                break

        if not chosen:
            raise RuntimeError(
                f"[Consistency assertion failed] {std}: Under prefixes {allowed_prefixes}, no '{suf}' meets {min_coverage*100:.0f}% coverage. "
                f"Check raw column names or change placement/prefix allowlist."
            )

        mapping[std] = chosen

    # Prefix consistency check: all mapped columns must come from allowlist
    used_prefixes = {extract_prefix(v) for v in mapping.values()}
    if not used_prefixes.issubset(set(allowed_prefixes)):
        raise RuntimeError(
            f"[Consistency assertion failed] Final mapping prefixes {used_prefixes} are not all within allowlist {allowed_prefixes}"
        )

    return mapping, used_prefixes, missing_files

# Resolve every standard channel to a raw column of the selected placement
final_mapping, used_prefixes, missing_files = build_mapping_from_allowlist(
    allowed_prefixes, all_columns_by_file, MIN_COVERAGE
)

print("\nFinal channel mapping (standard_name <- original_column):")
for std in sorted(final_mapping):
    print(f"  {std} <- {final_mapping[std]}")

# Hard guarantees: exactly one source prefix, and it is allow-listed for the
# selected placement.
assert len(used_prefixes) == 1, f"A single prefix should be used; got {used_prefixes}"
assert next(iter(used_prefixes)) in PREFIX_ALLOWLIST[selected_placement], \
    f"Source prefix {used_prefixes} not in allowlist {PREFIX_ALLOWLIST[selected_placement]} for {selected_placement}"

print("\n✓ Consistency assertions passed:")
print(f"  - Using a single prefix: {sorted(used_prefixes)}")
print(f"  - Prefix is in the allowlist: {PREFIX_ALLOWLIST[selected_placement]}")
print(f"  - Number of files checked: {len(all_columns_by_file)}")
print(f"  - Coverage requirement: {MIN_COVERAGE*100:.0f}%")

# Channels accepted below 100% coverage: report how many files lack them
if missing_files:
    print(f"\n⚠️  The following channels are missing in some files (coverage threshold set to {MIN_COVERAGE*100:.0f}%):")
    for std, lacking_idx in missing_files.items():
        print(f"  {std}: missing in {len(lacking_idx)} files")

# ========== 4. Generate channel & placement config ==========
print("\n" + "="*60)
print("4. Generate channel & placement config")
print("="*60)

# Config content. The dict is dumped to YAML with sort_keys=False, so the key
# order written here is the key order of the resulting file.
config = {
    'dataset': 'LARa_MbientLab_IMU',
    'strategy': 'single_placement_baseline',

    # Placement configuration: the single selected placement plus every
    # placement found in the sensor file index
    'placements': {
        'selected': [selected_placement],
        'available': placement_stats['placement'].tolist(),
        'rationale': f'Fixed selection {selected_placement}, covering {len(subjects_with_selected)} subjects',
    },

    # Raw channel configuration: standard names, their raw source columns
    # (final_mapping), and the allowlist/coverage settings used to derive them
    'channels': {
        'raw': ['ax', 'ay', 'az', 'gx', 'gy', 'gz'],
        'mapping': final_mapping,
        'prefix_allowlist': PREFIX_ALLOWLIST,
        # used_prefixes holds exactly one element after the hard assertions
        'source_prefix': sorted(used_prefixes)[0],
        'min_coverage': MIN_COVERAGE,
        # Units are listed as alternatives; they are not determined here
        'description': {
            'ax': 'Accelerometer X axis (m/s² or g)',
            'ay': 'Accelerometer Y axis (m/s² or g)',
            'az': 'Accelerometer Z axis (m/s² or g)',
            'gx': 'Gyroscope X axis (rad/s or deg/s)',
            'gy': 'Gyroscope Y axis (rad/s or deg/s)',
            'gz': 'Gyroscope Z axis (rad/s or deg/s)',
        }
    },

    # Derived channel configuration (formulas are recorded as text only;
    # nothing is computed at this point)
    'derived_channels': {
        'acc_mag': {
            'formula': 'sqrt(ax^2 + ay^2 + az^2)',
            'description': 'Accelerometer vector magnitude',
        },
        'gyr_mag': {
            'formula': 'sqrt(gx^2 + gy^2 + gz^2)',
            'description': 'Gyroscope vector magnitude',
        }
    },

    # Final channel order: the six raw channels followed by the two derived ones
    'final_channels': ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag'],

    # Multi-placement fusion (reserved; currently disabled)
    'multi_placement_fusion': {
        'enabled': False,
        'strategy': None,
        'warning': 'If enabling multi-placement fusion, you must select the fusion strategy independently within each training fold to avoid cross-fold leakage',
    },

    # Rigor notes (free text stored alongside the configuration)
    'notes': [
        'Single-placement baseline: avoid cross-placement information leakage',
        'Channel mapping uses "placement→prefix allowlist + consistency assertions"; no cross-prefix voting',
        f'Consistency checked over all {len(all_columns_by_file)} {selected_placement} files',
        f'Coverage requirement: {MIN_COVERAGE*100:.0f}% (tunable tolerance)',
        'Derived channels are computed at feature-extraction stage to preserve raw data integrity',
        'Any multi-placement fusion must be chosen & validated within each LOSO fold',
    ]
}

# Save config as block-style YAML; allow_unicode keeps characters such as
# '²' and '→' unescaped in the file
config_file = configs_dir / "channels.yaml"
with open(config_file, 'w', encoding='utf-8') as f:
    yaml.dump(config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"✓ Saved config: {config_file}")

# ========== 5. Validate config (random multi-file sampling) ==========
print("\n" + "=" * 60)
print("5. Validate config")
print("=" * 60)

# Coverage of the selected placement across all sensor files
files_with_placement = sensor_files[sensor_files['placement'] == selected_placement]

print(f"\nValidate placement '{selected_placement}':")
print(f"  Files: {len(files_with_placement)}")
print(f"  Subjects: {files_with_placement['subject_id'].nunique()}")
print(f"  Sessions: {files_with_placement['session_id'].nunique()}")

# Spot-check the mapping on up to five files; random_state=0 keeps the
# sample reproducible.
verify_sample_size = min(5, len(files_with_placement))
verify_df = files_with_placement.sample(n=verify_sample_size, random_state=0)

print(f"\nValidate channel mapping (random sample of {verify_sample_size} files):")
for _, sample_file in verify_df.iterrows():
    sample_path = raw_dir / sample_file['standardized_path']
    try:
        df_verify = pd.read_csv(sample_path, nrows=100, sep=None, engine='python')

        print(f"\nFile: {sample_file['filename']}")
        all_found = True
        for std_name in ['ax', 'ay', 'az', 'gx', 'gy', 'gz']:
            if std_name not in final_mapping:
                print(f"  ✗ {std_name} (not mapped)")
                all_found = False
                continue

            orig_name = final_mapping[std_name]
            if orig_name not in df_verify.columns:
                print(f"  ✗ {std_name} <- {orig_name} (column not found)")
                all_found = False
                continue

            # Show the first value of the mapped column as a sanity check
            sample_val = df_verify[orig_name].iloc[0]
            print(f"  ✓ {std_name} <- {orig_name} (sample value: {sample_val:.4f})")

        if not all_found:
            print(f"  ⚠️  This file failed validation")

    except Exception as e:
        # Reading or formatting failed: report it and go on with the next file
        print(f"\nFile: {sample_file['filename']}")
        print(f"  ✗ Error during validation: {e}")

# Compute derived-channel examples on the first successfully validated file.
# This is a best-effort diagnostic: a file that cannot be read or used is
# reported and skipped, and the loop stops at the first file that works.
for idx, sample_file in verify_df.iterrows():
    sample_path = raw_dir / sample_file['standardized_path']
    try:
        df_verify = pd.read_csv(sample_path, nrows=100, sep=None, engine='python')
        # .get() makes an unmapped channel fail the membership test instead of
        # raising KeyError once per sampled file.
        if all(final_mapping.get(ch) in df_verify.columns for ch in ['ax', 'ay', 'az', 'gx', 'gy', 'gz']):
            # Euclidean norm of the three accelerometer axes, per row
            acc_mag = np.sqrt(
                df_verify[final_mapping['ax']].values**2 +
                df_verify[final_mapping['ay']].values**2 +
                df_verify[final_mapping['az']].values**2
            )
            # Euclidean norm of the three gyroscope axes, per row
            gyr_mag = np.sqrt(
                df_verify[final_mapping['gx']].values**2 +
                df_verify[final_mapping['gy']].values**2 +
                df_verify[final_mapping['gz']].values**2
            )

            print(f"\nDerived-channel example values (file: {sample_file['filename']}):")
            print(f"  acc_mag: min={acc_mag.min():.4f}, max={acc_mag.max():.4f}, mean={acc_mag.mean():.4f}")
            print(f"  gyr_mag: min={gyr_mag.min():.4f}, max={gyr_mag.max():.4f}, mean={gyr_mag.mean():.4f}")
            break
    except Exception as e:
        # Was a bare `except:`, which also trapped KeyboardInterrupt/SystemExit
        # and hid the reason a file was skipped. Still non-fatal by design.
        print(f"\nSkip derived-channel example for {sample_file['filename']}: {e}")
        continue

# ========== 6. Fuse check (reload config for verification) ==========
print("\n" + "="*60)
print("6. Fuse check (reload config for verification)")
print("="*60)

# Re-read the file that was just written so the check covers what is on disk,
# not the in-memory dict.
with open(config_file, "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

# Extract prefixes of all mapped columns (leading letters up to and including
# the first underscore). Each source column is matched once; columns without
# such a prefix are collected separately so they cannot slip past the check.
srcs = list(cfg["channels"]["mapping"].values())
prefix_matches = [re.match(r'^([A-Za-z]+_)', s) for s in srcs]
pfxs = {m.group(1) for m in prefix_matches if m}
unprefixed = [s for s, m in zip(srcs, prefix_matches) if not m]

# Checks raise AssertionError explicitly (same exception type as before) so
# they still run when Python is started with -O, which strips `assert`.
if unprefixed:
    raise AssertionError(f"Mapped columns without a placement prefix: {unprefixed}")

# Assertion: all channels use the same prefix
if len(pfxs) != 1:
    raise AssertionError(f"ax..gz not using a single prefix: {pfxs}")

# Assertion: prefix in allowlist
sel = cfg["placements"]["selected"][0]
allow = set(cfg["channels"]["prefix_allowlist"][sel])
if next(iter(pfxs)) not in allow:
    raise AssertionError(f"Prefix {pfxs} not in {sel} allowlist {allow}")

print(f"✓ Config fuse check passed:")
print(f"  - Reloaded config: {config_file}")
print(f"  - All channels use a single prefix: {pfxs}")
print(f"  - Prefix is in {sel} allowlist: {allow}")

# ========== 7. Summary ==========
# Human-readable recap of the configuration that was written. Everything below
# is console output only; nothing here modifies `config` or the saved file.
print("\n" + "="*60)
print("Step 4 complete - Channels & Placement Strategy")
print("="*60)
print(f"\nConfig summary:")
print(f"  Strategy: single-placement baseline")
print(f"  Fixed placement: {config['placements']['selected']}")
print(f"  Raw channels: {config['channels']['raw']}")
print(f"  Derived channels: {list(config['derived_channels'].keys())}")
print(f"  Final number of channels: {len(config['final_channels'])}")
print(f"  Prefix used: {sorted(used_prefixes)}")
print(f"  Coverage requirement: {MIN_COVERAGE*100:.0f}%")
print(f"\nConfig file: {config_file}")
# NOTE(review): the list below is a static checklist printed unconditionally;
# it describes intent and is not derived from the outcome of the checks above.
print(f"\nRigor guarantees:")
print(f"  1. ✓ Use placement→prefix allowlist (hard-coded)")
print(f"  2. ✓ Consistency assertions across all files ({len(all_columns_by_file)} files)")
print(f"  3. ✓ No cross-prefix voting; avoid mis-selection")
print(f"  4. ✓ Error out if column names don't match allowlist")
print(f"  5. ✓ Explicit assertions: single prefix + within allowlist")
print(f"  6. ✓ Abort if header reading fails")
print(f"  7. ✓ Randomly sample {verify_sample_size} files to validate mapping")
print(f"  8. ✓ Fuse check: reload config and verify prefix")
print("="*60)

Step 4: Channel & Placement Strategy Selection

Loaded subject metadata: 8 subjects
Number of sensor files: 193

1. Analyze placement coverage

Placement statistics (sorted by sample count):
placement  num_subjects  num_sessions  total_bytes  total_samples
   rwrist             8            14    685467785        1120045
    chest             7            14    595662725         972496
   lwrist             6             2     80626579         131911

Fixed placement for this round: rwrist
Subjects with rwrist data: 8/8

2. Allowlist validation & channel check
Number of files for selected placement 'rwrist': 96

Using placement→prefix allowlist: rwrist → ['RA_']

Read headers of all 96 files to check consistency...
✓ Successfully read 96 files

Data columns of the first file:
  LA_AccelerometerX
  LA_AccelerometerY
  LA_AccelerometerZ
  LA_GyroscopeX
  LA_GyroscopeY
  LA_GyroscopeZ
  LL_AccelerometerX
  LL_AccelerometerY
  LL_AccelerometerZ
  LL_GyroscopeX
  LL_GyroscopeY
  LL_Gyroscop

In [1]:
import os

"""
Step 5: Timeline Unification & Resampling (top-conf/journal grade - flawless)
Unify to 50 Hz; linear interpolation/forward-fill; align start/end
"""

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import re
import json

# ========== Config ==========
TARGET_FREQ_HZ = 50.0           # Target sampling rate
MAX_INTERP_GAP_MS = 20.0        # Maximum interpolation gap (milliseconds)
MAX_INTERP_RATIO = 0.15         # Gap coverage threshold 15% (constant; applied globally)

print("="*60)
print("Step 5: Timeline Unification & Resampling")
print("="*60)

# Load config and metadata
raw_dir = Path("data/lara/mbientlab/raw")
proc_dir = Path("data/lara/mbientlab/proc")
proc_dir.mkdir(parents=True, exist_ok=True)

# channels.yaml supplies the selected placement and the
# standardized-name -> source-column mapping used below.
configs_dir = Path("configs")
with open(configs_dir / "channels.yaml", 'r', encoding='utf-8') as f:
    channel_config = yaml.safe_load(f)

# Only the first entry of placements.selected is used.
selected_placement = channel_config['placements']['selected'][0]
channel_mapping = channel_config['channels']['mapping']
print(f"\nTarget sampling rate: {TARGET_FREQ_HZ} Hz")
print(f"Selected placement: {selected_placement}")

# Load QC results (all kept sessions)
# Keep rows flagged keep == True, then restrict to the selected placement.
qa_keep = pd.read_csv(raw_dir / "qa_keep_sessions.csv")
keep_sessions = qa_keep[qa_keep['keep'] == True].copy()
keep_sessions = keep_sessions[keep_sessions['placement'] == selected_placement].copy()

# Global processing (no per-fold dependency)
print(f"\nGlobal resampling over all kept sessions (no per-fold markers)")
print(f"  Total sessions: {len(keep_sessions)}")

# Prune switch: always ON (remove sessions with excessive gaps globally)
APPLY_PRUNE = True
# Every session is marked is_train=False, so any downstream branch keyed on
# is_train takes its "False" path.
keep_sessions['is_train'] = False  # Kept for compatibility with stats / logs

# Load file index
# Prefer the parquet index; fall back to the CSV index when it is absent.
index_file = raw_dir / "file_index.parquet"
if not index_file.exists():
    index_file = raw_dir / "file_index.csv"
file_index = pd.read_parquet(index_file) if index_file.suffix == '.parquet' else pd.read_csv(index_file)

# ========== Helper functions ==========
def detect_time_column(df):
    """Return the name of the first time-like column in ``df``, or None.

    A column qualifies when its name contains one of the tokens ``time``,
    ``timestamp``, ``epoch`` or ``ts`` (case-insensitive) delimited on each
    side by the start/end of the name or an underscore, so a name that merely
    contains "ts" as a substring is not matched.
    """
    time_token = re.compile(r'(^|_)(time|timestamp|epoch|ts)($|_)', re.I)
    candidates = list(filter(time_token.search, df.columns))
    if candidates:
        return candidates[0]
    return None

def parse_time_to_seconds(time_series):
    """Convert a time column to seconds, or return None if it cannot be parsed.

    Two interpretations are tried in order, and each is accepted only when
    more than 90% of the entries parse:

    1. Numeric values. The unit is inferred from the largest absolute value
       among the first 1000 parsed entries: > 1e17 is treated as nanoseconds,
       > 1e14 as microseconds, > 1e11 as milliseconds, anything else as
       seconds. Unparseable entries stay NaN in the result.
    2. Datetime values, parsed as UTC and returned as seconds since
       1970-01-01 UTC.
    """
    min_parsed = len(time_series) * 0.9

    as_number = pd.to_numeric(time_series, errors='coerce')
    if as_number.notna().sum() > min_parsed:
        parsed_vals = as_number.dropna().values
        magnitude = np.abs(parsed_vals[:1000]).max() if len(parsed_vals) else 0

        # Infer by 2025 Unix timestamp magnitude: (lower bound, factor to seconds)
        unit_table = (
            (1e17, 1e-9),   # nanoseconds
            (1e14, 1e-6),   # microseconds
            (1e11, 1e-3),   # milliseconds
        )
        for lower_bound, to_seconds in unit_table:
            if magnitude > lower_bound:
                return as_number * to_seconds
        return as_number    # already seconds

    as_datetime = pd.to_datetime(time_series, utc=True, errors='coerce')
    if as_datetime.notna().sum() > min_parsed:
        unix_epoch = pd.Timestamp("1970-01-01", tz='UTC')
        return (as_datetime - unix_epoch).dt.total_seconds()

    return None

def resample_sensor_data(df, time_col, data_cols, target_freq_hz=50.0, max_gap_ms=20.0):
    """Resample sensor channels onto a uniform timeline by linear interpolation.

    Rows with an unparseable time or a missing value in any of ``data_cols``
    are dropped, duplicate timestamps are reduced to their first occurrence,
    and the remaining samples are interpolated onto a grid that starts at the
    first timestamp and advances by ``1 / target_freq_hz`` seconds.

    Gaps between consecutive source samples longer than
    ``max(max_gap_ms / 1000, 1.25 / target_freq_hz)`` seconds are flagged:
    every target sample strictly inside such a gap gets ``is_in_gap = 1``; all
    of them except the first are also set to NaN and get ``is_forced_nan = 1``.

    Args:
        df: Raw sensor frame.
        time_col: Name of the time column in ``df``.
        data_cols: Names of the channel columns to resample.
        target_freq_hz: Output sampling rate.
        max_gap_ms: Gap threshold in milliseconds (see above).

    Returns:
        Tuple ``(resampled_df, interp_ratio, gap_points, gap_time_fraction,
        orig_freq_hz, time_clean)`` where ``resampled_df`` has columns
        ``time_sec``, the data columns, ``is_in_gap`` and ``is_forced_nan``;
        ``interp_ratio`` is the fraction of target samples inside gaps;
        ``gap_points`` their count; ``gap_time_fraction`` the summed gap
        duration over the total duration; ``orig_freq_hz`` the rate implied by
        the median source interval; ``time_clean`` the cleaned, sorted,
        de-duplicated source times. When fewer than two distinct valid
        timestamps exist the tuple is ``(None, 0.0, 0, 0.0, 0.0, None)``.

    Raises:
        ValueError: If the time column cannot be parsed.
    """
    failed = (None, 0.0, 0, 0.0, 0.0, None)

    time_sec = parse_time_to_seconds(df[time_col])
    if time_sec is None:
        raise ValueError("Unable to parse time column")

    valid_mask = time_sec.notna() & df[data_cols].notna().all(axis=1)
    time_clean = time_sec[valid_mask].values
    data_clean = df.loc[valid_mask, data_cols].values

    if len(time_clean) < 2:
        return failed

    # De-duplicate + sort: np.unique returns the unique stamps in ascending
    # order together with the index of each first occurrence, so indexing with
    # it both removes duplicates and sorts (no extra argsort needed).
    unique_idx = np.unique(time_clean, return_index=True)[1]
    time_clean = time_clean[unique_idx]
    data_clean = data_clean[unique_idx]

    # All-duplicate input collapses to one sample: nothing to interpolate
    # between, and the median interval below would be NaN.
    if len(time_clean) < 2:
        return failed

    # Original frequency
    dt_orig = np.median(np.diff(time_clean))
    orig_freq_hz = 1.0 / dt_orig if dt_orig > 0 else 0.0

    # Build target timeline with integer number of samples
    dt = 1.0 / target_freq_hz
    t_start = time_clean[0]
    t_end = time_clean[-1]
    n_samples = int(np.round((t_end - t_start) / dt))
    target_time = t_start + np.arange(n_samples + 1) * dt

    # Linear interpolation, one channel at a time
    resampled_data = np.zeros((len(target_time), len(data_cols)))
    for i in range(len(data_cols)):
        resampled_data[:, i] = np.interp(target_time, time_clean, data_clean[:, i])

    # Large-gap detection (account for jitter)
    max_gap_sec = max(max_gap_ms / 1000.0, 1.25 * dt)
    gap_starts = np.flatnonzero(np.diff(time_clean) > max_gap_sec)

    is_in_gap = np.zeros(len(target_time), dtype=int)
    is_forced_nan = np.zeros(len(target_time), dtype=int)
    total_gap_time = 0.0

    # Visit only the gaps rather than every source sample.
    for i in gap_starts:
        t_gap_start = time_clean[i]
        t_gap_end = time_clean[i + 1]
        total_gap_time += t_gap_end - t_gap_start

        # target_time is ascending, so the samples strictly inside the gap
        # form the contiguous index range [lo, hi).
        lo = np.searchsorted(target_time, t_gap_start, side='right')
        hi = np.searchsorted(target_time, t_gap_end, side='left')

        is_in_gap[lo:hi] = 1
        # The first in-gap sample keeps its interpolated value; the rest are
        # blanked. Both slices are empty when the range has < 2 samples.
        is_forced_nan[lo + 1:hi] = 1
        resampled_data[lo + 1:hi, :] = np.nan

    # Gap coverage
    gap_points = int(is_in_gap.sum())
    interp_ratio = gap_points / len(target_time) if len(target_time) > 0 else 0.0

    # Gap time fraction
    total_duration = t_end - t_start
    gap_time_fraction = total_gap_time / total_duration if total_duration > 0 else 0.0

    resampled_df = pd.DataFrame(resampled_data, columns=data_cols)
    resampled_df.insert(0, 'time_sec', target_time)
    resampled_df['is_in_gap'] = is_in_gap
    resampled_df['is_forced_nan'] = is_forced_nan

    return resampled_df, interp_ratio, gap_points, gap_time_fraction, orig_freq_hz, time_clean

def resample_labels(df_label, df_sensor_time_clean, label_col, target_time, label_time_col=None):
    """Resample labels onto ``target_time`` by holding the most recent label.

    Each target timestamp receives the label of the latest label sample whose
    time is <= that timestamp. Target timestamps before the first or after the
    last label time are set to NaN.

    Args:
        df_label: Label frame.
        df_sensor_time_clean: Array of cleaned sensor times (seconds). Used
            only when ``label_time_col`` is None, in which case label row i is
            paired positionally with sensor time i.
        label_col: Name of the label column in ``df_label``.
        target_time: Array of output timestamps (seconds).
        label_time_col: Name of the label frame's own time column, if any.

    Returns:
        Array with one label per entry of ``target_time``. Integer and boolean
        labels are returned as float64 so that NaN can be represented; an
        all-NaN float array is returned when no valid label exists.

    Raises:
        ValueError: If the label time column cannot be parsed, if no time
            source is available, or if (without a label time column) the label
            and sensor row counts differ by more than 1% of the shorter one.
    """
    if label_time_col is not None:
        time_sec = parse_time_to_seconds(df_label[label_time_col])
        if time_sec is None:
            raise ValueError("Unable to parse label time column")

        valid_mask = time_sec.notna() & df_label[label_col].notna()
        time_clean = time_sec[valid_mask].values
        labels_clean = df_label.loc[valid_mask, label_col].values
    else:
        # Use cleaned sensor time as reference
        sensor_time_original = df_sensor_time_clean
        if sensor_time_original is None:
            raise ValueError("Labels have no time column and no sensor time provided")

        min_len = min(len(df_label), len(sensor_time_original))
        if abs(len(df_label) - len(sensor_time_original)) > min_len * 0.01:
            raise ValueError(
                f"Label rows ({len(df_label)}) differ too much from sensor rows ({len(sensor_time_original)})"
            )

        # Positional pairing: truncate both to the common length.
        time_clean = sensor_time_original[:min_len]
        labels_clean = df_label[label_col].iloc[:min_len].values

        valid_mask = pd.notna(labels_clean)
        time_clean = time_clean[valid_mask]
        labels_clean = labels_clean[valid_mask]

    if len(time_clean) == 0:
        return np.full(len(target_time), np.nan)

    # Explicit sorting. A stable sort keeps the original row order among equal
    # timestamps, so which duplicate "wins" below is deterministic.
    order = np.argsort(time_clean, kind='stable')
    time_clean = time_clean[order]
    labels_clean = labels_clean[order]

    # Index of the last label time <= each target time (clipped into range;
    # out-of-range targets are blanked below).
    idx = np.searchsorted(time_clean, target_time, side='right') - 1
    idx = np.clip(idx, 0, len(time_clean) - 1)

    labels = labels_clean[idx].copy()

    # Make the array able to hold NaN: integers and booleans become float64
    # (assigning NaN into a bool array would silently store True), and
    # fixed-width string arrays become object arrays.
    if labels.dtype.kind in ('i', 'u', 'b'):
        labels = labels.astype('float64')
    elif labels.dtype.kind in ('U', 'S'):
        labels = labels.astype(object)

    # Boundary NaNs
    mask_before = target_time < time_clean[0]
    mask_after = target_time > time_clean[-1]
    labels[mask_before | mask_after] = np.nan

    return labels

# ========== 1. Process all sessions ==========
print("\n" + "="*60)
print("1. Resampling")
print("="*60)

# resampled_records: one resampled DataFrame per successfully processed session
# interp_stats:      one stats dict per session that passed the gap check
# issues:            one dict per recorded problem
resampled_records = []
interp_stats = []
issues = []

for idx, session in keep_sessions.iterrows():
    subject_id = session['subject_id']
    session_id = session['session_id']
    placement = session['placement']
    is_train = session['is_train']

    # is_train is set to False for every row earlier in this cell, so the tag
    # printed here is always "[TEST]".
    print(f"\nProcessing {subject_id}/{session_id}/{placement} {'[TRAIN]' if is_train else '[TEST]'}...")

    # Locate the sensor file and the label file of this session in the index;
    # they are told apart by whether the filename contains "label".
    sensor_file = file_index[
        (file_index['subject_id'] == subject_id) &
        (file_index['session_id'] == session_id) &
        (file_index['placement'] == placement) &
        (~file_index['filename'].str.contains('label', case=False, na=False))
    ]

    label_file = file_index[
        (file_index['subject_id'] == subject_id) &
        (file_index['session_id'] == session_id) &
        (file_index['placement'] == placement) &
        (file_index['filename'].str.contains('label', case=False, na=False))
    ]

    # NOTE(review): this skip (and the "no time column", "missing columns" and
    # "resampling failed" skips below) is printed but not appended to
    # `issues`, so the final "Skipped/failed" count does not include it.
    if sensor_file.empty or label_file.empty:
        print(f"  Skip: missing files")
        continue

    # When several index rows match, only the first one is used.
    sensor_path = raw_dir / sensor_file.iloc[0]['standardized_path']
    label_path = raw_dir / label_file.iloc[0]['standardized_path']

    try:
        df_sensor = pd.read_csv(sensor_path, sep=None, engine='python')
        time_col = detect_time_column(df_sensor)
        if not time_col:
            print(f"  Skip: no time column")
            continue

        # Source column names for the six standardized channels, in fixed order.
        data_cols = [channel_mapping[std] for std in ['ax', 'ay', 'az', 'gx', 'gy', 'gz']]
        missing_cols = [c for c in data_cols if c not in df_sensor.columns]
        if missing_cols:
            print(f"  Skip: missing columns {missing_cols}")
            continue

        print(f"  Resampling sensors ({len(df_sensor)} rows)...")
        result = resample_sensor_data(
            df_sensor, time_col, data_cols, TARGET_FREQ_HZ, MAX_INTERP_GAP_MS
        )

        # resample_sensor_data returns None as first element when it has too
        # few valid samples to work with.
        if result[0] is None:
            print(f"  Skip: resampling failed")
            continue

        # Receive cleaned time for labels
        resampled_sensor, interp_ratio, gap_points, gap_time_frac, orig_freq, sensor_time_clean = result

        # A row counts as valid when all six channels are non-NaN.
        valid_samples = resampled_sensor[data_cols].notna().all(axis=1).sum()
        nan_samples = len(resampled_sensor) - valid_samples
        forced_nan_points = int(resampled_sensor['is_forced_nan'].sum())

        print(f"  → {len(resampled_sensor)} rows, gap coverage: {interp_ratio*100:.2f}%, NaN: {nan_samples}")

        # Prune based on global switch
        # The issue is always recorded; the session is dropped only when
        # APPLY_PRUNE is set.
        if interp_ratio > MAX_INTERP_RATIO:
            msg = f"Gap coverage too high ({interp_ratio*100:.1f}%)"
            print(f"  ⚠️  {msg}")
            issues.append({
                'subject_id': subject_id,
                'session_id': session_id,
                'placement': placement,
                'is_train': is_train,
                'issue': 'high_gap_coverage',
                'gap_coverage': round(interp_ratio, 4),
            })
            if APPLY_PRUNE:
                continue

        # NOTE(review): stats are recorded before label handling, so a session
        # that later fails on labels still appears in interp_stats although it
        # is not added to resampled_records.
        interp_stats.append({
            'subject_id': subject_id,
            'session_id': session_id,
            'placement': placement,
            'is_train': is_train,
            'original_samples': len(df_sensor),
            'original_freq_hz': round(orig_freq, 2),
            'resampled_samples': len(resampled_sensor),
            'valid_samples': valid_samples,
            'nan_samples': nan_samples,
            'gap_points': gap_points,
            'gap_coverage': round(interp_ratio, 4),
            'gap_time_fraction': round(gap_time_frac, 4),
            'forced_nan_points': forced_nan_points,
        })

        df_label = pd.read_csv(label_path, sep=None, engine='python')

        # Find the label column: exact well-known names first ...
        label_col = None
        for col_candidate in ['Class', 'class', 'label', 'Label', 'activity', 'Activity']:
            if col_candidate in df_label.columns:
                label_col = col_candidate
                break

        # ... then the first column whose lower-cased name contains a keyword.
        if not label_col:
            for col in df_label.columns:
                if any(kw in col.lower() for kw in ['label', 'activity', 'class', 'action']):
                    label_col = col
                    break

        if not label_col:
            print(f"  Skip: no label column")
            issues.append({
                'subject_id': subject_id,
                'session_id': session_id,
                'placement': placement,
                'is_train': is_train,
                'issue': 'no_label_column',
            })
            continue

        # Labels are resampled onto the same timeline as the sensor data.
        label_time_col = detect_time_column(df_label)
        target_time = resampled_sensor['time_sec'].values

        print(f"  Resampling labels...")
        try:
            # With a label time column the labels carry their own timestamps;
            # otherwise they are paired positionally with the cleaned sensor times.
            if label_time_col:
                resampled_labels = resample_labels(
                    df_label, sensor_time_clean, label_col, target_time,
                    label_time_col=label_time_col
                )
            else:
                resampled_labels = resample_labels(
                    df_label, sensor_time_clean, label_col, target_time
                )

            resampled_sensor['label'] = resampled_labels

        except Exception as e:
            print(f"  Skip: label resampling failed - {e}")
            issues.append({
                'subject_id': subject_id,
                'session_id': session_id,
                'placement': placement,
                'is_train': is_train,
                'issue': 'label_resample_error',
                'error': str(e),  # include error details
            })
            continue

        # Rename source columns to the standardized channel names.
        resampled_sensor.rename(columns={
            channel_mapping['ax']: 'ax',
            channel_mapping['ay']: 'ay',
            channel_mapping['az']: 'az',
            channel_mapping['gx']: 'gx',
            channel_mapping['gy']: 'gy',
            channel_mapping['gz']: 'gz',
        }, inplace=True)

        # Prepend the session identifiers as the first three columns.
        resampled_sensor.insert(0, 'subject_id', subject_id)
        resampled_sensor.insert(1, 'session_id', session_id)
        resampled_sensor.insert(2, 'placement', placement)

        resampled_records.append(resampled_sensor)
        print(f"  ✓ Done")

    except Exception as e:
        # Catch-all for this session: record the error and go on with the next.
        print(f"  ✗ Error: {e}")
        issues.append({
            'subject_id': subject_id,
            'session_id': session_id,
            'placement': placement,
            'is_train': is_train,
            'issue': 'processing_error',
            'error': str(e),  # include error details
        })

print(f"\nSuccessfully processed: {len(resampled_records)} sessions")
# This is the number of recorded issue entries (see the NOTE on unrecorded
# skips above).
print(f"Skipped/failed: {len(issues)} sessions")

# ========== 2. Combine & save ==========
print("\n" + "="*60)
print("2. Combine & Save")
print("="*60)

if resampled_records:
    # Stack all per-session frames into one table with a fresh 0..n-1 index.
    df_all = pd.concat(resampled_records, ignore_index=True)

    # Optimization: cast dtypes (reduce size)
    for c in ['ax', 'ay', 'az', 'gx', 'gy', 'gz']:
        df_all[c] = df_all[c].astype('float32')
    df_all['time_sec'] = df_all['time_sec'].astype('float64')  # Keep high precision for time

    output_file = proc_dir / "resampled.parquet"

    # Remove any previous output first. With partition_cols the target is
    # written as a directory tree, hence the directory/file distinction.
    if output_file.exists():
        import shutil
        if output_file.is_dir():
            shutil.rmtree(output_file)
        else:
            output_file.unlink()
        print(f"Removed old data: {output_file}")

    df_all.to_parquet(
        output_file,
        index=False,
        partition_cols=['subject_id', 'placement'],
        engine='pyarrow'
    )
    print(f"✓ Saved: {output_file}")
    print(f"  Total rows: {len(df_all):,}")
    print(f"  # subjects: {df_all['subject_id'].nunique()}")
    print(f"  # sessions: {df_all.groupby(['subject_id', 'session_id']).ngroups}")

    # A row is valid when all six channels are non-NaN.
    valid_mask = df_all[['ax', 'ay', 'az', 'gx', 'gy', 'gz']].notna().all(axis=1)
    print(f"  Valid samples: {valid_mask.sum():,} ({valid_mask.sum()/len(df_all)*100:.1f}%)")
    print(f"  Samples with NaN: {(~valid_mask).sum():,}")

    print("\nData preview:")
    print(df_all.head(10).to_string())

    print("\nNumeric column stats (valid samples):")
    numeric_cols = ['ax', 'ay', 'az', 'gx', 'gy', 'gz']
    print(df_all.loc[valid_mask, numeric_cols].describe().round(4))
else:
    print("Warning: No data to save")

# ========== 3. Save statistics ==========
if interp_stats:
    df_interp = pd.DataFrame(interp_stats)
    interp_file = proc_dir / "resample_stats.csv"
    df_interp.to_csv(interp_file, index=False)
    print(f"\n✓ Saved stats: {interp_file}")

    # is_train is False for every session in this cell, so .any() is False
    # and the else-branch (overall statistics only) is the one that runs.
    if 'is_train' in df_interp.columns and df_interp['is_train'].any():
        train_stats = df_interp[df_interp['is_train']]
        print(f"\nGap statistics (train fold):")
        print(f"  Mean gap coverage: {train_stats['gap_coverage'].mean()*100:.2f}%")
        print(f"  Max gap coverage: {train_stats['gap_coverage'].max()*100:.2f}%")
        print(f"  Mean gap time fraction: {train_stats['gap_time_fraction'].mean()*100:.2f}%")

        print(f"\nGap statistics (overall):")
        print(f"  Mean gap coverage: {df_interp['gap_coverage'].mean()*100:.2f}%")
        print(f"  Max gap coverage: {df_interp['gap_coverage'].max()*100:.2f}%")
    else:
        print(f"\nGap statistics:")
        print(f"  Mean gap coverage: {df_interp['gap_coverage'].mean()*100:.2f}%")
        print(f"  Max gap coverage: {df_interp['gap_coverage'].max()*100:.2f}%")

# Persist the recorded issues, if any, next to the stats file.
if issues:
    df_issues = pd.DataFrame(issues)
    issues_file = proc_dir / "resample_issues.csv"
    df_issues.to_csv(issues_file, index=False)
    print(f"\n⚠️  Saved issue records: {issues_file} ({len(issues)} items)")

# Closing banner: a static changelog printed unconditionally.
print("\n" + "="*60)
print("Step 5 complete - Flawless version")
print("="*60)
print(f"\nFinal fixes:")
print(f"  1. ✓ Prune switch (always ON; global high-gap sessions removed)")
print(f"  2. ✓ Label time harmonized (reuse cleaned time)")
print(f"  3. ✓ Complete error information (\"error\" field)")
print(f"  4. ✓ Comment fix (constant threshold 0.15)")
print(f"  5. ✓ Type optimization (float32/float64)")
print("="*60)

Step 5: Timeline Unification & Resampling

Target sampling rate: 50.0 Hz
Selected placement: rwrist

Global resampling over all kept sessions (no per-fold markers)
  Total sessions: 96

1. Resampling

Processing S07/R03/rwrist [TEST]...
  Resampling sensors (11758 rows)...
  → 5879 rows, gap coverage: 0.00%, NaN: 0
  Resampling labels...
  ✓ Done

Processing S07/R05/rwrist [TEST]...
  Resampling sensors (11766 rows)...
  → 5883 rows, gap coverage: 0.00%, NaN: 0
  Resampling labels...
  ✓ Done

Processing S07/R06/rwrist [TEST]...
  Resampling sensors (11838 rows)...
  → 5919 rows, gap coverage: 0.00%, NaN: 0
  Resampling labels...
  ✓ Done

Processing S07/R07/rwrist [TEST]...
  Resampling sensors (11795 rows)...
  → 5898 rows, gap coverage: 0.00%, NaN: 0
  Resampling labels...
  ✓ Done

Processing S07/R08/rwrist [TEST]...
  Resampling sensors (11804 rows)...
  → 5902 rows, gap coverage: 0.00%, NaN: 0
  Resampling labels...
  ✓ Done

Processing S07/R09/rwrist [TEST]...
  Resampling senso

In [2]:
import os

"""
Step 6: Sensor Preprocessing (top-conf/journal grade - final fixed version)
Accelerometer high-pass to remove gravity; gyroscope denoising; adaptive ±Nσ clipping (target 1%)
Global version: no FOLD_ID required; thresholds estimated on all data.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import json
from scipy import signal

# ========== Config ==========
# Accelerometer high-pass (remove gravity)
ACC_HPF_CUTOFF_HZ = 0.3      # Cutoff frequency
ACC_HPF_ORDER = 2            # Filter order

# Gyroscope low-pass (denoise)
GYR_LPF_CUTOFF_HZ = 20.0     # Cutoff frequency
GYR_LPF_ORDER = 2            # Filter order

# Adaptive clipping threshold (auto-tuned to target clipping rate)
TARGET_CLIP_RATE = 0.01      # Target clipping rate 1% (sum of both tails)

# Sampling rate (from Step 5)
# Must match the rate the data was resampled to; it is not read from the data.
SAMPLING_RATE_HZ = 50.0

# Unit conversions
DEG2RAD = np.pi / 180.0
G_TO_MS2 = 9.80665

print("="*60)
print("Step 6: Sensor Preprocessing")
print("="*60)

# Load data
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")

print(f"\nLoading resampled data: {proc_dir / 'resampled.parquet'}")
df = pd.read_parquet(proc_dir / "resampled.parquet")

print(f"Data shape: {df.shape}")
print(f"Number of subjects: {df['subject_id'].nunique()}")
print(f"Number of sessions: {df.groupby(['subject_id', 'session_id'], observed=True).ngroups}")

# ========== 0. Unit normalization ==========
print("\n" + "="*60)
print("0. Unit normalization")
print("="*60)

# NOTE(review): the scaling below is applied unconditionally; it assumes the
# stored accelerometer values are in g and the gyroscope values in deg/s.
# Confirm against the raw data, since nothing here checks the source units.
acc_channels = ['ax', 'ay', 'az']
print(f"\nAccelerometer unit conversion: g → m/s²")
for ch in acc_channels:
    if ch in df.columns:
        # Scale only the non-NaN entries, in place.
        mask = df[ch].notna()
        df.loc[mask, ch] = df.loc[mask, ch] * G_TO_MS2
print(f"✓ Conversion factor: {G_TO_MS2:.5f}")

gyr_channels = ['gx', 'gy', 'gz']
print(f"\nGyroscope unit conversion: deg/s → rad/s")
for ch in gyr_channels:
    if ch in df.columns:
        # Scale only the non-NaN entries, in place.
        mask = df[ch].notna()
        df.loc[mask, ch] = df.loc[mask, ch] * DEG2RAD
print(f"✓ Conversion factor: π/180 = {DEG2RAD:.6f}")

# ========== Helper functions ==========
def design_highpass_filter(cutoff_hz, fs_hz, order=2):
    """Return the ``(b, a)`` coefficients of a digital Butterworth high-pass.

    The cutoff is passed to ``scipy.signal.butter`` as a fraction of the
    Nyquist frequency (half of ``fs_hz``).
    """
    nyquist_hz = 0.5 * fs_hz
    numerator, denominator = signal.butter(
        order, cutoff_hz / nyquist_hz, btype='high', analog=False
    )
    return numerator, denominator

def design_lowpass_filter(cutoff_hz, fs_hz, order=2):
    """Return the ``(b, a)`` coefficients of a digital Butterworth low-pass.

    The cutoff is passed to ``scipy.signal.butter`` as a fraction of the
    Nyquist frequency (half of ``fs_hz``).
    """
    nyquist_hz = 0.5 * fs_hz
    numerator, denominator = signal.butter(
        order, cutoff_hz / nyquist_hz, btype='low', analog=False
    )
    return numerator, denominator

def filtfilt_nan_safe(x, b, a):
    """Zero-phase filtering tolerant to NaN (filter each contiguous finite run).

    Non-finite entries split ``x`` into runs of consecutive finite samples.
    Each run is filtered on its own and written back in place of the original
    values; non-finite entries are left untouched.

    Runs longer than the edge padding use ``scipy.signal.filtfilt``; shorter
    runs fall back to an unpadded forward pass followed by a backward pass
    with ``lfilter``.

    Args:
        x: 1-D array of samples.
        b: Filter numerator coefficients.
        a: Filter denominator coefficients.

    Returns:
        A new array shaped like ``x`` holding the filtered values. The input
        is never modified or returned by reference.
    """
    x = np.asarray(x)
    y = x.copy()
    good = np.isfinite(x)

    if not good.any():
        return y

    # Split the indices of finite samples wherever they stop being consecutive.
    idx = np.flatnonzero(good)
    cuts = np.flatnonzero(np.diff(idx) > 1) + 1
    runs = np.split(idx, cuts)

    # Same value as filtfilt's default padlen. filtfilt raises ValueError
    # unless the input is strictly longer than padlen, so the length test
    # below must use this exact number (the previous 3 * (ntaps - 1) let runs
    # slightly longer than that through and they crashed).
    padlen = 3 * max(len(a), len(b))

    for run in runs:
        seg = x[run]

        if len(seg) > padlen:
            y[run] = signal.filtfilt(b, a, seg, padlen=padlen, method="pad")
        else:
            # Too short to pad: plain forward then backward pass.
            forward = signal.lfilter(b, a, seg)
            y[run] = signal.lfilter(b, a, forward[::-1])[::-1]

    return y

def apply_filter_by_session(df, channels, b, a):
    """Apply zero-phase filtering grouped by session (include placement grouping + sorting).

    Rows are grouped by (subject_id, session_id, placement); each group is
    sorted by ``time_sec`` and every requested channel present in the frame is
    filtered with ``filtfilt_nan_safe``, so the filter never runs across a
    session boundary.

    Args:
        df: Frame with the three grouping columns, ``time_sec`` and the
            channel columns.
        channels: Channel column names to filter; names absent from ``df``
            are ignored.
        b: Filter numerator coefficients.
        a: Filter denominator coefficients.

    Returns:
        A new frame with the groups concatenated in group-key order and a
        fresh 0..n-1 index. If there are no groups (e.g. ``df`` is empty) an
        empty frame with the same columns is returned.
    """
    group_keys = ['subject_id', 'session_id', 'placement']
    filtered_groups = []

    for _, group in df.groupby(group_keys, observed=True):
        group = group.sort_values('time_sec').copy()

        for ch in channels:
            if ch not in group.columns:
                continue
            group[ch] = filtfilt_nan_safe(group[ch].values, b, a)

        filtered_groups.append(group)

    # pd.concat raises ValueError on an empty list; hand back an empty frame
    # with the input's columns instead.
    if not filtered_groups:
        return df.iloc[0:0].copy()

    return pd.concat(filtered_groups, ignore_index=True)

def compute_clip_thresholds_target(df, channels, target_rate=0.01, use_robust=True):
    """Derive symmetric clipping bounds that hit a target clipping rate (Scheme A).

    For each channel, a center and a scale are estimated from the non-NaN
    values, every value's absolute deviation is expressed in units of that
    scale, and ``k`` is taken as the ``1 - target_rate`` quantile of those
    deviations. The bounds are ``center ± k * scale``.

    Args:
        df: Frame holding the channel columns.
        channels: Channel names to process; names absent from ``df`` and
            channels with no non-NaN values are skipped.
        target_rate: target total clipping rate for both tails (e.g., 0.01 = 1%)
        use_robust: if True, use Median±k·(1.4826·MAD); otherwise Mean±k·Std

    Returns:
        Dict mapping channel name to a dict with keys ``center``, ``scale``,
        ``k``, ``lower``, ``upper`` (floats) and ``method`` (description).
    """
    scale_floor = 1e-6   # keeps the scale strictly positive for constant signals
    result = {}

    for name in channels:
        if name not in df.columns:
            continue

        values = df[name].dropna().values
        if values.size == 0:
            continue

        if use_robust:
            # Robust estimate: median and MAD scaled by 1.4826
            center = np.median(values)
            spread = max(1.4826 * np.median(np.abs(values - center)), scale_floor)
        else:
            # Conventional estimate: mean and standard deviation
            center = np.mean(values)
            spread = max(np.std(values), scale_floor)

        k = np.quantile(np.abs(values - center) / spread, 1 - target_rate)

        if use_robust:
            method = f'Median±k·MAD (k={k:.3f}, both tails total {target_rate*100:.1f}%)'
        else:
            method = f'Mean±k·Std (k={k:.3f}, both tails total {target_rate*100:.1f}%)'

        result[name] = {
            'center': float(center),
            'scale': float(spread),
            'k': float(k),
            'lower': float(center - k * spread),
            'upper': float(center + k * spread),
            'method': method,
        }

    return result

def apply_clip(df, channels, thresholds):
    """Clip channels to their thresholds and report the observed clipping rate.

    Args:
        df: input DataFrame; it is copied, not modified.
        channels: column names to clip; a name is skipped when it is absent
            from ``df`` or has no entry in ``thresholds``.
        thresholds: mapping channel -> dict providing 'lower' and 'upper'.

    Returns:
        (clipped_df, stats) where stats maps each channel that has at least
        one non-NaN sample to {'outliers', 'total', 'rate'}.
    """
    result = df.copy()
    stats = {}

    for name in channels:
        if not (name in result.columns and name in thresholds):
            continue

        bounds = thresholds[name]
        lo, hi = bounds['lower'], bounds['upper']

        # Only non-NaN samples are counted and clipped; NaNs stay untouched.
        present = result[name].notna()
        n_valid = present.sum()

        if n_valid > 0:
            observed = result.loc[present, name]
            n_out = ((observed < lo) | (observed > hi)).sum()
            stats[name] = {
                'outliers': int(n_out),
                'total': int(n_valid),
                'rate': float(n_out / n_valid),
            }

        result.loc[present, name] = result.loc[present, name].clip(lo, hi)

    return result, stats

# ========== 1. Design filters ==========
# Banner: blank line, rule, title, rule.
print("\n" + "=" * 60, "1. Design filters", "=" * 60, sep="\n")

# Accelerometer: report the settings, then obtain the (b, a) coefficients.
print(
    "\nAccelerometer high-pass filter:",
    f"  Cutoff frequency: {ACC_HPF_CUTOFF_HZ} Hz",
    f"  Order: {ACC_HPF_ORDER}",
    sep="\n",
)
acc_b, acc_a = design_highpass_filter(
    ACC_HPF_CUTOFF_HZ, SAMPLING_RATE_HZ, ACC_HPF_ORDER
)

# Gyroscope: same procedure with the low-pass design.
print(
    "\nGyroscope low-pass filter:",
    f"  Cutoff frequency: {GYR_LPF_CUTOFF_HZ} Hz",
    f"  Order: {GYR_LPF_ORDER}",
    sep="\n",
)
gyr_b, gyr_a = design_lowpass_filter(
    GYR_LPF_CUTOFF_HZ, SAMPLING_RATE_HZ, GYR_LPF_ORDER
)

# ========== 2. Apply filters (by session + placement) ==========
print(
    "\n" + "=" * 60,
    "2. Apply filters (by session + placement, zero-phase)",
    "=" * 60,
    sep="\n",
)

# The accelerometer pass starts from the raw frame `df` ...
print("\nApplying accelerometer high-pass (remove gravity)...")
df_filtered = apply_filter_by_session(df, acc_channels, acc_b, acc_a)
print("✓ Done")

# ... and the gyroscope pass continues on its result.
print("\nApplying gyroscope low-pass (denoise)...")
df_filtered = apply_filter_by_session(df_filtered, gyr_channels, gyr_b, gyr_a)
print("✓ Done")

# ========== 3. Compute clipping thresholds (adaptive to target rate) ==========
print(
    "\n" + "=" * 60,
    "3. Compute adaptive clipping thresholds (target clipping rate)",
    "=" * 60,
    sep="\n",
)

print("Estimate clipping thresholds on all data")
print(f"  Target clip rate: {TARGET_CLIP_RATE*100:.1f}%")

# Thresholds are fitted once on the filtered data of every subject.
all_channels = acc_channels + gyr_channels
df_for_stats = df_filtered
clip_thresholds = compute_clip_thresholds_target(
    df_for_stats,
    all_channels,
    TARGET_CLIP_RATE,
    use_robust=True,
)

print("\nClipping thresholds (adaptive robust estimation):")
for ch, thresh in clip_thresholds.items():
    print(
        f"  {ch}:",
        f"    center: {thresh['center']:.4f}",
        f"    scale: {thresh['scale']:.4f}",
        f"    k: {thresh['k']:.3f}",
        f"    range: [{thresh['lower']:.4f}, {thresh['upper']:.4f}]",
        sep="\n",
    )

# ========== 4. Apply clipping ==========
print("\n" + "=" * 60, "4. Apply adaptive clipping", "=" * 60, sep="\n")

# Clip the same frame the thresholds were estimated on.
df_clipped, clip_stats = apply_clip(df_filtered, all_channels, clip_thresholds)

print("\nActual clipping statistics:")
for ch, stats in clip_stats.items():
    print(f"  {ch}: {stats['outliers']:,} / {stats['total']:,} ({stats['rate']*100:.2f}%)")

# ========== 5. Cast to float32 to save memory ==========
print("\n" + "=" * 60, "5. Data type optimization", "=" * 60, sep="\n")

# Down-cast whichever of the six sensor columns are present.
numeric_cols = ['ax', 'ay', 'az', 'gx', 'gy', 'gz']
for col in numeric_cols:
    if col not in df_clipped.columns:
        continue
    df_clipped[col] = df_clipped[col].astype('float32')

print("✓ Sensor columns cast to float32", "✓ time_sec kept as float64", sep="\n")

# ========== 6. Save results ==========
print("\n" + "=" * 60, "6. Save results", "=" * 60, sep="\n")

output_file = proc_dir / "filtered.parquet"

# Clear any earlier output first; it may be a partition directory or a single file.
if output_file.exists():
    import shutil
    if not output_file.is_dir():
        output_file.unlink()
    else:
        shutil.rmtree(output_file)
    print(f"Removed old data: {output_file}")

# Write a dataset partitioned by subject and placement.
df_clipped.to_parquet(
    output_file,
    engine='pyarrow',
    partition_cols=['subject_id', 'placement'],
    index=False,
)
print(f"✓ Saved: {output_file}")
print(f"  Data shape: {df_clipped.shape}")

print("\nData preview:")
print(df_clipped.head(10).to_string())

# Summary statistics over rows in which every sensor column is non-NaN.
print("\nPost-filter numeric column stats:")
valid_mask = df_clipped[numeric_cols].notna().all(axis=1)
print(df_clipped.loc[valid_mask, numeric_cols].describe().round(4))

# ========== 7. Save filter configuration ==========
# Collect every preprocessing parameter of this step in one dict and write it
# to `configs_dir` twice: as YAML and as JSON.
print("\n" + "="*60)
print("7. Save filter configuration")
print("="*60)

# Key order matters: the YAML dump below uses sort_keys=False, so the file
# follows the order of this literal.
filter_config = {
    'sampling_rate_hz': SAMPLING_RATE_HZ,

    # Physical units of the stored channels and the factors used to reach them.
    'units': {
        'accelerometer': 'm/s² (converted from g)',
        'gyroscope': 'rad/s (converted from deg/s)',
        'conversion': {
            'accelerometer_g_to_ms2': G_TO_MS2,
            'gyroscope_deg_to_rad': DEG2RAD,
        }
    },

    'dtypes': {
        'sensor_channels': 'float32',
        'time_sec': 'float64',
    },

    # Accelerometer filter: design parameters plus the exact coefficients
    # returned by design_highpass_filter, stored as plain lists.
    'accelerometer': {
        'filter_type': 'highpass',
        'purpose': 'detrend (remove gravity)',
        'method': 'Butterworth',
        'cutoff_hz': ACC_HPF_CUTOFF_HZ,
        'order': ACC_HPF_ORDER,
        'coefficients': {
            'b': acc_b.tolist(),
            'a': acc_a.tolist(),
        },
        'zero_phase': True,
    },

    # Gyroscope filter: same layout, coefficients from design_lowpass_filter.
    'gyroscope': {
        'filter_type': 'lowpass',
        'purpose': 'denoise',
        'method': 'Butterworth',
        'cutoff_hz': GYR_LPF_CUTOFF_HZ,
        'order': GYR_LPF_ORDER,
        'coefficients': {
            'b': gyr_b.tolist(),
            'a': gyr_a.tolist(),
        },
        'zero_phase': True,
    },

    # Clipping: the fitted per-channel thresholds and the clipping rates
    # actually observed when they were applied.
    'clipping': {
        'method': 'Adaptive robust estimation (Median±k·MAD, Scheme A)',
        'target_clip_rate': TARGET_CLIP_RATE,
        'estimated_on': 'all_data',
        'fold_id': None,
        'thresholds': clip_thresholds,
        'actual_clip_stats': clip_stats,
        'rationale': (
            f'Auto-adjust k so the global clipping rate reaches the target '
            f'{TARGET_CLIP_RATE*100:.1f}% over all subjects'
        ),
    },

    # Free-text notes stored alongside the parameters.
    # NOTE(review): several notes describe helpers defined elsewhere in the
    # file (filtfilt, filtfilt_nan_safe, unit conversion) — confirm they still
    # match those helpers.
    'notes': [
        'All filters use filtfilt for zero phase',
        'Filtering is grouped by session + placement, sorted by time_sec; avoid crossing session boundaries',
        'filtfilt_nan_safe filters each contiguous non-NaN run separately',
        'Accelerometer converted from g to m/s² (×9.80665)',
        'Gyroscope converted from deg/s to rad/s (×π/180)',
        f'Adaptive clipping thresholds: determine k on all data so clipping ≈ {TARGET_CLIP_RATE*100:.1f}%, then apply consistently to all data',
        'NaNs remain unchanged',
        'Sensor columns are float32; time_sec is float64',
    ]
}

# YAML copy: block style, non-ASCII characters written as-is, literal key order.
filter_config_file = configs_dir / "filter.yaml"
with open(filter_config_file, 'w', encoding='utf-8') as f:
    yaml.dump(filter_config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
print(f"✓ Saved filter configuration: {filter_config_file}")

# JSON copy of the same dict. json.dump keeps its default ensure_ascii=True,
# so non-ASCII characters are written as \uXXXX escapes.
filter_config_json = configs_dir / "filter.json"
with open(filter_config_json, 'w', encoding='utf-8') as f:
    json.dump(filter_config, f, indent=2)
print(f"✓ Saved filter configuration: {filter_config_json}")

# ========== 8. Summary ==========
# Emit the whole recap in one write; the resulting lines match the per-line prints.
print("\n".join([
    "\n" + "=" * 60,
    "Step 6 complete - Sensor preprocessing (global version)",
    "=" * 60,
    # Settings used in this run
    "\nConfig:",
    "  Units: Acc g→m/s², Gyro deg/s→rad/s",
    f"  Accelerometer: high-pass {ACC_HPF_CUTOFF_HZ} Hz (remove gravity)",
    f"  Gyroscope: low-pass {GYR_LPF_CUTOFF_HZ} Hz (denoise)",
    f"  Clipping: adaptive ±k·MAD (target {TARGET_CLIP_RATE*100:.1f}%)",
    "  Clipping thresholds estimated on: all data",
    # Artifacts written by this step
    "\nResults:",
    f"  Output file: {output_file}",
    f"  Config file: {filter_config_file}",
    f"  Data shape: {df_clipped.shape}",
    "\nFinal fixes:",
    "  ✓ Adaptive clipping thresholds (Scheme A)",
    f"  ✓ Target clipping rate {TARGET_CLIP_RATE*100:.1f}%, auto-solve k (global)",
    "  ✓ Group by placement + sort by time_sec",
    "  ✓ Write actual clipping rate into config",
    "=" * 60,
]))

Step 6: Sensor Preprocessing

Loading resampled data: data/lara/mbientlab/proc/resampled.parquet
Data shape: (560070, 13)
Number of subjects: 8
Number of sessions: 96

0. Unit normalization

Accelerometer unit conversion: g → m/s²
✓ Conversion factor: 9.80665

Gyroscope unit conversion: deg/s → rad/s
✓ Conversion factor: π/180 = 0.017453

1. Design filters

Accelerometer high-pass filter:
  Cutoff frequency: 0.3 Hz
  Order: 2

Gyroscope low-pass filter:
  Cutoff frequency: 20.0 Hz
  Order: 2

2. Apply filters (by session + placement, zero-phase)

Applying accelerometer high-pass (remove gravity)...
✓ Done

Applying gyroscope low-pass (denoise)...
✓ Done

3. Compute adaptive clipping thresholds (target clipping rate)
Estimate clipping thresholds on all data
  Target clip rate: 1.0%

Clipping thresholds (adaptive robust estimation):
  ax:
    center: 0.0152
    scale: 1.3530
    k: 6.712
    range: [-9.0660, 9.0964]
  ay:
    center: 0.0141
    scale: 1.3815
    k: 5.790
    range: [-7.9

In [3]:
import os

"""
Step 7: Coordinate/Magnitude Normalization (top-conf/journal grade)
Compute magnitude channels; z-score standardization (global statistics)
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json
import pickle

# ========== Config ==========
EPSILON = 1e-8  # added to / compared against std to avoid division by zero

print("=" * 60, "Step 7: Coordinate/Magnitude Normalization", "=" * 60, sep="\n")

# Input/output locations
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")

# Read the dataset written by the filtering step.
print(f"\nLoading filtered data: {proc_dir / 'filtered.parquet'}")
df = pd.read_parquet(proc_dir / "filtered.parquet")

print(f"Data shape: {df.shape}")
print(f"Number of subjects: {df['subject_id'].nunique()}")
# A session is identified by the (subject_id, session_id) pair.
print(f"Number of sessions: {df.groupby(['subject_id', 'session_id'], observed=True).ngroups}")

# ========== 1. Compute derived channels (magnitude) ==========
print("\n" + "=" * 60, "1. Compute derived channels (magnitude)", "=" * 60, sep="\n")

# Euclidean norm of the three accelerometer axes, stored as float32.
print("\nComputing acc_mag = sqrt(ax² + ay² + az²)...")
df['acc_mag'] = np.sqrt(
    np.square(df['ax'].values)
    + np.square(df['ay'].values)
    + np.square(df['az'].values)
).astype('float32')

# Same for the three gyroscope axes.
print("Computing gyr_mag = sqrt(gx² + gy² + gz²)...")
df['gyr_mag'] = np.sqrt(
    np.square(df['gx'].values)
    + np.square(df['gy'].values)
    + np.square(df['gz'].values)
).astype('float32')

print("✓ Added derived channels: acc_mag, gyr_mag")

# Quick sanity statistics over the non-NaN magnitudes.
print("\nDerived channel statistics (post-filter):")
for col in ['acc_mag', 'gyr_mag']:
    valid_data = df[col].dropna()
    if len(valid_data) == 0:
        continue
    print(
        f"  {col}:",
        f"    Mean: {valid_data.mean():.4f}",
        f"    Std: {valid_data.std():.4f}",
        f"    Range: [{valid_data.min():.4f}, {valid_data.max():.4f}]",
        sep="\n",
    )

# ========== 2. Determine training set (global) ==========
print("\n" + "=" * 60, "2. Determine training set (global)", "=" * 60, sep="\n")

# Global variant: statistics are estimated on every subject, so the
# "training" frame is the full frame and no test subjects are set aside.
df_train = df
train_subjects = set(df['subject_id'].unique())
test_subjects = set()

print("Compute statistics on all data (no per-fold split)")
print(f"  Samples: {len(df):,}")
print(f"  Subjects: {len(train_subjects)}")

# ========== 3. Compute z-score parameters (global) ==========
print("\n" + "=" * 60, "3. Compute z-score parameters (global)", "=" * 60, sep="\n")

# Raw axes plus the two derived magnitude channels
channels_to_normalize = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']

# channel -> {'mean', 'std'}, computed from non-NaN samples only
scaler_params = {}

print("\nz-score parameters (global):")
for ch in channels_to_normalize:
    if ch not in df_train.columns:
        continue

    valid_data = df_train[ch].dropna().values
    if len(valid_data) == 0:
        continue

    mean = float(np.mean(valid_data))
    std = float(np.std(valid_data))

    # A (near-)constant channel keeps its values by using a unit scale.
    std = 1.0 if std < EPSILON else std

    scaler_params[ch] = {'mean': mean, 'std': std}

    print(f"  {ch}:", f"    Mean: {mean:.6f}", f"    Std: {std:.6f}", sep="\n")

# ========== 4. Apply z-score standardization ==========
print("\n" + "=" * 60, "4. Apply z-score standardization", "=" * 60, sep="\n")

df_normalized = df.copy()

for ch in channels_to_normalize:
    if ch not in scaler_params:
        continue

    mean, std = scaler_params[ch]['mean'], scaler_params[ch]['std']

    # Transform the non-NaN entries only; the float32 cast keeps the
    # assignment back into the column free of dtype warnings.
    mask = df_normalized[ch].notna()
    normalized_values = (df_normalized.loc[mask, ch] - mean) / (std + EPSILON)
    normalized_values = normalized_values.astype('float32')
    df_normalized.loc[mask, ch] = normalized_values

print(f"✓ Standardized {len(scaler_params)} channels")

# Re-measure each standardized channel as a sanity check.
print("\nPost-standardization stats (global):")
for ch in channels_to_normalize:
    if ch not in scaler_params:
        continue

    valid_data = df_normalized[ch].dropna()
    if len(valid_data) == 0:
        continue

    print(
        f"  {ch}:",
        f"    Mean: {valid_data.mean():.6f} (should be near 0)",
        f"    Std: {valid_data.std():.6f} (should be near 1)",
        sep="\n",
    )

# ========== 5. Save results ==========
print("\n" + "=" * 60, "5. Save results", "=" * 60, sep="\n")

# Destination of the normalized dataset
output_file = proc_dir / "normalized.parquet"

# Start from a clean target so a re-run does not append to old partitions.
if output_file.exists():
    import shutil
    if not output_file.is_dir():
        output_file.unlink()
    else:
        shutil.rmtree(output_file)
    print(f"Removed old data: {output_file}")

df_normalized.to_parquet(
    output_file,
    engine='pyarrow',
    partition_cols=['subject_id', 'placement'],
    index=False,
)
print(f"✓ Saved: {output_file}")
print(f"  Data shape: {df_normalized.shape}")

# Preview: show only those display columns that actually exist.
print("\nData preview:")
display_cols = ['subject_id', 'session_id', 'ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag', 'label']
available_cols = [name for name in display_cols if name in df_normalized.columns]
print(df_normalized[available_cols].head(10).to_string())

# Overall statistics over rows where every normalized column is non-NaN.
print("\nPost-standardization numeric column stats (overall):")
numeric_cols = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']
valid_mask = df_normalized[numeric_cols].notna().all(axis=1)
print(df_normalized.loc[valid_mask, numeric_cols].describe().round(4))

# ========== 6. Save scaler parameters ==========
print("\n" + "=" * 60, "6. Save scaler parameters", "=" * 60, sep="\n")

# Everything needed to re-apply (or audit) the standardization later.
scaler_info = {
    'fold_id': None,             # global statistics, no fold
    'epsilon': EPSILON,
    'train_subjects': sorted(train_subjects),
    'test_subjects': None,       # no test split exists at this step
    'channels': channels_to_normalize,
    'params': scaler_params,
    'notes': [
        'z-score standardization: (x - mean) / (std + ε)',
        'Mean and std computed from all available samples (global statistics)',
        'If std < ε, set std = 1.0 to avoid divide-by-zero',
        'NaN values are excluded from stats and remain NaN after normalization',
    ]
}

# Binary copy (pickle)
scaler_file = proc_dir / "standardization.pkl"
with open(scaler_file, 'wb') as f:
    pickle.dump(scaler_info, f)
print(f"✓ Saved scaler: {scaler_file}")

# Human-readable copy (JSON) of the same dict
scaler_json = proc_dir / "standardization.json"
with open(scaler_json, 'w') as f:
    json.dump(scaler_info, f, indent=2)
print(f"✓ Saved scaler: {scaler_json}")

# ========== 7. Validate standardization ==========
print("\n" + "=" * 60, "7. Validate standardization (global)", "=" * 60, sep="\n")

# Spot-check: only the first three channels of the list are re-measured.
for ch in channels_to_normalize[:3]:
    if ch not in scaler_params:
        continue
    valid_data = df_normalized[ch].dropna()
    if len(valid_data) == 0:
        continue
    mean_check = valid_data.mean()
    std_check = valid_data.std()
    print(f"  {ch}: mean={mean_check:.6f}, std={std_check:.6f}")

# ========== 8. Summary ==========
# The recap is emitted in one write; the lines match the per-line prints.
print("\n".join([
    "\n" + "=" * 60,
    "Step 7 complete - Coordinate/Magnitude Normalization (global)",
    "=" * 60,
    "\nConfig:",
    "  Method: z-score standardization (global)",
    f"  ε (avoid divide-by-zero): {EPSILON}",
    f"  Standardized channels: {len(scaler_params)}",
    "\nResults:",
    f"  Output data: {output_file}",
    f"  Scaler (pkl): {scaler_file}",
    f"  Scaler (json): {scaler_json}",
    f"  Data shape: {df_normalized.shape}",
    "  New columns: acc_mag, gyr_mag",
    "\nRigor guarantees:",
    "  1. ✓ Mean/std computed once on all data (global stats)",
    "  2. ✓ NaNs remain unchanged",
    f"  3. ✓ ε={EPSILON} prevents divide-by-zero",
    "  4. ✓ Derived channels acc_mag, gyr_mag",
    "=" * 60,
]))

Step 7: Coordinate/Magnitude Normalization

Loading filtered data: data/lara/mbientlab/proc/filtered.parquet
Data shape: (560070, 13)
Number of subjects: 8
Number of sessions: 96

1. Compute derived channels (magnitude)

Computing acc_mag = sqrt(ax² + ay² + az²)...
Computing gyr_mag = sqrt(gx² + gy² + gz²)...
✓ Added derived channels: acc_mag, gyr_mag

Derived channel statistics (post-filter):
  acc_mag:
    Mean: 2.9151
    Std: 2.3751
    Range: [0.0073, 14.5074]
  gyr_mag:
    Mean: 1.3122
    Std: 1.1772
    Range: [0.0015, 7.0648]

2. Determine training set (global)
Compute statistics on all data (no per-fold split)
  Samples: 560,070
  Subjects: 8

3. Compute z-score parameters (global)

z-score parameters (global):
  ax:
    Mean: -0.008808
    Std: 2.274599
  ay:
    Mean: 0.006988
    Std: 2.146420
  az:
    Mean: 0.001384
    Std: 2.087749
  gx:
    Mean: -0.002287
    Std: 0.886048
  gy:
    Mean: 0.010105
    Std: 1.160388
  gz:
    Mean: 0.023344
    Std: 0.987726
  acc_ma

In [4]:
#!/usr/bin/env python3

"""
Step 8: Label Alignment & Cleaning (top-conf/journal grade - revised)
Clean NULL/transition, unify to a standard label set, and record mappings
"""

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import json
from collections import Counter

# ========== Config ==========

# How missing and transition labels are handled further below
NULL_STRATEGY = "remove"  # "remove" or "merge_to_transition"
TRANSITION_STRATEGY = "merge_to_nearest"  # "remove" or "merge_to_nearest"

# Largest tolerated share of samples with an unmapped label (the run aborts above it)
UNMAPPED_THRESHOLD = 0.01  # 1%

print("=" * 60, "Step 8: Label Alignment & Cleaning", "=" * 60, sep="\n")

# Locations; the reports directory is created when missing
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")
reports_dir = Path("reports")
reports_dir.mkdir(parents=True, exist_ok=True)

# Read the dataset written by the normalization step.
print(f"\nLoading normalized data: {proc_dir / 'normalized.parquet'}")
df = pd.read_parquet(proc_dir / "normalized.parquet")

print(f"Data shape: {df.shape}")
print(f"Number of subjects: {df['subject_id'].nunique()}")

# ========== 1. Analyze original label distribution ==========

print("\n" + "=" * 60, "1. Analyze original label distribution", "=" * 60, sep="\n")

# Frequencies of every label value, missing labels included
label_counts = df['label'].value_counts(dropna=False)
total_samples = len(df)
null_count = df['label'].isna().sum()

print("\nOriginal label stats:")
print(f"  Total samples: {total_samples:,}")
print(f"  NULL samples: {null_count:,} ({null_count/total_samples*100:.2f}%)")
print(f"  Number of label classes: {df['label'].nunique(dropna=True)}")

# The 20 most frequent label values with their share of all samples
print("\nLabel distribution (top 20):")
for label, count in label_counts.head(20).items():
    pct = count / total_samples * 100
    print(f"  {str(label):30s}: {count:8,} ({pct:5.2f}%)")

# ========== 2. Define label mapping rules ==========
# Two lookup tables drive the rest of this step:
#   LABEL_MAPPING  : original label ID -> original name, unified name, category
#   UNIFIED_LABELS : unified name      -> unified integer ID
# An original ID is translated by following 'mapped' into UNIFIED_LABELS.

print("\n" + "="*60)
print("2. Define label mapping rules")
print("="*60)

# Map LARa dataset labels to a cross-dataset unified label superset
# Covers LARa / RealWorld / SHL
# NOTE(review): the original IDs and names below are taken as given; verify
# them against the dataset's own label documentation.
LABEL_MAPPING = {
    # Basic activities (shared by RealWorld + LARa)
    1: {"original": "walking", "mapped": "walking", "category": "locomotion"},
    2: {"original": "running", "mapped": "running", "category": "locomotion"},
    3: {"original": "shuffling", "mapped": "walking", "category": "locomotion"},  # merge into walking
    4: {"original": "stairs (ascending)", "mapped": "upstairs", "category": "locomotion"},
    5: {"original": "stairs (descending)", "mapped": "downstairs", "category": "locomotion"},
    6: {"original": "standing", "mapped": "standing", "category": "static"},
    7: {"original": "sitting", "mapped": "sitting", "category": "static"},
    8: {"original": "lying", "mapped": "lying", "category": "static"},

    # Transport (specific to LARa; not in RealWorld)
    # IDs 13, 14 and 130 all collapse into the single unified "cycling" label.
    13: {"original": "cycling (sit)", "mapped": "cycling", "category": "transport"},
    14: {"original": "cycling (stand)", "mapped": "cycling", "category": "transport"},
    130: {"original": "cycling", "mapped": "cycling", "category": "transport"},

    17: {"original": "car", "mapped": "car", "category": "transport"},
    18: {"original": "bus", "mapped": "bus", "category": "transport"},
    19: {"original": "train", "mapped": "train", "category": "transport"},
    20: {"original": "subway", "mapped": "subway", "category": "transport"},

    # Transition label
    0: {"original": "transition", "mapped": "transition", "category": "transition"},
}

# Cross-dataset unified label superset (LARa + RealWorld + SHL)
# Every 'mapped' name used in LABEL_MAPPING has an entry here.
UNIFIED_LABELS = {
    "walking": 1,
    "running": 2,
    "sitting": 3,
    "standing": 4,
    "upstairs": 5,
    "downstairs": 6,
    "lying": 7,
    "cycling": 8,
    "car": 9,
    "bus": 10,
    "train": 11,
    "subway": 12,
    "transition": 0,  # kept or cleaned
}

print(f"\nDefined mapping rules: {len(LABEL_MAPPING)} original labels")
print(f"Unified label set: {len(UNIFIED_LABELS)} labels (cross-dataset superset)")

# Show the first ten rules (dict insertion order) as a quick visual check.
print(f"\nMapping examples:")
for orig_id, info in list(LABEL_MAPPING.items())[:10]:
    print(f"  {orig_id} ({info['original']}) -> {info['mapped']}")

# ========== 3. Audit assertion: check unmapped labels ==========

print("\n" + "=" * 60, "3. Audit assertion: check unmapped labels", "=" * 60, sep="\n")

# Original label IDs present in the data (NULL excluded) that have no mapping rule
orig_ids = set(df['label'].dropna().astype(int).unique())
covered_ids = set(LABEL_MAPPING.keys())
unmapped_ids = sorted(orig_ids.difference(covered_ids))

if not unmapped_ids:
    print("\n✓ All original labels are covered")
else:
    # One report row per unmapped ID with its sample count and share
    unmapped_counts = []
    for uid in unmapped_ids:
        count = (df['label'] == uid).sum()
        pct = count / total_samples
        unmapped_counts.append({
            'original_label_id': uid,
            'sample_count': count,
            'percentage': round(pct * 100, 4),
        })

    df_unmapped = pd.DataFrame(unmapped_counts)
    total_unmapped = df_unmapped['sample_count'].sum()
    unmapped_ratio = total_unmapped / total_samples

    # Persist the list so the mapping table can be extended later
    unmapped_file = reports_dir / "unmapped_labels.csv"
    df_unmapped.to_csv(unmapped_file, index=False)

    print(f"\n⚠️ Found unmapped labels: {len(unmapped_ids)}")
    print(f"  Unmapped sample count: {total_unmapped:,} ({unmapped_ratio*100:.2f}%)")
    print(f"  Details saved to: {unmapped_file}")
    print("\nList of unmapped labels:")
    print(df_unmapped.to_string(index=False))

    # Too many unmapped samples is treated as a hard error
    if unmapped_ratio > UNMAPPED_THRESHOLD:
        raise RuntimeError(
            f"Unmapped label ratio {unmapped_ratio*100:.2f}% exceeds threshold {UNMAPPED_THRESHOLD*100}%. "
            f"Please check {unmapped_file} and extend LABEL_MAPPING."
        )

    print(f"\n✓ Unmapped label ratio does not exceed threshold {UNMAPPED_THRESHOLD*100}%; continuing (will mark as NULL)")

# ========== 4. Apply label mapping ==========

print("\n" + "=" * 60, "4. Apply label mapping", "=" * 60, sep="\n")

df_mapped = df.copy()

# Preserve the untouched source labels in a nullable-integer column
df_mapped['label_original'] = df_mapped['label'].astype('Int32')


def map_label(label):
    """Translate one original label ID to its unified ID; NaN when missing or unmapped."""
    if pd.isna(label):
        return np.nan

    rule = LABEL_MAPPING.get(int(label))
    if rule is None:
        # No rule for this ID: treated like a missing label
        return np.nan

    return UNIFIED_LABELS[rule['mapped']]


df_mapped['label'] = df_mapped['label_original'].apply(map_label)

# Frequencies after mapping, NULL included
mapped_label_counts = df_mapped['label'].value_counts(dropna=False)
null_after_mapping = df_mapped['label'].isna().sum()

print("\nPost-mapping label stats:")
print(f"  NULL samples: {null_after_mapping:,} ({null_after_mapping/total_samples*100:.2f}%)")
print(f"  Number of valid label classes: {df_mapped['label'].nunique(dropna=True)}")

print("\nPost-mapping distribution:")
for label, count in mapped_label_counts.head(15).items():
    pct = count / total_samples * 100
    # Reverse lookup: unified ID -> unified name ("NULL" for missing)
    if pd.isna(label):
        label_name = "NULL"
    else:
        label_name = [k for k, v in UNIFIED_LABELS.items() if v == int(label)][0]
    print(f"  {label_name:15s} ({str(label):2s}): {count:8,} ({pct:5.2f}%)")

# ========== 5. Clean NULL and transition labels (true nearest neighbor) ==========

print("\n" + "="*60)
print("5. Clean NULL and transition labels (true nearest neighbor)")
print("="*60)


def _fill_nearest_valid(values, coords):
    """Fill NaN entries of `values` with the value of the closest non-NaN entry.

    Args:
        values: 1-D float array; NaN marks entries to fill.
        coords: 1-D float array of the same length, sorted ascending; distance
            between samples is measured on this axis (e.g. time).

    Returns:
        A new array with every NaN replaced; ties go to the earlier sample.
        `values` is returned unchanged when it has no NaN or no valid entry.
    """
    valid = ~np.isnan(values)
    if valid.all() or not valid.any():
        return values

    anchor_pos = coords[valid]
    anchor_val = values[valid]

    # Index of the first anchor at/after each sample, and of the one before it.
    # Clipping makes samples before the first / after the last anchor fall back
    # to that edge anchor, so leading and trailing gaps are filled too.
    right = np.clip(np.searchsorted(anchor_pos, coords, side='left'), 0, anchor_pos.size - 1)
    left = np.clip(right - 1, 0, None)

    take_left = np.abs(coords - anchor_pos[left]) <= np.abs(anchor_pos[right] - coords)
    nearest = np.where(take_left, anchor_val[left], anchor_val[right])
    return np.where(valid, values, nearest)


df_cleaned = df_mapped.copy()

# Handle NULL labels
if NULL_STRATEGY == "remove":
    null_mask = df_cleaned['label'].isna()
    removed_null = null_mask.sum()
    df_cleaned = df_cleaned[~null_mask].copy()
    print(f"\nNULL handling: removed {removed_null:,} samples")
elif NULL_STRATEGY == "merge_to_transition":
    null_mask = df_cleaned['label'].isna()
    df_cleaned.loc[null_mask, 'label'] = UNIFIED_LABELS['transition']
    print(f"\nNULL handling: merged into transition ({null_mask.sum():,} samples)")

# Detect time column (first matching candidate wins)
time_col = None
for candidate in ['time_sec', 'timestamp', 'timestamp_ms', 'time', 'epoch_ms']:
    if candidate in df_cleaned.columns:
        time_col = candidate
        break

if time_col:
    print(f"\nDetected time column: {time_col}")
else:
    print(f"\nNo time column detected; will process by index order")

# Handle transition label (true nearest neighbor)
transition_value = UNIFIED_LABELS['transition']
if TRANSITION_STRATEGY == "remove":
    trans_mask = df_cleaned['label'] == transition_value
    removed_trans = trans_mask.sum()
    df_cleaned = df_cleaned[~trans_mask].copy()
    print(f"Transition handling: removed {removed_trans:,} samples")

elif TRANSITION_STRATEGY == "merge_to_nearest":
    trans_mask = df_cleaned['label'] == transition_value
    trans_count = trans_mask.sum()

    if trans_count > 0:
        print(f"Transition handling: merge {trans_count:,} samples using nearest-neighbor interpolation")

        # Sort by time (ensure nearest-neighbor semantics). The index is reset
        # so that row labels equal row positions; it is dropped at the end of
        # this section anyway.
        if time_col:
            df_cleaned = df_cleaned.sort_values(
                ['subject_id', 'session_id', 'placement', time_col],
                kind='stable'
            ).reset_index(drop=True)
            print(f"  ✓ Sorted by [{time_col}]")
        else:
            df_cleaned = df_cleaned.sort_index(kind='stable').reset_index(drop=True)
            print(f"  ⚠️ Sorted by index (no time column)")

        # Distance is measured on the time column when it is numeric; otherwise
        # on the row position inside the (sorted) group.
        # Series.interpolate(method='nearest') is deliberately not used: it
        # measures distance on the index values and leaves leading/trailing
        # gaps unfilled, so edge transitions were silently dropped.
        use_time_axis = bool(time_col) and pd.api.types.is_numeric_dtype(df_cleaned[time_col])

        new_labels = df_cleaned['label'].astype('float64').to_numpy(copy=True)
        merged_count = 0
        for _, group in df_cleaned.groupby(
            ['subject_id', 'session_id', 'placement'], observed=True
        ):
            rows = group.index.to_numpy()  # positions, thanks to the reset index
            values = new_labels[rows]      # fancy indexing returns a copy

            # Replace transition with NaN
            was_trans = values == transition_value
            values[was_trans] = np.nan

            gaps = np.isnan(values)
            if not gaps.any() or gaps.all():
                # Nothing to fill, or the whole group has no real label to
                # borrow from; such groups keep their transition labels and
                # are removed (and reported) below.
                continue

            if use_time_axis:
                coords = group[time_col].to_numpy(dtype='float64')
            else:
                coords = np.arange(len(group), dtype='float64')

            new_labels[rows] = _fill_nearest_valid(values, coords)
            merged_count += int(was_trans.sum())

        df_cleaned['label'] = new_labels

        print(f"  ✓ Successfully merged {merged_count:,} transition samples to nearest labels")

        # Remove transitions that could not be merged (entire segments are transition)
        remaining_trans = (df_cleaned['label'] == transition_value).sum()
        if remaining_trans > 0:
            df_cleaned = df_cleaned[df_cleaned['label'] != transition_value].copy()
            print(f"  ✓ Removed remaining {remaining_trans:,} transition samples that could not be merged")

# Remove remaining NaNs
final_nan = df_cleaned['label'].isna().sum()
if final_nan > 0:
    df_cleaned = df_cleaned[df_cleaned['label'].notna()].copy()
    print(f"\nRemoved final residual NaN samples: {final_nan:,}")

# Cast to int32
df_cleaned['label'] = df_cleaned['label'].astype('int32')

# Reset index
df_cleaned = df_cleaned.reset_index(drop=True)

print(f"\nData after cleaning:")
print(f"  Samples: {len(df_cleaned):,}")
print(f"  Number of label classes: {df_cleaned['label'].nunique()}")
print(f"  Retention rate: {len(df_cleaned)/total_samples*100:.2f}%")

# ========== 6. Audit assertion: verify final label set ==========

print("\n" + "=" * 60, "6. Audit assertion: verify final label set", "=" * 60, sep="\n")

# Every unified ID is allowed, except the transition ID when transitions
# were configured to be removed.
allowed_labels = set(UNIFIED_LABELS.values())
if TRANSITION_STRATEGY == "remove":
    allowed_labels.discard(UNIFIED_LABELS['transition'])

# Compare against the IDs actually present after cleaning
actual_labels = set(df_cleaned['label'].unique())
unexpected = sorted(actual_labels.difference(allowed_labels))

if unexpected:
    raise RuntimeError(
        f"Illegal labels found after cleaning: {unexpected}\n"
        f"Allowed labels: {sorted(allowed_labels)}"
    )

print("✓ Final label set validation passed")
print(f"  Allowed labels: {sorted(allowed_labels)}")
print(f"  Actual labels: {sorted(actual_labels)}")

# ========== 7. Final label distribution ==========

print("\n" + "=" * 60, "7. Final label distribution", "=" * 60, sep="\n")

final_label_counts = df_cleaned['label'].value_counts()

# Per-label counts with the unified name resolved by reverse lookup
print("\nFinal label distribution:")
for label_id, count in final_label_counts.items():
    pct = count / len(df_cleaned) * 100
    label_name = [k for k, v in UNIFIED_LABELS.items() if v == int(label_id)][0]
    print(f"  {label_name:15s} ({int(label_id):2d}): {count:8,} ({pct:5.2f}%)")

# Aggregate the same counts by category. The category of a unified label is
# taken from the first mapping rule that targets it.
category_stats = {}
for label_id, count in final_label_counts.items():
    label_name = [k for k, v in UNIFIED_LABELS.items() if v == int(label_id)][0]
    category = next(
        (info['category'] for info in LABEL_MAPPING.values() if info['mapped'] == label_name),
        None,
    )

    if category:
        category_stats[category] = category_stats.get(category, 0) + count

print("\nBy-category statistics:")
for category, count in sorted(category_stats.items()):
    pct = count / len(df_cleaned) * 100
    print(f"  {category:15s}: {count:8,} ({pct:5.2f}%)")

# ========== 8. Save results ==========

print("\n" + "="*60)
print("8. Save results")
print("="*60)

# Save cleaned data (using directory layout)
output_dir = proc_dir / "labeled"

# Clear any previous output first. Like the other save steps in this file,
# cope with the target being a plain file as well as a partition directory
# (shutil.rmtree alone fails on a regular file).
if output_dir.exists():
    import shutil
    if output_dir.is_dir():
        shutil.rmtree(output_dir)
    else:
        output_dir.unlink()

# Write a parquet dataset partitioned by subject and placement.
df_cleaned.to_parquet(
    output_dir,
    index=False,
    partition_cols=['subject_id', 'placement'],
    engine='pyarrow'
)

print(f"✓ Saved: {output_dir}/")
print(f"  Data shape: {df_cleaned.shape}")
print(f"  Partitions: subject_id / placement")

# ========== 9. Save label mapping config (rich) ==========

print("\n" + "=" * 60, "9. Save label mapping config (rich)", "=" * 60, sep="\n")

# One row per unified label, ordered by unified ID
labels_map_data = []
for label_name, label_id in sorted(UNIFIED_LABELS.items(), key=lambda item: item[1]):
    # A removed transition label does not appear in the table
    if TRANSITION_STRATEGY == "remove" and label_name == "transition":
        continue

    # Gather every original ID/name mapped onto this unified label; the
    # category comes from the first matching rule.
    original_ids = []
    original_names = []
    category = None

    for orig_id, info in LABEL_MAPPING.items():
        if info['mapped'] != label_name:
            continue
        original_ids.append(str(orig_id))
        original_names.append(info['original'])
        if category is None:
            category = info['category']

    # Samples carrying this label after cleaning (0 when absent)
    sample_count = final_label_counts.get(label_id, 0)

    labels_map_data.append({
        'label_id': label_id,
        'label_name': label_name,
        'category': category or 'unknown',
        'sample_count': int(sample_count),
        'percentage': round(sample_count / len(df_cleaned) * 100, 2) if len(df_cleaned) > 0 else 0.0,
        # join() on an empty list yields '' already
        'original_label_ids': ','.join(original_ids),
        'original_label_names': '; '.join(original_names),
        'source_dataset': 'LARa-MbientLab',
        'description': f"{label_name} activity",
    })

df_labels_map = pd.DataFrame(labels_map_data)
labels_map_file = proc_dir / "labels_map.csv"
df_labels_map.to_csv(labels_map_file, index=False)

print(f"✓ Saved label mapping: {labels_map_file}")
print("\nLabel mapping table:")
print(df_labels_map.to_string(index=False))

# Save detailed configuration
# The same dictionary is written twice: configs/labels.yaml and configs/labels.json.

# Nearest-neighbor interpolation only happens under the merge strategy, so the
# free-text note must not claim it for any other strategy.
uses_nearest_merge = TRANSITION_STRATEGY == 'merge_to_nearest'
transition_note = f'Transition label strategy: {TRANSITION_STRATEGY}'
if uses_nearest_merge:
    transition_note += ' (true nearest-neighbor interpolation)'

removed_samples = int(total_samples - len(df_cleaned))

label_config = {
    'dataset': 'LARa-MbientLab',
    'label_system': 'Cross-dataset unified label superset (covers LARa/RealWorld/SHL)',
    'unified_labels': UNIFIED_LABELS,
    'label_mapping': LABEL_MAPPING,
    'cleaning_strategy': {
        'null_strategy': NULL_STRATEGY,
        'transition_strategy': TRANSITION_STRATEGY,
        'transition_method': 'nearest-neighbor interpolation (true nearest neighbor)' if uses_nearest_merge else 'remove',
        'time_sorted': time_col is not None,
        'time_column': time_col,
        'unmapped_threshold': UNMAPPED_THRESHOLD,
    },
    'statistics': {
        'original_samples': int(total_samples),
        'cleaned_samples': int(len(df_cleaned)),
        'removed_samples': removed_samples,
        # Guard against an empty input so the config can still be written.
        'removal_rate': float(removed_samples / total_samples) if total_samples else 0.0,
        'original_label_count': int(df['label'].nunique(dropna=True)),
        'final_label_count': int(df_cleaned['label'].nunique()),
        'unmapped_label_count': len(unmapped_ids) if unmapped_ids else 0,
    },
    # Per-label sample counts; a removed transition label is left out.
    'label_distribution': {
        label_name: int(final_label_counts.get(label_id, 0))
        for label_name, label_id in UNIFIED_LABELS.items()
        if label_name != 'transition' or TRANSITION_STRATEGY != 'remove'
    },
    'notes': [
        'Label mapping based on cross-dataset unified label superset (LARa + RealWorld + SHL)',
        f'NULL label strategy: {NULL_STRATEGY}',
        transition_note,
        'Unmapped original labels are automatically marked as NULL',
        f'Unmapped label threshold: {UNMAPPED_THRESHOLD*100}%',
        f'Sorted by time column: {time_col if time_col else "No (by index)"}',
        'Mapping table saved at proc/labels_map.csv',
        'label_original column uses nullable integer Int32',
        'Includes audit assertions to ensure label set integrity',
    ]
}

label_config_file = configs_dir / "labels.yaml"
with open(label_config_file, 'w', encoding='utf-8') as f:
    # sort_keys=False keeps the sections in the order declared above.
    yaml.dump(label_config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"✓ Saved config: {label_config_file}")

label_config_json = configs_dir / "labels.json"
with open(label_config_json, 'w', encoding='utf-8') as f:
    json.dump(label_config, f, indent=2)

print(f"✓ Saved config: {label_config_json}")

# ========== 10. Summary ==========
# Human-readable recap of the Step 8 configuration, results and outputs.

print("\n" + "="*60)
print("Step 8 complete - Label alignment & cleaning (top-tier revised)")
print("="*60)

# Only the merge strategy uses nearest-neighbor interpolation; do not
# advertise it for any other transition strategy.
if TRANSITION_STRATEGY == 'merge_to_nearest':
    transition_summary = f"{TRANSITION_STRATEGY} (true nearest neighbor)"
else:
    transition_summary = f"{TRANSITION_STRATEGY}"

# Avoid ZeroDivisionError in the recap when the input was empty.
retention_pct = len(df_cleaned) / total_samples * 100 if total_samples else 0.0

print(f"\nConfig:")
print(f"  Label system: cross-dataset unified superset (LARa/RealWorld/SHL)")
print(f"  NULL strategy: {NULL_STRATEGY}")
print(f"  Transition strategy: {transition_summary}")
print(f"  Unmapped threshold: {UNMAPPED_THRESHOLD*100}%")
print(f"  Time column: {time_col if time_col else 'No (by index)'}")

print(f"\nResults:")
print(f"  Original samples: {total_samples:,}")
print(f"  Cleaned samples: {len(df_cleaned):,}")
print(f"  Removed samples: {total_samples - len(df_cleaned):,}")
print(f"  Retention rate: {retention_pct:.2f}%")

print(f"\nLabel stats:")
print(f"  Original label classes: {df['label'].nunique(dropna=True)}")
print(f"  Final label classes: {df_cleaned['label'].nunique()}")
print(f"  Unmapped labels: {len(unmapped_ids) if unmapped_ids else 0}")

print(f"\nOutputs:")
print(f"  Data: {output_dir}/")
print(f"  Mapping table: {labels_map_file}")
print(f"  Config: {label_config_file}")
if unmapped_ids:
    print(f"  Unmapped list: {reports_dir / 'unmapped_labels.csv'}")

print("\nKey fixes (top-tier):")
print("  1. ✓ True nearest-neighbor merge (interpolate method='nearest')")
print("  2. ✓ Sort by time before processing (correct semantics)")
print("  3. ✓ Record unmapped labels to reports/unmapped_labels.csv")
print("  4. ✓ label_original uses nullable Int32")
print("  5. ✓ Removed irrelevant MAJORITY_VOTE_THRESHOLD")
print("  6. ✓ Audit assertions (fail-fast)")
print("  7. ✓ labels_map.csv includes original names and source")
print("  8. ✓ Label system described as cross-dataset superset")
print("  9. ✓ Output directory changed to labeled/")
print("="*60)

Step 8: Label Alignment & Cleaning

Loading normalized data: data/lara/mbientlab/proc/normalized.parquet
Data shape: (560070, 15)
Number of subjects: 8

1. Analyze original label distribution

Original label stats:
  Total samples: 560,070
  NULL samples: 28 (0.00%)
  Number of label classes: 8

Label distribution (top 20):
  4.0                           :  215,988 (38.56%)
  2.0                           :   86,132 (15.38%)
  0.0                           :   73,180 (13.07%)
  7.0                           :   48,927 ( 8.74%)
  5.0                           :   43,966 ( 7.85%)
  1.0                           :   42,039 ( 7.51%)
  3.0                           :   38,224 ( 6.82%)
  6.0                           :   11,586 ( 2.07%)
  nan                           :       28 ( 0.00%)

2. Define label mapping rules

Defined mapping rules: 16 original labels
Unified label set: 13 labels (cross-dataset superset)

Mapping examples:
  1 (walking) -> walking
  2 (running) -> running
  3 (shuf

In [5]:
import os

"""
Step 9: Sliding-window Slicing (top-conf/journal grade - multi-fold version)
Slice with fixed window length/step; assign window label by majority label
For each fold in configs/splits.json, generate windows/{fold_xx}/X_train.npy, X_test.npy, etc.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import yaml
import json
from collections import Counter

# ========== Config ==========

# Sliding-window parameters
SAMPLING_RATE_HZ = 50.0
WINDOW_SIZE_SEC = 3.0
OVERLAP_RATIO = 0.5

# Compute sample counts.
# round() instead of a bare int(): float products such as 100 * (1 - 0.9)
# evaluate to 9.999999999999998, which int() would silently truncate to 9.
WINDOW_SIZE = int(round(WINDOW_SIZE_SEC * SAMPLING_RATE_HZ))        # 150 samples
STEP_SIZE = int(round(WINDOW_SIZE * (1 - OVERLAP_RATIO)))           # 75 samples

# A step below 1 sample cannot advance the window; fail here with a clear
# message instead of later inside range().
if WINDOW_SIZE < 1 or STEP_SIZE < 1:
    raise ValueError(
        f"Invalid window configuration: WINDOW_SIZE={WINDOW_SIZE}, STEP_SIZE={STEP_SIZE} "
        f"(check WINDOW_SIZE_SEC, SAMPLING_RATE_HZ and OVERLAP_RATIO)"
    )

# Majority label threshold: minimum share of a window's samples that must
# carry the dominant label for the window to be kept.
DOMINANT_THRESHOLD = 0.8

# Feature columns (8 channels)
FEATURE_COLS = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']

print("="*60)
print("Step 9: Sliding-window slicing (multi-fold)")
print("="*60)

# Base directories: processed data, configs, and the root under which one
# sub-directory per fold is created later.
proc_dir = Path("data") / "lara" / "mbientlab" / "proc"
configs_dir = Path("configs")
windows_root = proc_dir / "windows"
windows_root.mkdir(parents=True, exist_ok=True)

# Echo the effective windowing parameters.
print("\n".join([
    "\nSliding-window parameters:",
    f"  Window length: {WINDOW_SIZE_SEC} s = {WINDOW_SIZE} samples @ {SAMPLING_RATE_HZ} Hz",
    f"  Step size: {STEP_SIZE} samples (overlap {OVERLAP_RATIO*100:.0f}%)",
    f"  Dominant label threshold: {DOMINANT_THRESHOLD*100:.0f}%",
    f"  Feature columns: {FEATURE_COLS}",
]))

# ========== 1. Load data ==========
# Reads the cleaned, labeled dataset and validates its schema before any
# column is used.

print("\n" + "="*60)
print("1. Load cleaned & labeled data")
print("="*60)

labeled_dir = proc_dir / "labeled"
print(f"Loading data from: {labeled_dir}/")
df = pd.read_parquet(labeled_dir)

print(f"Data shape: {df.shape}")

# Check required columns first: accessing df['subject_id'] / df['label']
# before this check would surface a missing column as a bare KeyError
# instead of the descriptive ValueError below.
required_cols = ['subject_id', 'session_id', 'placement', 'label'] + FEATURE_COLS
missing_cols = [c for c in required_cols if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing required columns: {missing_cols}")

print(f"Number of subjects: {df['subject_id'].nunique()}")
print(f"Number of label classes: {df['label'].nunique()}")

# Detect time column; None means later steps order samples by index.
time_col = 'time_sec' if 'time_sec' in df.columns else None
if time_col:
    print(f"Time column: {time_col}")
else:
    print("No time column detected; will sort by index")

# ========== 2. Load splits and determine folds ==========

print(f"\n{'=' * 60}")
print("2. Load train/test splits (folds)")
print('=' * 60)

splits_path = configs_dir / "splits.json"
splits = None

if not splits_path.exists():
    # No split definition available: process everything as one pseudo-fold,
    # represented by a single None fold id.
    print("⚠️ splits.json not found; will treat all data as a single 'all' fold")
    fold_ids = [None]
else:
    with open(splits_path, "r") as fh:
        splits = json.load(fh)

    # Keys are expected to be stringified integers ("0", "1", ...); convert
    # them so the folds are processed in numeric order.
    fold_ids = sorted(map(int, splits.keys()))
    print(f"Detected {len(fold_ids)} folds from {splits_path}: {fold_ids}")

# ========== 3. Sliding-window function (with time continuity check) ==========

def sliding_window_extract(df_subset, window_size, step_size, dominant_threshold, time_col=None,
                           feature_cols=None, sampling_rate_hz=None):
    """
    Perform sliding-window slicing grouped by (subject, session, placement).

    Each group is ordered by `time_col` (or by index when no usable time
    column is given) and cut into windows of `window_size` samples advanced
    by `step_size` samples. A window is dropped when it contains a NaN
    feature, when its time span deviates by more than 10% from the nominal
    span, or when its most frequent label covers less than
    `dominant_threshold` of the window.

    Args:
        df_subset: DataFrame with 'subject_id', 'session_id', 'placement',
            'label' and the feature columns.
        window_size: window length in samples (>= 1).
        step_size: hop between consecutive window starts in samples (>= 1).
        dominant_threshold: minimum fraction of samples carrying the
            dominant label for a window to be kept.
        time_col: optional name of the timestamp column; ignored when falsy
            or absent from `df_subset`.
        feature_cols: feature column names; defaults to the module-level
            FEATURE_COLS.
        sampling_rate_hz: nominal sampling rate used by the continuity
            check; defaults to the module-level SAMPLING_RATE_HZ.

    Returns:
        windows_list: list of window feature arrays, each of shape
            (window_size, len(feature_cols))
        metadata_list: list of window metadata dicts, aligned with
            windows_list

    Raises:
        ValueError: if `window_size` or `step_size` is smaller than 1.
    """
    if window_size < 1 or step_size < 1:
        raise ValueError(
            f"window_size and step_size must be >= 1 (got {window_size} and {step_size})"
        )

    # Fall back to the notebook-level configuration when not given explicitly.
    if feature_cols is None:
        feature_cols = FEATURE_COLS
    if sampling_rate_hz is None:
        sampling_rate_hz = SAMPLING_RATE_HZ
    feature_cols = list(feature_cols)

    use_time = bool(time_col) and time_col in df_subset.columns

    # Loop-invariant bounds of the time continuity check (10% jitter allowed).
    expected_duration = (window_size - 1) / sampling_rate_hz
    max_deviation = 0.1 * expected_duration

    windows_list = []
    metadata_list = []

    # Group by session + placement so no window spans two recordings.
    grouped = df_subset.groupby(['subject_id', 'session_id', 'placement'], observed=True)
    for (subj, sess, plc), group in grouped:
        # Sort by time column (preferred), otherwise by index. The sort
        # already returns a new frame, so no extra copy is needed.
        if use_time:
            group = group.sort_values(time_col, kind='stable')
            timestamps = group[time_col].to_numpy()
        else:
            group = group.sort_index(kind='stable')
            timestamps = None

        features = group[feature_cols].to_numpy()
        labels = group['label'].to_numpy()

        for start_idx in range(0, len(group) - window_size + 1, step_size):
            end_idx = start_idx + window_size
            window_features = features[start_idx:end_idx]

            # Discard windows containing NaN
            if np.isnan(window_features).any():
                continue

            if timestamps is not None:
                time_start = timestamps[start_idx]
                time_end = timestamps[end_idx - 1]
                # Time continuity check on the first-to-last sample span
                if abs((time_end - time_start) - expected_duration) > max_deviation:
                    continue
                time_range = f"{time_start:.3f}-{time_end:.3f}"
            else:
                time_range = f"{start_idx}-{end_idx-1}"

            # Dominant label; on ties Counter keeps the label seen first.
            label_counts = Counter(labels[start_idx:end_idx])
            dominant_label, dominant_count = label_counts.most_common(1)[0]
            dominant_ratio = dominant_count / window_size

            # Keep only windows that meet the purity threshold
            if dominant_ratio < dominant_threshold:
                continue

            windows_list.append(window_features)
            metadata_list.append({
                # Running id over all kept windows of this call
                'window_id': len(metadata_list),
                'subject_id': subj,
                'session_id': sess,
                'placement': plc,
                'label': int(dominant_label),
                'label_purity': round(dominant_ratio, 4),
                'time_range': time_range,
                'start_idx': start_idx,
                'end_idx': end_idx,
            })

    return windows_list, metadata_list

# ========== 4. Loop over folds and extract windows ==========
# For every fold: select the train/test rows by subject, run the sliding-window
# extraction on each side, write the resulting arrays/metadata into
# windows_root/<fold_tag>/ and record summary statistics in fold_stats.
#
# NOTE(review): files are only written when a split yields at least one
# window; nothing here deletes outputs left in the fold directory by an
# earlier run, so stale X_*/y_* files can survive a re-run that produces
# no windows for that split.

print("\n" + "="*60)
print("3. Extract windows for each fold")
print("="*60)

fold_stats = {}  # for global config (per-fold statistics), keyed by "all" or str(fold_id)

for fold_id in fold_ids:
    if fold_id is None:
        # Pseudo-fold used when no splits.json exists: everything is "train",
        # the test side is an empty frame.
        fold_tag = "all"
        print(f"\n--- Processing pseudo-fold: {fold_tag} (all data) ---")

        train_subjects = set(df['subject_id'].unique())
        test_subjects = set()
        df_train = df.copy()
        df_test = pd.DataFrame()
    else:
        fold_tag = f"fold_{fold_id:02d}"
        print(f"\n--- Processing fold {fold_id} ({fold_tag}) ---")

        # splits is keyed by the stringified fold id
        fold_cfg = splits[str(fold_id)]
        train_subjects = set(fold_cfg["train_subjects"])
        test_subjects = set(fold_cfg["test_subjects"])

        # Split data by subject membership
        df_train = df[df['subject_id'].isin(train_subjects)].copy()
        df_test = df[df['subject_id'].isin(test_subjects)].copy()

        print(f"  Train subjects: {len(train_subjects)}")
        print(f"  Test subjects:  {len(test_subjects)}")
        print(f"  Train samples:  {len(df_train):,}")
        print(f"  Test samples:   {len(df_test):,}")

    # Create output directory for this fold
    windows_dir = windows_root / fold_tag
    windows_dir.mkdir(parents=True, exist_ok=True)
    print(f"  Output directory: {windows_dir}")

    # ----- 4.1 Extract training-set windows -----
    print("\n  [Train] Sliding-window extraction")

    # Defaults used by the statistics step (4.3) when nothing is extracted
    train_windows = []
    df_train_meta = pd.DataFrame()
    train_label_counts = pd.Series(dtype=int)

    if not df_train.empty:
        print(f"  Processing train set ({len(df_train):,} samples)...")

        train_windows, train_metadata = sliding_window_extract(
            df_train, WINDOW_SIZE, STEP_SIZE, DOMINANT_THRESHOLD, time_col
        )

        print(f"  ✓ Extracted train windows: {len(train_windows):,}")

        if train_windows:
            # To numpy array
            X_train = np.array(train_windows, dtype='float32')  # (n_windows, window_size, n_features)
            df_train_meta = pd.DataFrame(train_metadata)

            print(f"    X_train shape: {X_train.shape}")
            print(f"    Feature dims : {X_train.shape[2]} channels × {X_train.shape[1]} timesteps")

            # Label distribution (windows per label, ordered by label id)
            train_label_counts = df_train_meta['label'].value_counts().sort_index()
            print(f"\n    Train-set label distribution:")
            for label, count in train_label_counts.items():
                pct = count / len(df_train_meta) * 100
                print(f"      Label {label}: {count:6,} windows ({pct:5.2f}%)")

            # Label purity stats
            avg_purity = df_train_meta['label_purity'].mean()
            min_purity = df_train_meta['label_purity'].min()
            print(f"\n    Train-set label purity:")
            print(f"      Mean: {avg_purity*100:.2f}%")
            print(f"      Min:  {min_purity*100:.2f}%")

            # Save train set
            print(f"\n    Saving train set...")

            # Save features (numpy)
            X_train_npy_file = windows_dir / "X_train.npy"
            np.save(X_train_npy_file, X_train)
            print(f"      ✓ {X_train_npy_file} (feature tensor)")

            # Save metadata (Parquet); explicit column list fixes the column order
            X_train_meta_file = windows_dir / "X_train.parquet"
            df_train_meta[['window_id', 'subject_id', 'session_id', 'placement',
                           'label', 'label_purity', 'time_range', 'start_idx', 'end_idx']].to_parquet(
                X_train_meta_file, index=False
            )
            print(f"      ✓ {X_train_meta_file} (metadata)")

            # Save label vector (row i is the label of window i in X_train)
            y_train = df_train_meta['label'].values.astype('int32')
            y_train_file = windows_dir / "y_train.npy"
            np.save(y_train_file, y_train)
            print(f"      ✓ {y_train_file}")

            # Export label distribution snapshot (for audit)
            train_label_counts.to_csv(windows_dir / "train_label_counts.csv", header=['count'])
            print(f"      ✓ train_label_counts.csv")
        else:
            print("  ⚠️ No train windows extracted")
    else:
        print("  Train set is empty; skipping")

    # ----- 4.2 Extract test-set windows -----
    # Same procedure as 4.1, applied to the held-out subjects.
    print("\n  [Test] Sliding-window extraction")

    # Defaults used by the statistics step (4.3) when nothing is extracted
    test_windows = []
    df_test_meta = pd.DataFrame()
    test_label_counts = pd.Series(dtype=int)

    if not df_test.empty:
        print(f"  Processing test set ({len(df_test):,} samples)...")

        test_windows, test_metadata = sliding_window_extract(
            df_test, WINDOW_SIZE, STEP_SIZE, DOMINANT_THRESHOLD, time_col
        )

        print(f"  ✓ Extracted test windows: {len(test_windows):,}")

        if test_windows:
            # To numpy array
            X_test = np.array(test_windows, dtype='float32')
            df_test_meta = pd.DataFrame(test_metadata)

            print(f"    X_test shape: {X_test.shape}")

            # Label distribution (windows per label, ordered by label id)
            test_label_counts = df_test_meta['label'].value_counts().sort_index()
            print(f"\n    Test-set label distribution:")
            for label, count in test_label_counts.items():
                pct = count / len(df_test_meta) * 100
                print(f"      Label {label}: {count:6,} windows ({pct:5.2f}%)")

            # Label purity stats
            avg_purity = df_test_meta['label_purity'].mean()
            min_purity = df_test_meta['label_purity'].min()
            print(f"\n    Test-set label purity:")
            print(f"      Mean: {avg_purity*100:.2f}%")
            print(f"      Min:  {min_purity*100:.2f}%")

            # Save test set
            print(f"\n    Saving test set...")

            # Save features (numpy)
            X_test_npy_file = windows_dir / "X_test.npy"
            np.save(X_test_npy_file, X_test)
            print(f"      ✓ {X_test_npy_file} (feature tensor)")

            # Save metadata (Parquet); explicit column list fixes the column order
            X_test_meta_file = windows_dir / "X_test.parquet"
            df_test_meta[['window_id', 'subject_id', 'session_id', 'placement',
                          'label', 'label_purity', 'time_range', 'start_idx', 'end_idx']].to_parquet(
                X_test_meta_file, index=False
            )
            print(f"      ✓ {X_test_meta_file} (metadata)")

            # Save label vector (row i is the label of window i in X_test)
            y_test = df_test_meta['label'].values.astype('int32')
            y_test_file = windows_dir / "y_test.npy"
            np.save(y_test_file, y_test)
            print(f"      ✓ {y_test_file}")

            # Export label distribution snapshot (for audit)
            test_label_counts.to_csv(windows_dir / "test_label_counts.csv", header=['count'])
            print(f"      ✓ test_label_counts.csv")
        else:
            print("  ⚠️ No test windows extracted")
    else:
        print("  Test set is empty; skipping")

    # ----- 4.3 Collect statistics for this fold -----
    # A 'train' / 'test' entry is only present when that side produced windows;
    # values are cast to built-in int/float so they serialize to YAML/JSON.
    fold_key = "all" if fold_id is None else str(fold_id)
    fold_stats[fold_key] = {}

    if train_windows:
        fold_stats[fold_key]['train'] = {
            'n_windows': int(len(train_windows)),
            'n_subjects': int(df_train_meta['subject_id'].nunique()),
            'n_sessions': int(df_train_meta.groupby(['subject_id', 'session_id']).ngroups),
            'label_distribution': {int(k): int(v) for k, v in train_label_counts.items()},
            'avg_label_purity': round(float(df_train_meta['label_purity'].mean()), 4),
            'min_label_purity': round(float(df_train_meta['label_purity'].min()), 4),
        }

    if test_windows:
        fold_stats[fold_key]['test'] = {
            'n_windows': int(len(test_windows)),
            'n_subjects': int(df_test_meta['subject_id'].nunique()),
            'n_sessions': int(df_test_meta.groupby(['subject_id', 'session_id']).ngroups),
            'label_distribution': {int(k): int(v) for k, v in test_label_counts.items()},
            'avg_label_purity': round(float(df_test_meta['label_purity'].mean()), 4),
            'min_label_purity': round(float(df_test_meta['label_purity'].min()), 4),
        }

# ========== 5. Save window configuration (global, multi-fold) ==========

print(f"\n{'=' * 60}")
print("4. Save window configuration (global)")
print('=' * 60)

# Fold identifiers as strings; the pseudo-fold (None) is reported as "all".
fold_ids_str = [str(fid) if fid is not None else "all" for fid in fold_ids]

# The configuration is assembled section by section, then written as YAML
# and JSON with the sections in this order.
window_parameters = {
    'sampling_rate_hz': SAMPLING_RATE_HZ,
    'window_size_sec': WINDOW_SIZE_SEC,
    'window_size_samples': WINDOW_SIZE,
    'overlap_ratio': OVERLAP_RATIO,
    'step_size_samples': STEP_SIZE,
    'dominant_threshold': DOMINANT_THRESHOLD,
}

feature_section = {
    'channels': FEATURE_COLS,
    'n_channels': len(FEATURE_COLS),
    'description': '8-channel IMU features (ax,ay,az,gx,gy,gz,acc_mag,gyr_mag)',
}

split_section = {
    'num_folds': len(fold_ids),
    'fold_ids': fold_ids_str,
    # None when the run used the pseudo-fold because no splits file exists
    'source': str(splits_path) if splits_path.exists() else None,
}

config_notes = [
    f'Window parameters: {WINDOW_SIZE_SEC}s @ {SAMPLING_RATE_HZ}Hz = {WINDOW_SIZE} samples',
    f'Step size: {STEP_SIZE} samples (overlap {OVERLAP_RATIO*100:.0f}%)',
    f'Dominant label threshold: {DOMINANT_THRESHOLD*100:.0f}% (discard windows below threshold)',
    'Features: 8 channels (3-axis accelerometer + 3-axis gyroscope + 2 magnitudes)',
    'Data formats: X_*.npy (float32 tensor), X_*.parquet (metadata), y_*.npy (int32)',
    'Metadata includes: window_id/time_range/label/label_purity, etc.',
    'Slice per session to ensure temporal continuity',
    f'Order by {time_col if time_col else "index"}',
    'Discard windows containing NaN',
    'Time continuity check (allow 10% jitter)',
    'Persist by fold: windows/fold_xx/ (avoid overwrite when looping over folds)',
]

window_config = {
    'window_parameters': window_parameters,
    'features': feature_section,
    'dataset_split': split_section,
    'statistics': fold_stats,
    'notes': config_notes,
}

window_config_file = configs_dir / "windows.yaml"
with open(window_config_file, 'w', encoding='utf-8') as yaml_out:
    yaml.dump(window_config, yaml_out, default_flow_style=False, allow_unicode=True, sort_keys=False)
print(f"✓ Saved config: {window_config_file}")

window_config_json = configs_dir / "windows.json"
with open(window_config_json, 'w', encoding='utf-8') as json_out:
    json.dump(window_config, json_out, indent=2)
print(f"✓ Saved config: {window_config_json}")

# ========== 6. Summary ==========

print(f"\n{'=' * 60}")
print("Step 9 complete - Sliding-window slicing (multi-fold)")
print('=' * 60)

# Recap of the windowing parameters used for every fold.
overlap_pct = f"{OVERLAP_RATIO*100:.0f}"
sort_order = time_col or 'index'
print("\nWindow parameters:")
print(f"  Window length: {WINDOW_SIZE_SEC} s = {WINDOW_SIZE} samples")
print(f"  Step size: {STEP_SIZE} samples (overlap {overlap_pct}%)")
print(f"  Dominant threshold: {DOMINANT_THRESHOLD*100:.0f}%")
print(f"  Feature dimension: {len(FEATURE_COLS)} channels")
print(f"  Sort order: {sort_order}")

# One line per split and fold; a split without an entry produced no windows.
print("\nPer-fold window statistics:")
for fold_key, stats in fold_stats.items():
    print(f"\n  Fold {fold_key}:")
    for split_key, split_label in (('train', 'Train'), ('test', 'Test ')):
        if split_key not in stats:
            print(f"    {split_label}: (no windows)")
            continue
        split_stats = stats[split_key]
        print(
            f"    {split_label}: {split_stats['n_windows']} windows, "
            f"{split_stats['n_subjects']} subjects, {split_stats['n_sessions']} sessions, "
            f"avg purity {split_stats['avg_label_purity']*100:.2f}%"
        )

print("\nOutputs per fold:")
print(f"  Root directory: {windows_root}/")
print("  For each fold: X_train.npy, X_train.parquet, y_train.npy, "
      "train_label_counts.csv (+ test equivalents when applicable)")
print(f"  Global config: {window_config_file}, {window_config_json}")
print('=' * 60)

Step 9: Sliding-window slicing (multi-fold)

Sliding-window parameters:
  Window length: 3.0 s = 150 samples @ 50.0 Hz
  Step size: 75 samples (overlap 50%)
  Dominant label threshold: 80%
  Feature columns: ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']

1. Load cleaned & labeled data
Loading data from: data/lara/mbientlab/proc/labeled/
Data shape: (556504, 16)
Number of subjects: 8
Number of label classes: 6
Time column: time_sec

2. Load train/test splits (folds)
Detected 8 folds from configs/splits.json: [0, 1, 2, 3, 4, 5, 6, 7]

3. Extract windows for each fold

--- Processing fold 0 (fold_00) ---
  Train subjects: 7
  Test subjects:  1
  Train samples:  479,982
  Test samples:   76,522
  Output directory: data/lara/mbientlab/proc/windows/fold_00

  [Train] Sliding-window extraction
  Processing train set (479,982 samples)...
  ✓ Extracted train windows: 4,965
    X_train shape: (4965, 150, 8)
    Feature dims : 8 channels × 150 timesteps

    Train-set label distribut

In [6]:
#!/usr/bin/env python3

"""
Step 10: LOSO Split (top-conf/journal grade)
Leave-One-Subject-Out: 1 subject for test per fold, the rest for training
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json
import yaml
from collections import defaultdict

# Step banner
print("=" * 60, "Step 10: LOSO split", "=" * 60, sep="\n")

# Path configuration: processed data is read from proc_dir, split
# definitions are written to configs_dir (created if missing).
proc_dir = Path("data") / "lara" / "mbientlab" / "proc"
configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

# ========== 1. Load data and get subject list ==========

print(f"\n{'=' * 60}")
print("1. Load data and get subject list")
print('=' * 60)

labeled_dir = proc_dir / "labeled"
print(f"Loading data: {labeled_dir}/")

df = pd.read_parquet(labeled_dir)
n_total_rows = len(df)

print(f"Data shape: {df.shape}")
print(f"Total samples: {n_total_rows:,}")

# Sorted list of distinct subjects; its order defines the fold numbering.
all_subjects = sorted(df['subject_id'].unique().tolist())
n_subjects = len(all_subjects)

print("\nSubject list:")
print(f"  Total: {n_subjects} subjects")
print(f"  IDs: {all_subjects}")

# Rows contributed by each subject, with their share of the dataset.
subject_sample_counts = df['subject_id'].value_counts().sort_index()
print("\nSample count per subject:")
for subj in all_subjects:
    count = subject_sample_counts.get(subj, 0)
    pct = count / n_total_rows * 100
    print(f"  {subj}: {count:8,} samples ({pct:5.2f}%)")

# ========== 2. Generate LOSO split ==========

print(f"\n{'=' * 60}")
print("2. Generate LOSO split")
print('=' * 60)

print("\nLOSO strategy: Leave-One-Subject-Out")
print(f"  #folds = #subjects = {n_subjects}")
print(f"  Per fold: 1 subject for test, {n_subjects-1} subjects for train")

# Fold i holds out the i-th subject of the sorted subject list; the fold id
# is stored as a string key.
splits = {}

for fold_id, test_subject in enumerate(all_subjects):
    # Everyone except the held-out subject trains this fold.
    train_subjects = [other for other in all_subjects if other != test_subject]
    # Kept as a one-element list for compatibility with multi-subject splits.
    test_subjects = [test_subject]

    splits[str(fold_id)] = dict(
        fold_id=fold_id,
        test_subject=test_subject,
        test_subjects=test_subjects,
        train_subjects=train_subjects,
        n_train=len(train_subjects),
        n_test=len(test_subjects),
    )

    print(f"  Fold {fold_id}: test {test_subject}, train {len(train_subjects)} subjects")

print(f"\n✓ Generated {len(splits)} LOSO folds")

# ========== 3. Validate split integrity ==========

print(f"\n{'=' * 60}")
print("3. Validate split integrity")
print('=' * 60)

# Check 1: each subject appears exactly once in the test set
test_subject_appearances = defaultdict(int)
for fold_info in splits.values():
    for subj in fold_info['test_subjects']:
        test_subject_appearances[subj] += 1

print("\nCheck 1: times each subject appears as test")
all_once = True
for subj in all_subjects:
    times_tested = test_subject_appearances[subj]
    tested_once = times_tested == 1
    mark = "✓" if tested_once else "✗"
    print(f"  {mark} {subj}: {times_tested} time(s)")
    all_once = all_once and tested_once

if not all_once:
    raise RuntimeError("Split validation failed: subject test appearances not equal to 1")
print("  ✓ All subjects appear exactly once")

# Check 2: train and test sets are disjoint
print("\nCheck 2: train and test sets are disjoint")
all_disjoint = True
for fold_id, fold_info in splits.items():
    overlap = set(fold_info['train_subjects']).intersection(fold_info['test_subjects'])
    if overlap:
        # Report every offending fold before failing.
        print(f"  ✗ Fold {fold_id}: overlap exists {overlap}")
        all_disjoint = False

if not all_disjoint:
    raise RuntimeError("Split validation failed: train and test sets have overlap")
print("  ✓ Train/test sets are completely disjoint for all folds")

# Check 3: all subjects covered
print("\nCheck 3: all subjects covered")
covered_subjects = set()
for fold_info in splits.values():
    covered_subjects |= set(fold_info['train_subjects'])
    covered_subjects |= set(fold_info['test_subjects'])

known_subjects = set(all_subjects)
missing = known_subjects - covered_subjects
extra = covered_subjects - known_subjects

if missing or extra:
    if missing:
        print(f"  ✗ Missing subjects: {missing}")
    if extra:
        print(f"  ✗ Extra subjects: {extra}")
    raise RuntimeError("Split validation failed: subject coverage incomplete")
print("  ✓ All subjects are covered; no missing or extra subjects")

# Check 4: sample count stats
# Counts the rows falling on the train and test side of every fold and
# summarizes the spread across folds.
print("\nCheck 4: per-fold sample counts")
n_rows_total = len(df)
fold_sample_stats = []
for fold_id, fold_info in splits.items():
    train_subjects = fold_info['train_subjects']
    test_subjects = fold_info['test_subjects']

    n_train_samples = len(df[df['subject_id'].isin(train_subjects)])
    n_test_samples = len(df[df['subject_id'].isin(test_subjects)])

    fold_sample_stats.append(dict(
        fold_id=int(fold_id),
        test_subject=fold_info['test_subject'],
        n_train_samples=n_train_samples,
        n_test_samples=n_test_samples,
        train_ratio=round(n_train_samples / n_rows_total, 4),
        test_ratio=round(n_test_samples / n_rows_total, 4),
    ))

df_fold_stats = pd.DataFrame(fold_sample_stats)

print("\nPer-fold sample distribution:")
print(df_fold_stats.to_string(index=False))

# Summary
train_counts = df_fold_stats['n_train_samples']
test_counts = df_fold_stats['n_test_samples']
print("\nSample distribution summary:")
print(f"  Train sample count: {train_counts.min():,} ~ {train_counts.max():,}")
print(f"  Test sample count: {test_counts.min():,} ~ {test_counts.max():,}")
print(f"  Average train ratio: {df_fold_stats['train_ratio'].mean()*100:.2f}%")
print(f"  Average test ratio: {df_fold_stats['test_ratio'].mean()*100:.2f}%")

print("\n✓ All validations passed")

# ========== 4. Save split configuration ==========

print(f"\n{'=' * 60}")
print("4. Save split configuration")
print('=' * 60)

# Save splits.json (the raw fold definitions)
splits_file = configs_dir / "splits.json"
with open(splits_file, 'w', encoding='utf-8') as splits_out:
    json.dump(splits, splits_out, indent=2)

print(f"✓ Saved: {splits_file}")

# Save detailed config (with metadata)
train_sample_counts = df_fold_stats['n_train_samples']
test_sample_counts = df_fold_stats['n_test_samples']

fold_statistics = {
    'train_samples_min': int(train_sample_counts.min()),
    'train_samples_max': int(train_sample_counts.max()),
    'train_samples_mean': int(train_sample_counts.mean()),
    'test_samples_min': int(test_sample_counts.min()),
    'test_samples_max': int(test_sample_counts.max()),
    'test_samples_mean': int(test_sample_counts.mean()),
    'avg_train_ratio': round(float(df_fold_stats['train_ratio'].mean()), 4),
    'avg_test_ratio': round(float(df_fold_stats['test_ratio'].mean()), 4),
}

# Recorded as constants: the integrity checks earlier in this step raise
# before this point when any of them fails.
validation_flags = {
    'no_subject_overlap': True,
    'all_subjects_covered': True,
    'each_subject_tested_once': True,
}

anti_leakage_principles = [
    'Train and test sets are completely separated by subject',
    'Window slicing is performed after splitting to ensure no cross-fold leakage',
    'Statistics (mean/std) are computed from the training fold only',
    'Feature engineering is performed independently within each fold',
    'Hyperparameter tuning uses training-fold data only (nested CV optional)',
    'Final model evaluation is strictly based on the corresponding fold’s test set',
    'When aggregating results across folds, use metrics from independent test sets',
]

config_notes = [
    f'LOSO split: {n_subjects} folds; 1 subject per fold for test',
    'Ensure each subject appears exactly once in the test set',
    'Train/test sets are mutually exclusive with no subject overlap',
    'Suitable for small-sample settings with large inter-subject variability',
    'Report mean and standard deviation across all folds',
]

loso_config = {
    'strategy': 'LOSO (Leave-One-Subject-Out)',
    'description': 'One subject for test in each fold; remaining subjects for training',
    'n_folds': n_subjects,
    'n_subjects': n_subjects,
    'all_subjects': all_subjects,
    'fold_statistics': fold_statistics,
    'validation': validation_flags,
    'anti_leakage_principles': anti_leakage_principles,
    'notes': config_notes,
}

loso_config_file = configs_dir / "loso.yaml"
with open(loso_config_file, 'w', encoding='utf-8') as yaml_out:
    yaml.dump(loso_config, yaml_out, default_flow_style=False, allow_unicode=True, sort_keys=False)

print(f"✓ Saved: {loso_config_file}")

loso_config_json = configs_dir / "loso.json"
with open(loso_config_json, 'w', encoding='utf-8') as json_out:
    json.dump(loso_config, json_out, indent=2)

print(f"✓ Saved: {loso_config_json}")

# Save per-fold sample stats
fold_stats_file = configs_dir / "loso_fold_stats.csv"
df_fold_stats.to_csv(fold_stats_file, index=False)
print(f"✓ Saved: {fold_stats_file}")

# ========== 5. Generate usage example ==========
# Writes a small, self-describing Python snippet to the configs directory that
# shows how downstream steps are expected to iterate over the saved splits.

print("\n" + "=" * 60)
print("5. Generate usage example")
print("=" * 60)

# The snippet is emitted verbatim; it is documentation for the user, not code
# that this notebook executes.
example_code = '''
# ========== LOSO Usage Example ==========

import json
from pathlib import Path

# 1. Load splits
with open("configs/splits.json", "r") as f:
    splits = json.load(f)

# 2. Iterate over folds
for fold_id in range(len(splits)):
    print(f"\\n========== Fold {fold_id} ==========")

    # Get current fold split
    fold = splits[str(fold_id)]
    train_subjects = fold["train_subjects"]
    test_subject = fold["test_subject"]

    print(f"Train: {len(train_subjects)} subjects")
    print(f"Test: {test_subject}")

    # 3. Set environment variable (used by later steps)
    import os
    os.environ["FOLD_ID"] = str(fold_id)

    # 4. Run training pipeline
    # - Step 6: per-fold clipping (statistics from train only)
    # - Step 7: per-fold standardization (statistics from train only)
    # - Step 9: per-fold windowing
    # - Train model (training windows only)
    # - Evaluate model (test windows only)

    # 5. Save results of current fold
    # results[fold_id] = {"accuracy": acc, "f1": f1, ...}

# 6. Aggregate results across folds
# mean_acc = np.mean([r["accuracy"] for r in results.values()])
# std_acc = np.std([r["accuracy"] for r in results.values()])
# print(f"Mean accuracy: {mean_acc:.4f} ± {std_acc:.4f}")

# ========== Anti-leakage Checklist ==========
# ✓ Train/test separated by subject
# ✓ Statistics (mean/std) computed from training set only
# ✓ Feature scaling uses parameters from training set
# ✓ Windowing performed after splitting
# ✓ Hyperparameter tuning uses training data only
# ✓ Test set used strictly for final evaluation
'''

example_file = configs_dir / "loso_usage_example.py"
example_file.write_text(example_code, encoding='utf-8')

print(f"✓ Generated usage example: {example_file}")

# Short how-to, printed one line per step
print("\nHow to use:")
for howto_line in (
    "  1. export FOLD_ID=0  # set current fold",
    "  2. Run steps 6–9 (they will use the corresponding fold automatically)",
    "  3. Train the model and evaluate",
    "  4. Repeat steps 1–3 for all folds",
    "  5. Aggregate results (mean ± std)",
):
    print(howto_line)

# ========== 6. Split visualization info ==========
# Prints a fixed-width preview table covering at most the first five folds.

print("\n" + "=" * 60)
print("6. Split visualization info")
print("=" * 60)

print("\nLOSO split matrix (first 5 folds):")

# Every column is left-aligned to a fixed width and columns are joined by a
# single space, for the header and the data rows alike.
preview_widths = (6, 12, 12, 12, 12)
preview_header = ('Fold', 'TestSubject', '#TrainSubs', '#TestSamples', '#TrainSamples')
print(" ".join(format(cell, f"<{width}") for cell, width in zip(preview_header, preview_widths)))
print("-" * 60)

n_preview = min(5, len(splits))
for fold_idx in range(n_preview):
    fold_info = splits[str(fold_idx)]
    # First stats row whose fold_id matches this fold
    fold_row = df_fold_stats[df_fold_stats['fold_id'] == fold_idx].iloc[0]
    preview_cells = (
        fold_idx,
        fold_info['test_subject'],
        fold_info['n_train'],
        fold_row['n_test_samples'],
        fold_row['n_train_samples'],
    )
    print(" ".join(format(cell, f"<{width}") for cell, width in zip(preview_cells, preview_widths)))

if len(splits) > 5:
    print(f"... (total {len(splits)} folds)")

# ========== 7. Summary ==========
# Console recap of the split: strategy, data distribution, validation results,
# written files, the anti-leakage checklist and suggested next steps.

print("\n" + "=" * 60, "Step 10 complete - LOSO split", "=" * 60, sep="\n")

print(
    "\nSplit strategy:",
    "  Method: LOSO (Leave-One-Subject-Out)",
    f"  #folds: {n_subjects}",
    f"  #subjects: {n_subjects}",
    f"  Train per fold: {n_subjects-1} subjects",
    "  Test per fold: 1 subject",
    sep="\n",
)

# Ratios are stored as fractions; show them as percentages
avg_train_pct = df_fold_stats['train_ratio'].mean() * 100
avg_test_pct = df_fold_stats['test_ratio'].mean() * 100
print(
    "\nData distribution:",
    f"  Total samples: {len(df):,}",
    f"  Train ratio (avg): {avg_train_pct:.2f}%",
    f"  Test ratio (avg): {avg_test_pct:.2f}%",
    sep="\n",
)

print("\nValidation results:")
for passed_check in (
    "No subject overlap",
    "All subjects covered",
    "Each subject tested exactly once",
    "Train/test sets are disjoint",
):
    print(f"  ✓ {passed_check}")

print(
    "\nOutput files:",
    f"  Main config: {splits_file}",
    f"  Detailed config: {loso_config_file}",
    f"  Fold stats: {fold_stats_file}",
    f"  Usage example: {example_file}",
    sep="\n",
)

print("\nAnti-leakage principles:")
leakage_principles = (
    "Fully separated by subject",
    "Statistics computed from training fold only",
    "Feature engineering is fold-internal",
    "Window slicing performed after splitting",
    "Hyperparameter tuning limited to training data",
    "Test set used strictly for independent evaluation",
    "Cross-fold aggregation uses independent metrics",
)
for principle_no, principle in enumerate(leakage_principles, start=1):
    print(f"  {principle_no}. ✓ {principle}")

print("\nNext steps:")
for next_step in (
    "Set export FOLD_ID=<fold_id>",
    "Re-run steps 6–9 (per-fold processing)",
    "Train and evaluate models",
    "Iterate all folds and aggregate results",
):
    print(f"  - {next_step}")

print("=" * 60)

Step 10: LOSO split

1. Load data and get subject list
Loading data: data/lara/mbientlab/proc/labeled/
Data shape: (556504, 16)
Total samples: 556,504

Subject list:
  Total: 8 subjects
  IDs: ['S07', 'S08', 'S09', 'S10', 'S11', 'S12', 'S13', 'S14']

Sample count per subject:
  S07:   76,522 samples (13.75%)
  S08:   64,857 samples (11.65%)
  S09:   77,701 samples (13.96%)
  S10:   82,659 samples (14.85%)
  S11:   70,410 samples (12.65%)
  S12:   30,923 samples ( 5.56%)
  S13:   82,335 samples (14.80%)
  S14:   71,097 samples (12.78%)

2. Generate LOSO split

LOSO strategy: Leave-One-Subject-Out
  #folds = #subjects = 8
  Per fold: 1 subject for test, 7 subjects for train
  Fold 0: test S07, train 7 subjects
  Fold 1: test S08, train 7 subjects
  Fold 2: test S09, train 7 subjects
  Fold 3: test S10, train 7 subjects
  Fold 4: test S11, train 7 subjects
  Fold 5: test S12, train 7 subjects
  Fold 6: test S13, train 7 subjects
  Fold 7: test S14, train 7 subjects

✓ Generated 8 LOSO fol

In [7]:
#!/usr/bin/env python3
"""
Step 11: Feature Engineering (KNN/RF) - Multi-fold Final Version
Compute time- + frequency- + correlation-domain features per window, with magnitudes & energy
Loop over all folds defined in configs/splits.json and save per-fold outputs.
"""

import os
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import json
import yaml
from scipy import stats, fft
from sklearn.preprocessing import StandardScaler
import pickle

# ========== Config ==========

# Sampling rate handed to the frequency-domain feature extractor
SAMPLING_RATE_HZ = 50.0

print("=" * 60)
print("Step 11: Feature Engineering (KNN/RF) - multi-fold")
print("=" * 60)

# Path config: windows are read from <proc>/windows/<fold>, features are
# written to <proc>/features/<fold>, configs go to ./configs
proc_dir = Path("data/lara/mbientlab/proc")
windows_root = proc_dir / "windows"
features_root = proc_dir / "features"

configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

# Channel names, in the column order used to index each window
channel_names = [
    'ax', 'ay', 'az',
    'gx', 'gy', 'gz',
    'acc_mag', 'gyr_mag',
]

# ========== 0. Determine folds ==========

print("\n" + "=" * 60)
print("0. Determine folds")
print("=" * 60)

splits_path = configs_dir / "splits.json"
if not splits_path.exists():
    # No split definition on disk: fall back to one pseudo-fold named "all"
    print("⚠️ splits.json not found; will treat all data as a single 'all' fold")
    splits = None
    fold_ids = [None]   # pseudo-fold "all"
else:
    with splits_path.open("r") as splits_fh:
        splits = json.load(splits_fh)
    # Keys are fold ids stored as strings; sort them numerically
    fold_ids = sorted(map(int, splits.keys()))
    print(f"Detected {len(fold_ids)} folds from {splits_path}: {fold_ids}")

# ========== 1. Define feature extraction functions (with robustness fixes) ==========

print("\n" + "=" * 60)
print("1. Define feature extraction functions")
print("=" * 60)

def extract_time_features(x):
    """Compute the time-domain descriptors of a single-channel signal.

    Args:
        x: 1-D numeric array holding one channel of one window.

    Returns:
        dict mapping feature name to value, in a fixed insertion order:
        mean, std, min, max, range, q25, q50, q75, iqr, mad, skew, kurtosis,
        rms, energy, zero_crossing_rate, mean_abs_diff.
    """
    # Location / spread
    avg = np.mean(x)
    spread = np.std(x)
    lowest = np.min(x)
    highest = np.max(x)

    # Quartiles and the median they bracket
    lower_q = np.percentile(x, 25)
    middle = np.median(x)
    upper_q = np.percentile(x, 75)

    # Shared intermediates
    squared = x**2
    steps = np.diff(x)
    # A "crossing" is any change of sign (including to/from exactly zero)
    # between two consecutive samples.
    sign_changes = np.sum(np.diff(np.sign(x)) != 0)

    return {
        'mean': avg,
        'std': spread,
        'min': lowest,
        'max': highest,
        'range': highest - lowest,
        'q25': lower_q,
        'q50': middle,
        'q75': upper_q,
        'iqr': upper_q - lower_q,
        # Median absolute deviation around the median
        'mad': np.median(np.abs(x - middle)),
        # Bias-corrected higher-order moments; NaNs in x are ignored
        'skew': stats.skew(x, bias=False, nan_policy='omit'),
        'kurtosis': stats.kurtosis(x, bias=False, fisher=True, nan_policy='omit'),
        'rms': np.sqrt(np.mean(squared)),
        'energy': np.sum(squared),
        'zero_crossing_rate': sign_changes / len(x),
        'mean_abs_diff': np.mean(np.abs(steps)),
    }

def extract_freq_features(x, fs=50.0):
    """Compute spectral descriptors of a single-channel signal.

    The signal is tapered with a Hann window, transformed with an FFT, and only
    the first ``len(x) // 2`` bins are kept. Only the peak-frequency search
    ignores the DC bin; every other descriptor is computed over all kept bins,
    DC included.

    NOTE(review): the signal is not mean-centred before windowing, so a large
    offset may still dominate the bin next to DC - confirm this is intended.

    Args:
        x: 1-D numeric array holding one channel of one window.
        fs: sampling rate in Hz used to label the frequency bins.

    Returns:
        dict with, in order: spectral_energy, spectral_entropy, peak_frequency,
        spectral_centroid, spectral_bandwidth, spectral_rolloff,
        low_freq_energy_ratio.
    """
    n_samples = len(x)
    n_bins = n_samples // 2

    # Hann taper to reduce spectral leakage, then one-sided magnitude spectrum
    tapered = x * np.hanning(n_samples)
    magnitude = np.abs(fft.fft(tapered)[:n_bins])
    freqs = fft.fftfreq(n_samples, 1/fs)[:n_bins]

    # Power spectrum and its normalised form (left unnormalised when silent)
    power = magnitude**2
    total_power = np.sum(power)
    has_power = total_power > 0
    power_share = power / total_power if has_power else power
    share_is_positive = np.sum(power_share) > 0

    # Shannon entropy (bits) over the strictly positive shares
    positive_share = power_share[power_share > 0]
    if len(positive_share) > 0:
        entropy = -np.sum(positive_share * np.log2(positive_share))
    else:
        entropy = 0

    # Strongest bin once DC has been zeroed out
    magnitude_wo_dc = magnitude.copy()
    magnitude_wo_dc[0] = 0.0
    dominant_freq = freqs[np.argmax(magnitude_wo_dc)]

    # First and second spectral moments
    if share_is_positive:
        centroid = np.sum(freqs * power_share)
        bandwidth = np.sqrt(np.sum(((freqs - centroid)**2) * power_share))
    else:
        centroid = 0
        bandwidth = 0

    # Lowest frequency at which 85% of the normalised power is accumulated;
    # falls back to the highest kept bin when that level is never reached
    reached = np.where(np.cumsum(power_share) >= 0.85)[0]
    rolloff = freqs[reached[0]] if len(reached) > 0 else freqs[-1]

    # Share of power at or below 5 Hz
    if has_power:
        low_ratio = np.sum(power[freqs <= 5.0]) / total_power
    else:
        low_ratio = 0

    return {
        'spectral_energy': total_power,
        'spectral_entropy': entropy,
        'peak_frequency': dominant_freq,
        'spectral_centroid': centroid,
        'spectral_bandwidth': bandwidth,
        'spectral_rolloff': rolloff,
        'low_freq_energy_ratio': low_ratio,
    }

def extract_window_features(window, channel_names, fs=50.0):
    """Build the full feature row for one window.

    Args:
        window: 2-D array indexed as ``window[:, channel]``; one column per
            entry of ``channel_names``.
        channel_names: channel labels used as feature-name prefixes, in
            column order.
        fs: sampling rate in Hz forwarded to the frequency-domain extractor.

    Returns:
        dict of features in this order: for every channel its time-domain then
        frequency-domain features (``<channel>_<feature>``), then one
        ``<channel>_autocorr_lag1`` per channel, then one
        ``crosscorr_<a>_<b>`` per unordered channel pair. Undefined (NaN)
        correlations are stored as 0.
    """
    n_channels = window.shape[1]
    feature_row = {}

    # Per-channel descriptors, time-domain first, then frequency-domain
    for col, name in enumerate(channel_names):
        series = window[:, col]
        feature_row.update(
            (f'{name}_{stat}', value)
            for stat, value in extract_time_features(series).items()
        )
        feature_row.update(
            (f'{name}_{stat}', value)
            for stat, value in extract_freq_features(series, fs).items()
        )

    # Channel-by-channel correlation matrix (columns are the variables)
    pairwise = np.corrcoef(window, rowvar=False)

    # Lag-1 autocorrelation: correlate each channel with itself shifted by one
    for col, name in enumerate(channel_names):
        series = window[:, col]
        lag1 = 0
        if len(series) > 1:
            r = np.corrcoef(series[:-1], series[1:])[0, 1]
            if not np.isnan(r):
                lag1 = r
        feature_row[f'{name}_autocorr_lag1'] = lag1

    # Cross-correlation: strict upper triangle only, so each pair appears once
    upper_rows, upper_cols = np.triu_indices(n_channels, k=1)
    for a, b in zip(upper_rows, upper_cols):
        r = pairwise[a, b]
        pair_key = f'crosscorr_{channel_names[a]}_{channel_names[b]}'
        feature_row[pair_key] = 0 if np.isnan(r) else r

    return feature_row

# Human-readable overview of the feature families produced per window
print(
    "Feature types:",
    "  Time-domain: mean, std, min, max, range, q25, q50, q75, iqr, mad,",
    "               skew, kurtosis, rms, energy, zero_crossing_rate, mean_abs_diff",
    "  Frequency-domain: spectral_energy, spectral_entropy, peak_frequency (skip DC),",
    "                    spectral_centroid, spectral_bandwidth, spectral_rolloff,",
    "                    low_freq_energy_ratio",
    "  Correlation: autocorr_lag1 (8 dims), crosscorr_* (28 dims, deduplicated)",
    "  Expected total dimension: 16×8 + 7×8 + 8 + 28 = 220 dims",
    sep="\n",
)

# ========== 2. Loop over folds and run feature engineering ==========

for fold_id in fold_ids:
    # One iteration handles one fold end to end: load its windows, extract
    # handcrafted features, fit a StandardScaler on the training split only,
    # apply it to the test split (if any) and write every artifact into the
    # fold's own directory. Folds with missing inputs are skipped with a warning.
    fold_tag = f"fold_{fold_id:02d}" if fold_id is not None else "all"

    print("\n" + "="*60)
    print(f"2. Processing fold: {fold_tag}")
    print("="*60)

    # ----- Paths for this fold -----
    windows_dir = windows_root / fold_tag
    features_dir = features_root / fold_tag
    features_dir.mkdir(parents=True, exist_ok=True)

    if not windows_dir.exists():
        print(f"⚠️ Windows dir not found for {fold_tag}: {windows_dir}, skipping this fold.")
        continue

    print(f"Windows dir:  {windows_dir}")
    print(f"Features dir: {features_dir}")

    # ========== 3. Load window data & metadata ==========
    print("\n" + "-"*60)
    print("3. Load window data & metadata")
    print("-"*60)

    # Train
    X_train_file = windows_dir / "X_train.npy"
    y_train_file = windows_dir / "y_train.npy"
    train_meta_file = windows_dir / "X_train.parquet"

    # All three training inputs are required; the metadata file is checked as
    # well so a partially written fold is skipped instead of crashing below.
    missing_train = [
        p.name for p in (X_train_file, y_train_file, train_meta_file) if not p.exists()
    ]
    if missing_train:
        print(f"⚠️ Train window data not found for {fold_tag}; please run Step 9 first. Skipping.")
        print(f"   Missing: {', '.join(missing_train)}")
        continue

    X_train = np.load(X_train_file)
    y_train = np.load(y_train_file)
    df_train_meta = pd.read_parquet(train_meta_file)

    assert X_train.shape[0] == len(df_train_meta) == len(y_train), \
        f"[{fold_tag}] Inconsistent train data: X={X_train.shape[0]}, meta={len(df_train_meta)}, y={len(y_train)}"

    print(f"\nTrain windows:")
    print(f"  X_train shape: {X_train.shape}")
    print(f"  y_train shape: {y_train.shape}")
    print(f"  metadata:      {len(df_train_meta)} rows")
    print(f"  ✓ Consistency check passed")

    # Test set (optional: a fold without test files is processed train-only)
    X_test_file = windows_dir / "X_test.npy"
    y_test_file = windows_dir / "y_test.npy"
    test_meta_file = windows_dir / "X_test.parquet"

    if X_test_file.exists() and y_test_file.exists():
        X_test = np.load(X_test_file)
        y_test = np.load(y_test_file)
        # A missing metadata file here is left to raise: test arrays without
        # metadata should not silently turn the fold into a train-only fold.
        df_test_meta = pd.read_parquet(test_meta_file)

        assert X_test.shape[0] == len(df_test_meta) == len(y_test), \
            f"[{fold_tag}] Inconsistent test data: X={X_test.shape[0]}, meta={len(df_test_meta)}, y={len(y_test)}"

        print(f"\nTest windows:")
        print(f"  X_test shape: {X_test.shape}")
        print(f"  y_test shape: {y_test.shape}")
        print(f"  metadata:     {len(df_test_meta)} rows")
        print(f"  ✓ Consistency check passed")
        has_test = True
    else:
        print(f"\nTest windows not found for {fold_tag}; this fold will be train-only.")
        X_test = None
        y_test = None
        df_test_meta = None
        has_test = False

    # ========== Anti-leakage assertions ==========
    # Train and test subjects must be disjoint, and the test split must hold
    # exactly one subject when it is present and non-empty.
    train_subs = set(df_train_meta["subject_id"].unique())
    test_subs = set(df_test_meta["subject_id"].unique()) if has_test else set()
    assert train_subs.isdisjoint(test_subs), f"[{fold_tag}] Train/Test subjects overlap: {train_subs & test_subs}"
    if has_test and len(test_subs) > 0:
        assert len(test_subs) == 1, f"[{fold_tag}] Test set contains multiple subjects: {test_subs}"
    print(f"\n✓ Anti-leakage check passed:")
    print(f"  Train subjects: {len(train_subs)}")
    print(f"  Test subjects:  {len(test_subs)}")

    # ========== 4. Extract training-set features ==========
    print("\n" + "-"*60)
    print("4. Extract training-set features")
    print("-"*60)

    print(f"Processing {X_train.shape[0]:,} training windows...")

    train_features_list = []
    for i, window in enumerate(X_train, start=1):
        train_features_list.append(
            extract_window_features(window, channel_names, SAMPLING_RATE_HZ)
        )

        # Progress report every 1000 windows
        if i % 1000 == 0:
            print(f"  Processed {i:,} / {X_train.shape[0]:,} windows")

    df_train_features = pd.DataFrame(train_features_list)

    print(f"\n✓ Training-set features:")
    print(f"  #samples: {len(df_train_features):,}")
    print(f"  feature dimension: {df_train_features.shape[1]}")

    # Check NaN & Inf; both are replaced by 0 before scaling
    nan_count = df_train_features.isna().sum().sum()
    inf_count = np.isinf(df_train_features.values).sum()

    if nan_count > 0 or inf_count > 0:
        print(f"\n⚠️ Found invalid values in training features:")
        print(f"  NaN: {nan_count}")
        print(f"  Inf: {inf_count}")
        df_train_features = df_train_features.replace([np.inf, -np.inf], np.nan)
        df_train_features = df_train_features.fillna(0)
        print(f"  ✓ Filled with 0")

    # Feature names (column order of the training frame is the reference)
    feature_names = df_train_features.columns.tolist()
    print(f"\nSample feature names (first 10):")
    for name in feature_names[:10]:
        print(f"  - {name}")

    # Low-variance check (raw features, before standardization)
    print(f"\nChecking low-variance features (raw):")
    raw_variance = df_train_features.var()
    low_var_features = raw_variance[raw_variance < 1e-8].index.tolist()
    low_var_file = features_dir / "low_variance_features.txt"

    if low_var_features:
        print(f"  ⚠️ Found {len(low_var_features)} low-variance features (var < 1e-8):")
        for feat in low_var_features[:5]:
            print(f"    - {feat}: var={raw_variance[feat]:.2e}")
        if len(low_var_features) > 5:
            print(f"    ... (total {len(low_var_features)} features)")

        pd.Series(low_var_features).to_csv(low_var_file, index=False, header=False)
        print(f"  ✓ Saved low-variance list: {low_var_file}")
    else:
        # Drop a list left behind by an earlier run so the fold directory only
        # ever describes the current run.
        low_var_file.unlink(missing_ok=True)
        print(f"  ✓ All feature variances look OK")

    # ========== 5. Extract test-set features ==========
    if has_test:
        print("\n" + "-"*60)
        print("5. Extract test-set features")
        print("-"*60)

        print(f"Processing {X_test.shape[0]:,} test windows...")

        test_features_list = []
        for i, window in enumerate(X_test, start=1):
            test_features_list.append(
                extract_window_features(window, channel_names, SAMPLING_RATE_HZ)
            )

            if i % 1000 == 0:
                print(f"  Processed {i:,} / {X_test.shape[0]:,} windows")

        df_test_features = pd.DataFrame(test_features_list)

        print(f"\n✓ Test-set features:")
        print(f"  #samples: {len(df_test_features):,}")
        print(f"  feature dimension: {df_test_features.shape[1]}")

        nan_count = df_test_features.isna().sum().sum()
        inf_count = np.isinf(df_test_features.values).sum()

        if nan_count > 0 or inf_count > 0:
            print(f"\n⚠️ Found invalid values in test features:")
            print(f"  NaN: {nan_count}")
            print(f"  Inf: {inf_count}")
            df_test_features = df_test_features.replace([np.inf, -np.inf], np.nan)
            df_test_features = df_test_features.fillna(0)
            print(f"  ✓ Filled with 0")

    # ========== 6. Feature standardization (train-only stats) ==========
    print("\n" + "-"*60)
    print("6. Feature standardization (train-only stats)")
    print("-"*60)

    # The scaler is fitted on the training features only and then reused,
    # unchanged, for the test features.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(df_train_features)

    print(f"Scaler parameters (train set):")
    print(f"  mean range: [{scaler.mean_.min():.4f}, {scaler.mean_.max():.4f}]")
    print(f"  std range:  [{scaler.scale_.min():.4f}, {scaler.scale_.max():.4f}]")

    df_train_scaled = pd.DataFrame(X_train_scaled, columns=feature_names).astype('float32')

    if has_test:
        X_test_scaled = scaler.transform(df_test_features)
        df_test_scaled = pd.DataFrame(X_test_scaled, columns=feature_names).astype('float32')
        print(f"\n✓ Standardized test set using train-set statistics")

    print(f"✓ Features cast to: float32")

    # ========== 7. Save feature data (with metadata) ==========
    print("\n" + "-"*60)
    print("7. Save feature data (with metadata)")
    print("-"*60)

    # Train set
    train_X_file = features_dir / "train_X.parquet"
    train_y_file = features_dir / "train_y.parquet"
    train_meta_output = features_dir / "train_meta.parquet"

    df_train_scaled.to_parquet(train_X_file, index=False)
    pd.DataFrame({'label': y_train}).to_parquet(train_y_file, index=False)
    df_train_meta[['window_id', 'subject_id', 'session_id', 'placement',
                   'time_range', 'label', 'label_purity']].to_parquet(
        train_meta_output, index=False
    )

    print(f"✓ Train set:")
    print(f"  features: {train_X_file}")
    print(f"  labels:   {train_y_file}")
    print(f"  metadata: {train_meta_output}")

    # Test set
    if has_test:
        test_X_file = features_dir / "test_X.parquet"
        test_y_file = features_dir / "test_y.parquet"
        test_meta_output = features_dir / "test_meta.parquet"

        df_test_scaled.to_parquet(test_X_file, index=False)
        pd.DataFrame({'label': y_test}).to_parquet(test_y_file, index=False)
        df_test_meta[['window_id', 'subject_id', 'session_id', 'placement',
                      'time_range', 'label', 'label_purity']].to_parquet(
            test_meta_output, index=False
        )

        print(f"✓ Test set:")
        print(f"  features: {test_X_file}")
        print(f"  labels:   {test_y_file}")
        print(f"  metadata: {test_meta_output}")

    # Scaler
    scaler_file = features_dir / "scaler.pkl"
    with open(scaler_file, 'wb') as f:
        pickle.dump(scaler, f)
    print(f"✓ Scaler: {scaler_file}")

    # ========== 8. Feature type stats (exact counts, revised) ==========
    print("\n" + "-"*60)
    print("8. Feature type stats (exact counts, revised)")
    print("-"*60)

    # Features are grouped by name pattern: frequency-domain, correlation, and
    # everything else counted as time-domain.
    freq_features = [
        f for f in feature_names
        if 'spectral_' in f
        or f.endswith('_peak_frequency')
        or f.endswith('_low_freq_energy_ratio')
    ]

    corr_features = [f for f in feature_names
                     if 'autocorr' in f or 'crosscorr' in f]

    time_features = [f for f in feature_names
                     if f not in freq_features and f not in corr_features]

    print(f"\nFeature counts (exact) for {fold_tag}:")
    print(f"  Total: {len(feature_names)}")
    print(f"  Time-domain:      {len(time_features)}")
    print(f"  Frequency-domain: {len(freq_features)}")
    print(f"  Correlation:      {len(corr_features)}")
    print(f"    - Autocorr: {len([f for f in corr_features if 'autocorr' in f])}")
    print(f"    - Crosscorr: {len([f for f in corr_features if 'crosscorr' in f])} (deduplicated)")

    total_check = len(time_features) + len(freq_features) + len(corr_features)
    assert total_check == len(feature_names), \
        f"[{fold_tag}] Feature count mismatch: {total_check} ≠ {len(feature_names)}"
    print(f"✓ Count verification: {len(time_features)} + {len(freq_features)} + {len(corr_features)} = {len(feature_names)}")

    # ========== 9. Save feature config (named by fold) ==========
    print("\n" + "-"*60)
    print("9. Save feature config (named by fold)")
    print("-"*60)

    feature_config = {
        'feature_extraction': {
            'method': 'handcrafted (time + frequency + correlation)',
            'sampling_rate_hz': SAMPLING_RATE_HZ,
            'n_channels': len(channel_names),
            'channel_names': channel_names,
        },
        'feature_types': {
            'time_domain': [
                'mean', 'std', 'min', 'max', 'range',
                'q25', 'q50', 'q75', 'iqr', 'mad',
                'skew (bias=False)', 'kurtosis (bias=False, fisher=True)',
                'rms', 'energy',
                'zero_crossing_rate', 'mean_abs_diff'
            ],
            'frequency_domain': [
                'spectral_energy', 'spectral_entropy',
                'peak_frequency (skip DC)', 'spectral_centroid',
                'spectral_bandwidth', 'spectral_rolloff',
                'low_freq_energy_ratio'
            ],
            'correlation': [
                'autocorr_lag1 (8 dims: per channel)',
                'crosscorr_* (28 dims: C(8,2) pairs, deduplicated)'
            ]
        },
        'feature_dimensions': {
            'total': len(feature_names),
            'time_domain': len(time_features),
            'frequency_domain': len(freq_features),
            'correlation': len(corr_features),
            'correlation_breakdown': {
                'autocorr': len([f for f in corr_features if 'autocorr' in f]),
                'crosscorr': len([f for f in corr_features if 'crosscorr' in f]),
            }
        },
        'preprocessing': {
            'scaler': 'StandardScaler',
            'fit_on': 'training set only',
            'nan_inf_handling': 'fill with 0',
            'dtype': 'float32',
        },
        'dataset': {
            # None for the pseudo-fold "all"
            'fold_id': fold_id,
            'fold_tag': fold_tag,
            'train_samples': int(len(df_train_scaled)),
            'test_samples': int(len(df_test_scaled)) if has_test else 0,
        },
        'feature_names': feature_names,
        'improvements': [
            'Cross-correlation deduplication: keep upper triangle only, C(8,2)=28 pairs',
            'Higher-order stats: bias=False, fisher=True, nan_policy=omit',
            'Feature counting: exact (time = not frequency and not correlation)',
            'Frequency robustness: Hann window + skip DC for peak frequency',
            'Data type: float32 to save space',
            'Traceability: save window_id/session/time metadata',
            'Per-fold output: avoid overwriting different folds',
            'Consistency assertions: ensure X/y/meta alignment',
            'Low-variance check: on raw features (before standardization)',
            'Anti-leakage assertions: train/test subjects disjoint + single test subject under LOSO',
        ],
        'notes': [
            f'Total feature dimension: {len(feature_names)} (currently ~220)',
            f'Time-domain: {len(time_features)} dims',
            f'Frequency-domain: {len(freq_features)} dims',
            f'Correlation: {len(corr_features)} dims (autocorr + crosscorr)',
            'Standardization uses train set statistics only (anti-leakage)',
            'NaN/Inf filled with 0',
            'Suitable for KNN/RF/SVM and other classical ML models',
        ]
    }

    feature_config_file = configs_dir / f"features_{fold_tag}.yaml"
    with open(feature_config_file, 'w', encoding='utf-8') as f:
        yaml.dump(feature_config, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    print(f"✓ Saved config: {feature_config_file}")

    feature_config_json = configs_dir / f"features_{fold_tag}.json"
    with open(feature_config_json, 'w', encoding='utf-8') as f:
        json.dump(feature_config, f, indent=2)
    print(f"✓ Saved config: {feature_config_json}")

    # Save feature name list (explicit UTF-8, like the other text outputs)
    feature_names_file = features_dir / "feature_names.txt"
    with open(feature_names_file, 'w', encoding='utf-8') as f:
        for name in feature_names:
            f.write(f"{name}\n")
    print(f"✓ Saved feature names: {feature_names_file}")

    # ========== 10. Feature statistics (train standardized) ==========
    print("\n" + "-"*60)
    print("10. Train feature statistics (standardized)")
    print("-"*60)

    print(f"\nTraining-set feature stats (standardized, {fold_tag}):")
    print(df_train_scaled.describe().T[['mean', 'std', 'min', 'max']].head(10))

    # ========== 11. Per-fold summary ==========
    print("\n" + "-"*60)
    print(f"11. Summary for fold {fold_tag}")
    print("-"*60)

    print(f"\nFeature extraction:")
    print(f"  Method: handcrafted (time + frequency + correlation)")
    print(f"  Total dims:      {len(feature_names)}")
    print(f"  Time-domain:     {len(time_features)} dims")
    print(f"  Frequency-domain:{len(freq_features)} dims")
    print(f"  Correlation:     {len(corr_features)} dims")

    print(f"\nDataset:")
    print(f"  Fold:           {fold_tag}")
    print(f"  Train samples:  {len(df_train_scaled):,}")
    if has_test:
        print(f"  Test samples:   {len(df_test_scaled):,}")

    print(f"\nPreprocessing:")
    print(f"  Standardization: StandardScaler (fit on train only)")
    print(f"  Invalid values:  NaN/Inf filled with 0")
    print(f"  Dtype:           float32")

    print(f"\nOutputs ({fold_tag}):")
    print(f"  Dir:    {features_dir}/")
    print(f"  Train:  train_X.parquet, train_y.parquet, train_meta.parquet")
    if has_test:
        print(f"  Test:   test_X.parquet, test_y.parquet, test_meta.parquet")
    print(f"  Scaler: scaler.pkl")
    print(f"  Config: {feature_config_file.name}")
    print(f"  Feature list: feature_names.txt")
    if low_var_features:
        print(f"  Low variance: low_variance_features.txt")

# ========== Global summary ==========
# Closing banner for Step 11 with pointers to the per-fold outputs.

print(
    "\n" + "=" * 60,
    "Step 11 complete - Feature Engineering for all folds",
    "=" * 60,
    "Next steps per fold:",
    "  - Use train_X.parquet / train_y.parquet to train classical machine-learning models (e.g., KNN, Random Forest, SVM)",
    "  - Use test_X.parquet / test_y.parquet to evaluate model performance",
    "=" * 60,
    sep="\n",
)

Step 11: Feature Engineering (KNN/RF) - multi-fold

0. Determine folds
Detected 8 folds from configs/splits.json: [0, 1, 2, 3, 4, 5, 6, 7]

1. Define feature extraction functions
Feature types:
  Time-domain: mean, std, min, max, range, q25, q50, q75, iqr, mad,
               skew, kurtosis, rms, energy, zero_crossing_rate, mean_abs_diff
  Frequency-domain: spectral_energy, spectral_entropy, peak_frequency (skip DC),
                    spectral_centroid, spectral_bandwidth, spectral_rolloff,
                    low_freq_energy_ratio
  Correlation: autocorr_lag1 (8 dims), crosscorr_* (28 dims, deduplicated)
  Expected total dimension: 16×8 + 7×8 + 8 + 28 = 220 dims

2. Processing fold: fold_00
Windows dir:  data/lara/mbientlab/proc/windows/fold_00
Features dir: data/lara/mbientlab/proc/features/fold_00

------------------------------------------------------------
3. Load window data & metadata
------------------------------------------------------------

Train windows:
  X_train shape:

In [8]:
#!/usr/bin/env python3
"""
Step 12: Normalization (feature/deep) - top-conf/journal grade

Classical models: features already standardized in Step 11
Deep models: per-channel z-score on windowed data (train-set statistics only)
Multi-fold version: loop over all folds in configs/splits.json
"""

import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import json
import yaml
import os
import hashlib

# ========== Config ==========
EPSILON = 1e-8  # avoid divide-by-zero

print("=" * 60)
print("Step 12: Normalization (feature/deep) - multi-fold")
print("=" * 60)

# Path config: windows are read per fold, scalers are written per fold
proc_dir = Path("data/lara/mbientlab/proc")
windows_root = proc_dir / "windows"

configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

scalers_root = proc_dir / "scalers"
scalers_root.mkdir(parents=True, exist_ok=True)

# ========== 0. Determine folds ==========
print("\n" + "=" * 60)
print("0. Determine folds")
print("=" * 60)

splits_path = configs_dir / "splits.json"
if not splits_path.exists():
    # Under strict LOSO protocol, splits.json should exist; this is a conservative fallback
    print("⚠️ splits.json not found; normalization for deep models expects LOSO folds.")
    fold_ids = [0]  # Fallback (e.g., when you only have a single fold); ideally, splits.json should be present
else:
    with splits_path.open("r") as splits_fh:
        splits = json.load(splits_fh)
    # Keys are fold ids stored as strings; sort them numerically
    fold_ids = sorted(map(int, splits.keys()))
    print(f"Detected {len(fold_ids)} folds from {splits_path}: {fold_ids}")

# ========== 0.1 Load channel config ==========
print("\n" + "=" * 60)
print("0.1 Load channel config")
print("=" * 60)

channels_config_file = configs_dir / "channels.yaml"
if not channels_config_file.exists():
    # No channel config on disk: use the built-in default channel list
    channel_names = ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']
    print(f"⚠️ channels.yaml not found; using default: {channel_names}")
else:
    with channels_config_file.open('r', encoding='utf-8') as channels_fh:
        channels_config = yaml.safe_load(channels_fh)
    channel_names = channels_config['final_channels']
    print(f"✓ Channels read from config: {channel_names}")

n_channels_config = len(channel_names)

# ========== Helper: SHA256 ==========
def _sha256(p: Path, chunk_size: int = 1 << 20) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *p*.

    The file is hashed in chunks of ``chunk_size`` bytes so that large
    ``.npy`` arrays are never held in memory in one piece just to be hashed.

    Args:
        p: Path of the file to hash.
        chunk_size: Number of bytes read per iteration (default 1 MiB).

    Returns:
        The digest as a lowercase hex string.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    h = hashlib.sha256()
    with open(p, "rb") as f:
        # read() returns b"" at EOF, which ends the loop.
        while chunk := f.read(chunk_size):
            h.update(chunk)
    return h.hexdigest()

def _check_finite(name, arr):
    """Raise ``ValueError`` if *arr* contains any NaN or Inf entry.

    Args:
        name: Label for the array, used in the error message.
        arr: Array-like of numeric values to inspect.

    Raises:
        ValueError: When at least one entry is not finite; the message
            reports how many such entries were found.
    """
    finite_mask = np.isfinite(arr)
    if finite_mask.all():
        return
    n_invalid = int(np.logical_not(finite_mask).sum())
    raise ValueError(f"{name} contains NaN/Inf, total: {n_invalid}")

# ========== Loop over folds ==========
# For every fold: confirm the Step 11 feature scaler, load the windowed arrays,
# run the finite-value and anti-leakage checks, fit per-channel mean/std on the
# TRAIN windows only, apply them to train (and test, when present), then save
# the scaled arrays, the scaler parameters (pkl + json) and a per-fold YAML.

def _as_builtin_sorted(values):
    """Return *values* sorted, with NumPy scalars unwrapped to plain Python objects.

    ``json.dump`` cannot serialise NumPy scalar types (for example
    ``numpy.int64`` ids produced by ``Series.unique()``), so they are
    converted with ``.item()`` before being stored in the scaler record.
    """
    return sorted(v.item() if isinstance(v, np.generic) else v for v in values)


for fold_id in fold_ids:
    fold_tag = f"fold_{fold_id:02d}"

    print("\n" + "=" * 60)
    print(f"Processing fold: {fold_tag}")
    print("=" * 60)

    windows_dir = windows_root / fold_tag
    scalers_dir = scalers_root / fold_tag
    scalers_dir.mkdir(parents=True, exist_ok=True)

    print(f"Input dir (windows):  {windows_dir}")
    print(f"Output dir (scalers): {scalers_dir}")

    if not windows_dir.exists():
        print(f"⚠️ Windows dir not found for {fold_tag}: {windows_dir}, skipping this fold.")
        continue

    # ========== 1. Classical models: confirm features already standardized ==========
    print("\n" + "=" * 60)
    print("1. Classical models: confirm features already standardized")
    print("=" * 60)

    features_dir = proc_dir / "features" / fold_tag
    feature_scaler_file = features_dir / "scaler.pkl"

    if feature_scaler_file.exists():
        # NOTE(review): pickle.load executes arbitrary code from the file; only
        # load scaler files that this pipeline itself produced.
        with open(feature_scaler_file, 'rb') as f:
            feature_scaler = pickle.load(f)
        print(f"✓ Feature scaler was generated in Step 11: {feature_scaler_file}")
        print("  StandardScaler parameters:")
        print(f"  - #features: {len(feature_scaler.mean_)}")
        print(f"  - mean range: [{feature_scaler.mean_.min():.4f}, {feature_scaler.mean_.max():.4f}]")
        print(f"  - std  range: [{feature_scaler.scale_.min():.4f}, {feature_scaler.scale_.max():.4f}]")
    else:
        print(f"⚠️ Feature scaler not found for {fold_tag}: {feature_scaler_file}")
        print(f"   Classical (KNN/RF) features for this fold may not yet have been processed in Step 11")

    # ========== 2. Deep models: load window data ==========
    print("\n" + "=" * 60)
    print("2. Deep models: load window data")
    print("=" * 60)

    # Train set
    X_train_file = windows_dir / "X_train.npy"
    y_train_file = windows_dir / "y_train.npy"
    train_meta_file = windows_dir / "X_train.parquet"

    if not X_train_file.exists():
        print(f"⚠️ Training windows not found for {fold_tag}: {X_train_file}, skipping this fold.")
        continue

    X_train = np.load(X_train_file)
    y_train = np.load(y_train_file)
    df_train_meta = pd.read_parquet(train_meta_file)

    print(f"\nTrain set:")
    print(f"  X_train shape: {X_train.shape}, dtype={X_train.dtype}")

    # Test set (optional: without it the fold runs in train-only mode)
    X_test_file = windows_dir / "X_test.npy"
    y_test_file = windows_dir / "y_test.npy"
    test_meta_file = windows_dir / "X_test.parquet"

    has_test = X_test_file.exists()
    if has_test:
        X_test = np.load(X_test_file)
        y_test = np.load(y_test_file)
        df_test_meta = pd.read_parquet(test_meta_file)
        print(f"\nTest set:")
        print(f"  X_test shape:  {X_test.shape}, dtype={X_test.dtype}")
    else:
        print(f"\nTest set not found for {fold_tag}; train-only mode for deep normalization.")
        X_test = None
        y_test = None
        df_test_meta = None

    # ========== 2.1. Numerical robustness self-check ==========
    print("\n" + "=" * 60)
    print("2.1. Numerical robustness self-check")
    print("=" * 60)

    _check_finite("X_train", X_train)
    print(f"✓ X_train has no NaN/Inf")

    if has_test:
        _check_finite("X_test", X_test)
        print(f"✓ X_test has no NaN/Inf")

    # ========== 2.2. Anti-leakage & consistency checks ==========
    print("\n" + "=" * 60)
    print("2.2. Anti-leakage & consistency checks")
    print("=" * 60)

    # Channel count (last axis of the window array) must match the config
    n_channels = X_train.shape[2]
    assert n_channels == n_channels_config, \
        f"[{fold_tag}] Channel count mismatch: data {n_channels} vs config {n_channels_config}"
    print(f"✓ Channel count consistent: {n_channels}")

    # Train statistics are broadcast over the test windows, so both arrays
    # must agree on everything except the number of windows.
    if has_test:
        assert X_test.shape[1:] == X_train.shape[1:], \
            f"[{fold_tag}] Window shape mismatch: train {X_train.shape[1:]} vs test {X_test.shape[1:]}"

    # Validate channel order (sidecar)
    channels_sidecar = windows_dir / "channels.json"
    if channels_sidecar.exists():
        with open(channels_sidecar, "r", encoding="utf-8") as f:
            side = json.load(f)["channel_names"]
        assert side == channel_names, \
            f"[{fold_tag}] Channel order inconsistent:\n  windows={side}\n  config ={channel_names}"
        print(f"✓ Channel order consistent: {channel_names}")
    else:
        print(f"⚠️ channels.json sidecar not found in {windows_dir}; only checking channel count")

    # Train/Test subjects
    train_subjects = set(df_train_meta['subject_id'].unique())
    if has_test:
        test_subjects = set(df_test_meta['subject_id'].unique())
        assert train_subjects.isdisjoint(test_subjects), \
            f"[{fold_tag}] Train/test subjects overlap {train_subjects & test_subjects}, violates LOSO!"
        assert len(test_subjects) == 1, \
            f"[{fold_tag}] LOSO test set should contain exactly 1 subject; got: {len(test_subjects)}"
        print(f"✓ Anti-leakage check passed:")
        print(f"  Train subjects: {len(train_subjects)}")
        print(f"  Test  subjects: {len(test_subjects)} {test_subjects}")
        print(f"  Subject sets disjoint: True")
    else:
        test_subjects = set()
        print(f"✓ Train-only mode (train subjects: {len(train_subjects)})")

    # ========== 3. Compute channel-wise statistics (train only) ==========
    print("\n" + "=" * 60)
    print("3. Compute channel-wise statistics (train only)")
    print("=" * 60)

    print(f"\n#channels: {n_channels}")
    print(f"Channel names: {channel_names}")

    # One pass over the train windows, reduced over (window, time) per channel.
    # Accumulate in float64: the windows are stored as float32 and a float32
    # reduction over this many samples loses precision.
    channel_mean = np.mean(X_train, axis=(0, 1), dtype=np.float64)
    channel_std = np.std(X_train, axis=(0, 1), dtype=np.float64)

    # Check whether data already appears z-scored to avoid double-normalization
    already_z = (np.all(np.abs(channel_mean) < 1e-3) and
                 np.all(np.abs(channel_std - 1.0) < 1e-2))
    skip_normalization = bool(already_z)

    if skip_normalization:
        print(f"\n⚠️ Detected X_train appears already z-scored (mean≈0, std≈1); skipping normalization.")
        print(f"  Probed mean range: [{channel_mean.min():.4f}, {channel_mean.max():.4f}]")
        print(f"  Probed std  range: [{channel_std.min():.4f}, {channel_std.max():.4f}]")

        # Pass the data through unchanged (the probed statistics are still recorded)
        X_train_scaled = X_train.astype(np.float32, copy=True)
        if has_test:
            X_test_scaled = X_test.astype(np.float32, copy=True)

        print(f"  ✓ Skipped normalization; copied as float32")
    else:
        # Floor the std so constant channels do not cause a divide-by-zero
        channel_std = np.maximum(channel_std, EPSILON)

        print(f"\nTrain-set channel statistics:")
        for i, ch_name in enumerate(channel_names):
            print(f"  {ch_name}: mean={channel_mean[i]:7.4f}, std={channel_std[i]:7.4f}")

    # ========== 4. Apply channel-wise z-score normalization ==========
    print("\n" + "=" * 60)
    print("4. Apply channel-wise z-score normalization")
    print("=" * 60)

    if not skip_normalization:
        # The arithmetic runs in float64 (stats dtype); the result is stored as float32
        X_train_scaled = ((X_train - channel_mean) / channel_std).astype(np.float32)
        print(f"\nTrain normalization:")
        print(f"  input:  {X_train.shape}, {X_train.dtype}")
        print(f"  output: {X_train_scaled.shape}, {X_train_scaled.dtype}")

        # Verify
        train_scaled_mean = np.mean(X_train_scaled, axis=(0, 1), dtype=np.float64)
        train_scaled_std = np.std(X_train_scaled, axis=(0, 1), dtype=np.float64)
        print(f"\nTrain-set stats after normalization (should be ~0 and ~1):")
        for i, ch_name in enumerate(channel_names):
            print(f"  {ch_name}: mean={train_scaled_mean[i]:7.4f}, std={train_scaled_std[i]:7.4f}")

        if has_test:
            # Test windows are scaled with the TRAIN statistics (no leakage)
            X_test_scaled = ((X_test - channel_mean) / channel_std).astype(np.float32)
            print(f"\nTest normalization (using train statistics):")
            print(f"  input:  {X_test.shape}, {X_test.dtype}")
            print(f"  output: {X_test_scaled.shape}, {X_test_scaled.dtype}")

            test_scaled_mean = np.mean(X_test_scaled, axis=(0, 1), dtype=np.float64)
            test_scaled_std = np.std(X_test_scaled, axis=(0, 1), dtype=np.float64)
            print(f"\nTest-set stats after normalization (train params; not 0/1):")
            for i, ch_name in enumerate(channel_names):
                print(f"  {ch_name}: mean={test_scaled_mean[i]:7.4f}, std={test_scaled_std[i]:7.4f}")
    else:
        print(f"\n✓ Skipped normalization (data already normalized)")
        print(f"  Train: {X_train_scaled.shape}")
        if has_test:
            print(f"  Test:  {X_test_scaled.shape}")

    # ========== 5. Save normalized data ==========
    print("\n" + "=" * 60)
    print("5. Save normalized data")
    print("=" * 60)

    X_train_scaled_file = scalers_dir / "X_train_scaled.npy"
    y_train_scaled_file = scalers_dir / "y_train.npy"
    train_meta_scaled_file = scalers_dir / "train_meta.parquet"

    # Arrays are already float32; copy=False avoids a redundant copy on save
    np.save(X_train_scaled_file, X_train_scaled.astype(np.float32, copy=False))
    np.save(y_train_scaled_file, y_train)
    df_train_meta.to_parquet(train_meta_scaled_file, index=False)

    print(f"✓ Train set:")
    print(f"  features: {X_train_scaled_file}")
    print(f"  labels:   {y_train_scaled_file}")
    print(f"  metadata: {train_meta_scaled_file}")

    if has_test:
        X_test_scaled_file = scalers_dir / "X_test_scaled.npy"
        y_test_scaled_file = scalers_dir / "y_test.npy"
        test_meta_scaled_file = scalers_dir / "test_meta.parquet"

        np.save(X_test_scaled_file, X_test_scaled.astype(np.float32, copy=False))
        np.save(y_test_scaled_file, y_test)
        df_test_meta.to_parquet(test_meta_scaled_file, index=False)

        print(f"✓ Test set:")
        print(f"  features: {X_test_scaled_file}")
        print(f"  labels:   {y_test_scaled_file}")
        print(f"  metadata: {test_meta_scaled_file}")

    # ========== 6. Save scaler parameters (with traceability) ==========
    print("\n" + "=" * 60)
    print("6. Save scaler parameters (with traceability)")
    print("=" * 60)

    print(f"\nComputing input file SHA256...")
    x_train_sha = _sha256(X_train_file)
    print(f"  X_train: {x_train_sha[:16]}...")
    if has_test:
        x_test_sha = _sha256(X_test_file)
        print(f"  X_test:  {x_test_sha[:16]}...")
    else:
        x_test_sha = None

    # Everything stored here must be a plain Python object: the same record is
    # written both as pickle and as JSON.
    channel_scaler = {
        'fold_id': fold_id,
        'fold_tag': fold_tag,
        'epsilon': EPSILON,
        'n_channels': int(n_channels),
        'channel_names': channel_names,
        'channel_mean': channel_mean.tolist(),
        'channel_std': channel_std.tolist(),
        'train_subjects': _as_builtin_sorted(train_subjects),
        'test_subjects': _as_builtin_sorted(test_subjects) if has_test else [],
        'formula': '(x - channel_mean) / max(channel_std, ε)' if not skip_normalization else 'identity (already normalized)',
        'method': 'channel-wise z-score with ε floor' if not skip_normalization else 'skip (data already normalized)',
        'skip_normalization': bool(skip_normalization),
        'input_files': {
            'x_train_file': str(X_train_file.name),
            'x_train_sha256': x_train_sha,
            'x_test_file': str(X_test_file.name) if has_test else None,
            'x_test_sha256': x_test_sha,
        },
        'data_shape': {
            'window_size': int(X_train.shape[1]),
            'n_channels': int(X_train.shape[2]),
            'n_windows_train': int(X_train.shape[0]),
            'n_windows_test': int(X_test.shape[0]) if has_test else 0,
        },
        'notes': [
            f'Per-channel normalization ({int(n_channels)} channels independently compute mean/std)',
            'Statistics computed from train set only',
            'Test set normalized using train-set statistics',
            'Use np.maximum(std, ε) to avoid divide-by-zero',
            'Auto-detect double-standardization and skip if needed',
            'Consistent with Step 7 anti-leakage principles',
            'Suitable for CNN/LSTM/Transformer and other deep models',
            'Data shape: (n_windows, window_size, n_channels)',
            'Includes input file SHA256 for traceability',
        ]
    }

    channel_scaler_pkl = scalers_dir / "channel_scaler.pkl"
    with open(channel_scaler_pkl, 'wb') as f:
        pickle.dump(channel_scaler, f)
    print(f"✓ Saved scaler (pkl): {channel_scaler_pkl}")

    channel_scaler_json = scalers_dir / "channel_scaler.json"
    with open(channel_scaler_json, 'w', encoding='utf-8') as f:
        json.dump(channel_scaler, f, indent=2)
    print(f"✓ Saved scaler (json): {channel_scaler_json}")

    # ========== 7. Save config ==========
    print("\n" + "=" * 60)
    print("7. Save config")
    print("=" * 60)

    normalization_config = {
        'traditional_models': {
            'method': 'feature-wise StandardScaler',
            'scaler_file': str(feature_scaler_file.relative_to(proc_dir)) if feature_scaler_file.exists() else None,
            'description': 'Feature-level z-score standardization (completed in Step 11)',
            'input': 'features/*/train_X.parquet / test_X.parquet',
            'output': 'features/*/*.parquet (already standardized)',
        },
        'deep_learning_models': {
            'method': 'channel-wise z-score',
            'scaler_file': str(channel_scaler_pkl.relative_to(proc_dir)),
            'description': 'Channel-wise z-score standardization (per-channel)',
            'input': 'windows/*/X_*.npy',
            'output': 'scalers/*/X_*_scaled.npy',
            'formula': '(x - channel_mean) / max(channel_std, ε)',
            'epsilon': EPSILON,
        },
        'fold_info': {
            'fold_id': fold_id,
            'fold_tag': fold_tag,
        },
        'anti_leakage': [
            'All statistics (mean/std) computed from train set only',
            'Test set uses train-set statistics for normalization',
            'Fully consistent with Step 7 principles',
            'Each fold computes and saves its own scaler',
            'Strict LOSO: per-fold processing',
        ],
        'notes': [
            'Classical models use feature-level standardization (Step 11)',
            'Deep models use channel-level standardization (this step)',
            'The two normalization routes are independent',
            'Post-normalization data cast to float32 to save space',
            f'Use np.maximum(std, {EPSILON}) to avoid divide-by-zero',
            'Channel names read from channels.yaml to avoid hard-coding',
            'Anti-leakage assertion: train/test subjects are disjoint',
            'Auto-detect double-standardization and skip (risk mitigation)',
            'Numerical robustness check: NaN/Inf self-check',
            'Channel order validation: compare with channels.json sidecar',
            'Enhanced traceability: record input file SHA256 and window size',
        ]
    }

    norm_config_file = configs_dir / f"normalization_{fold_tag}.yaml"
    with open(norm_config_file, 'w', encoding='utf-8') as f:
        yaml.dump(normalization_config, f, default_flow_style=False,
                  allow_unicode=True, sort_keys=False)
    print(f"✓ Saved config: {norm_config_file}")

    # ========== 8. Summary for this fold ==========
    print("\n" + "=" * 60)
    print(f"Summary for fold {fold_tag}")
    print("=" * 60)

    print(f"\nClassical models path:")
    print(f"  Method: feature-level StandardScaler")
    print(f"  Status: {'✓ completed (Step 11)' if feature_scaler_file.exists() else '⚠️ missing for this fold'}")

    print(f"\nDeep-learning path:")
    print(f"  Method: channel-wise z-score")
    print(f"  Formula: (x - mean) / max(std, ε)")
    print(f"  Skip normalization: {'Yes (data already normalized)' if skip_normalization else 'No'}")
    print(f"  #channels: {n_channels}")
    print(f"  Train windows: {X_train_scaled.shape[0]:,}")
    if has_test:
        print(f"  Test  windows: {X_test_scaled.shape[0]:,}")

    print(f"\nOutputs ({fold_tag}):")
    print(f"  Normalized data: {scalers_dir}/")
    print(f"  Scaler params:   {channel_scaler_pkl.name}")
    print(f"  Config file:     {norm_config_file.name}")

# ========== Global summary ==========
# Closing banner plus a pointer to the per-fold artefacts the next step reads.
print(
    "\n" + "=" * 60,
    "Step 12 complete - Normalization (feature/deep) for all folds",
    "=" * 60,
    "Next steps (per fold):",
    "  Classical: train with features/fold_xx/train_X.parquet, train_y.parquet",
    "  Deep:      train with scalers/fold_xx/X_train_scaled.npy, y_train.npy",
    "=" * 60,
    sep="\n",
)

Step 12: Normalization (feature/deep) - multi-fold

0. Determine folds
Detected 8 folds from configs/splits.json: [0, 1, 2, 3, 4, 5, 6, 7]

0.1 Load channel config
✓ Channels read from config: ['ax', 'ay', 'az', 'gx', 'gy', 'gz', 'acc_mag', 'gyr_mag']

Processing fold: fold_00
Input dir (windows):  data/lara/mbientlab/proc/windows/fold_00
Output dir (scalers): data/lara/mbientlab/proc/scalers/fold_00

1. Classical models: confirm features already standardized
✓ Feature scaler was generated in Step 11: data/lara/mbientlab/proc/features/fold_00/scaler.pkl
  StandardScaler parameters:
  - #features: 220
  - mean range: [-2.3800, 5011.4682]
  - std  range: [0.0275, 5989.5515]

2. Deep models: load window data

Train set:
  X_train shape: (4965, 150, 8), dtype=float32

Test set:
  X_test shape:  (766, 150, 8), dtype=float32

2.1. Numerical robustness self-check
✓ X_train has no NaN/Inf
✓ X_test has no NaN/Inf

2.2. Anti-leakage & consistency checks
✓ Channel count consistent: 8
⚠️ channels.

In [9]:
#!/usr/bin/env python3
"""
Step 13: Model Configuration & Training — top-conf/journal grade

Deep model: InceptionTime (native 1D convolution)
Classical models: RandomForest, KNN
Training: bf16 autocast on GPU (fp16 + GradScaler fallback), fp32 on CPU; early stopping; single random seed
Multi-fold version: loop over all folds in configs/splits.json
"""

import numpy as np
import pandas as pd
from pathlib import Path
import pickle
import json
import yaml
import os
import time
from datetime import datetime, timezone

# PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler

# Classical models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score

# ========== Config ==========
RANDOM_SEED = 42
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Auto-select mixed precision: enabled only when CUDA is available.
# bf16 is preferred (no GradScaler); otherwise fp16, which needs GradScaler.
USE_AMP = torch.cuda.is_available()
AMP_DTYPE = None
USE_SCALER = False  # GradScaler only for fp16
if USE_AMP:
    if torch.cuda.is_bf16_supported():
        AMP_DTYPE = torch.bfloat16
    else:
        AMP_DTYPE = torch.float16
        USE_SCALER = True

# Deep model hyperparameters
DEEP_CONFIG = dict(
    batch_size=64,
    epochs=100,
    learning_rate=1e-3,
    weight_decay=1e-4,
    patience=10,       # early stopping: epochs without improvement
    min_delta=1e-4,    # minimum val-loss decrease that counts as improvement
    val_split=0.15,    # validation ratio (split from train set)
    num_workers=4,
)

# InceptionTime hyperparameters
INCEPTION_CONFIG = dict(
    n_filters=32,
    kernel_sizes=[9, 19, 39],
    bottleneck_channels=32,
    use_residual=True,
    depth=6,
)

# Classical model hyperparameters
RF_CONFIG = dict(
    n_estimators=200,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_SEED,
    n_jobs=-1,
)

KNN_CONFIG = dict(
    n_neighbors=5,
    weights='distance',
    metric='euclidean',
    n_jobs=-1,
)

print("=" * 60, "Step 13: Model Configuration & Training (multi-fold)", "=" * 60, sep="\n")

# Paths: per-fold model artefacts are written below models/
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")
models_root = Path("models")
models_root.mkdir(parents=True, exist_ok=True)

# Set random seed (torch + numpy; cuDNN forced into deterministic mode on GPU)
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(f"\nDevice: {DEVICE}")
if not USE_AMP:
    print("Mixed precision: OFF (CPU)")
else:
    dtype_str = 'bf16' if AMP_DTYPE == torch.bfloat16 else 'fp16'
    scaler_str = ' (using GradScaler)' if USE_SCALER else ' (no GradScaler)'
    print(f"Mixed precision: {dtype_str}{scaler_str}")
print(f"Random seed: {RANDOM_SEED}")

# ========== InceptionTime definition ==========
class InceptionModule(nn.Module):
    """InceptionTime base module.

    A 1x1 bottleneck convolution feeds one convolution per entry of
    ``kernel_sizes``; in parallel, the raw input goes through a max-pool
    followed by a 1x1 convolution. All branch outputs are concatenated along
    the channel axis, batch-normalised and passed through ReLU.

    Args:
        in_channels: Number of channels of the input tensor.
        n_filters: Output channels of every branch.
        kernel_sizes: Kernel length of each convolution branch. Odd and even
            lengths are both supported.
        bottleneck_channels: Output channels of the 1x1 bottleneck.

    Input is ``(batch, in_channels, time)``; output is
    ``(batch, n_filters * (len(kernel_sizes) + 1), time)``.
    """
    def __init__(self, in_channels, n_filters, kernel_sizes, bottleneck_channels):
        super().__init__()
        self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, 1, bias=False)

        # padding='same' keeps the time length for ANY kernel size. With the
        # previous padding=k//2 an even kernel produced time+1 samples, so the
        # branch outputs could not be concatenated. For odd kernels 'same'
        # resolves to the identical symmetric k//2 padding.
        self.conv_list = nn.ModuleList([
            nn.Conv1d(bottleneck_channels, n_filters, k, padding='same', bias=False)
            for k in kernel_sizes
        ])

        # Pooling branch works on the un-bottlenecked input
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool1d(3, stride=1, padding=1),
            nn.Conv1d(in_channels, n_filters, 1, bias=False)
        )

        out_channels = n_filters * (len(kernel_sizes) + 1)
        self.bn = nn.BatchNorm1d(out_channels)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply all branches to ``x`` and return the fused activation."""
        bottleneck = self.bottleneck(x)
        conv_outputs = [conv(bottleneck) for conv in self.conv_list]
        maxpool_output = self.maxpool_conv(x)

        # Channel-wise concatenation: conv branches first, pooling branch last
        out = torch.cat([*conv_outputs, maxpool_output], dim=1)
        out = self.bn(out)
        out = self.relu(out)
        return out

class InceptionTime(nn.Module):
    """InceptionTime classifier.

    A stack of ``config['depth']`` InceptionModule blocks, each optionally
    summed with a 1x1-convolved copy of its own input, followed by global
    average pooling over time and a linear classification layer.

    Args:
        n_channels: Number of input channels.
        n_classes: Number of output classes (size of the logits vector).
        config: Mapping with keys ``n_filters``, ``kernel_sizes``,
            ``bottleneck_channels``, ``depth`` and ``use_residual``.
    """
    def __init__(self, n_channels, n_classes, config):
        super().__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes

        filters = config['n_filters']
        kernels = config['kernel_sizes']
        bottleneck = config['bottleneck_channels']
        n_blocks = config['depth']
        residual_enabled = config['use_residual']

        # Every block emits one group of `filters` channels per conv branch
        # plus one for the pooling branch.
        width = filters * (len(kernels) + 1)

        # Only the first block sees the raw channel count
        self.inception_modules = nn.ModuleList([
            InceptionModule(n_channels if idx == 0 else width, filters, kernels, bottleneck)
            for idx in range(n_blocks)
        ])

        self.use_residual = residual_enabled
        if residual_enabled:
            # 1x1 projections so each block's input matches its output width
            self.residual_conv = nn.ModuleList([
                nn.Conv1d(n_channels if idx == 0 else width, width, 1, bias=False)
                for idx in range(n_blocks)
            ])

        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(width, n_classes)

    def forward(self, x):
        """Return class logits for ``x`` given as (batch, time, channels)."""
        # Conv1d expects (batch, channels, time)
        h = x.transpose(1, 2)

        if self.use_residual:
            for block, shortcut in zip(self.inception_modules, self.residual_conv):
                h = block(h) + shortcut(h)
        else:
            for block in self.inception_modules:
                h = block(h)

        # (batch, channels, 1) -> (batch, channels)
        pooled = self.gap(h).squeeze(-1)
        return self.fc(pooled)

# ========== Dataset ==========
class WindowDataset(Dataset):
    """Windowed dataset pairing each window with its integer label.

    Args:
        X: Array-like of windows; stored as a float32 tensor.
        y: Array-like of integer class labels; stored as an int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of items.
    """
    def __init__(self, X, y):
        # torch.as_tensor replaces the legacy FloatTensor/LongTensor
        # constructors and converts the dtype only when necessary.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.long)
        # A mismatch would otherwise only surface as an IndexError (or as
        # silently unused samples) somewhere in the middle of an epoch.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of windows."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]

# ========== Train / Eval helpers ==========
def train_epoch(model, loader, criterion, optimizer, scaler, device, use_amp, amp_dtype, use_scaler):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Network to train (switched to train mode here).
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(outputs, targets)``.
        optimizer: Optimizer stepped once per batch.
        scaler: Gradient scaler; only used when ``use_scaler`` is true.
        device: Device the batches are moved to.
        use_amp: Run the forward pass under CUDA autocast when true.
        amp_dtype: Autocast dtype (ignored when ``use_amp`` is false).
        use_scaler: Route backward/step through ``scaler`` when true.

    Returns:
        Tuple ``(mean loss per sample, accuracy)`` over the whole epoch.
    """
    model.train()
    loss_sum = 0
    n_correct = 0
    n_seen = 0

    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()

        if not use_amp:
            # Plain full-precision step
            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            batch_loss.backward()
            optimizer.step()
        else:
            with torch.amp.autocast('cuda', dtype=amp_dtype):
                logits = model(inputs)
                batch_loss = criterion(logits, targets)

            if not use_scaler:
                batch_loss.backward()
                optimizer.step()
            else:
                # Scaled backward pass, then scaler-managed optimizer step
                scaler.scale(batch_loss).backward()
                scaler.step(optimizer)
                scaler.update()

        # criterion returns a batch mean; weight it by batch size so the
        # epoch figure is a per-sample mean.
        loss_sum += batch_loss.item() * inputs.size(0)
        guesses = logits.max(1).indices
        n_seen += targets.size(0)
        n_correct += guesses.eq(targets).sum().item()

    return loss_sum / n_seen, n_correct / n_seen

def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without computing gradients.

    Args:
        model: Network to evaluate (switched to eval mode here).
        loader: Iterable yielding ``(inputs, targets)`` batches.
        criterion: Loss callable applied to ``(outputs, targets)``.
        device: Device the batches are moved to.

    Returns:
        Tuple ``(mean loss per sample, accuracy, predictions, labels)`` where
        the last two are 1-D NumPy arrays in loader order.
    """
    model.eval()
    loss_sum = 0
    n_correct = 0
    n_seen = 0
    pred_chunks = []
    label_chunks = []

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            logits = model(inputs)
            batch_loss = criterion(logits, targets)
            guesses = logits.max(1).indices

            # Weight the batch-mean loss by batch size for a per-sample mean
            loss_sum += batch_loss.item() * inputs.size(0)
            n_seen += targets.size(0)
            n_correct += guesses.eq(targets).sum().item()

            pred_chunks.append(guesses.cpu().numpy())
            label_chunks.append(targets.cpu().numpy())

    mean_loss = loss_sum / n_seen
    accuracy = n_correct / n_seen
    return mean_loss, accuracy, np.concatenate(pred_chunks), np.concatenate(label_chunks)

# ========== 0. Determine folds ==========
# Fold ids are the integer keys of configs/splits.json; unlike Step 12 there is
# no fallback here, a missing file aborts the run.
splits_path = configs_dir / "splits.json"
if not splits_path.exists():
    raise RuntimeError("splits.json not found; LOSO Step 13 expects defined folds.")

with open(splits_path, "r") as f:
    splits = json.load(f)
fold_ids = sorted(map(int, splits.keys()))
print(f"\nDetected folds from {splits_path}: {fold_ids}")

# Used to record the final metrics for each fold
fold_summaries = []

# ========== Loop over folds ==========
for fold_id in fold_ids:
    fold_tag = f"fold_{fold_id:02d}"
    models_dir = models_root / fold_tag
    models_dir.mkdir(parents=True, exist_ok=True)

    print("\n" + "=" * 60)
    print(f"Processing fold {fold_id} ({fold_tag})")
    print("=" * 60)

    # ========== 1. Train deep model (InceptionTime) ==========
    print("\n" + "=" * 60)
    print("1. Train deep model (InceptionTime)")
    print("=" * 60)

    scalers_dir = proc_dir / "scalers" / fold_tag
    X_train_file = scalers_dir / "X_train_scaled.npy"
    y_train_file = scalers_dir / "y_train.npy"
    train_meta_file = scalers_dir / "train_meta.parquet"
    X_test_file = scalers_dir / "X_test_scaled.npy"
    y_test_file = scalers_dir / "y_test.npy"

    if not X_train_file.exists():
        raise FileNotFoundError(f"[{fold_tag}] Standardized data not found: {X_train_file}, please run Step 12 first")

    X_train_full = np.load(X_train_file)
    y_train_full = np.load(y_train_file)
    df_train_meta = pd.read_parquet(train_meta_file)

    if not X_test_file.exists():
        raise FileNotFoundError(f"[{fold_tag}] Test standardized data not found: {X_test_file}")
    X_test = np.load(X_test_file)
    y_test = np.load(y_test_file)

    print(f"\nData loaded:")
    print(f"  X_train_full: {X_train_full.shape}")
    print(f"  y_train_full: {y_train_full.shape}")
    print(f"  X_test:       {X_test.shape}")
    print(f"  y_test:       {y_test.shape}")

    # ----- 1.0 Label remapping -----
    print(f"\nLabel remapping (ensure contiguous 0..n_classes-1):")
    unique_labels_train = np.unique(y_train_full)
    unique_labels_test = np.unique(y_test)
    all_unique_labels = np.unique(np.concatenate([unique_labels_train, unique_labels_test]))

    print(f"  Original label set: {all_unique_labels.tolist()}")

    label_map = {old_label: new_label for new_label, old_label in enumerate(sorted(all_unique_labels))}
    label_map_inverse = {v: k for k, v in label_map.items()}

    print(f"  Mapping: {label_map}")

    y_train_full_mapped = np.array([label_map[y] for y in y_train_full], dtype=np.int64)
    y_test_mapped = np.array([label_map[y] for y in y_test], dtype=np.int64)

    print(f"  Mapped label range: {np.unique(y_train_full_mapped).tolist()}")
    print(f"  #classes: {len(all_unique_labels)}")

    # ----- 1.1 Subject-exclusive validation split -----
    print(f"\nValidation split by subject from train (subject-exclusive):")

    train_subjects = df_train_meta['subject_id'].unique()
    n_train_subjects = len(train_subjects)
    print(f"  #train subjects: {n_train_subjects}")

    np.random.seed(RANDOM_SEED)
    n_val_subjects = max(1, int(n_train_subjects * DEEP_CONFIG['val_split']))
    val_subjects = np.random.choice(train_subjects, size=n_val_subjects, replace=False)
    train_subjects_final = [s for s in train_subjects if s not in val_subjects]

    print(f"  #val subjects: {n_val_subjects} ({n_val_subjects/n_train_subjects*100:.1f}%)")
    print(f"  Val subjects: {sorted(val_subjects.tolist())}")
    print(f"  Final #train subjects: {len(train_subjects_final)}")

    val_mask = df_train_meta['subject_id'].isin(val_subjects)
    train_mask = ~val_mask

    X_train = X_train_full[train_mask]
    y_train = y_train_full_mapped[train_mask]
    X_val = X_train_full[val_mask]
    y_val = y_train_full_mapped[val_mask]

    print(f"\nAfter split:")
    print(f"  Train: {X_train.shape}")
    print(f"  Val:   {X_val.shape}")
    print(f"  Test:  {X_test.shape}")

    assert set(df_train_meta[train_mask]['subject_id'].unique()).isdisjoint(
        set(df_train_meta[val_mask]['subject_id'].unique())
    ), f"[{fold_tag}] Train/Val subjects overlap!"
    print("  ✓ Train/Val subjects disjoint")

    n_channels = X_train.shape[2]
    n_classes = len(all_unique_labels)
    print(f"\nModel params:")
    print(f"  Input channels: {n_channels}")
    print(f"  #classes:       {n_classes}")

    # Datasets & loaders
    train_dataset = WindowDataset(X_train, y_train)
    val_dataset = WindowDataset(X_val, y_val)
    test_dataset = WindowDataset(X_test, y_test_mapped)

    train_loader = DataLoader(
        train_dataset,
        batch_size=DEEP_CONFIG['batch_size'],
        shuffle=True,
        num_workers=DEEP_CONFIG['num_workers'],
        pin_memory=torch.cuda.is_available()
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=DEEP_CONFIG['batch_size'],
        shuffle=False,
        num_workers=DEEP_CONFIG['num_workers'],
        pin_memory=torch.cuda.is_available()
    )
    test_loader = DataLoader(
        test_dataset,
        batch_size=DEEP_CONFIG['batch_size'],
        shuffle=False,
        num_workers=DEEP_CONFIG['num_workers'],
        pin_memory=torch.cuda.is_available()
    )

    model = InceptionTime(n_channels, n_classes, INCEPTION_CONFIG).to(DEVICE)
    print(f"\nModel summary ({fold_tag}):")
    print(f"  #params: {sum(p.numel() for p in model.parameters()):,}")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=DEEP_CONFIG['learning_rate'],
        weight_decay=DEEP_CONFIG['weight_decay']
    )
    scaler = GradScaler() if USE_SCALER else None

    best_val_loss = float('inf')
    patience_counter = 0
    best_epoch = 0
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

    print(f"\nStart training ({fold_tag}):")
    print(f"  Epochs:       {DEEP_CONFIG['epochs']}")
    print(f"  Batch size:   {DEEP_CONFIG['batch_size']}")
    print(f"  Learning rate:{DEEP_CONFIG['learning_rate']}")
    print(f"  Patience:     {DEEP_CONFIG['patience']}")
    if USE_AMP:
        dtype_str = 'bf16' if AMP_DTYPE == torch.bfloat16 else 'fp16'
        scaler_str = ' (using GradScaler)' if USE_SCALER else ' (no GradScaler)'
        print(f"  Mixed precision: {dtype_str}{scaler_str}")
    else:
        print("  Mixed precision: OFF")

    start_time = time.time()

    for epoch in range(DEEP_CONFIG['epochs']):
        train_loss, train_acc = train_epoch(
            model, train_loader, criterion, optimizer, scaler, DEVICE,
            USE_AMP, AMP_DTYPE, USE_SCALER
        )

        val_loss, val_acc, _, _ = evaluate(model, val_loader, criterion, DEVICE)

        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1:3d}/{DEEP_CONFIG['epochs']}: "
              f"Train Loss={train_loss:.4f}, Acc={train_acc:.4f} | "
              f"Val Loss={val_loss:.4f}, Acc={val_acc:.4f}")

        if val_loss < best_val_loss - DEEP_CONFIG['min_delta']:
            best_val_loss = val_loss
            best_epoch = epoch
            patience_counter = 0

            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_loss': val_loss,
                'val_acc': val_acc,
                'val_subjects': sorted(val_subjects.tolist()),
            }, models_dir / "inception_time_best.pt")
        else:
            patience_counter += 1
            if patience_counter >= DEEP_CONFIG['patience']:
                print(f"\nEarly stopping triggered, best epoch: {best_epoch+1}")
                break

    train_time = time.time() - start_time
    print(f"\nTraining finished ({fold_tag}), time: {train_time:.2f}s")

    # Load best model and evaluate on test set
    print(f"\nFinal evaluation on test set ({fold_tag}):")
    checkpoint = torch.load(models_dir / "inception_time_best.pt", map_location=DEVICE)
    model.load_state_dict(checkpoint['model_state_dict'])
    test_loss, test_acc, test_preds, test_labels = evaluate(model, test_loader, criterion, DEVICE)

    test_f1_macro = f1_score(test_labels, test_preds, average='macro')
    print(f"  Test Loss:       {test_loss:.4f}")
    print(f"  Test Accuracy:   {test_acc:.4f}")
    print(f"  Test F1 (macro): {test_f1_macro:.4f}")

    deep_results = {
        'model': 'InceptionTime',
        'fold_id': fold_id,
        'random_seed': RANDOM_SEED,
        'best_epoch': int(best_epoch + 1),
        'train_time': float(train_time),
        'label_mapping': {
            'original_labels': sorted(all_unique_labels.tolist()),
            'mapped_labels': list(range(len(all_unique_labels))),
            'label_map': {int(k): int(v) for k, v in label_map.items()},
            'label_map_inverse': {int(k): int(v) for k, v in label_map_inverse.items()},
        },
        'validation': {
            'val_subjects': sorted(val_subjects.tolist()),
            'n_val_subjects': int(n_val_subjects),
            'n_train_subjects': len(train_subjects_final),
            'best_val_loss': float(best_val_loss),
            'best_val_acc': float(checkpoint['val_acc']),
        },
        'test': {
            'test_accuracy': float(test_acc),
            'test_loss': float(test_loss),
            'test_f1_macro': float(test_f1_macro),
        },
        'history': history,
        'config': {**DEEP_CONFIG, **INCEPTION_CONFIG},
        'mixed_precision': {
            'enabled': USE_AMP,
            'dtype': 'bf16' if AMP_DTYPE == torch.bfloat16 else 'fp16' if USE_AMP else 'None',
            'use_grad_scaler': USE_SCALER,
            'note': 'GradScaler is only used for fp16; bf16 does not need it',
        },
        'notes': [
            'Validation split by subject from training set (subject-exclusive)',
            'Early stopping based on validation set to avoid test leakage',
            'Test set used only for final evaluation',
            f'Mixed precision: {"bf16 (no GradScaler)" if AMP_DTYPE == torch.bfloat16 else "fp16 (with GradScaler)" if USE_AMP else "OFF"}',
            'Labels remapped to contiguous 0..n_classes-1',
        ]
    }

    with open(models_dir / "inception_time_results.json", 'w') as f:
        json.dump(deep_results, f, indent=2)

    print("✓ Saved: inception_time_best.pt")
    print("✓ Saved: inception_time_results.json")

    # ========== 2. Train classical models (RF/KNN) ==========
    print("\n" + "=" * 60)
    print("2. Train classical models (RF/KNN)")
    print("=" * 60)

    features_dir = proc_dir / "features" / fold_tag
    train_X_file = features_dir / "train_X.parquet"
    train_y_file = features_dir / "train_y.parquet"
    test_X_file = features_dir / "test_X.parquet"
    test_y_file = features_dir / "test_y.parquet"

    if not train_X_file.exists():
        raise FileNotFoundError(f"[{fold_tag}] Feature data not found: {train_X_file}, please run Step 11 (handcrafted features) first")

    df_train_X = pd.read_parquet(train_X_file)
    df_train_y = pd.read_parquet(train_y_file)
    df_test_X = pd.read_parquet(test_X_file)
    df_test_y = pd.read_parquet(test_y_file)

    X_train_feat = df_train_X.values
    y_train_feat_raw = df_train_y['label'].values
    X_test_feat = df_test_X.values
    y_test_feat_raw = df_test_y['label'].values

    # Apply same label mapping
    y_train_feat = np.array([label_map[y] for y in y_train_feat_raw], dtype=np.int64)
    y_test_feat = np.array([label_map[y] for y in y_test_feat_raw], dtype=np.int64)

    print(f"\nFeature data ({fold_tag}):")
    print(f"  X_train: {X_train_feat.shape}")
    print(f"  X_test:  {X_test_feat.shape}")
    print("  Labels remapped")

    # ----- 2.1 RandomForest -----
    print("\nTraining RandomForest...")
    print(f"  Config: {RF_CONFIG}")

    rf_start = time.time()
    rf_model = RandomForestClassifier(**RF_CONFIG)
    rf_model.fit(X_train_feat, y_train_feat)
    rf_train_time = time.time() - rf_start

    rf_train_pred = rf_model.predict(X_train_feat)
    rf_test_pred = rf_model.predict(X_test_feat)

    rf_train_acc = accuracy_score(y_train_feat, rf_train_pred)
    rf_test_acc = accuracy_score(y_test_feat, rf_test_pred)
    rf_test_f1 = f1_score(y_test_feat, rf_test_pred, average='macro')

    print("✓ RandomForest training complete:")
    print(f"  Train time: {rf_train_time:.2f}s")
    print(f"  Train Acc:  {rf_train_acc:.4f}")
    print(f"  Test  Acc:  {rf_test_acc:.4f}")
    print(f"  Test  F1 (macro): {rf_test_f1:.4f}")

    with open(models_dir / "random_forest.pkl", 'wb') as f:
        pickle.dump(rf_model, f)

    rf_results = {
        'model': 'RandomForest',
        'fold_id': fold_id,
        'random_state': RF_CONFIG['random_state'],
        'train_time': float(rf_train_time),
        'train_accuracy': float(rf_train_acc),
        'test_accuracy': float(rf_test_acc),
        'test_f1_macro': float(rf_test_f1),
        'config': RF_CONFIG,
        'label_mapping': {
            'original_labels': sorted(all_unique_labels.tolist()),
            'mapped_labels': list(range(len(all_unique_labels))),
            'label_map': {int(k): int(v) for k, v in label_map.items()},
        },
    }

    with open(models_dir / "random_forest_results.json", 'w') as f:
        json.dump(rf_results, f, indent=2)

    print("✓ Saved: random_forest.pkl")
    print("✓ Saved: random_forest_results.json")

    # ----- 2.2 KNN -----
    print("\nTraining KNN...")
    print(f"  Config: {KNN_CONFIG}")

    knn_start = time.time()
    knn_model = KNeighborsClassifier(**KNN_CONFIG)
    knn_model.fit(X_train_feat, y_train_feat)
    knn_train_time = time.time() - knn_start

    knn_train_pred = knn_model.predict(X_train_feat)
    knn_test_pred = knn_model.predict(X_test_feat)

    knn_train_acc = accuracy_score(y_train_feat, knn_train_pred)
    knn_test_acc = accuracy_score(y_test_feat, knn_test_pred)
    knn_test_f1 = f1_score(y_test_feat, knn_test_pred, average='macro')

    print("✓ KNN training complete:")
    print(f"  Train time: {knn_train_time:.2f}s")
    print(f"  Train Acc:  {knn_train_acc:.4f}")
    print(f"  Test  Acc:  {knn_test_acc:.4f}")
    print(f"  Test  F1 (macro): {knn_test_f1:.4f}")

    with open(models_dir / "knn.pkl", 'wb') as f:
        pickle.dump(knn_model, f)

    knn_results = {
        'model': 'KNN',
        'fold_id': fold_id,
        'train_time': float(knn_train_time),
        'train_accuracy': float(knn_train_acc),
        'test_accuracy': float(knn_test_acc),
        'test_f1_macro': float(knn_test_f1),
        'config': KNN_CONFIG,
        'label_mapping': {
            'original_labels': sorted(all_unique_labels.tolist()),
            'mapped_labels': list(range(len(all_unique_labels))),
            'label_map': {int(k): int(v) for k, v in label_map.items()},
        },
    }

    with open(models_dir / "knn_results.json", 'w') as f:
        json.dump(knn_results, f, indent=2)

    print("✓ Saved: knn.pkl")
    print("✓ Saved: knn_results.json")

    # ----- 3. Save training config (per fold) -----
    print("\n" + "=" * 60)
    print("3. Save training config")
    print("=" * 60)

    training_config = {
        'fold_id': fold_id,
        'fold_tag': fold_tag,
        'random_seed': RANDOM_SEED,
        'device': str(DEVICE),
        'timestamp': datetime.now(timezone.utc).isoformat(),
        'mixed_precision': {
            'enabled': USE_AMP,
            'dtype': 'bf16' if AMP_DTYPE == torch.bfloat16 else 'fp16' if USE_AMP else 'None',
            'use_grad_scaler': USE_SCALER,
            'auto_selection': 'bf16 first (no GradScaler), fp16 second (with GradScaler), OFF on CPU',
            'note': 'GradScaler is only used for fp16 to prevent underflow; bf16 has sufficient dynamic range',
        },
        'models': {
            'inception_time': {
                'architecture': 'InceptionTime (1D Conv)',
                'config': {**DEEP_CONFIG, **INCEPTION_CONFIG},
                'validation': {
                    'val_subjects': sorted(val_subjects.tolist()),
                    'n_val_subjects': int(n_val_subjects),
                    'split_ratio': DEEP_CONFIG['val_split'],
                    'subject_exclusive': True,
                },
                'test_accuracy': float(test_acc),
                'test_f1_macro': float(test_f1_macro),
                'train_time': float(train_time),
                'best_epoch': int(best_epoch + 1),
            },
            'random_forest': {
                'config': RF_CONFIG,
                'test_accuracy': float(rf_test_acc),
                'test_f1_macro': float(rf_test_f1),
                'train_time': float(rf_train_time),
            },
            'knn': {
                'config': KNN_CONFIG,
                'test_accuracy': float(knn_test_acc),
                'test_f1_macro': float(knn_test_f1),
                'train_time': float(knn_train_time),
            },
        },
        'notes': [
            'Deep model uses InceptionTime (native 1D convolution)',
            'Validation subjects split from train (subject-exclusive)',
            'Early stopping based on validation set (no test leakage)',
            'Test set only for final evaluation',
            f'Mixed precision: {"bf16 (no GradScaler)" if AMP_DTYPE == torch.bfloat16 else "fp16 (with GradScaler)" if USE_AMP else "OFF (CPU)"}',
            'GradScaler only used for fp16',
            'Single random seed for reproducibility',
            'RF records random_state for reproducibility',
            'Labels remapped to contiguous 0..n_classes-1 (prevent IndexError)',
            'All models use the same label mapping',
            'All models evaluated on the same fold’s independent test set',
            'Strict LOSO: train/val/test subjects fully disjoint',
        ]
    }

    config_file = models_dir / "training_config.yaml"
    with open(config_file, 'w', encoding='utf-8') as f:
        yaml.dump(training_config, f, default_flow_style=False, allow_unicode=True)

    print(f"✓ Saved: {config_file}")

    # Collect key metrics for this fold into the global summary list
    fold_summaries.append({
        'fold_id': fold_id,
        'fold_tag': fold_tag,
        'deep_test_acc': float(test_acc),
        'deep_test_f1': float(test_f1_macro),
        'rf_test_acc': float(rf_test_acc),
        'rf_test_f1': float(rf_test_f1),
        'knn_test_acc': float(knn_test_acc),
        'knn_test_f1': float(knn_test_f1),
    })

    # Short per-fold summary for logging
    print("\n" + "=" * 60)
    print(f"Summary for fold {fold_tag}")
    print("=" * 60)
    print(f"  InceptionTime: Acc={test_acc:.4f}, F1={test_f1_macro:.4f}, best_epoch={best_epoch+1}")
    print(f"  RandomForest:  Acc={rf_test_acc:.4f}, F1={rf_test_f1:.4f}")
    print(f"  KNN:           Acc={knn_test_acc:.4f}, F1={knn_test_f1:.4f}")
    print(f"  Outputs: {models_dir}/")

# ========== Global summary ==========
# Closing report for Step 13, printed once after the per-fold loop has ended.
print(f"\n{'=' * 60}")
print("Step 13 complete - Model Configuration & Training for all folds")
print("=" * 60)

if not fold_summaries:
    print("No folds were processed (check previous steps).")
else:
    # One row per processed fold, ordered by fold id.
    df_summary = pd.DataFrame(fold_summaries).sort_values(by='fold_id')
    print("\nPer-fold test performance (deep / RF / KNN):")
    print(df_summary.loc[:, [
        'fold_id',
        'deep_test_acc', 'deep_test_f1',
        'rf_test_acc', 'rf_test_f1',
        'knn_test_acc', 'knn_test_f1',
    ]])

    # Only the deep model gets a cross-fold aggregate here; pandas' .std() is
    # the sample standard deviation, so a single fold prints "nan".
    print("\nCross-fold mean ± std (deep model):")
    print("\n".join(
        f"  {label}: {df_summary[column].mean():.4f} ± {df_summary[column].std():.4f}"
        for label, column in (("Acc", 'deep_test_acc'), ("F1 ", 'deep_test_f1'))
    ))

print(
    "\nNext steps:",
    "  - Compute cross-fold mean ± standard deviation for each model (a high-level overview for the deep model is printed above).",
    "  - In subsequent Step 16 / error analysis, read the JSON files under models/*/ and aggregate them into tables and plots.",
    "=" * 60,
    sep="\n",
)

Step 13: Model Configuration & Training (multi-fold)

Device: cuda
Mixed precision: bf16 (no GradScaler)
Random seed: 42

Detected folds from configs/splits.json: [0, 1, 2, 3, 4, 5, 6, 7]

Processing fold 0 (fold_00)

1. Train deep model (InceptionTime)

Data loaded:
  X_train_full: (4965, 150, 8)
  y_train_full: (4965,)
  X_test:       (766, 150, 8)
  y_test:       (766,)

Label remapping (ensure contiguous 0..n_classes-1):
  Original label set: [1, 2, 3, 4, 5, 6]
  Mapping: {np.int32(1): 0, np.int32(2): 1, np.int32(3): 2, np.int32(4): 3, np.int32(5): 4, np.int32(6): 5}
  Mapped label range: [0, 1, 2, 3, 4, 5]
  #classes: 6

Validation split by subject from train (subject-exclusive):
  #train subjects: 7
  #val subjects: 1 (14.3%)
  Val subjects: ['S08']
  Final #train subjects: 6

After split:
  Train: (4306, 150, 8)
  Val:   (659, 150, 8)
  Test:  (766, 150, 8)
  ✓ Train/Val subjects disjoint

Model params:
  Input channels: 8
  #classes:       6

Model summary (fold_00):
  #params:

In [10]:
#!/usr/bin/env python3
"""
Step 14: Inner-layer Tuning (multi-fold, warning-fix version)

- Inner CV for each outer LOSO fold (GroupKFold by subject)
- Models: RandomForest, KNN, InceptionTime (lightweight version)
- Fixes:
  * Use new torch.amp.autocast API (no deprecation warning)
  * Use unstandardized windows for inner-layer normalization (no leakage)
  * Per-fold tuning outputs under tuning/fold_xx/
"""

import numpy as np
import pandas as pd
from pathlib import Path
import json
import yaml
import os
import time
from itertools import product
from functools import lru_cache

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast

# ========== Global config ==========
# TF32 is switched on for cuDNN and CUDA matmul kernels during tuning.
# NOTE(review): Step 0 of this file turns both flags off; confirm that
# re-enabling them for the tuning step is intended.
torch.backends.cudnn.allow_tf32 = True
torch.backends.cuda.matmul.allow_tf32 = True

RANDOM_SEED = 42
INNER_CV_FOLDS = 5
# bf16 autocast is only used when a CUDA device exists and reports bf16 support.
USE_AMP_BF16 = torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False
# DataLoader workers: capped at 4; 2 is assumed when the CPU count is unknown.
NUM_WORKERS = min(os.cpu_count() or 2, 4)

# Which models to tune
TUNE_MODELS = ["rf", "knn", "inception"]

# Exhaustive grids for the classical models. Key order is significant: it
# fixes the order in which parameter combinations are enumerated.
RF_PARAM_GRID = dict(
    n_estimators=[100, 200],
    max_depth=[20, 30],
    min_samples_split=[2, 5],
)

KNN_PARAM_GRID = dict(
    n_neighbors=[5, 7],
    weights=["uniform", "distance"],
    metric=["euclidean", "manhattan"],
)

# Explicit list of InceptionTime trials (no grid expansion).
INCEPTION_PARAM_TRIALS = [
    dict(learning_rate=1e-3, n_filters=32, depth=6),
    dict(learning_rate=5e-4, n_filters=32, depth=6),
]

# Step banner.
print(
    "=" * 60,
    "Step 14: Inner-layer Tuning (multi-fold, warning-fix version)",
    "=" * 60,
    sep="\n",
)

# Processed-data root and the config directory (created if it is missing).
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

# Per-fold state. The outer-fold loop reassigns these names and the helper
# functions below read them as module globals, so they start out as None.
FOLD_ID = fold_tag = scalers_dir = train_meta_file = None
CLASS_LIST = N_CLASSES = CLASS_TO_INDEX = INDEX_LABELS = None
n_inner = inner_cv_splits = None

# Seed NumPy and PyTorch (CPU generator, plus all CUDA devices when present).
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)

# ========== 0. Determine outer folds from splits.json ==========
# The outer fold ids are the (integer-converted) top-level keys of
# configs/splits.json; without that file there is nothing to tune.
splits_path = configs_dir / "splits.json"
if not splits_path.exists():
    raise RuntimeError("splits.json not found; LOSO tuning expects defined outer folds.")

with splits_path.open("r", encoding="utf-8") as f:
    splits_cfg = json.load(f)
outer_fold_ids = sorted(map(int, splits_cfg.keys()))

print(f"Outer folds detected: {outer_fold_ids}")
print(f"BF16 mixed precision available: {USE_AMP_BF16}")

# ========== 1. Data loading helpers (using globals per fold) ==========

@lru_cache(maxsize=32)
def cached_window_subset(train_key, val_key):
    """
    Build (and memoize) the normalized window arrays for one inner CV split.

    Important:
    - Uses UNSTANDARDIZED X_train.npy from the windows folder,
      to avoid reusing outer-fold statistics inside inner-layer tuning.
    - Normalization is recomputed per inner train split (channel-wise) and
      the same statistics are applied to the inner validation windows.

    Args:
        train_key: Tuple of subject ids forming the inner train split
            (a tuple because lru_cache requires hashable arguments).
        val_key: Tuple of subject ids forming the inner validation split.

    Returns:
        (X_train, y_train, X_val, y_val): float32 window arrays and int64
        class indices obtained through CLASS_TO_INDEX.

    Raises:
        FileNotFoundError: If no unstandardized X_train.npy is found in any
            candidate location.

    Note:
        The cache key holds only the two subject tuples, while the data is
        read through the module globals fold_tag, scalers_dir,
        train_meta_file and CLASS_TO_INDEX. Call
        cached_window_subset.cache_clear() whenever those globals change,
        otherwise arrays from a previous fold would be returned.
    """
    # Try unstandardized windows first (preferred)
    candidates = (
        proc_dir / "windows" / fold_tag / "X_train.npy",
        scalers_dir / "X_train.npy",  # optional fallback
    )
    window_path = next((p for p in candidates if p.exists()), None)
    if window_path is None:
        raise FileNotFoundError(
            "Unstandardized X_train.npy is required for inner-layer normalization; "
            "having only X_train_scaled.npy would introduce validation-subject statistics. "
            "Please persist the unstandardized window array in Step 9/12."
        )

    # Memory-map the full arrays; only the selected rows are materialized.
    X_full = np.load(window_path, mmap_mode="r")
    y_full = np.load(scalers_dir / "y_train.npy", mmap_mode="r")
    subject_ids = pd.read_parquet(train_meta_file)["subject_id"]

    # Plain boolean arrays (row i of the metadata selects window i).
    train_mask = subject_ids.isin(list(train_key)).to_numpy()
    val_mask = subject_ids.isin(list(val_key)).to_numpy()

    # Boolean indexing already yields an in-memory copy, so no extra
    # .copy() is needed before converting to float32.
    X_train_raw = np.asarray(X_full[train_mask], dtype=np.float32)
    X_val_raw = np.asarray(X_full[val_mask], dtype=np.float32)
    y_train_raw = y_full[train_mask]
    y_val_raw = y_full[val_mask]

    # Inner-layer independent normalization (channel-wise): reduce over every
    # axis except the last one. The statistics are accumulated in float64
    # because a float32 accumulator over all windows and time steps loses
    # precision; they are cast back so the outputs stay float32.
    reduce_axes = (0, 1)
    channel_mean = X_train_raw.mean(axis=reduce_axes, dtype=np.float64).astype(np.float32)
    channel_std = np.maximum(
        X_train_raw.std(axis=reduce_axes, dtype=np.float64), 1e-8
    ).astype(np.float32)

    X_train = (X_train_raw - channel_mean) / channel_std
    X_val = (X_val_raw - channel_mean) / channel_std

    # Map original labels to contiguous class indices.
    y_train = np.array([CLASS_TO_INDEX[int(v)] for v in y_train_raw], dtype=np.int64)
    y_val = np.array([CLASS_TO_INDEX[int(v)] for v in y_val_raw], dtype=np.int64)

    return X_train, y_train, X_val, y_val


@lru_cache(maxsize=2)
def _read_fold_feature_tables(features_dir):
    """Read one outer fold's handcrafted-feature parquet files once and cache them.

    Returns the feature matrix, the raw label vector and the per-row subject
    ids. Use _read_fold_feature_tables.cache_clear() if the files are
    regenerated while this process is still running.
    """
    X_all = pd.read_parquet(features_dir / "train_X.parquet").values
    y_all = pd.read_parquet(features_dir / "train_y.parquet")["label"].values
    subject_ids = pd.read_parquet(features_dir / "train_meta.parquet")["subject_id"]
    return X_all, y_all, subject_ids


def load_feature_data_subset(train_subjects_local, val_subjects_local):
    """
    Load handcrafted feature data for classical models.
    Inner-layer StandardScaler is recomputed per inner train split.

    The parquet files of the current outer fold (module global fold_tag) are
    read through a small cache, because this function is called once per
    inner split for every hyper-parameter combination and the files do not
    change in between.

    Args:
        train_subjects_local: Subject ids whose rows form the inner train split.
        val_subjects_local: Subject ids whose rows form the inner validation split.

    Returns:
        (X_train, y_train, X_val, y_val): standardized feature matrices
        (scaler fitted on the inner train rows only) and int64 class indices
        obtained through CLASS_TO_INDEX.
    """
    X_all, y_all, subject_ids = _read_fold_feature_tables(
        proc_dir / "features" / fold_tag
    )

    train_mask = subject_ids.isin(train_subjects_local).to_numpy()
    val_mask = subject_ids.isin(val_subjects_local).to_numpy()

    # Boolean indexing copies the selected rows, so the cached tables are
    # never modified by the scaling below.
    X_train_raw = X_all[train_mask]
    X_val_raw = X_all[val_mask]

    y_train = np.array([CLASS_TO_INDEX[int(v)] for v in y_all[train_mask]], dtype=np.int64)
    y_val = np.array([CLASS_TO_INDEX[int(v)] for v in y_all[val_mask]], dtype=np.int64)

    # Fit on the inner train rows only; validation rows reuse those statistics.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_val = scaler.transform(X_val_raw)

    return X_train, y_train, X_val, y_val

# ========== 2. Evaluation functions ==========

def evaluate_rf(params, inner_splits_local):
    """
    Score one RandomForest hyper-parameter setting with inner cross-validation.

    For every inner split a fresh forest (seeded with RANDOM_SEED, all cores)
    is fitted on the inner train features and scored with macro-F1 over
    INDEX_LABELS on the inner validation features.

    Args:
        params: Keyword arguments forwarded to RandomForestClassifier.
        inner_splits_local: Iterable of dicts providing "train_subjects" and
            "val_subjects".

    Returns:
        (mean, std) of the per-split macro-F1 scores (np.std, i.e. ddof=0).
    """
    def _score_split(inner_split):
        # Features are re-standardized per inner split by the loader.
        feats_tr, labels_tr, feats_va, labels_va = load_feature_data_subset(
            inner_split["train_subjects"], inner_split["val_subjects"]
        )
        forest = RandomForestClassifier(random_state=RANDOM_SEED, n_jobs=-1, **params)
        forest.fit(feats_tr, labels_tr)
        return f1_score(
            labels_va,
            forest.predict(feats_va),
            average="macro",
            labels=INDEX_LABELS,
            zero_division=0,
        )

    per_split_f1 = [_score_split(inner_split) for inner_split in inner_splits_local]
    return np.mean(per_split_f1), np.std(per_split_f1)


def evaluate_knn(params, inner_splits_local):
    """
    Score one KNN hyper-parameter setting with inner cross-validation.

    For every inner split a fresh KNeighborsClassifier (all cores) is fitted
    on the inner train features and scored with macro-F1 over INDEX_LABELS on
    the inner validation features.

    Args:
        params: Keyword arguments forwarded to KNeighborsClassifier.
        inner_splits_local: Iterable of dicts providing "train_subjects" and
            "val_subjects".

    Returns:
        (mean, std) of the per-split macro-F1 scores (np.std, i.e. ddof=0).
    """
    def _macro_f1_for(subjects_train, subjects_val):
        # Features are re-standardized per inner split by the loader.
        X_fit, y_fit, X_held, y_held = load_feature_data_subset(
            subjects_train, subjects_val
        )
        neighbours = KNeighborsClassifier(n_jobs=-1, **params)
        neighbours.fit(X_fit, y_fit)
        predicted = neighbours.predict(X_held)
        return f1_score(
            y_held, predicted,
            average="macro", labels=INDEX_LABELS, zero_division=0,
        )

    scores = [
        _macro_f1_for(entry["train_subjects"], entry["val_subjects"])
        for entry in inner_splits_local
    ]
    return np.mean(scores), np.std(scores)


def evaluate_inception(params, inner_splits_local, max_epochs=20):
    """
    Score one InceptionTime hyper-parameter setting with inner cross-validation.

    For every inner split a fresh network is trained on the inner train
    windows (returned by cached_window_subset) and evaluated on the inner
    validation windows after each epoch; the best validation macro-F1 reached
    within that split is kept.
    Uses the new torch.amp.autocast API to avoid deprecation warnings.

    Reads the module globals RANDOM_SEED, NUM_WORKERS, N_CLASSES,
    USE_AMP_BF16 and INDEX_LABELS.

    Args:
        params: Dict with the keys "n_filters", "depth" and "learning_rate".
        inner_splits_local: Iterable of dicts providing "train_subjects" and
            "val_subjects".
        max_epochs: Upper bound on training epochs per inner split.

    Returns:
        (mean, std) over the inner splits of the best per-split validation
        macro-F1 (np.std, i.e. ddof=0).

    NOTE(review): the per-split score is the maximum over epochs measured on
    the same validation split that drives early stopping, so it is an
    optimistic estimate; fine for ranking trials, confirm before reporting.
    """
    # Seed once per call (not per split), so every trial starts from the same
    # generator state.
    torch.manual_seed(RANDOM_SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(RANDOM_SEED)

    # One inception block: a 1x1 bottleneck feeding parallel convolutions of
    # different kernel sizes, plus a max-pool -> 1x1 conv branch on the raw
    # input; branch outputs are concatenated along the channel axis and passed
    # through BatchNorm + ReLU.
    class InceptionModule(nn.Module):
        def __init__(self, in_channels, n_filters, kernel_sizes, bottleneck_channels):
            super().__init__()
            self.bottleneck = nn.Conv1d(in_channels, bottleneck_channels, 1, bias=False)
            # padding=k // 2 keeps the sequence length unchanged for odd k,
            # which the channel-wise concatenation in forward() relies on.
            self.conv_list = nn.ModuleList([
                nn.Conv1d(bottleneck_channels, n_filters, k, padding=k // 2, bias=False)
                for k in kernel_sizes
            ])
            self.maxpool_conv = nn.Sequential(
                nn.MaxPool1d(3, stride=1, padding=1),
                nn.Conv1d(in_channels, n_filters, 1, bias=False),
            )
            # One group of n_filters channels per conv branch plus the pool branch.
            out_channels = n_filters * (len(kernel_sizes) + 1)
            self.bn = nn.BatchNorm1d(out_channels)
            self.relu = nn.ReLU()

        def forward(self, x):
            bottleneck = self.bottleneck(x)
            conv_outputs = [conv(bottleneck) for conv in self.conv_list]
            maxpool_output = self.maxpool_conv(x)
            out = torch.cat([*conv_outputs, maxpool_output], dim=1)
            return self.relu(self.bn(out))

    # Sequential stack of `depth` inception blocks followed by global average
    # pooling over the sequence axis and a linear classifier.
    class InceptionTimeInner(nn.Module):
        def __init__(self, n_channels, n_classes, n_filters, depth):
            super().__init__()
            kernel_sizes = [9, 19, 39]
            bottleneck_channels = 32

            self.inception_modules = nn.ModuleList()
            in_ch = n_channels
            out_ch = n_filters * (len(kernel_sizes) + 1)

            # The first block consumes the raw channels; every later block
            # consumes the previous block's concatenated output.
            for _ in range(depth):
                self.inception_modules.append(
                    InceptionModule(in_ch, n_filters, kernel_sizes, bottleneck_channels)
                )
                in_ch = out_ch

            self.gap = nn.AdaptiveAvgPool1d(1)
            self.fc = nn.Linear(out_ch, n_classes)

        def forward(self, x):
            # Conv1d expects channels on axis 1; the windows carry them on
            # axis 2 (n_channels is taken from X_train.shape[2] below).
            x = x.transpose(1, 2).contiguous()
            for inception in self.inception_modules:
                x = inception(x)
            x = self.gap(x).squeeze(-1)
            return self.fc(x)

    # In-memory dataset: arrays are converted to tensors once, up front.
    class WindowDataset(Dataset):
        def __init__(self, X, y):
            self.X = torch.FloatTensor(X)
            self.y = torch.LongTensor(y)

        def __len__(self):
            return len(self.X)

        def __getitem__(self, idx):
            return self.X[idx], self.y[idx]

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    pin = torch.cuda.is_available()
    f1_scores = []

    for split in inner_splits_local:
        # Subject lists are passed as tuples because the loader is lru_cached
        # and needs hashable arguments.
        X_train, y_train, X_val, y_val = cached_window_subset(
            tuple(split["train_subjects"]),
            tuple(split["val_subjects"]),
        )

        train_loader = DataLoader(
            WindowDataset(X_train, y_train),
            batch_size=64,
            shuffle=True,
            num_workers=NUM_WORKERS,
            pin_memory=pin,
        )
        val_loader = DataLoader(
            WindowDataset(X_val, y_val),
            batch_size=64,
            shuffle=False,
            num_workers=NUM_WORKERS,
            pin_memory=pin,
        )

        # A fresh model and optimizer per inner split.
        n_channels_local = X_train.shape[2]
        model = InceptionTimeInner(
            n_channels_local, N_CLASSES, params["n_filters"], params["depth"]
        ).to(device)

        optimizer = torch.optim.Adam(
            model.parameters(), lr=params["learning_rate"], weight_decay=1e-4
        )
        criterion = nn.CrossEntropyLoss()

        # Early-stopping state: best validation macro-F1 so far and the number
        # of consecutive epochs without a sufficient improvement.
        best_f1 = 0.0
        patience_counter = 0

        for _ in range(max_epochs):
            # ---- one training epoch ----
            model.train()
            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(device, non_blocking=True)
                y_batch = y_batch.to(device, non_blocking=True)

                optimizer.zero_grad(set_to_none=True)

                # Forward pass and loss under bf16 autocast when available;
                # no GradScaler is used on this path.
                if USE_AMP_BF16:
                    with autocast(device_type="cuda", dtype=torch.bfloat16):
                        outputs = model(X_batch)
                        loss = criterion(outputs, y_batch)
                else:
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)

                loss.backward()
                optimizer.step()

            # ---- validation pass ----
            model.eval()
            all_preds, all_labels = [], []
            with torch.no_grad():
                for X_batch, y_batch in val_loader:
                    X_batch = X_batch.to(device, non_blocking=True)

                    if USE_AMP_BF16:
                        with autocast(device_type="cuda", dtype=torch.bfloat16):
                            outputs = model(X_batch)
                    else:
                        outputs = model(X_batch)

                    # Predicted class = arg-max logit; the labels are never
                    # moved to the device, so .numpy() works directly.
                    _, preds = outputs.max(1)
                    all_preds.extend(preds.cpu().numpy())
                    all_labels.extend(y_batch.numpy())

            # Macro-F1 over the full class index list, so classes absent from
            # this validation split still count (scored as 0 via zero_division).
            f1 = f1_score(
                all_labels, all_preds,
                average="macro", labels=INDEX_LABELS, zero_division=0
            )

            # An epoch counts as an improvement only if it beats the best
            # score by more than 1e-4; stop after 5 epochs in a row without one.
            if f1 > best_f1 + 1e-4:
                best_f1 = f1
                patience_counter = 0
            else:
                patience_counter += 1
                if patience_counter >= 5:
                    break

        f1_scores.append(best_f1)

    return np.mean(f1_scores), np.std(f1_scores)

# ========== 3. Loop over outer folds ==========

for FOLD_ID in outer_fold_ids:
    fold_tag = f"fold_{FOLD_ID:02d}"
    tuning_dir = Path("tuning") / fold_tag
    tuning_dir.mkdir(parents=True, exist_ok=True)

    scalers_dir = proc_dir / "scalers" / fold_tag
    train_meta_file = scalers_dir / "train_meta.parquet"

    print("\n" + "=" * 60)
    print(f"Outer fold: FOLD_ID={FOLD_ID} ({fold_tag})")
    print("=" * 60)
    print(f"BF16 mixed precision: {USE_AMP_BF16}")
    print(f"Scalers dir: {scalers_dir}")
    print(f"Tuning dir:  {tuning_dir}")

    # Reset cached inner subsets between folds
    cached_window_subset.cache_clear()

    # ---------- 3.1 Load global class set for this fold ----------
    print("\n" + "=" * 60)
    print("1. Load global class set")
    print("=" * 60)

    df_train_meta = pd.read_parquet(train_meta_file)
    train_subjects = sorted(df_train_meta["subject_id"].unique().tolist())
    n_train_subjects = len(train_subjects)

    all_train_y = np.load(scalers_dir / "y_train.npy")
    CLASS_LIST = sorted(np.unique(all_train_y).tolist())
    N_CLASSES = len(CLASS_LIST)

    CLASS_TO_INDEX = {c: i for i, c in enumerate(CLASS_LIST)}
    INDEX_LABELS = list(range(N_CLASSES))

    print(f"#Train subjects: {n_train_subjects}")
    print(f"Global class set: {CLASS_LIST} -> {INDEX_LABELS}")

    n_inner = min(INNER_CV_FOLDS, n_train_subjects)
    assert n_inner >= 2, "Need at least 2 inner folds"
    if n_inner < INNER_CV_FOLDS:
        print(f"⚠️ Adjust inner folds: {INNER_CV_FOLDS} -> {n_inner}")

    # ---------- 3.2 Inner CV splitting ----------
    print("\n" + "=" * 60)
    print("2. Inner CV splitting (GroupKFold)")
    print("=" * 60)

    df_train_meta = df_train_meta.sort_values(
        ["subject_id", "session_id", "window_id"],
        kind="mergesort"
    ).reset_index(drop=True)

    window_subjects = df_train_meta["subject_id"].values
    window_indices = np.arange(len(df_train_meta))

    gkf = GroupKFold(n_splits=n_inner)
    inner_cv_splits = []

    for fold_idx, (train_idx, val_idx) in enumerate(
        gkf.split(window_indices, groups=window_subjects)
    ):
        val_subjects_inner = sorted(set(window_subjects[val_idx]))
        train_subjects_inner = sorted(set(window_subjects[train_idx]))

        inner_cv_splits.append({
            "fold": fold_idx,
            "train_subjects": train_subjects_inner,
            "val_subjects": val_subjects_inner,
            "train_window_indices": train_idx.tolist(),
            "val_window_indices": val_idx.tolist(),
        })
        print(
            f"  Inner fold {fold_idx}: "
            f"train {len(train_subjects_inner)} subs, val {len(val_subjects_inner)} subs"
        )

    for split in inner_cv_splits:
        assert set(split["train_subjects"]).isdisjoint(set(split["val_subjects"]))
    print("✓ Disjointness check passed")

    inner_splits_file = tuning_dir / "inner_cv_splits.json"
    with open(inner_splits_file, "w", encoding="utf-8") as f:
        splits_to_save = [{
            "fold": s["fold"],
            "train_subjects": s["train_subjects"],
            "val_subjects": s["val_subjects"],
            "n_train_windows": len(s["train_window_indices"]),
            "n_val_windows": len(s["val_window_indices"]),
        } for s in inner_cv_splits]
        json.dump(splits_to_save, f, ensure_ascii=False, indent=2)
    print(f"✓ Saved: {inner_splits_file}")

    # ---------- Tune RandomForest (exhaustive grid over RF_PARAM_GRID) ----------
    if "rf" in TUNE_MODELS:
        print("\n" + "=" * 60)
        print("5. Tune RandomForest")
        print("=" * 60)

        # Cartesian product of the grid; names and value lists are derived from
        # the same key order, so zip(param_names, combo) stays aligned.
        param_names = list(RF_PARAM_GRID.keys())
        param_values = [RF_PARAM_GRID[k] for k in param_names]
        rf_param_combinations = list(product(*param_values))

        print(f"Number of parameter combinations: {len(rf_param_combinations)}")

        rf_trials = []
        best_rf_f1 = 0.0
        best_rf_params = None

        for i, combo in enumerate(rf_param_combinations):
            params = dict(zip(param_names, combo))
            print(f"\nTrial {i+1}/{len(rf_param_combinations)}: {params}")

            # evaluate_rf is defined elsewhere; it is used here as returning
            # (mean, std) of the inner-CV Macro-F1 for this parameter set.
            start_time = time.time()
            mean_f1, std_f1 = evaluate_rf(params, inner_cv_splits)
            elapsed = time.time() - start_time

            print(f"  Macro-F1: {mean_f1:.4f} ± {std_f1:.4f} ({elapsed:.2f}s)")

            rf_trials.append({
                "trial": i,
                "params": params,
                "mean_f1": float(mean_f1),
                "std_f1": float(std_f1),
                "time": float(elapsed),
            })

            # The first trial always seeds the incumbent, so best_rf_params can
            # no longer stay None when every score is 0.0 (a strict ">" against
            # an initial 0.0 never fired in that case). A NaN incumbent
            # (x != x only holds for NaN) is replaced by the first real score.
            is_new_best = (
                best_rf_params is None
                or mean_f1 > best_rf_f1
                or (best_rf_f1 != best_rf_f1 and mean_f1 == mean_f1)
            )
            if is_new_best:
                best_rf_f1 = mean_f1
                best_rf_params = params
                print("  ✓ New best!")

        print(f"\n✓ Best RandomForest: {best_rf_params} (F1={best_rf_f1:.4f})")

        # Full trial log plus the winner; the JSON file is what Step 15 loads.
        pd.DataFrame(rf_trials).to_csv(tuning_dir / "rf_trials.csv", index=False)
        with open(tuning_dir / "rf_best_params.json", "w", encoding="utf-8") as f:
            json.dump(
                {"params": best_rf_params, "mean_f1": float(best_rf_f1)},
                f, indent=2
            )

    # ---------- Tune KNN (exhaustive grid over KNN_PARAM_GRID) ----------
    if "knn" in TUNE_MODELS:
        print("\n" + "=" * 60)
        print("6. Tune KNN")
        print("=" * 60)

        # Cartesian product of the grid; names and value lists are derived from
        # the same key order, so zip(param_names, combo) stays aligned.
        param_names = list(KNN_PARAM_GRID.keys())
        param_values = [KNN_PARAM_GRID[k] for k in param_names]
        knn_param_combinations = list(product(*param_values))

        print(f"Number of parameter combinations: {len(knn_param_combinations)}")

        knn_trials = []
        best_knn_f1 = 0.0
        best_knn_params = None

        for i, combo in enumerate(knn_param_combinations):
            params = dict(zip(param_names, combo))
            print(f"\nTrial {i+1}/{len(knn_param_combinations)}: {params}")

            # evaluate_knn is defined elsewhere; it is used here as returning
            # (mean, std) of the inner-CV Macro-F1 for this parameter set.
            start_time = time.time()
            mean_f1, std_f1 = evaluate_knn(params, inner_cv_splits)
            elapsed = time.time() - start_time

            print(f"  Macro-F1: {mean_f1:.4f} ± {std_f1:.4f} ({elapsed:.2f}s)")

            knn_trials.append({
                "trial": i,
                "params": params,
                "mean_f1": float(mean_f1),
                "std_f1": float(std_f1),
                "time": float(elapsed),
            })

            # The first trial always seeds the incumbent, so best_knn_params can
            # no longer stay None when every score is 0.0 (a strict ">" against
            # an initial 0.0 never fired in that case). A NaN incumbent
            # (x != x only holds for NaN) is replaced by the first real score.
            is_new_best = (
                best_knn_params is None
                or mean_f1 > best_knn_f1
                or (best_knn_f1 != best_knn_f1 and mean_f1 == mean_f1)
            )
            if is_new_best:
                best_knn_f1 = mean_f1
                best_knn_params = params
                print("  ✓ New best!")

        print(f"\n✓ Best KNN: {best_knn_params} (F1={best_knn_f1:.4f})")

        # Full trial log plus the winner; the JSON file is what Step 15 loads.
        pd.DataFrame(knn_trials).to_csv(tuning_dir / "knn_trials.csv", index=False)
        with open(tuning_dir / "knn_best_params.json", "w", encoding="utf-8") as f:
            json.dump(
                {"params": best_knn_params, "mean_f1": float(best_knn_f1)},
                f, indent=2
            )

    # ---------- Tune InceptionTime (fixed list of trial configurations) ----------
    if "inception" in TUNE_MODELS:
        print("\n" + "=" * 60)
        print("7. Tune InceptionTime")
        print("=" * 60)

        print(f"#Trials: {len(INCEPTION_PARAM_TRIALS)}")

        inception_trials = []
        best_inception_f1 = 0.0
        best_inception_params = None

        for i, params in enumerate(INCEPTION_PARAM_TRIALS):
            print(f"\nTrial {i+1}/{len(INCEPTION_PARAM_TRIALS)}: {params}")

            # evaluate_inception is defined elsewhere; it is used here as
            # returning (mean, std) of the inner-CV Macro-F1. Tuning runs are
            # capped at 20 epochs.
            start_time = time.time()
            mean_f1, std_f1 = evaluate_inception(params, inner_cv_splits, max_epochs=20)
            elapsed = time.time() - start_time

            print(f"  Macro-F1: {mean_f1:.4f} ± {std_f1:.4f} ({elapsed:.2f}s)")

            inception_trials.append({
                "trial": i,
                "params": params,
                "mean_f1": float(mean_f1),
                "std_f1": float(std_f1),
                "time": float(elapsed),
            })

            # The first trial always seeds the incumbent, so
            # best_inception_params can no longer stay None when every score is
            # 0.0 (a strict ">" against an initial 0.0 never fired in that
            # case). A NaN incumbent (x != x only holds for NaN), e.g. from a
            # diverged run, is replaced by the first real score.
            is_new_best = (
                best_inception_params is None
                or mean_f1 > best_inception_f1
                or (best_inception_f1 != best_inception_f1 and mean_f1 == mean_f1)
            )
            if is_new_best:
                best_inception_f1 = mean_f1
                best_inception_params = params
                print("  ✓ New best!")

        print(f"\n✓ Best InceptionTime: {best_inception_params} (F1={best_inception_f1:.4f})")

        # Full trial log plus the winner; the JSON file is what Step 15 loads.
        pd.DataFrame(inception_trials).to_csv(tuning_dir / "inception_trials.csv", index=False)
        with open(tuning_dir / "inception_best_params.json", "w", encoding="utf-8") as f:
            json.dump(
                {"params": best_inception_params, "mean_f1": float(best_inception_f1)},
                f, indent=2
            )

    # ---------- 7. Save per-fold tuning config ----------
    banner = "=" * 60
    print("\n" + banner)
    print("8. Save config")
    print(banner)

    # Gather the winner of every model that was actually tuned. The best_*
    # variables only exist for tuned models, hence the explicit guards.
    best_by_model = {}
    if "rf" in TUNE_MODELS:
        best_by_model["rf"] = dict(params=best_rf_params, mean_f1=float(best_rf_f1))
    if "knn" in TUNE_MODELS:
        best_by_model["knn"] = dict(params=best_knn_params, mean_f1=float(best_knn_f1))
    if "inception" in TUNE_MODELS:
        best_by_model["inception"] = dict(
            params=best_inception_params,
            mean_f1=float(best_inception_f1),
        )

    tuning_config = dict(
        fold_id=FOLD_ID,
        fold_tag=fold_tag,
        random_seed=RANDOM_SEED,
        n_inner_folds=n_inner,
        tuned_models=TUNE_MODELS,
        best_params=best_by_model,
    )

    config_path = tuning_dir / "tuning_config.yaml"
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(tuning_config, f, default_flow_style=False, allow_unicode=True)

    print(f"✓ Saved: {config_path}")
    print("\n" + banner)
    print(f"Step 14 complete for outer fold {FOLD_ID} ({fold_tag})")
    print(banner)
    print(f"Output dir: {tuning_dir}/")

    # List the artifacts written for this fold, one line per tuned model.
    artifact_lines = ["  - inner_cv_splits.json"]
    for model_key in ("rf", "knn", "inception"):
        if model_key in TUNE_MODELS:
            artifact_lines.append(
                f"  - {model_key}_trials.csv, {model_key}_best_params.json"
            )
    artifact_lines.append("  - tuning_config.yaml")
    print("\n".join(artifact_lines))

print("\n" + "=" * 60)
print("Step 14 complete for all outer folds")
print("=" * 60)

Outer folds detected: [0, 1, 2, 3, 4, 5, 6, 7]
BF16 mixed precision available: True

Outer fold: FOLD_ID=0 (fold_00)
BF16 mixed precision: True
Scalers dir: data/lara/mbientlab/proc/scalers/fold_00
Tuning dir:  tuning/fold_00

1. Load global class set
#Train subjects: 7
Global class set: [1, 2, 3, 4, 5, 6] -> [0, 1, 2, 3, 4, 5]

2. Inner CV splitting (GroupKFold)
  Inner fold 0: train 6 subs, val 1 subs
  Inner fold 1: train 6 subs, val 1 subs
  Inner fold 2: train 6 subs, val 1 subs
  Inner fold 3: train 5 subs, val 2 subs
  Inner fold 4: train 5 subs, val 2 subs
✓ Disjointness check passed
✓ Saved: tuning/fold_00/inner_cv_splits.json

5. Tune RandomForest
Number of parameter combinations: 8

Trial 1/8: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 2}
  Macro-F1: 0.4267 ± 0.0689 (6.04s)
  ✓ New best!

Trial 2/8: {'n_estimators': 100, 'max_depth': 20, 'min_samples_split': 5}
  Macro-F1: 0.4243 ± 0.0737 (5.79s)

Trial 3/8: {'n_estimators': 100, 'max_depth': 30, 'min_sample

In [11]:
#!/usr/bin/env python3
"""
Step 15: Training & Inference (top-conf/journal grade, multi-fold)
Fit on the full training fold with best hyperparameters, run inference on the test subject,
and save per-window predictions for all outer folds.
"""

import numpy as np
import pandas as pd
from pathlib import Path
import json
import yaml
import os
import pickle
import time
import shutil
from datetime import datetime, timezone

# PyTorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.amp import autocast

# Classical models
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler

# ========== Config ==========
RANDOM_SEED = 42
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Loader settings; the same values are also used for the final training loader.
INFERENCE_CONFIG = dict(batch_size=128, num_workers=4)

# BF16 autocast is only enabled on CUDA devices that report BF16 support.
USE_AMP_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

banner = "=" * 60
print(banner)
print("Step 15: Training & Inference (multi-fold)")
print(banner)

# Path config
proc_dir = Path("data/lara/mbientlab/proc")
configs_dir = Path("configs")
configs_dir.mkdir(parents=True, exist_ok=True)

models_root = Path("models")
predictions_root = Path("predictions")
for out_root in (models_root, predictions_root):
    out_root.mkdir(parents=True, exist_ok=True)

print(f"\nDevice: {DEVICE}")
print(f"BF16 mixed precision: {USE_AMP_BF16}")
print(f"Inference batch size: {INFERENCE_CONFIG['batch_size']}")
print(f"Num workers: {INFERENCE_CONFIG['num_workers']}")

# Set seeds
torch.manual_seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# ========== Determine outer folds from splits.json ==========
# The keys of splits.json are the outer fold ids; without the file there is
# nothing to iterate over, so fail early.
splits_path = configs_dir / "splits.json"
if not splits_path.exists():
    raise RuntimeError("splits.json not found; Step 15 expects defined LOSO folds.")
with open(splits_path, "r", encoding="utf-8") as f:
    splits_cfg = json.load(f)
outer_fold_ids = sorted(map(int, splits_cfg.keys()))

print(f"\nOuter folds detected: {outer_fold_ids}")

# ========== InceptionTime model definition ==========
class InceptionModule(nn.Module):
    """One Inception block for 1-D signals shaped (batch, channels, time).

    The input passes through a 1x1 bottleneck that feeds one convolution per
    entry of ``kernel_sizes``; in parallel, the raw input goes through a
    max-pool followed by a 1x1 convolution. All branch outputs are
    concatenated along the channel axis, batch-normalised and passed through
    ReLU, yielding ``n_filters * (len(kernel_sizes) + 1)`` channels.

    Each branch pads with ``k // 2``, which keeps the time length unchanged
    for odd kernel sizes.
    """

    def __init__(self, in_channels, n_filters, kernel_sizes, bottleneck_channels):
        super().__init__()
        # Submodules are created in a fixed order (bottleneck, branches,
        # pooling path, norm) so parameter initialisation under a given seed
        # and state_dict keys stay stable.
        self.bottleneck = nn.Conv1d(
            in_channels, bottleneck_channels, kernel_size=1, bias=False
        )

        branches = []
        for width in kernel_sizes:
            branches.append(
                nn.Conv1d(
                    bottleneck_channels,
                    n_filters,
                    kernel_size=width,
                    padding=width // 2,
                    bias=False,
                )
            )
        self.conv_list = nn.ModuleList(branches)

        self.maxpool_conv = nn.Sequential(
            nn.MaxPool1d(kernel_size=3, stride=1, padding=1),
            nn.Conv1d(in_channels, n_filters, kernel_size=1, bias=False),
        )

        n_branches = len(kernel_sizes) + 1  # conv branches + pooling branch
        self.bn = nn.BatchNorm1d(n_filters * n_branches)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the concatenated, normalised and activated branch outputs."""
        reduced = self.bottleneck(x)
        branch_maps = [branch(reduced) for branch in self.conv_list]
        branch_maps.append(self.maxpool_conv(x))
        stacked = torch.cat(branch_maps, dim=1)
        return self.relu(self.bn(stacked))


class InceptionTime(nn.Module):
    """Stack of ``depth`` InceptionModule blocks with a linear classifier head.

    ``forward`` expects input shaped (batch, time, channels); it is transposed
    to channels-first before the convolutional stack. The output is one logit
    per class, shaped (batch, n_classes).
    """

    def __init__(self, n_channels, n_classes, n_filters, depth):
        super().__init__()
        kernel_sizes = [9, 19, 39]
        bottleneck_channels = 32
        # Every block emits one group of n_filters channels per conv branch
        # plus one for the pooling branch.
        block_width = n_filters * (len(kernel_sizes) + 1)

        self.inception_modules = nn.ModuleList()
        for layer_idx in range(depth):
            # Only the first block sees the raw sensor channels.
            fan_in = n_channels if layer_idx == 0 else block_width
            self.inception_modules.append(
                InceptionModule(fan_in, n_filters, kernel_sizes, bottleneck_channels)
            )

        self.gap = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(block_width, n_classes)

    def forward(self, x):
        """Return class logits for a batch of (batch, time, channels) windows."""
        feats = x.transpose(1, 2).contiguous()
        for block in self.inception_modules:
            feats = block(feats)
        # Global average pooling over time, then drop the singleton time axis.
        pooled = self.gap(feats).squeeze(-1)
        return self.fc(pooled)


class WindowDataset(Dataset):
    """In-memory dataset pairing each window with its integer class label.

    Inputs are converted once at construction: ``X`` to a float32 tensor and
    ``y`` to an int64 tensor.
    """

    def __init__(self, X, y):
        self.X, self.y = torch.FloatTensor(X), torch.LongTensor(y)

    def __len__(self):
        # Number of windows = size of the leading axis of X.
        return self.X.shape[0]

    def __getitem__(self, idx):
        window = self.X[idx]
        label = self.y[idx]
        return window, label


# For cross-fold summary
fold_summaries = []

# ========== Loop over all outer folds ==========
for FOLD_ID in outer_fold_ids:
    fold_tag = f"fold_{FOLD_ID:02d}"
    tuning_dir = Path("tuning") / fold_tag
    models_dir = models_root / fold_tag
    predictions_dir = predictions_root / fold_tag

    models_dir.mkdir(parents=True, exist_ok=True)
    predictions_dir.mkdir(parents=True, exist_ok=True)

    print("\n" + "=" * 60)
    print(f"Outer fold: FOLD_ID={FOLD_ID} ({fold_tag})")
    print("=" * 60)

    # ========== 1. Load best hyperparameters ==========
    print("\n" + "=" * 60)
    print("1. Load best hyperparameters")
    print("=" * 60)

    best_params = {}

    rf_params_file = tuning_dir / "rf_best_params.json"
    if rf_params_file.exists():
        with open(rf_params_file, "r") as f:
            best_params["rf"] = json.load(f)
        print(f"✓ RF: {best_params['rf']['params']}")

    knn_params_file = tuning_dir / "knn_best_params.json"
    if knn_params_file.exists():
        with open(knn_params_file, "r") as f:
            best_params["knn"] = json.load(f)
        print(f"✓ KNN: {best_params['knn']['params']}")

    inception_params_file = tuning_dir / "inception_best_params.json"
    if inception_params_file.exists():
        with open(inception_params_file, "r") as f:
            best_params["inception"] = json.load(f)
        print(f"✓ InceptionTime: {best_params['inception']['params']}")

    if not best_params:
        print(f"⚠️ No best hyperparameters found for {fold_tag}; skipping this fold.")
        continue

    # ========== 2. Load data and build label mapping ==========
    print("\n" + "=" * 60)
    print("2. Load data and label mapping")
    print("=" * 60)

    scalers_dir = proc_dir / "scalers" / fold_tag
    features_dir = proc_dir / "features" / fold_tag

    # Deep meta & labels
    train_meta_file = scalers_dir / "train_meta.parquet"
    df_train_meta = pd.read_parquet(train_meta_file)
    all_train_y = np.load(scalers_dir / "y_train.npy")

    # Class indices are defined by the labels present in this fold's training
    # set: sorted original labels -> 0..N_CLASSES-1.
    CLASS_LIST = sorted(np.unique(all_train_y).tolist())
    N_CLASSES = len(CLASS_LIST)
    CLASS_TO_INDEX = {c: i for i, c in enumerate(CLASS_LIST)}
    INDEX_TO_CLASS = {i: c for c, i in CLASS_TO_INDEX.items()}

    print(f"#classes: {N_CLASSES}")
    print(f"Class mapping: {CLASS_LIST} -> {list(range(N_CLASSES))}")

    # Meta for test sets
    df_test_meta_deep = pd.read_parquet(scalers_dir / "test_meta.parquet")
    df_test_meta_feat = pd.read_parquet(features_dir / "test_meta.parquet")

    # Deep model data
    X_train_deep = np.load(scalers_dir / "X_train_scaled.npy")
    y_train_deep = np.array([CLASS_TO_INDEX[int(y)] for y in all_train_y], dtype=np.int64)

    X_test_deep = np.load(scalers_dir / "X_test_scaled.npy")
    y_test_raw = np.load(scalers_dir / "y_test.npy")

    # A held-out subject may carry a label that never occurs in training; the
    # mapping below would then die with a bare KeyError. Fail with context.
    unseen_deep = sorted({int(y) for y in np.unique(y_test_raw)} - set(CLASS_TO_INDEX))
    if unseen_deep:
        raise ValueError(
            f"[{fold_tag}] y_test.npy contains labels {unseen_deep} that are absent "
            f"from the training labels {CLASS_LIST}; cannot map them to class indices"
        )
    y_test_deep = np.array([CLASS_TO_INDEX[int(y)] for y in y_test_raw], dtype=np.int64)

    # Explicit raise instead of assert so the check survives `python -O`.
    if len(df_test_meta_deep) != X_test_deep.shape[0]:
        raise ValueError(f"[{fold_tag}] Deep-model metadata and data are misaligned")

    print(f"\nDeep-model data ({fold_tag}):")
    print(f"  Train: {X_train_deep.shape}")
    print(f"  Test:  {X_test_deep.shape}")

    # Feature-based data
    df_train_X = pd.read_parquet(features_dir / "train_X.parquet")
    df_train_y = pd.read_parquet(features_dir / "train_y.parquet")
    df_test_X = pd.read_parquet(features_dir / "test_X.parquet")
    df_test_y = pd.read_parquet(features_dir / "test_y.parquet")

    # Same label check for the feature tables, which are mapped with the
    # class indices derived from y_train.npy above.
    for table_name, raw_labels in (
        ("train_y.parquet", df_train_y["label"].values),
        ("test_y.parquet", df_test_y["label"].values),
    ):
        unseen_feat = sorted({int(y) for y in np.unique(raw_labels)} - set(CLASS_TO_INDEX))
        if unseen_feat:
            raise ValueError(
                f"[{fold_tag}] {table_name} contains labels {unseen_feat} that are absent "
                f"from the training labels {CLASS_LIST}; cannot map them to class indices"
            )

    X_train_feat = df_train_X.values
    y_train_feat = np.array(
        [CLASS_TO_INDEX[int(y)] for y in df_train_y["label"].values],
        dtype=np.int64,
    )
    X_test_feat = df_test_X.values
    y_test_feat = np.array(
        [CLASS_TO_INDEX[int(y)] for y in df_test_y["label"].values],
        dtype=np.int64,
    )

    if len(df_test_meta_feat) != X_test_feat.shape[0]:
        raise ValueError(f"[{fold_tag}] Feature-model metadata and data are misaligned")

    print(f"\nFeature-model data ({fold_tag}):")
    print(f"  Train: {X_train_feat.shape}")
    print(f"  Test:  {X_test_feat.shape}")

    # Feature standardization (prefer per-fold scaler, fallback to features dir)
    scaler_candidates = [
        scalers_dir / "feature_scaler.pkl",   # per-fold
        features_dir / "scaler.pkl",          # from Step 11
    ]
    feat_scaler = None
    for p in scaler_candidates:
        if p.exists():
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only acceptable because these files are produced by this pipeline.
            with open(p, "rb") as f:
                feat_scaler = pickle.load(f)
            # transform() is called on plain arrays (`.values`) below; the
            # stored column names are dropped, presumably to avoid sklearn's
            # feature-name mismatch warning.
            if hasattr(feat_scaler, "feature_names_in_"):
                delattr(feat_scaler, "feature_names_in_")
            # Cache the fallback scaler at the per-fold location for later runs.
            if p != scaler_candidates[0]:
                shutil.copy(p, scaler_candidates[0])
            print(f"\n✓ Loaded feature scaler: {p}")
            break

    if feat_scaler is None:
        print("\n⚠️ Feature scaler not found; refitting on this fold's training set")
        feat_scaler = StandardScaler().fit(X_train_feat)
        with open(scalers_dir / "feature_scaler.pkl", "wb") as f:
            pickle.dump(feat_scaler, f)

    X_train_feat = feat_scaler.transform(X_train_feat)
    X_test_feat = feat_scaler.transform(X_test_feat)
    print("  Features standardized (consistent with Step 14)")

    # Per-fold summary metrics; they stay None for models that are not trained.
    rf_acc = rf_f1 = None
    knn_acc = knn_f1 = None
    inception_acc = inception_f1 = None

    # ========== 3. Train RandomForest ==========
    if "rf" in best_params:
        print("\n" + "=" * 60)
        print("3. Train RandomForest (full training set)")
        print("=" * 60)

        rf_params = best_params["rf"]["params"]
        print(f"Hyperparameters: {rf_params}")

        rf_start = time.time()
        rf_model = RandomForestClassifier(
            random_state=RANDOM_SEED,
            n_jobs=-1,
            **rf_params,
        )
        rf_model.fit(X_train_feat, y_train_feat)
        rf_train_time = time.time() - rf_start

        # Column i of predict_proba must correspond to class index i, because
        # the proba_class_* columns below are labelled via CLASS_LIST[i].
        # Checked right after fitting (before any inference) and with an
        # explicit raise so it is not stripped under `python -O`.
        expected_classes = list(range(N_CLASSES))
        if list(rf_model.classes_) != expected_classes:
            raise RuntimeError(
                f"[{fold_tag}] RF classes order mismatch: {rf_model.classes_} != {expected_classes}"
            )

        print("\nInferencing (RF)...")
        rf_test_proba = rf_model.predict_proba(X_test_feat)
        # A forest's predicted class is the argmax of its averaged tree
        # probabilities, so derive it from the probabilities already computed
        # instead of running the whole forest a second time. This also
        # guarantees pred_label agrees with the saved probability columns.
        rf_test_pred = rf_model.classes_[np.argmax(rf_test_proba, axis=1)]

        with open(models_dir / "rf_final.pkl", "wb") as f:
            pickle.dump(rf_model, f)

        # Per-window predictions in original label space, plus one probability
        # column per class.
        df_rf_pred = pd.DataFrame({
            "window_id": df_test_meta_feat["window_id"].values,
            "subject_id": df_test_meta_feat["subject_id"].values,
            "pred_label": [INDEX_TO_CLASS[int(p)] for p in rf_test_pred],
            "true_label": [INDEX_TO_CLASS[int(y)] for y in y_test_feat],
        })
        for i in range(N_CLASSES):
            df_rf_pred[f"proba_class_{CLASS_LIST[i]}"] = rf_test_proba[:, i]

        df_rf_pred.to_parquet(predictions_dir / "rf_predictions.parquet", index=False)

        rf_acc = accuracy_score(y_test_feat, rf_test_pred)
        rf_f1 = f1_score(y_test_feat, rf_test_pred, average="macro", zero_division=0)

        print(f"\n✓ RF training complete ({fold_tag}):")
        print(f"  Train time: {rf_train_time:.2f}s")
        print(f"  Test Accuracy: {rf_acc:.4f}")
        print(f"  Test F1:       {rf_f1:.4f}")
        print("  Saved: rf_final.pkl, rf_predictions.parquet")

    # ========== 4. Train KNN ==========
    if "knn" in best_params:
        print("\n" + "=" * 60)
        print("4. Train KNN (full training set)")
        print("=" * 60)

        knn_params = best_params["knn"]["params"]
        print(f"Hyperparameters: {knn_params}")

        knn_start = time.time()
        knn_model = KNeighborsClassifier(n_jobs=-1, **knn_params)
        knn_model.fit(X_train_feat, y_train_feat)
        knn_train_time = time.time() - knn_start

        # Column i of predict_proba must correspond to class index i, because
        # the proba_class_* columns below are labelled via CLASS_LIST[i].
        # Checked right after fitting (before the costly neighbour search) and
        # with an explicit raise so it is not stripped under `python -O`.
        expected_classes = list(range(N_CLASSES))
        if list(knn_model.classes_) != expected_classes:
            raise RuntimeError(
                f"[{fold_tag}] KNN classes order mismatch: {knn_model.classes_} != {expected_classes}"
            )

        print("\nInferencing (KNN)...")
        knn_test_proba = knn_model.predict_proba(X_test_feat)
        knn_test_pred = knn_model.predict(X_test_feat)

        with open(models_dir / "knn_final.pkl", "wb") as f:
            pickle.dump(knn_model, f)

        # Per-window predictions in original label space, plus one probability
        # column per class.
        df_knn_pred = pd.DataFrame({
            "window_id": df_test_meta_feat["window_id"].values,
            "subject_id": df_test_meta_feat["subject_id"].values,
            "pred_label": [INDEX_TO_CLASS[int(p)] for p in knn_test_pred],
            "true_label": [INDEX_TO_CLASS[int(y)] for y in y_test_feat],
        })
        for i in range(N_CLASSES):
            df_knn_pred[f"proba_class_{CLASS_LIST[i]}"] = knn_test_proba[:, i]

        df_knn_pred.to_parquet(predictions_dir / "knn_predictions.parquet", index=False)

        knn_acc = accuracy_score(y_test_feat, knn_test_pred)
        knn_f1 = f1_score(y_test_feat, knn_test_pred, average="macro", zero_division=0)

        print(f"\n✓ KNN training complete ({fold_tag}):")
        print(f"  Train time: {knn_train_time:.2f}s")
        print(f"  Test Accuracy: {knn_acc:.4f}")
        print(f"  Test F1:       {knn_f1:.4f}")
        print("  Saved: knn_final.pkl, knn_predictions.parquet")

    # ========== 5. Train InceptionTime ==========
    if "inception" in best_params:
        print("\n" + "=" * 60)
        print("5. Train InceptionTime (full training set)")
        print("=" * 60)

        inception_params = best_params["inception"]["params"]
        print(f"Hyperparameters: {inception_params}")

        pin = torch.cuda.is_available()
        train_loader = DataLoader(
            WindowDataset(X_train_deep, y_train_deep),
            batch_size=INFERENCE_CONFIG["batch_size"],
            shuffle=True,
            num_workers=INFERENCE_CONFIG["num_workers"],
            pin_memory=pin,
        )
        # shuffle=False keeps test batches in file order, which the
        # row-for-row pairing with df_test_meta_deep below relies on.
        test_loader = DataLoader(
            WindowDataset(X_test_deep, y_test_deep),
            batch_size=INFERENCE_CONFIG["batch_size"],
            shuffle=False,
            num_workers=INFERENCE_CONFIG["num_workers"],
            pin_memory=pin,
        )

        # Windows are stored as (n_windows, time, channels).
        n_channels = X_train_deep.shape[2]
        model = InceptionTime(
            n_channels=n_channels,
            n_classes=N_CLASSES,
            n_filters=inception_params["n_filters"],
            depth=inception_params["depth"],
        ).to(DEVICE)

        optimizer = torch.optim.Adam(
            model.parameters(),
            lr=inception_params["learning_rate"],
            weight_decay=1e-4,
        )
        criterion = nn.CrossEntropyLoss()

        print("\nStart training (InceptionTime)...")
        inception_start = time.time()
        max_epochs = 50
        patience = 10
        best_train_loss = float("inf")
        patience_counter = 0

        # There is no validation split at this stage, so the training loss
        # drives both checkpointing and early stopping.
        checkpoint_path = models_dir / "inception_final.pt"
        checkpoint_saved = False  # True once THIS run has written a checkpoint

        for epoch in range(max_epochs):
            model.train()
            total_loss = 0.0
            correct = 0
            total = 0

            for X_batch, y_batch in train_loader:
                X_batch = X_batch.to(DEVICE, non_blocking=True)
                y_batch = y_batch.to(DEVICE, non_blocking=True)

                optimizer.zero_grad(set_to_none=True)

                if USE_AMP_BF16:
                    with autocast(device_type="cuda", dtype=torch.bfloat16):
                        outputs = model(X_batch)
                        loss = criterion(outputs, y_batch)
                else:
                    outputs = model(X_batch)
                    loss = criterion(outputs, y_batch)

                loss.backward()
                optimizer.step()

                # Weight the batch-mean loss by batch size so the epoch average
                # is a per-sample mean even with a smaller final batch.
                total_loss += loss.item() * X_batch.size(0)
                _, preds = outputs.max(1)
                total += y_batch.size(0)
                correct += preds.eq(y_batch).sum().item()

            train_loss = total_loss / total
            train_acc = correct / total

            print(f"Epoch {epoch+1}/{max_epochs}: Loss={train_loss:.4f}, Acc={train_acc:.4f}")

            # Checkpoint on an improvement of more than 1e-4; stop after
            # `patience` consecutive epochs without one.
            if train_loss < best_train_loss - 1e-4:
                best_train_loss = train_loss
                patience_counter = 0
                torch.save(model.state_dict(), checkpoint_path)
                checkpoint_saved = True
            else:
                patience_counter += 1
                if patience_counter >= patience:
                    print(f"\nEarly stopping triggered at epoch {epoch+1}")
                    break

        inception_train_time = time.time() - inception_start

        # If the loss never improved (e.g. it was NaN from the first epoch) no
        # checkpoint was written in this run. Loading the path anyway would
        # either fail or silently pick up a stale file from an earlier run, so
        # store the final-epoch weights instead.
        if not checkpoint_saved:
            print("\n⚠️ Training loss never improved; saving final-epoch weights as inception_final.pt")
            torch.save(model.state_dict(), checkpoint_path)

        # Load best model
        model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))

        # Inference
        print("\nInferencing (InceptionTime)...")
        model.eval()
        all_logits = []
        all_preds = []
        all_labels = []

        with torch.no_grad():
            for X_batch, y_batch in test_loader:
                X_batch = X_batch.to(DEVICE, non_blocking=True)

                if USE_AMP_BF16:
                    with autocast(device_type="cuda", dtype=torch.bfloat16):
                        outputs = model(X_batch)
                else:
                    outputs = model(X_batch)

                # NumPy has no bfloat16; cast to float32 before leaving torch.
                all_logits.append(outputs.float().cpu().numpy())
                _, preds = outputs.max(1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(y_batch.cpu().numpy())

        all_logits = np.vstack(all_logits)
        all_proba = torch.softmax(torch.FloatTensor(all_logits), dim=1).numpy()
        all_preds = np.array(all_preds)
        all_labels = np.array(all_labels)

        # Per-window predictions in original label space, with raw logits and
        # softmax probabilities per class.
        df_inception_pred = pd.DataFrame({
            "window_id": df_test_meta_deep["window_id"].values,
            "subject_id": df_test_meta_deep["subject_id"].values,
            "pred_label": [INDEX_TO_CLASS[int(p)] for p in all_preds],
            "true_label": [INDEX_TO_CLASS[int(y)] for y in all_labels],
        })
        for i in range(N_CLASSES):
            df_inception_pred[f"logit_class_{CLASS_LIST[i]}"] = all_logits[:, i]
        for i in range(N_CLASSES):
            df_inception_pred[f"proba_class_{CLASS_LIST[i]}"] = all_proba[:, i]

        df_inception_pred.to_parquet(predictions_dir / "inception_predictions.parquet", index=False)

        inception_acc = accuracy_score(all_labels, all_preds)
        inception_f1 = f1_score(all_labels, all_preds, average="macro", zero_division=0)

        print(f"\n✓ InceptionTime training complete ({fold_tag}):")
        print(f"  Train time: {inception_train_time:.2f}s")
        print(f"  Test Accuracy: {inception_acc:.4f}")
        print(f"  Test F1:       {inception_f1:.4f}")
        print("  Saved: inception_final.pt, inception_predictions.parquet")

    # ========== 6. Save per-fold training config/results ==========
    fold_banner = "=" * 60
    print("\n" + fold_banner)
    print("6. Save training config/results")
    print(fold_banner)

    final_results = {
        "fold_id": FOLD_ID,
        "fold_tag": fold_tag,
        "random_seed": RANDOM_SEED,
        "timestamp": datetime.now(timezone.utc).isoformat(),
        "inference_config": INFERENCE_CONFIG,
        "class_mapping": {
            "original_classes": CLASS_LIST,
            "mapped_indices": list(range(N_CLASSES)),
            "class_to_index": CLASS_TO_INDEX,
            "index_to_class": INDEX_TO_CLASS,
        },
        "models": {},
        "notes": [
            "Train final models with best hyperparameters from Step 14",
            "Fit on the full training set for each outer fold",
            "Run full inference on the test set",
            "Save per-window predictions (window_id → logits/proba)",
            "RF/KNN use metadata from the features directory (ensures alignment)",
            "Deep model uses metadata from the scalers directory",
            "Feature standardization consistent with Step 14 (fit on full train)",
            "Added alignment assertions (prevent order mismatch)",
            "Probability column order assertions for RF/KNN",
            "BF16 logits cast to float32 before NumPy",
        ],
    }

    # (model key, test accuracy, test macro-F1, prefix of the summary line).
    # Scores are None for models that were not trained in this fold.
    model_scores = (
        ("rf", rf_acc, rf_f1, "  RF:         "),
        ("knn", knn_acc, knn_f1, "  KNN:        "),
        ("inception", inception_acc, inception_f1, "  Inception:  "),
    )

    for m_key, m_acc, m_f1, _ in model_scores:
        if m_key not in best_params:
            continue
        final_results["models"][m_key] = {
            "params": best_params[m_key]["params"],
            "test_accuracy": float(m_acc),
            "test_f1": float(m_f1),
        }

    # The same results are written twice: YAML and JSON.
    yaml_path = predictions_dir / "final_results.yaml"
    with open(yaml_path, "w", encoding="utf-8") as f:
        yaml.dump(final_results, f, default_flow_style=False, allow_unicode=True)

    json_path = predictions_dir / "final_results.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(final_results, f, indent=2)

    print(f"✓ Saved: {yaml_path}")
    print(f"✓ Saved: {json_path}")

    # Fold-level summary
    fold_summaries.append(dict(
        fold_id=FOLD_ID,
        fold_tag=fold_tag,
        rf_acc=rf_acc,
        rf_f1=rf_f1,
        knn_acc=knn_acc,
        knn_f1=knn_f1,
        inception_acc=inception_acc,
        inception_f1=inception_f1,
    ))

    print("\n" + fold_banner)
    print(f"Fold {fold_tag} summary")
    print(fold_banner)
    for _, m_acc, m_f1, m_prefix in model_scores:
        if m_acc is not None:
            print(f"{m_prefix}Acc={m_acc:.4f}, F1={m_f1:.4f}")
    print(f"  Predictions dir: {predictions_dir}/")

# ========== Global summary across folds ==========
print("\n" + "=" * 60)
print("Step 15 complete — Training & Inference for all folds")
print("=" * 60)

if not fold_summaries:
    print("No folds were processed. Please check previous steps and tuning files.")
else:
    df_summary = pd.DataFrame(fold_summaries).sort_values("fold_id")
    print("\nPer-fold test performance:")
    print(df_summary)

    # Deep model summary if available (at least one fold with a score)
    if df_summary["inception_acc"].notna().any():
        print("\nInceptionTime cross-fold mean ± std:")
        for metric_label, column in (("Acc:", "inception_acc"), ("F1 :", "inception_f1")):
            scores = df_summary[column]
            print(f"  {metric_label} {scores.mean():.4f} ± {scores.std():.4f}")

for closing_line in (
    "\nNext steps:",
    "  - Use predictions/*/ to build confusion matrices and calibration plots",
    "  - Aggregate per-window predictions to subject/session-level metrics if needed",
    "  - Feed final_results.json into Step 16 / evaluation scripts",
    "=" * 60,
):
    print(closing_line)

Step 15: Training & Inference (multi-fold)

Device: cuda
BF16 mixed precision: True
Inference batch size: 128
Num workers: 4

Outer folds detected: [0, 1, 2, 3, 4, 5, 6, 7]

Outer fold: FOLD_ID=0 (fold_00)

1. Load best hyperparameters
✓ RF: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 5}
✓ KNN: {'n_neighbors': 7, 'weights': 'distance', 'metric': 'manhattan'}
✓ InceptionTime: {'learning_rate': 0.001, 'n_filters': 32, 'depth': 6}

2. Load data and label mapping
#classes: 6
Class mapping: [1, 2, 3, 4, 5, 6] -> [0, 1, 2, 3, 4, 5]

Deep-model data (fold_00):
  Train: (4965, 150, 8)
  Test:  (766, 150, 8)

Feature-model data (fold_00):
  Train: (4965, 220)
  Test:  (766, 220)

✓ Loaded feature scaler: data/lara/mbientlab/proc/features/fold_00/scaler.pkl
  Features standardized (consistent with Step 14)

3. Train RandomForest (full training set)
Hyperparameters: {'n_estimators': 200, 'max_depth': 30, 'min_samples_split': 5}

Inferencing (RF)...

✓ RF training complete (fold_00

In [12]:
#!/usr/bin/env python3
"""
Step 16: Evaluation (Primary: Macro-F1, multi-fold)
Compute Macro-F1, Balanced Acc, Per-class F1, Confusion Matrix
BCa bootstrap 95% CI (1,000 iterations, window-level sampling)
For all outer folds found under predictions/fold_xx/.
"""

import numpy as np
import pandas as pd
from pathlib import Path
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    f1_score, balanced_accuracy_score, confusion_matrix
)
from scipy import stats

# ========== Config ==========
RANDOM_SEED = 42
N_BOOTSTRAP = 1000

banner = "=" * 60
print(banner)
print("Step 16: Evaluation (Primary: Macro-F1, multi-fold)")
print(banner)

# Path config
predictions_dir = Path("predictions")
metrics_dir = Path("metrics")
metrics_dir.mkdir(parents=True, exist_ok=True)

# Seeds NumPy's global RNG, which the bootstrap resampling draws from.
np.random.seed(RANDOM_SEED)

# Detect available folds from predictions directory
fold_dirs = sorted(filter(Path.is_dir, predictions_dir.glob("fold_*")))

if not fold_dirs:
    raise RuntimeError("No prediction folders found under 'predictions/fold_*'.")

# Keep (numeric id, directory name) pairs; directories whose name does not
# carry an integer after the first underscore are ignored.
outer_folds = []
for d in fold_dirs:
    tag = d.name  # e.g., "fold_00"
    try:
        fid = int(tag.split("_")[1])
    except (IndexError, ValueError):
        pass
    else:
        outer_folds.append((fid, tag))

outer_folds.sort(key=lambda pair: pair[0])
print(f"\nDetected folds from predictions/: {[tag for _, tag in outer_folds]}")
print(f"Bootstrap: {N_BOOTSTRAP} iterations, window-level sampling")

# ========== 1. Helper functions ==========
def mean_ci_t(values, alpha=0.05):
    """
    Mean of ``values`` with a two-sided Student-t confidence interval.

    Used for fold-level scores, where the number of observations is small.

    Args:
        values: Sequence of numbers (converted to a float array).
        alpha: Significance level; the interval has coverage ``1 - alpha``.

    Returns: (mean, ci_lower, ci_upper). Both bounds are None when fewer
        than two values are supplied, because the sample standard deviation
        (ddof=1) is undefined in that case.
    """
    sample = np.asarray(values, dtype=float)
    count = len(sample)
    center = float(np.mean(sample))
    if count < 2:
        return center, None, None

    # Standard error of the mean from the unbiased sample std.
    std_err = float(np.std(sample, ddof=1)) / np.sqrt(count)
    # Half-width = t critical value (count - 1 degrees of freedom) * SE.
    half_width = stats.t.ppf(1 - alpha / 2, df=count - 1) * std_err
    return center, center - half_width, center + half_width

# ========== 2. BCa Bootstrap function ==========
def bootstrap_ci(y_true, y_pred, metric_func, n_bootstrap=1000, alpha=0.05, rng=None):
    """
    BCa (bias-corrected and accelerated) bootstrap confidence interval.

    Rows of ``y_true`` / ``y_pred`` are resampled jointly with replacement
    (window-level sampling), ``metric_func`` is evaluated on every resample,
    and the percentile cut-offs are adjusted with the bias-correction ``z0``
    and the jackknife acceleration ``a``.

    Args:
        y_true: Ground-truth labels, one entry per sample.
        y_pred: Predicted labels, same length as ``y_true``.
        metric_func: Callable ``metric_func(y_true, y_pred) -> float``.
        n_bootstrap: Number of bootstrap resamples (>= 1).
        alpha: Significance level; the interval has nominal coverage
            ``1 - alpha``.
        rng: Source of randomness. ``None`` (default) uses NumPy's global
            ``np.random`` state, i.e. the stream controlled by
            ``np.random.seed``. An object exposing ``choice`` (a
            ``Generator`` or ``RandomState``) is used as-is; anything else
            (e.g. an int seed) is passed to ``np.random.default_rng``.

    Returns: (score, lower, upper) where ``score`` is ``metric_func`` on the
        full, un-resampled data (not the mean of the bootstrap scores).

    Raises:
        ValueError: If the inputs are empty, differ in length, or
            ``n_bootstrap`` is smaller than 1.

    Note:
        The jackknife step calls ``metric_func`` once per sample, so the
        total number of metric evaluations is ``n_bootstrap + n_samples + 1``.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    n_samples = len(y_true)

    if n_samples == 0:
        raise ValueError("bootstrap_ci requires at least one sample.")
    if len(y_pred) != n_samples:
        # Without this check a longer y_pred would be silently truncated by
        # the resampling indices.
        raise ValueError(
            "y_true and y_pred must have the same length "
            f"(got {n_samples} and {len(y_pred)})."
        )
    if n_bootstrap < 1:
        raise ValueError("n_bootstrap must be >= 1.")

    # Pick the sampler; the default keeps using the global NumPy stream.
    if rng is None:
        sampler = np.random
    elif hasattr(rng, "choice"):
        sampler = rng
    else:
        sampler = np.random.default_rng(rng)

    # Bootstrap sampling
    bootstrap_scores = []
    for _ in range(n_bootstrap):
        indices = sampler.choice(n_samples, size=n_samples, replace=True)
        bootstrap_scores.append(metric_func(y_true[indices], y_pred[indices]))
    bootstrap_scores = np.array(bootstrap_scores)

    # Original score
    original_score = metric_func(y_true, y_pred)

    # BCa correction
    # 1. bias correction (z0): share of resamples scoring below the original;
    #    clipped so the normal quantile stays finite.
    p_less = np.sum(bootstrap_scores < original_score) / n_bootstrap
    z0 = stats.norm.ppf(max(min(p_less, 0.9999), 0.0001))

    # 2. acceleration (a) - jackknife (leave-one-out); the mask is reused
    #    across iterations instead of being reallocated each time.
    keep = np.ones(n_samples, dtype=bool)
    jackknife_scores = []
    for i in range(n_samples):
        keep[i] = False
        jackknife_scores.append(metric_func(y_true[keep], y_pred[keep]))
        keep[i] = True

    jackknife_scores = np.array(jackknife_scores)
    deviations = jackknife_scores.mean() - jackknife_scores
    numerator = np.sum(deviations ** 3)
    denominator = 6 * (np.sum(deviations ** 2) ** 1.5)
    # All leave-one-out scores equal -> no skew information, use a = 0.
    a = numerator / denominator if denominator != 0 else 0.0

    # 3. adjusted percentiles
    z_alpha_lower = stats.norm.ppf(alpha / 2)
    z_alpha_upper = stats.norm.ppf(1 - alpha / 2)

    p_lower = stats.norm.cdf(z0 + (z0 + z_alpha_lower) / (1 - a * (z0 + z_alpha_lower)))
    p_upper = stats.norm.cdf(z0 + (z0 + z_alpha_upper) / (1 - a * (z0 + z_alpha_upper)))

    # Keep the cut-offs strictly inside (0, 1) before converting to percent.
    p_lower = max(min(p_lower, 0.9999), 0.0001)
    p_upper = max(min(p_upper, 0.9999), 0.0001)

    lower = np.percentile(bootstrap_scores, p_lower * 100)
    upper = np.percentile(bootstrap_scores, p_upper * 100)

    return original_score, lower, upper

# ========== 3. Evaluate a single model for a single fold ==========
def evaluate_model(pred_file, model_name, fold_id, fold_tag):
    """Evaluate a single model for a single fold.

    Loads the prediction table from ``pred_file``, prints window-level and
    subject-level metrics, saves a confusion-matrix PNG into the module-level
    ``metrics_dir`` and returns every number as a JSON-serialisable dict.

    Args:
        pred_file: Parquet file with the columns ``true_label``,
            ``pred_label`` and ``subject_id`` (one row per window). Labels
            must be convertible with ``int()``.
        model_name: Display name; its lower-cased form is part of the PNG
            file name.
        fold_id: Numeric fold id, used in log lines and the plot title.
        fold_tag: Fold name such as "fold_00", used in output file names.

    Returns:
        dict with the keys ``model``, ``fold_id``, ``fold_tag``,
        ``n_windows``, ``n_classes``, ``classes``, ``window_level``,
        ``subject_level``, ``confusion_matrix`` and ``bootstrap``.

    Raises:
        KeyError: If a required column is missing. This is checked up front
            so the failure happens before the bootstrap is run.
    """
    print(f"\n{'=' * 60}")
    print(f"Evaluate {model_name} (fold {fold_id})")
    print(f"{'=' * 60}")

    # Load predictions
    df_pred = pd.read_parquet(pred_file)
    print(f"Loaded predictions: {len(df_pred)} windows")

    # Fail fast: "subject_id" is only needed in section 3.4, after both
    # bootstraps have already been computed.
    missing_columns = [
        column
        for column in ("true_label", "pred_label", "subject_id")
        if column not in df_pred.columns
    ]
    if missing_columns:
        raise KeyError(f"{pred_file}: missing required column(s) {missing_columns}")

    # Extract ground truth and predictions
    y_true = df_pred["true_label"].values
    y_pred = df_pred["pred_label"].values

    # Unique classes (union of true and predicted), as native Python values
    # so the log shows "[1, 2, ...]" instead of NumPy scalar reprs.
    unique_labels = [
        label.item() if isinstance(label, np.generic) else label
        for label in sorted(set(y_true) | set(y_pred))
    ]
    n_classes = len(unique_labels)
    print(f"#classes: {n_classes}")
    print(f"classes: {unique_labels}")

    def macro_f1_func(yt, yp):
        """Macro-F1 over the fixed label set of this fold."""
        return f1_score(
            yt, yp, average="macro", labels=unique_labels, zero_division=0
        )

    def bal_acc_func(yt, yp):
        """Balanced accuracy with scikit-learn defaults."""
        return balanced_accuracy_score(yt, yp)

    # ========== 3.1 Basic metrics ==========
    print("\nBasic metrics:")

    # Macro-F1
    macro_f1 = macro_f1_func(y_true, y_pred)
    print(f"  Macro-F1: {macro_f1:.4f}")

    # Balanced Accuracy
    balanced_acc = bal_acc_func(y_true, y_pred)
    print(f"  Balanced Acc: {balanced_acc:.4f}")

    # Per-class F1 (same label order as unique_labels)
    per_class_f1 = f1_score(
        y_true, y_pred,
        average=None, labels=unique_labels, zero_division=0
    )
    print("\n  Per-class F1:")
    for label, f1_val in zip(unique_labels, per_class_f1):
        print(f"    Class {label}: {f1_val:.4f}")

    # ========== 3.2 Bootstrap 95% CI ==========
    print(f"\nBootstrap 95% CI ({N_BOOTSTRAP} iterations):")

    macro_f1_mean, macro_f1_lower, macro_f1_upper = bootstrap_ci(
        y_true, y_pred, macro_f1_func, N_BOOTSTRAP
    )
    print(f"  Macro-F1: {macro_f1_mean:.4f} [{macro_f1_lower:.4f}, {macro_f1_upper:.4f}]")

    bal_acc_mean, bal_acc_lower, bal_acc_upper = bootstrap_ci(
        y_true, y_pred, bal_acc_func, N_BOOTSTRAP
    )
    print(f"  Balanced Acc: {bal_acc_mean:.4f} [{bal_acc_lower:.4f}, {bal_acc_upper:.4f}]")

    # ========== 3.3 Confusion matrix ==========
    cm = confusion_matrix(y_true, y_pred, labels=unique_labels)
    cm_file = metrics_dir / f"confusion_{model_name.lower()}_{fold_tag}.png"

    fig = plt.figure(figsize=(10, 8))
    try:
        sns.heatmap(
            cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=unique_labels, yticklabels=unique_labels
        )
        plt.title(f"{model_name} - Confusion Matrix (Fold {fold_id})")
        plt.ylabel("True Label")
        plt.xlabel("Predicted Label")
        plt.tight_layout()
        plt.savefig(cm_file, dpi=150, bbox_inches="tight")
    finally:
        # Release the figure even if plotting or saving fails.
        plt.close(fig)
    print(f"\n✓ Saved confusion matrix: {cm_file.name}")

    # ========== 3.4 Subject-level Macro-F1 ==========
    print("\nSubject-level evaluation:")
    subject_f1_scores = []

    for subject_id in df_pred["subject_id"].unique():
        mask = df_pred["subject_id"] == subject_id
        y_true_subj = df_pred.loc[mask, "true_label"].values
        y_pred_subj = df_pred.loc[mask, "pred_label"].values

        if len(y_true_subj) > 0:
            # Scored against the fold-wide label set, so a class absent for
            # this subject contributes an F1 of 0 (zero_division=0).
            subject_f1_scores.append(macro_f1_func(y_true_subj, y_pred_subj))

    # np.std default (ddof=0), i.e. the population standard deviation.
    subject_macro_f1_mean = float(np.mean(subject_f1_scores)) if subject_f1_scores else 0.0
    subject_macro_f1_std = float(np.std(subject_f1_scores)) if subject_f1_scores else 0.0
    print(f"  Subject-level Macro-F1: {subject_macro_f1_mean:.4f} ± {subject_macro_f1_std:.4f}")
    print(f"  #test subjects: {len(subject_f1_scores)}")

    # ========== 3.5 Aggregate results ==========
    results = {
        "model": model_name,
        "fold_id": fold_id,
        "fold_tag": fold_tag,
        "n_windows": int(len(df_pred)),
        "n_classes": n_classes,
        "classes": [int(c) for c in unique_labels],
        "window_level": {
            "macro_f1": {
                "value": float(macro_f1),
                "ci_lower": float(macro_f1_lower),
                "ci_upper": float(macro_f1_upper),
            },
            "balanced_accuracy": {
                "value": float(balanced_acc),
                "ci_lower": float(bal_acc_lower),
                "ci_upper": float(bal_acc_upper),
            },
            "per_class_f1": {
                int(label): float(f1_val)
                for label, f1_val in zip(unique_labels, per_class_f1)
            },
        },
        "subject_level": {
            "macro_f1_mean": subject_macro_f1_mean,
            "macro_f1_std": subject_macro_f1_std,
            "n_subjects": len(subject_f1_scores),
        },
        "confusion_matrix": cm.tolist(),
        "bootstrap": {
            "n_iterations": N_BOOTSTRAP,
            "method": "BCa",
            "sampling": "window-level with replacement",
        },
    }

    return results

# ========== 4. Evaluate all models for each fold ==========
def _value_with_ci(metric):
    """Render a {'value', 'ci_lower', 'ci_upper'} entry as 'v [lo, hi]' (4 decimals)."""
    return (
        f"{metric['value']:.4f} "
        f"[{metric['ci_lower']:.4f}, {metric['ci_upper']:.4f}]"
    )


def _summary_row(model_result):
    """Build the human-readable summary-table entry for one model's results."""
    window = model_result["window_level"]
    subject = model_result["subject_level"]
    return {
        "window_macro_f1": _value_with_ci(window["macro_f1"]),
        "window_balanced_acc": _value_with_ci(window["balanced_accuracy"]),
        "subject_macro_f1": (
            f"{subject['macro_f1_mean']:.4f} ± {subject['macro_f1_std']:.4f}"
        ),
    }


all_results_all_folds = {}  # {fold_tag: {model_name: results}}

for fold_id, fold_tag in outer_folds:
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Evaluate models for fold {fold_id} ({fold_tag})")
    print(rule)

    # Expected prediction files for this fold, keyed by display name.
    pred_fold_dir = predictions_dir / fold_tag
    models_to_eval = {
        "RF": pred_fold_dir / "rf_predictions.parquet",
        "KNN": pred_fold_dir / "knn_predictions.parquet",
        "InceptionTime": pred_fold_dir / "inception_predictions.parquet",
    }

    fold_results = {}
    for model_name, pred_file in models_to_eval.items():
        if pred_file.exists():
            results = evaluate_model(pred_file, model_name, fold_id, fold_tag)
            fold_results[model_name] = results

            # Persist this model's metrics next to the fold summary.
            model_metrics_file = metrics_dir / f"{model_name.lower()}_{fold_tag}.json"
            with open(model_metrics_file, "w") as f:
                json.dump(results, f, indent=2)
            print(f"✓ Saved metrics: {model_metrics_file.name}")
        else:
            # Models without a prediction file are reported and left out.
            print(f"\n⚠️ Skip {model_name} on {fold_tag}: file not found {pred_file}")

    all_results_all_folds[fold_tag] = fold_results

    # Fold summary: raw results plus one formatted table row per model.
    summary = {
        "fold_id": fold_id,
        "fold_tag": fold_tag,
        "models": list(fold_results),
        "results": fold_results,
        "summary_table": {
            name: _summary_row(result) for name, result in fold_results.items()
        },
    }

    summary_file = metrics_dir / f"summary_{fold_tag}.json"
    with open(summary_file, "w") as f:
        json.dump(summary, f, indent=2)
    print(f"✓ Saved summary: {summary_file.name}")

# ========== 5. Cross-fold aggregation ==========
print(f"\n{'=' * 60}")
print("Cross-fold aggregation")
print(f"{'=' * 60}")


def _format_mean_ci(mean, lower, upper):
    """Render 'mean [lower, upper]' with 4 decimals, or only the mean when no CI exists."""
    if lower is None:
        return f"{mean:.4f}"
    return f"{mean:.4f} [{lower:.4f}, {upper:.4f}]"


# Collect per-fold values. Only the summaries of folds detected in this run
# (outer_folds) are read, so stale summary_fold_*.json files left in metrics/
# by earlier runs cannot leak into the aggregate.
all_folds_results = defaultdict(lambda: defaultdict(list))
contributing_folds = []  # tags of folds that supplied at least one model result

for _, evaluated_tag in outer_folds:
    fold_file = metrics_dir / f"summary_{evaluated_tag}.json"
    if not fold_file.exists():
        continue
    with open(fold_file, "r") as f:
        fold_data = json.load(f)

    if fold_data["results"]:
        contributing_folds.append(evaluated_tag)

    for model_name, model_results in fold_data["results"].items():
        wl = model_results["window_level"]
        sl = model_results["subject_level"]

        all_folds_results[model_name]["window_macro_f1"].append(
            wl["macro_f1"]["value"]
        )
        all_folds_results[model_name]["balanced_acc"].append(
            wl["balanced_accuracy"]["value"]
        )
        all_folds_results[model_name]["subject_macro_f1"].append(
            sl["macro_f1_mean"]
        )

if all_folds_results:
    # The header counts folds that actually contributed results; the
    # per-model fold count is stored separately as "n_folds".
    print(
        f"\nCross-fold summary "
        f"({len(contributing_folds)} folds):"
    )
    print(
        f"\n{'Model':<15} {'Window Macro-F1 [95% CI]':<35} "
        f"{'Subject Macro-F1 [95% CI]':<35} {'Balanced Acc [95% CI]':<35}"
    )
    print("-" * 120)

    cross_fold_summary = {}
    for model_name, metrics in all_folds_results.items():
        # mean_ci_t returns None bounds when a model has a single fold.
        w_mean, w_lo, w_hi = mean_ci_t(metrics["window_macro_f1"])
        s_mean, s_lo, s_hi = mean_ci_t(metrics["subject_macro_f1"])
        b_mean, b_lo, b_hi = mean_ci_t(metrics["balanced_acc"])

        w_str = _format_mean_ci(w_mean, w_lo, w_hi)
        s_str = _format_mean_ci(s_mean, s_lo, s_hi)
        b_str = _format_mean_ci(b_mean, b_lo, b_hi)

        print(f"{model_name:<15} {w_str:<35} {s_str:<35} {b_str:<35}")

        cross_fold_summary[model_name] = {
            "window_macro_f1": {
                "mean": w_mean,
                "ci_lower": w_lo,
                "ci_upper": w_hi,
            },
            "subject_macro_f1": {
                "mean": s_mean,
                "ci_lower": s_lo,
                "ci_upper": s_hi,
            },
            "balanced_acc": {
                "mean": b_mean,
                "ci_lower": b_lo,
                "ci_upper": b_hi,
            },
            "n_folds": len(metrics["window_macro_f1"]),
            "ci_method": "t-interval (fold-level)",
        }

    cross_fold_file = metrics_dir / "cross_fold_summary.json"
    with open(cross_fold_file, "w") as f:
        json.dump(cross_fold_summary, f, indent=2)
    print(f"\n✓ Saved cross-fold summary: {cross_fold_file.name}")
else:
    # Reached only when no evaluated fold produced any model result.
    print("\nNo fold results available; run prediction for at least one fold to produce cross-fold summary.")

# ========== 6. Final summary ==========
closing_rule = "=" * 60
print("\n" + closing_rule)
print("Step 16 complete — Evaluation for all folds")
print(closing_rule + "\n")

evaluated_tags = [fold_name for _, fold_name in outer_folds]
print(f"Evaluated folds: {evaluated_tags}")
print(f"Bootstrap: {N_BOOTSTRAP} iterations (BCa)")

print("\nOutput files:")
print(f"  {metrics_dir}/")
for _, fold_tag in outer_folds:
    # Per fold: each model's metrics JSON, then its confusion-matrix PNG,
    # and finally the fold summary. Only files that exist are listed.
    expected_files = []
    for model_name in ("RF", "KNN", "InceptionTime"):
        stem = model_name.lower()
        expected_files.append(metrics_dir / f"{stem}_{fold_tag}.json")
        expected_files.append(metrics_dir / f"confusion_{stem}_{fold_tag}.png")
    expected_files.append(metrics_dir / f"summary_{fold_tag}.json")

    for expected in expected_files:
        if expected.exists():
            print(f"    - {expected.name}")
if (metrics_dir / "cross_fold_summary.json").exists():
    print("    - cross_fold_summary.json")

# Static recap of what was computed and suggested follow-ups.
for recap_line in (
    "\nEvaluation metrics:",
    "  ✓ Window-level Macro-F1 (95% CI - BCa bootstrap)",
    "  ✓ Balanced Accuracy (95% CI - BCa bootstrap)",
    "  ✓ Per-class F1",
    "  ✓ Subject-level Macro-F1 (mean ± std)",
    "  ✓ Confusion Matrix",
    "  ✓ Cross-fold means (95% CI - t-distribution)",
    "\nNext steps:",
    "  - Use cross_fold_summary.json in your report/tables",
    "  - Generate paper-ready figures (ROC/PR curves, etc.)",
):
    print(recap_line)
print(closing_rule)

Step 16: Evaluation (Primary: Macro-F1, multi-fold)

Detected folds from predictions/: ['fold_00', 'fold_01', 'fold_02', 'fold_03', 'fold_04', 'fold_05', 'fold_06', 'fold_07']
Bootstrap: 1000 iterations, window-level sampling

Evaluate models for fold 0 (fold_00)

Evaluate RF (fold 0)
Loaded predictions: 766 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.4154
  Balanced Acc: 0.4269

  Per-class F1:
    Class 1: 0.0588
    Class 2: 0.8393
    Class 3: 0.0000
    Class 4: 0.7742
    Class 5: 0.7223
    Class 6: 0.0976

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.4154 [0.3790, 0.4506]
  Balanced Acc: 0.4269 [0.3853, 0.4621]

✓ Saved confusion matrix: confusion_rf_fold_00.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.4154 ± 0.0000
  #test subjects: 1
✓ Saved metrics: rf_fold_00.json

Evaluate KNN (fold 0)
Loaded predictions: 766 windows
#classes: 6
classes: [np.int64(1), np.int64(2



  Balanced Acc: 0.3325 [0.2936, 0.3962]

✓ Saved confusion matrix: confusion_rf_fold_01.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.3355 ± 0.0000
  #test subjects: 1
✓ Saved metrics: rf_fold_01.json

Evaluate KNN (fold 1)
Loaded predictions: 659 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.3615
  Balanced Acc: 0.3534

  Per-class F1:
    Class 1: 0.4873
    Class 2: 0.7725
    Class 3: 0.0877
    Class 4: 0.0000
    Class 5: 0.6834
    Class 6: 0.1379

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.3615 [0.3293, 0.3976]




  Balanced Acc: 0.3534 [0.3044, 0.4214]

✓ Saved confusion matrix: confusion_knn_fold_01.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.3615 ± 0.0000
  #test subjects: 1
✓ Saved metrics: knn_fold_01.json

Evaluate InceptionTime (fold 1)
Loaded predictions: 659 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.3816
  Balanced Acc: 0.3791

  Per-class F1:
    Class 1: 0.4027
    Class 2: 0.7717
    Class 3: 0.0396
    Class 4: 0.0000
    Class 5: 0.7374
    Class 6: 0.3380

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.3816 [0.3503, 0.4217]




  Balanced Acc: 0.3791 [0.3310, 0.4482]

✓ Saved confusion matrix: confusion_inceptiontime_fold_01.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.3816 ± 0.0000
  #test subjects: 1
✓ Saved metrics: inceptiontime_fold_01.json
✓ Saved summary: summary_fold_01.json

Evaluate models for fold 2 (fold_02)

Evaluate RF (fold 2)
Loaded predictions: 771 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.4687
  Balanced Acc: 0.5216

  Per-class F1:
    Class 1: 0.4976
    Class 2: 0.7651
    Class 3: 0.0000
    Class 4: 0.6667
    Class 5: 0.8137
    Class 6: 0.0690

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.4687 [0.4122, 0.5187]




  Balanced Acc: 0.5216 [0.5000, 0.5513]

✓ Saved confusion matrix: confusion_rf_fold_02.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.4687 ± 0.0000
  #test subjects: 1
✓ Saved metrics: rf_fold_02.json

Evaluate KNN (fold 2)
Loaded predictions: 771 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.4824
  Balanced Acc: 0.5243

  Per-class F1:
    Class 1: 0.4465
    Class 2: 0.7205
    Class 3: 0.1263
    Class 4: 0.7500
    Class 5: 0.7624
    Class 6: 0.0889

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.4824 [0.4254, 0.5311]




  Balanced Acc: 0.5243 [0.4951, 0.5595]

✓ Saved confusion matrix: confusion_knn_fold_02.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.4824 ± 0.0000
  #test subjects: 1
✓ Saved metrics: knn_fold_02.json

Evaluate InceptionTime (fold 2)
Loaded predictions: 771 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.5265
  Balanced Acc: 0.5860

  Per-class F1:
    Class 1: 0.5200
    Class 2: 0.7179
    Class 3: 0.1067
    Class 4: 0.8571
    Class 5: 0.6785
    Class 6: 0.2785

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.5265 [0.4679, 0.5676]




  Balanced Acc: 0.5860 [0.5436, 0.6288]

✓ Saved confusion matrix: confusion_inceptiontime_fold_02.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.5265 ± 0.0000
  #test subjects: 1
✓ Saved metrics: inceptiontime_fold_02.json
✓ Saved summary: summary_fold_02.json

Evaluate models for fold 3 (fold_03)

Evaluate RF (fold 3)
Loaded predictions: 873 windows
#classes: 6
classes: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

Basic metrics:
  Macro-F1: 0.4685
  Balanced Acc: 0.4769

  Per-class F1:
    Class 1: 0.4526
    Class 2: 0.8256
    Class 3: 0.0000
    Class 4: 0.8421
    Class 5: 0.6907
    Class 6: 0.0000

Bootstrap 95% CI (1000 iterations):
  Macro-F1: 0.4685 [0.4199, 0.4983]
  Balanced Acc: 0.4769 [0.4078, 0.5156]

✓ Saved confusion matrix: confusion_rf_fold_03.png

Subject-level evaluation:
  Subject-level Macro-F1: 0.4685 ± 0.0000
  #test subjects: 1
✓ Saved metrics: rf_fold_03.json

Evaluate KNN (fold 3)
Loaded predictions: 873 windo