# Run Pipeline

Main entry point. Configure variables below, then run to generate and optionally execute the bash script.

In [1]:
from pathlib import Path
import json
import subprocess

# --- Run identity and data ---------------------------------------------
RUN_NAME = "demo_run"
DATA_ROOT = "data/project_5year"
SPLIT_MODE = "simple"  # or "forward"
SAMPLE_DAYS_PER_YEAR = 20  # set 0 for full data

# --- Pipeline stages to include in the generated script ------------------
DO_BUILD_FEATURES = True
DO_RUN_TRAINING = True

# --- Training options ----------------------------------------------------
MODELS_TO_RUN = [
    "lgbm",
    "ridge",
    "elasticnet",
    "rf",
    "extra_trees",
    "torch_linear",
    "torch_mlp",
]
USE_FEAT = True
SAVE_PREDS = False

# --- Compute resources ---------------------------------------------------
GPU_ID = 2  # default A100; set None for CPU
N_WORKERS = 4
PARALLEL_MODELS = 2
N_JOBS_PER_MODEL = 16

In [2]:
# Per-run directories, both keyed by RUN_NAME:
#   run_dir    -> res/experiments/<RUN_NAME>
#   script_dir -> scripts/<RUN_NAME>
run_dir = Path('res/experiments').joinpath(RUN_NAME)
script_dir = Path('scripts').joinpath(RUN_NAME)

# Create both (and any missing parents); re-running the cell is harmless.
for directory in (run_dir, script_dir):
    directory.mkdir(parents=True, exist_ok=True)


In [3]:
# Build the shell commands for the enabled pipeline stages and write them to
# an executable script at script_dir / "run_all.sh".
import shlex

# Commands are appended in execution order; each becomes one script line.
cmds = []

if DO_BUILD_FEATURES:
    cmds.append(
        "python -m src.features.build_features"
        f" --data_root {shlex.quote(str(DATA_ROOT))}"
        f" --n_workers {N_WORKERS}"
        f" --run_name {shlex.quote(str(RUN_NAME))}"
    )

if DO_RUN_TRAINING:
    models = ",".join(MODELS_TO_RUN)
    # Assemble arguments as a list so optional flags are separated by exactly
    # one space. String-valued settings are shell-quoted so that paths or run
    # names containing spaces or shell metacharacters stay a single argument.
    train_args = [
        "python -m src.train.run_experiment",
        f"--data_root {shlex.quote(str(DATA_ROOT))}",
        f"--run_name {shlex.quote(str(RUN_NAME))}",
        f"--split_mode {shlex.quote(str(SPLIT_MODE))}",
        f"--models {shlex.quote(models)}",
    ]
    if USE_FEAT:
        train_args.append("--use_feat")
    if SAVE_PREDS:
        train_args.append("--save_preds")
    # GPU 0 is a valid device id, so test against None rather than truthiness.
    if GPU_ID is not None:
        train_args.append(f"--gpu_id {GPU_ID}")
    # The remaining flags are omitted when their setting is falsy (0 / None).
    if SAMPLE_DAYS_PER_YEAR:
        train_args.append(f"--sample_days_per_year {SAMPLE_DAYS_PER_YEAR}")
    if PARALLEL_MODELS:
        train_args.append(f"--parallel_models {PARALLEL_MODELS}")
    if N_JOBS_PER_MODEL:
        train_args.append(f"--n_jobs {N_JOBS_PER_MODEL}")
    cmd = " ".join(train_args)
    cmds.append(cmd)

script_path = script_dir / "run_all.sh"
# Shebang plus `set -euo pipefail`, followed by one command per line.
# The newlines must be "\n" escapes: a plain double-quoted string literal
# cannot span physical lines.
script_header = "#!/usr/bin/env bash\nset -euo pipefail\n"
script_path.write_text(script_header + "\n".join(cmds) + "\n")
script_path.chmod(0o755)  # owner rwx, group/other rx, so it can be run directly
script_path


PosixPath('scripts/baseline_run/run_all.sh')

In [4]:
# Optional: execute the script
# subprocess.run([str(script_path)], check=True)

In [None]:
# Show the run's metrics table when run_dir contains a metrics.csv;
# when the file is absent this cell produces no output.
import pandas as pd

metrics_path = run_dir.joinpath('metrics.csv')
if metrics_path.exists():
    metrics_df = pd.read_csv(metrics_path)
    display(metrics_df)
