# Run Pipeline

Main entry point for the pipeline. Configure the variables in the first code cell, then run the notebook to generate the bash script `scripts/<RUN_NAME>/run_all.sh` and, optionally, execute it.

In [23]:
from pathlib import Path
import json
import subprocess

def find_repo_root(start: "Path | str | None" = None, max_depth: int = 6) -> Path:
    """Locate the repository root by walking up from a starting directory.

    A directory counts as the repository root when it contains both a
    ``README.md`` entry and a ``src`` entry.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.
        max_depth: Maximum number of directories to inspect, counting
            ``start`` itself and then each successive parent.

    Returns:
        The first matching directory, or the resolved starting directory
        when no match is found within ``max_depth`` levels (best-effort
        fallback rather than an error).
    """
    origin = (Path() if start is None else Path(start)).resolve()
    candidate = origin
    for _ in range(max_depth):
        if (candidate / 'README.md').exists() and (candidate / 'src').exists():
            return candidate
        if candidate.parent == candidate:
            # Reached the filesystem root; climbing further would only
            # re-check the same directory.
            break
        candidate = candidate.parent
    # No marker found: fall back to where the search started.
    return origin

# --- Run identity and locations ---
# Repository root as located by find_repo_root(); all paths below hang off it.
PROJECT_ROOT = find_repo_root()
# Names the per-run directories (res/experiments/<RUN_NAME>, scripts/<RUN_NAME>)
# and is forwarded to both commands as --run_name.
RUN_NAME = "demo_run"
# Forwarded as --data_root to both the feature-build and training commands.
DATA_ROOT = str(PROJECT_ROOT / "data" / "project_5year")
# --- Training options (forwarded to src.train.run_experiment) ---
SPLIT_MODE = "simple"  # or "forward"; forwarded as --split_mode
LABEL_MODE = "raw"  # raw | winsor_csz | neu_winsor_csz; forwarded as --label_mode
# Boolean switches: the matching flag is added to the command only when True.
WINSORIZE_BY_DATE = True  # --winsorize_by_date
ZSCORE_BY_DATE = False  # --zscore_by_date
# Always forwarded as --q_low / --q_high / --min_n / --eps.
# NOTE(review): presumably winsorization quantile bounds, a minimum sample
# count and a numerical-stability epsilon -- confirm in src.train.run_experiment.
Q_LOW = 0.01
Q_HIGH = 0.99
MIN_N = 50
EPS = 1e-12
POSTPROCESS_PIPELINE = "none"  # or neutral_then_z / z_then_neutral; forwarded as --postprocess_pipeline
USE_PRED_Z = False  # adds --use_pred_z when True
USE_NEUTRALIZE = False  # adds --use_neutralize when True
# Joined with commas and forwarded as a single --models value.
MODELS_TO_RUN = ["lgbm","ridge","elasticnet","rf","extra_trees","torch_mlp","catboost","xgb"]
# Forwarded as --n_workers to src.features.build_features.
N_WORKERS = 4
# --- Stage toggles: which commands end up in the generated script ---
DO_BUILD_FEATURES = True  # include the feature-build command
DO_RUN_TRAINING = True  # include the training command
SAVE_PREDS = False  # adds --save_preds when True
USE_FEAT = True  # adds --use_feat when True
# --- Resources ---
GPU_ID = 2  # default A100; set None for CPU (None omits --gpu_id)
SAMPLE_DAYS_PER_YEAR = 20  # set 0 for full data (0 omits --sample_days_per_year)
# Forwarded as --parallel_models / --n_jobs; a falsy value (0 or None) omits the flag.
PARALLEL_MODELS = 2
N_JOBS_PER_MODEL = 16

In [24]:
# Per-run locations: experiment outputs go under res/experiments/<RUN_NAME>,
# the generated shell script under scripts/<RUN_NAME>.
run_dir = PROJECT_ROOT.joinpath('res', 'experiments', RUN_NAME)
script_dir = PROJECT_ROOT.joinpath('scripts', RUN_NAME)

# Create both directories (and any missing parents); existing ones are kept.
run_dir.mkdir(parents=True, exist_ok=True)
script_dir.mkdir(parents=True, exist_ok=True)


In [25]:
import shlex  # cell-local import: only this cell needs shell quoting


def _shell_join(args) -> str:
    """Render *args* as one shell command line with every token quoted.

    Quoting keeps paths or names containing spaces or shell metacharacters
    from being split or interpreted by bash in the generated script.
    """
    return " ".join(shlex.quote(str(arg)) for arg in args)


# One entry per pipeline command, in execution order.
cmds = []

if DO_BUILD_FEATURES:
    cmds.append(_shell_join([
        "python", "-m", "src.features.build_features",
        "--data_root", DATA_ROOT,
        "--n_workers", N_WORKERS,
        "--run_name", RUN_NAME,
    ]))

if DO_RUN_TRAINING:
    models = ",".join(MODELS_TO_RUN)
    train_args = [
        "python", "-m", "src.train.run_experiment",
        "--data_root", DATA_ROOT,
        "--run_name", RUN_NAME,
        "--split_mode", SPLIT_MODE,
        "--models", models,
    ]
    # Optional flags keep the original ordering; boolean switches are added
    # only when enabled, valued options only when they are set.
    if USE_FEAT:
        train_args.append("--use_feat")
    if SAVE_PREDS:
        train_args.append("--save_preds")
    if GPU_ID is not None:  # GPU 0 is a valid id, so test against None explicitly
        train_args += ["--gpu_id", GPU_ID]
    if SAMPLE_DAYS_PER_YEAR:
        train_args += ["--sample_days_per_year", SAMPLE_DAYS_PER_YEAR]
    if PARALLEL_MODELS:
        train_args += ["--parallel_models", PARALLEL_MODELS]
    if N_JOBS_PER_MODEL:
        train_args += ["--n_jobs", N_JOBS_PER_MODEL]
    train_args += ["--label_mode", LABEL_MODE]
    if WINSORIZE_BY_DATE:
        train_args.append("--winsorize_by_date")
    if ZSCORE_BY_DATE:
        train_args.append("--zscore_by_date")
    train_args += [
        "--q_low", Q_LOW,
        "--q_high", Q_HIGH,
        "--min_n", MIN_N,
        "--eps", EPS,
        "--postprocess_pipeline", POSTPROCESS_PIPELINE,
    ]
    if USE_PRED_Z:
        train_args.append("--use_pred_z")
    if USE_NEUTRALIZE:
        train_args.append("--use_neutralize")
    cmd = _shell_join(train_args)
    cmds.append(cmd)

script_path = script_dir / "run_all.sh"
script_header = [
    "#!/usr/bin/env bash",
    "set -euo pipefail",
    # `python -m src...` resolves the `src` package relative to the working
    # directory, so run from the repo root regardless of the caller's cwd.
    _shell_join(["cd", PROJECT_ROOT]),
]
content = "\n".join(script_header + cmds) + "\n"
script_path.write_text(content, encoding="utf-8")
script_path.chmod(0o755)  # make the script directly executable
script_path


PosixPath('/data1/hyzhang/Projects/intreviews_2026/quant_yanfu/scripts/demo_run/run_all.sh')

In [26]:
# Optional: execute the script
# subprocess.run([str(script_path)], check=True)

In [None]:
# Show this run's metrics table, if a previous execution has written one.
import pandas as pd

metrics_path = run_dir.joinpath('metrics.csv')
if metrics_path.exists():
    # `display` is supplied by the notebook (IPython) environment.
    display(pd.read_csv(metrics_path))
