# Endpoint Regressor Inference

Run endpoint-regressor inference and write per-event prediction parquet output to `data/endpoint_regressor`.


In [1]:
import gc
from pathlib import Path

import pyarrow as pa
import pyarrow.parquet as pq

from pioneerml.common.zenml import load_step_output
from pioneerml.common.zenml import utils as zenml_utils
from pioneerml.pipelines.inference.endpoint_regression import endpoint_regression_inference_pipeline

# Resolve the repository root once; the data paths in later cells are built from it.
PROJECT_ROOT = zenml_utils.find_project_root()

# Configure ZenML for this notebook session. The call is left as the cell's
# last expression, so its return value (the ZenML client) is displayed.
zenml_utils.setup_zenml_for_notebook(
    use_in_memory=True,
    root_path=PROJECT_ROOT,
)


Using ZenML repository root: /workspace
Ensure this is the top-level of your repo (.zen must live here).


<zenml.client.Client at 0x737ca51738b0>

In [2]:
# Inputs

def _pick_pred(pred_dir: Path, main_path: Path) -> Path | None:
    candidates = [
        pred_dir / f"{main_path.stem}_preds.parquet",
        pred_dir / f"{main_path.stem}_preds_latest.parquet",
    ]
    for c in candidates:
        if c.exists():
            return c
    return None

# Maximum number of main parquet files to process (taken in sorted order).
# Set to None to process every matching file.
MAX_INPUT_FILES = 1

main_dir = Path(PROJECT_ROOT) / "data"
main_paths = sorted(main_dir.glob("ml_output_*.parquet"))
main_paths = main_paths[:MAX_INPUT_FILES]  # a slice bound of None keeps everything

group_probs_dir = main_dir / "group_classifier"
splitter_probs_dir = main_dir / "group_splitter"

# Keep only the main files for which BOTH upstream prediction files exist, so
# the three path lists handed to the pipeline stay index-aligned.
paired = []
unpaired = []
for mp in main_paths:
    gp = _pick_pred(group_probs_dir, mp)
    sp = _pick_pred(splitter_probs_dir, mp)
    if gp is not None and sp is not None:
        paired.append((str(mp.resolve()), str(gp.resolve()), str(sp.resolve())))
    else:
        unpaired.append(mp.name)

if not paired:
    raise RuntimeError(
        "No aligned main/group-classifier/group-splitter prediction triplets found. "
        "Run upstream inference first."
    )

# Surface dropped inputs instead of skipping them silently.
if unpaired:
    print(f"Skipping {len(unpaired)} file(s) without both upstream predictions: {unpaired}")

parquet_paths = [p[0] for p in paired]
group_probs_parquet_paths = [p[1] for p in paired]
group_splitter_parquet_paths = [p[2] for p in paired]
model_path = None  # default: latest trained model
output_dir = str((main_dir / "endpoint_regressor").resolve())

print(f"Input files: {len(parquet_paths)}")
print(f"Group prior files: {len(group_probs_parquet_paths)}")
print(f"Splitter prior files: {len(group_splitter_parquet_paths)}")


Input files: 1
Group prior files: 1
Splitter prior files: 1


In [3]:
# Run inference pipeline

# Settings passed to the pipeline under pipeline_config["loader"]["config_json"].
loader_config_json = {
    "mode": "inference",
    "batch_size": 64,
    "chunk_row_groups": 4,
    "chunk_workers": 0,
    "use_group_probs": True,
    "use_splitter_probs": True,
}

# Settings passed to the pipeline under pipeline_config["save_predictions"].
save_predictions_config = {"check_accuracy": False, "write_timestamped": False}

inference_pipeline_config = {
    "loader": {"config_json": loader_config_json},
    "save_predictions": save_predictions_config,
}

# Caching is switched off for this invocation via with_options.
uncached_pipeline = endpoint_regression_inference_pipeline.with_options(enable_cache=False)
run = uncached_pipeline(
    parquet_paths=parquet_paths,
    group_probs_parquet_paths=group_probs_parquet_paths,
    group_splitter_parquet_paths=group_splitter_parquet_paths,
    model_path=model_path,
    output_dir=output_dir,
    pipeline_config=inference_pipeline_config,
)

# Fetch the output of the export step; later cells read paths from it.
export_info = load_step_output(run, "save_endpoint_regressor_predictions")
print(export_info)


[37mInitiating a new run for the pipeline: [0m[38;5;105mendpoint_regression_inference_pipeline[37m.[0m
[37mCaching is disabled by default for [0m[38;5;105mendpoint_regression_inference_pipeline[37m.[0m
[37mUsing user: [0m[38;5;105mdefault[37m[0m
[37mUsing stack: [0m[38;5;105mdefault[37m[0m
[37m  deployer: [0m[38;5;105mdefault[37m[0m
[37m  artifact_store: [0m[38;5;105mdefault[37m[0m
[37m  orchestrator: [0m[38;5;105mdefault[37m[0m
[37mYou can visualize your pipeline runs in the [0m[38;5;105mZenML Dashboard[37m. In order to try it locally, please run [0m[38;5;105mzenml login --local[37m.[0m
[37mStep [0m[38;5;105mload_endpoint_regressor_inference_inputs[37m has started.[0m
[37mStep [0m[38;5;105mload_endpoint_regressor_inference_inputs[37m has finished in [0m[38;5;105m0.438s[37m.[0m
[37mStep [0m[38;5;105mload_endpoint_regressor_model[37m has started.[0m
[37mStep [0m[38;5;105mload_endpoint_regressor_model[37m has finished in [

In [4]:
# Inspect export outputs

# Prefer the list-valued "predictions_paths" entry; if it is missing or empty,
# fall back to the single-valued "predictions_path" entry.
predictions_paths = list(map(Path, export_info.get("predictions_paths") or ()))
if not predictions_paths:
    single_predictions_path = export_info.get("predictions_path")
    if single_predictions_path:
        predictions_paths = [Path(single_predictions_path)]

metrics_path = Path(export_info["metrics_path"])

print("predictions_paths:")
for exported_path in predictions_paths:
    print(" ", exported_path)
print("metrics:", metrics_path)
# Dump the metrics file verbatim.
print(metrics_path.read_text())


predictions_paths:
  /workspace/data/endpoint_regressor/ml_output_000_preds.parquet
metrics: /workspace/data/endpoint_regressor/metrics_latest.json
{
  "loss": null,
  "mae": null,
  "mode": "endpoint_regressor",
  "model_path": "/workspace/trained_models/endpoint_regressor/endpoint_regressor_20260218_235111_torchscript.pt",
  "output_path": "/workspace/data/endpoint_regressor/ml_output_000_preds.parquet",
  "output_paths": [
    "/workspace/data/endpoint_regressor/ml_output_000_preds.parquet"
  ],
  "prediction_dim": 18,
  "validated_files": [
    "/workspace/data/ml_output_000.parquet"
  ],
  "validated_group_probs_files": [
    "/workspace/data/group_classifier/ml_output_000_preds.parquet"
  ],
  "validated_group_splitter_files": [
    "/workspace/data/group_splitter/ml_output_000_preds.parquet"
  ]
}


In [5]:
# Optional: verify parquet schema + small sample (avoids loading full file)
# gc, pa and pq come from the notebook's import cell at the top.

if not predictions_paths:
    raise RuntimeError("No prediction parquet files were exported.")

# Only the first exported file is inspected.
first_predictions_path = predictions_paths[0]
pf = pq.ParquetFile(first_predictions_path)
print("file:", first_predictions_path)
print("rows:", pf.metadata.num_rows)
print(pf.schema_arrow)

if pf.num_row_groups > 0:
    # Reads the whole first row group, then keeps just the first 3 rows.
    sample = pf.read_row_group(0).slice(0, 3)
    print(sample)
else:
    # Bind the name in this branch too so the `del` below cannot fail.
    sample = None
    print("No row groups found.")

# Release notebook-held references after inspection
del sample, pf
gc.collect()
pa.default_memory_pool().release_unused()


file: /workspace/data/endpoint_regressor/ml_output_000_preds.parquet
rows: 1024
event_id: int64
time_group_ids: list<element: int64>
  child 0, element: int64
pred_group_start_x: list<element: float>
  child 0, element: float
pred_group_start_x_q16: list<element: float>
  child 0, element: float
pred_group_start_x_q50: list<element: float>
  child 0, element: float
pred_group_start_x_q84: list<element: float>
  child 0, element: float
pred_group_start_y: list<element: float>
  child 0, element: float
pred_group_start_y_q16: list<element: float>
  child 0, element: float
pred_group_start_y_q50: list<element: float>
  child 0, element: float
pred_group_start_y_q84: list<element: float>
  child 0, element: float
pred_group_start_z: list<element: float>
  child 0, element: float
pred_group_start_z_q16: list<element: float>
  child 0, element: float
pred_group_start_z_q50: list<element: float>
  child 0, element: float
pred_group_start_z_q84: list<element: float>
  child 0, element: float
p