# Group Classifier Inference

Run the inference pipeline and write prediction parquet output to `data/group_classifier`.

In [1]:
from pathlib import Path

from pioneerml.common.zenml import load_step_output
from pioneerml.common.zenml import utils as zenml_utils
from pioneerml.pipelines.inference.group_classification import group_classification_inference_pipeline

# Locate the project root and configure ZenML for this notebook session.
# NOTE(review): `use_in_memory=True` presumably keeps ZenML state in memory
# rather than on disk — confirm against `zenml_utils.setup_zenml_for_notebook`.
PROJECT_ROOT = zenml_utils.find_project_root()
# Last expression of the cell, so its return value is echoed as the cell output.
zenml_utils.setup_zenml_for_notebook(root_path=PROJECT_ROOT, use_in_memory=True)


Using ZenML repository root: /workspace
Ensure this is the top-level of your repo (.zen must live here).


<zenml.client.Client at 0x79925dca56f0>

In [2]:
# Inputs
# Wrap in Path() so the cell works whether PROJECT_ROOT is a str or a Path.
data_dir = Path(PROJECT_ROOT) / "data"
input_pattern = "ml_output_*.parquet"

# Sort so the list of input files has a deterministic, name-based order.
parquet_paths = sorted(data_dir.glob(input_pattern))
# parquet_paths = parquet_paths[:1]  # optional subset
# The pipeline receives plain absolute path strings rather than Path objects.
parquet_paths = [str(p.resolve()) for p in parquet_paths]
if not parquet_paths:
    # Fail early with an actionable message instead of an opaque placeholder.
    raise RuntimeError(
        f"No input parquet files matching {input_pattern!r} were found in {data_dir}"
    )

# None -> uses the latest model path.
# Example to use a custom model path:
#   str((Path(PROJECT_ROOT) / 'trained_models' / 'groupclassifier'
#        / 'groupclassifier_20260208_223249_torchscript.pt').resolve())
model_path = None
# Predictions are written under <project root>/data/group_classifier.
output_dir = str((data_dir / "group_classifier").resolve())


In [3]:
# Run inference pipeline
# Pipeline inputs have to be JSON-serializable, so coerce every path to a plain string.
parquet_paths = list(map(str, parquet_paths))

# Per-step settings handed to the pipeline, keyed by step group.
pipeline_config = {
    'loader': {
        'config_json': {'batch_size': 64, 'chunk_row_groups': 4, 'chunk_workers': 0},
    },
    'inference': {'threshold': 0.5},
    'export': {'check_accuracy': False, 'write_timestamped': False},
}

# Caching is switched off for this run via `enable_cache=False`.
uncached_pipeline = group_classification_inference_pipeline.with_options(enable_cache=False)
run = uncached_pipeline(
    parquet_paths=parquet_paths,
    model_path=model_path,
    output_dir=output_dir,
    pipeline_config=pipeline_config,
)


[37mInitiating a new run for the pipeline: [0m[38;5;105mgroup_classification_inference_pipeline[37m.[0m
[37mCaching is disabled by default for [0m[38;5;105mgroup_classification_inference_pipeline[37m.[0m
[37mUsing user: [0m[38;5;105mdefault[37m[0m
[37mUsing stack: [0m[38;5;105mdefault[37m[0m
[37m  deployer: [0m[38;5;105mdefault[37m[0m
[37m  artifact_store: [0m[38;5;105mdefault[37m[0m
[37m  orchestrator: [0m[38;5;105mdefault[37m[0m
[37mYou can visualize your pipeline runs in the [0m[38;5;105mZenML Dashboard[37m. In order to try it locally, please run [0m[38;5;105mzenml login --local[37m.[0m
[37mStep [0m[38;5;105mload_group_classifier_inference_inputs[37m has started.[0m
[37mStep [0m[38;5;105mload_group_classifier_inference_inputs[37m has finished in [0m[38;5;105m0.236s[37m.[0m
[37mStep [0m[38;5;105mload_group_classifier_model[37m has started.[0m
[37mStep [0m[38;5;105mload_group_classifier_model[37m has finished in [0m[38

In [4]:
# Inspect export outputs
export = load_step_output(run, 'export_group_classifier_predictions')
print('export:', export)

# Prefer the multi-file list; fall back to the single-file entry when the
# list is missing or empty.
multi_file_entries = export.get('predictions_paths') or []
single_file_entry = export.get('predictions_path')
if multi_file_entries:
    predictions_paths = list(map(Path, multi_file_entries))
elif single_file_entry:
    predictions_paths = [Path(single_file_entry)]
else:
    predictions_paths = []
metrics_path = Path(export['metrics_path'])

print('predictions_paths:')
for prediction_file in predictions_paths:
    print(' ', prediction_file)
print('metrics_path:', metrics_path)


export: {'predictions_path': None, 'predictions_paths': ['/workspace/data/group_classifier/ml_output_000_preds.parquet', '/workspace/data/group_classifier/ml_output_001_preds.parquet', '/workspace/data/group_classifier/ml_output_002_preds.parquet', '/workspace/data/group_classifier/ml_output_003_preds.parquet', '/workspace/data/group_classifier/ml_output_004_preds.parquet'], 'metrics_path': '/workspace/data/group_classifier/metrics_latest.json', 'timestamped_predictions_path': None, 'timestamped_predictions_paths': [], 'timestamped_metrics_path': None, 'num_rows': 4950} {'predictions_path': None, 'predictions_paths': ['/workspace/data/group_classifier/ml_output_000_preds.parquet', '/workspace/data/group_classifier/ml_output_001_preds.parquet', '/workspace/data/group_classifier/ml_output_002_preds.parquet', '/workspace/data/group_classifier/ml_output_003_preds.parquet', '/workspace/data/group_classifier/ml_output_004_preds.parquet'], 'metrics_path': '/workspace/data/group_classifier/met

In [5]:
# Optional: verify parquet columns (first predictions file)
import pyarrow.parquet as pq

if not predictions_paths:
    raise RuntimeError('No prediction parquet files were exported.')

# Only the first exported file is inspected here.
first_predictions_file = predictions_paths[0]
tbl = pq.read_table(first_predictions_file)

print('file:', first_predictions_file)
print('columns:', tbl.column_names)
print('rows:', tbl.num_rows)

# Length of the first row's value in each prediction column.
first_row_lens = [
    len(tbl[column_name][0].as_py())
    for column_name in ('pred_pion', 'pred_muon', 'pred_mip')
]
print('first row lens:', *first_row_lens)


file: /workspace/data/group_classifier/ml_output_000_preds.parquet /workspace/data/group_classifier/ml_output_000_preds.parquet
columns: ['pred_pion', 'pred_muon', 'pred_mip'] ['pred_pion', 'pred_muon', 'pred_mip']
rows: 1024 1024
first row lens: 3 3 3 3 3 3
