# Phase 1: Training Orchestration

This notebook orchestrates all training activities without performing local computation.

## Overview

- **Step 1**: Load Centralized Configs
- **Step 2**: Data Ingestion & Versioning (Asset Layer)
- **Step 3**: Environment Definition
- **Step 4**: The Dry Run
- **Step 5**: The Sweep (HPO)
- **Step 6**: Best Configuration Selection (Automated)
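- **Step 7**: Final Training (Post-HPO, Single Run)

After final training, the notebook also covers Model Conversion & Optimization (Step P1-4) and Model Registration (Step P1-5).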

## Important

- This notebook **only submits and monitors Azure ML jobs**
- **No training logic** is executed locally
- All computation happens remotely on Azure ML compute
- The notebook must be **re-runnable end-to-end**


## Step P1-3.1: Load Centralized Configs

Load and validate all configuration files. Configs are immutable and will be logged with each job for reproducibility.


In [2]:
# !pip install azureml-mlflow --quiet

In [3]:
import os
import sys
from pathlib import Path
from typing import Dict, Any

from azure.ai.ml import MLClient
from azure.ai.ml import Input, command
from azure.ai.ml.entities import Data, Environment, Job
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Ensure we can import the orchestration package from src/
sys.path.append(str((Path("..") / "src").resolve()))

from shared.yaml_utils import load_yaml
from orchestration.config_loader import (
    ExperimentConfig,
    create_config_metadata,
    load_all_configs,
    load_experiment_config,
    compute_config_hashes,
    snapshot_configs,
    validate_config_immutability,
)


env_path = Path("../config.env")
if env_path.exists():
    load_dotenv(env_path)


In [4]:
# Helper utilities to work with experiment stages and AML experiment names

def get_stage_config(stage_name: str) -> Dict[str, Any]:
    """Return the configuration mapping for one experiment stage.

    Reads the ``stages`` attribute of the notebook-level ``experiment_config``.

    Args:
        stage_name: Key of the stage to look up (e.g. ``"smoke"`` or ``"hpo"``).

    Returns:
        The stage's configuration, or an empty dict when ``stages`` is absent
        or ``None``, the stage is unknown, or the stage's value is falsy.
    """
    # Guard against ``stages: null`` the same way the naming config is guarded.
    stages = getattr(experiment_config, "stages", None) or {}
    return stages.get(stage_name) or {}


def build_aml_experiment_name(stage_name: str, backbone: str | None = None) -> str:
    """Build the Azure ML experiment name for a stage.

    The base name is the stage's ``aml_experiment`` entry; when that is
    missing or empty, the legacy ``configs['env']['logging']['experiment_name']``
    is used instead. The backbone is appended as ``-<backbone>`` only when the
    experiment's ``naming.include_backbone_in_experiment`` flag is truthy and a
    non-empty backbone was given.
    """
    experiment_name = get_stage_config(stage_name).get("aml_experiment")
    if not experiment_name:
        experiment_name = configs["env"]["logging"]["experiment_name"]

    naming_options = getattr(experiment_config, "naming", {}) or {}
    append_backbone = bool(naming_options.get("include_backbone_in_experiment", False))

    if not (append_backbone and backbone):
        return experiment_name
    return f"{experiment_name}-{backbone}"



In [5]:
# Root directory that holds all configuration files (relative to this notebook)
CONFIG_DIR = Path("../config")

# Experiment selection (switch to try different data/model/HPO/env combos)
EXPERIMENT_NAME = "resume_ner_baseline"

# Resolve experiment-level config into concrete file paths
experiment_config: ExperimentConfig = load_experiment_config(CONFIG_DIR, EXPERIMENT_NAME)
# `configs` is indexed by section name in later cells ("env", "data", "hpo", "train")
configs = load_all_configs(experiment_config)
# `config_hashes` is indexed by section name when deriving the model version
config_hashes = compute_config_hashes(configs)

# Immutable snapshots for runtime mutation checks
original_configs = snapshot_configs(configs)


In [6]:
# Reuse shared immutability validator from orchestration package.
# Compares the live `configs` against the snapshot taken right after loading.
# NOTE(review): presumably raises when the two differ — confirm in orchestration.config_loader.
validate_config_immutability(configs, original_configs)


In [7]:
def get_workspace_name() -> str:
    """Get workspace name from configuration files.

    Order of precedence:
    1. ``config/infrastructure.yaml`` (``workspace.name``)
    2. ``config/env/azure.yaml`` (``workspace.name`` under ``env`` config)

    A source that exists but has no (or an empty) ``workspace.name`` is
    skipped, so the next source is still consulted.

    Returns:
        The first non-empty workspace name found.

    Raises:
        ValueError: If neither source provides a workspace name.
    """
    infrastructure_config_path = Path("../config/infrastructure.yaml")
    if infrastructure_config_path.exists():
        # `or {}` tolerates an empty YAML file or a `workspace: null` entry.
        # NOTE(review): assumes load_yaml returns a mapping (or None) — confirm.
        infrastructure_config = load_yaml(infrastructure_config_path) or {}
        infrastructure_workspace = (infrastructure_config.get("workspace") or {}).get("name")
        if infrastructure_workspace:
            return infrastructure_workspace

    env_workspace = (configs["env"].get("workspace") or {}).get("name")
    if env_workspace:
        return env_workspace

    raise ValueError(
        "Workspace name must be configured in either "
        "config/infrastructure.yaml (workspace.name) or config/env/azure.yaml (workspace.name)."
    )


# Azure identifiers come from the process environment (optionally populated
# from ../config.env by the load_dotenv call in the imports cell).
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
resource_group = os.getenv("AZURE_RESOURCE_GROUP")

# Fail fast: an unset or empty variable would otherwise surface later as an
# opaque client error.
if not subscription_id or not resource_group:
    raise ValueError("AZURE_SUBSCRIPTION_ID and AZURE_RESOURCE_GROUP must be set")

workspace_name = get_workspace_name()
credential = DefaultAzureCredential()
# Single client shared by every later cell for assets, environments, compute and jobs
ml_client = MLClient(
    credential=credential,
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
)


Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


All configs and their hashes will be attached to each Azure ML job for full reproducibility.


In [8]:
# Build config metadata for job tagging using shared helper from
# `orchestration.config_loader`.
# The result is spread into the `tags` of the training/conversion jobs and of
# the registered model in later cells.
config_metadata = create_config_metadata(configs, config_hashes)


## Step P1-3.2: Data Ingestion & Versioning (Asset Layer)

Upload the dataset to Blob Storage and register it as an Azure ML Data Asset for versioned, immutable data access.


In [None]:
from orchestration.data_assets import (
    resolve_dataset_path,
    register_data_asset,
    ensure_data_asset_uploaded,
    build_data_asset_reference,
)

# Resolve local dataset path from data config (configs["data"]["local_path"])
DATASET_LOCAL_PATH = resolve_dataset_path(configs["data"])
# Name and version under which the dataset is registered as a Data Asset
DATA_ASSET_NAME = configs["data"]["name"]
DATA_ASSET_VERSION = configs["data"]["version"]


In [None]:
# Set to a non-empty string to register the asset from that location instead
# of the resolved local dataset path; None keeps the local path.
DATA_ASSET_OVERRIDE_PATH = None
# URI handed to register_data_asset in the next cell
blob_uri = DATA_ASSET_OVERRIDE_PATH or str(DATASET_LOCAL_PATH)


In [None]:
# Register the dataset under the configured name/version.
# NOTE(review): whether an existing asset is reused or replaced is decided
# inside orchestration.data_assets — confirm there.
data_asset = register_data_asset(
    ml_client=ml_client,
    name=DATA_ASSET_NAME,
    version=DATA_ASSET_VERSION,
    uri=blob_uri,
    description=configs["data"]["description"],
)

# Best-effort upload of local content to the resolved data asset
data_asset = ensure_data_asset_uploaded(
    ml_client=ml_client,
    data_asset=data_asset,
    local_path=DATASET_LOCAL_PATH,
    description=configs["data"]["description"],
)

# Build shared references for downstream jobs
asset_paths = build_data_asset_reference(ml_client, data_asset)
# The two entries read here are the ones the cache cells persist and reload
asset_reference = asset_paths["asset_uri"]
datastore_path = asset_paths["datastore_path"]


### Troubleshooting

If you encounter `ScriptExecution.StreamAccess.NotFound`, verify that:
1. Compute cluster has managed identity assigned
2. Managed identity has "Storage Blob Data Reader" role on storage account
3. Storage account firewall allows Azure services


In [None]:
import json
from pathlib import Path

# Persist the data asset's identity (and, when available, its resolved paths)
# to a local JSON file so a later session can reload it without registering again.
data_asset_cache_file = Path("data_asset_cache.json")

# The globals() checks allow this cell to run even if the registration cell was skipped.
if 'data_asset' in globals() and data_asset is not None:
    data_asset_info = {
        "name": data_asset.name,
        "version": data_asset.version,
    }

    # Optionally save asset_paths if they're used directly
    if 'asset_paths' in globals():
        data_asset_info["asset_paths"] = asset_paths

    with open(data_asset_cache_file, "w", encoding="utf-8") as f:
        json.dump(data_asset_info, f, indent=2)

    # Versions such as "v2.1" already carry the prefix; avoid printing "vv2.1".
    version_label = str(data_asset_info["version"])
    if not version_label.startswith("v"):
        version_label = f"v{version_label}"

    print(f"Saved data asset: {data_asset_info['name']} {version_label} to {data_asset_cache_file}")
else:
    print("No data asset to save")

In [9]:
import json
from pathlib import Path
from orchestration.data_assets import build_data_asset_reference

# Try to reload the data asset from the local cache file.
# On success this defines data_asset, asset_paths, asset_reference and
# datastore_path; on any failure data_asset is set to None so that the
# registration cells can be run instead.
data_asset_cache_file = Path("data_asset_cache.json")

if data_asset_cache_file.exists():
    with open(data_asset_cache_file, "r", encoding="utf-8") as f:
        data_asset_info = json.load(f)

    # Versions such as "v2.1" already carry the prefix; avoid printing "vv2.1".
    version_label = str(data_asset_info["version"])
    if not version_label.startswith("v"):
        version_label = f"v{version_label}"

    try:
        # Reload Data asset object from ML client
        data_asset = ml_client.data.get(
            name=data_asset_info["name"],
            version=data_asset_info["version"]
        )

        # Rebuild asset_paths if they were saved, otherwise regenerate them
        if "asset_paths" in data_asset_info:
            asset_paths = data_asset_info["asset_paths"]
        else:
            asset_paths = build_data_asset_reference(ml_client, data_asset)

        asset_reference = asset_paths["asset_uri"]
        datastore_path = asset_paths["datastore_path"]

        loaded_version_label = str(data_asset.version)
        if not loaded_version_label.startswith("v"):
            loaded_version_label = f"v{loaded_version_label}"

        print(f"Loaded data asset: {data_asset.name} {loaded_version_label}")
        print(f"Asset URI: {asset_reference}")
        print("Skipping data asset registration - using cached asset")
    except Exception as e:
        # Deliberate best-effort: a stale or unreachable cache entry only
        # means the asset has to be registered again.
        print(f"Warning: Could not load data asset {data_asset_info['name']} {version_label}: {e}")
        print("Will need to register data asset again")
        data_asset = None
else:
    print(f"Cache file {data_asset_cache_file} not found. Will need to register data asset.")
    data_asset = None

Loaded data asset: resume-ner-data-tiny-short vv2.1
Asset URI: azureml:resume-ner-data-tiny-short:v2.1
Skipping data asset registration - using cached asset


## Step P1-3.3: Environment Definition

Define a stable execution environment (Docker image + Conda dependencies) for consistent behavior across all training jobs.


In [None]:
from orchestration.environment import (
    build_environment_config,
    create_training_environment,
    prepare_environment_image,
)

# Build environment configuration from env.yaml (with sensible defaults)
env_config = build_environment_config(CONFIG_DIR, configs["env"])

# Materialize or fetch the Azure ML Environment
training_environment = create_training_environment(ml_client, env_config)

# Trigger a small warm-up job so the image is built/cached before real work.
# The warm-up targets the same cluster that the sweep cells use.
prepare_environment_image(
    ml_client=ml_client,
    environment=training_environment,
    compute_cluster=configs["env"]["compute"]["training_cluster"],
    env_config=env_config,
)


In [None]:
import json
from pathlib import Path

# Persist the training environment's name and version to a local JSON file so
# a later session can reload it instead of rebuilding the environment.
env_cache_file = Path("training_environment_cache.json")

# The globals() check allows this cell to run even if the setup cell was skipped.
if 'training_environment' in globals() and training_environment is not None:
    env_data = {
        "name": training_environment.name,
        "version": training_environment.version,
    }

    with open(env_cache_file, "w", encoding="utf-8") as f:
        json.dump(env_data, f, indent=2)

    # Environment versions already start with "v"; avoid printing "vv...".
    version_label = str(env_data["version"])
    if not version_label.startswith("v"):
        version_label = f"v{version_label}"

    print(f"Saved training environment: {env_data['name']} {version_label} to {env_cache_file}")
else:
    print("No training environment to save")

In [10]:
import json
from pathlib import Path

# Try to reload the training environment from the local cache file.
# On any failure training_environment is set to None so that the environment
# setup cell can be run instead.
env_cache_file = Path("training_environment_cache.json")

if env_cache_file.exists():
    with open(env_cache_file, "r", encoding="utf-8") as f:
        env_data = json.load(f)

    # Environment versions already start with "v"; avoid printing "vv...".
    version_label = str(env_data["version"])
    if not version_label.startswith("v"):
        version_label = f"v{version_label}"

    try:
        # Reload Environment object from ML client
        training_environment = ml_client.environments.get(
            name=env_data["name"],
            version=env_data["version"]
        )

        loaded_version_label = str(training_environment.version)
        if not loaded_version_label.startswith("v"):
            loaded_version_label = f"v{loaded_version_label}"

        print(f"Loaded training environment: {training_environment.name} {loaded_version_label}")
        print("Skipping environment setup - using cached environment")
    except Exception as e:
        # Deliberate best-effort: a stale cache entry only means the
        # environment has to be created again.
        print(f"Warning: Could not load environment {env_data['name']} {version_label}: {e}")
        print("Will need to create environment again")
        training_environment = None
else:
    print(f"Cache file {env_cache_file} not found. Will need to create environment.")
    training_environment = None

Loaded training environment: resume-ner-training vv71b2f952c412701e
Skipping environment setup - using cached environment


## Step P1-3.4: The Dry Run

Submit a minimal sweep job using `smoke.yaml` to validate the sweep mechanism and pipeline integrity before launching the production HPO sweep.


In [None]:
from orchestration.jobs import (
    create_dry_run_sweep_job_for_backbone,
    submit_and_wait_for_job,
    validate_sweep_job,
)

# Training entry point handed to the sweep builders as `script_path`
TRAINING_SCRIPT_PATH = Path("../src/train.py")


In [None]:
compute_cluster_name = configs["env"]["compute"]["training_cluster"]

# Guard only the lookup, so "cannot reach the cluster" and "cluster exists but
# is not ready" are reported as two distinct errors.
try:
    compute_cluster = ml_client.compute.get(compute_cluster_name)
except Exception as e:
    raise RuntimeError(f"Compute cluster '{compute_cluster_name}' not accessible: {e}") from e

if compute_cluster.provisioning_state != "Succeeded":
    raise RuntimeError(
        f"Compute cluster '{compute_cluster_name}' not ready: {compute_cluster.provisioning_state}"
    )

stage_name = "smoke"
# The dry run reuses whichever HPO config the experiment selected.
smoke_hpo_config = configs["hpo"]

# Backbones are controlled by the HPO config file (single source of truth)
backbone_values = smoke_hpo_config["search_space"]["backbone"]["values"]

# One sweep job definition per backbone; submission happens in the next cell.
dry_run_sweep_jobs = {}

for backbone in backbone_values:
    aml_experiment_name = build_aml_experiment_name(stage_name, backbone)
    dry_run_sweep_jobs[backbone] = create_dry_run_sweep_job_for_backbone(
        script_path=TRAINING_SCRIPT_PATH,
        data_asset=data_asset,
        environment=training_environment,
        compute_cluster=compute_cluster_name,
        backbone=backbone,
        smoke_hpo_config=smoke_hpo_config,
        configs=configs,
        config_metadata=config_metadata,
        aml_experiment_name=aml_experiment_name,
        stage=stage_name,
    )


In [None]:
# Submit the dry-run sweeps one backbone at a time and validate each completed
# job before moving on to the next.
for backbone, sweep_job in dry_run_sweep_jobs.items():
    completed_job = submit_and_wait_for_job(ml_client, sweep_job)
    validate_sweep_job(
        job=completed_job,
        backbone=backbone,
        job_type="Dry run sweep",
        ml_client=ml_client,
    )


## Step P1-3.5: The Sweep (HPO)

Submit a hyperparameter optimization sweep to systematically search for the best model configuration.

**Note**: Currently using `smoke.yaml` for demonstration purposes (CPU-only setup). For production with GPU, switch to `prod.yaml` in the configuration.


In [11]:
from orchestration.jobs import (
    create_hpo_sweep_job_for_backbone,
    submit_and_wait_for_job,
    validate_sweep_job,
)

# Training entry point handed to the sweep builders as `script_path`
# (redefined here so this section can run without the dry-run cells)
TRAINING_SCRIPT_PATH = Path("../src/train.py")


  mlflow.mismatch._check_version_mismatch()


In [12]:
compute_cluster_name = configs["env"]["compute"]["training_cluster"]

# Guard only the lookup, so "cannot reach the cluster" and "cluster exists but
# is not ready" are reported as two distinct errors.
try:
    compute_cluster = ml_client.compute.get(compute_cluster_name)
except Exception as e:
    raise RuntimeError(f"Compute cluster '{compute_cluster_name}' not accessible: {e}") from e

if compute_cluster.provisioning_state != "Succeeded":
    raise RuntimeError(
        f"Compute cluster '{compute_cluster_name}' not ready: {compute_cluster.provisioning_state}"
    )

stage_name = "hpo"
hpo_config = configs["hpo"]
# Backbones are controlled by the HPO config file (single source of truth)
backbone_values = hpo_config["search_space"]["backbone"]["values"]

# One sweep job definition per backbone; submission happens in the next cell.
hpo_sweep_jobs = {}

for backbone in backbone_values:
    aml_experiment_name = build_aml_experiment_name(stage_name, backbone)
    hpo_sweep_jobs[backbone] = create_hpo_sweep_job_for_backbone(
        script_path=TRAINING_SCRIPT_PATH,
        data_asset=data_asset,
        environment=training_environment,
        compute_cluster=compute_cluster_name,
        hpo_config=hpo_config,
        backbone=backbone,
        configs=configs,
        config_metadata=config_metadata,
        aml_experiment_name=aml_experiment_name,
        stage=stage_name,
    )


In [13]:
# Completed sweep job per backbone; consumed by the caching and
# best-configuration cells below.
hpo_completed_jobs = {}

# Sweeps are submitted one backbone at a time; a job is recorded only after
# validate_sweep_job has returned for it.
for backbone, sweep_job in hpo_sweep_jobs.items():
    completed_job = submit_and_wait_for_job(ml_client, sweep_job)
    validate_sweep_job(
        job=completed_job,
        backbone=backbone,
        job_type="HPO sweep",
        min_expected_trials=2,
        ml_client=ml_client,
    )
    hpo_completed_jobs[backbone] = completed_job


[32mUploading resume-ner-azureml (0.14 MBs): 100%|██████████| 140574/140574 [00:03<00:00, 37795.43it/s]
[39m



RunId: sad_zebra_z15v7pqsxp
Web View: https://ml.azure.com/runs/sad_zebra_z15v7pqsxp?wsid=/subscriptions/a23fa87c-802c-4fdf-9e59-e3d7969bcf31/resourcegroups/resume_ner_2025-12-14-13-17-35/workspaces/resume-ner-ws

Streaming azureml-logs/hyperdrive.txt

[2025-12-17T00:01:52.5212817Z][GENERATOR][DEBUG]Sampled 2 jobs from search space 
[2025-12-17T00:01:53.0153393Z][SCHEDULER][INFO]Scheduling job, id='sad_zebra_z15v7pqsxp_1' 
[2025-12-17T00:01:52.9168314Z][SCHEDULER][INFO]Scheduling job, id='sad_zebra_z15v7pqsxp_0' 
[2025-12-17T00:01:53.4662215Z][SCHEDULER][INFO]Successfully scheduled a job. Id='sad_zebra_z15v7pqsxp_1' 
[2025-12-17T00:01:53.4799303Z][SCHEDULER][INFO]Successfully scheduled a job. Id='sad_zebra_z15v7pqsxp_0' 
[2025-12-17T00:02:23.0877346Z][GENERATOR][DEBUG]Setting all jobs generated as True, reason : Max number of jobs reached 
[2025-12-17T00:11:27.5047024Z][CONTROLLER][INFO]Changing Run Status from Running to Completed 

Execution Summary
RunId: sad_zebra_z15v7pqsxp
Web Vi

In [16]:
import json
from pathlib import Path

# Write a small JSON index (backbone -> job name/id) of the completed HPO
# sweeps so they can be reloaded in a later session.
hpo_jobs_cache_file = Path("hpo_completed_jobs_cache.json")

if not hpo_completed_jobs:
    print("No HPO completed jobs to save")
else:
    # Only plain strings are stored; Job objects themselves are not serializable.
    hpo_jobs_data = {
        backbone_key: dict(job_name=finished_job.name, job_id=finished_job.id)
        for backbone_key, finished_job in hpo_completed_jobs.items()
    }

    with open(hpo_jobs_cache_file, "w") as f:
        json.dump(hpo_jobs_data, f, indent=2)

    saved_count = len(hpo_jobs_data)
    print(f"Saved {saved_count} HPO job references to {hpo_jobs_cache_file}")

Saved 1 HPO job references to hpo_completed_jobs_cache.json


In [17]:
import json
from pathlib import Path

# Rebuild `hpo_completed_jobs` from the local JSON index, fetching each job
# again through the ML client. Jobs that cannot be fetched are skipped with a
# warning rather than aborting the cell.
hpo_jobs_cache_file = Path("hpo_completed_jobs_cache.json")

if not hpo_jobs_cache_file.exists():
    print(f"Cache file {hpo_jobs_cache_file} not found. Will need to run HPO.")
    hpo_completed_jobs = {}
else:
    with open(hpo_jobs_cache_file, "r") as f:
        hpo_jobs_data = json.load(f)

    # Start from an empty mapping and add one entry per successfully fetched job
    hpo_completed_jobs = {}
    for backbone, job_info in hpo_jobs_data.items():
        try:
            job = ml_client.jobs.get(job_info["job_name"])
            hpo_completed_jobs[backbone] = job
            print(f"Loaded HPO job for {backbone}: {job.name} (status: {job.status})")
        except Exception as e:
            print(f"Warning: Could not load job {job_info['job_name']} for {backbone}: {e}")

    if not hpo_completed_jobs:
        print("No valid jobs found in cache, will need to run HPO again")
        hpo_completed_jobs = {}
    else:
        print(f"\nSuccessfully reloaded {len(hpo_completed_jobs)} HPO completed jobs from cache")

Loaded HPO job for deberta: sad_zebra_z15v7pqsxp (status: Completed)

Successfully reloaded 1 HPO completed jobs from cache


## Step P1-3.6: Best Configuration Selection (Automated)

Programmatically select the best configuration from all HPO sweep runs across all backbone models.


In [18]:
# Reload selection helpers to pick up code changes without restarting the kernel
import importlib
import orchestration.jobs.selection
importlib.reload(orchestration.jobs.selection)

from orchestration.jobs import (
    get_best_trial_from_sweep,
    extract_trial_configuration,
    select_best_configuration,
)

# NOTE(review): not referenced by any other cell visible in this notebook —
# confirm whether it is still needed.
BEST_CONFIG_KEY = "best_config"


In [19]:
# Select the best configuration from all HPO sweep runs.
# Later cells read "backbone", "hyperparameters", "trial_name",
# "selection_criteria" and "dataset_version" from the result.
best_configuration = select_best_configuration(
    ml_client=ml_client,
    hpo_completed_jobs=hpo_completed_jobs,
    hpo_config=configs["hpo"],
    dataset_version=configs["data"]["version"],
)


## Step P1-3.7: Final Training (Post-HPO, Single Run)

Train the final production model using the best configuration from HPO with stable, controlled conditions.


In [None]:
# Display name given to the final training job
FINAL_TRAINING_JOB_NAME = "final-training"
# Fixed seed written into the final training configuration
RANDOM_SEED = 42


In [None]:
def build_final_training_config(
    best_config: Dict[str, Any],
    train_config: Dict[str, Any],
) -> Dict[str, Any]:
    """
    Build final training configuration by merging best HPO config with train.yaml defaults.

    A hyperparameter present in ``best_config["hyperparameters"]`` always wins,
    even when its value is ``None``. The train.yaml default is consulted only
    when HPO did not provide the key, so a default missing from train.yaml is
    an error only if it is actually needed.

    Args:
        best_config: Best configuration from HPO selection. Must contain
            ``"backbone"``; ``"hyperparameters"`` may be absent or ``None``.
        train_config: Training defaults from train.yaml. Must contain
            ``train_config["training"]["epochs"]``.

    Returns:
        dict: Final training configuration

    Raises:
        KeyError: If a required value is provided neither by HPO nor by
            train.yaml (``dropout`` falls back to ``0.1`` instead).
    """
    hyperparameters = best_config.get("hyperparameters") or {}
    training_defaults = train_config["training"]

    def _resolve(key: str) -> Any:
        # Look the default up lazily so it is required only when HPO omitted the key.
        if key in hyperparameters:
            return hyperparameters[key]
        return training_defaults[key]

    final_config = {
        "backbone": best_config["backbone"],
        "learning_rate": _resolve("learning_rate"),
        "batch_size": _resolve("batch_size"),
        "dropout": hyperparameters.get("dropout", training_defaults.get("dropout", 0.1)),
        "weight_decay": _resolve("weight_decay"),
        # Epochs are never taken from HPO.
        "epochs": training_defaults["epochs"],
        "random_seed": RANDOM_SEED,
        "early_stopping_enabled": False,
        "use_combined_data": True,
    }

    return final_config


# Merge the selected HPO result with the train.yaml defaults
final_training_config = build_final_training_config(best_configuration, configs["train"])


In [None]:
def create_final_training_job(
    ml_client: MLClient,
    script_path: Path,
    data_asset: Data,
    environment: Environment,
    compute_cluster: str,
    final_config: Dict[str, Any],
    configs: Dict[str, Any],
    config_metadata: Dict[str, str],
) -> command:
    """
    Assemble the Azure ML command job for the final training run.

    The job is only defined here; nothing is submitted. Besides its
    parameters, the function reads the notebook-level ``best_configuration``
    (for the trial and metric tags) and ``FINAL_TRAINING_JOB_NAME``.

    Args:
        ml_client: MLClient instance (used to look up the default datastore)
        script_path: Path to training script
        data_asset: Registered data asset
        environment: Training environment
        compute_cluster: Compute cluster name
        final_config: Final training configuration
        configs: Configuration dictionaries
        config_metadata: Configuration metadata for tagging

    Returns:
        command: Azure ML Command Job definition

    Raises:
        FileNotFoundError: If training script does not exist
    """
    if not script_path.exists():
        raise FileNotFoundError(f"Training script not found: {script_path}")

    # "${{inputs.data}}" is an Azure ML expression, resolved when the job runs.
    cli_parts = [
        "--data-asset ${{inputs.data}}",
        "--config-dir ../config",
        f"--backbone {final_config['backbone']}",
        f"--learning-rate {final_config['learning_rate']}",
        f"--batch-size {final_config['batch_size']}",
        f"--dropout {final_config['dropout']}",
        f"--weight-decay {final_config['weight_decay']}",
        f"--epochs {final_config['epochs']}",
        f"--random-seed {final_config['random_seed']}",
        f"--early-stopping-enabled {str(final_config['early_stopping_enabled']).lower()}",
        f"--use-combined-data {str(final_config['use_combined_data']).lower()}",
    ]
    command_args = " ".join(cli_parts)

    # Re-root the asset path onto the workspace's default datastore when it
    # contains a "/paths/" segment; otherwise use it as-is (minus trailing "/").
    default_datastore = ml_client.datastores.get_default()
    _, paths_marker, path_tail = data_asset.path.partition("/paths/")
    if paths_marker:
        relative_path = path_tail.rstrip('/')
        data_path = f"azureml://datastores/{default_datastore.name}/paths/{relative_path}"
    else:
        data_path = data_asset.path.rstrip('/')

    job_tags = dict(config_metadata)
    job_tags.update(
        {
            "job_type": "final_training",
            "backbone": final_config["backbone"],
            "best_trial": best_configuration["trial_name"],
            "best_metric_value": str(best_configuration["selection_criteria"]["best_value"]),
        }
    )

    return command(
        code="../src",
        command=f"python {script_path.name} {command_args}",
        inputs={"data": Input(type="uri_folder", path=data_path)},
        environment=environment,
        compute=compute_cluster,
        experiment_name=configs["env"]["logging"]["experiment_name"],
        tags=job_tags,
        display_name=FINAL_TRAINING_JOB_NAME,
        description="Final production training with best HPO configuration",
    )


# Define (but do not yet submit) the final training job.
# TRAINING_SCRIPT_PATH and compute_cluster_name come from the sweep cells above.
final_training_job = create_final_training_job(
    ml_client=ml_client,
    script_path=TRAINING_SCRIPT_PATH,
    data_asset=data_asset,
    environment=training_environment,
    compute_cluster=compute_cluster_name,
    final_config=final_training_config,
    configs=configs,
    config_metadata=config_metadata,
)


In [None]:
def validate_final_training_job(job: Job) -> None:
    """
    Check that the final training job finished and exposes its checkpoint.

    Args:
        job: Completed job instance

    Raises:
        ValueError: If the status is not "Completed", the job has no outputs,
            or the "checkpoint" output is missing
    """
    if job.status != "Completed":
        raise ValueError(f"Final training job failed with status: {job.status}")

    # A missing `outputs` attribute is treated the same as an empty one.
    job_outputs = getattr(job, "outputs", None)
    if not job_outputs:
        raise ValueError("Final training job produced no outputs")

    for required_name in ("checkpoint",):
        if required_name not in job_outputs:
            raise ValueError(f"Final training job missing required output: {required_name}")


# Submit the final training job, wait for it, then validate the result.
# NOTE(review): the job definition built by create_final_training_job declares
# no `outputs`, while validation requires a "checkpoint" output — confirm that
# the training script / job produces one.
final_training_completed_job = submit_and_wait_for_job(ml_client, final_training_job)
validate_final_training_job(final_training_completed_job)


## Step P1-4: Model Conversion & Optimization

Convert the final training checkpoint to an optimized ONNX model (int8 quantized) for production inference.


In [None]:
# Conversion entry point handed to create_conversion_job as `script_path`
CONVERSION_SCRIPT_PATH = Path("../src/convert_to_onnx.py")
# Display name given to the conversion job
CONVERSION_JOB_NAME = "model-conversion"


In [None]:
def get_checkpoint_output_from_training_job(training_job: Job):
    """
    Return the "checkpoint" output of a completed training job.

    Args:
        training_job: Completed training job

    Returns:
        Whatever object the job stores under ``outputs["checkpoint"]``

    Raises:
        ValueError: If the job has no outputs or no "checkpoint" output
    """
    # A missing `outputs` attribute is treated the same as an empty one.
    job_outputs = getattr(training_job, "outputs", None)
    if not job_outputs:
        raise ValueError("Training job produced no outputs")

    if "checkpoint" in job_outputs:
        return job_outputs["checkpoint"]

    raise ValueError("Training job missing 'checkpoint' output")


# Becomes the "checkpoint" input of the conversion job
checkpoint_output = get_checkpoint_output_from_training_job(final_training_completed_job)


In [None]:
def create_conversion_job(
    ml_client: MLClient,
    script_path: Path,
    checkpoint_output,
    environment: Environment,
    compute_cluster: str,
    configs: Dict[str, Any],
    config_metadata: Dict[str, str],
    best_config: Dict[str, Any],
) -> command:
    """
    Assemble the Azure ML command job that converts the checkpoint to ONNX (int8).

    The job is only defined here; nothing is submitted. Besides its
    parameters, the function reads the notebook-level
    ``final_training_completed_job`` (for the source-job tag) and
    ``CONVERSION_JOB_NAME``.

    Args:
        ml_client: MLClient instance (not used inside this function)
        script_path: Path to conversion script
        checkpoint_output: Checkpoint output from training job
        environment: Training environment (reused for conversion)
        compute_cluster: CPU compute cluster name
        configs: Configuration dictionaries
        config_metadata: Configuration metadata for tagging
        best_config: Best configuration from HPO selection

    Returns:
        command: Azure ML Command Job definition

    Raises:
        FileNotFoundError: If conversion script does not exist
    """
    if not script_path.exists():
        raise FileNotFoundError(f"Conversion script not found: {script_path}")

    # "${{...}}" placeholders are Azure ML expressions, resolved when the job runs.
    cli_parts = [
        "--checkpoint-path ${{inputs.checkpoint}}",
        "--config-dir ../config",
        f"--backbone {best_config['backbone']}",
        "--output-dir ${{outputs.onnx_model}}",
        "--quantize-int8",
        "--run-smoke-test",
    ]
    command_args = " ".join(cli_parts)

    job_tags = dict(config_metadata)
    job_tags.update(
        {
            "job_type": "model_conversion",
            "backbone": best_config["backbone"],
            "source_training_job": final_training_completed_job.name,
            "quantization": "int8",
        }
    )

    return command(
        code="../src",
        command=f"python {script_path.name} {command_args}",
        inputs={"checkpoint": checkpoint_output},
        outputs={"onnx_model": None},
        environment=environment,
        compute=compute_cluster,
        experiment_name=configs["env"]["logging"]["experiment_name"],
        tags=job_tags,
        display_name=CONVERSION_JOB_NAME,
        description="Convert PyTorch checkpoint to optimized ONNX model (int8 quantized)",
    )


# Conversion runs on its own cluster, configured separately from training
conversion_cluster_name = configs["env"]["compute"]["conversion_cluster"]
# Define (but do not yet submit) the conversion job; the training environment is reused
conversion_job = create_conversion_job(
    ml_client=ml_client,
    script_path=CONVERSION_SCRIPT_PATH,
    checkpoint_output=checkpoint_output,
    environment=training_environment,
    compute_cluster=conversion_cluster_name,
    configs=configs,
    config_metadata=config_metadata,
    best_config=best_configuration,
)


In [None]:
def validate_conversion_job(job: Job) -> None:
    """
    Check that the conversion job finished and produced an ``.onnx`` output path.

    The "onnx_model" output may be either an object with a ``path`` attribute
    or a plain string; anything else is rejected.

    Args:
        job: Completed job instance

    Raises:
        ValueError: If validation fails
    """
    if job.status != "Completed":
        raise ValueError(f"Conversion job failed with status: {job.status}")

    # A missing `outputs` attribute is treated the same as an empty one.
    job_outputs = getattr(job, "outputs", None)
    if not job_outputs:
        raise ValueError("Conversion job produced no outputs")

    if "onnx_model" not in job_outputs:
        raise ValueError("Conversion job missing required output: onnx_model")

    onnx_output = job_outputs["onnx_model"]
    if hasattr(onnx_output, "path"):
        onnx_path = onnx_output.path
    elif not isinstance(onnx_output, str):
        raise ValueError(f"Unexpected ONNX output type: {type(onnx_output)}")
    else:
        onnx_path = onnx_output

    if not (onnx_path and onnx_path.endswith(".onnx")):
        raise ValueError(f"Invalid ONNX model path: {onnx_path}")


# Submit the conversion job, wait for it, then validate the result.
# NOTE(review): validation requires the output path to end in ".onnx" —
# confirm the job output resolves to a file path rather than a folder URI.
conversion_completed_job = submit_and_wait_for_job(ml_client, conversion_job)
validate_conversion_job(conversion_completed_job)


## Step P1-5: Model Registration (The Handover)

Register the optimized ONNX model in Azure ML Model Registry with full metadata for production deployment.


In [None]:
from azure.ai.ml.entities import Model
from azure.core.exceptions import ResourceNotFoundError

# Name under which the ONNX model is registered
MODEL_NAME = "resume-ner-onnx"
# Value of the "stage" tag written at registration and checked during validation
PROD_STAGE = "prod"


In [None]:
def get_onnx_model_path(conversion_job: Job) -> str:
    """
    Extract the ONNX model path from a completed conversion job.

    The "onnx_model" output may be either an object with a ``path`` attribute
    (its ``path`` is returned) or a plain string (returned as-is).

    Args:
        conversion_job: Completed conversion job

    Returns:
        str: ONNX model path (Azure ML datastore URI)

    Raises:
        ValueError: If ONNX model not found in job outputs, or the output has
            an unexpected type
    """
    # A missing `outputs` attribute is treated the same as an empty one.
    job_outputs = getattr(conversion_job, "outputs", None)
    if not job_outputs:
        raise ValueError("Conversion job produced no outputs")

    if "onnx_model" not in job_outputs:
        raise ValueError("Conversion job missing 'onnx_model' output")

    onnx_output = job_outputs["onnx_model"]

    if hasattr(onnx_output, "path"):
        return onnx_output.path
    if not isinstance(onnx_output, str):
        raise ValueError(f"Unexpected ONNX output type: {type(onnx_output)}")
    return onnx_output


# Path handed to the model registration step
onnx_model_path = get_onnx_model_path(conversion_completed_job)


In [None]:
def compute_model_version(best_config: Dict[str, Any], config_hashes: Dict[str, str]) -> str:
    """
    Compute deterministic model version from configuration hashes.
    
    Args:
        best_config: Best configuration from HPO selection
        config_hashes: Configuration hashes dictionary
        
    Returns:
        str: Model version string
    """
    version_components = [
        config_hashes["data"],
        config_hashes["model"],
        config_hashes["train"],
        best_config["backbone"],
    ]
    version_str = "_".join(version_components)
    version_hash = hashlib.sha256(version_str.encode()).hexdigest()[:CONFIG_HASH_LENGTH]
    return f"v{version_hash}"


# Version string used when registering the model
model_version = compute_model_version(best_configuration, config_hashes)


In [None]:
def register_production_model(
    ml_client: MLClient,
    model_name: str,
    model_version: str,
    model_path: str,
    best_config: Dict[str, Any],
    configs: Dict[str, Any],
    config_metadata: Dict[str, str],
) -> Model:
    """
    Register the ONNX model in the Azure ML Model Registry, or return the
    already-registered model when this name/version exists.

    Besides its parameters, the function reads the notebook-level
    ``PROD_STAGE``, ``final_training_completed_job`` and
    ``conversion_completed_job`` to build the model tags.

    Args:
        ml_client: MLClient instance
        model_name: Model name in registry
        model_version: Model version
        model_path: Path to ONNX model (Azure ML datastore URI)
        best_config: Best configuration from HPO selection
        configs: Configuration dictionaries (not used inside this function)
        config_metadata: Configuration metadata for tagging

    Returns:
        Model: Registered model instance

    Raises:
        ValueError: If model path is invalid
    """
    if not (model_path and model_path.endswith(".onnx")):
        raise ValueError(f"Invalid ONNX model path: {model_path}")

    criteria = best_config["selection_criteria"]

    description_parts = [
        "Production ONNX model for Resume NER. ",
        f"Backbone: {criteria['backbone']}, ",
        f"Metric: {criteria['metric']}={criteria['best_value']:.4f}",
    ]
    model_description = "".join(description_parts)

    model_tags = dict(config_metadata)
    model_tags.update(
        {
            "stage": PROD_STAGE,
            "backbone": criteria["backbone"],
            "metric": criteria["metric"],
            "metric_value": str(criteria["best_value"]),
            "dataset_version": best_config["dataset_version"],
            "model_format": "onnx",
            "quantization": "int8",
            "source_training_job": final_training_completed_job.name,
            "source_conversion_job": conversion_completed_job.name,
        }
    )

    model = Model(
        name=model_name,
        version=model_version,
        description=model_description,
        path=model_path,
        tags=model_tags,
    )

    # An existing name/version is returned unchanged; only a missing one is created.
    try:
        return ml_client.models.get(name=model_name, version=model_version)
    except ResourceNotFoundError:
        return ml_client.models.create_or_update(model)


# Register the converted model, or fetch it if this name/version already exists
registered_model = register_production_model(
    ml_client=ml_client,
    model_name=MODEL_NAME,
    model_version=model_version,
    model_path=onnx_model_path,
    best_config=best_configuration,
    configs=configs,
    config_metadata=config_metadata,
)


In [None]:
def validate_registered_model(model: Model) -> None:
    """
    Validate registered model has required metadata and tags.

    A model whose ``tags`` attribute is ``None`` or empty is reported as
    missing its first required tag.

    Args:
        model: Registered model instance

    Raises:
        ValueError: If a required tag is missing, the "stage" tag is not the
            notebook-level ``PROD_STAGE``, or the path does not end in ".onnx"
    """
    # Treat ``tags=None`` as "no tags" so the failure is a ValueError, as documented.
    tags = model.tags or {}

    required_tags = ["stage", "backbone", "metric", "dataset_version"]
    for tag in required_tags:
        if tag not in tags:
            raise ValueError(f"Registered model missing required tag: {tag}")

    if tags.get("stage") != PROD_STAGE:
        raise ValueError(f"Model stage must be '{PROD_STAGE}', got: {tags.get('stage')}")

    # NOTE(review): assumes the registered model's path still ends in ".onnx" — confirm.
    if not model.path or not model.path.endswith(".onnx"):
        raise ValueError(f"Invalid model path: {model.path}")


# Final gate: raises ValueError if the registered model lacks the required tags or path
validate_registered_model(registered_model)


In [None]:
# Model registration completed successfully
