# Create & Run AML Train Pipeline

In [1]:
from azure.ai.ml import command, dsl, Input, Output, MLClient
from azure.identity import DefaultAzureCredential

In [2]:
# Name of the compute target the pipeline is submitted to.
cpu_compute_target = "paolt-run-cpu-vm"

# Environment name and version; the component definitions combine them
# as "<name>:<version>".
environment_name, environment_ver = "DataBook-Env", "3"

In [3]:
# Command component for the data-preparation stage. It runs
# data-prep-stage-script.py from ./scripts, passing the `data` and `ranges`
# files and the `test_train_ratio` number, and the `train_data` / `test_data`
# output folders.
data_prep_component = command(
    name="data_prep_dbcc",
    display_name="Data preparation for training",
    environment=f"{environment_name}:{environment_ver}",
    code="./scripts",
    inputs=dict(
        data=Input(type="uri_file", mode="ro_mount"),
        ranges=Input(type="uri_file", mode="ro_mount"),
        test_train_ratio=Input(type="number"),
    ),
    outputs=dict(
        train_data=Output(type="uri_folder", mode="rw_mount"),
        test_data=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Every line ends in a backslash inside a non-raw string, so Python joins
    # them and the value is a single command line.
    command="""python data-prep-stage-script.py \
            --data ${{inputs.data}} --ranges ${{inputs.ranges}} --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
)

In [4]:
# Command component for the training stage. It runs train-stage-script.py
# from ./scripts, passing the `train_data` and `test_data` input folders and
# the `model_path` output folder.
train_component = command(
    name="train_dbcc",
    display_name="Train classifier",
    environment=f"{environment_name}:{environment_ver}",
    code="./scripts",
    inputs=dict(
        train_data=Input(type="uri_folder", mode="ro_mount"),
        test_data=Input(type="uri_folder", mode="ro_mount"),
    ),
    outputs=dict(
        model_path=Output(type="uri_folder", mode="rw_mount"),
    ),
    # Backslash-newlines inside this non-raw string are joined by Python; the
    # last argument line has no backslash, so the value ends with a newline
    # followed by indentation.
    command="""python train-stage-script.py \
            --train_data ${{inputs.train_data}} \
            --test_data ${{inputs.test_data}} \
            --model_path ${{outputs.model_path}}
            """,
)

In [5]:
@dsl.pipeline(
    compute=cpu_compute_target,
    description="DBCC data_prep-train pipeline",
)
def dbcc_pipeline(
    pipeline_job_data_input,
    pipeline_job_ranges_input,
    pipeline_job_test_train_ratio=0.25,
):
    """Chain the data-preparation step into the training step.

    Args:
        pipeline_job_data_input: Forwarded to the data-prep step as `data`.
        pipeline_job_ranges_input: Forwarded to the data-prep step as `ranges`.
        pipeline_job_test_train_ratio: Forwarded to the data-prep step as
            `test_train_ratio`; defaults to 0.25.

    Returns:
        A dict exposing the data-prep step's train/test outputs and the
        training step's model output as pipeline outputs.
    """
    # NOTE(review): the SDK appears to derive node names from these local
    # variable names -- confirm before renaming `data_prep_job` / `train_job`.

    # Call the data-prep component like a Python function with its own inputs.
    data_prep_job = data_prep_component(
        data=pipeline_job_data_input,
        ranges=pipeline_job_ranges_input,
        test_train_ratio=pipeline_job_test_train_ratio,
    )

    # The training step consumes both outputs of the data-prep step.
    train_job = train_component(
        train_data=data_prep_job.outputs.train_data,
        test_data=data_prep_job.outputs.test_data,
    )

    # A pipeline returns a dictionary of outputs; each key is the identifier
    # of the corresponding pipeline output.
    return {
        "pipeline_job_train_data": data_prep_job.outputs.train_data,
        "pipeline_job_test_data": data_prep_job.outputs.test_data,
        "pipeline_job_model": train_job.outputs.model_path,
    }

In [6]:
# Instantiate the pipeline with the data and ranges files from the workspace
# blob datastore; pipeline_job_test_train_ratio keeps its default value.
pipeline = dbcc_pipeline(
    pipeline_job_data_input=Input(
        type="uri_file",
        path="azureml://datastores/workspaceblobstore/paths/dbcc_data/Standard_Databook_06_07_2022.csv.json",
        mode="ro_mount",
    ),
    pipeline_job_ranges_input=Input(
        type="uri_file",
        path="azureml://datastores/workspaceblobstore/paths/dbcc_data/areas.txt",
        mode="ro_mount",
    ),
)

In [7]:
# Build the workspace client from the Azure ML config file, authenticating
# with DefaultAzureCredential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Bare expression so the notebook displays the client.
ml_client

Found the config file in: /mnt/batch/tasks/shared/LS_root/mounts/clusters/paolt-dev-cpu-vm/code/Users/paolt/databook_v1/.azureml/config.json


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f999fd1f910>,
         subscription_id=fadb1e32-9c96-4180-be9d-1811f4687cca,
         resource_group_name=paolt-ml-v2,
         workspace_name=paolt-ml-v2)

In [8]:
# Submit the pipeline under the "dbcc-test" experiment and keep the returned job.
pipeline_job = ml_client.jobs.create_or_update(pipeline, experiment_name="dbcc-test")

# Bare expression so the notebook displays the submitted job.
pipeline_job

[32mUploading scripts (0.01 MBs): 100%|██████████| 7123/7123 [00:00<00:00, 146084.47it/s]
[39m



Experiment,Name,Type,Status,Details Page
dbcc-test,elated_library_srll63wvh7,pipeline,Preparing,Link to Azure Machine Learning studio
