Copyright (c) Rahul Kumar. All rights reserved.

Licensed under the MIT License.

.

# Test-Train Data Split using PRS
This notebook demonstrates how to carry out a train-test split on a large dataset using the ``ParallelRunStep`` (PRS) class.

## Workspace

In [None]:
from azureml.core import Workspace, Experiment

# Obtain the workspace handle via Workspace.from_config().
ws = Workspace.from_config()

# Default datastore of the workspace; used for both input and output data.
dstore = ws.get_default_datastore()

# Experiment that the pipeline run will be submitted to.
experiment = Experiment(workspace=ws, name="train_test_split")

## Compute

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute

# Name of the cluster to reuse, or to create when it does not exist yet
compute_name = "general-purpose"

# Query the workspace's compute targets once; ``None`` means that no compute
# target with this name is registered in the workspace.
compute_target = ws.compute_targets.get(compute_name)

if compute_target is not None:
    if isinstance(compute_target, AmlCompute):
        print("Found compute target: " + compute_name)
    else:
        # The name is taken by a different kind of compute. Keep going (as the
        # notebook always did) but say so, because later cells use this target.
        print(
            f"Warning: compute target '{compute_name}' exists but is of type "
            f"{type(compute_target).__name__}, not AmlCompute."
        )
else:
    print("Creating a new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_DS3_V2", max_nodes=10
    )
    # Create the compute target
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # Can poll for a minimum number of nodes and for a specific timeout.
    # If no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20
    )

    # For a more detailed view of current cluster status, use the 'status' property
    print(compute_target.status.serialize())

## Run Configuration

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment

# Run configuration that targets the compute selected for this notebook.
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target

# Switch between a curated environment (True) and one assembled from an
# explicit package list (False).
USE_CURATED_ENV = True
if not USE_CURATED_ENV:
    # Packages relied on by the data prep step
    package_spec = CondaDependencies.create(
        conda_packages=["pandas", "scikit-learn"],
        pip_packages=[
            "azureml-sdk",
            "azureml-core",
            "azureml-dataset-runtime[fuse,pandas]",
        ],
        pin_sdk_version=False,
    )
    python_settings = aml_run_config.environment.python
    python_settings.user_managed_dependencies = False
    python_settings.conda_dependencies = package_spec
else:
    # Fetch the named environment from the workspace and attach it to the run config.
    curated_environment = Environment.get(
        workspace=ws, name="AzureML-sklearn-0.24-ubuntu18.04-py37-cpu"
    )
    aml_run_config.environment = curated_environment

## Data
- The original dataset is present in the default datastore of the workspace under ``my_files/original/``.
- The data after splitting will be stored in the default datastore of the workspace under ``my_files/split/``.

In [None]:
from azureml.core import Datastore, Dataset
from azureml.data.output_dataset_config import OutputFileDatasetConfig

# File dataset over everything matching my_files/original/* in the datastore.
source_path = dstore.path("my_files/original/*")
input_ds = Dataset.File.from_files(path=source_path, validate=False)

# Output configuration named "split", pointing at my_files/split on the same
# datastore and switched to upload mode.
split_destination = (dstore, "my_files/split")
split_output = OutputFileDatasetConfig(name="split", destination=split_destination)
output_dir = split_output.as_upload()

## Configure PRS

In [None]:
from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep

# Settings for the parallel run: which script to execute, how the work is
# batched, and the environment/compute it runs on.
prs_settings = {
    "source_directory": "../scripts",
    "entry_script": "test_train_split_prs.py",
    "mini_batch_size": "1",
    "error_threshold": 10,
    "output_action": "append_row",
    "environment": aml_run_config.environment,
    "compute_target": compute_target,
    "node_count": 2,
    "process_count_per_node": 1,
}
parallel_run_config = ParallelRunConfig(**prs_settings)

# The step consumes the input dataset under the name "input_ds" and passes the
# train/test folder names to the entry script as command-line arguments.
named_input = input_ds.as_named_input("input_ds")
script_arguments = ["--train_folder", "train1", "--test_folder", "test1"]

parallel_run_step = ParallelRunStep(
    name="test train split prs",
    parallel_run_config=parallel_run_config,
    inputs=[named_input],
    output=output_dir,
    arguments=script_arguments,
    allow_reuse=False,
)

## Configure and Submit the experiment

In [None]:
from azureml.pipeline.core import Pipeline

# ``Pipeline`` documents ``steps`` as a list of steps, so the single
# ParallelRunStep is wrapped in a list rather than passed bare.
pipeline = Pipeline(workspace=ws, steps=[parallel_run_step])

# Submit the pipeline to the experiment; ``run`` is the handle to the submitted run.
run = experiment.submit(pipeline)