# Training Pipeline

We do this using a 'pipeline first mentality' i.e. we want to have a production pipeline.

In [2]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.widgets import RunDetails
from azureml.core import Workspace, Experiment, Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
from azureml.core.dataset import Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Experiment
from azureml.core import Environment
from azureml.core.runconfig import CondaDependencies, DEFAULT_CPU_IMAGE
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig

print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.74


## Set up workspace, datastore, experiment and compute

In [44]:
ws = Workspace(subscription_id="bbd86e7d-3602-4e6d-baa4-40ae2ad9303c", resource_group="ManyModelsSA", workspace_name="ManyModelsSAv1")
# auth = InteractiveLoginAuthentication(force=True, tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

# set up workspace
# ws = Workspace.from_config()
ws.get_details()

# choose a compute target
compute = AmlCompute(ws, "train-many-model")

# choose a datastore
dstore = ws.get_default_datastore()

# choose a experiment
experiment = Experiment(ws, 'automl-ojforecasting')
print(dstore.name, dstore.datastore_type, dstore.account_name, dstore.container_name)

workspaceblobstore AzureBlob manymodelssav16457539585 azureml-blobstore-77752be6-01b4-4a3e-9d42-03c9c0d6248f


## Set up run configuration

Set up the run config for experiment to run targeting different compute targets in Azure Machine Learning.

In [4]:
# create a new runconfig object
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['sklearn','pmdarima'])

## Read the registered dataset from Workspace

We used 12,222 datasets and ParallelRunStep to build 12,222 time-series ARIMA models to predict the quantity of each store brand.

You will need to register all the datasets in the Workspace first. We uploaded our data to a blob container hence set 'Datastore' as workspaceblobstore 'Relative path' as the correspondig directory in the blob.

In [5]:
allfiledst = Dataset.get_by_name(ws, name='Allfiledatasets') 
allfiledstinput = allfiledst.as_named_input('trainallmodels')

## Set up environment 

Environment defines a collection of resources that we will need to run our Azure pipelines.

In [6]:
batch_conda_deps = CondaDependencies.create(pip_packages=['sklearn','pmdarima'])

batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

## Define ParallelRunConfig

In [45]:
workercount=2
nodecount=5
timeout=3000

output_dir = PipelineData(name="ARIMAmodels", 
                          datastore=dstore, 
                          output_path_on_compute="ARIMAmodels/")


In [46]:
datasetname='store'

tags1={}
tags1['dataset']=datasetname
tags1['nodes']=nodecount
tags1['workers-per-node']=workercount
tags1['timeout']=timeout

parallel_run_config = ParallelRunConfig(
    source_directory='./scripts',
    entry_script='train.py',
    mini_batch_size="5",
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action="summary_only",
    environment=batch_env,
    process_count_per_node=workercount,
    compute_target=compute,
    node_count=nodecount)

## Set up ParallelRunStep

We added 3 arguments that users can customize based on the prediction goal.

In [47]:
parallelrun_step = ParallelRunStep(
    name="many-models-training",
    parallel_run_config=parallel_run_config,
    inputs=[allfiledstinput],
    output=output_dir,
    models=[],
    arguments=['--target_column','Quantity', '--n_test_periods',6, '--timestamp_column','WeekStarting'],
    allow_reuse=False
)

## Submit the pipeline to run

In [48]:
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

run = experiment.submit(pipeline,tags=tags1)
RunDetails(run).show()

Created step many-models-training [28811bee][794039ef-279e-47b6-9fe6-6c5d0b0f5e20], (This step will run and generate new outputs)
Using data reference trainallmodels_0 for StepId [e5ddafbd][2f363fdf-cc44-4e00-b172-099c1b4048f3], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun c8a4559a-aac6-4985-ae63-f9e18e29a9ca




Link to Azure Machine Learning studio: https://ml.azure.com/experiments/automl-ojforecasting/runs/c8a4559a-aac6-4985-ae63-f9e18e29a9ca?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

In [None]:
run.wait_for_completion(show_output=True)

PipelineRunId: c8a4559a-aac6-4985-ae63-f9e18e29a9ca
Link to Portal: https://ml.azure.com/experiments/automl-ojforecasting/runs/c8a4559a-aac6-4985-ae63-f9e18e29a9ca?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: e728447b-4532-4802-b9bc-5b637c942124
Link to Portal: https://ml.azure.com/experiments/automl-ojforecasting/runs/e728447b-4532-4802-b9bc-5b637c942124?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1
StepRun( many-models-training ) Status: NotStarted
StepRun( many-models-training ) Status: Queued
StepRun( many-models-training ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_6dbcb125e056bc2309a432f4037b753f10fa4069e73437bbf52b7c4fb49822f1_d.txt
2019-12-05T23:00:14Z Starting output-watcher...
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azurem

## Train script

In [12]:
%%writefile ./scripts/train.py

from azureml.core.run import Run
import pandas as pd
import os
import uuid
import argparse
import datetime

from azureml.core.model import Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
from azureml.core import Experiment, Workspace, Run
from azureml.core import ScriptRunConfig
from entry_script_helper import EntryScriptHelper
import logging

from sklearn.externals import joblib
from joblib import dump, load
import pmdarima as pm
import time
from datetime import timedelta

thisrun = Run.get_context()

LOG_NAME = "user_log"

print("Split the data into train and test")

parser = argparse.ArgumentParser("split")
parser.add_argument("--target_column", type=str, help="input target column")
parser.add_argument("--n_test_periods", type=int, help="input number of test periods")
parser.add_argument("--timestamp_column", type=str, help="input timestamp column")

args, unknown = parser.parse_known_args()

print("Argument 1(n_test_periods): %s" % args.n_test_periods)
print("Argument 2(target_column): %s" % args.target_column)
print("Argument 3(timestamp_column): %s" % args.timestamp_column)

def init():
    EntryScriptHelper().config(LOG_NAME)
    logger = logging.getLogger(LOG_NAME)
    output_folder = os.path.join(os.environ.get("AZ_BATCHAI_INPUT_AZUREML", ""), "temp/output")
    logger.info(f"{__file__}.output_folder:{output_folder}")
    logger.info("init()")
    return

def run(input_data):
    # 0. Set up logging
    logger = logging.getLogger(LOG_NAME)
    os.makedirs('./outputs', exist_ok=True)
    resultList = []
    logger.info('processing all files')

    # 1. Read in the data file
    for idx, csv_file_path in enumerate(input_data):
        u1 = uuid.uuid4()
        mname='arima'+str(u1)[0:16]
        with thisrun.child_run(name=mname) as childrun:
            for w in range(0,5):
                thisrun.log(mname,str(w))
            date1=datetime.datetime.now()
            logger.info('starting ('+csv_file_path+') ' + str(date1))
            childrun.log(mname,'starttime-'+str(date1))

            data = pd.read_csv(csv_file_path,header=0)
            logger.info(data.head())

            # 2. Split the data into train and test sets based on dates
            data = data.set_index(args.timestamp_column)
            max_date = datetime.datetime.strptime(data.index.max(),'%Y-%m-%d')
            split_date = max_date - timedelta(days=7*args.n_test_periods)
            data.index = pd.to_datetime(data.index)
            train = data[data.index <= split_date]
            test = data[data.index > split_date]

            # 3.Train the model
            model = pm.auto_arima(train[args.target_column],
                      start_p=0,
                      start_q=0,
                      test='adf', #default stationarity test is kpps
                      max_p =3,
                      max_d = 2,
                      max_q=3,
                      m=3, #number of observations per seasonal cycle
                      #d=None,
                      seasonal=True,
                      #trend = None, # adjust this if the series have trend
                      #start_P=0,
                      #D=0,
                      information_criterion = 'aic',
                      trace=True, #prints status on the fits
                      #error_action='ignore',
                      stepwise = False, # this increments instead of doing a grid search
                      suppress_warnings = True,
                      out_of_sample_size = 16
                     )
            model = model.fit(train[args.target_column])
            logger.info('done training')

            # 4. Save the model
            logger.info(model)
            logger.info(mname)
            with open(mname, 'wb') as file:
                joblib.dump(value=model, filename=os.path.join('./outputs/', mname))

            # 5. Register the model to the workspace
            ws1 = childrun.experiment.workspace
            try:
                childrun.upload_file(mname, os.path.join('./outputs/', mname))
            except:
                logger.info('dont need to upload')
            logger.info('register model, skip the outputs prefix')
            Model.register(workspace=ws1, model_path=os.path.join('./outputs/', mname), model_name='arima_'+str(input_data).split('/')[-1][:-6], model_framework='pmdarima')
            date2=datetime.datetime.now()
            logger.info('ending ('+str(file)+') ' + str(date2))

            #6. Log some metrics
            childrun.log(mname,'endtime-'+str(date2))
            childrun.log(mname,'auc-1')
        resultList.append(True)
    return resultList

Overwriting ./scripts/train.py


## Observations

There're multiple arima_Store5_tropicana, arima_Store2_dominicks, arima_Store8_minute.maid models are built, but there is only 1 dataset for each model

## Next step

1. Conduct performance testing/analysis - determine optimal values for parameters like node_count, process_count_per_node, workercount
2. Understand and incorporate tags, tags may contain information about model name etc