In [2]:
# Authentication: sign in interactively, then attach to the target workspace.
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Workspace

# The tenant id can be found under Azure Active Directory -> Properties.
# NOTE(review): tenant and subscription ids are hard-coded here; consider
# reading them from configuration before sharing this notebook.
ia = InteractiveLoginAuthentication(
    tenant_id='16b3c013-d300-468d-ac64-7eda0820b6d3',
)

ws = Workspace.get(
    name='Prod',
    subscription_id='fe38c376-b42a-4741-9e7c-f5d7c31e5873',
    resource_group='ProdRG',
    auth=ia,
)

In [3]:
# Compute target used by the pipeline steps below.
# Different steps of a pipeline may use different compute targets; this
# notebook looks up a single cluster by name and reuses it.
pipeline_cluster = "Demo-Compute-Cluster"
compute_target = ws.compute_targets[pipeline_cluster]

In [4]:
import pandas as pd

# Download the bank-marketing training sample straight into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer="https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv",
)

# Preview the first rows as the cell output.
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,57,technician,married,high.school,no,no,yes,cellular,may,mon,...,1,999,1,failure,-1.8,92.893,-46.2,1.299,5099.1,no
1,55,unknown,married,unknown,unknown,yes,no,telephone,may,thu,...,2,999,0,nonexistent,1.1,93.994,-36.4,4.86,5191.0,no
2,33,blue-collar,married,basic.9y,no,no,no,cellular,may,fri,...,1,999,1,failure,-1.8,92.893,-46.2,1.313,5099.1,no
3,36,admin.,married,high.school,no,no,no,telephone,jun,fri,...,4,999,0,nonexistent,1.4,94.465,-41.8,4.967,5228.1,no
4,27,housemaid,married,high.school,no,yes,no,cellular,jul,fri,...,2,999,0,nonexistent,1.4,93.918,-42.7,4.963,5228.1,no


In [5]:
# Intermediate data passed from the data-preparation step to the AutoML step
# is stored in the workspace's default datastore.
datastore = ws.get_default_datastore()

In [6]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Create and register a tabular dataset directly from the pandas DataFrame;
# the data is uploaded under the "dataset" path of the datastore.
# Alternative: build the dataset with
# ds = Dataset.Tabular.from_delimited_files(path=xxx) and register it with ds.register().
# The registered name (including its "Trainingg" spelling) is looked up again
# when the preparation step is built, so the two must stay identical.
dataset = TabularDatasetFactory.register_pandas_dataframe(
    data,
    target=(datastore, "dataset"),
    name="AutoMLE2ETraininggPipeline_Classification_dataset",
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to dataset/d9276c74-4f62-4ac2-86d6-918c30ef620b/
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'emp.var.rate' -> 'emp_var_rate'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.price.idx' -> 'cons_price_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'cons.conf.idx' -> 'cons_conf_idx'
Column header contains '.' This period will be translated to '_' as we write the data out to parquet files: 'nr.employed' -> 'nr_employed'
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [7]:
# Goal: configure the run environment used by the script steps.

from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Alternative (disabled): use a custom registered environment instead.
#env = Environment.get(workspace=ws, name='experiment_env', version='2')
curated_env = Environment.get(
    workspace=ws,
    name='AzureML-sklearn-1.0-ubuntu20.04-py38-cpu',
)

# Run configuration carrying the curated environment.
pipeline_run_config = RunConfiguration()
pipeline_run_config.environment = curated_env

# Optional (disabled): add conda dependencies to the environment:
#pipeline_run_config.environment.python.conda_dependencies = CondaDependencies.create(
#    conda_packages=['pandas'])

In [8]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep
from azureml.core.dataset import Dataset

# Option 1 (disabled): OutputFileDatasetConfig without a destination.
#prepped_output_path = OutputFileDatasetConfig(name="output_path")
# An OutputFileDatasetConfig points at a directory; the preparation script
# writes its CSV output there and receives the location as an argument.
# Without a destination, the output is copied to the workspaceblobstore
# datastore under /dataset/{run-id}/{output-name}, where run-id is the name
# shown on the job overview and output-name is the `name` parameter
# ("output_path" in the disabled example above).

# Option 2 (used): OutputFileDatasetConfig with an explicit destination.
output_path = (datastore, "azureml/classification_prep_output/")
prepped_output_path = OutputFileDatasetConfig(destination=output_path)

# Fetch the dataset registered earlier by its name.
input_ds = Dataset.get_by_name(ws, 'AutoMLE2ETraininggPipeline_Classification_dataset')

# Data-preparation step: runs Scripts/prepare.py with the registered dataset
# as a named input and the output location as --output_path.
prep_step = PythonScriptStep(
    name="Prepare AutoML Classification",
    source_directory="./Scripts",
    script_name="prepare.py",
    arguments=["--output_path", prepped_output_path],
    inputs=[
        input_ds.as_named_input('AutoMLE2ETraininggPipeline_Classification_dataset'),
    ],
    runconfig=pipeline_run_config,
    compute_target=compute_target,
)

In [9]:
# In an ML pipeline, the input data must be a Dataset object.
# Convert the CSV files written by the preparation step into tabular datasets
# for the AutoML step.
#
# Two fixes compared to the previous version of this cell:
#   * the train/test file names were swapped between the two variables;
#   * the file name was passed positionally, but the first positional parameter
#     of read_delimited_files is `include_path`, so it did not filter files.
#     The file name is now passed as `path_glob`.
# NOTE(review): assumes prepare.py writes both files at the root of the output
# folder under exactly these names -- confirm against Scripts/prepare.py.
prepped_train_data = prepped_output_path.read_delimited_files(
    path_glob="prepped_train_data_classification.csv"
)
# Test data sent to the AutoML step.
prepped_test_data = prepped_output_path.read_delimited_files(
    path_glob="prepped_test_data_classification.csv"
)

from azureml.pipeline.core import TrainingOutput,PipelineData

# Two PipelineData objects receive the AutoML outputs: the metrics and the model.
metrics_data = PipelineData(name='metrics_data',
                            datastore=datastore,
                            pipeline_output_name='metrics_output',
                            training_output=TrainingOutput(type='Metrics'))

model_data = PipelineData(name='best_model_data',
                          datastore=datastore,
                          pipeline_output_name='model_output',
                          training_output=TrainingOutput(type='Model'))
# two pipelinedata objects for automl outputs: metrics and the model.


In [10]:
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

# Search budget for AutoML. Raise or lower `iterations` to trade accuracy
# against run time (50 is used here).
# The settings request at most 50 iterations, a 60-minute limit per iteration
# and a 6-hour limit for the experiment as a whole.
automl_settings = dict(
    iteration_timeout_minutes=60,
    iterations=50,
    experiment_timeout_hours=6,
    primary_metric='AUC_weighted',
)

# debug_log names a local file to inspect if you want to see the logs.
automl_config = AutoMLConfig(
    task='classification',
    path='.',
    debug_log='automated_ml_errors.log',
    compute_target=compute_target,
    # run_configuration=pipeline_run_config,
    featurization='auto',
    label_column_name='y',
    training_data=prepped_train_data,
    test_data=prepped_test_data,
    **automl_settings,
)

# AutoML step: the default metrics/model outputs are switched off and the two
# explicit PipelineData objects are used as outputs instead.
train_step = AutoMLStep(
    name='AutoML_Classification',
    automl_config=automl_config,
    outputs=[metrics_data, model_data],
    enable_default_metrics_output=False,
    enable_default_model_output=False,
    allow_reuse=True,
)

In [11]:
from azureml.pipeline.core.graph import PipelineParameter

# Pipeline parameter holding the name under which the trained model is
# registered in the workspace.
model_name = PipelineParameter("model_name", default_value="AutoML_Classification_Test")

# Registration step: runs Scripts/register_model.py with the model name and
# the AutoML model output; reuse of earlier results is disabled for this step.
register_step = PythonScriptStep(
    name="register_model",
    source_directory="./Scripts",
    script_name="register_model.py",
    arguments=[
        "--model_name", model_name,
        "--model_path", model_data,
    ],
    inputs=[model_data],
    runconfig=pipeline_run_config,
    compute_target=compute_target,
    allow_reuse=False,
)

In [12]:
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Experiment under which the pipeline run is recorded.
experiment = Experiment(workspace=ws, name="automl-classification-E2E_trainingPipeline")

# Assemble the three steps: prepare -> AutoML training -> model registration.
pipeline = Pipeline(workspace=ws, steps=[prep_step, train_step, register_step])

# Submit the pipeline and block until the run has finished.
pipeline_run = experiment.submit(pipeline, show_output=True)
pipeline_run.wait_for_completion()

Created step Prepare AutoML Classification [b354caf9][2db5abe8-c8cb-4c82-9a47-26411bbe182e], (This step will run and generate new outputs)
Created step AutoML_Classification [d8fffa46][57de53a6-6000-40a8-a857-84817049f3da], (This step will run and generate new outputs)
Created step register_model [a03833c7][1ed8d80e-6e5e-40d4-bd5c-cace3f57056e], (This step will run and generate new outputs)
Submitted PipelineRun 97ec7268-0bf0-48c1-91df-d5c557886f4a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/97ec7268-0bf0-48c1-91df-d5c557886f4a?wsid=/subscriptions/fe38c376-b42a-4741-9e7c-f5d7c31e5873/resourcegroups/ProdRG/workspaces/Prod&tid=16b3c013-d300-468d-ac64-7eda0820b6d3
PipelineRunId: 97ec7268-0bf0-48c1-91df-d5c557886f4a
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/97ec7268-0bf0-48c1-91df-d5c557886f4a?wsid=/subscriptions/fe38c376-b42a-4741-9e7c-f5d7c31e5873/resourcegroups/ProdRG/workspaces/Prod&tid=16b3c013-d300-468d-ac64-7eda0820b6d3
PipelineRun Status:

KeyError: 'savedId'

In [21]:
# Publish the pipeline behind the submitted run.
published_pipeline = pipeline_run.publish_pipeline(
    name='Centrica-Workshop-Training-Pipeline',
    description='Training Pipeline - Classification',
    version='1.0',
)

# Display the published pipeline as the cell output.
published_pipeline

Name,Id,Status,Endpoint
Centrica-Workshop-Training-Pipeline,1a8bf1f2-e7f3-40d8-bdb7-070df45cfa68,Active,REST Endpoint


In [14]:
# Endpoint URL of the published pipeline.
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://uksouth.api.azureml.ms/pipelines/v1.0/subscriptions/fe38c376-b42a-4741-9e7c-f5d7c31e5873/resourceGroups/ProdRG/providers/Microsoft.MachineLearningServices/workspaces/Prod/PipelineRuns/PipelineSubmit/0f37f225-e312-4e62-a79b-5896bf9a5c0c


In [15]:
from azureml.pipeline.core import Schedule,ScheduleRecurrence

In [16]:
# List the schedules in the workspace together with the published pipeline
# each one belongs to.
# Use active_only=False to get all schedules, including disabled ones.
schedules = Schedule.list(ws, active_only=True)
print("Your workspace has the following schedules set up:")
for schedule in schedules:
    # The closing parenthesis was missing from this message, leaving the
    # printed line unbalanced.
    print("{} (Published pipeline: {})".format(schedule.id, schedule.pipeline_id))

Your workspace has the following schedules set up:


In [17]:
# Recurrence with frequency 'Week' and interval 1.
weekly = ScheduleRecurrence(frequency='Week', interval=1)

# Attach that recurrence to the published pipeline; runs are recorded under
# the 'test' experiment.
pipeline_schedule = Schedule.create(
    ws,
    name='weekly predictions',
    pipeline_id=published_pipeline.id,
    experiment_name='test',
    recurrence=weekly,
)

In [19]:
# Collect every schedule attached to the published pipeline.
schedules = Schedule.list(ws, pipeline_id=published_pipeline.id)

# Walk through the schedules and disable each one, waiting for provisioning
# to finish before reporting the new status.
print("Found these schedules for the pipeline id {}:".format(published_pipeline.id))
for schedule in schedules:
    print(schedule.id)
    fetched_schedule = Schedule.get(ws, schedule.id)
    fetched_schedule.disable(wait_for_provisioning=True)
    print(
        "Disabled schedule {}. New status is: {}".format(
            fetched_schedule.id,
            fetched_schedule.status,
        )
    )

Found these schedules for the pipeline id 0f37f225-e312-4e62-a79b-5896bf9a5c0c:
b468f29d-0d95-4882-a07b-97bbb5eb7951
Provisioning status: Completed
Disabled schedule b468f29d-0d95-4882-a07b-97bbb5eb7951. New status is: Disabled


In [22]:
# Fetch the published pipeline again by its id and disable it.
p = published_pipeline.get(
    ws,
    id=published_pipeline.id,
)
p.disable()

In [None]:
# Disabled template for a parameterised schedule, kept inside a string literal
# so that it does not execute.
# NOTE(review): schedule_frequency, schedule_interval, schedule_week_days,
# schedule_time_of_day, schedule_name, schedule_desc, pipeline_id and
# experiment_name are not defined in the visible cells; they would have to be
# set before this code is enabled.
'''#Create the schedule recurrence
recurrence = ScheduleRecurrence(
frequency=schedule_frequency,
interval=schedule_interval,
week_days=schedule_week_days,
time_of_day=schedule_time_of_day)

#Create the schedule
recurring_schedule = Schedule.create(
ws, name=schedule_name,
description=schedule_desc,
pipeline_id=pipeline_id,
experiment_name=experiment_name,
recurrence=recurrence)'''