# Tutorial #3: Create Azure Machine Learning Pipelines and model deployment.
In this tutorial, we build a pipeline with data dependency.

The sequential steps are constructed in the pipeline as follows:
 1. Prepare Data
 2. Model Training
 3. Evaluate Model
 4. Register Model
    
The scripts created here can be used for setting up an [Azure MLOps](https://github.com/microsoft/MLOpsPython/blob/master/docs/getting_started.md).  
       
This tutorial also covers how to create a scoring script that receives data submitted to a deployed web service, passes it to the model, and then returns the model's response to the client.

Finally, deploy the model and scoring script as Azure Container Instances webservice and Azure Kubernetes Service webservice.

## A. Setup your development environment and cloud compute resource

This section checks that you have the Azure ML SDK installed, have an existing Azure Machine Learning Workspace in the Azure portal, and are able to provision Azure compute resources in order to proceed with this tutorial.

The Azure ML SDK version used here is 1.3.0.

### 1. Azure Machine Learning and Pipeline SDK-specific Imports

In [None]:
# These are the packages that will be used in this tutorial.
import azureml.core
from azureml.core import Workspace, Experiment, Datastore, Environment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.dataset import Dataset
from azureml.train.sklearn import SKLearn
from azureml.widgets import RunDetails
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.model import Model
from azureml.core.model import InferenceConfig
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.webservice import AciWebservice, AksWebservice, Webservice
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.pipeline.steps import PythonScriptStep, EstimatorStep
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type \
  import NumpyParameterType
from sklearn.externals import joblib
import os
import shutil
import json
import numpy as np

print("SDK version:", azureml.core.VERSION)

### 2. Create or Attach existing compute resource

If you don't have any compute target, you can run the compute-provisioning code cell further below to create one (it needs the workspace object defined in the next section). Once it is completed, you can find 'cpucluster' in your Workspace under Compute > Training clusters.

### 3. Define variables

In [None]:
# Load the workspace via Workspace.from_config() and take its default
# datastore; the datastore is used later for the data upload and for the
# intermediate PipelineData folders.
aml_workspace = Workspace.from_config()
datastore = aml_workspace.get_default_datastore()
# Names shared across the tutorial. dataset_name doubles as the target
# folder on the datastore when the training data file is uploaded.
dataset_name = 'predict-employee-retention-training-data'
experiment_name = 'predict-employee-retention'
model_name = 'predict-employee-retention-model'
# Local .csv file that is uploaded to the datastore as the raw data.
file_name = 'training-data.csv'
# Root folder for the step scripts; the script paths below are relative to
# it (they are passed as script_name / entry_script together with
# source_directory=sources_directory_train).
sources_directory_train = 'emp_retention'
data_prep_script_path = 'training/prep_data.py'
train_script_path = 'training/training_model.py'
evaluate_script_path = 'evaluate/evaluate_model.py'
register_script_path = 'register/register_model.py'
score_script_path = 'scoring/score.py'
# Web service names; not used in the pipeline cells -- presumably consumed
# by the ACI / AKS deployment section later in the tutorial.
aci_service_name = 'employee-retention-aci'
aks_service_name = 'employee-retention-aks'

## B. Create scripts to use for the pipeline

This section is to provide the necessary scripts to run in the pipeline steps. 

In [None]:
# Choose a name and scale limits for the cluster. Values that come from
# the environment are strings, so the node counts are converted to int
# before they reach the provisioning configuration.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpucluster")
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 2))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_DS2_V2")


if compute_name in aml_workspace.compute_targets:
    # Reuse the compute target that already exists under this name.
    compute_target = aml_workspace.compute_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name exists but is not an AmlCompute cluster. It is still
        # used as-is; report it instead of staying silent.
        print('found compute target ' + compute_name
              + ' but it is not an AmlCompute cluster: '
              + type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        aml_workspace, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

# Use the target resolved above rather than a hard-coded "cpucluster", so
# that a name supplied through AML_COMPUTE_CLUSTER_NAME is honoured.
aml_compute = compute_target

### 1. Create a folder to host scripts
A folder structure is created to host the python scripts for each of the pipeline steps and the same
structure will be copied to the compute target when the pipeline is executed. 

In [None]:
# The source_directory root must exist before anything is placed in it.
os.makedirs(os.path.join(os.getcwd(), sources_directory_train), exist_ok=True)

# One optional subfolder per group of scripts (everything could also live
# directly in source_directory).
training_folder, evaluate_folder, register_folder, scoring_folder = (
    os.path.join(sources_directory_train, subfolder)
    for subfolder in ('training', 'evaluate', 'register', 'scoring')
)
for script_folder in (training_folder, evaluate_folder,
                      register_folder, scoring_folder):
    os.makedirs(script_folder, exist_ok=True)

print('Scripts used for the pipeline will be placed in the following directory:',
      training_folder,
      evaluate_folder,
      register_folder,
      scoring_folder,
      sep='\n')

### 2. Create script to Prepare Data
This step takes an input .csv file (i.e. the raw data) and produces an output .csv file (i.e. clean_data.csv) that is stored in the DataStore. The clean data is intermediate data to be consumed by the subsequent step.

In [None]:
%%writefile prep_data.py

import argparse
import os
from azureml.core import Run
import pandas as pd


def prepare_dataset(dataset, run):
    """Clean the raw employee data so it can be used for model training.

    Renames the ``sales`` column to ``department``, maps ``salary`` from
    low/medium/high to 0/1/2, one-hot encodes ``department`` (dropping the
    first level) and logs the class balance of the ``left`` column.

    Args:
        dataset: pandas DataFrame with at least the ``sales``, ``salary``
            and ``left`` columns.
        run: object exposing ``log(name, value)``; receives the class
            counts and their proportion.

    Returns:
        The transformed DataFrame.
    """
    # Rename sales feature into department
    dataset = dataset.rename(columns={"sales": "department"})

    # Map salary into integers
    salary_map = {"low": 0, "medium": 1, "high": 2}
    dataset["salary"] = dataset["salary"].map(salary_map)

    # Create dummy variables for department feature
    dataset = pd.get_dummies(dataset, columns=["department"],
                             drop_first=True)

    # Get number of positive and negative examples
    left = dataset["left"]
    pos = int((left == 1).sum())
    neg = int((left == 0).sum())
    run.log('Positive', 'Positive examples = {}'.format(pos))
    run.log('Negative', 'Negative examples = {}'.format(neg))

    # The ratio is undefined without negative examples; log that instead
    # of failing the whole step with a ZeroDivisionError.
    if neg:
        proportion_text = ('Proportion of positive to negative '
                           'examples = {:.2f}%'.format((pos / neg) * 100))
    else:
        proportion_text = ('Proportion of positive to negative '
                           'examples = n/a (no negative examples)')
    run.log('Proportion', proportion_text)
    return dataset


def main():
    """Read the raw .csv file, clean it and write the clean .csv file.

    Command-line arguments:
        --raw_data_file: path of the raw input .csv file.
        --clean_data_folder: output folder for the clean data file.
        --clean_data_file: name of the clean data file; a leading path
            separator (as in "/clean_data.csv") is accepted and ignored.
    """
    # retrieve argument configured through script_params in estimator
    parser = argparse.ArgumentParser()
    parser.add_argument('--raw_data_file', dest='raw_data_file',
                        type=str, required=True,
                        help='input raw data in .csv file')
    parser.add_argument('--clean_data_folder', dest='clean_data_folder',
                        type=str, required=True,
                        help='output folder path that stores '
                             'the clean data file')
    parser.add_argument('--clean_data_file', dest='clean_data_file',
                        type=str, required=True,
                        help='name of the clean data file')
    args = parser.parse_args()

    # get hold of the current run
    run = Run.get_context()

    # Read dataset
    dataset = pd.read_csv(args.raw_data_file)
    run.log('Read raw data file', args.raw_data_file)

    # Clean the dataset to use for model training
    dataset = prepare_dataset(dataset, run)

    # For "output" PipelineData, the folder must be created using
    # os.makedirs() first, then only can write files into the folder.
    os.makedirs(args.clean_data_folder, exist_ok=True)

    # Join folder and file name explicitly. The leading separator is
    # stripped first because os.path.join would otherwise treat the file
    # name as an absolute path and discard the folder.
    clean_data_path = os.path.join(args.clean_data_folder,
                                   args.clean_data_file.lstrip('/\\'))

    # Write the dataset into .csv file for the next step to process
    dataset.to_csv(clean_data_path, index=False)
    run.log('Output path of clean data', clean_data_path)


if __name__ == '__main__':
    main()


### 3. Create script for Model Training

This step uses the clean data to train a model and then stores the model (.pkl) file in the DataStore. The model is intermediate data to be consumed by subsequent steps.

In [None]:
%%writefile training_model.py

import argparse
import os
from azureml.core import Run
import pandas as pd
from sklearn.externals import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


def generate_model(dataset, run):
    """Train a logistic-regression classifier on the clean data.

    The ``left`` column is the target and every other column is a feature.
    The data is split 80/20 (stratified), the model is fitted on the
    training part and evaluated on the test part. Coefficients, confusion
    matrix and statistics are logged to ``run``; accuracy, precision,
    recall and F1-score are also logged to ``run.parent`` so that later
    pipeline steps can read them.

    Args:
        dataset: pandas DataFrame holding the features and a binary
            ``left`` column (the 2x2 reporting below assumes two classes).
        run: the current Azure ML run; ``run.parent`` must be available.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # Convert dataframe into numpy objects and split them into
    # train and test sets: 80/20
    X = dataset.loc[:, dataset.columns != "left"].values
    y = dataset.loc[:, dataset.columns == "left"].values.flatten()

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        stratify=y,
                                                        random_state=1)

    clf = LogisticRegression(solver='liblinear', random_state=0)
    clf.fit(X_train, y_train)

    # View the model's coefficients and bias
    run.log('Coefficients', clf.coef_)
    run.log('Bias', clf.intercept_)

    y_pred_LR = clf.predict(X_test)

    # Build the confusion matrix with an explicit label order so that its
    # rows/columns line up with the class labels logged further below.
    cf_matrix = confusion_matrix(y_test, y_pred_LR, labels=clf.classes_)
    group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']
    group_counts = ["{0:0.0f}".format(value) for value in
                    cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in
                         cf_matrix.flatten() / np.sum(cf_matrix)]
    labels = [f"{v1} {v2} ({v3})" for v1, v2, v3 in
              zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    run.log('Confusion Matrix', labels)

    # Display statistics
    accuracy = np.trace(cf_matrix) / float(np.sum(cf_matrix))
    precision = cf_matrix[1, 1] / sum(cf_matrix[:, 1])
    recall = cf_matrix[1, 1] / sum(cf_matrix[1, :])
    f1_score = 2*precision*recall / (precision + recall)

    stats_text = "\n\nAccuracy={:0.3f}\n".format(accuracy) + \
        "Precision={:0.3f}\n".format(precision) + \
        "Recall={:0.3f}\n".format(recall) + \
        "F1 Score={:0.3f}".format(f1_score)
    run.log('Statistics', stats_text)

    # Log the following metrics to the parent run so that
    # these are available for model evaluation later.
    run.parent.log('Accuracy', accuracy)
    run.parent.log('Precision', precision)
    run.parent.log('Recall', recall)
    run.parent.log('F1-score', f1_score)

    # Log confusion matrix as JSON. "class_labels" carries one entry per
    # class (one per matrix row/column), not one entry per matrix cell.
    class_labels = [str(label) for label in clf.classes_]
    cf_matrix_json = {"schema_type": "confusion_matrix",
                      "schema_version": "v1",
                      "data": {"class_labels": class_labels,
                               "matrix": cf_matrix.tolist()}}

    run.log_confusion_matrix('Employee Retention Confusion Matrix',
                             cf_matrix_json,
                             description='Confusion matrix generated '
                                         'for the run')
    return clf


def main():
    """Train the model on the clean data and write it as a .pkl file.

    Command-line arguments:
        --clean_data_folder: folder that holds the clean data file.
        --clean_data_file: name of the clean data file.
        --new_model_folder: output folder for the model .pkl file.
        --new_model_file: name of the model .pkl file.

    For both file names a leading path separator (as in
    "/clean_data.csv") is accepted and ignored.
    """
    # retrieve argument configured through script_params in estimator
    parser = argparse.ArgumentParser()
    parser.add_argument('--clean_data_folder', dest='clean_data_folder',
                        type=str, required=True,
                        help='folder path that stores the clean data file')
    parser.add_argument('--new_model_folder', dest='new_model_folder',
                        type=str, required=True,
                        help='output folder path that stores the new '
                             'model .pkl file name')
    parser.add_argument('--clean_data_file', dest='clean_data_file',
                        type=str, required=True,
                        help='name of the clean data file')
    parser.add_argument('--new_model_file', dest='new_model_file',
                        type=str, required=True,
                        help='name of the new model .pkl file')
    args = parser.parse_args()

    # get the current run
    run = Run.get_context()

    # Read dataset. The leading separator is stripped before joining
    # because os.path.join would otherwise discard the folder.
    training_dataset_file = os.path.join(args.clean_data_folder,
                                         args.clean_data_file.lstrip('/\\'))
    dataset = pd.read_csv(training_dataset_file)
    run.log('Read training data from file', training_dataset_file)

    # Generate model
    clf = generate_model(dataset, run)

    # For "output" PipelineData, the folder must be created using
    # os.makedirs() first, then only can write files into the folder.
    os.makedirs(args.new_model_folder, exist_ok=True)

    # Pass model file to next step
    model_pkl_file = os.path.join(args.new_model_folder,
                                  args.new_model_file.lstrip('/\\'))
    joblib.dump(value=clf, filename=model_pkl_file)

    run.log('Output path of new model', model_pkl_file)


if __name__ == '__main__':
    main()

### 4. Create script to Evaluate Model

This step will compare the new model's metrics with the current model. It retrieves the current model from the Azure ML Workspace.

After the new model is evaluated to perform better than the current one, the next step will continue to register the new model. Otherwise, the pipeline will stop at this step and cancel the run so that the remaining steps will not get executed. 

In [None]:
%%writefile evaluate_model.py

import argparse
from azureml.core import Run
from azureml.core.model import Model
from azureml.exceptions import WebserviceException


def main():
    """Compare the new model's metrics with the registered model's.

    The registered model named by ``--model_name`` is looked up in the
    workspace of the current run. Its metrics are read from the model tags;
    the new model's metrics are read from the parent run. If the new
    accuracy is lower than the current one the parent run is cancelled,
    otherwise the run simply finishes. When no model is registered under
    that name yet, nothing is compared.

    Raises:
        WebserviceException: re-raised when the model lookup fails for a
            reason other than 'ModelNotFound'.
    """
    # retrieve argument configured through script_params in estimator
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", dest='model_name', type=str,
                        help="Name of the model to retrieve from Workspace")
    args = parser.parse_args()

    # Get the current run
    run = Run.get_context()

    # Get metrics from current model and compare with the metrics
    # of the new model. The metrics of the new model can be retrieved
    # from run.parent.get_metrics, which were created in training_model.py
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
    current_metrics = {}
    new_metrics = {}

    try:
        workspace = run.experiment.workspace
        # Get latest model
        model = Model(workspace, args.model_name)

        for key in metrics:
            current_value = model.tags.get(key)
            current_metrics[key] = float(current_value)
            # Fetch the new metric once and reuse it for the comparison
            # data and for the log line.
            new_value = run.parent.get_metrics(key).get(key)
            new_metrics[key] = new_value
            run.log(key, 'current(ver ' + str(model.version) + ')='
                    + str(current_value) + ' new=' + str(new_value))

    except WebserviceException as e:
        if 'ModelNotFound' in e.message:
            model = None
        else:
            raise

    # Perform comparison. Just do a simple comparison:
    # if Accuracy is at least as good as the current one, proceed to the
    # next step to register the model.
    if model is None:
        run.log("Result", "This is the first model, will proceed "
                          "to register the model.")
    elif new_metrics['Accuracy'] >= current_metrics['Accuracy']:
        run.log("Result", "New Accuracy is as good as current, "
                          "will proceed to register new model.")
    else:
        run.log("Result", "New Accuracy is worse than current, "
                          "will not register model. Processing cancelled.")
        run.parent.cancel()


if __name__ == '__main__':
    main()

### 5. Create script to Register Model

This step is to register the new model that was created after the model training. It uses the intermediate output (i.e. model (.pkl) file) as an input.

Note: In Tutorial #1, we use **run.register_model()** to register the model:

model = run.register_model(model_name='predict-employee-retention',
                           model_path='outputs/predict-employee-retention-model.pkl',
                           tags=tags
                          )

**run.register_model()** will not work here, so you need to use **Model.register()** as shown in the script below.

In [None]:
%%writefile register_model.py

import argparse
from azureml.core import Run, Dataset
from azureml.core.model import Model


def main():
    """Register the newly trained model in the Azure ML Workspace.

    The metrics logged on the parent run (and its BuildId tag, when
    present) are attached to the model as tags so that a later evaluation
    can compare models.

    Command-line arguments:
        --new_model_folder: folder that holds the model .pkl file.
        --new_model_file: name of the model .pkl file; a leading path
            separator (as in "/model.pkl") is accepted and ignored.
        --model_name: name to register the model under.
    """
    # Retrieve argument configured through script_params in estimator
    parser = argparse.ArgumentParser()
    parser.add_argument('--new_model_folder', dest='new_model_folder',
                        type=str, required=True,
                        help='input folder path for reading the new '
                             'model .pkl file')
    parser.add_argument('--new_model_file', dest='new_model_file',
                        type=str, required=True,
                        help='name of the model .pkl file')
    parser.add_argument("--model_name", dest='model_name',
                        type=str, required=True,
                        help="Name of the model to register into "
                             "Azure ML Workspace")
    args = parser.parse_args()

    # Get the current run
    run = Run.get_context()
    # Adding metrics to tags so that these information can
    # be used for model comparison purpose.
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']
    tags = {key: run.parent.get_metrics(key).get(key) for key in metrics}

    # Store BuildId. 'BuildId' stays as the placeholder used in the log
    # line below when the parent run carries no such tag.
    parent_tags = run.parent.get_tags()
    build_id = 'BuildId'
    if "BuildId" in parent_tags:
        build_id = parent_tags["BuildId"]
        tags['BuildId'] = build_id
    else:
        print("BuildId tag not found on parent run.")
        print(f"Tags present: {parent_tags}")

    # Register the new model, note the metric values are stored in "tags".
    # The leading separator is stripped before joining because
    # os.path.join would otherwise discard the folder.
    model_pkl_file = os.path.join(args.new_model_folder,
                                  args.new_model_file.lstrip('/\\'))
    workspace = run.experiment.workspace
    dataset_name = 'predict-employee-retention-training-data'
    model = Model.register(workspace=workspace,
                           model_name=args.model_name,
                           model_path=model_pkl_file,
                           tags=tags,
                           datasets=[('training data',
                                      Dataset.get_by_name(workspace,
                                                          dataset_name))])

    run.log('Model registered', 'New model ' + model.name
            + ' version ' + str(model.version) + ' BuildId ' + build_id)


if __name__ == '__main__':
    main()

### 6. Copy scripts to subfolders

In [None]:
# Place every generated script in the subfolder its pipeline step reads from.
for script_file, destination_folder in (
        ('prep_data.py', training_folder),
        ('training_model.py', training_folder),
        ('evaluate_model.py', evaluate_folder),
        ('register_model.py', register_folder)):
    shutil.copy(script_file, destination_folder)

## C. Prepare data source to use in the pipeline

### 1. Create a DataReference object to reference the data file in the datastore.

Datasource is represented by **[DataReference](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.data_reference.datareference?view=azure-ml-py)** object and points to data that lives in or is accessible from Datastore. DataReference could be a pointer to a file or a directory.

In [None]:
# Put the local data file on the datastore, inside a folder named after
# the dataset, replacing any earlier upload.
datastore.upload_files(
    [file_name],
    target_path=dataset_name,
    overwrite=True,
)

# Reference the uploaded file so that a pipeline step can take it as input.
raw_data_file = DataReference(
    datastore=datastore,
    data_reference_name="Raw_Data_File",
    path_on_datastore=f"{dataset_name}/{file_name}",
)

### 2. Check your uploaded data file in the datastore
Once you successfully uploaded the files, you can browse to them (or upload more files).

- In the [**Azure Portal**](https://portal.azure.com), go to your Machine Learning Workspace. 


- In your Workspace, click on the "Storage" hyperlink. This will bring you to the storage account page.


- In the storage account page, click "Storage Explorer" on the left-hand menu.


- You will find the uploaded file in BLOB CONTAINERS > azureml-blobstore-[your subscription ID] > predict-employee-retention-training-data
 

## D. Building Pipeline Steps with Inputs and Outputs
A step in the pipeline can take data as input. This data can be a data source that lives in one of the accessible data locations, or intermediate data produced by a previous step in the pipeline. This tutorial illustrates how to use PythonScriptStep and EstimatorStep to build the pipeline.

Intermediate data (or output of a Step) is represented by **[PipelineData](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelinedata?view=azure-ml-py)** object. PipelineData can be produced by one step and consumed in another step by providing the PipelineData object as an output of one step and the input of one or more steps.

[**PipelineParameter**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipelineparameter?view=azure-ml-py) can be used to pass varying parameter values to a published Pipeline.

A [**DataPath**](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.data.datapath.datapath?view=azure-ml-py) can be modified during pipeline submission with the PipelineParameter.

Tips: 
1. If you use a **PipelineData** object in *arguments*, you must also specify it in either *inputs* or *outputs*.


2. For **PipelineParameter** object use in *arguments*, you don't have to specify it in *inputs* or *outputs*.


3. If you use a **DataReference** object in *arguments*, you must specify it in *inputs*.


4. When you use **PipelineData** as *outputs* in a pipeline step and that step is the first one to write the output data, you must first create the **PipelineData** folder with os.makedirs(); only then can you write files to the folder. See [Moving data into and between ML pipeline steps (Python)](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-move-data-in-out-of-pipelines#use-pipelinedata-for-intermediate-data)



### 1. Create Prepare Data Step

The first step in the pipeline is to "clean" the data and subsequently pass it to the second step to perform model training.
The [**PythonScriptStep**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.python_script_step.pythonscriptstep?view=azure-ml-py) can be used here.
Notice this step will run the 'training/prep_data.py' python script.

In [None]:
# Inputs and outputs of the Prepare Data step:
# - clean_data_folder: PipelineData that receives the clean data file.
#   Only the name of the subfolder is given (e.g. "clean_data_folder"),
#   not a full folder path.
# - clean_data_file: PipelineParameter holding the clean data file name.
clean_data_folder = PipelineData("clean_data_folder", datastore=datastore)
clean_data_file = PipelineParameter(name="clean_data_file",
                                    default_value="/clean_data.csv")

# The step reads raw_data_file (a DataReference) and produces the clean
# data that model training consumes.
prep_data_arguments = [
    "--raw_data_file", raw_data_file,
    "--clean_data_folder", clean_data_folder,
    "--clean_data_file", clean_data_file,
]
prepDataStep = PythonScriptStep(
    name="Prepare Data",
    source_directory=sources_directory_train,
    script_name=data_prep_script_path,
    arguments=prep_data_arguments,
    inputs=[raw_data_file],
    outputs=[clean_data_folder],
    compute_target=aml_compute,
    allow_reuse=False,
)

print("Step Prepare Data created")

### 2. Create Model Training Step

This step performs model training using Scikit-Learn. This example shows how to create the step using either EstimatorStep or PythonScriptStep.

In [None]:
# Create the PipelineParameter and PipelineData used
# for model training step.
# new_model_folder is used to store the model .pkl file.
# NOTE: the parameter name must not carry stray whitespace; it is the key
# used when a value is assigned to the parameter of a published pipeline.
new_model_file = PipelineParameter(name="new_model_file",
                                   default_value='/' + model_name
                                   + '.pkl')
new_model_folder = PipelineData("new_model_folder", datastore=datastore)

#### Option 1: Using EstimatorStep
[**EstimatorStep**](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-steps/azureml.pipeline.steps.estimator_step.estimatorstep?view=azure-ml-py) adds a step to run Estimator in a Pipeline.

In [None]:
# Packages installed for the training run. Either pip_packages or
# conda_packages can be used (e.g. conda_packages=['scikit-learn==0.20.3']);
# pip is used here.
training_pip_packages = [
    'azureml-sdk',
    'scikit-learn==0.20.3',
    'azureml-dataprep[pandas,fuse]>=1.1.14',
]

# The estimator (SKLearn in this case) that runs the training script.
est = SKLearn(
    source_directory=sources_directory_train,
    entry_script=train_script_path,
    pip_packages=training_pip_packages,
    compute_target=aml_compute,
)

# A PipelineParameter object can't be used here, so the default values of
# the parameters are passed instead.
training_script_arguments = [
    "--clean_data_folder", clean_data_folder,
    "--new_model_folder", new_model_folder,
    "--clean_data_file", clean_data_file.default_value,
    "--new_model_file", new_model_file.default_value,
]
trainingStep = EstimatorStep(
    name="Model Training",
    estimator=est,
    estimator_entry_script_arguments=training_script_arguments,
    runconfig_pipeline_params=None,
    inputs=[clean_data_folder],
    outputs=[new_model_folder],
    compute_target=aml_compute,
    allow_reuse=False,
)

print("Step Train created")

#### Option 2: Using PythonScriptStep

In [None]:
# A RunConfiguration carries the extra requirements of this step, such as
# the packages it needs.
run_config = RunConfiguration()
step_environment = run_config.environment

# Run inside Docker, based on the default CPU image.
step_environment.docker.enabled = True
step_environment.docker.base_image = DEFAULT_CPU_IMAGE

# Dependencies are not user-managed: a conda environment is built in the
# Docker image from the CondaDependencies object given here.
step_environment.python.user_managed_dependencies = False
step_environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['scikit-learn==0.20.3'])

training_step_arguments = [
    "--clean_data_folder", clean_data_folder,
    "--new_model_folder", new_model_folder,
    "--clean_data_file", clean_data_file,
    "--new_model_file", new_model_file,
]
trainingStep = PythonScriptStep(
    name="Model Training",
    source_directory=sources_directory_train,
    script_name=train_script_path,
    arguments=training_step_arguments,
    inputs=[clean_data_folder],
    outputs=[new_model_folder],
    compute_target=aml_compute,
    runconfig=run_config,
    allow_reuse=False,
)

print("Step Train created")

### 3. Create Model Evaluation Step

This step is to evaluate the new model, whether it is better than the current model.


Using the *model_name* as input to the python code *'evaluate/evaluate_model.py'*, 
this step will fetch the current model from Azure ML Workspace and then compare its metrics against the new model.
You write your comparison logic in *'evaluate/evaluate_model.py'*.


In [None]:
# The name of the model to evaluate is handed to the script through a
# PipelineParameter.
model_name_param = PipelineParameter(
    name="model_name",
    default_value=model_name,
)

evaluate_step_arguments = ["--model_name", model_name_param]
evaluateStep = PythonScriptStep(
    name="Evaluate Model",
    source_directory=sources_directory_train,
    script_name=evaluate_script_path,
    arguments=evaluate_step_arguments,
    compute_target=aml_compute,
    allow_reuse=False,
)

print("Step Evaluate created")

### 4. Create Model Registration Step

Once the new model has been evaluated as at least as good as the current one, this step registers it in the Azure ML Workspace.

In [None]:
# The register step reads the model file produced by training
# (new_model_folder is declared as an input) and registers it under the
# name carried by model_name_param.
register_step_arguments = [
    "--new_model_folder", new_model_folder,
    "--new_model_file", new_model_file,
    "--model_name", model_name_param,
]
registerStep = PythonScriptStep(
    name="Register Model",
    source_directory=sources_directory_train,
    script_name=register_script_path,
    arguments=register_step_arguments,
    inputs=[new_model_folder],
    compute_target=aml_compute,
    allow_reuse=False,
)

print("Step Register created")

## E. Define the Pipeline Steps

You can define the steps to be run concurrently or sequentially. This example is to show how to chain the steps in sequence.

In [None]:
# Run the steps one after another: each (earlier, later) pair makes the
# later step wait for the earlier one.
step_order = (
    (prepDataStep, trainingStep),
    (trainingStep, evaluateStep),
    (evaluateStep, registerStep),
)
for earlier_step, later_step in step_order:
    later_step.run_after(earlier_step)

# Only the last step is listed; the earlier ones are reached through the
# run_after chain set up above.
pipeline = Pipeline(workspace=aml_workspace, steps=[registerStep])
pipeline.validate()
print("Pipeline is built")

## F. Run the Pipeline

### 1. Submit the pipeline
[Submitting](https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.pipeline.pipeline?view=azure-ml-py#submit) the pipeline involves creating an [Experiment](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment?view=azure-ml-py) object and providing the built pipeline for submission. 

In [None]:
# Submit the pipeline under the tutorial's experiment.
experiment = Experiment(aml_workspace, experiment_name)
pipeline_run = experiment.submit(pipeline)

# The RunDetails widget shows the progress of the pipeline run.
RunDetails(pipeline_run).show()

### 2. View Pipeline outputs

See where outputs of each pipeline step are located on your datastore.

Wait for pipeline run to complete, to make sure all the outputs are ready


In [None]:
# For every step of the run, list where each of its outputs is stored.
for step in pipeline_run.get_steps():
    print("Outputs of step " + step.name)

    # get_outputs() maps each output name to its StepRunOutput.
    for name, output in step.get_outputs().items():
        # The port data reference names the datastore and the path on it.
        output_reference = output.get_port_data_reference()
        for line in ("\tname: " + name,
                     "\tdatastore: " + output_reference.datastore_name,
                     "\tpath on datastore: " + output_reference.path_on_datastore):
            print(line)

### 3. Get additional run details
If you wait until the pipeline_run is finished, you can get additional details on the run by executing the following code.

In [None]:
# Block until the pipeline run has finished, then print each child run's
# name with its metrics, followed by a blank line.
pipeline_run.wait_for_completion()
for step_run in pipeline_run.get_children():
    step_metrics = step_run.get_metrics()
    print(f"{step_run.name}: {step_metrics}", end='\n\n')

## G. Publish the Pipeline

The published pipeline can be found in Azure Machine Learning studio > Pipelines > Pipeline endpoints.

### Option 1: from a submitted PipelineRun

In [None]:
from datetime import datetime

# The pipeline name is prefixed with a month-day-year-hour-minute stamp.
timenow = datetime.now().strftime('%m-%d-%Y-%H-%M')
pipeline_name = f"{timenow}-Employee Retention Pipeline"
print(pipeline_name)

# Publish from the PipelineRun that was submitted earlier.
published_pipeline1 = pipeline_run.publish_pipeline(
    name=pipeline_name,
    description=("Predict Employee Retention Model training/retraining "
                 "pipeline (from submitted PipelineRun)"),
    version="0.1",
    continue_on_step_failure=True,
)

print(f"Newly published pipeline id: {published_pipeline1.id}")

# Last expression of the cell: displays the published pipeline.
published_pipeline1

Note: the *continue_on_step_failure* parameter specifies whether the execution of steps in the Pipeline will continue if one step fails. The default value is False, meaning when one step fails, the Pipeline execution will stop, canceling any running steps.

### Option 2: from a Pipeline

In [None]:
from datetime import datetime

# Prefix the pipeline name with a minute-resolution timestamp (MM-DD-YYYY-HH-MM).
timenow = datetime.now().strftime('%m-%d-%Y-%H-%M')
pipeline_name = timenow + "-Employee Retention Pipeline"
print(pipeline_name)

# Publish directly from the Pipeline object rather than from a submitted run.
publish_settings = dict(
    name=pipeline_name,
    description="Predict Employee Retention Model training/retraining pipeline (from Pipeline)",
    version="0.2",
    continue_on_step_failure=True,
)
published_pipeline2 = pipeline.publish(**publish_settings)

print("Newly published pipeline id: {}".format(published_pipeline2.id))

# Keep the object as the cell's last expression so the notebook renders it.
published_pipeline2

## H. Run published pipeline using its REST endpoint

In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication
import requests

# Authenticate interactively and obtain the header that authorizes the REST call.
auth = InteractiveLoginAuthentication()
aad_token = auth.get_authentication_header()

# Either published pipeline can be triggered. To run the one published from the
# submitted PipelineRun instead, use:
#rest_endpoint = published_pipeline1.endpoint
rest_endpoint = published_pipeline2.endpoint

print("You can perform HTTP POST on URL {} to trigger this pipeline".format(rest_endpoint))

# specify the param when running the pipeline
response = requests.post(rest_endpoint, 
                         headers=aad_token, 
                         json={"ExperimentName": "predict_employee_retention_pipeline",
                               "RunSource": "SDK",
                               "ParameterAssignments": {"pipeline_arg": 45}})

# Surface HTTP errors (bad token, unknown endpoint, rejected parameters, ...)
# as an explicit HTTPError instead of a confusing failure on the "Id" lookup.
response.raise_for_status()
run_id = response.json()["Id"]

print('Submitted pipeline run: ', run_id)

You should be able to see the submitted pipeline (given by the run_id) in **Azure ML studio > Experiments > predict_employee_retention_pipeline > click on the Run ID**.



In [None]:
## I. Create a scoring script

This tutorial covers how to create a scoring script that receives data submitted to a deployed web service and passes it to the model, and then takes the response returned by the model and returns that to the client. 
This script is specific to your model.

[Deploy models with Azure Machine Learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where)

### 1. Test model scoring

In [None]:
# Download the registered model into the working directory and deserialize it.
registered_model = Model(aml_workspace, model_name)
work_dir = os.getcwd()
registered_model.download(target_dir=work_dir, exist_ok=True)
model = joblib.load(os.path.join(work_dir, model_name+'.pkl'))

# Sample payloads in the JSON shape sent to the scoring service.
# Expected prediction for the first is 1 (i.e. leave company),
# for the second 0 (i.e. stay).
raw_data1 = '{"data": [[0.1, 0.77, 6.0, 272.0, 4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]}'
raw_data2 = '{"data": [[0.76, 0.5, 4.0, 136.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]]}'

# Parse each payload, score it locally, and print the prediction.
for raw_sample in (raw_data1, raw_data2):
    features = np.array(json.loads(raw_sample)['data'])
    predicted = model.predict(features)
    print("prediction:", {"result": predicted.tolist()})

### 2. Install inference-schema

This example is to illustrate how to use the automatic schema generation.

Before you proceed, run the following pip command to install this package into your local Anaconda environment:
    
+ pip install inference-schema[numpy-support]


### 3. Create scoring script

This section is to create a scoring script for your model.

[Deploy models with Azure Machine Learning](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where)

#### Specify the environment to run the script and save it as conda_dependencies.yml

* You must indicate azureml-defaults as a pip dependency, because it contains the functionality needed to host the model as a web service.
* If you want to use automatic schema generation, your entry script must also import the inference-schema packages.

#### Create conda dependencies in YAML file

In [None]:
# Pip packages to install in the scoring environment.
scoring_pip_packages = [
    'azureml-sdk',
    'azureml-defaults',
    'scikit-learn==0.20.3',
    'inference-schema[numpy-support]',
]
env = CondaDependencies.create(pip_packages=scoring_pip_packages)

# Persist the dependency specification as YAML.
with open("conda_dependencies.yml", "w") as yaml_out:
    yaml_out.write(env.serialize_to_string())

# Read the file back and print it for review.
with open("conda_dependencies.yml", "r") as yaml_in:
    print(yaml_in.read())

#### Write your scoring script and save it as score.py

In [None]:
%%writefile score.py

import numpy as np
# NOTE: sklearn.externals.joblib exists only in older scikit-learn releases;
# the conda dependencies written by this notebook pin scikit-learn==0.20.3.
from sklearn.externals import joblib
from azureml.core.model import Model
from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import (
    NumpyParameterType,
)

# Example request and response arrays handed to the schema decorators on run().
input_sample = np.array([[0.76, 0.5, 4.0, 136.0, 3.0, 0.0, 0.0, 1.0,
                          0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]])
output_sample = np.array([1])


def init():
    """Load the registered model into the module-level ``model`` global."""
    global model
    # Resolve the model file location from the registered model name.
    model = joblib.load(
        Model.get_model_path('predict-employee-retention-model'))


@input_schema('data', NumpyParameterType(input_sample))
@output_schema(NumpyParameterType(output_sample))
def run(data):
    """Predict on ``data`` with the loaded model and return a plain list."""
    predictions = model.predict(data)
    return predictions.tolist()


#### Copy your script to the scoring folder.

In [None]:
import os

# Make sure the destination is an existing directory. If it were missing,
# shutil.copy would treat `scoring_folder` as a file name, and the second copy
# would overwrite the first instead of placing both files inside the folder.
os.makedirs(scoring_folder, exist_ok=True)

# Stage the environment definition and the entry script for deployment.
shutil.copy('conda_dependencies.yml', scoring_folder)
shutil.copy('score.py', scoring_folder)

## J. Deploy as an Azure Container Instances webservice

[What is Azure Container Instances?](https://docs.microsoft.com/en-us/azure/container-instances/container-instances-overview)

### 1. Create and deploy ACI webservice

This step may take a while to start after you run the cell. You will see the message "Running" when it starts, and it will take a few minutes to complete.

In [None]:
%%time

# Look up the registered model to deploy.
model = Model(aml_workspace, model_name)

# Build the runtime environment from the saved conda specification.
conda_file = scoring_folder+'/conda_dependencies.yml'
env = Environment.from_conda_specification(name='myenv', file_path=conda_file)

# Pair the scoring entry script with that environment.
entry_script_path = sources_directory_train+'/'+score_script_path
inference_config = InferenceConfig(entry_script=entry_script_path, environment=env)

# Size and describe the ACI container (1 CPU core, 1 GB of memory).
aci_tags = {"data": "Predict employee retention",  "method" : "sklearn"}
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    tags=aci_tags,
    description='Predict employee retention with sklearn',
)

# Start the deployment of the model under the chosen service name ...
aci_service = Model.deploy(
    workspace=aml_workspace,
    name=aci_service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)

# ... and block until it has finished, showing progress output.
aci_service.wait_for_deployment(show_output=True)

View status and logs (for troubleshooting purposes):

In [None]:
# Report the service's current state, then dump its logs for troubleshooting.
aci_state = aci_service.state
print(aci_state)
aci_logs = aci_service.get_logs()
print(aci_logs)

Get the scoring web service's HTTP endpoint, which accepts REST client calls. This endpoint can be shared with anyone who wants to test the web service or integrate it into an application.

In [None]:
# Re-fetch the deployed service by name from the workspace. If deployment
# succeeded, "scoring_uri" is the HTTP endpoint used to test model scoring.
aci_service = Webservice(workspace=aml_workspace, name=aci_service_name)
for service_uri in (aci_service.scoring_uri, aci_service.swagger_uri):
    print(service_uri)

### 2. Test deployed ACI webservice

Send HTTP request to test the web service.

In [None]:
import requests

headers = {'Content-Type': 'application/json'}

# Data for scoring. The first payload is expected to return 1 (i.e. predict the
# employee will leave the organization), the second 0 (i.e. predict the
# employee will stay).
raw_data1 = '{"data": [[0.1, 0.77, 6.0, 272.0, 4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]}'
raw_data2 = '{"data": [[0.76, 0.5, 4.0, 136.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]]}'

# POST each payload to the scoring endpoint and show the raw response body.
for payload in (raw_data1, raw_data2):
    resp = requests.post(aci_service.scoring_uri, payload, headers=headers)
    print("POST to url", aci_service.scoring_uri)
    print("input data:", payload)
    print("prediction:", resp.text)

### 3. Delete the ACI Webservice

You can delete the ACI deployment using this API call if it is no longer required for testing:

In [None]:
# Remove the ACI web service deployment once it is no longer needed for testing.
aci_service.delete()

You can also manually delete the deployed web service which can be found in your **Azure ML Workspace > Deployments**.

## K. Deploy as an Azure Kubernetes Service webservice

[Azure Kubernetes Service (AKS)](https://docs.microsoft.com/en-us/azure/aks/intro-kubernetes)

### 1. Create the AKS cluster

This step may take a while to start after you run the cell. You will see the message "Creating" when it starts, and it will take a few minutes to complete.

[Create a new AKS cluster](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-azure-kubernetes-service#create-a-new-aks-cluster)

In [None]:
from azureml.core.compute import AksCompute, ComputeTarget

aks_name = 'aks'

# Provision with the DEV_TEST cluster purpose, i.e. a cluster meant for testing.
dev_test_config = AksCompute.provisioning_configuration(
    cluster_purpose=AksCompute.ClusterPurpose.DEV_TEST)

# Kick off creation of the cluster in the workspace ...
aks_target = ComputeTarget.create(
    workspace=aml_workspace,
    name=aks_name,
    provisioning_configuration=dev_test_config)

# ... and block until the create process completes, showing progress output.
aks_target.wait_for_completion(show_output=True)

### 2. Create and deploy AKS webservice

This step may take a while to start after you run the cell. You will see the message "Running" when it starts, and it will take a few minutes to complete.

In [None]:
# Use the default AKS web service configuration.
aks_config = AksWebservice.deploy_configuration()

# Deploy the same model and inference configuration onto the AKS cluster.
aks_deploy_args = dict(
    workspace=aml_workspace,
    name=aks_service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aks_config,
    deployment_target=aks_target,
)
aks_service = Model.deploy(**aks_deploy_args)

# Block until the deployment has finished, then report the resulting state.
aks_service.wait_for_deployment(show_output=True)
print(aks_service.state)

In [None]:
# If the webservice is deployed successfully, you can use 
# the HTTP endpoint "scoring_uri" to test model scoring. 
aks_service = Webservice(workspace=aml_workspace, name=aks_service_name)

# Report the endpoints of the AKS service fetched above, not those of the
# ACI service from the previous section.
print(aks_service.scoring_uri)
print(aks_service.swagger_uri)

### 3. Test deployed AKS webservice

In [None]:
import requests

# Need to get the authorization key, either one can be used.
primary_key, secondary_key = aks_service.get_keys()
headers = {
    'Content-Type': 'application/json',
    'Authorization': ('Bearer '+ primary_key),
}

# Data for scoring. The first payload is expected to return 1 (i.e. predict the
# employee will leave the organization), the second 0 (i.e. predict the
# employee will stay).
raw_data1 = '{"data": [[0.1, 0.77, 6.0, 272.0, 4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]}'
raw_data2 = '{"data": [[0.76, 0.5, 4.0, 136.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]]}'

# POST each payload to the scoring endpoint and show the raw response body.
for payload in (raw_data1, raw_data2):
    resp = requests.post(aks_service.scoring_uri, payload, headers=headers)
    print("POST to url", aks_service.scoring_uri)
    print("input data:", payload)
    print("prediction:", resp.text)

### 4. Delete the AKS Webservice

In [None]:
# Remove the AKS web service deployment.
aks_service.delete()

You can also manually delete the deployed web service which can be found in your **Azure ML Workspace > Deployments**.