UC: Full pipeline to be launched every month
Data ingestion from a business app in order to feed the Blob Storage


#Add Environment Creation
#Add Compute engine on & off

# Pipeline 0 : reading (data_ingestion) connected with Blob Storage
# Pipeline 1 : data processing 
# Pipeline 2 : hyperparameters + best model  
# Pipeline 3 : register best model + save pipeline + schedule every month
# Final steps : monitoring

- integrate metrics - ok 
- integrate fairness 
- integrate graph explainability
- inference control (best model + drift detector)
- add a graph (learning curve)

# + feed new data from PowerApps  
# Azure DevOps ? AzureMlOps ?
# carbon cost associated with the experimentation
# Azure Key Vault for the subscription key


#- register the model in the model library
#- display all metrics in one tab for all child runs, not one by one
#- update blob storage from the datastore: is it automatic or not?
#- UI: display the fairness dashboard in the preview screen


In [None]:
%%writefile conda_dependencies.yml

# Conda specification written to conda_dependencies.yml by the cell magic above.
dependencies:
- python=3.6.2
- pip:
  - azureml-defaults
  - keras
  # pip only accepts the ".*" wildcard with == and !=, so "any release up to
  # the 2.4 series" is expressed as an exclusive upper bound instead.
  - tensorflow<2.5
  - numpy
  - scikit-learn
  - pandas
  - matplotlib
  - raiwidgets
  - fairlearn==0.4.6
  - azureml-contrib-fairness

In [None]:
#!pip install azureml-contrib-fairness
#!pip install fairlearn==0.4.6
#!pip install raiwidgets

In [162]:
import os

from azureml.core import Dataset
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core import Workspace, Dataset
from azureml.train.automl import AutoMLConfig
from azureml.core import Workspace,RunConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep
from azureml.train.automl.utilities import get_primary_metrics
from azureml.data.datapath import DataPath
from azureml.core.datastore import Datastore
from azureml.pipeline.core import InputPortBinding
from azureml.core import Run
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import PipelineParameter
from azureml.core import Dataset
from azureml.core import Environment
from azureml.pipeline.core import PipelineData, TrainingOutput
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.data import OutputFileDatasetConfig
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.core.conda_dependencies import CondaDependencies
from azureml.train.hyperdrive import *
from azureml.widgets import RunDetails
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
from azureml.core import Run
from azureml.core import get_run


# Azure coordinates of the workspace (subscription, resource group, name).
subscription_id, resource_group, workspace_name = (
    "a0f4cddc-a66a-4dcc-9df7-ccbd7f81bf7b",
    "learning",
    "training_MLservices",
)
# Name of the compute target the pipeline steps are submitted to.
compute_engine = "jcharley3"
# Name of the registered dataset read at the start of the pipeline.
dataset_name = "trainingdataset"

# Step 0: environment
# Build the run environment from the conda specification file.
myenv = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='./conda_dependencies.yml',
)

# Local 'data/' folder under the current working directory; created if missing.
# (Relies on the standard-library `os` module being imported at the top of the cell.)
data_folder = os.path.join(os.getcwd(), 'data/')
os.makedirs(data_folder, exist_ok=True)

# Connect to the workspace described by the local Azure ML config file.
ws = Workspace.from_config()
ws.get_details()

# Step 0: data ingestion
# Latest registered version of the raw dataset.
raw_ds = Dataset.get_by_name(ws, name=dataset_name, version='latest')
datastore = ws.get_default_datastore()
# Intermediate dataset: passed as the output of step 1 and as the input of
# the training script, so the two steps are chained through it.
fileConfig = OutputFileDatasetConfig(name='file_dataset')

# Step 1: data preparation
# Command-line arguments for data_prep.py: the raw dataset as a named input
# and the intermediate dataset as the output location.
prep_arguments = [
    '--raw-ds', raw_ds.as_named_input('raw_data'),
    '--output-dir', fileConfig,
]
step1 = PythonScriptStep(
    name='prepare data',
    script_name='data_prep.py',
    source_directory='scripts',
    arguments=prep_arguments,
    compute_target=compute_engine,
    allow_reuse=True,
)


# Step 2: training
# Run configuration of the training script; it reads the intermediate dataset
# produced by step 1 and runs inside the conda-based environment.
training_arguments = ["--input-dir", fileConfig.as_input()]
train_src = ScriptRunConfig(
    source_directory='scripts',
    script='model_script2.py',
    arguments=training_arguments,
    compute_target=compute_engine,
    environment=myenv,
)


# Step 2: hyperparameter search
# Discrete candidate values; one combination is drawn at random per child run.
search_space = {
    "--n_estimators": choice(15, 50, 100, 200, 300),
    "--criterion": choice("gini", "entropy"),
    "--max_depth": choice(2, 8, 16),
}
param_sampling = RandomParameterSampling(search_space)

# HyperDrive maximizes the 'precision' metric over the sampled runs.
hd_config = HyperDriveConfig(
    run_config=train_src,
    hyperparameter_sampling=param_sampling,
    primary_metric_name='precision',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=3,  # 1 for testing
    # NOTE(review): concurrency (4) is higher than the total number of
    # runs (3) — confirm whether one of the two values should change.
    max_concurrent_runs=4,
)


# Pipeline outputs of the HyperDrive step.
# NOTE(review): the SDK reference describes TrainingOutput's `metric` as an
# AutoML option and `model_file` as the HyperDrive one — confirm which applies
# here; an earlier attempt used model_file="model/save_model.pkl".
model_training_output = TrainingOutput(type="Model", metric="Precision")
saved_model = PipelineData(
    name='saved_model',
    datastore=datastore,
    pipeline_output_name='model_output',
    training_output=model_training_output,
)

# Metrics of the child runs, exposed under the 'metrics_output' name.
metrics_training_output = TrainingOutput(type='Metrics')
metrics_data = PipelineData(
    name='metrics_data',
    datastore=datastore,
    pipeline_output_name='metrics_output',
    training_output=metrics_training_output,
)

# Pipeline step wrapping the HyperDrive configuration.
hd_step = HyperDriveStep(
    name='hyperparameters',
    hyperdrive_config=hd_config,
    outputs=[metrics_data, saved_model],
    allow_reuse=True,
)



# step 3 : register best model
# the best model is retrieved directly from the saved_model PipelineData
#register_model = PythonScriptStep(name = 'Model Registration',
#                                  script_name = 'scripts/register_model.py',
#                                  arguments = ["--saved-model", saved_model],
#                                  inputs = [saved_model],
#                                  compute_target = compute_engine
#                                  )



# Step 3: build, run and publish the pipeline
# The pipeline chains data preparation and the hyperparameter search.
pipeline_steps = [step1, hd_step]
pipeline = Pipeline(
    workspace=ws,
    steps=pipeline_steps,
    description="test-pipeline_3",
)

# Submit a run under the 'end-to-end-demo' experiment, forcing every step to
# regenerate its outputs instead of reusing earlier results.
pipeline_run = pipeline.submit("end-to-end-demo", regenerate_outputs=True)

# Publish the pipeline so that it can be scheduled by id.
published_pipeline1 = pipeline.publish(
    name="Template_Pipeline_Notebook",
    description="Published Pipeline Description",
    version="1.0",
)


# Step 3: schedule the published pipeline to run once a day.
# NOTE(review): the notes at the top of this notebook mention a monthly
# launch — confirm the intended frequency.
recurrence = ScheduleRecurrence(frequency='Day', interval=1)
recurring_schedule = Schedule.create(
    ws,
    name='DailySchedule',
    description='Once a day',
    pipeline_id=published_pipeline1.id,
    experiment_name='Schedule_endtoend_demo_Pipelines',
    recurrence=recurrence,
)


'enabled' is deprecated. Please use the azureml.core.runconfig.DockerConfiguration object with the 'use_docker' param instead.


Created step prepare data [2459e553][b3d8f5e7-e5c5-4cae-9691-46d5ff1039f1], (This step will run and generate new outputs)
Created step hyperparameters [dc929dc0][8919451f-cf4c-44bb-884c-054267954178], (This step will run and generate new outputs)
Submitted PipelineRun 1e124bcb-0b05-4d45-812c-2bd0da1d8b87
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1e124bcb-0b05-4d45-812c-2bd0da1d8b87?wsid=/subscriptions/a0f4cddc-a66a-4dcc-9df7-ccbd7f81bf7b/resourcegroups/learning/workspaces/training_MLservices&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


In [165]:
# Download the best model exposed by the pipeline run.
import pickle

# 'model_output' is the pipeline output name given to the saved_model PipelineData.
best_model_output = pipeline_run.get_pipeline_output("model_output")

# Download into the current directory and keep the number of files fetched.
num_file_downloaded = best_model_output.download(
    '.',
    show_progress=True,
)

# Last expression of the cell: shows the output reference in the notebook.
best_model_output



ErrorResponseException: Unknown error

In [None]:
# Load the downloaded model file.
# NOTE: pickle.load can execute arbitrary code contained in the file; only
# load model files that were produced by trusted pipeline runs.
model_path = best_model_output._path_on_datastore
with open(model_path, "rb") as model_file:
    best_model = pickle.load(model_file)
best_model

# Show the steps of the loaded model (presumably a pipeline-like object —
# TODO confirm against the training script).
best_model.steps

In [None]:
# Retrieve the metrics of all child runs from the pipeline output and show
# them as a single table.
import json

# pandas is needed for the DataFrame below and is not imported elsewhere.
import pandas as pd

metrics_output = pipeline_run.get_pipeline_output('metrics_output')
num_file_downloaded = metrics_output.download('.', show_progress=True)

# The file is opened at the datastore-relative path, which assumes the
# download into '.' reproduces that layout locally — TODO confirm.
with open(metrics_output._path_on_datastore, encoding="utf-8") as f:
    metrics_output_result = f.read()

deserialized_metrics_output = json.loads(metrics_output_result)
df = pd.DataFrame(deserialized_metrics_output)
# Last expression of the cell: renders the metrics table in the notebook.
df

In [None]:
# Inspect the HyperDrive step of a given pipeline run: print hyperparameters
# and metrics of every child run, then those of the best run.
from azureml.core import Experiment
from azureml.pipeline.core import PipelineRun, StepRun, PortDataReference
from azureml.pipeline.steps import HyperDriveStepRun

# PipelineRun expects an Experiment object, not the experiment name.
experiment = Experiment(ws, "end-to-end-demo")
pipeline_run = PipelineRun(experiment, "9cc33344-0383-4750-8aef-9e07a0942760")

# Get the HyperDriveStep of the pipeline by name (make sure only 1 exists).
# The lookup uses the `name` given to HyperDriveStep ('hyperparameters'),
# not the Python variable that holds the step.
step_runs = pipeline_run.find_step_run("hyperparameters")
if not step_runs:
    raise ValueError(
        "No step named 'hyperparameters' found in pipeline run " + pipeline_run.id
    )
step_run = step_runs[0]

# Wrap the generic step run to get the HyperDrive-specific accessors.
hd_step_run = HyperDriveStepRun(step_run=step_run)

# Get all hyperparameters that were tried
hyperparameters = hd_step_run.get_hyperparameters()

# Get all metrics for the runs
metrics = hd_step_run.get_metrics()

# Iterate through all runs and print metrics + hyperparameters
for run_id, hp in hyperparameters.items():
    print(run_id, "===========")
    print("Hyperparameters:\n", hp)
    # A child run may have no metrics entry (e.g. it failed before logging).
    print("Metrics:\n", metrics.get(run_id))

# Just for the best run; there is none when no child run logged the primary metric.
best_run = hd_step_run.get_best_run_by_primary_metric()
if best_run is None:
    print("BEST RUN: none (no child run reported the primary metric)")
else:
    best_run_id = best_run.id
    print("BEST RUN:", best_run_id)
    print("Hyperparameters for best run:\n", hyperparameters.get(best_run_id))
    print("Metrics of best run:\n", metrics.get(best_run_id))

In [None]:

# Step 2 bis (alternative to step 2): run AutoML instead of HyperDrive.
automl_settings = {
    "iteration_timeout_minutes": 10,
    "iterations": 4,
    # 0.25 h = 15 min. The previous 0.10 h (6 min) was shorter than a single
    # iteration's 10-minute timeout.
    # NOTE(review): 15 minutes is believed to be the minimum AutoML accepts — confirm.
    "experiment_timeout_hours": 0.25,
    # 'Precision' is not one of the AutoML classification primary metrics;
    # the weighted precision score is the closest supported one.
    "primary_metric": 'precision_score_weighted',
}

aml_run_config = RunConfiguration()

# Tabular view over the delimited files written by the data preparation step.
# `prepared_ds` was never defined; the only intermediate dataset in this
# notebook is `fileConfig`, so it is used here.
# NOTE(review): confirm that data_prep.py writes delimited (CSV) files there.
prepared_data = fileConfig.read_delimited_files()

automl_config = AutoMLConfig(task='classification',
                             path='.',
                             debug_log='automated_ml_errors.log',
                             # Same compute target as the other pipeline steps
                             # (was the literal 'jcharley2' — confirm).
                             compute_target=compute_engine,
                             run_configuration=aml_run_config,
                             featurization='auto',
                             # A single dataset object, not a list.
                             training_data=prepared_data,
                             label_column_name='EmployeeTargeted',
                             **automl_settings)

# add to the pipeline
step2_bis = AutoMLStep(name='AutoML',
                       automl_config=automl_config,
                       passthru_automl_config=False,
                       #outputs=[metrics_data,model_data],
                       enable_default_model_output=False,
                       enable_default_metrics_output=False,
                       allow_reuse=True)

