#### Author: Kubam Ivo
#### Date: 7/22/2020

In [26]:
# Connect to the Azure ML workspace described by the local config file
from azureml.core import Workspace

ws = Workspace.from_config()

# Echo the workspace name as a quick sanity check that the connection worked.
workspace_name = ws.name
print('My current workspace name is: ', workspace_name)

My current workspace name is:  ml_practice


In [27]:
# Getting data
from azureml.core import Dataset

# Register the Titanic dataset only if the workspace does not have it yet.
if 'titanic_ds' not in ws.datasets.keys():
    # Source CSV files that are combined into a single tabular dataset.
    web_paths = ['https://dprepdata.blob.core.windows.net/demo/Titanic.csv',
                 'https://dprepdata.blob.core.windows.net/demo/Titanic2.csv']
    titanic_ds = Dataset.Tabular.from_delimited_files(path=web_paths)  # creating a tabular dataset

    titanic_ds.register(workspace=ws, name='titanic_ds', description=' Titanic baseline data', create_new_version=True)

# Always fetch the registered dataset. This lookup used to live inside the `if`
# above, which left `titanic_ds` undefined (NameError in later cells) whenever
# the dataset was already registered and the branch was skipped.
titanic_ds = Dataset.get_by_name(ws, 'titanic_ds')

In [28]:
# Configure storage and compute target
from azureml.core import Datastore
from azureml.core.compute import AmlCompute, ComputeTarget

# Default datastore of the workspace; used later for pipeline intermediate data.
datastore = ws.get_default_datastore()

compute_name = 'mlcompute'
if compute_name in ws.compute_targets:
    # Nothing to provision: the named compute target is already in the workspace.
    print(compute_name + ' already exist')
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D2_V2',
        min_nodes=0,
        max_nodes=1,
    )
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # Block until provisioning finishes (or the 20 minute timeout elapses).
    compute_target.wait_for_completion(show_output=True,
                                       min_node_count=None,
                                       timeout_in_minutes=20)

    # Show the result
    print(compute_target.get_status().serialize())

# In both branches, take the compute target handle from the workspace listing.
compute_target = ws.compute_targets[compute_name]

mlcompute already exist


In [29]:
# Configure the training run
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core import Environment

# Run configuration shared by the script steps; it targets the compute above.
aml_run_config = RunConfiguration()
aml_run_config.target = compute_target

# True  -> use the curated "AzureML-Tutorial" environment from the workspace.
# False -> let Azure ML build an environment from the package lists below.
USE_CURATED_ENV = True
if not USE_CURATED_ENV:
    aml_run_config.environment.python.user_managed_dependencies = False

    # Add some packages relied on by data prep step
    aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
        conda_packages=['pandas', 'scikit-learn'],
        pip_packages=['azureml-sdk[automl,explain]', 'azureml-dataprep[fuse,pandas]'],
        pin_sdk_version=False,
    )
else:
    curated_environment = Environment.get(workspace=ws, name="AzureML-Tutorial")
    aml_run_config.environment = curated_environment

In [30]:
# Data preparation pipeline step (PythonScriptStep)
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Intermediate output on the default datastore. It is handed to the script via
# --output_path and declared as the step's output. (The original cell created
# this object twice in a row; the redundant duplicate has been removed.)
prepped_data_path = PipelineData("titanic_train", datastore).as_dataset()

# NOTE: dataprep.py must be a plain Python script. The pipeline run recorded at
# the end of this notebook failed with "NameError: name 'null' is not defined"
# because the submitted dataprep.py contained notebook JSON ({"cells": [...]})
# rather than Python source.
dataprep_step = PythonScriptStep(
    name="dataprep",
    script_name="dataprep.py",
    compute_target=compute_target,
    runconfig=aml_run_config,
    arguments=["--output_path", prepped_data_path],
    inputs=[titanic_ds.as_named_input("titanic_ds")],
    outputs=[prepped_data_path],
    allow_reuse=True
)

In [31]:
# Send data to AutoMLStep
# Parse the dataprep step's parquet output; the result is passed to AutoMLConfig
# as training_data.
prepped_data = prepped_data_path.parse_parquet_files(
    file_extension=None,
)

In [32]:
# Specify automated ML outputs
from azureml.pipeline.core import TrainingOutput

# Pipeline output that receives the metrics of the AutoML step.
metrics_data = PipelineData(
    name='metrics_data',
    datastore=datastore,
    pipeline_output_name='metrics_output',
    training_output=TrainingOutput(type='Metrics'),
)

# Pipeline output that receives the model of the AutoML step; it is consumed
# by the model registration step.
model_data = PipelineData(
    name='best_model_data',
    datastore=datastore,
    pipeline_output_name='model_output',
    training_output=TrainingOutput(type='Model'),
)

In [33]:
# Configure and create the automated ML pipeline step
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

# Budget and metric settings, expanded into AutoMLConfig as keyword arguments.
automl_settings = dict(
    iteration_timeout_minutes=10,
    iterations=2,
    experiment_timeout_hours=0.25,
    primary_metric='AUC_weighted',
)

# Classification on the prepared data, predicting the 'Survived' column.
automl_config = AutoMLConfig(
    task='classification',
    path='.',
    debug_log='automated_ml_errors.log',
    compute_target=compute_target,
    run_configuration=aml_run_config,
    featurization='auto',
    training_data=prepped_data,
    label_column_name='Survived',
    **automl_settings
)

# Wrap the configuration as a pipeline step exposing metrics and model outputs.
train_step = AutoMLStep(
    name='Automl_classification',
    automl_config=automl_config,
    passthru_automl_config=False,
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

In [34]:
# PythonScriptStep that registers the trained model

from azureml.pipeline.core.graph import PipelineParameter

# The model name with which to register the trained model in the workspace.
# Exposed as a pipeline parameter with a default value.
model_name = PipelineParameter("model_name", default_value="TitanicSurvivalInitial")

# allow_reuse=False: this step is configured not to reuse a previous run's results.
register_step = PythonScriptStep(
    name="register_model",
    script_name="register_model.py",
    arguments=[
        "--model_name", model_name,
        "--model_path", model_data,
    ],
    inputs=[model_data],
    compute_target=compute_target,
    runconfig=aml_run_config,
    allow_reuse=False,
)

In [35]:
# Create and run the automated ML pipeline
from azureml.pipeline.core import Pipeline
from azureml.core import Experiment

# Steps are listed in the order: data prep, AutoML training, model registration.
pipeline_steps = [dataprep_step, train_step, register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

experiment = Experiment(workspace=ws, name='titanic_automl')

# Submit the pipeline, then wait for the run to finish.
run = experiment.submit(pipeline, show_output=True)
run.wait_for_completion()



Created step dataprep [001bb52e][ca08837e-0c7f-4563-8fc5-9e2027d1f26d], (This step will run and generate new outputs)
Created step Automl_classification [ac16f8ab][6c03bdc5-5458-4c9b-9f3a-682653b9dca9], (This step will run and generate new outputs)Created step register_model [ede90806][8c922287-562f-4504-8de9-ebd3f9368c56], (This step will run and generate new outputs)

Submitted PipelineRun a9c73a4c-bec1-4252-9209-a5732bd78ab3
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/a9c73a4c-bec1-4252-9209-a5732bd78ab3?wsid=/subscriptions/a1839f8e-ad8f-4825-ab97-e9628255f2bb/resourcegroups/azureml/workspaces/ml_practice
PipelineRunId: a9c73a4c-bec1-4252-9209-a5732bd78ab3
Link to Azure Machine Learning Portal: https://ml.azure.com/experiments/titanic_automl/runs/a9c73a4c-bec1-4252-9209-a5732bd78ab3?wsid=/subscriptions/a1839f8e-ad8f-4825-ab97-e9628255f2bb/resourcegroups/azureml/workspaces/ml_practice
PipelineRun Status: NotStarted
PipelineRun Status: R

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "User program failed with NameError: name 'null' is not defined",
        "detailsUri": "https://aka.ms/azureml-known-errors",
        "details": [],
        "debugInfo": {
            "type": "NameError",
            "message": "name 'null' is not defined",
            "stackTrace": "  File \"/mnt/batch/tasks/shared/LS_root/jobs/ml_practice/azureml/4d71cd94-1808-49d0-a5a4-10483d34519f/mounts/workspaceblobstore/azureml/4d71cd94-1808-49d0-a5a4-10483d34519f/azureml-setup/context_manager_injector.py\", line 148, in execute_with_context\n    runpy.run_path(sys.argv[0], globals(), run_name=\"__main__\")\n  File \"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\", line 263, in run_path\n    pkg_name=pkg_name, script_name=fname)\n  File \"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\", line 96, in _run_module_code\n    mod_name, mod_spec, pkg_name, script_name)\n  File \"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\", line 85, in _run_code\n    exec(code, run_globals)\n  File \"dataprep.py\", line 1, in <module>\n    {\"cells\":[{\"cell_type\":\"code\",\"source\":[\"from azureml.core import Run\\r\\n\",\"\\r\\n\",\"import pandas as pd \\r\\n\",\"import numpy as np \\r\\n\",\"import pyarrow as pa\\r\\n\",\"import pyarrow.parquet as pq\\r\\n\",\"import argparse\\r\\n\",\"\\r\\n\",\"RANDOM_SEED=42\\r\\n\",\"\\r\\n\",\"def prepare_age(df):\\r\\n\",\"    # Fill in missing Age values from distribution of present Age values \\r\\n\",\"    mean = df[\\\"Age\\\"].mean()\\r\\n\",\"    std = df[\\\"Age\\\"].std()\\r\\n\",\"    is_null = df[\\\"Age\\\"].isnull().sum()\\r\\n\",\"    # compute enough (== is_null().sum()) random numbers between the mean, std\\r\\n\",\"    rand_age = np.random.randint(mean - std, mean + std, size = is_null)\\r\\n\",\"    # fill NaN values in Age column with random values generated\\r\\n\",\"    age_slice = df[\\\"Age\\\"].copy()\\r\\n\",\"    age_slice[np.isnan(age_slice)] = rand_age\\r\\n\",\"    df[\\\"Age\\\"] = age_slice\\r\\n\",\"    df[\\\"Age\\\"] = df[\\\"Age\\\"].astype(int)\\r\\n\",\"    \\r\\n\",\"    # Quantize age into 5 classes\\r\\n\",\"    df['Age_Group'] = pd.qcut(df['Age'],5, 
labels=False)\\r\\n\",\"    df.drop(['Age'], axis=1, inplace=True)\\r\\n\",\"    return df\\r\\n\",\"\\r\\n\",\"def prepare_fare(df):\\r\\n\",\"    df['Fare'].fillna(0, inplace=True)\\r\\n\",\"    df['Fare_Group'] = pd.qcut(df['Fare'],5,labels=False)\\r\\n\",\"    df.drop(['Fare'], axis=1, inplace=True)\\r\\n\",\"    return df \\r\\n\",\"\\r\\n\",\"def prepare_genders(df):\\r\\n\",\"    genders = {\\\"male\\\": 0, \\\"female\\\": 1, \\\"unknown\\\": 2}\\r\\n\",\"    df['Sex'] = df['Sex'].map(genders)\\r\\n\",\"    df['Sex'].fillna(2, inplace=True)\\r\\n\",\"    df['Sex'] = df['Sex'].astype(int)\\r\\n\",\"    return df\\r\\n\",\"\\r\\n\",\"def prepare_embarked(df):\\r\\n\",\"    df['Embarked'].replace('', 'U', inplace=True)\\r\\n\",\"    df['Embarked'].fillna('U', inplace=True)\\r\\n\",\"    ports = {\\\"S\\\": 0, \\\"C\\\": 1, \\\"Q\\\": 2, \\\"U\\\": 3}\\r\\n\",\"    df['Embarked'] = df['Embarked'].map(ports)\\r\\n\",\"    return df\\r\\n\",\"    \\r\\n\",\"parser = argparse.ArgumentParser()\\r\\n\",\"parser.add_argument('--output_path', dest='output_path', required=True)\\r\\n\",\"args = parser.parse_args()\\r\\n\",\"    \\r\\n\",\"titanic_ds = Run.get_context().input_datasets['titanic_ds']\\r\\n\",\"df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)\\r\\n\",\"df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))\\r\\n\",\"\\r\\n\",\"os.makedirs(os.path.dirname(args.output_path), exist_ok=True)\\r\\n\",\"pq.write_table(pa.Table.from_pandas(df), args.output_path)\\r\\n\",\"\\r\\n\",\"print(f\\\"Wrote test to {args.output_path} and train to {args.output_path}\\\")\"],\"outputs\":[],\"execution_count\":null,\"metadata\":{}}],\"metadata\":{\"kernelspec\":{\"name\":\"python3-azureml\",\"language\":\"python\",\"display_name\":\"Python 3.6 - 
AzureML\"},\"language_info\":{\"name\":\"python\",\"version\":\"3.6.9\",\"mimetype\":\"text/x-python\",\"codemirror_mode\":{\"name\":\"ipython\",\"version\":3},\"pygments_lexer\":\"ipython3\",\"nbconvert_exporter\":\"python\",\"file_extension\":\".py\"},\"kernel_info\":{\"name\":\"python3-azureml\"},\"nteract\":{\"version\":\"nteract-front-end@1.0.0\"}},\"nbformat\":4,\"nbformat_minor\":2}\n"
        },
        "messageParameters": {}
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"User program failed with NameError: name 'null' is not defined\",\n        \"detailsUri\": \"https://aka.ms/azureml-known-errors\",\n        \"details\": [],\n        \"debugInfo\": {\n            \"type\": \"NameError\",\n            \"message\": \"name 'null' is not defined\",\n            \"stackTrace\": \"  File \\\"/mnt/batch/tasks/shared/LS_root/jobs/ml_practice/azureml/4d71cd94-1808-49d0-a5a4-10483d34519f/mounts/workspaceblobstore/azureml/4d71cd94-1808-49d0-a5a4-10483d34519f/azureml-setup/context_manager_injector.py\\\", line 148, in execute_with_context\\n    runpy.run_path(sys.argv[0], globals(), run_name=\\\"__main__\\\")\\n  File \\\"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\\\", line 263, in run_path\\n    pkg_name=pkg_name, script_name=fname)\\n  File \\\"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\\\", line 96, in _run_module_code\\n    mod_name, mod_spec, pkg_name, script_name)\\n  File \\\"/azureml-envs/azureml_46fb7d2fe1381cf4c90841fb72b9a774/lib/python3.6/runpy.py\\\", line 85, in _run_code\\n    exec(code, run_globals)\\n  File \\\"dataprep.py\\\", line 1, in <module>\\n    {\\\"cells\\\":[{\\\"cell_type\\\":\\\"code\\\",\\\"source\\\":[\\\"from azureml.core import Run\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"import pandas as pd \\\\r\\\\n\\\",\\\"import numpy as np \\\\r\\\\n\\\",\\\"import pyarrow as pa\\\\r\\\\n\\\",\\\"import pyarrow.parquet as pq\\\\r\\\\n\\\",\\\"import argparse\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"RANDOM_SEED=42\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"def prepare_age(df):\\\\r\\\\n\\\",\\\"    # Fill in missing Age values from distribution of present Age values \\\\r\\\\n\\\",\\\"    mean = df[\\\\\\\"Age\\\\\\\"].mean()\\\\r\\\\n\\\",\\\"    std = df[\\\\\\\"Age\\\\\\\"].std()\\\\r\\\\n\\\",\\\"    is_null = 
df[\\\\\\\"Age\\\\\\\"].isnull().sum()\\\\r\\\\n\\\",\\\"    # compute enough (== is_null().sum()) random numbers between the mean, std\\\\r\\\\n\\\",\\\"    rand_age = np.random.randint(mean - std, mean + std, size = is_null)\\\\r\\\\n\\\",\\\"    # fill NaN values in Age column with random values generated\\\\r\\\\n\\\",\\\"    age_slice = df[\\\\\\\"Age\\\\\\\"].copy()\\\\r\\\\n\\\",\\\"    age_slice[np.isnan(age_slice)] = rand_age\\\\r\\\\n\\\",\\\"    df[\\\\\\\"Age\\\\\\\"] = age_slice\\\\r\\\\n\\\",\\\"    df[\\\\\\\"Age\\\\\\\"] = df[\\\\\\\"Age\\\\\\\"].astype(int)\\\\r\\\\n\\\",\\\"    \\\\r\\\\n\\\",\\\"    # Quantize age into 5 classes\\\\r\\\\n\\\",\\\"    df['Age_Group'] = pd.qcut(df['Age'],5, labels=False)\\\\r\\\\n\\\",\\\"    df.drop(['Age'], axis=1, inplace=True)\\\\r\\\\n\\\",\\\"    return df\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"def prepare_fare(df):\\\\r\\\\n\\\",\\\"    df['Fare'].fillna(0, inplace=True)\\\\r\\\\n\\\",\\\"    df['Fare_Group'] = pd.qcut(df['Fare'],5,labels=False)\\\\r\\\\n\\\",\\\"    df.drop(['Fare'], axis=1, inplace=True)\\\\r\\\\n\\\",\\\"    return df \\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"def prepare_genders(df):\\\\r\\\\n\\\",\\\"    genders = {\\\\\\\"male\\\\\\\": 0, \\\\\\\"female\\\\\\\": 1, \\\\\\\"unknown\\\\\\\": 2}\\\\r\\\\n\\\",\\\"    df['Sex'] = df['Sex'].map(genders)\\\\r\\\\n\\\",\\\"    df['Sex'].fillna(2, inplace=True)\\\\r\\\\n\\\",\\\"    df['Sex'] = df['Sex'].astype(int)\\\\r\\\\n\\\",\\\"    return df\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"def prepare_embarked(df):\\\\r\\\\n\\\",\\\"    df['Embarked'].replace('', 'U', inplace=True)\\\\r\\\\n\\\",\\\"    df['Embarked'].fillna('U', inplace=True)\\\\r\\\\n\\\",\\\"    ports = {\\\\\\\"S\\\\\\\": 0, \\\\\\\"C\\\\\\\": 1, \\\\\\\"Q\\\\\\\": 2, \\\\\\\"U\\\\\\\": 3}\\\\r\\\\n\\\",\\\"    df['Embarked'] = df['Embarked'].map(ports)\\\\r\\\\n\\\",\\\"    return df\\\\r\\\\n\\\",\\\"    \\\\r\\\\n\\\",\\\"parser = 
argparse.ArgumentParser()\\\\r\\\\n\\\",\\\"parser.add_argument('--output_path', dest='output_path', required=True)\\\\r\\\\n\\\",\\\"args = parser.parse_args()\\\\r\\\\n\\\",\\\"    \\\\r\\\\n\\\",\\\"titanic_ds = Run.get_context().input_datasets['titanic_ds']\\\\r\\\\n\\\",\\\"df = titanic_ds.to_pandas_dataframe().drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)\\\\r\\\\n\\\",\\\"df = prepare_embarked(prepare_genders(prepare_fare(prepare_age(df))))\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"os.makedirs(os.path.dirname(args.output_path), exist_ok=True)\\\\r\\\\n\\\",\\\"pq.write_table(pa.Table.from_pandas(df), args.output_path)\\\\r\\\\n\\\",\\\"\\\\r\\\\n\\\",\\\"print(f\\\\\\\"Wrote test to {args.output_path} and train to {args.output_path}\\\\\\\")\\\"],\\\"outputs\\\":[],\\\"execution_count\\\":null,\\\"metadata\\\":{}}],\\\"metadata\\\":{\\\"kernelspec\\\":{\\\"name\\\":\\\"python3-azureml\\\",\\\"language\\\":\\\"python\\\",\\\"display_name\\\":\\\"Python 3.6 - AzureML\\\"},\\\"language_info\\\":{\\\"name\\\":\\\"python\\\",\\\"version\\\":\\\"3.6.9\\\",\\\"mimetype\\\":\\\"text/x-python\\\",\\\"codemirror_mode\\\":{\\\"name\\\":\\\"ipython\\\",\\\"version\\\":3},\\\"pygments_lexer\\\":\\\"ipython3\\\",\\\"nbconvert_exporter\\\":\\\"python\\\",\\\"file_extension\\\":\\\".py\\\"},\\\"kernel_info\\\":{\\\"name\\\":\\\"python3-azureml\\\"},\\\"nteract\\\":{\\\"version\\\":\\\"nteract-front-end@1.0.0\\\"}},\\\"nbformat\\\":4,\\\"nbformat_minor\\\":2}\\n\"\n        },\n        \"messageParameters\": {}\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}