In [1]:
from azureml.core import Workspace
import azureml.core

ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with Azureml-SDK-WS02


In [25]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'defaults_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

defaults_pipeline


# Compute cluster

In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = 'my-cluster-001'

try:
    compute_cluster = ComputeTarget(workspace = ws,
                                   name = cluster_name)
    print('Found existing cluster, use it')
except ComputeTargetException:
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size = 'STANDARD_DS11_V',
                                                              max_nodes = 2)
        compute_cluster = ComputeTarget.create(ws,
                                              cluster_name,
                                              compute_config)
        compute_cluster.wait_for_completion(show_output = True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it


# Register Environment (Other way)

In [33]:
%%writefile $experiment_folder/experiment_env.yml
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Writing defaults_pipeline/experiment_env.yml


In [None]:
experiment_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/experiment_env.yml")
experiment_env.register(workspace=ws)


# Register Environment 

In [5]:
from azureml.core import Environment, Experiment, ScriptRunConfig
from azureml.core.runconfig import RunConfiguration
from azureml.core.environment import CondaDependencies


myenv = Environment(name = 'MyEnvironment')

myenv_dep = CondaDependencies.create(conda_packages = ['scikit-learn',
                                                      'pandas',
                                                      'numpy'])
myenv.python.conda_dependencies = myenv_dep
myenv.register(ws)


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20220113.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "MyEnvironment",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"


# Create Run Configuration

In [6]:
run_config = RunConfiguration()
run_config.target = compute_cluster
run_config.environment = myenv

# Dataset

In [36]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'defaults dataset' not in ws.datasets:
    default_ds.upload(src_dir = '.',
                     target_path = 'defaults',
                     overwrite = True,
                     show_progress = True)

    path_csv = [(default_ds, '/defaults/defaults.csv')]
    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path = path_csv)

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='Defaults',
                                description='defaults dataset',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Uploading an estimated of 245 files
Uploading ./.amlignore
Uploaded ./.amlignore, 1 files out of an estimated total of 245
Uploading ./.amlignore.amltmp
Uploaded ./.amlignore.amltmp, 2 files out of an estimated total of 245
Uploading ./Compute_Cluster with SDK.ipynb
Uploaded ./Compute_Cluster with SDK.ipynb, 3 files out of an estimated total of 245
Uploading ./Experiments SDK.ipynb
Uploaded ./Experiments SDK.ipynb, 4 files out of an estimated total of 245
Uploading ./Experiments AzureML SDK 1.ipynb
Uploaded ./Experiments AzureML SDK 1.ipynb, 5 files out of an estimated total of 245
Uploading ./Set up AzureML Workspace.ipynb
Uploaded ./Set up AzureML Workspace.ipynb, 6 files out of an estimated total of 245
Uploading ./Test-Bikes.ipynb
Uploaded ./Test-Bikes.ipynb, 7 files out of an estimated total of 245
Uploading ./test-bikes.ipynb.amltmp
Uploaded ./test-bikes.ipynb.amltmp, 8 files out of an estimated total of 245
Uploading ./Untitled.ipynb
Uploaded ./Untitled.ipynb, 9 files out of an 

# Data Prep Pipeline

In [46]:
%%writefile $experiment_folder/Dataprep_Pipeline.py
import os
import argparse
import pandas as pd
from azureml.core import Run
from argparse import ArgumentParser as AP
from sklearn.preprocessing import MinMaxScaler


# Get the arguments from pipeline job
parser = AP()
parser.add_argument('--datafolder', type = str)
args = parser.parse_args()

# Get the run context
new_run = Run.get_context()

# Get the workspace fromthe run
ws = new_run.experiment.workspace

# Read the input dataset
df = new_run.input_datasets['raw_data'].to_pandas_dataframe()

# df = pd.read_csv('defaults.csv')
dataPrep = df.drop(['ID'], axis = 1 )
all_cols = dataPrep.columns

# Check the missing values
dataNull = dataPrep.isnull().sum()

# Replace the missing values of string variable with mode
mode = dataPrep.mode().iloc[0]
cols = dataPrep.select_dtypes(include = 'object').columns

dataPrep[cols] = dataPrep[cols].fillna(mode)

# Replace numerical columns with mean
mean = dataPrep.mean()
dataPrep = dataPrep.fillna(mean)

# Create Dummy variables
dataPrep = pd.get_dummies(dataPrep, drop_first = True)

# Normalise the data
scaler = MinMaxScaler()
columns = df.select_dtypes(include = 'number').columns
dataPrep[columns] = scaler.fit_transform(dataPrep[columns])



# Create the folder if it does not exist
print("Saving Data...")
os.makedirs(args.datafolder, exist_ok=True)
path = os.path.join(args.datafloder, 'defaults_prep.csv')
dataPrep.to_csv(path, index = False, header = True)

# Log null values
for columns in all_cols:
    new_run.log(columns, dataNull[column])
    
run.complete()

Overwriting defaults_pipeline/Dataprep_Pipeline.py


# Training Pipeline

In [47]:
%%writefile $experiment_folder/Training_Pipeline.py
from azureml.core import Run
import argparser
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix


# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument('--datafolder', type = str)
args = parser.parse_args()

# Get the context of the experiment run
new_run = Run.get_context()

# Access the Workspace
ws = new_run.experiment.workspace

path = os.path.join(args.datafolder, 'defaults_prep.csv')
dataPrep = pd.read_csv(path)

# Create X and Y
y = dataPrep[['Default Next Month_Yes']]
X = dataPrep.drop(['Default Next Month_Yes'], axis = 1)

# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1234, stratify = True)

# Build the logistic Regression model
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_predict = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]

cm = confusion_matrix(y_test, y_predict)
score = lr.score(X_test, y_test)

# Create the confusion matrix dictionary
cm_dict = {'schema_type': 'confusion_matrix',
           'schema_version': 'v1',
           'data': { 'class_labels': ['N', 'Y'],
                     'matrix': cm.tolist()
                   }
          }

# Create the scored dataset and upload to outputs
X_test = X_test.reset_index(drop = True)
y_test = y_test.reset_index(drop = True)

y_prob_df = pd.DataFrame(y_prob, columns = ['Scored Probabilities'])
y_predict_df = pd.DataFrame(y_predict, columns = ['Scored Label'])

scored_dataset = pd.concat([X_test, y_test, y_predict_df, y_prob_df], axis = 1)

scored_dataset.to_csv('./outputs/defaults_scored.csv')

new_run.complete()

Writing defaults_pipeline/Training_Pipeline.py


# Define Pipeline steps

In [50]:
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineData
from azureml.core.runconfig import RunConfiguration

input_ds = ws.datasets.get('Defaults')

dataFolder = PipelineData('datafolder',
                         datastore = ws.get_default_datastore())

# step 01 - Data Preparation
dataPrep_step = PythonScriptStep(name = '01 Data Preparation',
                                 source_directory = experiment_folder,
                                 script_name = 'Dataprep_Pipeline.py',
                                 inputs = [input_ds.as_named_input('raw_data')],
                                 outputs = [dataFolder],
                                 runconfig = run_config,
                                 arguments = ['--datafolder', dataFolder])

# step 02 - Training the model
train_step = PythonScriptStep(name = '02 Train the Model',
                                 source_directory = experiment_folder,
                                 script_name = 'Training_Pipeline.py',
                                 inputs = [dataFolder],
                                 runconfig = run_config,
                                 arguments = ['--datafolder', dataFolder])
print("Pipeline steps defined")

Pipeline steps defined


# Configure and build the Pipeline

In [53]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

steps = [dataPrep_step, train_step]

new_pipeline = Pipeline(workspace = ws,
                        steps = steps)

# Create the experiment and run the pipeline
from azureml.core import Experiment

new_experiment = Experiment(workspace = ws, name = 'PïpelineExp01')
new_pipeline_run = new_experiment.submit(new_pipeline)

print("Pipeline submitted for execution.")
RunDetails(new_pipeline_run).show()
new_pipeline_run.wait_for_completion(show_output = True)

Created step 01 Data Preparation [d2655156][ebd53c01-f3cd-400c-90d9-9fcad0fb0180], (This step is eligible to reuse a previous run's output)
Created step 02 TRain the Model [304e9943][a34fbfcd-04cc-4a78-a675-a96068fcea08], (This step is eligible to reuse a previous run's output)
Submitted PipelineRun 8d50e49b-4c98-4ec0-8939-e4d9923314da
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8d50e49b-4c98-4ec0-8939-e4d9923314da?wsid=/subscriptions/f0ec0447-a406-4c0a-922d-f468c99bce13/resourcegroups/AzuremlSDKRG01/workspaces/Azureml-SDK-WS02&tid=f94bf4d9-8097-4794-adf6-a5466ca28563
Pipeline submitted for execution.


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: 8d50e49b-4c98-4ec0-8939-e4d9923314da
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/8d50e49b-4c98-4ec0-8939-e4d9923314da?wsid=/subscriptions/f0ec0447-a406-4c0a-922d-f468c99bce13/resourcegroups/AzuremlSDKRG01/workspaces/Azureml-SDK-WS02&tid=f94bf4d9-8097-4794-adf6-a5466ca28563
PipelineRun Status: Running


StepRunId: c1ecbcc6-3910-469c-9bcb-237d7d94c464
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/c1ecbcc6-3910-469c-9bcb-237d7d94c464?wsid=/subscriptions/f0ec0447-a406-4c0a-922d-f468c99bce13/resourcegroups/AzuremlSDKRG01/workspaces/Azureml-SDK-WS02&tid=f94bf4d9-8097-4794-adf6-a5466ca28563
StepRun( 01 Data Preparation ) Status: Running

Streaming azureml-logs/20_image_build_log.txt
2022/03/25 09:03:00 Downloading source code...
2022/03/25 09:03:01 Finished downloading source code
2022/03/25 09:03:02 Creating Docker network: acb_default_network, driver: 'bridge'
2022/03/25 09:03:02 Successfully set up Docker network: acb_default_network
2

ActivityFailedException: ActivityFailedException:
	Message: Activity Failed:
{
    "error": {
        "code": "UserError",
        "message": "{'code': ExecutionFailed, 'message': [{\"exit_code\":1,\"error_message\":\"Execution failed with error: Saving Data...\\nCleaning up all outstanding Run operations, waiting 300.0 seconds\\n1 items cleaning up...\\nCleanup took 0.0708913803100586 seconds\\n[stderr]Traceback (most recent call last):\\n[stderr]  File \\\"Dataprep_Pipeline.py\\\", line 53, in <module>\\n[stderr]    path = os.path.join(args.datafloder, 'defaults_prep.csv')\\n[stderr]AttributeError: 'Namespace' object has no attribute 'datafloder'\\n[stderr]\\n\",\"process_name\":\"/azureml-envs/azureml_d9271587e78e8fd4e49fcb4d1af951bc/bin/python\",\"error_file\":\"user_logs/std_log.txt\"}], 'target': , 'category': UserError, 'error_details': [{'key': exit_codes, 'value': 1}, ], 'inner_error': null}",
        "messageParameters": {},
        "details": []
    },
    "time": "0001-01-01T00:00:00.000Z"
}
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Activity Failed:\n{\n    \"error\": {\n        \"code\": \"UserError\",\n        \"message\": \"{'code': ExecutionFailed, 'message': [{\\\"exit_code\\\":1,\\\"error_message\\\":\\\"Execution failed with error: Saving Data...\\\\nCleaning up all outstanding Run operations, waiting 300.0 seconds\\\\n1 items cleaning up...\\\\nCleanup took 0.0708913803100586 seconds\\\\n[stderr]Traceback (most recent call last):\\\\n[stderr]  File \\\\\\\"Dataprep_Pipeline.py\\\\\\\", line 53, in <module>\\\\n[stderr]    path = os.path.join(args.datafloder, 'defaults_prep.csv')\\\\n[stderr]AttributeError: 'Namespace' object has no attribute 'datafloder'\\\\n[stderr]\\\\n\\\",\\\"process_name\\\":\\\"/azureml-envs/azureml_d9271587e78e8fd4e49fcb4d1af951bc/bin/python\\\",\\\"error_file\\\":\\\"user_logs/std_log.txt\\\"}], 'target': , 'category': UserError, 'error_details': [{'key': exit_codes, 'value': 1}, ], 'inner_error': null}\",\n        \"messageParameters\": {},\n        \"details\": []\n    },\n    \"time\": \"0001-01-01T00:00:00.000Z\"\n}"
    }
}

In [54]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')