In [3]:
from azureml.core import Workspace, Experiment

ws = Workspace.get(name="Workspace")
exp = Experiment(workspace=ws, name="udacity-project-hyperparam")

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

Workspace name: Workspace
Azure region: westeurope
Subscription id: 72f46e0e-1451-4b79-92cd-fc8f7797bda7
Resource group: test-Jesse


In [35]:
from azureml.core.compute import ComputeTarget, AmlCompute

# from https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/training/train-on-amlcompute/train-on-amlcompute.ipynb
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [5]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform, loguniform, choice
import os

# Specify parameter sampler
# Did some reseach to find acceptable values. Chose loguniform to make the sampling work better
# Especially for the C parameter this is important, otherwise Azure would be unlikely to explore the very low values
# https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression#Testing-different-parameters-to-understand-how-accuracies-change.
# https://stackoverflow.com/questions/21816346/fine-tuning-parameters-in-logistic-regression
ps = RandomParameterSampling( {
        "C": loguniform(0.001, 1000),
        "max_iter": choice([10, 100,1000, 2000])})


# Specify a Policy
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters
policy = BanditPolicy(slack_factor = 0.1, evaluation_interval=1, delay_evaluation=5)

if "training" not in os.listdir():
    os.mkdir("./training")

# Create a SKLearn estimator for use with train.py
# SKIPPED BECAUSE ESTIMATORS ARE DEPRECIATED 
# https://knowledge.udacity.com/questions/407297
# Essentially I started following https://docs.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters
# this means I have to make a scriptrunconfig first and submit the hyperdriveconfig

In [6]:
# Submit your hyperdrive run to the experiment and show run details with the widget.

from azureml.core import Environment
from azureml.core import ScriptRunConfig
project_folder = '.'
myenv = Environment.get(workspace=ws, name="AzureML-Scikit-learn-0.20.3")
myenv = Environment.from_conda_specification(name = 'project1', file_path = './config/project1.yml')
src = ScriptRunConfig(source_directory=project_folder, 
                      script='train.py', 
                      compute_target=cpu_cluster, 
                      environment=myenv)

# Create a HyperDriveConfig using the estimator, hyperparameter sampler, and policy.
hyperdrive_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=ps,
                             policy=policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=40,
                             max_concurrent_runs=4)

hyperdrive_run = exp.submit(config=hyperdrive_config)
hyperdrive_run

Experiment,Id,Type,Status,Details Page,Docs Page
udacity-project-hyperparam,HD_16ad0962-fe8b-4a43-a093-6121e7064dff,hyperdrive,Running,Link to Azure Machine Learning studio,Link to Documentation


In [63]:
exp = Experiment(workspace=ws, name="udacity-project-hyperparam")
hyperdrive_run = list(exp.get_runs())[0]

In [7]:
from azureml.widgets import RunDetails
RunDetails(hyperdrive_run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [31]:
import joblib
# Get your best run and save the model from that run.
best_run = hyperdrive_run.get_best_run_by_primary_metric()
best_run_metrics = best_run.get_metrics()
print(best_run.get_details()['runDefinition']['arguments'])
print(best_run_metrics['Accuracy'])
files = best_run.get_file_names()
best_run.download_file(files[-1], output_file_path='./outputs/')
joblib.load('./outputs/model.joblib')

['--C', '3.145839853194323E+279', '--max_iter', '100']
0.9088012139605463


LogisticRegression(C=3.145839853194323e+279, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
# clean up compute cluster
cpu_cluster.delete()

# AutoML part

In [37]:
from azureml.data.dataset_factory import TabularDatasetFactory
# Create TabularDataset using TabularDatasetFactory
data_url = ["https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"]
ds = TabularDatasetFactory.from_delimited_files(data_url)

Current provisioning state of AmlCompute is "Deleting"



In [38]:
from train import clean_data
from sklearn.model_selection import train_test_split

# Use the clean_data function to clean your data.
x, y = clean_data(ds)
x['y'] = y
train, test = train_test_split(x, test_size=0.2, random_state=112)

In [39]:
# I have to convert the dataframe to a dataset since I'm running jupyter in VSCode locally
from azureml.core import Workspace, Dataset
def upload(ws, df, name):
    local_path = 'data/'+name+'.csv'
    train.to_csv(local_path)
    datastore = ws.get_default_datastore()
    # upload the local file from src_dir to the target_path in datastore
    datastore.upload(src_dir='data', target_path='data')
    # create a dataset referencing the cloud location
    return Dataset.Tabular.from_delimited_files(path = [(datastore, (local_path))])

train_ds = upload(ws, train, 'train')
test_ds = upload(ws, train, 'test')

Uploading an estimated of 2 files
Target already exists. Skipping upload for data\test.csv
Target already exists. Skipping upload for data\train.csv
Uploaded 0 files
Uploading an estimated of 2 files
Target already exists. Skipping upload for data\test.csv
Target already exists. Skipping upload for data\train.csv
Uploaded 0 files


In [44]:
cpu_cluster_name = "cpu-cluster-2"

# Verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)

from azureml.train.automl import AutoMLConfig

# Set parameters for AutoMLConfig
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    enable_onnx_compatible_models=True,
    task='classification',
    primary_metric='accuracy',
    training_data= train_ds,
    compute_target=cpu_cluster,
    label_column_name='y',
    validation_data = test_ds)

CreatingCurrent provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"

Current provisioning state of AmlCompute is "Deleting"


Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


In [45]:
# Submit your automl run
from azureml.core import Workspace, Experiment
experiment=Experiment(ws, "automl-test")
automl_run = experiment.submit(automl_config, show_output = False)

Running on remote.


In [48]:
automl_run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-test,AutoML_5a1a518c-0601-4968-9a98-c412529a55f3,automl,Running,Link to Azure Machine Learning studio,Link to Documentation


In [49]:
RunDetails(automl_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [119]:
exp = Experiment(workspace=ws, name="automl")
automl_run = list(exp.get_runs())[0]
# best_run = max(automl_run.get_children(), key= lambda x: float(x.get_properties()['score']))

In [23]:
automl_run.get_output()

AutoMLException: AutoMLException:
	Message: Could not find a model with valid score for metric 'accuracy'. Please ensure that at least one run was successfully completed with a valid score for the given metric.
	InnerException: None
	ErrorResponse 
{
    "error": {
        "code": "UserError",
        "message": "Could not find a model with valid score for metric 'accuracy'. Please ensure that at least one run was successfully completed with a valid score for the given metric.",
        "target": "metric",
        "inner_error": {
            "code": "NotFound",
            "inner_error": {
                "code": "ModelMissing"
            }
        }
    }
}

In [21]:
run = next(experiment.get_runs())
run.get_output()

AttributeError: 'Run' object has no attribute 'get_output'

In [123]:
best_run.register_model(model_name='best_automl',
                           tags={'type': 'automl'},
                           model_path='outputs/best_automl.pkl')

ModelPathNotFoundException: ModelPathNotFoundException:
	Message: Could not locate the provided model_path outputs/best_automl.pkl in the set of files uploaded to the run: ['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/65_job_prep-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'explanation/284fb7c3/classes.interpret.json', 'explanation/284fb7c3/eval_data_viz.interpret.json', 'explanation/284fb7c3/expected_values.interpret.json', 'explanation/284fb7c3/features.interpret.json', 'explanation/284fb7c3/global_names/0.interpret.json', 'explanation/284fb7c3/global_rank/0.interpret.json', 'explanation/284fb7c3/global_values/0.interpret.json', 'explanation/284fb7c3/local_importance_values.interpret.json', 'explanation/284fb7c3/per_class_names/0.interpret.json', 'explanation/284fb7c3/per_class_rank/0.interpret.json', 'explanation/284fb7c3/per_class_values/0.interpret.json', 'explanation/284fb7c3/rich_metadata.interpret.json', 'explanation/284fb7c3/visualization_dict.interpret.json', 'explanation/284fb7c3/ys_pred_proba_viz.interpret.json', 'explanation/284fb7c3/ys_pred_viz.interpret.json', 'explanation/fdcf31d4/classes.interpret.json', 'explanation/fdcf31d4/eval_data_viz.interpret.json', 'explanation/fdcf31d4/expected_values.interpret.json', 'explanation/fdcf31d4/features.interpret.json', 'explanation/fdcf31d4/global_names/0.interpret.json', 'explanation/fdcf31d4/global_rank/0.interpret.json', 'explanation/fdcf31d4/global_values/0.interpret.json', 'explanation/fdcf31d4/local_importance_values.interpret.json', 'explanation/fdcf31d4/per_class_names/0.interpret.json', 'explanation/fdcf31d4/per_class_rank/0.interpret.json', 'explanation/fdcf31d4/per_class_values/0.interpret.json', 'explanation/fdcf31d4/rich_metadata.interpret.json', 'explanation/fdcf31d4/visualization_dict.interpret.json', 'logs/azureml/101_azureml.log', 'logs/azureml/azureml_automl.log', 'logs/azureml/dataprep/python_span_28c0d8e5-ebc7-4820-82a9-c9286300a9bd.jsonl', 'logs/azureml/dataprep/python_span_de54a234-70f9-4cdd-8758-b5e5beebbbdb.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/conda_env_v_1_0_0.yml', 'outputs/env_dependencies.json', 'outputs/model.pkl', 'outputs/pipeline_graph.json', 'outputs/scoring_file_v_1_0_0.py']
                See https://aka.ms/run-logging for more details.
	InnerException None
	ErrorResponse 
{
    "error": {
        "message": "Could not locate the provided model_path outputs/best_automl.pkl in the set of files uploaded to the run: ['accuracy_table', 'automl_driver.py', 'azureml-logs/55_azureml-execution-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/65_job_prep-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/70_driver_log.txt', 'azureml-logs/75_job_post-tvmps_379bd59b4ee2fc95a6ae436f5949353b329ca20aacce1fd89cd8a82a394c7d1f_d.txt', 'azureml-logs/process_info.json', 'azureml-logs/process_status.json', 'confusion_matrix', 'explanation/284fb7c3/classes.interpret.json', 'explanation/284fb7c3/eval_data_viz.interpret.json', 'explanation/284fb7c3/expected_values.interpret.json', 'explanation/284fb7c3/features.interpret.json', 'explanation/284fb7c3/global_names/0.interpret.json', 'explanation/284fb7c3/global_rank/0.interpret.json', 'explanation/284fb7c3/global_values/0.interpret.json', 'explanation/284fb7c3/local_importance_values.interpret.json', 'explanation/284fb7c3/per_class_names/0.interpret.json', 'explanation/284fb7c3/per_class_rank/0.interpret.json', 'explanation/284fb7c3/per_class_values/0.interpret.json', 'explanation/284fb7c3/rich_metadata.interpret.json', 'explanation/284fb7c3/visualization_dict.interpret.json', 'explanation/284fb7c3/ys_pred_proba_viz.interpret.json', 'explanation/284fb7c3/ys_pred_viz.interpret.json', 'explanation/fdcf31d4/classes.interpret.json', 'explanation/fdcf31d4/eval_data_viz.interpret.json', 'explanation/fdcf31d4/expected_values.interpret.json', 'explanation/fdcf31d4/features.interpret.json', 'explanation/fdcf31d4/global_names/0.interpret.json', 'explanation/fdcf31d4/global_rank/0.interpret.json', 'explanation/fdcf31d4/global_values/0.interpret.json', 'explanation/fdcf31d4/local_importance_values.interpret.json', 'explanation/fdcf31d4/per_class_names/0.interpret.json', 'explanation/fdcf31d4/per_class_rank/0.interpret.json', 'explanation/fdcf31d4/per_class_values/0.interpret.json', 'explanation/fdcf31d4/rich_metadata.interpret.json', 'explanation/fdcf31d4/visualization_dict.interpret.json', 'logs/azureml/101_azureml.log', 'logs/azureml/azureml_automl.log', 'logs/azureml/dataprep/python_span_28c0d8e5-ebc7-4820-82a9-c9286300a9bd.jsonl', 'logs/azureml/dataprep/python_span_de54a234-70f9-4cdd-8758-b5e5beebbbdb.jsonl', 'logs/azureml/job_prep_azureml.log', 'logs/azureml/job_release_azureml.log', 'outputs/conda_env_v_1_0_0.yml', 'outputs/env_dependencies.json', 'outputs/model.pkl', 'outputs/pipeline_graph.json', 'outputs/scoring_file_v_1_0_0.py']\n                See https://aka.ms/run-logging for more details."
    }
}

In [57]:
import joblib
# Get your best run and save the model from that run.
best_run = automl_run
best_run_metrics = best_run.get_metrics()
print(best_run.get_details()['runDefinition']['arguments'])
files = best_run.get_file_names()
best_run.download_file(files[-1], output_file_path='./outputs/best_automl.joblib')
joblib.load('./outputs/best_automl.joblib')

KeyError: 'runDefinition'

In [61]:
best_run, fitted_model = automl_run.get_output()
print(best_run)
print(fitted_model)

AttributeError: 'Experiment' object has no attribute 'get_output'

In [59]:
best_run, best_model = automl_run.get_output()
best_run.register_model(model_name = 'automl_best_model.pkl', model_path = './outputs/')

AttributeError: 'Run' object has no attribute 'get_output'

In [86]:
azureml-train-automl-runtime.VERSION

NameError: name 'azureml' is not defined

In [None]:
joblib.dump(model, 'best_train_model')