In [None]:
from azureml.core import Workspace, Experiment

# Connect to the workspace described by the local config file and open the
# experiment that the runs in this notebook are submitted to.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="udacity-project")

# Summarise the workspace, one field per line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print('\n'.join(workspace_summary))

# Open an interactive logging run on the experiment.
# NOTE(review): `run` is not referenced again in the visible cells — confirm
# it is completed or used somewhere, otherwise it may stay in a running state.
run = exp.start_logging()

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# NOTE(review): Azure ML compute names are documented as letters, digits and
# '-' only, with a length limit — confirm this value is a valid cluster name.
cluster_name = "MY_PROJECT_CLUSTER"

# Reuse the cluster if the workspace already has one under this name;
# otherwise provision a new one.
try:
    compute_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    cluster_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_D2_V2',
        max_nodes=4,
    )
    compute_cluster = ComputeTarget.create(ws, cluster_name, cluster_config)
else:
    print('Found existing compute target')

# Block until the cluster is ready, streaming provisioning output.
compute_cluster.wait_for_completion(show_output=True)

# Show the detailed status of the cluster.
cluster_status = compute_cluster.get_status()
print(cluster_status.serialize())

In [None]:
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn
from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import choice, uniform
from azureml.core import Environment, ScriptRunConfig
import os, logging
logging.basicConfig(level=logging.INFO)

# Hyperparameter search space, sampled at random.
# Both parameters use discrete choices: '--C' spans several orders of
# magnitude and '--max_iter' is restricted to a handful of integer values.
param_sampling = RandomParameterSampling({
    '--C': choice(0.1, 1, 10, 100, 1000),
    '--max_iter': choice(1, 50, 100, 200, 500),
})

# End poorly performing runs with an early termination policy (improves computational efficiency).
early_termination_policy = BanditPolicy(slack_factor=0.1)

# Local folder that later receives the AutoML training CSV.
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create pattern of listing the directory first.
os.makedirs("./training", exist_ok=True)

# Setup environment for your training run
sklearn_env = Environment.from_conda_specification(
    name='sklearn-env',
    file_path='conda_dependencies.yml',
)

# ScriptRunConfig describing the training job: which script to run, where,
# and in which environment. No static arguments are set here; the sampled
# '--C' / '--max_iter' values are expected to be supplied by HyperDrive.
src = ScriptRunConfig(source_directory='./',
                      script='train.py',
                      compute_target=compute_cluster,
                      environment=sklearn_env)

# Create a HyperDriveConfig.
# The name of the primary metric needs to exactly match the name of the metric logged by the training script.
hyperdrive_config = HyperDriveConfig(run_config=src,
                             hyperparameter_sampling=param_sampling,
                             policy=early_termination_policy,
                             primary_metric_name="Accuracy",
                             primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                             max_total_runs=4,
                             max_concurrent_runs=4)

In [None]:
# Launch the HyperDrive sweep on the experiment.
hyperdrive_run = exp.submit(hyperdrive_config)

# Show the live widget for all child runs, then block until the sweep ends.
hyperdrive_widget = RunDetails(hyperdrive_run)
hyperdrive_widget.show()
hyperdrive_run.wait_for_completion(show_output=True)

# Fail loudly here if the sweep did not finish successfully.
assert hyperdrive_run.get_status() == "Completed"

In [None]:
import joblib

# Best performing configuration and hyperparameter values.
best_run = hyperdrive_run.get_best_run_by_primary_metric()

# Guard against no best run being returned (presumably when no child run
# logged the primary metric) so the failure is explicit rather than an
# AttributeError on None further down.
if best_run is None:
    raise RuntimeError("HyperDrive returned no best run; check that the training script logs 'Accuracy'.")

best_run_metrics = best_run.get_metrics()

# The script arguments of the child run are expected to be a flat list of
# alternating flag/value entries (e.g. ['--C', '10', '--max_iter', '100']).
# Pair them up so each value is looked up by its flag instead of by a
# hard-coded position.
parameter_values = best_run.get_details()['runDefinition']['arguments']
best_hyperparameters = dict(zip(parameter_values[::2], parameter_values[1::2]))

print('Best Run Id: ', best_run.id)
# The metric key must match the primary metric name used in the HyperDriveConfig.
print('\n Accuracy:', best_run_metrics['Accuracy'])
print('\n Regularization strength (C):', best_hyperparameters.get('--C'))
print('\n Max iterations:', best_hyperparameters.get('--max_iter'))

# Register the model file produced by the best run.
# NOTE(review): assumes train.py saves its model to outputs/model.joblib — confirm.
model_hyperdrive = best_run.register_model(model_name='model_hyperdrive', model_path='outputs/model.joblib')

In [None]:
from azureml.data.dataset_factory import TabularDatasetFactory

# Build a TabularDataset from the CSV hosted at the URL below.
data_location = "https://automlsamplenotebookdata.blob.core.windows.net/automl-sample-notebook-data/bankmarketing_train.csv"
ds = TabularDatasetFactory.from_delimited_files(
    path=data_location,
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False,
)

In [None]:
from train import clean_data
from sklearn.model_selection import train_test_split
import pandas as pd

# Run the dataset through the cleaning helper from train.py, which returns
# the features and the label separately.
x, y = clean_data(ds)

# y is a pandas Series. Convert to dataframe.
y_df = pd.DataFrame(y, columns=['y'])

# Combine both dataframes : x and y_df.
combined_data = pd.concat([x, y_df], axis=1)

# Split data into train and test sets (fixed seed for reproducibility).
train_data, test_data = train_test_split(combined_data, test_size=0.2, random_state=42)

# Write the training split to CSV.
# index=False keeps the DataFrame index out of the file; otherwise it is
# written as an extra unnamed column that AutoML would see as a feature.
train_data.to_csv('training/automl_training_data.csv', index=False)

# get datastore
datastore_automl = ws.get_default_datastore()

# Upload the CSV to the datastore.
# overwrite=True so that re-running this cell replaces a previously uploaded
# file instead of leaving stale data in place.
datastore_automl.upload(src_dir='training/', target_path='data/', overwrite=True)

# Reference the uploaded file as a TabularDataset for AutoML.
automl_train_dataset = TabularDatasetFactory.from_delimited_files(path=[(datastore_automl, ('data/automl_training_data.csv'))])

In [None]:
from azureml.train.automl import AutoMLConfig

# Settings shared by the AutoML experiment: 3-fold cross validation,
# accuracy as the primary metric, a 30 minute experiment timeout and
# INFO-level logging.
automl_settings = dict(
    n_cross_validations=3,
    primary_metric='accuracy',
    experiment_timeout_minutes=30,
    verbosity=logging.INFO,
)

# Classification task on the uploaded training data, predicting column 'y',
# executed on the compute cluster.
automl_config = AutoMLConfig(
    task='classification',
    compute_target=compute_cluster,
    training_data=automl_train_dataset,
    label_column_name='y',
    **automl_settings,
)

# Submit the AutoML run and stream validation errors / status to the cell output.
automl_run = exp.submit(automl_config, show_output=True)

In [2]:
from azureml.widgets import RunDetails

# Show the live widget for the AutoML run, then block until it finishes.
automl_widget = RunDetails(automl_run)
automl_widget.show()
automl_run.wait_for_completion(show_output=True)

In [2]:
# Retrieve the best run together with its fitted model in a single call.
# Both names are used by the following cell to register and inspect the model.
automl_best_run, fitted_model = automl_run.get_output()

In [None]:
# Register the best AutoML model with the workspace.
# model_path must point at a file uploaded by the run; run artifacts live
# under 'outputs/' (not './output/'), and AutoML is expected to store its
# fitted pipeline there as 'model.pkl'.
automl_best_run.register_model(model_name="automl_model.pkl", model_path='outputs/model.pkl')

# Show the final estimator of the fitted pipeline via the public `steps`
# attribute rather than the private `_final_estimator`.
print(fitted_model.steps[-1][1])

In [None]:
# Delete the compute cluster now that both the HyperDrive and AutoML runs
# above have been submitted and waited on, so it is no longer needed.
compute_cluster.delete()