# Part 3 - Hyper-parameter tuning
In this section of the lab we fine-tune the hyper-parameters of our image classifier using an Azure ML feature called `hyperdrive`.


![Transfer Learning](../images/TLArch.png)


We will run training jobs in parallel on an Azure Batch AI cluster. After the model is fine-tuned, the best version will be registered in the AML Model Registry.

![AML Arch](../images/amlarch.png)



### Create training script

We are using the same script as in the previous step.

#### Create a folder to hold the script

In [None]:
import os

# Local folder that will hold the training script.
script_folder = './script'
# exist_ok=True keeps this cell safe to re-run when the folder already exists.
os.makedirs(name=script_folder, exist_ok=True)

#### Use Jupyter `%%writefile` magic to write the script


In [None]:
%%writefile $script_folder/train.py
import argparse
import os
import random

import h5py
import numpy as np
from azureml.core import Run
from sklearn.linear_model import LogisticRegression

# sklearn.externals.joblib was removed from recent scikit-learn releases;
# prefer the standalone joblib package and fall back for older environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



# Training regime
def _load_bottleneck(file_name):
    """Read the 'features' and 'labels' datasets of an HDF5 file into NumPy arrays."""
    with h5py.File(file_name, "r") as hfile:
        features = np.array(hfile.get('features'))
        labels = np.array(hfile.get('labels'))
    return features, labels


# Training regime
def train_evaluate(run):
    """Train a logistic regression classifier on bottleneck features and log its accuracy.

    Settings are read from the module-level ``args`` namespace (``data_folder``,
    ``training_file_name``, ``validation_file_name`` and ``reg``), so ``args``
    must be populated before this function is called.

    Args:
        run: Object whose ``log(name, value)`` method records metrics; the
            script passes the current AML run.

    Side effects:
        Prints progress to stdout, logs ``regularization_rate`` and
        ``validation_acc`` on ``run``, and writes the fitted model to
        ``outputs/aerial_sklearn.pkl``.
    """
    print("Loading bottleneck features")
    train_file_name = os.path.join(args.data_folder, args.training_file_name)
    valid_file_name = os.path.join(args.data_folder, args.validation_file_name)

    # Load bottleneck training / validation features and labels
    train_features, train_labels = _load_bottleneck(train_file_name)
    valid_features, valid_labels = _load_bottleneck(valid_file_name)

    # Convert one-hot labels to integers
    y_train = np.argmax(train_labels, axis=1)
    y_valid = np.argmax(valid_labels, axis=1)

    # Train logistic regression model
    print("Starting training on")
    print("  Features:", train_features.shape)
    print("  Labels:", y_train.shape)
    clf = LogisticRegression(
        C=1.0 / args.reg,  # C is the reciprocal of the regularization rate
        multi_class='multinomial',
        solver='lbfgs',
        random_state=42)
    clf.fit(train_features, y_train)

    # Validate
    print("Starting validation")
    y_hat = clf.predict(valid_features)

    # Calculate accuracy as the fraction of matching predictions
    acc = np.average(y_hat == y_valid)
    print('Accuracy is:', acc)

    # Log to AML Experiment. The builtin float replaces the np.float alias,
    # which recent NumPy releases no longer provide.
    run.log('regularization_rate', float(args.reg))
    run.log('validation_acc', float(acc))

    # Save the trained model to outputs, which is a standard folder expected by AML
    model_file = os.path.join('outputs', 'aerial_sklearn.pkl')
    print("Saving the model to: ", model_file)
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(value=clf, filename=model_file)
    

  

if __name__ == '__main__':
    # Command-line options as (flag, add_argument keyword arguments) pairs.
    argument_specs = [
        ('--data-folder', dict(
            type=str,
            default='./bottleneck',
            help='Folder with bottleneck features and labels')),
        ('--training-file-name', dict(
            type=str,
            default='aerial_bottleneck_train.h5',
            help='Training file name')),
        ('--validation-file-name', dict(
            type=str,
            default='aerial_bottleneck_valid.h5',
            help='Validation file name')),
        # Stored as args.reg rather than args.regularization
        ('--regularization', dict(
            type=float,
            dest='reg',
            default=0.01,
            help='regularization rate')),
    ]

    parser = argparse.ArgumentParser("Training, evaluation worklfow")
    for flag, options in argument_specs:
        parser.add_argument(flag, **options)

    # train_evaluate reads the parsed options through this module-level name
    args = parser.parse_args()

    # get hold of the current run and hand it to the training routine
    run = Run.get_submitted_run()
    train_evaluate(run)
    

### Connect to AML workspace


In [None]:
import azureml.core
from azureml.core import Workspace

# Workspace handle used by every later cell in this notebook.
ws = Workspace.from_config()

# Show the workspace details, one per line.
for workspace_detail in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_detail)

### Create remote compute cluster

We will use an Azure Batch AI cluster to run hyperparameter tuning.

The cluster is set up for autoscaling. It will start with a single node and can scale up to 4 nodes. The nodes are `Standard_D4_v2` VMs, as configured in the code below.

**Creation of the cluster takes approximately 5 minutes.** If the cluster is already in the workspace this code uses it and skips the creation process.

In [None]:
from azureml.core.compute import ComputeTarget, BatchAiCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
batchai_cluster_name = ws.name + 'cpucls'
# Standard_D4_v2 is a general-purpose (CPU) VM size, matching the 'cpucls' name suffix
vim_size = 'Standard_D4_v2'

try:
    # look for the existing cluster by name
    compute_target = ComputeTarget(workspace=ws, name=batchai_cluster_name)
except ComputeTargetException:
    # no compute target with this name yet: provision an autoscaling cluster
    print('creating a new compute target...')
    compute_config = BatchAiCompute.provisioning_configuration(vm_size=vim_size,
                                                                #vm_priority='lowpriority', # optional
                                                                autoscale_enabled=True,
                                                                cluster_min_nodes=1,
                                                                cluster_max_nodes=4)

    # create the cluster
    compute_target = ComputeTarget.create(ws, batchai_cluster_name, compute_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it uses the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # Use the 'status' property to get a detailed status for the current cluster.
    print(compute_target.status.serialize())
else:
    # isinstance (rather than an exact type comparison) also accepts subclasses
    if isinstance(compute_target, BatchAiCompute):
        print('found compute target {}, just use it.'.format(batchai_cluster_name))
    else:
        # NOTE: compute_target still refers to the non-Batch AI target here;
        # pick another name and re-run this cell before submitting any runs.
        print('{} exists but it is not a Batch AI cluster. Please choose a different name.'.format(batchai_cluster_name))

### Configure datastore

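The bottleneck files have been uploaded to the workspace's default datastore during the previous step. The training runs will read them from this datastore on the nodes of the cluster; the estimator configured below requests the `bottleneck` folder with `as_download()`.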

In [None]:
from azureml.core import Datastore

# Default datastore of the workspace; later cells reference the 'bottleneck' path on it.
ds = ws.get_default_datastore()

print("Using the default datastore for training data: ")
datastore_details = (ds.name, ds.datastore_type, ds.account_name, ds.container_name)
print(*datastore_details)


## Hyperparameter tuning


Although there are multiple hyper-parameters controlling logistic regression, for the purpose of the lab we will focus on one: regularization.

So far we have executed a single training run with one value of regularization: 0.8.

Now, we are going to use an AML feature called *hyperdrive* to launch multiple runs on multiple cluster nodes using different values for regularization.


First, let's define the hyperparameter space using grid sampling


In [None]:
from azureml.train.hyperdrive import *

# Candidate values for the --regularization argument of train.py.
regularization_values = (0.01, 0.05, 0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 1.2)

# Grid sampling over that single hyperparameter.
ps = GridParameterSampling({'--regularization': choice(*regularization_values)})

Next, we will create a new estimator without the above parameters since they will be passed in later. 

In [None]:
from azureml.train.estimator import Estimator

# Folder on the datastore holding the bottleneck files, requested as a download.
bottleneck_folder = ds.path('bottleneck').as_download()

# Fixed train.py arguments; --regularization is supplied by the hyperparameter sampling.
script_params = {
    '--data-folder': bottleneck_folder,
    '--training-file-name': 'aerial_bottleneck_train_vgg16.h5',
    '--validation-file-name': 'aerial_bottleneck_valid_vgg16.h5',
}

# Packages installed with pip for the training runs.
pip_packages = ['h5py', 'pillow', 'scikit-learn']

estimator_settings = dict(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train.py',
    pip_packages=pip_packages,
)
est = Estimator(**estimator_settings)

*Hyperdrive* supports early termination policies to limit exploration of hyperparameter combinations that don't show promise of helping reach the target metric. This feature is especially useful when traversing large hyperparameter spaces. Since we are going to run a small number of jobs, we will not apply early termination.

In [None]:
# No early-termination policy is applied to the hyperdrive runs.
policy = NoTerminationPolicy()

Now we are ready to configure a run configuration object and specify the primary metric as *validation_acc*, which is recorded in our training runs. If you go back to the training script, you will notice that this value is logged at the end of every run. We also tell the service that we want to maximize this value. We set the total number of runs to 10 and the maximum number of concurrent runs to 4, which is the same as the maximum number of nodes in our compute cluster.

In [None]:
# Run budget for the sweep: total number of runs and how many may run at once.
run_limits = {'max_total_runs': 10, 'max_concurrent_runs': 4}

# Sweep definition: maximize the 'validation_acc' metric over the sampling space `ps`.
htc = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_name='validation_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    **run_limits)

Create a new experiment to capture `hyperdrive` runs.

In [None]:
from azureml.core import Experiment

# Experiment under which the hyperdrive runs are recorded.
experiment_name = 'aerial-sklearn-hyperdrive'
exp = Experiment(name=experiment_name, workspace=ws)

Finally, let's launch the hyperparameter tuning job.

In [None]:
# Tag attached to the submitted run.
tags = {"RunName": "Hyperdrive-sklearn"}

# Submit the hyperdrive configuration to the experiment.
hdr = exp.submit(config=htc, tags=tags)
# Bare expression so the notebook displays the run object as the cell output.
hdr

In [None]:
# Show the run-details widget for the hyperdrive run submitted above.
from azureml.train.widgets import RunDetails
RunDetails(hdr).show()

In [None]:
# Wait for the hyperdrive run to complete before querying its results.
hdr.wait_for_completion(show_output=True) # specify True for a verbose log

## Find and register best model
When all the jobs finish, we can find out the one that has the highest accuracy.

In [None]:
# Best child run according to the primary metric configured for the sweep ('validation_acc').
best_run = hdr.get_best_run_by_primary_metric()

In [None]:
# Metrics logged by the best run and the arguments its script was started with.
best_run_metrics = best_run.get_metrics()
parameter_values = best_run.get_details()['runDefinition']['Arguments']

# Look the value up by its flag instead of a hard-coded position, so the cell
# keeps working if the number or order of script arguments changes.
# Assumes 'Arguments' is a flat [flag, value, ...] list, as the previous fixed
# index of 7 implied.
regularization_flag = '--regularization'
regularization_value = parameter_values[parameter_values.index(regularization_flag) + 1]

print('Best Run Id: ', best_run.id)
print('\n Validation Accuracy:', best_run_metrics['validation_acc'])
print('\n Regularization:', regularization_value)

Check the output of the best run.

In [None]:
# List the files recorded for the best run.
print(best_run.get_file_names())

## Register model
The last step in the training script wrote the file `aerial_sklearn.pkl` in the `outputs` directory. As noted before, `outputs` is a special directory in that all content in this directory is automatically uploaded to your workspace. This content appears in the run record in the experiment under your workspace.

You can register the model so that it can be later queried, examined and deployed.

In [None]:
# Register the pickle that train.py saved under outputs/ as the model 'aerial_sklearn'.
model = best_run.register_model(model_name='aerial_sklearn', model_path='outputs/aerial_sklearn.pkl')
# Print the registered model's name, id and version, tab-separated.
print(model.name, model.id, model.version, sep = '\t')

## Next Step

The model is now ready for deployment

Proceed to `04-deploy`

## Clean up resources
Before you move to the next step, delete the cluster.

In [None]:
# Delete the compute cluster used in this notebook.
compute_target.delete()