<center><h1>Hyperparameter Tuning using HyperDrive</h1></center>

### Import Dependencies.

In [82]:
from azureml.core import Workspace, ScriptRunConfig, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core import Model
from azureml.core.resource_configuration import ResourceConfiguration

from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

from azureml.train.hyperdrive.run import PrimaryMetricGoal
from azureml.train.hyperdrive.policy import BanditPolicy
from azureml.train.hyperdrive.sampling import RandomParameterSampling
from azureml.train.hyperdrive import choice
from azureml.train.hyperdrive.runconfig import HyperDriveConfig
from azureml.train.hyperdrive.parameter_expressions import uniform

from azureml.data.dataset_factory import TabularDatasetFactory 
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import os
import json

### Set up Workspace

In [3]:
# Name of the experiment under which all runs of this notebook are grouped.
experiment_name = 'HotelBookingDemand'

# Load the workspace from the local configuration, then open the experiment in it.
ws = Workspace.from_config()
experiment = Experiment(workspace=ws, name=experiment_name)

In [4]:
cpu_cluster_name = "udacity-casptone-project"

# Reuse the compute target with this name when the workspace already has it;
# otherwise provision a new AmlCompute cluster under the same name.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="Standard_D2_V2",
        max_nodes=4,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print('Found existing cluster, use it.')

# Block until the cluster reports that it is ready, echoing its status.
cpu_cluster.wait_for_completion(show_output=True)

Found existing cluster, use it.

Running


In [83]:
# Data
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

# Read the hotel bookings CSV as a tabular dataset and convert it to pandas.
ds = TabularDatasetFactory.from_delimited_files(
    "https://raw.githubusercontent.com/ketcx/ml-ops-exercise/master/data/hotel_bookings_clean.csv",
    validate=True,
    include_path=False,
    infer_column_types=True,
    set_column_types=None,
    separator=',',
    header=True,
    partition_format=None,
    support_multi_line=False,
    empty_as_string=False,
)
df = ds.to_pandas_dataframe()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119210 entries, 0 to 119209
Data columns (total 24 columns):
hotel                          119210 non-null object
is_canceled                    119210 non-null int64
lead_time                      119210 non-null int64
stays_in_weekend_nights        119210 non-null int64
stays_in_week_nights           119210 non-null int64
adults                         119210 non-null int64
children                       119210 non-null int64
babies                         119210 non-null int64
meal                           119210 non-null object
country                        119210 non-null object
market_segment                 119210 non-null object
distribution_channel           119210 non-null object
is_repeated_guest              119210 non-null int64
booking_changes                119210 non-null int64
deposit_type                   119210 non-null object
agent                          119210 non-null int64
company                        1192

In [87]:
# Show the first "City Hotel" booking; it is reused below as the example scoring request.
is_city_hotel = df['hotel'] == "City Hotel"
df[is_city_hotel].head(1)

Unnamed: 0,hotel,is_canceled,lead_time,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,Room,net_cancelled
40047,City Hotel,0,6,0,2,1,0,0,HB,PRT,Offline TA/TO,TA/TO,0,0,No Deposit,6,0,0,Transient,0.0,0,0,1,0


### Hyperdrive Configuration


In [6]:
# Early termination policy: Bandit policy with a slack factor of 0.1,
# applied every 2 evaluation intervals.
early_termination_policy = BanditPolicy(evaluation_interval=2, slack_factor=0.1)

# Parameter sampler: random sampling of C from a uniform range [0.01, 100]
# and of max_iter from a fixed set of choices.
# NOTE(review): presumably train.py reads these as the LogisticRegression
# regularization strength and iteration cap - confirm against the script.
param_sampling = RandomParameterSampling(
    {
        'C': uniform(0.01, 100),
        'max_iter': choice(100, 1000, 10000),
    }
)

# Make sure the local "training" folder exists. exist_ok=True makes the cell
# safe to re-run and replaces the manual listdir-then-mkdir check.
os.makedirs("./training", exist_ok=True)

# Extra pip packages installed into the training environment.
azureml_pip_packages = [
    'azureml-defaults', 'azureml-contrib-interpret', 'azureml-telemetry', 'azureml-interpret'
]

# Estimator that runs train.py from the current directory on the CPU cluster.
# NOTE: the SDK warns that the 'SKLearn' estimator is deprecated in favour of
# 'ScriptRunConfig'; it is kept here so the existing setup keeps working.
estimator = SKLearn(
    entry_script='./train.py',
    source_directory=".",
    compute_target=cpu_cluster,
    pip_packages=azureml_pip_packages,
)

# HyperDrive config: maximise the 'Accuracy' metric over at most 4 runs,
# all of which may run concurrently.
hyperdrive_run_config = HyperDriveConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='Accuracy',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=4,
    max_concurrent_runs=4,
)

'SKLearn' estimator is deprecated. Please use 'ScriptRunConfig' from 'azureml.core.script_run_config' with your own defined environment or the AzureML-Tutorial curated environment.


In [12]:
# Submit the HyperDrive configuration to the experiment; `run` is the parent
# HyperDrive run that the following cells monitor and query.
run = experiment.submit(config=hyperdrive_run_config, show_output=True)

## Run Details

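Each HyperDrive child run trains the model with a different `C` (regularization strength) and `max_iter` value. The best run used `max_iter = 1000` and `C ≈ 76.63`, reaching an accuracy of about 0.796.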

In [13]:
# Display the run-details widget for the HyperDrive run.
RunDetails(run).show()
# Wait for the HyperDrive run to finish, streaming its log output into the cell.
run.wait_for_completion(show_output=True)

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

RunId: HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c
Web View: https://ml.azure.com/experiments/HotelBookingDemand/runs/HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c?wsid=/subscriptions/c643ea10-d987-475a-a877-13474b748947/resourcegroups/VioxAI/workspaces/Experiments

Streaming azureml-logs/hyperdrive.txt

"<START>[2021-01-08T21:36:26.243088][API][INFO]Experiment created<END>\n""<START>[2021-01-08T21:36:26.880014][GENERATOR][INFO]Trying to sample '4' jobs from the hyperparameter space<END>\n""<START>[2021-01-08T21:36:27.823200][GENERATOR][INFO]Successfully sampled '4' jobs, they will soon be submitted to the execution target.<END>\n"<START>[2021-01-08T21:36:28.8271352Z][SCHEDULER][INFO]The execution environment is being prepared. Please be patient as it can take a few minutes.<END>

Execution Summary
RunId: HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c
Web View: https://ml.azure.com/experiments/HotelBookingDemand/runs/HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c?wsid=/subscriptions/c643ea10-d987-475a-a877-13

{'runId': 'HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c',
 'target': 'udacity-casptone-project',
 'status': 'Completed',
 'startTimeUtc': '2021-01-08T21:36:26.03429Z',
 'endTimeUtc': '2021-01-08T21:44:15.30155Z',
 'properties': {'primary_metric_config': '{"name": "Accuracy", "goal": "maximize"}',
  'resume_from': 'null',
  'runTemplate': 'HyperDrive',
  'azureml.runsource': 'hyperdrive',
  'platform': 'AML',
  'ContentSnapshotId': '1936b4bc-abe7-4e0e-b498-602ffdfd8f0c',
  'score': '0.796409697173056',
  'best_child_run_id': 'HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c_2',
  'best_metric_status': 'Succeeded'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'azureml-logs/hyperdrive.txt': 'https://experiments5909144693.blob.core.windows.net/azureml/ExperimentRun/dcid.HD_e21bb9d3-658b-42ff-8b62-b56a7d082b9c/azureml-logs/hyperdrive.txt?sv=2019-02-02&sr=b&sig=Ma2mmn04omk6s83sgnqQHItCV3JizSZX%2BD6t6Ftboow%3D&st=2021-01-08T21%3A34%3A25Z&se=2021-01-09T05%3A44%3A25Z&sp=r'}}

## Best Model

TODO: In the cell below, get the best model from the hyperdrive experiments and display all the properties of the model.

In [14]:
# Select the child run that scored best on the HyperDrive primary metric.
best_run = run.get_best_run_by_primary_metric()

# Register the model file written by that run, together with its framework,
# resource requirements, description and tags.
registration_kwargs = dict(
    model_name='HotelBookingDemand-lr',
    model_path='./outputs/modelht.pkl',
    model_framework=Model.Framework.SCIKITLEARN,
    model_framework_version='0.22.2',
    resource_configuration=ResourceConfiguration(cpu=2, memory_in_gb=4),
    description='Will the hotel booking be canceled?.',
    tags={'area': 'hospitality', 'type': 'classification'},
)
model = best_run.register_model(**registration_kwargs)

# Show the metrics logged by the best run, then the registered model.
print(best_run.get_metrics())

print(model)

{'Regularization Strength:': 76.62969490511352, 'Max iterations:': 1000, 'Accuracy': 0.796409697173056}
Model(workspace=Workspace.create(name='Experiments', subscription_id='c643ea10-d987-475a-a877-13474b748947', resource_group='VioxAI'), name=HotelBookingDemand-lr, id=HotelBookingDemand-lr:1, version=1, tags={'area': 'hospitality', 'type': 'classification'}, properties={})


In [None]:
#TODO: Save the best model

## Model Deployment

Remember that you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [18]:
service_name = 'hotelbookinght'

# Deploy the registered model as a web service. No inference or deployment
# configuration is passed here.
# TODO: decide between this no-code deployment and one with a custom scoring script.
service = Model.deploy(workspace=ws, name=service_name, models=[model])

TODO: In the cell below, send a request to the web service you deployed to test it.

In [21]:
# Block until the deployment of the web service finishes, printing its progress.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.......................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [27]:
def get_value(key, value):
    """Return the label-encoded code of ``value`` within column ``key`` of ``df``.

    A new ``LabelEncoder`` is fitted on ``df[key]`` on every call, and the code
    assigned to ``value`` is looked up among the fitted classes.

    NOTE(review): this assumes the training script encoded the column the same
    way - confirm against train.py.

    Args:
        key: Name of the column of the notebook-level ``df`` to encode.
        value: One of the values occurring in that column.

    Returns:
        The code the encoder assigns to ``value``.

    Raises:
        KeyError: If ``value`` is not among the classes found in ``df[key]``.
    """
    encoder = LabelEncoder().fit(df[key])
    labels = encoder.classes_
    codes = encoder.transform(labels)
    code_by_label = {label: code for label, code in zip(labels, codes)}

    return code_by_label[value]

In [90]:
# This is just an example and can be improved.

# Raw feature values of one booking, unpacked into one variable per column.
(
    hotel, lead_time, stays_in_weekend_nights, stays_in_week_nights,
    adults, children, babies, meal, country, market_segment,
    distribution_channel, is_repeated_guest, booking_changes, deposit_type,
    agent, company, days_in_waiting_list, customer_type, adr,
    required_car_parking_spaces, total_of_special_requests, Room, net_cancelled,
) = [
    'City Hotel', 6, 0, 2, 1, 0, 0, 'HB', 'PRT', 'Offline TA/TO', 'TA/TO',
    0, 0, 'No Deposit', 6, 0, 0, 'Transient', 0.0, 0, 0, 1, 0,
]

# Replace each categorical text value with its integer label code.
hotel = int(get_value('hotel', hotel))
meal = int(get_value('meal', meal))
country = int(get_value('country', country))
market_segment = int(get_value('market_segment', market_segment))
distribution_channel = int(get_value('distribution_channel', distribution_channel))
deposit_type = int(get_value('deposit_type', deposit_type))
customer_type = int(get_value('customer_type', customer_type))

In [91]:
import json

# Values of the 23 feature columns of the dataset, as a single row.
feature_row = [
    hotel, lead_time, stays_in_weekend_nights, stays_in_week_nights,
    adults, children, babies, meal, country, market_segment,
    distribution_channel, is_repeated_guest, booking_changes, deposit_type,
    agent, company, days_in_waiting_list, customer_type, adr,
    required_car_parking_spaces, total_of_special_requests, Room, net_cancelled,
]

# Request body: the rows to score and the model method to call.
request_body = {'data': [feature_row], 'method': 'predict'}
input_payload = json.dumps(request_body)

# Send the request to the deployed web service.
output = service.run(input_payload)

# A predicted label of 1 means the booking is expected to be canceled.
if int(output['predict'][0]) == 1:
    answer = "Yes"
else:
    answer = "No"
print("Will the hotel booking be canceled? {}".format(answer))

Will the hotel booking be canceled? No


TODO: In the cell below, print the logs of the web service and delete the service

In [92]:
# Print the web service logs first, so they stay visible in the notebook
# after the service is gone.
print(service.get_logs())

# Delete the deployed web service.
service.delete()