## Model Training 

In [None]:
import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.core.dataset import Dataset
from azureml.train.automl import AutoMLConfig

In [7]:
# Connect to the Azure ML workspace.
# No arguments are passed, so the workspace details are resolved by
# Workspace.from_config() itself rather than being spelled out in this notebook.
ws = Workspace.from_config()

#### <font color='blue'> Challenge 4</font>:
Get the compute cluster that you created in the previous lab and complete the cell below.
1. Define a variable for your cluster name
2. Verify that the cluster does not exist already. If the cluster doesn't exist, create one.

Tips: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-create-attach-compute-cluster?tabs=python

In [3]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Choose a name for your CPU cluster
cpu_cluster_name = ""

# Verify that the cluster does not exist already. If it doesn't exist, create one.
# Bind the resulting compute target to a variable named `compute_target`:
# the AutoMLConfig cell further down passes `compute_target=compute_target`.
'''
Code goes here 
'''

#### <font color='blue'> Challenge 5</font>:
In the previous lab, you pre-processed the data that we are going to use for the training and registered it as a dataset. Now you need to use the dataset for training. Write a single line of code below that gets the training dataset.

Tips: https://docs.microsoft.com/en-us/azure/machine-learning/how-to-version-track-datasets#:~:text=An%20Azure%20Machine%20Learning%20dataset.%20Register%20and%20retrieve,a%20specific%20version%20by%20name%20and%20version%20number.

In [5]:
# Get the dataset that we prepared earlier.
# Assign it to a variable named `training_ds`: the AutoMLConfig cell below
# passes `training_data = training_ds`.
'''
Code goes here
'''

'\nCode goes here\n'

### Automl Configuration

Instantiate an AutoMLConfig object. This defines the settings and data used to run the experiment.

|Property|Description|
|-|-|
|**task**|classification or regression or forecasting|
|**primary_metric**|This is the metric that you want to optimize. Classification supports the following primary metrics: <br><i>accuracy</i><br><i>AUC_weighted</i><br><i>average_precision_score_weighted</i><br><i>norm_macro_recall</i><br><i>precision_score_weighted</i>|
|**iteration_timeout_minutes**|Time limit in minutes for each iteration.|
|**blocked_models** | *List* of *strings* indicating machine learning algorithms for AutoML to avoid in this run. <br><br> Allowed values for **Classification**<br><i>LogisticRegression</i><br><i>SGD</i><br><i>MultinomialNaiveBayes</i><br><i>BernoulliNaiveBayes</i><br><i>SVM</i><br><i>LinearSVM</i><br><i>KNN</i><br><i>DecisionTree</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>GradientBoosting</i><br><i>TensorFlowDNN</i><br><i>TensorFlowLinearClassifier</i><br><br>Allowed values for **Regression**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><br>Allowed values for **Forecasting**<br><i>ElasticNet</i><br><i>GradientBoosting</i><br><i>DecisionTree</i><br><i>KNN</i><br><i>LassoLars</i><br><i>SGD</i><br><i>RandomForest</i><br><i>ExtremeRandomTrees</i><br><i>LightGBM</i><br><i>TensorFlowLinearRegressor</i><br><i>TensorFlowDNN</i><br><i>Arima</i><br><i>Prophet</i>|
|**allowed_models** |  *List* of *strings* indicating machine learning algorithms for AutoML to use in this run. Same values listed above for **blocked_models** allowed for **allowed_models**.|
|**experiment_exit_score**| Value indicating the target for *primary_metric*. <br>Once the target is surpassed the run terminates.|
|**experiment_timeout_hours**| Maximum amount of time in hours that all iterations combined can take before the experiment terminates.|
|**enable_early_stopping**| Flag to enable early termination if the score is not improving in the short term.|
|**featurization**| 'auto' / 'off'  Indicator for whether featurization step should be done automatically or not. Note: If the input data is sparse, featurization cannot be turned on.|
|**n_cross_validations**|Number of cross validation splits.|
|**training_data**|Input dataset, containing both features and label column.|
|**label_column_name**|The name of the label column.|

#### <font color='blue'> Challenge 6</font>:
1. Choose an appropriate primary metric to use for the AutoML training and explain why. Add it to the **automl_settings** dictionary below as `"primary_metric": <value>`.

Tips: _You can learn more about primary metrics_ [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-configure-auto-train#primary-metric)

In [11]:
# `logging` is needed for the "verbosity" setting below; it is imported here
# because no other cell in this notebook imports it.
import logging

# Settings forwarded to AutoMLConfig as keyword arguments.
# Challenge 6: add your chosen metric here as  "primary_metric": <value>.
automl_settings = {
    "experiment_timeout_hours" : 0.3,      # 0.3 h = 18 minutes for all iterations combined
    "enable_early_stopping" : False,
    "iteration_timeout_minutes": 5,        # time limit for each single iteration
    "max_concurrent_iterations": 4,
    "max_cores_per_iteration": -1,         # NOTE(review): -1 is documented as "use all cores" - confirm for your SDK version
    "n_cross_validations": 3,              # number of cross validation splits
    "featurization": 'auto',
    "verbosity": logging.INFO,
}

# `compute_target` and `training_ds` must have been created in the
# Challenge 4 and Challenge 5 cells above before this cell is run.
automl_config = AutoMLConfig(task = 'classification',
                             debug_log = 'automl_errors.log',
                             compute_target=compute_target,
                             blocked_models = ['KNN','LinearSVM'],
                             enable_onnx_compatible_models=True,
                             training_data = training_ds,
                             label_column_name = 'label1',
                             **automl_settings
                            )

## Train the Models

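Call the `submit` method on the experiment object and pass the run configuration. Execution of local runs is synchronous, but this experiment is submitted to a remote compute cluster, so the call returns once the run has been submitted. Depending on the data and the number of iterations the training itself can run for a while.
In this example, we specify `show_output = False` so that the cell does not stream the running iterations to the console; progress is monitored with the widget in the next section instead.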

In [13]:
# Name of the experiment that the AutoML run is submitted under.
experiment_name = 'pdm-automl'
# Folder path kept alongside the experiment name; nothing in this cell reads it.
project_folder = './sample_projects/pdm-automl'

# Bind the experiment to the workspace connected earlier in the notebook.
experiment = Experiment(workspace=ws, name=experiment_name)

In [14]:
# Submit the AutoML configuration as a run of the experiment.
# show_output is False, so progress is not streamed into this cell.
remote_run = experiment.submit(automl_config, show_output = False)

Submitting remote run.


Experiment,Id,Type,Status,Details Page,Docs Page
pdm-automl,AutoML_7336b233-2830-4548-9d93-893ff2323647,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation


## Explore the Results

#### Widget for Monitoring Runs

The widget will first report a "loading" status while running the first iteration. After completing the first iteration, an auto-updating graph and table will be shown. The widget will refresh once per minute, so you should see the graph update as child runs complete.

**Note:** The widget displays a link at the bottom. Use this link to open a web interface to explore the individual run details.

In [2]:
from azureml.widgets import RunDetails
from azureml.core.run import Run
# Render the monitoring widget for the run submitted above.
RunDetails(remote_run).show() 

## Register Model

In [None]:
# Child run that the AutoML parent run reports as its best one; the later
# cells read the model name, scoring script and environment from it.
best_run = remote_run.get_best_child()

In [None]:
# Local path the generated scoring script is saved to; the deployment cell
# passes the same path as the entry script.
script_file_name = "inference/score.py"

# Model name as recorded in the best child run's properties.
model_name = best_run.properties["model_name"]

# Fetch the scoring script from the run's outputs into the local path above.
best_run.download_file(
    name="outputs/scoring_file_v_1_0_0.py",
    output_file_path=script_file_name,
)

In [None]:
# Register the model from the AutoML run, using the name read from the best
# child run in the previous cell.
description = "AutoML Model for predictive maintenance"
tags = None
model = remote_run.register_model(model_name=model_name, description=description, tags=tags)

# Show the model id that the run exposes after registration.
print(remote_run.model_id)

## Deploy

### Retrieve the Best Model

Below we select the best pipeline from our iterations.  The `get_output` method returns the best run and the fitted model. Overloads on `get_output` allow you to retrieve the best run and fitted model for *any* logged metric or for a particular *iteration*.

In [None]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.model import Model

# Scoring setup: the downloaded entry script plus the environment taken from
# the best child run.
inference_config = InferenceConfig(
    entry_script=script_file_name,
    environment=best_run.get_environment(),
)

# Container sizing and metadata for the ACI web service.
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=2,
    memory_gb=2,
    tags={'type': "automl_classification"},
    description='service for Automl PDM Classification',
)

aci_service_name = 'automl-remote-pdm'
print(aci_service_name)

# Deploy the registered model and wait for the deployment to finish,
# streaming its output, before printing the resulting service state.
aci_service = Model.deploy(
    workspace=ws,
    name=aci_service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
)
aci_service.wait_for_deployment(show_output=True)
print(aci_service.state)

In [None]:
import numpy as np
import azureml
from azureml.core import Workspace, Run
from azureml.core.model import Model

# Report the installed Azure ML SDK version and (re)connect to the workspace.
print("Azure ML: ", azureml.core.VERSION)
ws = Workspace.from_config()
# Disabled: look up a registered model by name instead of using `model` from the registration cell.
#model = Model(ws, "model_name")

### Get Logs from a Deployed Web Service

Gets logs from a deployed web service.

## Test and Evaluate on Holdout Set

Now that the model is trained, run the test data through the trained model to get the predicted values.  This calls the ACI web service to do the prediction.

Note that the JSON passed to the ACI web service is an array of rows of data.  Each row should either be an array of values in the same order that was used for training or a dictionary where the keys are the same as the column names used for training.  The example below uses dictionary rows.

In [None]:
# pandas is used below (pd.concat) but is not imported by any other cell.
import pandas as pd

# Build the test data matrix: the original test columns plus, for every lag
# column, its rolling mean ("MA" prefix) and rolling standard deviation
# ("STD" prefix).
# NOTE(review): `test_df`, `lag_cols` and `lag_window` are not created in the
# cells shown in this notebook; they must already exist in the kernel
# (presumably from the data-preparation lab) - confirm before Run All.
rolling_lags = test_df[lag_cols].rolling(window=lag_window)

df_mean = rolling_lags.mean()
df_mean.columns = ['MA' + col for col in lag_cols]

df_std = rolling_lags.std()
df_std.columns = ['STD' + col for col in lag_cols]

# Column-wise concatenation, keeping only index labels present in all three frames.
df_test = pd.concat([test_df, df_mean, df_std], axis=1, join='inner')

In [None]:
# Preview the first rows of the assembled test frame.
df_test.head()

In [None]:
import pandas as pd

# Cut the head by id, due to the lagging transformation: keep only the last
# row of df_test for each distinct `id`, in order of first appearance of the id.
# The rows are collected in a list and concatenated once; DataFrame.append was
# removed in pandas 2.0 and re-copies the frame on every loop iteration.
unit_ids = df_test['id'].unique()
last_rows = [df_test[df_test['id'] == unit_id].iloc[-1:] for unit_id in unit_ids]

# pd.concat raises on an empty list, so fall back to an empty frame that keeps
# df_test's columns when there are no ids at all.
test_arrayx = pd.concat(last_rows) if last_rows else df_test.iloc[0:0]

In [None]:
# Split the per-id rows into model inputs and ground truth.
# Inputs: every column except the ones named by the first four columns.
leading_columns = test_arrayx.columns[:4]
test_x = test_arrayx.drop(columns=leading_columns)
# Ground truth: the 'label1' column.
test_y = test_arrayx['label1']

In [None]:
# `json` is needed to decode the response below and is not imported elsewhere.
import json

import requests

# Serialise the feature rows as a list of records and wrap them in the
# {"data": [...]} envelope that is posted to the scoring endpoint.
X_test_json = test_x.to_json(orient='records')
data = "{\"data\": " + X_test_json +"}"
headers = {'Content-Type': 'application/json'}

resp = requests.post(aci_service.scoring_uri, data, headers=headers)
# Fail loudly on an HTTP error instead of trying to parse an error body as JSON.
resp.raise_for_status()

# The response body is decoded twice, as in the original cell.
# NOTE(review): this presumes the service returns a JSON string whose content
# is itself JSON with a 'result' key - confirm against inference/score.py.
y_pred = json.loads(json.loads(resp.text))['result']

In [None]:
from numpy import array
# Ground-truth labels as a plain Python list, for the metric functions below.
actual = test_y.tolist()

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import itertools

# Confusion matrix of the ground-truth labels against the service predictions.
cf = confusion_matrix(actual, y_pred)
class_labels = ['no', 'yes']
tick_marks = np.arange(len(class_labels))
# Cells above half of the largest count get white text so it stays readable
# on the darker background.
thresh = cf.max() / 2.

plt.imshow(cf, cmap=plt.cm.Blues, interpolation='nearest')
plt.colorbar()
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.xticks(tick_marks, class_labels)
# Extra unlabeled ticks at -0.5 and 1.5 extend the y axis half a cell beyond
# the two labeled rows.
plt.yticks([-0.5, 0, 1, 1.5], ['', 'no', 'yes', ''])

# Write each count into its cell (x = column index, y = row index).
for (row, col), count in np.ndenumerate(cf):
    text_color = 'white' if count > thresh else 'black'
    plt.text(col, row, format(count, 'd'), horizontalalignment='center', color=text_color)
plt.show()

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# Compute fpr, tpr, thresholds and roc auc
# NOTE(review): y_pred is whatever the scoring service returned under 'result'.
# If those are hard class labels rather than scores, the curve has a single
# operating point - confirm, and prefer predicted probabilities if available.
fpr, tpr, thresholds = roc_curve(actual, y_pred)
roc_auc = auc(fpr, tpr)

# Plot ROC curve
plt.plot(fpr, tpr, label='ROC curve (area = %0.3f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # random predictions curve
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# Axis label spelling corrected ("Specificity").
plt.xlabel('False Positive Rate or (1 - Specificity)')
plt.ylabel('True Positive Rate or (Sensitivity)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")
plt.show()