In [19]:
# Azure Machine Learning SDK core
from azureml.core import Workspace
from azureml.core.model import Model

# Scikit-learn and others
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

### Load and connect to workspace

In [20]:
# Load the workspace handle from the config file stored alongside the notebook.
ws = Workspace.from_config(path="Azure_machine_learning/config.json")

### Registering model onto Azure

In [21]:
# Register the local pickle file as a named model in the workspace `ws`.
model = Model.register(
    workspace=ws,
    model_path="model.pkl",
    model_name="data_salaries_random_forest_regression",
)

Registering model data_salaries_random_forest_regression


### Test and share endpoint for marking

In [22]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Optionally turn off server certificate verification for HTTPS clients.

    When ``allowed`` is truthy, the ``PYTHONHTTPSVERIFY`` environment variable
    is unset or empty, and the ``ssl`` module exposes
    ``_create_unverified_context``, that factory is installed as
    ``ssl._create_default_https_context``. Otherwise nothing is changed.

    Args:
        allowed: Truthy to request that verification be bypassed.
    """
    if not allowed:
        return
    # An explicit PYTHONHTTPSVERIFY setting takes precedence over this bypass.
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory:
        ssl._create_default_https_context = unverified_factory

# NOTE(review): the endpoint URL below is on an Azure-hosted domain, so bypassing
# certificate verification is probably unnecessary here - confirm and remove if so.
allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

# Request data goes here
data = {
  "data": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
}

body = str.encode(json.dumps(data))

url = 'https://msa2023-phase2-azure-przto.australiaeast.inference.ml.azure.com/score'
# The primary/secondary key or AMLToken for the endpoint is read from the
# AZUREML_ENDPOINT_KEY environment variable so that it is never stored in the notebook.
# NOTE(review): a key was previously committed in this cell; it should be
# regenerated for the endpoint.
api_key = os.environ.get('AZUREML_ENDPOINT_KEY', '')
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint")

# The azureml-model-deployment header will force the request to go to a specific deployment.
headers = {
    'Content-Type': 'application/json',
    'Authorization': ('Bearer ' + api_key),
    'azureml-model-deployment': 'data-salaries-random-forest-re-1'
}

req = urllib.request.Request(url, body, headers)

try:
    # The context manager closes the HTTP response once the body has been read.
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", 'ignore'))


b'[8255.1]'


## Hyperparameter tuning

As the accuracy of the random forest model's predictions on the market segmentation dataset is not high, I want to see whether tuning the hyperparameters further can improve the model's accuracy.

In [None]:
## Connecting the workspace

# `from azureml.core import ...` does not bind the name `azureml`, so the package
# is imported explicitly here to make `azureml.core.VERSION` resolvable.
import azureml.core

ws = Workspace.from_config("Azure_machine_learning/config.json")
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

In [None]:
# Upload the cleaned CSV to the workspace's default datastore and register it
# as a tabular dataset, unless a dataset with that name is already registered.
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'market segmentation dataset' not in ws.datasets:
    # The datastore method takes the local paths through its `files` parameter
    # (the previous `file=` keyword is not accepted).
    default_ds.upload_files(files=['./cleaned_data/market_segmentation_cleaned.csv'],
                            target_path='market-segmentation/',
                            overwrite=True,
                            show_progress=True)

    # Creating a tabular dataset from the path on the datastore
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'market-segmentation/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                            name = 'market segmentation dataset',
                                            description='market segmentation data',
                                            tags = {'format':'CSV'},
                                            create_new_version = True)
        print('Dataset Registered')

    except Exception as ex:
        # Registration is best-effort: the failure is reported and the notebook continues.
        print(ex)
else:
    print("Dataset already registered")
                            

In [None]:
import os

# Local working directory that will hold the files for the HyperDrive experiment.
experiment_folder = 'market_segmentation_training-hyperdrive'

# exist_ok=True makes re-running this cell a no-op when the folder is already there.
os.makedirs(name=experiment_folder, exist_ok=True)
print("Folder ready")

In [None]:
import math

from azureml.train.hyperdrive import RandomParameterSampling, BayesianParameterSampling, BanditPolicy
from azureml.train.hyperdrive import choice, uniform, loguniform
from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

# Using Bayesian Sampling
# Bayesian sampling only supports the choice, uniform and quniform distributions,
# so the learning rate is drawn uniformly over the bounds that loguniform(-6, -1)
# would have covered, i.e. [e^-6, e^-1].
bayesian_sampling = BayesianParameterSampling({
    "--learning_rate": uniform(math.exp(-6), math.exp(-1)),
    "--batch_size": choice(16, 32, 64, 128)
})

# Define Bandit Policy
# Bayesian sampling does not support early termination, so this policy is not
# passed to the HyperDrive configuration below; it stays defined for use with
# random or grid sampling.
bandit_policy = BanditPolicy(slack_factor=0.1, evaluation_interval=2, delay_evaluation=5)

# Configure HyperDrive (no early termination policy, as required by Bayesian sampling)
# NOTE(review): `estimator` is not defined in this cell and must come from an
# earlier one; `run_config` is documented for ScriptRunConfig objects - confirm
# that `estimator` is of that kind.
hyperdrive_config = HyperDriveConfig(run_config=estimator,
                                     hyperparameter_sampling=bayesian_sampling,
                                     primary_metric_name='accuracy',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=100,
                                     max_concurrent_runs=4,
                                     policy=None)
