In [19]:
# Azure Machine Learning SDK core
from azureml.core import Workspace
from azureml.core.model import Model

# Scikit-learn and others
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pickle

### Load and connect to workspace

In [20]:
# Connect to the Azure ML workspace described by the local config file.
ws = Workspace.from_config(path="Azure_machine_learning/config.json")

### Registering model onto Azure

In [21]:
# Register the local pickle file in the workspace under the given model name.
model = Model.register(
    workspace=ws,
    model_path="model.pkl",
    model_name="data_salaries_random_forest_regression",
)

Registering model data_salaries_random_forest_regression


### Test and share endpoint for marking

In [22]:
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Optionally disable server-certificate verification for HTTPS requests.

    When `allowed` is truthy, the PYTHONHTTPSVERIFY environment variable is
    unset or empty, and the ssl module exposes `_create_unverified_context`,
    the module-wide default HTTPS context factory is replaced by the
    unverified one. In every other case nothing is changed.
    """
    if not allowed:
        return
    # An explicit PYTHONHTTPSVERIFY setting takes precedence over this bypass.
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    unverified_factory = getattr(ssl, '_create_unverified_context', None)
    if unverified_factory:
        ssl._create_default_https_context = unverified_factory

# Needed only when the scoring service uses a self-signed certificate.
allowSelfSignedHttps(True)

# Request data goes here: a single row of 16 feature values under the "data" key.
data = {
  "data": [[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]]
}

body = str.encode(json.dumps(data))

url = 'https://msa2023-phase2-azure-przto.australiaeast.inference.ml.azure.com/score'

# The endpoint key (primary/secondary key or AMLToken) is a secret, so it is read from
# the environment rather than stored in the notebook. Any key that was previously
# committed in this cell should be regenerated for the endpoint.
api_key = os.environ.get('AZUREML_ENDPOINT_KEY', '')
if not api_key:
    raise Exception("A key should be provided to invoke the endpoint (set AZUREML_ENDPOINT_KEY)")

# The azureml-model-deployment header will force the request to go to a specific deployment.
headers = {
    'Content-Type': 'application/json',
    'Authorization': ('Bearer ' + api_key),
    'azureml-model-deployment': 'data-salaries-random-forest-re-1'
}

req = urllib.request.Request(url, body, headers)

try:
    # The context manager closes the HTTP response even if reading it fails.
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(error.read().decode("utf8", 'ignore'))


b'[8255.1]'


## TASK 3: Hyperparameter tuning

As the accuracy of the random forest model's predictions on the market segmentation dataset is not high, I want to see whether tuning the hyperparameters further can improve the model's accuracy.

In [None]:
## Connecting the workspace

# Import the package itself: `azureml.core.VERSION` below needs the name `azureml`
# to be bound, which `from azureml.core import ...` alone does not do.
import azureml.core

ws = Workspace.from_config("Azure_machine_learning/config.json")
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

In [None]:
# Uploaded dataset to azure ws
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

# Upload and register the dataset only when no dataset of this name exists in the workspace.
if 'market segmentation dataset' not in ws.datasets:
    # The keyword is `files` (plural); `file=` is not a parameter of upload_files.
    default_ds.upload_files(files=['./cleaned_data/market_segmentation_cleaned.csv'],
                            target_path='market-segmentation/',
                            overwrite=True,
                            show_progress=True)

    # Creating a tabular dataset from the path on the datastore
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'market-segmentation/*.csv'))

    # Register the tabular dataset; a failure is reported but does not stop the notebook.
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                             name='market segmentation dataset',
                                             description='market segmentation data',
                                             tags={'format': 'CSV'},
                                             create_new_version=True)
        print('Dataset Registered')

    except Exception as ex:
        print(ex)
else:
    print("Dataset already registered")
                            

### Preparing a training script

In [None]:
import os

# Directory that will hold the files for the hyperdrive experiment.
experiment_folder = 'market_segmentation_training-hyperdrive'

# Create the directory; an already existing directory is not an error.
os.makedirs(name=experiment_folder, exist_ok=True)
print("Folder ready")

In [None]:
%%writefile $experiment_folder/market_segmentation_training.py

# Import libraries
import argparse, joblib, os
from azureml.core import Run
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve


def _int_or_none(value):
    """Parse a command-line integer, mapping the literal text 'None' to None.

    Allows --max_depth to express "no depth limit" as the string 'None'.
    Any other non-integer text raises ValueError, which argparse reports as
    a usage error.
    """
    if str(value).strip().lower() == 'none':
        return None
    return int(value)


# Get the experiment run context
run = Run.get_context()

# Get script arguments
parser = argparse.ArgumentParser()

# Input dataset (the data itself is read from the run's named input below)
parser.add_argument("--input-data", type=str, dest='input_data', help='training dataset')

# Hyperparameters
parser.add_argument('--n_estimators', type=int, dest='n_estimators', default=100, help='number of estimators')
parser.add_argument('--max_depth', type=_int_or_none, dest='max_depth', default=None, help='maximum depth of the tree')

# Add arguments to args collection
args = parser.parse_args()

# Log hyperparameter values.
# Built-in int/float are used because the np.int/np.float aliases were removed in NumPy 1.24.
run.log('n_estimators', int(args.n_estimators))
run.log('max_depth', float(args.max_depth) if args.max_depth else 'None')

# load the market segmentation dataset
print("Loading Data...")
segmentation_data = run.input_datasets['training_data'].to_pandas_dataframe()

# Separate features and labels
# TODO(review): the original author flagged this split as needing an edit - confirm that
# 'Segment' is the label column and that every remaining column is a usable feature.
X = segmentation_data.drop('Segment', axis=1)
y = segmentation_data['Segment']

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Train a Random Forest classification model with the specified hyperparameters.
# A fixed random_state makes runs with different hyperparameters comparable.
print('Training a classification model')
model = RandomForestClassifier(n_estimators=args.n_estimators,
                               max_depth=args.max_depth,
                               random_state=0).fit(X_train, y_train)

# calculate accuracy
y_hat = model.predict(X_test)
acc = accuracy_score(y_test, y_hat)
print('Accuracy:', acc)
run.log('Accuracy', float(acc))

# AUC is computed (and logged) only when the label column has exactly two distinct values;
# for any other number of classes no 'AUC' metric is logged.
if len(np.unique(y)) == 2:
    y_scores = model.predict_proba(X_test)
    auc = roc_auc_score(y_test, y_scores[:,1])
    print('AUC: ' + str(auc))
    run.log('AUC', float(auc))

# Save the model in the run outputs
os.makedirs('outputs', exist_ok=True)
joblib.dump(value=model, filename='outputs/market_segmentation_model.pkl')

run.complete()

## Create compute 

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "market-segmentation-cluster"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the failure, then re-raise: without a cluster, `training_cluster` would be
        # unbound and later cells would fail with a NameError that hides the real cause.
        print(ex)
        raise


In [None]:
%%writefile $experiment_folder/hyperdrive_env.yml
# Creating file to allow a python environment to be hosted on compute
# (the %%writefile cell magic has to be the first line of the cell, so this note lives in the YAML)
name: batch_environment
dependencies:
- python=3.6.2
- scikit-learn
- pandas
- numpy
- pip
- pip:
  - azureml-defaults

### Run a hyperparameter tuning experiment

In [None]:
from azureml.core import Experiment, ScriptRunConfig, Environment
from azureml.train.hyperdrive import GridParameterSampling, HyperDriveConfig, PrimaryMetricGoal, choice
from azureml.widgets import RunDetails

# Create a Python environment for the experiment
hyper_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/hyperdrive_env.yml")

# Get the training dataset
market_segmentation_ds = ws.datasets.get("market segmentation dataset")

# Create a script config
script_config = ScriptRunConfig(source_directory=experiment_folder,
                                script='market_segmentation_training.py',
                                # Add non-hyperparameter arguments -in this case, the training dataset
                                arguments = ['--input-data', market_segmentation_ds.as_named_input('training_data')],
                                environment=hyper_env,
                                compute_target = training_cluster)

# Sample a range of parameter values
params = GridParameterSampling(
    {
        # Grid sampling tries every combination: 3 x 4 = 12, each added as script arguments
        '--n_estimators': choice(10, 50, 100),
        # None means nodes are expanded until all leaves are pure or contain less than min_samples_split samples.
        # NOTE(review): confirm how HyperDrive passes None on the command line; the script's
        # --max_depth argument must be able to parse whatever text it receives.
        '--max_depth': choice(None, 10, 20, 30)
    }
)

# Configure hyperdrive settings
hyperdrive = HyperDriveConfig(run_config=script_config,
                          hyperparameter_sampling=params,
                          policy=None,  # No early stopping policy
                          # The training script logs 'Accuracy' on every run, whereas it logs 'AUC'
                          # only when the label column is binary, so 'Accuracy' is the primary metric.
                          primary_metric_name='Accuracy',
                          primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                          max_total_runs=12,  # One run per grid combination
                          max_concurrent_runs=2)  # Run up to 2 iterations in parallel

# Run the experiment
experiment = Experiment(workspace=ws, name='mslearn-marketsegmentation-hyperdrive')
run = experiment.submit(config=hyperdrive)

# Show the status in the notebook as the experiment runs
RunDetails(run).show()
run.wait_for_completion()


### Determine the best performing run