# Run Pipelines with Azure Machine Learning Python SDK

You use the Azure Machine Learning Python SDK to orchestrate the steps of a pipeline, which run in succession or in parallel on a compute target.

# Connect to a Workspace

To connect to a workspace, we need identifier parameters - a subscription ID, resource group name, and workspace name. A `config.json` file containing these parameters can be downloaded from the Azure Machine Learning workspace or Azure portal.

In [25]:
from azureml.core import Workspace

# Build the workspace handle from the config.json located two directories up
ws = Workspace.from_config(
    path="../../config.json",
)

## Create the Experiment

In [26]:
from azureml.core import Experiment
import pandas as pd

# Experiment handle named "diabetes-train-predict" in the connected workspace
experiment = Experiment(
    name="diabetes-train-predict",
    workspace=ws,
)


## Exploratory Data Analysis Step

Plot correlation, feature-wise distributions and pairwise scatter plots

In [2]:
%%writefile src/diabetes-exploratory-plots.py
# Plot distributions step

import pandas as pd
import seaborn as sns
import os
import matplotlib.pyplot as plt
from itertools import combinations

# Create a function that we can re-use
def plot_correlations(data):
    """
    Plot the pairwise correlations between the columns of ``data``.

    The correlation matrix returned by ``data.corr()`` is printed to stdout
    and drawn as an annotated heatmap, which is saved to
    ``outputs/correlations-between-features.png`` (the ``outputs`` directory
    is created if it does not exist).

    Args:
        data: pandas DataFrame whose columns are correlated with each other.
    """
    correlation = data.corr()
    print("Correlation between features\n", correlation)

    # plt.figure makes the new figure current, so the heatmap is drawn on it
    fig = plt.figure(figsize=(10, 12))
    sns.heatmap(data=correlation, annot=True)
    plt.title("Correlation between features")

    # Save plot
    filename = "outputs/correlations-between-features.png"
    os.makedirs("outputs", exist_ok=True)
    fig.savefig(filename)

    # pyplot keeps every figure it created alive until it is explicitly
    # closed; release it once it has been written to disk.
    plt.close(fig)


def plot_distribution(var_data, column_name=None):
    """
    Print summary statistics of ``var_data`` and save its distribution plot.

    The figure has two stacked subplots: a histogram with dashed vertical
    lines at the minimum, mean, median, mode and maximum, and a horizontal
    boxplot. It is saved to ``outputs/<column_name>-distribution.png``, or to
    ``outputs/distribution.png`` when no column name is given (the
    ``outputs`` directory is created if it does not exist).

    Args:
        var_data: pandas Series holding the values of a single variable.
        column_name: Optional name of the variable, used in the printed
            header, the x-axis label, the figure title and the file name.
    """

    # Get statistics
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    # mode() can return several values; the first one is used
    mod_val = var_data.mode()[0]

    print(
        "{} Statistics:\nMinimum:{:.2f}\nMean:{:.2f}\nMedian:{:.2f}\nMode:{:.2f}\nMaximum:{:.2f}\n".format(
            "" if column_name is None else column_name,
            min_val,
            mean_val,
            med_val,
            mod_val,
            max_val,
        )
    )

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize=(10, 4))

    # Plot the histogram
    ax[0].hist(var_data)
    ax[0].set_ylabel("Frequency")

    # Add lines for the minimum, mean, median, mode and maximum. Each line
    # carries a label: without labelled artists legend() has nothing to show
    # and matplotlib emits a warning.
    markers = (
        ("Minimum", min_val, "gray"),
        ("Mean", mean_val, "cyan"),
        ("Median", med_val, "red"),
        ("Mode", mod_val, "yellow"),
        ("Maximum", max_val, "gray"),
    )
    for label, value, color in markers:
        ax[0].axvline(
            x=value, color=color, linestyle="dashed", linewidth=2, label=label
        )
    ax[0].legend()

    # Plot the boxplot
    ax[1].boxplot(var_data, vert=False)
    xlabel = "Value" if column_name is None else column_name
    ax[1].set_xlabel(xlabel)

    # Add a title to the Figure
    title = (
        "Data Distribution"
        if column_name is None
        else "{} Data Distribution".format(column_name)
    )
    fig.suptitle(title)

    # Save plot; avoid a literal "None" in the file name when no column name
    # was supplied.
    filename = (
        "outputs/distribution.png"
        if column_name is None
        else "outputs/{}-distribution.png".format(column_name)
    )
    os.makedirs("outputs", exist_ok=True)
    fig.savefig(filename)

    # This function is called once per column; close the figure so that open
    # figures do not accumulate in pyplot's registry.
    plt.close(fig)


def plot_scatters(x_y_data):
    """
    Save a scatter plot with a fitted regression line for two columns.

    The first column of ``x_y_data`` is drawn on the x-axis and the second on
    the y-axis. The plot is saved to
    ``outputs/Scatter plot of <x> vs <y>.png`` (the ``outputs`` directory is
    created if it does not exist).

    Args:
        x_y_data: pandas DataFrame whose first two columns are plotted.
    """

    x_column = x_y_data.columns.values[0]
    y_column = x_y_data.columns.values[1]

    # plt.figure makes the new figure current, so regplot draws on it
    fig = plt.figure(figsize=(10, 12))
    sns.regplot(data=x_y_data, x=x_column, y=y_column)
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title("Scatter plot of {} vs {}".format(x_column, y_column))

    # Save plot
    filename = "outputs/Scatter plot of {} vs {}.png".format(x_column, y_column)
    os.makedirs("outputs", exist_ok=True)
    fig.savefig(filename)

    # This function is called once per column pair; close the figure so that
    # open figures do not accumulate in pyplot's registry.
    plt.close(fig)


print("Loading Data...")
diabetes = pd.read_csv("../../data/diabetes.csv")

# plot correlations
plot_correlations(data=diabetes)

# plot distributions
columns = diabetes.columns.values
for x in columns:
    plot_distribution(var_data=diabetes[x], column_name=x)

# plot scatter plots
# The label and the identifier column are left out of the pairwise plots.
# Filter in DataFrame column order instead of using a set difference: the
# iteration order of a set of strings changes between interpreter runs, which
# made the x/y assignment (and therefore the output file names)
# non-deterministic.
exclude_columns = {"Diabetic", "PatientID"}
scatter_columns = [c for c in columns if c not in exclude_columns]

for x_y_pairs in combinations(scatter_columns, 2):
    # combinations yields tuples; a list is needed to select two columns
    plot_scatters(diabetes[list(x_y_pairs)])

Writing src/diabetes-exploratory-plots.py


## Model Training Step

To train a model, you'll first create the **diabetes-training.py** script in the **src** folder.

In [3]:
%%writefile src/diabetes-training.py
# import libraries
import os
import argparse
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Parse job parameters
parser = argparse.ArgumentParser()
parser.add_argument('--reg-rate', type=float, dest='reg_rate', default=0.01)
parser.add_argument('--test-size', type=float, dest='test_size', default=0.30)
# The dataset path has no usable default: require it so that a missing
# argument is reported by argparse instead of failing later in pd.read_csv.
parser.add_argument('--data-set', type=str, dest="data", required=True)
args = parser.parse_args()

reg_rate = args.reg_rate
test_size = args.test_size
# The regularization rate is inverted below (C = 1 / reg_rate), so it must be
# strictly positive.
if reg_rate <= 0:
    parser.error("--reg-rate must be greater than 0")
print("Test data size:", test_size)
print("Regularization rate:", reg_rate)

# load the diabetes dataset
print("Loading Data...")
diabetes = pd.read_csv(args.data, header=0)

print("num_samples:", diabetes.shape[0])
features = diabetes[['Pregnancies','PlasmaGlucose','DiastolicBloodPressure','TricepsThickness','SerumInsulin','BMI','DiabetesPedigree','Age']]
print("num_features:", features.shape[1])
print("features:", features.columns.values)

# separate features and labels
X = features.values
y = diabetes['Diabetic'].values

# split data into training set and test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=0)

# train a logistic regression model
print('Training a logistic regression model with regularization rate of', reg_rate)
model = LogisticRegression(C=1/reg_rate, solver="liblinear").fit(X_train, y_train)

# calculate accuracy as the fraction of correct predictions on the test set
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
print('Accuracy:', float(acc))

# Persist the trained model. The registration script in this notebook
# registers 'outputs/model.pkl', but the model was previously never written.
model_file = 'outputs/model.pkl'
os.makedirs('outputs', exist_ok=True)
with open(model_file, 'wb') as f:
    pickle.dump(model, f)
print('Model saved to', model_file)

Writing src/diabetes-training.py


## Model Registration Step

In [4]:
%%writefile src/diabetes-model-registration.py
import sklearn
from azureml.core import Model
import argparse

# `ws` is defined in the notebook kernel only; this file runs as a standalone
# script, so the workspace has to be resolved here. Imported at this point
# because the script's import block does not provide Run / Workspace.
from azureml.core import Run, Workspace

# Parse the hyperparameters that are recorded as tags on the registered model
parser = argparse.ArgumentParser()
parser.add_argument('--reg-rate', type=float, dest='reg_rate', default=0.01)
parser.add_argument('--test-size', type=float, dest='test_size', default=0.30)
args = parser.parse_args()

# Inside a submitted run the workspace is reachable through the run's
# experiment. An offline run context has no `experiment` attribute, in which
# case the workspace is loaded from a local config.json instead.
run = Run.get_context()
run_experiment = getattr(run, "experiment", None)
ws = run_experiment.workspace if run_experiment is not None else Workspace.from_config()

filename = 'outputs/model.pkl'
Model.register(
    workspace = ws,
    model_name="diabetes-classification-model",
    model_path = filename,
    description = "A LogisticRegression classification model for Diabetes",
    # Tag values are passed as strings, matching the documented
    # dict(str: str) type of `tags`.
    tags = {
        'data-format': "CSV",
        "regularization-rate": str(args.reg_rate),
        "test-size": str(args.test_size),
    },
    model_framework = Model.Framework.SCIKITLEARN,
    model_framework_version = str(sklearn.__version__)
)

Writing src/diabetes-model-registration.py


## Building the Pipeline

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep

In [6]:
%%bash 
# Print the version of the `python` interpreter found on PATH
python --version

Python 3.12.4
