# Train Models


## Connect to your workspace

To get started, connect to your workspace.

> **Note**: If you haven't already established an authenticated session with your Azure subscription, you'll be prompted to authenticate by clicking a link, entering an authentication code, and signing into Azure.

In [1]:
pip install argparse

Note: you may need to restart the kernel to use updated packages.


In [2]:
import azureml.core
from azureml.core import Workspace

# Load the workspace from the saved config file
ws = Workspace.from_config()
               
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with machinelearningfinal


# Prepare data

In [3]:
from azureml.core import Dataset

default_ds = ws.get_default_datastore()

if 'house prices dataset' not in ws.datasets:
    default_ds.upload_files(files=['./data/over 5 years.csv'], # Upload the diabetes csv files in /data
                        target_path='house-prices-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'house-prices-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='house prices dataset',
                                description='house prices data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Dataset already registered.


# Create scripts for pipeline steps

In [4]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'house_prediction_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

house_prediction_pipeline


## Prepare compute env

## Prep and training steps

In [5]:
%%writefile $experiment_folder/prep_houses.py
# Import libraries
import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args = parser.parse_args()
save_folder = args.prepped_data

# Get the experiment run context
run = Run.get_context()

# load the data (passed as an input dataset)
print("Loading Data...")
houses = run.input_datasets['raw_data'].to_pandas_dataframe()

# Log raw row count
row_count = (len(houses))
run.log('raw_rows', row_count)

# remove nulls
houses = houses.dropna()

# Normalize the numeric columns
scaler = MinMaxScaler()
num_cols = ['nombre_pieces_principales','surface_terrain','surface_reelle_bati']
houses[num_cols] = scaler.fit_transform(houses[num_cols])

# Log processed rows
row_count = (len(houses))
run.log('processed_rows', row_count)

# Save the prepped data
print("Saving Data...")
os.makedirs(save_folder, exist_ok=True)
save_path = os.path.join(save_folder,'data.csv')
houses.to_csv(save_path, index=False, header=True)

# End the run
run.complete()

Overwriting house_prediction_pipeline/prep_houses.py


In [6]:
%%writefile $experiment_folder/houses_pricing-training.py
# Import libraries
from azureml.core import Run,Model
import argparse
import pandas as pd
import numpy as np
import joblib
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Get parameters
parser = argparse.ArgumentParser()
parser.add_argument("--training-data", type=str, dest='training_data', help='training data')
args = parser.parse_args()
training_data = args.training_data

# Get the experiment run context
run = Run.get_context()

# load the diabetes dataset
print("Loading Data...")
file_path = os.path.join(training_data,'data.csv')
houses = pd.read_csv(file_path)

# Separate features and predicted result
X = houses[['nombre_pieces_principales','surface_terrain','surface_reelle_bati']]
y = houses['valeur_fonciere']

# Split data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)


model = LinearRegression().fit(X_train, y_train)

# calculate r2
y_pred = model.predict(X_test)
r2 = r2_score(y_test, y_pred, multioutput='variance_weighted')
print('R2: ' + str(r2))
run.log('R2', np.float(r2))

# calculate mse
mse = mean_squared_error(y_test, y_pred)
print('MSE:' + str(mse))
run.log('MSE', np.float(mse))

# calculate mae (mean absolute error)
mae = mean_absolute_error(y_test, y_pred)
print('MAE:' + str(mae))
run.log('MAE', np.float(mae))

# Save the trained model in the outputs folder
print("Saving model...")
os.makedirs('outputs', exist_ok=True)
model_file = os.path.join('outputs', 'house_prices_model.pkl')
joblib.dump(value=model, filename=model_file)
# Register the model
print('Registering model...')
Model.register(workspace=run.experiment.workspace,
               model_path = model_file,
               model_name = 'house-prices_model',
               tags={'Training context':'Pipeline'},
               properties={'R2': np.float(r2), 'MSE': np.float(mse), 'MAE': np.float(mae)})



Overwriting house_prediction_pipeline/houses_pricing-training.py


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "your-compute-cluster"

try:
    # Check for existing compute target
    pipeline_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        pipeline_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        pipeline_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Found existing cluster, use it.


In [8]:
%%writefile $experiment_folder/experiment_env.yml
name: experiment_env
dependencies:
- python=3.6.2
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  - azureml-defaults
  - pyarrow

Overwriting house_prediction_pipeline/experiment_env.yml


In [9]:
from azureml.core import Environment
from azureml.core.runconfig import RunConfiguration

# Create a Python environment for the experiment (from a .yml file)
experiment_env = Environment.from_conda_specification("experiment_env", experiment_folder + "/experiment_env.yml")

# Register the environment 
experiment_env.register(workspace=ws)
registered_env = Environment.get(ws, 'experiment_env')

# Create a new runconfig object for the pipeline
pipeline_run_config = RunConfiguration()

# Use the compute you created above. 
pipeline_run_config.target = pipeline_cluster

# Assign the environment to the run configuration
pipeline_run_config.environment = registered_env

print ("Run configuration created.")

Run configuration created.


## Create and run a pipeline

In [10]:
from azureml.data import OutputFileDatasetConfig
from azureml.pipeline.steps import PythonScriptStep

# Get the training dataset
houses_ds = ws.datasets.get("house prices dataset")

# Create an OutputFileDatasetConfig (temporary Data Reference) for data passed from step 1 to step 2
prepped_data = OutputFileDatasetConfig("prepped_data")

# Step 1, Run the data prep script
prep_step = PythonScriptStep(name = "Prepare Data",
                                source_directory = experiment_folder,
                                script_name = "prep_houses.py",
                                arguments = ['--input-data', houses_ds.as_named_input('raw_data'),
                                             '--prepped-data', prepped_data],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

# Step 2, run the training script
train_step = PythonScriptStep(name = "Train and Register Model",
                                source_directory = experiment_folder,
                                script_name = "houses_pricing-training.py",
                                arguments = ['--training-data', prepped_data.as_input()],
                                compute_target = pipeline_cluster,
                                runconfig = pipeline_run_config,
                                allow_reuse = True)

print("Pipeline steps defined")

Pipeline steps defined


In [11]:
from azureml.core import Experiment
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Construct the pipeline
pipeline_steps = [prep_step, train_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built.")

# Create an experiment and run the pipeline
experiment = Experiment(workspace=ws, name = 'mslearn-house-price_prediction-pipeline')
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
RunDetails(pipeline_run).show()
pipeline_run.wait_for_completion(show_output=True)

Pipeline is built.
Created step Prepare Data [53a7b5ac][f0f68bfe-6cd4-4253-8d4d-9311aa9c0250], (This step will run and generate new outputs)
Created step Train and Register Model [2bc62cf2][c179d360-df3a-465f-aa92-f9a831f1e1fa], (This step will run and generate new outputs)
Submitted PipelineRun 53a19b88-9f2d-4a63-8aca-8df0a296c789
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/53a19b88-9f2d-4a63-8aca-8df0a296c789?wsid=/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourcegroups/teamrocket/workspaces/machinelearningfinal&tid=413600cf-bd4e-4c7c-8a61-69e73cddf731
Pipeline submitted for execution.
PipelineRunId: 53a19b88-9f2d-4a63-8aca-8df0a296c789
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/53a19b88-9f2d-4a63-8aca-8df0a296c789?wsid=/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourcegroups/teamrocket/workspaces/machinelearningfinal&tid=413600cf-bd4e-4c7c-8a61-69e73cddf731
PipelineRun Status: NotStarted


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRun Status: Running


StepRunId: 3dd0eb03-ccc8-4549-8744-73ca0b5cd673
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3dd0eb03-ccc8-4549-8744-73ca0b5cd673?wsid=/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourcegroups/teamrocket/workspaces/machinelearningfinal&tid=413600cf-bd4e-4c7c-8a61-69e73cddf731
StepRun( Prepare Data ) Status: Running

StepRun(Prepare Data) Execution Summary
StepRun( Prepare Data ) Status: Finished
{'runId': '3dd0eb03-ccc8-4549-8744-73ca0b5cd673', 'target': 'your-compute-cluster', 'status': 'Completed', 'startTimeUtc': '2022-03-08T23:52:03.106634Z', 'endTimeUtc': '2022-03-08T23:53:40.948696Z', 'services': {}, 'properties': {'ContentSnapshotId': 'a01b331c-2143-4491-8ad1-61d133308fd8', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.moduleid': 'f0f68bfe-6cd4-4253-8d4d-9311aa9c0250', 'azureml.moduleName': 'Prepare Data', 'azureml.runsource': 'azureml.StepRun', 'azureml.nodeid': '53a7b5ac', 'azureml.pipelin

'Finished'

## Publish the pipeline

In [12]:
# Publish the pipeline from the run
published_pipeline = pipeline_run.publish_pipeline(
    name="house-prices-prediction-training-pipeline", description="Trains houses prices model", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
house-prices-prediction-training-pipeline,0beec2f4-d84b-4eba-93c5-e42225f4432b,Active,REST Endpoint


In [13]:
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready.")
import requests

experiment_name = 'mslearn-diabetes-pipeline'

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
from azureml.pipeline.core.run import PipelineRun

published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
published_pipeline_run.wait_for_completion(show_output=True)

https://eastus.api.azureml.ms/pipelines/v1.0/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourceGroups/teamrocket/providers/Microsoft.MachineLearningServices/workspaces/machinelearningfinal/PipelineRuns/PipelineSubmit/0beec2f4-d84b-4eba-93c5-e42225f4432b
Authentication header ready.
PipelineRunId: 161229cd-79f9-4213-af67-0402e950a94f
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/161229cd-79f9-4213-af67-0402e950a94f?wsid=/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourcegroups/teamrocket/workspaces/machinelearningfinal&tid=413600cf-bd4e-4c7c-8a61-69e73cddf731
PipelineRun Status: NotStarted
PipelineRun Status: Running

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '161229cd-79f9-4213-af67-0402e950a94f', 'status': 'Completed', 'startTimeUtc': '2022-03-08T23:54:14.580581Z', 'endTimeUtc': '2022-03-08T23:54:16.415946Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 

'Finished'

In [14]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Submit the Pipeline every Monday at 00:00 UTC
recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="00:00")
weekly_schedule = Schedule.create(ws, name="weekly-diabetes-training", 
                                  description="Based on time",
                                  pipeline_id=published_pipeline.id, 
                                  experiment_name='mslearn-diabetes-pipeline', 
                                  recurrence=recurrence)
print('Pipeline scheduled.')
schedules = Schedule.list(ws)
pipeline_experiment = ws.experiments.get('mslearn-diabetes-pipeline')
latest_run = list(pipeline_experiment.get_runs())[0]

latest_run.get_details()

Pipeline scheduled.


{'runId': '212a8618-c559-4c8a-8bad-00874256f453',
 'status': 'Completed',
 'startTimeUtc': '2022-03-08T23:54:21.516957Z',
 'endTimeUtc': '2022-03-08T23:54:23.299944Z',
 'services': {},
 'properties': {'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'Unavailable',
  'runType': 'Schedule',
  'azureml.parameters': '{}',
  'azureml.continue_on_step_failure': 'False',
  'azureml.pipelineComponent': 'pipelinerun',
  'azureml.pipelineid': '0beec2f4-d84b-4eba-93c5-e42225f4432b'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://machinelearnin6978024331.blob.core.windows.net/azureml/ExperimentRun/dcid.212a8618-c559-4c8a-8bad-00874256f453/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=zPj2Iw4vGaMUY37tChx8VC1MTQ3o4xTmiAmXCMg2CTk%3D&skoid=ca23d09e-9aca-4272-8800-371063aa510c&sktid=413600cf-bd4e-4c7c-8a61-69e73cddf731&skt=2022-03-08T21%3A02%3A14Z&ske=2022-03-10T05%3A12%3A14Z&sks=b&skv=2019-07-07&st=2022-03-08T23%3A44%3A30Z&se=20

You can retrieve the metrics and outputs from the **Run** object.

In [15]:
for run in pipeline_run.get_children():
    print(run.name, ':')
    metrics = run.get_metrics()
    for metric_name in metrics:
        print('\t',metric_name, ":", metrics[metric_name])

Train and Register Model :
	 R2 : -0.015234379387455423
	 MSE : 10145730144851.955
	 MAE : 1188469.1720946392
Prepare Data :
	 raw_rows : 1910
	 processed_rows : 1910


In [16]:
from azureml.core import Model

for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

house-prices_model version: 3
	 Training context : Pipeline
	 R2 : -0.015234379387455423
	 MSE : 10145730144851.955
	 MAE : 1188469.1720946392


house-prices_model version: 2
	 Training context : Pipeline
	 R2 : -0.015234379387455643
	 MSE : 10145730144851.957
	 MAE : 1188469.172094638


house-prices_model version: 1
	 Training context : Pipeline
	 R2 : -0.012891572926038597
	 MSE : 10122317342230.465
	 MAE : 1193828.0233541918


diabetes_model version: 3
	 Training context : Pipeline
	 R2 : -0.012891572926038597


diabetes_model version: 2
	 Training context : Pipeline
	 R2 : -0.012891572926038597


diabetes_model version: 1
	 Training context : Pipeline
	 R2 : -0.012891572926038597




In [17]:
# Publish the pipeline from the run
published_pipeline = pipeline_run.publish_pipeline(
    name="house-training-pipeline", description="Trains house model", version="1.0")

published_pipeline

Name,Id,Status,Endpoint
house-training-pipeline,9f6ecf8f-9173-4362-b199-1fb94db53a9e,Active,REST Endpoint


In [18]:
rest_endpoint = published_pipeline.endpoint
print(rest_endpoint)

https://eastus.api.azureml.ms/pipelines/v1.0/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourceGroups/teamrocket/providers/Microsoft.MachineLearningServices/workspaces/machinelearningfinal/PipelineRuns/PipelineSubmit/9f6ecf8f-9173-4362-b199-1fb94db53a9e


In [19]:
from azureml.core.authentication import InteractiveLoginAuthentication

interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()
print("Authentication header ready.")

Authentication header ready.


In [20]:
import requests

experiment_name = 'mslearn-house-prices-pipeline'

rest_endpoint = published_pipeline.endpoint
response = requests.post(rest_endpoint, 
                         headers=auth_header, 
                         json={"ExperimentName": experiment_name})
run_id = response.json()["Id"]
run_id

'cbb49b02-e32d-486e-9abe-a5f43c217cb8'

In [21]:
from azureml.pipeline.core.run import PipelineRun

published_pipeline_run = PipelineRun(ws.experiments[experiment_name], run_id)
published_pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: cbb49b02-e32d-486e-9abe-a5f43c217cb8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/cbb49b02-e32d-486e-9abe-a5f43c217cb8?wsid=/subscriptions/d820a79a-27a2-4720-aaad-33b53f92fa93/resourcegroups/teamrocket/workspaces/machinelearningfinal&tid=413600cf-bd4e-4c7c-8a61-69e73cddf731
PipelineRun Status: NotStarted
PipelineRun Status: Running

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'cbb49b02-e32d-486e-9abe-a5f43c217cb8', 'status': 'Completed', 'startTimeUtc': '2022-03-08T23:54:34.563924Z', 'endTimeUtc': '2022-03-08T23:54:36.47056Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'Unavailable', 'runType': 'HTTP', 'azureml.parameters': '{}', 'azureml.continue_on_step_failure': 'False', 'azureml.pipelineComponent': 'pipelinerun', 'azureml.pipelineid': '9f6ecf8f-9173-4362-b199-1fb94db53a9e'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://machin

'Finished'

In [22]:
from azureml.pipeline.core import ScheduleRecurrence, Schedule

# Submit the Pipeline every Monday at 00:00 UTC
recurrence = ScheduleRecurrence(frequency="Week", interval=1, week_days=["Monday"], time_of_day="00:00")
weekly_schedule = Schedule.create(ws, name="weekly-houses-training", 
                                  description="Based on time",
                                  pipeline_id=published_pipeline.id, 
                                  experiment_name='mslearn-houses-pipeline', 
                                  recurrence=recurrence)
print('Pipeline scheduled.')

Pipeline scheduled.


In [23]:
schedules = Schedule.list(ws)
schedules

[Pipeline(Name: weekly-houses-training,
 Id: 64a8065e-172f-4e00-a79c-d8c082049468,
 Status: Active,
 Pipeline Id: 9f6ecf8f-9173-4362-b199-1fb94db53a9e,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week),
 Pipeline(Name: weekly-diabetes-training,
 Id: 709e9909-52b5-4724-ae7f-500797f0bcfa,
 Status: Active,
 Pipeline Id: 0beec2f4-d84b-4eba-93c5-e42225f4432b,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week),
 Pipeline(Name: weekly-houses-training,
 Id: 30111224-f10e-4148-89cf-2beb7a783959,
 Status: Active,
 Pipeline Id: bb38695e-ea60-456b-a83a-aa8eb706b47f,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week),
 Pipeline(Name: weekly-diabetes-training,
 Id: 0544c4e2-17df-4b14-bf81-e96d3cbedd56,
 Status: Active,
 Pipeline Id: 34098224-dd67-4f69-b2c8-ea78e49b4f25,
 Pipeline Endpoint Id: None,
 Recurrence Details: Runs at 0:00 on Monday every Week),
 Pipeline(Name: weekly-houses-training,
 Id:

In [24]:
pipeline_experiment = ws.experiments.get('mslearn-houses-pipeline')
latest_run = list(pipeline_experiment.get_runs())[0]

latest_run.get_details()

{'runId': '0406a475-f487-4b57-bb53-8b95447b3f06',
 'status': 'Completed',
 'startTimeUtc': '2022-03-08T22:24:46.015358Z',
 'endTimeUtc': '2022-03-08T22:24:48.176584Z',
 'services': {},
 'properties': {'azureml.runsource': 'azureml.PipelineRun',
  'runSource': 'Unavailable',
  'runType': 'Schedule',
  'azureml.parameters': '{}',
  'azureml.continue_on_step_failure': 'False',
  'azureml.pipelineComponent': 'pipelinerun',
  'azureml.pipelineid': 'bb38695e-ea60-456b-a83a-aa8eb706b47f'},
 'inputDatasets': [],
 'outputDatasets': [],
 'logFiles': {'logs/azureml/executionlogs.txt': 'https://machinelearnin6978024331.blob.core.windows.net/azureml/ExperimentRun/dcid.0406a475-f487-4b57-bb53-8b95447b3f06/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=t%2B8VMIFOvTNd3yTFFTNWyZzzlNjBu%2B%2BN9MsQu%2Fdfm%2Fk%3D&skoid=ca23d09e-9aca-4272-8800-371063aa510c&sktid=413600cf-bd4e-4c7c-8a61-69e73cddf731&skt=2022-03-08T21%3A02%3A14Z&ske=2022-03-10T05%3A12%3A14Z&sks=b&skv=2019-07-07&st=2022-03-08T23%3A44%3