In [1]:
!pip install azureml-sdk[notebooks]



In [2]:
from azureml.core import *
import azureml.dataprep as dprep


# Create a Pipeline¶
You can perform the various steps required to ingest data, train a model, and register the model individually by using the Azure ML SDK to run script-based experiments. However, in an enterprise environment it is common to encapsulate the sequence of discrete steps required to build a machine learning solution into a pipeline that can be run on one or more compute targets; either on-demand by a user, from an automated build process, or on a schedule.

In this notebook, you'll bring together all of these elements to create a simple pipeline that pre-processes data and then trains and registers a model.

# Connect to your workspace
To get started, connect to your workspace.

Note: If you haven't already established an authenticated session with your Azure subscription, you'll be prompted to authenticate by clicking a link, entering an authentication code, and signing into Azure.

In [6]:
ws = Workspace.get("myworkspace", subscription_id='ff711122-6294-4fad-9d1f-bf505a51fc42',
               resource_group='mlproject',
               location='westus2')
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.32.0 to work with myworkspace


# Prepare the Data
In your pipeline, you'll use a dataset containing details of diabetes patients. Run the cell below to create this dataset (if you created it previously, the code will find the existing version)

In [10]:
from azureml.core import Dataset
default_ds = ws.get_default_datastore()
print(default_ds)

{
  "name": "workspaceblobstore",
  "container_name": "azureml-blobstore-4251d2dc-9aa6-4571-a78d-2baff407acbf",
  "account_name": "myworkspstorage32cc8b3bb",
  "protocol": "https",
  "endpoint": "core.windows.net"
}


In [26]:
if 'diabetes dataset' not in ws.datasets:
    default_ds.upload_files(files=['C:/Users/User/Desktop/Data/diabetes2.csv'], # Upload the diabetes csv files in /data
                        target_path='diabetes-data/', # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws, 
                                name='diabetes dataset',
                                description='diabetes data',
                                tags = {'format':'CSV'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        print(ex)
else:
    print('Dataset already registered.')

Uploading an estimated of 1 files
Uploading C:/Users/User/Desktop/Data/diabetes2.csv
Uploaded C:/Users/User/Desktop/Data/diabetes2.csv, 1 files out of an estimated total of 1
Uploaded 1 files
Dataset registered.


# Convert Data Upload Into A Function

In [28]:
def dataupload(filename):
    if 'diabetes dataset' not in ws.datasets:
        default_ds.upload_files(files=[filename], # Upload the diabetes csv files in /data
                            target_path='diabetes-data/', # Put it in a folder path in the datastore
                            overwrite=True, # Replace existing files of the same name
                            show_progress=True)

    #Create a tabular dataset from the path on the datastore (this may take a short while)
        tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'diabetes-data/*.csv'))

    # Register the tabular dataset
        try:
            tab_data_set = tab_data_set.register(workspace=ws, 
                                    name='diabetes dataset',
                                    description='diabetes data',
                                    tags = {'format':'CSV'},
                                    create_new_version=True)
            print('Dataset registered.')
        except Exception as ex:
            print(ex)
    else:
        print('Dataset already registered.')
    

In [33]:
dataupload('C:/Users/User/Desktop/Data/diabetes2.csv')

Dataset already registered.


In [34]:
dataupload('C:/Users/User/Desktop/Data/diabetes.csv')

Dataset already registered.


# Copy from Consume in AML after Registering
* Remember when you open the workspace blobstore in the Datastore you can browse all file uploaded

In [31]:

# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = 'ff711122-6294-4fad-9d1f-bf505a51fc42'
resource_group = 'mlproject'
workspace_name = 'myworkspace'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='diabetes dataset')
# dataset.to_pandas_dataframe()

In [32]:
print(dataset)

TabularDataset
{
  "source": [
    "('workspaceblobstore', 'diabetes-data/*.csv')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ParseDelimited",
    "DropColumns",
    "SetColumnTypes"
  ],
  "registration": {
    "id": "6d9e2ba1-d821-48d4-93b3-59d6b0216817",
    "name": "diabetes dataset",
    "version": 1,
    "description": "diabetes data",
    "tags": {
      "format": "CSV"
    },
    "workspace": "Workspace.create(name='myworkspace', subscription_id='ff711122-6294-4fad-9d1f-bf505a51fc42', resource_group='mlproject')"
  }
}


# Create scripts for pipeline steps
Pipelines consist of one or more steps, which can be Python scripts, or specialized steps like a data transfer step that copies data from one location to another. Each step can run in its own compute context. In this exercise, you'll build a simple pipeline that contains two Python script steps: one to pre-process some training data, and another to use the pre-processed data to train and register a model.

First, let's create a folder for the script files we'll use in the pipeline steps.

In [36]:
import os
# Create a folder for the pipeline step files
experiment_folder = 'diabetes_pipeline'
os.makedirs(experiment_folder, exist_ok=True)

print(experiment_folder)

diabetes_pipeline


# Data Preprocessing
Now let's create the first script, which will read data from the diabetes dataset and apply some simple pre-processing to remove any rows with missing data and normalize the numeric features so they're on a similar scale.

The script includes a argument named --prepped-data, which references the folder where the resulting data should be saved.

## 1. Import libraries

In [37]:
%%writefile $experiment_folder/prep_diabetes.py # this writes the following commands into this file

import os
import argparse
import pandas as pd
from azureml.core import Run
from sklearn.preprocessing import MinMaxScaler

Writing diabetes_pipeline/prep_diabetes.py


In [41]:
print(f' write {experiment_folder}/prep_diabetes.py')

 write diabetes_pipeline/prep_diabetes.py


In [47]:
!pip install azureml-core azureml-pipeline



## 2. Get Parameters

In [48]:
import argparse
from argparse import ArgumentParser
from azureml.core import Run
parser=argparse.ArgumentParser()
parser.add_argument("--input-data", type=str, dest='raw_dataset_id', help='raw dataset')
parser.add_argument('--prepped-data', type=str, dest='prepped_data', default='prepped_data', help='Folder for results')
args=parser.parse_args()
save_folder=args.prepped_data

usage: ipykernel_launcher.py [-h] [--input-data RAW_DATASET_ID] [--prepped-data PREPPED_DATA]
ipykernel_launcher.py: error: unrecognized arguments: -f C:\Users\User\AppData\Roaming\jupyter\runtime\kernel-f3b114c2-7e70-4178-94e1-e02c3b3dfe16.json


SystemExit: 2

In [49]:
# Get the experiment run context
run = Run.get_context()

In [50]:
# load the data (passed as an input dataset)
print("Loading Data...")
diabetes = run.input_datasets['raw_data'].to_pandas_dataframe()

Loading Data...


AttributeError: '_OfflineRun' object has no attribute 'input_datasets'