Set the Azure ML Workspace

In [21]:
from azureml.core import Workspace
# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to work with', ws.name)

Ready to work with azuremlws


Create the baseline dataset

In [12]:
from azureml.core import Datastore, Dataset

# Upload the baseline data
default_ds = ws.get_default_datastore()
default_ds.upload_files(files=['./data/heart_failure.csv', './data/heart_failure2.csv'],
                       target_path='heart-baseline',
                       overwrite=True, 
                       show_progress=True)

# Create and register the baseline dataset
print('Registering baseline dataset...')
baseline_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, 'heart-baseline/*.csv'))
baseline_data_set = baseline_data_set.register(workspace=ws, 
                           name='heart-baseline',
                           description='heart baseline data',
                           tags = {'format':'CSV'},
                           create_new_version=True)

print('Baseline dataset registered!')

Uploading an estimated of 2 files
Uploading ./data/heart_failure.csv
Uploaded ./data/heart_failure.csv, 1 files out of an estimated total of 2
Uploading ./data/heart_failure2.csv
Uploaded ./data/heart_failure2.csv, 2 files out of an estimated total of 2
Uploaded 2 files
Registering baseline dataset...
Baseline dataset registered!


Generate simulated data and regsiter the target dataset

In [13]:
import datetime as dt
import pandas as pd


# Load the smaller of the two data files
data = pd.read_csv('data/heart_failure2.csv')

# We'll generate data for the past 3 weeks
weeknos = reversed(range(3))

file_paths = []
for weekno in weeknos:
    
    # Get the date X weeks ago
    data_date = dt.date.today() - dt.timedelta(weeks=weekno)
    
    # Modify data to ceate some drift
    data['anaemia'] = data['anaemia'] + 1
    data['age'] = round(data['age'] * 1.2).astype(int)
    
    
    # Save the file with the date encoded in the filename
    file_path = 'data/heart_{}.csv'.format(data_date.strftime("%Y-%m-%d"))
    data.to_csv(file_path)
    file_paths.append(file_path)

# Upload the files
path_on_datastore = 'heart-target'
default_ds.upload_files(files=file_paths,
                       target_path=path_on_datastore,
                       overwrite=True,
                       show_progress=True)

# Use the folder partition format to define a dataset with a 'date' timestamp column
partition_format = path_on_datastore + '/heart_{date:yyyy-MM-dd}.csv'
target_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, path_on_datastore + '/*.csv'),
                                                       partition_format=partition_format)

# Register the target dataset
print('Registering target dataset...')
target_data_set = target_data_set.with_timestamp_columns('date').register(workspace=ws,
                                                                          name='heart target',
                                                                          description='heart target data',
                                                                          tags = {'format':'CSV'},
                                                                          create_new_version=True)

print('Target dataset registered!')

Generating simulated data...
Uploading an estimated of 3 files
Uploading data/heart_2021-07-06.csv
Uploaded data/heart_2021-07-06.csv, 1 files out of an estimated total of 3
Uploading data/heart_2021-07-13.csv
Uploaded data/heart_2021-07-13.csv, 2 files out of an estimated total of 3
Uploading data/heart_2021-07-20.csv
Uploaded data/heart_2021-07-20.csv, 3 files out of an estimated total of 3
Uploaded 3 files
Registering target dataset...
Target dataset registered!


Create the AML Compute

In [17]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "AMLAZURECOMPUTE"

try:
    # Check for existing compute target
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2)
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        print(ex)

Creating......
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


Verify if azureml-datadrift pacakcge is installed in the environment

In [15]:
!pip show azureml-datadrift

Name: azureml-datadrift
Version: 1.31.0
Summary: Azure Machine Learning datadrift
Home-page: https://docs.microsoft.com/python/api/overview/azure/ml/?view=azure-ml-py
Author: Microsoft Corp
Author-email: None
License: https://aka.ms/azureml-sdk-license
Location: /anaconda/envs/azureml_py36/lib/python3.6/site-packages
Requires: pandas, msrest, azureml-telemetry, jsonpickle, matplotlib, numpy, pyspark, azureml-dataset-runtime, azureml-pipeline-core, scikit-learn, azureml-core, scipy, lightgbm
Required-by: 


Create the DataDrift Detector based on the listed features

In [18]:
from azureml.datadrift import DataDriftDetector

# set up feature list
features = ['anaemia', 'age']

# set up data drift detector
monitor = DataDriftDetector.create_from_datasets(ws, 'mslearn-heart-drift', baseline_data_set, target_data_set,
                                                      compute_target=cluster_name, 
                                                      frequency='Week', 
                                                      feature_list=features, 
                                                      drift_threshold=.3, 
                                                      latency=24)
monitor

{'_workspace': Workspace.create(name='azuremlws', subscription_id='1cad424f-916b-47e1-bb04-5e8889f177d8', resource_group='azuremldemo'), '_frequency': 'Week', '_schedule_start': None, '_schedule_id': None, '_interval': 1, '_state': 'Disabled', '_alert_config': None, '_type': 'DatasetBased', '_id': '0dd368e9-c37a-4e62-8072-877118fdbe51', '_model_name': None, '_model_version': 0, '_services': None, '_compute_target_name': 'AMLAZURECOMPUTE', '_drift_threshold': 0.3, '_baseline_dataset_id': '4c61173c-2005-4664-a0d6-a77c37334860', '_target_dataset_id': '8033bd3e-1921-4e69-9b82-6d708e283099', '_feature_list': ['anaemia', 'age'], '_latency': 24, '_name': 'mslearn-heart-drift', '_latest_run_time': None, '_client': <azureml.datadrift._restclient.datadrift_client.DataDriftClient object at 0x7f6a6699bdd8>, '_logger': <_TelemetryLoggerContextAdapter azureml.datadrift._logging._telemetry_logger.azureml.datadrift.datadriftdetector (DEBUG)>}

Use the backfill option to monitor datadrift back in time

In [19]:
from azureml.widgets import RunDetails
backfill = monitor.backfill(dt.datetime.now() - dt.timedelta(weeks=6), dt.datetime.now())
RunDetails(backfill).show()
backfill.wait_for_completion()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

{'runId': 'mslearn-heart-drift-Monitor-Runs_1626796471886',
 'target': 'AMLAZURECOMPUTE',
 'status': 'Finalizing',
 'startTimeUtc': '2021-07-20T16:07:24.618575Z',
   'message': 'target dataset id:8033bd3e-1921-4e69-9b82-6d708e283099 do not contain sufficient amount of data after timestamp filteringMinimum needed: 50 rows.Skipping calculation for time slice 2021-06-06 00:00:00 to 2021-06-13 00:00:00.'},
  {'source': 'datadrift',
   'message': 'target dataset id:8033bd3e-1921-4e69-9b82-6d708e283099 do not contain sufficient amount of data after timestamp filteringMinimum needed: 50 rows.Skipping calculation for time slice 2021-06-13 00:00:00 to 2021-06-20 00:00:00.'},
  {'source': 'datadrift',
   'message': 'target dataset id:8033bd3e-1921-4e69-9b82-6d708e283099 do not contain sufficient amount of data after timestamp filteringMinimum needed: 50 rows.Skipping calculation for time slice 2021-06-20 00:00:00 to 2021-06-27 00:00:00.'},
  {'source': 'datadrift',
   'message': 'target dataset 

Print the data drift metrics

In [20]:
drift_metrics = backfill.get_metrics()
for metric in drift_metrics:
    print(metric, drift_metrics[metric])

start_date 2021-06-06
end_date 2021-07-25
frequency Week
Datadrift percentage {'days_from_start': [28, 35, 42], 'drift_percentage': [75.51430421160069, 100.0, 100.0]}
