Check that the azureml-datadrift package is installed

In [None]:
# Ask pip for the installed azureml-datadrift package details (IPython shell escape)
!pip show azureml-datadrift

In [None]:
from azureml.core import Workspace

# Connect to the Azure ML workspace described by the saved config file
ws = Workspace.from_config()
print(f'Ready to work with {ws.name}')

Create Baseline dataset

Upload file to default datastore, then make a new dataset from there.

In [None]:
from azureml.core import Datastore, Dataset


# Push the local baseline CSV to the workspace's default datastore.
# Supply your own sample .csv at ./data/sample.csv before running this cell.
default_ds = ws.get_default_datastore()
default_ds.upload_files(
    files=['./data/sample.csv'],
    target_path='data-baseline',
    overwrite=True,
    show_progress=True,
)

# Build a tabular dataset over every CSV under data-baseline/ and register it
print('Registering baseline dataset...')
baseline_csv_glob = (default_ds, 'data-baseline/*.csv')
baseline_data_set = Dataset.Tabular.from_delimited_files(path=baseline_csv_glob)
baseline_data_set = baseline_data_set.register(
    workspace=ws,
    name='data baseline',
    description='baseline data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('Baseline dataset registered!')

Create a target dataset with simulated drift

In [None]:
import datetime as dt
import pandas as pd

print('Generating simulated data...')

# Start from the same sample file that was uploaded as the baseline
data = pd.read_csv('data/sample.csv')

# One file per week, oldest first: 5 weeks ago, 4 weeks ago, ..., this week
weeknos = reversed(range(6))

file_paths = []
for weekno in weeknos:

    # The date this week's file represents
    data_date = dt.date.today() - dt.timedelta(weeks=weekno)

    # Modify the data to create some drift.
    # `data` is changed in place, so the shift accumulates: each later week
    # drifts further from the baseline than the week before it.
    # These are fake features. Update them to match the features of your sample data.
    data['Feature 1'] = data['Feature 1'] + 2
    data['Feature 2'] = round(data['Feature 2'] * 1.2).astype(int)

    # Encode the date in the filename so it can be recovered later through
    # the dataset's partition format.
    file_path = 'data/sample_{}.csv'.format(data_date.strftime("%Y-%m-%d"))
    # index=False: do not write the DataFrame's row index, otherwise every
    # target file gains an extra unnamed column that sample.csv does not have.
    data.to_csv(file_path, index=False)
    file_paths.append(file_path)

# file_paths now holds one CSV per week for the past 6 weeks.
# Upload them all to the datastore.
path_on_datastore = 'data-target'
default_ds.upload_files(files=file_paths,
                       target_path=path_on_datastore,
                       overwrite=True,
                       show_progress=True)

# Use the partition format to derive a 'date' column from each file name
partition_format = path_on_datastore + '/sample_{date:yyyy-MM-dd}.csv'
target_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, path_on_datastore + '/*.csv'),
                                                       partition_format=partition_format)

# Mark 'date' as the timestamp column and register the target dataset
print('Registering target dataset...')
target_data_set = target_data_set.with_timestamp_columns('date').register(workspace=ws,
                                                                          name='data target',
                                                                          description='target data',
                                                                          tags={'format': 'CSV'},
                                                                          create_new_version=True)

print('Target dataset registered!')

Create + Run Dataset Monitor

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "drift2"

try:
    # Look up a compute target with this name in the workspace
    training_cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # Lookup failed: provision a new cluster under that name instead
    try:
        compute_config = AmlCompute.provisioning_configuration(
            vm_size='STANDARD_DS11_V2',
            max_nodes=2,
        )
        training_cluster = ComputeTarget.create(ws, cluster_name, compute_config)
        training_cluster.wait_for_completion(show_output=True)
    except Exception as ex:
        # Best effort: report the provisioning problem and carry on
        print(ex)
else:
    print('Found existing cluster, use it.')

In [None]:
from azureml.datadrift import DataDriftDetector

# Monitor settings. No explicit feature list is passed (feature_list=None);
# to restrict monitoring to specific columns, list them there instead.
monitor_settings = {
    'compute_target': cluster_name,
    'frequency': 'Week',
    'feature_list': None,
    'drift_threshold': .3,
    'latency': 24,
}

# Create the data drift detector comparing the target dataset to the baseline
monitor = DataDriftDetector.create_from_datasets(
    ws, 'data-drift-2', baseline_data_set, target_data_set, **monitor_settings
)
monitor

In [None]:
from azureml.widgets import RunDetails

# Backfill window: from 6 weeks ago up to now
backfill_start = dt.datetime.now() - dt.timedelta(weeks=6)
backfill_end = dt.datetime.now()
backfill = monitor.backfill(backfill_start, backfill_end)

# Display the run widget, then wait for the backfill run to complete
RunDetails(backfill).show()
backfill.wait_for_completion()

In [None]:
# Fetch the metrics recorded by the backfill run and print each name/value pair
drift_metrics = backfill.get_metrics()
for metric_name, metric_value in drift_metrics.items():
    print(metric_name, metric_value)