# Monitor Data Drift

In [None]:
pip show azureml-datadrift

In [None]:
from azureml.core import Workspace

# Connect to the Azure ML workspace via Workspace.from_config() and confirm
# which workspace was loaded by echoing its name.
ws = Workspace.from_config()
workspace_name = ws.name
print('Ready to work with', workspace_name)

In [None]:
from azureml.core import Datastore, Dataset
from azureml.data.datapath import DataPath

# Push the local 'data' folder to the 'titanic-baseline/' folder on the
# workspace's default datastore
default_ds = ws.get_default_datastore()
baseline_target = DataPath(default_ds, 'titanic-baseline/')
Dataset.File.upload_directory(src_dir='data', target=baseline_target)

# Build a tabular dataset over the uploaded CSV files, then register it
# (as a new version) under the name 'titanic baseline'
print('Registering baseline dataset...')
baseline_csv_path = (default_ds, 'titanic-baseline/*.csv')
baseline_tabular = Dataset.Tabular.from_delimited_files(path=baseline_csv_path)
baseline_data_set = baseline_tabular.register(
    workspace=ws,
    name='titanic baseline',
    description='titanic baseline data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('Baseline dataset registered!')

In [None]:
import datetime as dt
import pandas as pd

print('Generating simulated data...')

data = pd.read_csv('data/titanic.csv')

# We'll generate one file per week for the past 6 weeks, oldest week first.
# Capture today's date once so every file is dated relative to the same day,
# even if the loop happens to run across midnight.
run_date = dt.date.today()
weeknos = reversed(range(6))

file_paths = []
for weekno in weeknos:

    # Get the date X weeks ago
    data_date = run_date - dt.timedelta(weeks=weekno)

    # Modify data to create some drift. `data` is updated in place, so the
    # drift accumulates: each later (more recent) file is shifted further
    # than the one before it.
    data['Age'] = data['Age'] + 20
    data['Fare'] = data['Fare'] * 1.5

    # Save the file with the date encoded in the filename.
    # index=False keeps the pandas row index out of the CSV; otherwise every
    # generated file would gain an extra unnamed column that is not present
    # in the source 'data/titanic.csv'.
    file_path = 'data/titanic_{}.csv'.format(data_date.strftime("%Y-%m-%d"))
    data.to_csv(file_path, index=False)
    file_paths.append(file_path)

# Upload the generated weekly files to their own folder on the datastore,
# replacing any files already there
path_on_datastore = 'titanic-target'
default_ds.upload_files(
    files=file_paths,
    target_path=path_on_datastore,
    overwrite=True,
    show_progress=True,
)

# Use the folder partition format to define a dataset with a 'date' column
# taken from the date encoded in each file name
partition_format = path_on_datastore + '/titanic_{date:yyyy-MM-dd}.csv'
target_csv_path = (default_ds, path_on_datastore + '/*.csv')
target_tabular = Dataset.Tabular.from_delimited_files(
    path=target_csv_path,
    partition_format=partition_format,
)

# Mark 'date' as the timestamp column, then register the result
# (as a new version) under the name 'titanic target'
print('Registering target dataset...')
target_timeseries = target_tabular.with_timestamp_columns('date')
target_data_set = target_timeseries.register(
    workspace=ws,
    name='titanic target',
    description='titanic target data',
    tags={'format': 'CSV'},
    create_new_version=True,
)

print('Target dataset registered!')

# Exercise: 

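1. Create a DataDriftDetector that compares the 'titanic target' dataset against the 'titanic baseline' dataset
2. Backfill the DataDriftDetector over the six weeks of simulated target data
3. Get the metrics to analyze the drift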