# TODO
- [x] Fix MLTable Access
- [x] Add today offset to dataset. Split production in half
  - [x] We don't need test, rename current to production.
  - [x] Which Schedule frequency? -> Weekly?
  - [ ] Add 7 day offset.
- [ ] Create Model Outputs (Notebook 01)
- [ ] Document Notebooks

# Data Set Preparation

In [1]:
import os
import shutil
import pandas as pd

In [3]:
# Load the raw synthetic predictive-maintenance data.
dataset = pd.read_csv("./data/predictive_maintenance_update.csv")

# One-hot encode the two categorical columns. dtype=bool yields boolean
# indicator columns directly (older pandas releases default to uint8), so no
# separate cast over a hand-maintained column list is required.
operators = pd.get_dummies(dataset["operator"], dtype=bool)
assembly_line_nums = pd.get_dummies(dataset["assembly_line_num"], dtype=bool)

# Names of the generated indicator columns, derived from the data instead of
# being hard-coded, so added or missing categories can neither raise a
# KeyError nor be skipped silently. (Variable name kept from the earlier
# revision of this cell; the columns themselves are bool.)
uint8_columns = [*operators.columns, *assembly_line_nums.columns]

# Replace each categorical column by its indicator columns: original columns
# first, then the operator indicators, then the assembly-line indicators.
dataset_dummies = pd.concat(
    [
        dataset.drop(columns=["operator", "assembly_line_num"]),
        operators,
        assembly_line_nums,
    ],
    axis=1,
)

dataset_dummies

Unnamed: 0,timestamp,heat_deviation,speed_deviation,days_since_last_service,sensor_back,sensor_front,failure,operator0,operator1,operator2,...,operator5,operator6,operator7,assembly_0,assembly_1,assembly_2,assembly_3,assembly_4,assembly_5,assembly_6
0,2020-01-06 00:24:38,-1.402682,4.241747,99,-3.405993,3.450679,0,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,2020-01-06 01:39:16,-0.126422,1.930147,101,-3.413560,2.879816,0,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,2020-01-06 01:49:17,0.102070,4.555567,100,-3.728655,0.073900,0,False,False,True,...,False,False,False,False,True,False,False,False,False,False
3,2020-01-06 02:27:52,0.337126,0.903267,102,-3.562049,3.185188,0,False,False,True,...,False,False,False,False,False,True,False,False,False,False
4,2020-01-06 02:28:29,-2.618703,-0.498052,100,2.491643,-2.057373,0,False,False,True,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44995,2022-01-08 23:56:27,0.592558,-0.735717,100,0.679219,0.747613,0,False,False,True,...,False,False,False,False,True,False,False,False,False,False
44996,2022-01-08 23:56:53,-1.458949,-0.761666,100,1.850105,3.902713,0,False,False,True,...,False,False,False,False,False,False,True,False,False,False
44997,2022-01-08 23:57:56,-0.838191,-2.909899,98,0.860230,0.894338,0,False,False,True,...,False,False,False,True,False,False,False,False,False,False
44998,2022-01-08 23:58:15,-1.728545,2.942215,101,-0.425456,2.185823,0,False,False,False,...,False,False,False,False,False,False,True,False,False,False


In [4]:
# Parse the timestamp column so it supports datetime arithmetic and comparison.
dataset_dummies["timestamp"] = pd.to_datetime(dataset_dummies["timestamp"])

# Define timestamps for splitting.
# ISO 8601 literal for 6 January 2021: the same date the month-first literal
# "01/06/2021" parses to, but not open to a day/month misreading.
production_start = pd.to_datetime("2021-01-06")
seven_day_ago = pd.to_datetime("today") - pd.Timedelta(days=7)
offset = seven_day_ago - production_start

# Add the offset to all timestamps in the dataset.
# NOTE: the column is shifted in place, so running this cell a second time
# without re-running the preparation cell above shifts the data again.
dataset_dummies["timestamp"] = dataset_dummies["timestamp"] + offset

# Move the split boundary together with the data: it now lies seven days
# before the moment this cell is executed.
production_start = seven_day_ago

# Split the dataset. The boundary is half-open (training strictly before,
# production from the boundary onwards) so a row that falls exactly on
# production_start is assigned to one split only instead of both.
training = dataset_dummies[dataset_dummies["timestamp"] < production_start]
production = dataset_dummies[dataset_dummies["timestamp"] >= production_start]

# Sanity checking
print("reference min: ", training["timestamp"].min(), " reference max: ", training["timestamp"].max())
print("production min: ", production["timestamp"].min(), " production max: ", production["timestamp"].max())

reference min:  2023-02-07 14:51:31.705794  reference max:  2024-02-08 14:17:47.705794
production min:  2024-02-08 14:43:00.705794  production max:  2025-02-10 14:26:02.705794


In [5]:
# Start from an empty staging area: remove whatever a previous run left behind.
shutil.rmtree("data/tmp/", ignore_errors=True)

# Recreate the staging tree. exist_ok=False means a directory that survived
# the removal above raises here instead of being reused.
for staging_dir in (
    "data/tmp",
    "data/tmp/training",
    "data/tmp/production",
    "data/tmp/dataset",
):
    os.makedirs(staging_dir, exist_ok=False)

# Export of the full, unsplit dataset is currently disabled:
# dataset_dummies.to_parquet("data/tmp/dataset/dataset.parquet", index=False)

# Write each split to <split>/<split>.parquet without the DataFrame index.
for split_name, split_frame in (("training", training), ("production", production)):
    split_frame.to_parquet(f"data/tmp/{split_name}/{split_name}.parquet", index=False)

## Create AzureML Datasets
### URI File (Training) and URI Folder (Production) Datasets

In [6]:
import time

from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

In [9]:
# Workspace client built from the local configuration file, authenticating
# with DefaultAzureCredential.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# A single UTC timestamp is used as the version of every data asset
# registered by this notebook run.
utc_now = time.gmtime()
VERSION = time.strftime("%Y.%m.%d.%H%M%S", utc_now)

Found the config file in: .\config.json


In [10]:
# Describe both assets first, then register them in the same order as before
# (training, then production). Both share this run's VERSION.

# Training split: a single parquet file registered as a uri_file asset.
training_dataset = Data(
    name="synthetic-urifile-training",
    version=VERSION,
    type=AssetTypes.URI_FILE,
    path="./data/tmp/training/training.parquet",
    description="synthetic Dataset (training) for demonstrating data drift; parquet file",
)

# Production split: the whole staging folder registered as a uri_folder asset.
production_dataset = Data(
    name="synthetic-urifolder-production",
    version=VERSION,
    type=AssetTypes.URI_FOLDER,
    path="./data/tmp/production",
    description="synthetic Dataset (production) for demonstrating data drift; parquet file",
)

ml_client.data.create_or_update(training_dataset)

# Last expression of the cell: the registered production asset is displayed.
ml_client.data.create_or_update(production_dataset)

[32mUploading training.parquet[32m (< 1 MB): 100%|##########| 1.03M/1.03M [00:00<00:00, 3.71MB/s]
[39m

[32mUploading production (1.29 MBs): 100%|##########| 1287643/1287643 [00:00<00:00, 3891602.67it/s]
[39m



Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'synthetic-urifolder-production', 'description': 'synthetic Dataset (production) for demonstrating data drift; parquet file', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/ai-services-rg/providers/Microsoft.MachineLearningServices/workspaces/schaeffler-ops-it-aml/data/synthetic-urifolder-production/versions/2024.02.15.133553', 'Resource__source_path': None, 'base_path': 'c:\\code\\demo\\azure-mlops-end2end\\notebooks', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000024075125BD0>, 'serialize': <msrest.serialization.Serializer object at 0x0000024074E25050>, 'version': '2024.02.15.133553', 'latest_version': None, 'path': 'azureml://subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c

## MLTable Dataset (Training Data)

In [13]:
import mltable

# Fetch the training asset registered under this run's VERSION.
data_asset = ml_client.data.get(name="synthetic-urifile-training", version=VERSION)

# from_parquet_files takes a list of {'file': <path>} entries; here the single
# entry is the path of the registered training asset.
training_parquet_source = {'file': data_asset.path}
tbl = mltable.from_parquet_files(paths=[training_parquet_source])

# Materialise the table as a pandas DataFrame and preview the first rows.
df = tbl.to_pandas_dataframe()
df.head()

Unnamed: 0,timestamp,heat_deviation,speed_deviation,days_since_last_service,sensor_back,sensor_front,failure,operator0,operator1,operator2,...,operator5,operator6,operator7,assembly_0,assembly_1,assembly_2,assembly_3,assembly_4,assembly_5,assembly_6
0,2023-02-07 14:51:31.705794,-1.402682,4.241747,99,-3.405993,3.450679,0,False,False,False,...,False,False,False,False,False,False,True,False,False,False
1,2023-02-07 16:06:09.705794,-0.126422,1.930147,101,-3.41356,2.879816,0,False,False,False,...,False,False,False,False,False,True,False,False,False,False
2,2023-02-07 16:16:10.705794,0.10207,4.555567,100,-3.728655,0.0739,0,False,False,True,...,False,False,False,False,True,False,False,False,False,False
3,2023-02-07 16:54:45.705794,0.337126,0.903267,102,-3.562049,3.185188,0,False,False,True,...,False,False,False,False,False,True,False,False,False,False
4,2023-02-07 16:55:22.705794,-2.618703,-0.498052,100,2.491643,-2.057373,0,False,False,True,...,False,False,False,False,True,False,False,False,False,False


In [14]:
# Local folder the MLTable definition is saved to; the same folder is then
# registered as the asset, so the path is defined once.
mltable_dir = "data/tmp/mltable-traning"
tbl.save(path=mltable_dir, overwrite=True)

# Bound to its own name so the raw `dataset` DataFrame loaded in the
# data-preparation cell is not replaced by an object of a different type.
mltable_training_dataset = Data(
    path=mltable_dir,
    type=AssetTypes.MLTABLE,
    description="synthetic Dataset (training, MLTABLE) for demonstrating data drift",
    name="synthetic-mltable-training",
    version=VERSION,
)

# Last expression of the cell: the registered MLTable asset is displayed.
ml_client.data.create_or_update(mltable_training_dataset)

[32mUploading mltable-traning (0.0 MBs): 100%|##########| 344/344 [00:00<00:00, 10114.55it/s]
[39m



Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['azureml://subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourcegroups/ai-services-rg/workspaces/schaeffler-ops-it-aml/datastores/workspaceblobstore/paths/LocalUpload/3cc268c85ae547b5964218eb850e897f/training.parquet'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'synthetic-mltable-training', 'description': 'synthetic Dataset (training, MLTABLE) for demonstrating data drift', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/ai-services-rg/providers/Microsoft.MachineLearningServices/workspaces/schaeffler-ops-it-aml/data/synthetic-mltable-training/versions/2024.02.15.133553', 'Resource__source_path': None, 'base_path': 'c:\\code\\demo\\azure-mlops-end2end\\notebooks', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x0000024075DF40D

In [15]:
# Remove the local staging directory now that the data assets have been
# registered; ignore_errors=True makes this a no-op if it is already gone.
shutil.rmtree("data/tmp/", ignore_errors=True)