# TODO
- [x] Fix MLTable Access
- [x] Add today offset to dataset. Split production in half
  - [x] We don't need test, rename current to production.
  - [x] Which Schedule frequency? -> Weekly?
  - [x] Add 7 day offset.
- [x] Create Model Outputs (Notebook 01)
- [ ] Document Notebooks

# Data Set Preparation

This notebook prepares the datasets needed to simulate production data and model monitoring without deploying the model to a Managed Online Endpoint.

In [None]:
import os
import shutil
import pandas as pd

# Load the raw records from the local CSV file.
dataset = pd.read_csv("./data/predictive_maintenance_update.csv")

# One-hot encode the two categorical columns. No prefix is passed, so the
# indicator columns are named after the category values themselves.
operators = pd.get_dummies(dataset["operator"])
assembly_line_nums = pd.get_dummies(dataset["assembly_line_num"])

# Attach each indicator block by row index, then drop the column it replaces.
dataset_dummies = (
    dataset
    .merge(operators, left_index=True, right_index=True)
    .drop(columns="operator")
)
dataset_dummies = (
    dataset_dummies
    .merge(assembly_line_nums, left_index=True, right_index=True)
    .drop(columns="assembly_line_num")
)
dataset = dataset_dummies

# Indicator columns expected after encoding: operator0..operator7 and
# assembly_0..assembly_6.
uint8_columns = (
    [f"operator{idx}" for idx in range(8)]
    + [f"assembly_{idx}" for idx in range(7)]
)

# Cast every indicator column to bool (a missing column raises KeyError).
for column in uint8_columns:
    dataset[column] = dataset[column].astype(bool)

dataset

In [None]:
# Shift the whole dataset in time so that the production period starts exactly
# two weeks before "now", then split it into training and production parts.
#
# NOTE: this cell mutates dataset["timestamp"] in place. Re-running it without
# first re-running the load cell adds the offset a second time.
dataset["timestamp"] = pd.to_datetime(dataset["timestamp"])

# Define timestamps for splitting.
# "01/06/2021" is parsed month-first by pandas' default (dayfirst=False),
# i.e. 6 January 2021 -- NOTE(review): confirm this is the intended date.
production_start = pd.to_datetime("01/06/2021")
two_weeks_ago = pd.to_datetime("today") - pd.Timedelta(days=14)
offset = two_weeks_ago - production_start

# Add the offset to all timestamps in the dataset
dataset["timestamp"] = dataset["timestamp"] + offset

# Move the split point along with the data: production now starts two weeks ago
production_start = two_weeks_ago

# Split the dataset into two disjoint parts. Training is strictly before the
# split point and production starts at it, so a row stamped exactly at
# production_start belongs to production only (previously it landed in both).
training = dataset[dataset["timestamp"] < production_start]
production = dataset[dataset["timestamp"] >= production_start]

# Sanity checking
print("reference min: ", training["timestamp"].min(), " reference max: ", training["timestamp"].max())
print("production min: ", production["timestamp"].min(), " production max: ", production["timestamp"].max())

In [None]:
# Start from a clean scratch area; a missing directory is not an error.
shutil.rmtree("data/tmp/", ignore_errors=True)

# exist_ok=False makes this fail loudly if the clean-up above left anything behind.
for scratch_dir in (
    "data/tmp",
    "data/tmp/training",
    "data/tmp/production",
    "data/tmp/dataset",
):
    os.makedirs(scratch_dir, exist_ok=False)

# Export of the full encoded dataset is currently disabled:
# dataset_dummies.to_parquet("data/tmp/dataset/dataset.parquet", index=False)

# Training data is written without its timestamp column; production keeps it.
training_features = training.drop(columns=["timestamp"])
training_features.to_parquet("data/tmp/training/training.parquet", index=False)
production.to_parquet("data/tmp/production/production.parquet", index=False)

## Create AzureML Datasets
### URI File / URI Folder Datasets (Training, Production)

In [None]:
import time

from azure.ai.ml import MLClient
from azure.ai.ml.entities import Data
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

In [None]:
# Connect to the workspace described by the local Azure ML config, using the
# default Azure credential chain.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# One UTC-timestamp version string shared by every data asset registered below.
VERSION = time.strftime("%Y.%m.%d.%H%M%S", time.gmtime())

In [None]:
# Training data: the single parquet file, registered as a uri_file asset.
training_dataset = Data(
    name="synthetic-urifile-training",
    version=VERSION,
    type=AssetTypes.URI_FILE,
    path="./data/tmp/training/training.parquet",
    description="synthetic Dataset (training) for demonstrating data drift; parquet file",
)
ml_client.data.create_or_update(training_dataset)

# Production data: the whole folder, registered as a uri_folder asset.
production_dataset = Data(
    name="synthetic-urifolder-production",
    version=VERSION,
    type=AssetTypes.URI_FOLDER,
    path="./data/tmp/production",
    description="synthetic Dataset (production) for demonstrating data drift; parquet file",
)
ml_client.data.create_or_update(production_dataset)

## MLTable Dataset (Training Data)

In [None]:
import mltable

# Fetch the training uri_file asset registered under this run's VERSION.
data_asset = ml_client.data.get(
    name="synthetic-urifile-training",
    version=VERSION,
)

# Point an MLTable at the registered parquet file via its asset path.
path = {"file": data_asset.path}
tbl = mltable.from_parquet_files(paths=[path])

# Materialise the table to check that it can be read, and preview it.
df = tbl.to_pandas_dataframe()
df.head()

In [None]:
# Save the MLTable definition locally, then register that folder as an MLTABLE
# data asset under the shared VERSION.

# Local folder for the MLTable definition. The spelling "traning" (sic) is kept
# unchanged; the folder sits under data/tmp/, which is deleted at the end of
# the notebook.
mltable_training_dir = "data/tmp/mltable-traning"
tbl.save(path=mltable_training_dir, overwrite=True)

# Use a dedicated name instead of rebinding `dataset`, which still refers to the
# pandas DataFrame used by the earlier cells; overwriting it made those cells
# fail when re-run in the same kernel.
mltable_training_dataset = Data(
    path=mltable_training_dir,
    type=AssetTypes.MLTABLE,
    description="synthetic Dataset (training, MLTABLE) for demonstrating data drift",
    name="synthetic-mltable-training",
    version=VERSION,
)

ml_client.data.create_or_update(mltable_training_dataset)

In [None]:
# Remove the local scratch directory now that the cells above have run;
# ignore_errors=True keeps this from raising if the directory is already gone.
shutil.rmtree("data/tmp/", ignore_errors=True)