# Train a model

## Connect to the workspace

Connect to the Azure Machine Learning workspace and point MLflow tracking at it.

In [7]:
# Handle to the workspace
from sklearn.ensemble import RandomForestClassifier
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
import mlflow
import mltable

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import os
from mlflow.models import infer_signature

# Build the workspace client from the local config file, authenticating with
# DefaultAzureCredential.
credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=credential)

# Look up this workspace's MLflow tracking URI and route MLflow to it.
workspace = ml_client.workspaces.get(ml_client.workspace_name)
azureml_mlflow_uri = workspace.mlflow_tracking_uri
mlflow.set_tracking_uri(azureml_mlflow_uri)

# Runs started later in this notebook are recorded under this experiment.
experiment_name = "Synteticdata-Monitoring-Models-Experiment"
mlflow.set_experiment(experiment_name)

Found the config file in: .\config.json


UnsupportedModelRegistryStoreURIException:  Model registry functionality is unavailable; got unsupported URI 'azureml://westeurope.api.azureml.ms/mlflow/v1.0/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/ai-services-rg/providers/Microsoft.MachineLearningServices/workspaces/schaeffler-ops-it-aml' for model registry data storage. Supported URI schemes are: ['', 'file', 'databricks', 'databricks-uc', 'http', 'https', 'postgresql', 'mysql', 'sqlite', 'mssql']. See https://www.mlflow.org/docs/latest/tracking.html#storage for how to run an MLflow server against one of the supported backend storage locations.

## Model Training

### Prepare Data

In [2]:
# Resolve the newest version of the registered training data asset.
data_asset = ml_client.data.get("synthetic-mltable-training", label="latest")

# Load the MLTable behind the asset and materialise it as a pandas DataFrame.
tbl = mltable.load(data_asset.path)
df = tbl.to_pandas_dataframe()

# Breaking up data into input/target features: "failure" is the target and
# "timestamp" is dropped so it is not used as a feature.
X, y = df.drop(columns=["failure", "timestamp"]), df["failure"]

# Breaking data into training and testing (30% held out; fixed seed keeps the
# split reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

X_train.head(5)

Unnamed: 0,heat_deviation,speed_deviation,days_since_last_service,sensor_back,sensor_front,operator0,operator1,operator2,operator3,operator4,operator5,operator6,operator7,assembly_0,assembly_1,assembly_2,assembly_3,assembly_4,assembly_5,assembly_6
8226,-1.346026,-2.482588,100,2.457135,-0.315574,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False
9398,-0.782131,-0.709747,101,0.053087,0.224553,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
11646,-0.951416,-0.767887,102,-0.297,0.600808,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False
12193,0.405007,2.240321,99,-1.026176,-0.796458,False,False,False,False,True,False,False,False,True,False,False,False,False,False,False
18601,-0.338065,0.182039,98,1.426864,-0.954988,False,False,True,False,False,False,False,False,True,False,False,False,False,False,False


### Train the model

In [3]:
model_path = "./models/monitoring"

# Start from an empty model directory so files from a previous export cannot linger.
import shutil
if os.path.exists(model_path):
    shutil.rmtree(model_path)

os.makedirs(model_path, exist_ok=False)

# Best-effort close of a run left active by an earlier execution of this notebook.
# Catch Exception (not a bare except) so KeyboardInterrupt still propagates, and
# report the failure instead of hiding it.
try:
    mlflow.end_run()
except Exception as exc:
    print(f"Could not end the previous MLflow run: {exc}")

# Start Logging
mlflow.start_run()

# Enable autologging (optional)
# mlflow.sklearn.autolog()

# Training a model; the fixed seed makes the forest (and the metrics below)
# reproducible across re-runs, matching the seeded train/test split.
model = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Calculating performance and logging them
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)  # fraction of test rows predicted correctly
mlflow.log_metric('Accuracy', float(acc))

# AUC is computed from the second probability column returned by predict_proba.
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test,y_scores[:,1])
mlflow.log_metric('AUC', float(auc))

### Register the model

In [4]:
# The MLflow run was started in the training cell. Make sure it is always
# closed here, and marked FAILED if logging/registration raises, so a broken
# attempt does not leave an active run behind.
run_status = "FAILED"
try:
    # Derive the model's input/output schema from the test features and predictions.
    signature = infer_signature(X_test, y_hat)

    # Registering the model to the workspace
    print("Registering the model via MLFlow")
    mlflow.sklearn.log_model(
        sk_model=model,
        registered_model_name="monitoring-synthetic-pred-maintenance",
        artifact_path="model",
        signature=signature,
    )

    # Saving the model to a file
    mlflow.sklearn.save_model(
        sk_model=model,
        path=model_path,
        signature=signature
    )

    run_status = "FINISHED"
finally:
    # Stop logging
    mlflow.end_run(status=run_status)


  inputs = _infer_schema(model_input)


Registering the model via MLFlow


Registered model 'monitoring-synthetic-pred-maintenance' already exists. Creating a new version of this model...
2024/02/15 12:38:52 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: monitoring-synthetic-pred-maintenance, version 3
Created version '3' of model 'monitoring-synthetic-pred-maintenance'.


## Run Inference

- [ ] TBD

In [8]:
import pandas as pd

production_dataset_name = "synthetic-urifolder-production"

# Fetch the most recent version of the production data asset.
data_asset = ml_client.data.get(name=production_dataset_name, label="latest")

# Read the parquet data, then separate the label from the model inputs.
df = pd.read_parquet(data_asset.path)
y_test = df["failure"]
X_test = df.drop(["failure", "timestamp"], axis="columns")
X_test.head()

Unnamed: 0,heat_deviation,speed_deviation,days_since_last_service,sensor_back,sensor_front,operator0,operator1,operator2,operator3,operator4,operator5,operator6,operator7,assembly_0,assembly_1,assembly_2,assembly_3,assembly_4,assembly_5,assembly_6
0,-0.405605,-2.360076,103,-0.84186,-3.522181,False,False,False,True,False,False,False,False,True,False,False,False,False,False,False
1,-0.175536,4.221148,101,-4.160534,0.808003,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False
2,0.032615,-1.216637,101,-0.55929,-1.982913,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False
3,-0.818966,-0.442152,100,-0.585094,-0.150093,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False
4,0.217286,-2.025518,102,-0.987976,-3.672958,False,False,False,True,False,False,False,False,False,True,False,False,False,False,False


Predict failures for the production dataset using the model trained above.

In [9]:
# Score the production features with the `model` object fitted in the training
# cell; the bare expression on the last line displays the resulting array.
predictions = model.predict(X_test)
predictions

array([0, 0, 0, ..., 0, 0, 0])

In [11]:
# Keep the original label as "true_failure" and replace "failure" with the
# model's predictions.
df = df.assign(true_failure=y_test, failure=predictions)

In [36]:
production_outputs_folder = "data/productionoutputs/"

# Recreate the folder from scratch so it contains only the file written below.
shutil.rmtree(production_outputs_folder, ignore_errors=True)
os.makedirs(production_outputs_folder, exist_ok=False)

# Use os.path.join: the folder name already ends in "/", so the previous
# f-string concatenation produced "data/productionoutputs//outputs.parquet".
df.to_parquet(os.path.join(production_outputs_folder, "outputs.parquet"))

In [37]:
import time

# Version label built from the current UTC time, formatted as YYYY.MM.DD.HHMMSS.
utc_now = time.gmtime()
VERSION = time.strftime("%Y.%m.%d.%H%M%S", utc_now)

In [38]:
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Describe the local outputs folder as a versioned uri_folder data asset.
outputs_data_asset = Data(
    name="synthetic-urifolder-production-outputs",
    version=VERSION,
    description="synthetic Dataset (production-outputs) for demonstrating data drift; parquet file",
    type=AssetTypes.URI_FOLDER,
    path=production_outputs_folder,
)

# Create (or update) the asset in the workspace; the returned Data object is
# displayed as the cell output.
ml_client.data.create_or_update(outputs_data_asset)

[32mUploading productionoutputs (1.26 MBs): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1259765/1259765 [00:00<00:00, 28643342.43i

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': None, 'type': 'uri_folder', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'synthetic-urifolder-production-outputs', 'description': 'synthetic Dataset (production-outputs) for demonstrating data drift; parquet file', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/ai-services-rg/providers/Microsoft.MachineLearningServices/workspaces/schaeffler-ops-it-aml/data/synthetic-urifolder-production-outputs/versions/2024.02.15.130209', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/hehein2/code/Users/hehein/datadrift', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fa8148bdf30>, 'serialize': <msrest.serialization.Serializer object at 0x7fa8148bc310>, 'version': '2024.02.15.130209', 'latest_version': None, 'path': 'azur

In [42]:
import mltable

# Re-fetch the asset registered under this run's VERSION to obtain its path.
outputs_data_asset = ml_client.data.get(
    name="synthetic-urifolder-production-outputs",
    version=VERSION,
)

path = {"folder": outputs_data_asset.path}
print(f"Data asset path: {path}")

# Read the parquet files under that folder through MLTable into pandas.
tbl = mltable.from_parquet_files(paths=[path])
df = tbl.to_pandas_dataframe()
df.head(5)

Data asset path: {'folder': 'azureml://subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourcegroups/ai-services-rg/workspaces/schaeffler-ops-it-aml/datastores/workspaceblobstore/paths/LocalUpload/0940d573ee69bc5608cd781ea2ef3b86/productionoutputs/'}


Unnamed: 0,timestamp,heat_deviation,speed_deviation,days_since_last_service,sensor_back,sensor_front,failure,operator0,operator1,operator2,...,operator6,operator7,assembly_0,assembly_1,assembly_2,assembly_3,assembly_4,assembly_5,assembly_6,true_failure
0,2024-02-15 12:53:25.181869,-0.405605,-2.360076,103,-0.84186,-3.522181,0,False,False,False,...,False,False,True,False,False,False,False,False,False,0
1,2024-02-15 12:55:10.181869,-0.175536,4.221148,101,-4.160534,0.808003,0,False,False,False,...,False,False,False,False,True,False,False,False,False,0
2,2024-02-15 13:28:03.181869,0.032615,-1.216637,101,-0.55929,-1.982913,0,False,False,True,...,False,False,False,True,False,False,False,False,False,0
3,2024-02-15 13:54:49.181869,-0.818966,-0.442152,100,-0.585094,-0.150093,0,False,False,False,...,True,False,True,False,False,False,False,False,False,1
4,2024-02-15 14:35:54.181869,0.217286,-2.025518,102,-0.987976,-3.672958,0,False,False,False,...,False,False,False,True,False,False,False,False,False,0


In [43]:
# NOTE(review): this cell is fully commented out, yet the saved output beneath it
# shows an earlier execution that uploaded and registered
# "synthetic-mltable-production-outputs". Either re-enable the code or clear the
# stale output — TODO confirm intent.
# tbl.save(path="data/mltable-production-outputs", overwrite=True)

# dataset = Data(
#     path="data/mltable-production-outputs",
#     type=AssetTypes.MLTABLE,
#     description=f"synthetic Dataset (production with outputs, MLTABLE) for demonstrating data drift",
#     name="synthetic-mltable-production-outputs",
#     version=VERSION,
# )

# ml_client.data.create_or_update(dataset)

[32mUploading mltable-production-outputs (0.0 MBs): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 341/341 [00:00<00:00, 43548.33i

Data({'skip_validation': False, 'mltable_schema_url': None, 'referenced_uris': ['azureml://subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourcegroups/ai-services-rg/workspaces/schaeffler-ops-it-aml/datastores/workspaceblobstore/paths/LocalUpload/0940d573ee69bc5608cd781ea2ef3b86/productionoutputs/'], 'type': 'mltable', 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'synthetic-mltable-production-outputs', 'description': 'synthetic Dataset (production with outputs, MLTABLE) for demonstrating data drift', 'tags': {}, 'properties': {}, 'print_as_yaml': True, 'id': '/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/ai-services-rg/providers/Microsoft.MachineLearningServices/workspaces/schaeffler-ops-it-aml/data/synthetic-mltable-production-outputs/versions/2024.02.15.130209', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/hehein2/code/Users/hehein/datadrift', 'creation_context':