# Train a model

## Connect to the workspace

Connect to the Azure Machine Learning workspace and set up MLflow tracking.

In [None]:
# Handle to the workspace
from sklearn.ensemble import RandomForestClassifier
from azure.identity import DefaultAzureCredential
from azure.ai.ml import MLClient
import mlflow
import mltable

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import os
from mlflow.models import infer_signature

# Build the workspace client with MLClient.from_config, authenticating
# through DefaultAzureCredential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# Look up this workspace and send MLflow tracking to the URI it exposes.
workspace = ml_client.workspaces.get(ml_client.workspace_name)
azureml_mlflow_uri = workspace.mlflow_tracking_uri
mlflow.set_tracking_uri(azureml_mlflow_uri)

# All runs in this notebook are logged under this experiment.
experiment_name = "Synteticdata-Monitoring-Models-Experiment"
mlflow.set_experiment(experiment_name)

## Model Training

### Prepare Data

In [None]:
# Fetch the latest version of the training data asset.
data_asset = ml_client.data.get("synthetic-mltable-training", label="latest")

# Load the MLTable at the asset path and materialize it as a pandas DataFrame.
tbl = mltable.load(data_asset.path)
df = tbl.to_pandas_dataframe()

# Split into input features and the "failure" target column.
X, y = df.drop(columns=["failure"]), df["failure"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between executions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

# Preview the first training rows (displayed as the cell output).
X_train.head(5)

### Train the model

In [None]:
import shutil

model_path = "./models/monitoring"

# Start from an empty model directory: remove any leftover from a previous
# execution, then recreate it.
if os.path.exists(model_path):
    shutil.rmtree(model_path)

os.makedirs(model_path, exist_ok=False)

# Best-effort: close a run left open by an earlier execution of this cell.
# Catch Exception rather than using a bare except so that KeyboardInterrupt
# and SystemExit still propagate, and report the failure instead of hiding it.
try:
    mlflow.end_run()
except Exception as exc:
    print(f"Warning: could not end the previous MLflow run: {exc}")

# Start Logging (the run is closed with mlflow.end_run() once the model has
# been registered).
mlflow.start_run()

# Enable autologging (optional)
# mlflow.sklearn.autolog()

# Training a model:
model = RandomForestClassifier().fit(X_train, y_train)

# Accuracy on the held-out split: fraction of predictions equal to the label.
y_hat = model.predict(X_test)
acc = np.average(y_hat == y_test)
mlflow.log_metric('Accuracy', float(acc))

# ROC AUC from the scores in column 1 of predict_proba
# (presumably the positive "failure" class - confirm the label encoding).
y_scores = model.predict_proba(X_test)
auc = roc_auc_score(y_test, y_scores[:, 1])
mlflow.log_metric('AUC', float(auc))

### Register the model

In [None]:
# Infer the model signature from the held-out features and the predictions
# the model produced for them.
signature = infer_signature(X_test, y_hat)

# Log the model to the active run and register it under a fixed name.
print("Registering the model via MLFlow")
mlflow.sklearn.log_model(
    sk_model=model,
    artifact_path="model",
    signature=signature,
    registered_model_name="monitoring-synthetic-pred-maintenance",
)

# Also write a copy of the model to the local model_path directory.
mlflow.sklearn.save_model(sk_model=model, path=model_path, signature=signature)

# Close the run that was opened before training.
mlflow.end_run()


In [None]:
# Clean up the local model directory (model_path) if it is present.
if os.path.exists(model_path):
    shutil.rmtree(model_path)

## Run Inference

- [ ] TBD

In [None]:
import pandas as pd
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Recreate the workspace client so this part of the notebook can be run
# without re-executing the connection cell.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

production_dataset_name = "synthetic-urifolder-production"

# Fetch the latest version of the production data asset and read its parquet data.
data_asset = ml_client.data.get(name=production_dataset_name, label="latest")
df = pd.read_parquet(data_asset.path)

# Keep the observed labels aside; the model inputs exclude both the label
# and the timestamp column.
y_test = df["failure"]
X_test = df.drop(columns=["failure", "timestamp"])
X_test.head(5)

Predict failures

In [None]:
# Score the production features with the classifier trained earlier in this
# notebook; the bare name on the last line displays the result as cell output.
predictions = model.predict(X_test)
predictions

In [None]:
# Preserve the observed labels in a new column before "failure" is overwritten.
df["true_failure"] = y_test
# From here on "failure" holds the model's predictions, not the observed labels.
df["failure"] = predictions

In [None]:
production_outputs_folder = "data/productionoutputs/"

# Recreate the local output folder from scratch so it contains only the file
# written below.
shutil.rmtree(production_outputs_folder, ignore_errors=True)
os.makedirs(production_outputs_folder, exist_ok=False)

# The folder name already ends with "/", so plain string formatting produced
# a doubled separator ("productionoutputs//outputs.parquet"); os.path.join
# builds the path without it.
df.to_parquet(os.path.join(production_outputs_folder, "outputs.parquet"))

In [None]:
import time

# Asset version derived from the current UTC time, e.g. "2024.01.31.235959";
# used below as the version of the outputs data asset.
VERSION = time.strftime(
    "%Y.%m.%d.%H%M%S",
    time.gmtime(time.time()),
)

In [None]:
from azure.ai.ml.constants import AssetTypes
from azure.ai.ml.entities import Data

# Describe the local outputs folder as a URI-folder data asset, versioned
# with the timestamp-based VERSION string.
outputs_data_asset = Data(
    name="synthetic-urifolder-production-outputs",
    version=VERSION,
    type=AssetTypes.URI_FOLDER,
    path=production_outputs_folder,
    description="synthetic Dataset (production-outputs) for demonstrating data drift; parquet file",
)

# Create (or update) the data asset in the workspace.
ml_client.data.create_or_update(outputs_data_asset)

In [None]:
import mltable

# Fetch the outputs data asset by its exact name and VERSION.
outputs_data_asset = ml_client.data.get(
    name="synthetic-urifolder-production-outputs",
    version=VERSION,
)

# The asset path is passed to mltable as a {"folder": ...} entry.
path = {"folder": outputs_data_asset.path}
print(f"Data asset path: {path}")

# Load the parquet data into a DataFrame and preview the first rows.
tbl = mltable.from_parquet_files(paths=[path])
df = tbl.to_pandas_dataframe()
df.head(5)

In [None]:
import shutil

# Remove the local outputs folder; with ignore_errors=True, removal errors
# (for example the folder already being gone) are ignored.
shutil.rmtree(production_outputs_folder, ignore_errors=True)