## 2. Model Training

In [199]:
# Importing pyspark dependencies
from pyspark.sql.types import IntegerType,BooleanType,DateType,NumericType,TimestampType
from pyspark.ml.feature import Binarizer, Bucketizer, QuantileDiscretizer
from com.microsoft.spark.sqlanalytics.Constants import Constants
import com.microsoft.spark.sqlanalytics
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Importing mlflow libraries
from mlflow.models import infer_signature, set_signature
from mlflow.models.model import get_model_info
import mlflow


# Importing general libraries
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score, classification_report
import pandas as pd

#AML workspace authentication using linked service
from notebookutils.mssparkutils import azureML
# Look up the Azure ML workspace through the Synapse linked service, then
# point MLflow at that workspace's tracking server.
linked_service_name = "AzureMLService1"
ws = azureML.getWorkspace(linked_service_name)

# Any "v2.0" in the workspace's tracking URI is swapped for "v1.0" before use.
# NOTE(review): presumably an API-version compatibility workaround — confirm.
workspace_tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(workspace_tracking_uri.replace("v2.0", "v1.0"))

## Importing SynapseML
from synapse.ml.featurize import Featurize
from synapse.ml.lightgbm import *
from synapse.ml.train import ComputeModelStatistics

In [200]:
# Load the existing internal table and discard its 'cost' column right away.
source_table = "synapseazuremldedicates.dbo.class_transformed_taxi_data"
taxi_reader = spark.read
df = taxi_reader.synapsesql(source_table).drop('cost')

# Preview the first five rows of the dataframe
preview_rows = df.head(5)
display(preview_rows)

### 2.2 Split the dataset into train/test sets using a stratified strategy

In [201]:
# Stratified split on the label column: sampleBy keeps each row of a listed
# class with the given probability, so the per-class share is approximately
# (not exactly) 70%. Classes missing from the mapping are sampled with
# fraction 0 and would therefore never appear in the train set.
label_col = "cost_class"
train_fraction = 0.7
train_fractions = {label: train_fraction for label in range(5)}
train_df = df.sampleBy(label_col, fractions=train_fractions, seed=10)

print("----------------------------------------------------------------------")
print("Printing count of train dataset")
train_df.groupBy(label_col).count().show()

# The test set is everything that was not sampled into train.
# exceptAll (EXCEPT ALL) is used instead of subtract (EXCEPT DISTINCT):
# subtract would drop every row whose values also occur in train_df and
# collapse duplicate rows, silently shrinking the test set whenever two rows
# share identical values.
# NOTE(review): both frames are lazily recomputed from `df`; this assumes the
# source read returns rows in a stable partitioning so the seeded sample is
# reproduced identically — consider caching `df` if counts drift.
test_df = df.exceptAll(train_df)

print("----------------------------------------------------------------------")
print("Printing count of test dataset")
test_df.groupBy(label_col).count().show()

### 2.3 Train featurizer

In [203]:
# Turn on MLflow autologging; exclusive=False lets autologged content be
# written into the run that is opened explicitly below.
mlflow.autolog(exclusive=False)

# Set MLflow experiment.
experiment_name = "taxi-classifier-experiment"
mlflow.set_experiment(experiment_name)

# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed inside it.
with mlflow.start_run(run_name="class_featurization") as featurization:
    # Every column except the label is used as a feature.
    feature_cols = [column for column in train_df.columns if column != "cost_class"]
    featurize = (
        Featurize()
        .setOutputCol("features")
        .setInputCols(feature_cols)
        .setOneHotEncodeCategoricals(True)
        .setNumFeatures(4096)
    )

    # Fit on the train split only, then apply the same fitted transformation
    # to both splits.
    featurizer_model = featurize.fit(train_df)
    train_df_trans = featurizer_model.transform(train_df)
    test_df_trans = featurizer_model.transform(test_df)

# Registering featurizer
# NOTE(review): assumes autologging stored the fitted featurizer under the
# "model" artifact path of this run — confirm.
model_name = 'featurizer'
model_uri = f"runs:/{featurization.info.run_id}/model"
mlflow.register_model(model_uri=model_uri, name=model_name)

### 2.3 Train classifier

In [280]:
import shutil

# Turn on MLflow autologging; exclusive=False lets autologged content be
# written into the run that is opened explicitly below.
mlflow.autolog(exclusive=False)

# Set MLflow experiment.
experiment_name = "taxi-classifier-experiment"
mlflow.set_experiment(experiment_name)


def _log_split_metrics(fitted_model, dataset, split_name):
    """Score one data split and log its classification metrics to the active MLflow run.

    Args:
        fitted_model: Fitted model exposing ``transform``.
        dataset: Featurized Spark DataFrame containing the 'cost_class' label.
        split_name: Prefix for the logged metric names (e.g. "Train", "Test").

    Returns:
        Tuple ``(predictions, metrics)``: the scored DataFrame and the collected
        ComputeModelStatistics rows.
    """
    scored = fitted_model.transform(dataset)
    stats = (
        ComputeModelStatistics(
            evaluationMetric="classification",
            labelCol='cost_class',
            scoredLabelsCol='prediction',
        )
        .transform(scored)
        .collect()
    )
    for metric_name in (
        'accuracy',
        'precision',
        'recall',
        'macro_averaged_precision',
        'macro_averaged_recall',
    ):
        mlflow.log_metric(f"{split_name} {metric_name}", stats[0][metric_name])
    print(stats)
    return scored, stats


# The context manager ends the run when the block exits, so no explicit
# mlflow.end_run() call is needed inside it.
with mlflow.start_run(run_name="class_training") as training:

    lightgbm_classifier = (
        LightGBMClassifier()
        .setFeaturesCol("features")
        .setRawPredictionCol("rawPrediction")
        .setDefaultListenPort(12402)
        .setNumLeaves(5)
        .setNumIterations(10)
        .setObjective("multiclass")
        .setLabelCol("cost_class")
        .setLeafPredictionCol("leafPrediction")
        .setFeaturesShapCol("featuresShap")
    )

    lightgbm_model = lightgbm_classifier.fit(train_df_trans)

    # Use mlflow.spark.save_model to save the model to your path.
    # save_model refuses to write into an existing non-empty directory, so the
    # output of a previous execution of this cell is removed first to keep the
    # cell re-runnable.
    shutil.rmtree("lightgbm_model", ignore_errors=True)
    mlflow.spark.save_model(lightgbm_model, "lightgbm_model")
    # Use mlflow.spark.log_model to log the model if you have a connected mlflow service
    mlflow.spark.log_model(lightgbm_model, "lightgbm_model")

    # Evaluate the in-memory fitted model on both splits. The train metrics are
    # logged first; `predictions` / `metrics` end up holding the test results.
    _log_split_metrics(lightgbm_model, train_df_trans, "Train")
    predictions, metrics = _log_split_metrics(lightgbm_model, test_df_trans, "Test")

# Registering classifier: points the registry at the model logged by
# mlflow.spark.log_model above.
model_name = 'classification_demo'
model_uri = f"runs:/{training.info.run_id}/lightgbm_model"
mlflow.register_model(model_uri=model_uri, name=model_name)

### 2.4 Testing models

#### 2.4.1 Testing model from run folder

In [288]:
def _latest_registered_run_id(registered_name):
    """Return the run id behind the newest version of a registered model.

    Args:
        registered_name: Name of the model in the MLflow model registry.

    Returns:
        The ``run_id`` of the highest-numbered entry in ``latest_versions``.

    Raises:
        LookupError: If no registered model has that name, or it has no versions.
    """
    for registered in mlflow.search_registered_models():
        if registered.name != registered_name:
            continue
        versions = registered.latest_versions
        if not versions:
            raise LookupError(f"Registered model '{registered_name}' has no versions")
        # latest_versions holds one entry per stage, so index 0 is not
        # guaranteed to be the newest; pick the highest version explicitly.
        newest = max(versions, key=lambda version: int(version.version))
        return newest.run_id
    # Failing loudly avoids silently reusing a run id left over from an
    # earlier lookup.
    raise LookupError(f"Registered model '{registered_name}' was not found")


### --------------------------------------------------------------------------- ###
# Featurizer: find the run behind its latest registered version and load the
# model from that run's artifacts in Azure Machine Learning.
model_name = 'featurizer'
latest_version_run_id = _latest_registered_run_id(model_name)

run_id = latest_version_run_id
artifact_path = "model"
model_uri = f"runs:/{run_id}/{artifact_path}"
featurizer = mlflow.pyfunc.load_model(model_uri)

### --------------------------------------------------------------------------- ###
# Classifier: same lookup, loaded from the "lightgbm_model" artifact path.
model_name = 'classification_demo'
latest_version_run_id = _latest_registered_run_id(model_name)

run_id = latest_version_run_id
artifact_path = "lightgbm_model"
model_uri = f"runs:/{run_id}/{artifact_path}"
model = mlflow.pyfunc.load_model(model_uri)

### --------------------------------------------------------------------------- ###
# The test split is featurized with the in-session `featurizer_model` from the
# training cell; the `featurizer` loaded above is not used for this step.
test_df_trans = featurizer_model.transform(test_df)
# NOTE: this collects the whole featurized test split to the driver even
# though only the first rows are scored below.
pandas_test_df = test_df_trans.toPandas()

# testing model
test_predictions = model.predict(pandas_test_df.head())
print(test_predictions)

#### 2.4.2 Testing model from model registry using pyfunc

In [289]:
# Load the classifier through the model registry (latest registered version)
# as a generic pyfunc model, then score the first rows of the pandas test frame.
registry_model_uri = "models:/classification_demo/latest"
model = mlflow.pyfunc.load_model(model_uri=registry_model_uri)

sample_rows = pandas_test_df.head()
test_predictions = model.predict(sample_rows)
print(test_predictions)

#### 2.4.3 Testing model from model registry using spark

In [290]:
# Load the same registered classifier with the Spark flavor and score the
# featurized test split directly as a Spark DataFrame.
spark_model = mlflow.spark.load_model(model_uri="models:/classification_demo/latest")
scored_test_df = spark_model.transform(test_df_trans)
display(scored_test_df)