## 2. Model Training

In [1]:
# Importing pyspark dependencies
from pyspark.sql.types import IntegerType,BooleanType,DateType,NumericType,TimestampType
from pyspark.ml.feature import Binarizer, Bucketizer, QuantileDiscretizer
from com.microsoft.spark.sqlanalytics.Constants import Constants
import com.microsoft.spark.sqlanalytics
from pyspark.sql.functions import col
import pyspark.sql.functions as F

# Importing mlflow libraries
from mlflow.models import infer_signature, set_signature
from mlflow.models.model import get_model_info
import mlflow


# Importing general libraries
from sklearn.metrics import accuracy_score, f1_score, precision_score, confusion_matrix, recall_score, roc_auc_score, classification_report
import pandas as pd

# Authenticate against the Azure ML workspace through the Synapse linked service
from notebookutils.mssparkutils import azureML

linked_service_name = "AzureMLService1"
ws = azureML.getWorkspace(linked_service_name)

# Point MLflow at the workspace tracking endpoint; the "v2.0" segment of the
# returned URI is swapped for "v1.0" before it is registered.
tracking_uri = ws.get_mlflow_tracking_uri().replace("v2.0", "v1.0")
mlflow.set_tracking_uri(tracking_uri)

## Importing SynapseML
from synapse.ml.featurize import Featurize
from synapse.ml.lightgbm import *
from synapse.ml.train import ComputeModelStatistics

In [2]:
# Load the transformed taxi table through the Synapse SQL connector,
# leaving out the 'cost' column.
source_table = "synapseazuremldedicates.dbo.class_transformed_taxi_data"
df = spark.read.synapsesql(source_table).drop('cost')

# Preview the first five rows
preview_rows = df.head(5)
display(preview_rows)

### 2.2 Split the dataset into train/test sets using a stratified strategy

In [3]:
# Stratified train/test split on `cost_class`.
#
# Every row is tagged with a unique id so the test set can be built as
# "all rows whose id was not sampled into train". The previous
# `df.subtract(train_df)` has EXCEPT DISTINCT semantics: it de-duplicates the
# result and removes every row whose values match any training row, so train
# and test did not form a partition of `df`.
TRAIN_FRACTION = 0.7
SPLIT_SEED = 10
ROW_ID_COL = "_split_row_id"

# Cache the id-tagged frame so the sampling and the anti-join below read the
# same materialised rows.
# NOTE(review): cache() is best-effort; if the source read is not stable across
# recomputation, consider persisting/checkpointing to storage instead.
indexed_df = df.withColumn(ROW_ID_COL, F.monotonically_increasing_id()).cache()

# Derive the sampling fractions from the labels actually present rather than
# hard-coding 0..4: sampleBy treats any stratum missing from `fractions` as 0,
# which would silently keep that class out of the training set.
# Null labels are skipped (not a valid sampleBy key), so such rows fall into test.
class_labels = [
    row["cost_class"]
    for row in indexed_df.select("cost_class").distinct().collect()
    if row["cost_class"] is not None
]
fractions = {label: TRAIN_FRACTION for label in class_labels}

train_indexed = indexed_df.sampleBy("cost_class", fractions=fractions, seed=SPLIT_SEED)
train_df = train_indexed.drop(ROW_ID_COL)

print("----------------------------------------------------------------------")
print("Printing count of train dataset")
train_df.groupBy("cost_class").count().show()

# Test set = exactly the rows that were not sampled into train (duplicates preserved).
test_df = indexed_df.join(
    train_indexed.select(ROW_ID_COL), on=ROW_ID_COL, how="left_anti"
).drop(ROW_ID_COL)

print("----------------------------------------------------------------------")
print("Printing count of test dataset")
test_df.groupBy("cost_class").count().show()

### 1 - Setting up dependencies

Define the models to be used.

In [None]:
#https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/classification.html

In [13]:
from synapse.ml.automl import TuneHyperparameters
from synapse.ml.train import TrainClassifier
from pyspark.ml.classification import (
    LogisticRegression,
    RandomForestClassifier,
    GBTClassifier,
)

# Candidate Spark ML estimators.
logReg = LogisticRegression()
randForest = RandomForestClassifier()
gbt = GBTClassifier()  # instantiated, but not included in the list of models below

# Only these two estimators take part in the search; each one is wrapped in a
# SynapseML TrainClassifier that targets the "cost_class" label column.
smlmodels = [logReg, randForest]
mmlmodels = list(
    map(
        lambda estimator: TrainClassifier(model=estimator, labelCol="cost_class"),
        smlmodels,
    )
)

### 2 - Find the best model using AutoML


Import SynapseML's AutoML classes from `synapse.ml.automl`. Specify the hyperparameters using the `HyperparamBuilder`. Add either `DiscreteHyperParam` or `RangeHyperParam` hyperparameters. `TuneHyperparameters` will randomly choose values from a uniform distribution:

In [14]:
from synapse.ml.automl import *

# Register the hyperparameters to explore, one call per (estimator, param) pair.
paramBuilder = HyperparamBuilder()
paramBuilder = paramBuilder.addHyperparam(
    logReg, logReg.regParam, RangeHyperParam(0.1, 0.3)
)
paramBuilder = paramBuilder.addHyperparam(
    randForest, randForest.numTrees, DiscreteHyperParam([5, 10])
)
paramBuilder = paramBuilder.addHyperparam(
    randForest, randForest.maxDepth, DiscreteHyperParam([3, 5])
)

# Build the search space (each entry pairs a param with its estimator and
# hyperparameter distribution) and print it for inspection.
searchSpace = paramBuilder.build()
print(searchSpace)

# Wrap the search space so that values can be drawn from it at random.
randomSpace = RandomSpace(searchSpace)

In [19]:
# Configure the random hyperparameter search: 5-fold cross-validation scored
# on accuracy, with a total run count of 5 x the number of candidate models,
# parallelism of 1 and a fixed seed.
tuner = TuneHyperparameters(
    models=mmlmodels,
    paramSpace=randomSpace.space(),
    evaluationMetric="accuracy",
    numFolds=5,
    numRuns=5 * len(mmlmodels),
    parallelism=1,
    seed=0,
)

# Fit on the training split; the fitted result is kept as `bestModel`.
bestModel = tuner.fit(train_df)

In [20]:
# Show the best model's info followed by the best model itself, one per line.
print(bestModel.getBestModelInfo(), bestModel.getBestModel(), sep="\n")

In [21]:
from synapse.ml.train import ComputeModelStatistics

# Score the held-out test split with the tuned model.
prediction = bestModel.transform(test_df)

# Compute evaluation statistics for those predictions, then pull at most ten
# rows into pandas so the cell renders them as its output.
stats_transformer = ComputeModelStatistics()
metrics = stats_transformer.transform(prediction)
metrics.limit(10).toPandas()