# Notebook 03 — Model Training (MLflow Tracking)

This notebook trains multiple classification models using the Gold dataset.
All experiments are tracked with MLflow.  
The models evaluated are:

- Logistic Regression  
- Random Forest Classifier  
- Gradient-Boosted Trees (GBT)

The best model is selected based on the F1-score.


In [0]:
# Path of the Gold dataset, stored in Delta format.
gold_path = "dbfs:/Volumes/workspace/credit-risk/credit-risk/gold"

delta_reader = spark.read.format("delta")
df_gold = delta_reader.load(gold_path)

# Preview the rows, then print the column names and types.
df_gold.display()
df_gold.printSchema()



In [0]:
from pyspark.ml.feature import StringIndexer

# Encode the string target column `risk` as the numeric `label` column.
# handleInvalid is "error" rather than "keep": "keep" reserves an additional
# bucket (index numLabels) for invalid values, which would give the target an
# extra, phantom class. A row with an invalid `risk` now fails loudly instead.
label_indexer = StringIndexer(
    inputCol="risk",
    outputCol="label",
    handleInvalid="error"
)

# The indexer is fitted and applied on the same Gold DataFrame.
df_labeled = label_indexer.fit(df_gold).transform(df_gold)

# Spot-check the risk -> label mapping on the first rows.
df_labeled.select("risk", "label").show(10)


In [0]:
# Keep only the columns passed on to training and evaluation.
model_columns = ["features", "label"]
data = df_labeled.select(*model_columns)
data.printSchema()


## Train/test split


In [0]:
# 80/20 random split; the seed is fixed at 42.
split_weights = [0.8, 0.2]
train, test = data.randomSplit(split_weights, seed=42)


## Import models and MLflow


In [0]:
import os
import shutil

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import mlflow
import mlflow.spark



## Define evaluation metric (F1-score)


In [0]:
# Shared evaluator: scores the `prediction` column against `label` with the
# "f1" metric.
# NOTE(review): Spark's "f1" is presumably the weighted F-measure — confirm.
f1_settings = {
    "metricName": "f1",
    "labelCol": "label",
    "predictionCol": "prediction",
}
evaluator = MulticlassClassificationEvaluator(**f1_settings)


## Define training helper (fit, evaluate, log to MLflow)


In [0]:
def train_model(model, name, params=None):
    """Fit ``model`` on the training split, score it on the test split, and log the run to MLflow.

    Uses the notebook-level ``train``, ``test`` and ``evaluator`` objects.

    Args:
        model: Unfitted estimator exposing ``fit``; the fitted result must
            expose ``transform``.
        name: Used both as the MLflow run name and as the artifact path of
            the logged model.
        params: Optional mapping of hyperparameter names to values to record
            with the run. Nothing is logged when it is ``None`` or empty.

    Returns:
        A ``(fitted_model, f1)`` tuple, where ``f1`` is the value returned by
        ``evaluator.evaluate`` on the test-split predictions.
    """
    with mlflow.start_run(run_name=name):

        if params:
            # One batched call instead of one tracking request per parameter.
            mlflow.log_params(params)

        fitted = model.fit(train)
        preds = fitted.transform(test)

        f1 = evaluator.evaluate(preds)
        mlflow.log_metric("f1_score", float(f1))

        mlflow.spark.log_model(
            fitted,
            artifact_path=name,
            dfs_tmpdir="dbfs:/Volumes/workspace/credit-risk/credit-risk/mlflow_tmp"
        )

        print(f"{name} - F1 Score:", f1)
        return fitted, f1





## 8. Train models

In [0]:
# Each hyperparameter set is declared once and reused for both the estimator
# constructor and the MLflow log, so the logged values cannot drift from the
# values actually trained with. The tree ensembles also get an explicit seed
# (matching the seed used for the train/test split) so reruns are repeatable.

# Logistic Regression (library defaults, so no params are logged)
lr = LogisticRegression(featuresCol="features", labelCol="label")
lr_model, lr_f1 = train_model(lr, "Logistic_Regression")

# Random Forest
rf_params = {"numTrees": 200, "maxDepth": 10, "seed": 42}
rf = RandomForestClassifier(
    featuresCol="features",
    labelCol="label",
    **rf_params
)
rf_model, rf_f1 = train_model(rf, "Random_Forest", params=rf_params)

# Gradient Boosted Trees
gbt_params = {"maxDepth": 5, "maxIter": 50, "seed": 42}
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    **gbt_params
)
gbt_model, gbt_f1 = train_model(gbt, "Gradient_Boosted_Trees", params=gbt_params)



## 9. Compare results & pick best model

In [0]:
# F1 obtained by each candidate, keyed by a display name.
scores = dict([
    ("Logistic Regression", lr_f1),
    ("Random Forest", rf_f1),
    ("GBT", gbt_f1),
])

# Take the (name, score) pair with the highest score.
best_name, best_score = max(scores.items(), key=lambda entry: entry[1])

print("Best Model:", best_name)
print("Best F1 Score:", best_score)



## 10. Save best model to UC Volume

In [0]:
# Map the display names used as keys of `scores` to the fitted models. An
# unexpected name raises KeyError instead of silently falling back to GBT.
fitted_models = {
    "Logistic Regression": lr_model,
    "Random Forest": rf_model,
    "GBT": gbt_model,
}
best_model = fitted_models[best_name]

best_model_path = "/Volumes/workspace/credit-risk/credit-risk/models/best_model"

# mlflow.spark.save_model refuses to write into an existing target directory,
# so remove the export left by a previous run to keep the notebook re-runnable.
if os.path.isdir(best_model_path):
    shutil.rmtree(best_model_path)

mlflow.spark.save_model(
    spark_model=best_model,
    path=best_model_path,
    dfs_tmpdir="dbfs:/Volumes/workspace/credit-risk/credit-risk/mlflow_tmp"
)

print(f"Best model saved successfully: {best_name}")


In [0]:
# List the models directory to confirm what has been written there.
models_dir = "/Volumes/workspace/credit-risk/credit-risk/models"
display(dbutils.fs.ls(models_dir))
