# Notebook 03 — Model Training (MLflow Tracking)

This notebook trains multiple classification models using the Gold dataset.
All experiments are tracked with MLflow.  
The models evaluated are:

- Logistic Regression  
- Random Forest Classifier  
- Gradient-Boosted Trees (GBT)

The best model is selected based on the F1-score.


In [0]:
# Load the Gold dataset (stored as a Delta table) that feeds model training.
df_gold = (
    spark.read
    .format("delta")
    .load("dbfs:/Volumes/workspace/credit-risk/credit-risk/gold")
)

# Render the table and print its schema as a quick sanity check.
df_gold.display()
df_gold.printSchema()


In [0]:
# Keep only the feature vector and the target, exposing `risk_idx` under the
# name `label` so it matches the labelCol used by the models and evaluator.
data = df_gold.select("features", "risk_idx").withColumnRenamed("risk_idx", "label")

# Preview a few rows and confirm the resulting schema.
data.show(5)
data.printSchema()


## Train/test split


In [0]:
# 80/20 train/test split with a fixed seed.
train, test = data.randomSplit(weights=[0.8, 0.2], seed=42)


## Import models and MLflow


In [0]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import mlflow
import mlflow.spark


## Define evaluation metric (F1-score)


In [0]:
# Shared evaluator used to score each model's predictions.
# NOTE(review): Spark's "f1" metric is the label-weighted F-measure — confirm
# that this is the intended model-selection criterion.
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)


## Define training helper (fit, evaluate, log to MLflow)


In [0]:
def train_model(model, name):
    """Fit an estimator, score it on the test split, and record the run in MLflow.

    Relies on the notebook-level ``train`` and ``test`` DataFrames and the
    shared ``evaluator``.

    Args:
        model: Unfitted estimator exposing ``fit``; the fitted result must
            expose ``transform``.
        name: Label used both as the MLflow run name and as the artifact
            path under which the fitted model is logged.

    Returns:
        Tuple ``(fitted_model, f1)`` where ``f1`` is the value returned by
        ``evaluator.evaluate`` on the test-split predictions.
    """
    with mlflow.start_run(run_name=name):
        fitted_model = model.fit(train)

        # Score the fitted model on the held-out split.
        test_predictions = fitted_model.transform(test)
        f1 = evaluator.evaluate(test_predictions)

        # Record the metric and the fitted model inside the active run.
        mlflow.log_metric("f1_score", float(f1))
        mlflow.spark.log_model(
            fitted_model,
            artifact_path=name,
            dfs_tmpdir="dbfs:/Volumes/workspace/credit-risk/credit-risk/mlflow_tmp",
        )

        print(f"{name} - F1 Score:", f1)

    return fitted_model, f1




In [0]:
# Train each candidate through the shared helper; every call opens its own
# MLflow run named after the second argument.
lr_model, lr_f1 = train_model(
    model=LogisticRegression(labelCol="label", featuresCol="features"),
    name="Logistic_Regression",
)

rf_model, rf_f1 = train_model(
    model=RandomForestClassifier(labelCol="label", featuresCol="features"),
    name="Random_Forest",
)

# NOTE(review): Spark's GBTClassifier handles binary labels only — confirm
# that `label` has exactly two classes.
gbt_model, gbt_f1 = train_model(
    model=GBTClassifier(labelCol="label", featuresCol="features", maxDepth=5),
    name="Gradient_Boosted_Trees",
)


In [0]:
# F1 score per candidate, keyed by display name.
scores = {
    "Logistic Regression": lr_f1,
    "Random Forest": rf_f1,
    "GBT": gbt_f1,
}

# Highest F1 wins; on a tie the earliest entry in `scores` is kept.
best_name, best_score = max(scores.items(), key=lambda entry: entry[1])

print("Best Model:", best_name)
print("F1 Score:", best_score)


In [0]:
# Map the winning display name back to its fitted model; any name other than
# the two checked here falls through to the GBT model.
best_model = (
    lr_model if best_name == "Logistic Regression" else
    rf_model if best_name == "Random Forest" else
    gbt_model
)

# Persist the selected model rather than a fixed one, so the saved artifact
# always matches the F1 comparison.
# NOTE(review): MLflow's save_model is documented to fail when the target
# path already exists — confirm how re-runs of this notebook should behave.
mlflow.spark.save_model(
    best_model,
    "/Volumes/workspace/credit-risk/credit-risk/models/best_model",
    dfs_tmpdir="/Volumes/workspace/credit-risk/credit-risk/mlflow_tmp"
)

print(f"{best_name} model saved successfully.")
