In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import GBTClassifier
from pyspark.sql import functions as F

# Train a Gradient-Boosted Tree classifier on the top-10 sliced features,
# choose hyperparameters with a small grid search, report F1 on the external
# validation set, and persist the winning model.

# One seed for sampling, the estimator, and the tuning split so a rerun
# reproduces the same sample, the same hold-out rows, and the same model.
SEED = 42

# Load slicing stage (top-10 features)
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Load preprocessed datasets
train_ready = spark.read.format("delta").load("/FileStore/data/train_ready")
val_ready = spark.read.format("delta").load("/FileStore/data/val_ready")

# Apply slicer to keep only top-10 features
train_topk = slicer_model.transform(train_ready)
val_topk = slicer_model.transform(val_ready)

# Class balancing using sampleBy: keep 60% of label 0 and 100% of label 1.
# NOTE: sampleBy treats any label value missing from `fractions` as
# fraction 0, i.e. those rows are dropped entirely.
fractions = {0: 0.6, 1: 1.0}
train_sample = train_topk.sampleBy("label", fractions=fractions, seed=SEED)

# Define Gradient-Boosted Tree
gbt = GBTClassifier(
    labelCol="label",
    featuresCol="features_topK",
    seed=SEED,
    maxIter=20,  # number of boosting rounds
)

# Small param grid (to control compute load): 2 x 2 = 4 candidates
param_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3, 5])
    .addGrid(gbt.stepSize, [0.1, 0.2])
    .build()
)

# Evaluator (used both for grid selection and for the final report)
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)

# TrainValidationSplit
# trainRatio must leave rows for the internal validation split: with 1.0 the
# hold-out is empty, every candidate is scored on zero rows, and the "best"
# model is not actually selected by the metric. Holding out 20% costs no
# training data for the final model, because TrainValidationSplit refits the
# winning param map on the entire input dataset after selection.
tvs = TrainValidationSplit(
    estimator=gbt,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    trainRatio=0.8,
    parallelism=1,  # Required for Databricks CE
    seed=SEED,  # makes the internal train/validation split reproducible
)

# Train model with grid search
tvs_model = tvs.fit(train_sample)

# Evaluate on external validation set (not seen during grid selection)
val_preds = tvs_model.transform(val_topk)
f1_score = evaluator.evaluate(val_preds)

print(f"Best GBT model F1-score on validation set: {f1_score:.4f}")

# Save best model
tvs_model.bestModel.write().overwrite().save("/FileStore/models/gbt_top10_model_grid")