In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

# Previously saved pipeline used as the feature-slicing stage (per the model
# path, it keeps the top-10 features).
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Prepared train / validation splits, both read from Delta tables.
train_ready, val_ready = (
    spark.read.format("delta").load(path)
    for path in ("/FileStore/data/train_ready", "/FileStore/data/val_ready")
)

# Push both splits through the same slicer so they carry identical feature
# columns downstream.
train_topk, val_topk = (
    slicer_model.transform(frame) for frame in (train_ready, val_ready)
)

# Down-sample the majority side of the training data to soften class imbalance.
#
# NOTE(review): the previous comment read "Sample 60% of majority class" while
# the code passed fraction=0.4, i.e. it KEEPS roughly 40% of the majority rows
# (and drops roughly 60%). The runtime behaviour is preserved here; confirm
# which of the two was actually intended.
MAJORITY_KEEP_FRACTION = 0.4  # per-row keep probability for label != 1
SAMPLING_SEED = 42            # fixed seed so the sample is reproducible

# Step 1: Separate label == 1 (minority) from everything else.
# Rows whose label is NULL match neither filter and are therefore dropped.
minority_df = train_topk.filter(col("label") == 1)
majority_df = train_topk.filter(col("label") != 1)

# Step 2: Keep ~MAJORITY_KEEP_FRACTION of the majority rows. `sample` applies
# the fraction as a per-row probability, so the resulting row count is
# approximate rather than exact.
sampled_majority_df = majority_df.sample(
    withReplacement=False,
    fraction=MAJORITY_KEEP_FRACTION,
    seed=SAMPLING_SEED,
)

# Step 3: Combine the sampled majority rows with ALL minority rows.
train_reduced = sampled_majority_df.union(minority_df)

# Base estimator: a decision tree trained on the sliced feature vector.
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features_topK", seed=42)

# Hyper-parameter grid: 3 tree depths x 2 minimum-leaf sizes = 6 candidates.
grid_builder = ParamGridBuilder()
grid_builder.addGrid(dt.maxDepth, [5, 10, 15])
grid_builder.addGrid(dt.minInstancesPerNode, [1, 5])
param_grid = grid_builder.build()

# One evaluator per reported metric; they differ only in `metricName`.
f1_evaluator, precision_evaluator, recall_evaluator = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("f1", "weightedPrecision", "weightedRecall")
)

# Model selection: a single 70/30 train/validation split of the data passed
# to `fit`, scored with the F1 evaluator, one candidate at a time.
tvs_settings = {
    "estimator": dt,
    "estimatorParamMaps": param_grid,
    "evaluator": f1_evaluator,
    "trainRatio": 0.7,
    "parallelism": 1,
}
tvs = TrainValidationSplit(**tvs_settings)

# Fit every grid candidate on the reduced training set; the resulting model
# wraps the best candidate according to the F1 evaluator.
tvs_model = tvs.fit(train_reduced)

# Score the held-out validation set with the best model (lazy until an action).
val_preds = tvs_model.transform(val_topk)

# The predictions are consumed by four separate Spark actions below (three
# evaluators + the confusion matrix). Without caching, each action re-runs the
# model over the whole validation set. Only the two columns the metrics need
# are cached, which keeps the cached footprint small.
scored = val_preds.select("prediction", "label").cache()
try:
    f1_score = f1_evaluator.evaluate(scored)
    precision = precision_evaluator.evaluate(scored)
    recall = recall_evaluator.evaluate(scored)

    # NOTE: all three figures are label-frequency-weighted averages, so they
    # are dominated by the majority class; read them together with the
    # confusion matrix below to judge minority-class performance.
    print("\n✅ Evaluation on validation set:")
    print(f"F1-score:      {f1_score:.4f}")
    print(f"Precision:     {precision:.4f}")
    print(f"Recall:        {recall:.4f}")

    # Confusion Matrix. MulticlassMetrics expects an RDD of
    # (prediction, label) pairs as floats.
    predictionAndLabels = scored.rdd.map(
        lambda row: (float(row.prediction), float(row.label))
    )
    metrics = MulticlassMetrics(predictionAndLabels)

    # Rows are actual labels, columns are predicted labels, both ordered by
    # ascending class label.
    print("\nConfusion Matrix:")
    print(metrics.confusionMatrix().toArray())
finally:
    # Release the cached predictions whether or not evaluation succeeded.
    scored.unpersist()

# Save best model (overwrites any model already stored at this path).
tvs_model.bestModel.write().overwrite().save("/FileStore/models/dt_top10_model")


✅ Evaluation on validation set:
F1-score:      0.9463
Precision:     0.9344
Recall:        0.9639





Confusion Matrix:
[[7.13915e+05 6.00000e+00]
 [2.66970e+04 1.00000e+00]]
