In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql import functions as F
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

# Fitted slicing stage; judging by its path and the comment it replaces, it
# keeps only the top-10 features (the classifier reads them from "features_topK").
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Read the preprocessed train and validation Delta tables in full (no row limit).
train_ready, val_ready = (
    spark.read.format("delta").load(delta_path)
    for delta_path in ("/FileStore/data/train_ready", "/FileStore/data/val_ready")
)

# Run both datasets through the slicer so they carry the reduced feature column.
train_topk, val_topk = (
    slicer_model.transform(dataset) for dataset in (train_ready, val_ready)
)

# Shrink the majority class of the training set before fitting.
# Rows with label == 1 are treated as the minority class and are all kept;
# every other row is treated as majority and is randomly down-sampled.
MAJORITY_SAMPLE_FRACTION = 0.1  # keep roughly 10% of the majority rows

# Step 1: separate label == 1 (minority) from the rest
minority_df = train_topk.filter(col("label") == 1)
majority_df = train_topk.filter(col("label") != 1)

# Step 2: sample the majority class without replacement.
# DataFrame.sample() does not guarantee an exact fraction, so the result is
# only approximately MAJORITY_SAMPLE_FRACTION of the majority rows; the seed
# keeps the draw repeatable for the same input.
sampled_majority_df = majority_df.sample(
    withReplacement=False,
    fraction=MAJORITY_SAMPLE_FRACTION,
    seed=42,
)

# Step 3: training set = sampled majority rows plus every minority row
train_reduced = sampled_majority_df.union(minority_df)

# Random Forest classifier trained on the sliced feature vector.
rf = RandomForestClassifier(featuresCol="features_topK", labelCol="label", seed=42)

# Hyper-parameter search space for the Random Forest: 2 x 2 x 2 = 8 candidates.
search_space = {
    rf.numTrees: [20, 50],
    rf.maxDepth: [5, 10],
    rf.maxBins: [32, 64],
}
grid_builder = ParamGridBuilder()
for hyper_param, candidate_values in search_space.items():
    grid_builder.addGrid(hyper_param, candidate_values)
param_grid = grid_builder.build()

# Evaluators over the "label" / "prediction" columns:
#   f1_evaluator        - primary metric (F1), also used for model selection
#   precision_evaluator - weighted precision
#   recall_evaluator    - weighted recall
# NOTE(review): these are class-frequency-weighted averages, so on a heavily
# imbalanced dataset they are dominated by the majority class — confirm that
# this is the metric the model should be selected on.
f1_evaluator, precision_evaluator, recall_evaluator = (
    MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )
    for metric_name in ("f1", "weightedPrecision", "weightedRecall")
)

# Hyper-parameter search with a single random train/validation split:
# 70% of train_reduced fits each candidate in param_grid and the remaining
# 30% scores it with f1_evaluator.
# The seed is pinned (like the sampler and the classifier) so the split, and
# therefore the selected model, is reproducible between runs.
tvs = TrainValidationSplit(
    estimator=rf,
    estimatorParamMaps=param_grid,
    evaluator=f1_evaluator,
    trainRatio=0.7,
    parallelism=1,  # evaluate candidates one at a time
    seed=42,
)

# Fit every candidate and keep the best one (exposed as tvs_model.bestModel)
tvs_model = tvs.fit(train_reduced)

# Evaluation on the external validation set
val_preds = tvs_model.transform(val_topk)

# Every metric below needs only these two columns. Cache the projection so the
# model is applied to the validation set once instead of once per metric.
val_scored = val_preds.select("prediction", "label").cache()

f1_score = f1_evaluator.evaluate(val_scored)
precision = precision_evaluator.evaluate(val_scored)
recall = recall_evaluator.evaluate(val_scored)

print("\n✅ Evaluation on validation set:")
print(f"F1-score:      {f1_score:.4f}")
print(f"Precision:     {precision:.4f}")
print(f"Recall:        {recall:.4f}")

# Confusion matrix (rows = true label, columns = predicted label, both in
# ascending label order). MulticlassMetrics expects (prediction, label) pairs.
predictionAndLabels = val_scored.rdd.map(
    lambda row: (float(row.prediction), float(row.label))
)
metrics = MulticlassMetrics(predictionAndLabels)

print("\nConfusion Matrix:")
print(metrics.confusionMatrix().toArray())

# The weighted metrics above are dominated by the majority class, so also
# report the metrics of the minority class on its own.
# Assumes label 1 is the minority/positive class and that it occurs among the
# validation labels — TODO confirm for other datasets.
minority_label = 1.0
minority_precision = metrics.precision(minority_label)
minority_recall = metrics.recall(minority_label)
minority_f1 = metrics.fMeasure(minority_label)

print("\nMinority class (label=1) metrics:")
print(f"F1-score:      {minority_f1:.4f}")
print(f"Precision:     {minority_precision:.4f}")
print(f"Recall:        {minority_recall:.4f}")

# Release the cached projection now that all metrics have been computed
val_scored.unpersist()

# Persist the best model found by the search, replacing any previous copy.
best_model = tvs_model.bestModel
model_writer = best_model.write().overwrite()
model_writer.save("/FileStore/models/rf_top10_model")


✅ Evaluation on validation set:
F1-score:      0.9462
Precision:     0.9308
Recall:        0.9639





Confusion Matrix:
[[7.13899e+05 2.20000e+01]
 [2.66970e+04 1.00000e+00]]
