In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType

# Load the persisted slicer pipeline and the prepared Delta datasets.
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
train_ready = spark.read.load("/FileStore/data/train_ready", format="delta")
val_ready = spark.read.load("/FileStore/data/val_ready", format="delta")

# Run both datasets through the slicer pipeline.
train_topk = slicer_model.transform(train_ready)
val_topk = slicer_model.transform(val_ready)

# Light rebalancing: keep every row whose label is 1 and down-sample the
# remaining rows (without replacement, fraction 0.1, fixed seed) before
# stacking the two parts back together.
label_col = F.col("label")
minority_df = train_topk.where(label_col == 1)
majority_df = train_topk.where(label_col != 1)
majority_sample = majority_df.sample(withReplacement=False, fraction=0.1, seed=42)
train_balanced = majority_sample.union(minority_df)

# Compute per-class weights from the balanced training set: each label gets
# (total number of rows) / (rows with that label), so labels with fewer rows
# receive proportionally larger weights.
counts_by_label = train_balanced.groupBy("label").count()
label_counts = counts_by_label.collect()
label_dict = {r["label"]: r["count"] for r in label_counts}
total = sum(label_dict.values())
class_weights = {lbl: total / n_rows for lbl, n_rows in label_dict.items()}

def get_weight(label):
    """Return the weight of ``label`` from ``class_weights`` as a float.

    Raises:
        KeyError: if ``label`` is not a key of ``class_weights``.
    """
    return float(class_weights[label])


# Retained so that any other cell still referencing ``weight_udf`` keeps
# working; the weight column below no longer goes through it.
weight_udf = F.udf(get_weight, DoubleType())


def _class_weight_column(weights, label_column="label"):
    """Build a native Spark column mapping each label value to its weight.

    A chained ``when`` expression is evaluated inside the JVM, avoiding the
    per-row Python round trip that a Python UDF incurs for what is just a
    constant lookup. Rows whose label is not in ``weights`` get NULL.

    Args:
        weights: mapping of label value -> numeric weight.
        label_column: name of the column holding the label.

    Returns:
        A double-typed ``Column`` expression.
    """
    label_ref = F.col(label_column)
    weight_expr = None
    for label_value, weight in weights.items():
        matches = label_ref == label_value
        value = F.lit(float(weight))
        if weight_expr is None:
            weight_expr = F.when(matches, value)
        else:
            weight_expr = weight_expr.when(matches, value)
    if weight_expr is None:
        # No classes at all (empty training set): keep the column well-typed.
        return F.lit(None).cast(DoubleType())
    return weight_expr


# Add the per-row class weight column to the training DataFrame.
train_weighted = train_balanced.withColumn(
    "classWeightCol", _class_weight_column(class_weights)
)

# Configure the gradient-boosted tree classifier; per-row weights are read
# from the "classWeightCol" column.
gbt_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "weightCol": "classWeightCol",
    "maxIter": 20,
    "maxDepth": 5,
    "seed": 42,
}
gbt = GBTClassifier(**gbt_params)

# Fit the classifier on the weighted training set.
model = gbt.fit(train_weighted)

# Score the validation set. The predictions feed four separate Spark actions
# below (three metrics plus the confusion matrix); caching them avoids
# re-running the data load, the slicer and the GBT model for each action.
val_preds = model.transform(val_topk).cache()

# Evaluation. A single evaluator is reused and only its metric is switched.
# NOTE: "f1", "weightedPrecision" and "weightedRecall" are averages over all
# classes weighted by class frequency, not metrics of the positive class, so
# on imbalanced data they should be read together with the confusion matrix.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")
f1 = evaluator.setMetricName("f1").evaluate(val_preds)
precision = evaluator.setMetricName("weightedPrecision").evaluate(val_preds)
recall = evaluator.setMetricName("weightedRecall").evaluate(val_preds)

print(f"\n✅ GBT com Pesos:")
print(f"F1-score:  {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")

# Confusion matrix, built from (prediction, label) pairs cast to float.
preds_rdd = val_preds.select("prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
metrics = MulticlassMetrics(preds_rdd)
print("\nConfusion Matrix:")
print(metrics.confusionMatrix().toArray())

# All actions on the predictions are done; release the cached data.
val_preds.unpersist()

# Persist the trained model, overwriting whatever is already at this path.
model_writer = model.write().overwrite()
model_writer.save("/FileStore/models/gbt_top10_weighted_model")


✅ GBT com Pesos:
F1-score:  0.3148
Precision: 0.9639
Recall:    0.2217





Confusion Matrix:
[[1.37773e+05 5.76148e+05]
 [2.39000e+02 2.64590e+04]]
