In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as F
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col

# Previously saved pipeline stage that slices the features down to the top 10.
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")


def _read_delta(path):
    """Return the Delta table stored at *path* as a DataFrame."""
    return spark.read.format("delta").load(path)


# Preprocessed train/validation sets, read in full (no row limit is applied).
train_ready = _read_delta("/FileStore/data/train_ready")
val_ready = _read_delta("/FileStore/data/val_ready")

# Run both sets through the slicer so only the top-10 features are kept.
train_topk, val_topk = (
    slicer_model.transform(df) for df in (train_ready, val_ready)
)

# # Step 1: Separate label == 1 (minority) and others
# minority_df = train_ready.filter(col("label") == 1)
# majority_df = train_ready.filter(col("label") != 1)

# # Step 2: Keep a 40% sample of the majority class (drops ~60%)
# sampled_majority_df = majority_df.sample(withReplacement=False, fraction=0.4, seed=42)

# # Step 3: Combine all minority with sampled majority
# train_reduced = sampled_majority_df.union(minority_df)

# Count rows per label once on the driver; the counts drive the class weights.
label_counts = train_topk.groupBy("label").count().collect()
label_dict = {row["label"]: row["count"] for row in label_counts}
if not label_dict:
    # Without any label counts there is nothing to weight (and nothing to fit).
    raise ValueError("train_topk has no rows; cannot derive class weights")
total = sum(label_dict.values())

# Inverse-frequency weighting: weight = total rows / rows in that class,
# so rarer classes receive proportionally larger weights.
class_weights = {label: total / count for label, count in label_dict.items()}

# Attach the weight with native column expressions instead of a Python UDF.
# One `when` per distinct label; each yields NULL for non-matching rows and
# `coalesce` picks the single match. This keeps the work inside the JVM and
# avoids shipping every row to a Python worker. `eqNullSafe` makes a NULL
# label (if one was counted above) match its own weight, as a dict lookup would.
train_weighted = train_topk.withColumn(
    "classWeightCol",
    F.coalesce(
        *[
            F.when(
                F.col("label").eqNullSafe(F.lit(label)),
                F.lit(float(weight)),
            )
            for label, weight in class_weights.items()
        ]
    ),
)

# Logistic regression over the sliced feature vector. Per-row weights are
# read from "classWeightCol"; training is capped at 20 iterations.
lr = LogisticRegression(
    featuresCol="features_topK",
    labelCol="label",
    weightCol="classWeightCol",
    maxIter=20,
)

# Hyper-parameter grid for LR: 3 regularisation strengths x 3 elastic-net
# mixing values, i.e. 9 candidate settings.
reg_param_values = [0.01, 0.1, 0.5]
elastic_net_values = [0.0, 0.5, 1.0]
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, reg_param_values)
    .addGrid(lr.elasticNetParam, elastic_net_values)
    .build()
)

def _label_vs_prediction_evaluator(metric_name):
    """Build an evaluator scoring "prediction" against "label" on *metric_name*."""
    return MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name,
    )


# Primary evaluator (F1).
f1_evaluator = _label_vs_prediction_evaluator("f1")

# Additional evaluators. Note these are the *weighted* precision and recall
# metrics, not the per-class values for a single label.
precision_evaluator = _label_vs_prediction_evaluator("weightedPrecision")
recall_evaluator = _label_vs_prediction_evaluator("weightedRecall")

# Hyper-parameter search with a single internal train/validation split:
# 70% of the input is used to fit each candidate in `param_grid`, the
# remaining 30% to score it with `f1_evaluator`. Candidates are fitted one at
# a time (parallelism=1).
# A fixed seed makes the random 70/30 split, and therefore the selected
# hyper-parameters, reproducible from run to run.
tvs = TrainValidationSplit(
    estimator=lr,
    estimatorParamMaps=param_grid,
    evaluator=f1_evaluator,
    trainRatio=0.7,
    parallelism=1,
    seed=42,
)

# Train the model: fits every grid candidate and keeps the best one.
tvs_model = tvs.fit(train_weighted)

# Evaluation on the external validation set.
# The predictions are consumed by four separate actions below (three
# evaluators plus the confusion matrix). Caching avoids re-running the
# transform for each of them.
val_preds = tvs_model.transform(val_topk).cache()

f1_score = f1_evaluator.evaluate(val_preds)
precision = precision_evaluator.evaluate(val_preds)
recall = recall_evaluator.evaluate(val_preds)

# NOTE: "Precision" and "Recall" below are the evaluators' weighted metrics
# ("weightedPrecision" / "weightedRecall"), not minority-class values.
print("\n✅ Evaluation on validation set:")
print(f"F1-score:      {f1_score:.4f}")
print(f"Precision:     {precision:.4f}")
print(f"Recall:        {recall:.4f}")

# Confusion matrix. MulticlassMetrics is the RDD-based API and expects
# (prediction, label) pairs of floats.
predictionAndLabels = val_preds.select("prediction", "label") \
                               .rdd.map(lambda row: (float(row.prediction), float(row.label)))
metrics = MulticlassMetrics(predictionAndLabels)

print("\nConfusion Matrix:")
print(metrics.confusionMatrix().toArray())

# Save the best model found by the search, replacing any model already
# stored at the target path.
best_model_writer = tvs_model.bestModel.write().overwrite()
best_model_writer.save("/FileStore/models/lr_top10_model_weighted")


✅ Evaluation on validation set:
F1-score:      0.7566
Precision:     0.9349
Recall:        0.6474





Confusion Matrix:
[[468171. 245750.]
 [ 15381.  11317.]]
