In [0]:
import os

import pandas as pd
from catboost import CatBoostClassifier
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, DoubleType
from sklearn.metrics import classification_report, confusion_matrix

# Load the fitted slicing pipeline (saved as "slicer_top10").
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Read the preprocessed train/validation Delta tables.
# NOTE: the tables are read in full -- no row limit is applied here.
train_ready = spark.read.load("/FileStore/data/train_ready", format="delta")
val_ready = spark.read.load("/FileStore/data/val_ready", format="delta")

# Run both splits through the slicer so they carry the sliced feature column.
train_topk, val_topk = (
    slicer_model.transform(split) for split in (train_ready, val_ready)
)

# ========== Light undersampling ==========
# Keep every label == 1 row; keep a 10% sample (without replacement,
# fixed seed) of all rows whose label is not 1.
minority_df = train_topk.where(col("label") == 1)
majority_df = train_topk.where(col("label") != 1)
majority_sample = majority_df.sample(withReplacement=False, fraction=0.1, seed=42)
train_balanced = majority_sample.union(minority_df)

# ========== Convert VectorUDT to a list of floats ==========
def vector_to_array(v):
    """Return the contents of an ML vector as a plain Python list.

    Dense and sparse vectors are expanded via ``toArray().tolist()``.
    Any other value (including ``None``) is passed through unchanged.
    """
    if not isinstance(v, (DenseVector, SparseVector)):
        return v
    return v.toArray().tolist()

# Wrap the converter as a Spark UDF declared to return array<double>.
vector_to_array_udf = udf(vector_to_array, returnType=ArrayType(DoubleType()))

# Add a "features_array" column (array form of "features_topK") to both splits.
features_as_array = vector_to_array_udf(col("features_topK"))
train_array = train_balanced.withColumn("features_array", features_as_array)
val_array = val_topk.withColumn("features_array", features_as_array)

# ========== Convert to Pandas ==========
# Only the array features and the label are pulled into pandas.
needed_cols = ["features_array", "label"]
train_pd = train_array.select(*needed_cols).toPandas()
val_pd = val_array.select(*needed_cols).toPandas()

# Features: expand each row's list into one DataFrame column per position.
X_train = pd.DataFrame(train_pd["features_array"].tolist())
X_val = pd.DataFrame(val_pd["features_array"].tolist())

# Labels: kept as pandas Series.
y_train, y_val = train_pd["label"], val_pd["label"]

# ========== Class Weights ==========
# Weight of a class = (total counted labels) / (rows carrying that label),
# so rarer classes get proportionally larger weights.
# NOTE(review): the training frame was already undersampled earlier in this
# script; these weights rebalance it further -- confirm both are intended.
label_counts = y_train.value_counts().to_dict()
total = sum(label_counts.values())
class_weights = {label: total / label_counts[label] for label in label_counts}

# Flatten to a list ordered by ascending label value for CatBoost.
class_weights_list = [weight for _, weight in sorted(class_weights.items())]

# ========== Train CatBoost ==========
# Hyper-parameters are collected in one mapping and unpacked into the
# classifier constructor.
catboost_params = {
    "iterations": 200,
    "learning_rate": 0.1,
    "depth": 6,
    "loss_function": "MultiClass",
    "class_weights": class_weights_list,
    "verbose": 50,  # log every 50 iterations
    "random_seed": 42,
}
model = CatBoostClassifier(**catboost_params)

# Fit on the (undersampled) training features and labels.
model.fit(X_train, y_train)

# ========== Evaluate ==========
# Predict on the validation features, then print the per-class report
# followed by the confusion matrix.
y_pred = model.predict(X_val)

print("\n✅ Classification Report on Validation Set:")
report_text = classification_report(y_val, y_pred)
print(report_text)

print("Confusion Matrix:")
conf_mat = confusion_matrix(y_val, y_pred)
print(conf_mat)

# ========== Save model ==========
# Single source of truth for the output location: the target directory is
# derived from this path instead of being a second hard-coded literal, so the
# two cannot drift apart. (`os` is imported with the other imports at the top.)
MODEL_PATH = "/dbfs/FileStore/models/cb_top10_model_weighted.cbm"

# 1. Make sure the target directory exists (no error if it already does).
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

# 2. Save the trained model.
model.save_model(MODEL_PATH)

0:	learn: 0.6834448	total: 201ms	remaining: 40s
50:	learn: 0.6260196	total: 7.75s	remaining: 22.6s
100:	learn: 0.6251217	total: 14.8s	remaining: 14.5s
150:	learn: 0.6244234	total: 22s	remaining: 7.14s
199:	learn: 0.6237874	total: 29s	remaining: 0us

✅ Classification Report on Validation Set:
              precision    recall  f1-score   support

         0.0       1.00      0.19      0.32    713921
         1.0       0.04      0.99      0.08     26698

    accuracy                           0.22    740619
   macro avg       0.52      0.59      0.20    740619
weighted avg       0.96      0.22      0.31    740619

Confusion Matrix:
[[137857 576064]
 [   259  26439]]
