In [0]:
#####################################
# Prerequisite: install the `catboost` library on the cluster (add it as a PyPI library).
#####################################

from pyspark.ml import PipelineModel
from pyspark.sql.functions import col, udf
from pyspark.ml.linalg import DenseVector, SparseVector
from pyspark.sql.types import ArrayType, DoubleType
import pandas as pd
import numpy as np
import os

from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Restore the persisted top-k features slicer pipeline
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")

# Training split: read the prepared Delta table, then run it through the slicer
train_ready = spark.read.format("delta").load("/FileStore/data/train_ready")
train_topk = slicer_model.transform(train_ready)

# Validation split: same two steps
val_ready = spark.read.format("delta").load("/FileStore/data/val_ready")
val_topk = slicer_model.transform(val_ready)

# Light undersampling for grid search: keep a 10% sample (without replacement)
# of the rows whose label is not 1, plus every row whose label is 1
is_minority = col("label") == 1
minority_df = train_topk.filter(is_minority)
majority_df = train_topk.filter(~is_minority)
train_balanced = majority_df.sample(
    withReplacement=False, fraction=0.1, seed=42
).union(minority_df)

# Convert VectorUDT to list
def vector_to_array(v):
    """Turn a Spark ML vector into a plain Python list.

    Args:
        v: A ``DenseVector`` or ``SparseVector``; any other value is
            passed through unchanged.

    Returns:
        ``v.toArray().tolist()`` for the two vector types, otherwise ``v``
        itself.
    """
    # Guard clause: anything that is not one of the two vector types is returned as-is.
    if not isinstance(v, (DenseVector, SparseVector)):
        return v
    return v.toArray().tolist()

# Wrap the converter as a Spark UDF that returns array<double>
vector_to_array_udf = udf(vector_to_array, ArrayType(DoubleType()))

train_array = train_balanced.withColumn("features_array", vector_to_array_udf("features_topK"))
val_array = val_topk.withColumn("features_array", vector_to_array_udf("features_topK"))

# Convert to Pandas (only the two columns needed for modelling are collected)
train_pd = train_array.select("features_array", "label").toPandas()
val_pd = val_array.select("features_array", "label").toPandas()

# Expand each feature list into one DataFrame column per position
X_train = pd.DataFrame(train_pd["features_array"].tolist())
y_train = train_pd["label"]

X_val = pd.DataFrame(val_pd["features_array"].tolist())
y_val = val_pd["label"]

# Compute class weights: total / count per label, so rarer labels weigh more
label_counts = y_train.value_counts().to_dict()
total = sum(label_counts.values())
class_weights = {label: total / count for label, count in label_counts.items()}
# CatBoost receives the weights as a list ordered by ascending label value
class_weights_list = [class_weights[i] for i in sorted(class_weights)]

# Hyperparameter tuning using GridSearchCV
params = {
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.05, 0.1],
    'iterations': [100, 200],
    'l2_leaf_reg': [1, 3, 5],
}

cat = CatBoostClassifier(
    loss_function='MultiClass',
    class_weights=class_weights_list,
    # GridSearchCV below runs with n_jobs=-1 (one worker per core). Leaving
    # CatBoost at its default thread count would let every worker spawn a
    # thread per core as well, oversubscribing the CPU. One thread per fit
    # keeps parallelism at the grid-search level; per the CatBoost docs this
    # setting affects speed only, not the trained model.
    thread_count=1,
    verbose=0,
    random_seed=42
)

# 3-fold cross-validation over the full grid, scored by macro-averaged F1
grid = GridSearchCV(cat, param_grid=params, cv=3, scoring='f1_macro', n_jobs=-1)
grid.fit(X_train, y_train)

# Best model
model = grid.best_estimator_
print(" Best Parameters Found:", grid.best_params_)

 Best Parameters Found: {'depth': 8, 'iterations': 200, 'l2_leaf_reg': 1, 'learning_rate': 0.1}


In [0]:
from sklearn.metrics import f1_score, balanced_accuracy_score, recall_score
from collections import Counter

# New undersampling: 80% of the majority class
minority_df = train_topk.filter(col("label") == 1)
majority_df = train_topk.filter(col("label") != 1)
train_balanced = majority_df.sample(False, 0.8, seed=42).union(minority_df)

# Repeat transformation and conversion to pandas
train_array = train_balanced.withColumn("features_array", vector_to_array_udf("features_topK"))
train_pd = train_array.select("features_array", "label").toPandas()

X_train = pd.DataFrame(train_pd["features_array"].tolist())
y_train = train_pd["label"]

# Manual Grid Search for Class Weights
best_score = -1
best_weights = None
best_model = None

weight_options = [11, 12, 13, 14, 15, 16, 17, 18, 19]
results = []

# Reuse the hyper-parameters selected by the GridSearchCV cell instead of
# re-typing them, so this search cannot drift from the tuned values if the
# grid search is re-run with a different outcome.
tuned_params = dict(grid.best_params_)

print("Starting grid search for class weights...")

# NOTE(review): the validation set is used for early stopping AND for picking
# the best weight, so the scores printed here are optimistic estimates.
for w1 in weight_options:
    class_weights_list = [1, w1]  # Class 0 → 1, Class 1 → w1

    model = CatBoostClassifier(
        **tuned_params,
        loss_function='MultiClass',
        class_weights=class_weights_list,
        early_stopping_rounds=50,
        verbose=0,
        random_seed=42
    )

    model.fit(X_train, y_train, eval_set=(X_val, y_val), verbose=0)

    # Flatten the predictions to 1-D before scoring and counting them
    y_pred = model.predict(X_val)
    y_pred = np.array(y_pred).ravel()

    # A model that predicts a single class is useless here; skip it
    unique_preds = np.unique(y_pred)
    if len(unique_preds) < 2:
        print(f"Ignored class_weights={class_weights_list} → only predicted class {unique_preds[0]}")
        continue

    score_f1 = f1_score(y_val, y_pred, average='macro')
    score_bal = balanced_accuracy_score(y_val, y_pred)
    score_rec = recall_score(y_val, y_pred, average='macro')

    results.append((class_weights_list, score_f1, score_bal, score_rec, model))

    print(f"class_weights={class_weights_list} | F1-macro: {score_f1:.4f} | Balanced Acc: {score_bal:.4f} | Recall-macro: {score_rec:.4f} | Pred distribution: {Counter(y_pred)}")

    # Selection criterion: highest macro F1 on the validation set
    if score_f1 > best_score:
        best_score = score_f1
        best_weights = class_weights_list
        best_model = model

# Fail loudly when every candidate was skipped; otherwise the summary would
# print the -1 sentinel and later cells would break on best_model being None.
if best_model is None:
    raise RuntimeError(
        "Class-weight search found no usable model: every candidate in "
        f"{weight_options} predicted a single class."
    )

print("\n Best result:")
print(f"Class weights: {best_weights}")
print(f"F1-macro: {best_score:.4f}")

Starting grid search for class weights...
class_weights=[1, 11] | F1-macro: 0.4909 | Balanced Acc: 0.5000 | Recall-macro: 0.5000 | Pred distribution: Counter({0.0: 740542, 1.0: 77})
class_weights=[1, 12] | F1-macro: 0.4913 | Balanced Acc: 0.5001 | Recall-macro: 0.5001 | Pred distribution: Counter({0.0: 740373, 1.0: 246})
class_weights=[1, 13] | F1-macro: 0.4917 | Balanced Acc: 0.5002 | Recall-macro: 0.5002 | Pred distribution: Counter({0.0: 740112, 1.0: 507})
class_weights=[1, 14] | F1-macro: 0.4942 | Balanced Acc: 0.5007 | Recall-macro: 0.5007 | Pred distribution: Counter({0.0: 738365, 1.0: 2254})
class_weights=[1, 15] | F1-macro: 0.4957 | Balanced Acc: 0.5007 | Recall-macro: 0.5007 | Pred distribution: Counter({0.0: 736230, 1.0: 4389})
class_weights=[1, 16] | F1-macro: 0.5015 | Balanced Acc: 0.5021 | Recall-macro: 0.5021 | Pred distribution: Counter({0.0: 725974, 1.0: 14645})
class_weights=[1, 17] | F1-macro: 0.4864 | Balanced Acc: 0.5159 | Recall-macro: 0.5159 | Pred distribution: C

In [0]:
# Fail early with a clear message instead of an AttributeError on None
if best_model is None:
    raise RuntimeError(
        "best_model is None: the class-weight search did not select a model, "
        "so there is nothing to evaluate or save."
    )

# Flatten the predictions to 1-D, the same way the class-weight search loop does
y_pred = np.asarray(best_model.predict(X_val)).ravel()

print(" Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

print("\n Classification Report:")
print(classification_report(y_val, y_pred, digits=4))

# Save the model (create the target directory first if it does not exist)
os.makedirs("/dbfs/FileStore/models/", exist_ok=True)
best_model.save_model("/dbfs/FileStore/models/cb_top10_model_best_weights.cbm")

 Confusion Matrix:
[[699912  14009]
 [ 26062    636]]

 Classification Report:
              precision    recall  f1-score   support

         0.0     0.9641    0.9804    0.9722    713921
         1.0     0.0434    0.0238    0.0308     26698

    accuracy                         0.9459    740619
   macro avg     0.5038    0.5021    0.5015    740619
weighted avg     0.9309    0.9459    0.9382    740619

