In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.classification import (
    LogisticRegressionModel, 
    RandomForestClassificationModel, 
    GBTClassificationModel
)
from pyspark.sql.functions import col
import numpy as np
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# --- Slicer pipeline applied to the prepared test set ---
# NOTE(review): `spark` is not defined in this cell; presumably the
# notebook-provided SparkSession -- confirm when running elsewhere.
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
test_ready = (
    spark.read
    .format("delta")
    .load("/FileStore/data/test_ready")
)
test_topk = slicer_model.transform(test_ready)

# --- Evaluator configured for the "f1" metric on label/prediction ---
evaluator = MulticlassClassificationEvaluator(
    metricName="f1",
    labelCol="label",
    predictionCol="prediction",
)

# --- Saved models: (display name, loader class, storage path) ---
# Both lookup dicts below are derived from this single table so a model's
# name, class and path cannot drift apart.
_MODEL_REGISTRY = [
    (
        "Logistic Regression",
        LogisticRegressionModel,
        "/FileStore/models/lr_top10_weighted_model",
    ),
    (
        "Random Forest",
        RandomForestClassificationModel,
        "/FileStore/models/rf_top10_weighted_model_40pct",
    ),
    (
        "GBT",
        GBTClassificationModel,
        "/FileStore/models/gbt_top10_no_weights",
    ),
]

model_paths = {entry[0]: entry[2] for entry in _MODEL_REGISTRY}
model_classes = {entry[0]: entry[1] for entry in _MODEL_REGISTRY}

# --- Evaluation Helper ---
def evaluate_spark_model(model_name, model, df):
    """Score a fitted binary classifier on ``df`` and print a report.

    Prints the confusion matrix and a per-class precision / recall /
    F1-score / support table for labels ``0.0`` and ``1.0``, followed by
    accuracy, macro-average and support-weighted-average rows.

    Args:
        model_name: Display name used in the report header.
        model: Fitted model whose ``transform(df)`` output contains
            ``prediction`` and ``label`` columns.
        df: DataFrame to score.

    Returns:
        The F1-score of class ``1.0``.
    """
    preds = model.transform(df)

    # MulticlassMetrics is fed (prediction, label) pairs of floats.
    preds_rdd = preds.select("prediction", "label").rdd.map(
        lambda r: (float(r[0]), float(r[1]))
    )
    metrics = MulticlassMetrics(preds_rdd)

    labels = [0.0, 1.0]

    # A single pass over the labels gives every class's support, instead of
    # launching one filter().count() job per label.
    label_counts = preds_rdd.map(lambda r: r[1]).countByValue()

    report = {
        label: {
            "precision": metrics.precision(label),
            "recall": metrics.recall(label),
            "f1-score": metrics.fMeasure(label),
            "support": int(label_counts.get(label, 0)),
        }
        for label in labels
    }
    total_support = sum(report[label]["support"] for label in labels)

    def _weighted_avg(key):
        # Support-weighted mean of a per-class metric; 0.0 when no rows
        # carry either label, rather than dividing by zero.
        if total_support == 0:
            return 0.0
        weighted = sum(
            report[label][key] * report[label]["support"] for label in labels
        )
        return weighted / total_support

    # Reuse the per-class values already fetched above for the macro average.
    macro_avg = {
        "precision": np.mean([report[label]["precision"] for label in labels]),
        "recall": np.mean([report[label]["recall"] for label in labels]),
        "f1-score": np.mean([report[label]["f1-score"] for label in labels]),
        "support": total_support,
    }

    # The row-label column must fit the longest label ('Weighted Avg', 12
    # characters); a narrower column pushes that row out of alignment.
    label_width = 12

    print(f"\n Evaluation for {model_name}:")
    print("Confusion Matrix:")
    print(metrics.confusionMatrix().toArray().astype(int))

    print("\nClassification Report:")
    print(f"{'Class':<{label_width}}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
    for label in labels:
        vals = report[label]
        print(f"{str(int(label)):<{label_width}}{vals['precision']:10.4f}{vals['recall']:10.4f}{vals['f1-score']:10.4f}{vals['support']:10d}")

    print(f"\n{'Accuracy':<{label_width}}{'':>10}{'':>10}{metrics.accuracy:10.4f}{total_support:10d}")
    print(f"{'Macro Avg':<{label_width}}{macro_avg['precision']:10.4f}{macro_avg['recall']:10.4f}{macro_avg['f1-score']:10.4f}{macro_avg['support']:10d}")
    print(f"{'Weighted Avg':<{label_width}}{_weighted_avg('precision'):10.4f}{_weighted_avg('recall'):10.4f}{_weighted_avg('f1-score'):10.4f}{total_support:10d}")

    return report[1.0]['f1-score']

# --- Load every saved model and collect its class-1 F1 on the test set ---
f1_scores = []

for name in model_paths:
    loader = model_classes[name]
    model = loader.load(model_paths[name])
    f1_scores.append((name, evaluate_spark_model(name, model, test_topk)))

# --- Rank models, best class-1 F1 first ---
f1_scores = sorted(f1_scores, key=lambda pair: pair[1], reverse=True)

print("\n Final model comparison (sorted by F1-score for class 1):")
for name, score in f1_scores:
    summary_line = f"{name:20} ➤  F1-score (class 1): {score:.4f}"
    print(summary_line)


 Evaluation for Logistic Regression:
Confusion Matrix:
[[329234 403185]
 [ 10726  16688]]

Classification Report:
Class      Precision    Recall  F1-Score   Support
0             0.9684    0.4495    0.6140    732419
1             0.0397    0.6087    0.0746     27414

Accuracy                          0.4553    759833
Macro Avg     0.5041    0.5291    0.3443    759833
Weighted Avg    0.9349    0.4553    0.5946    759833

 Evaluation for Random Forest:
Confusion Matrix:
[[512154 220265]
 [ 17187  10227]]

Classification Report:
Class      Precision    Recall  F1-Score   Support
0             0.9675    0.6993    0.8118    732419
1             0.0444    0.3731    0.0793     27414

Accuracy                          0.6875    759833
Macro Avg     0.5060    0.5362    0.4456    759833
Weighted Avg    0.9342    0.6875    0.7854    759833

 Evaluation for GBT:
Confusion Matrix:
[[732406     13]
 [ 27414      0]]

Classification Report:
Class      Precision    Recall  F1-Score   Support
0       