In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType
from pyspark.mllib.evaluation import MulticlassMetrics
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the fitted feature-slicer pipeline and the prepared Delta datasets
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
train_ready = spark.read.load("/FileStore/data/train_ready", format="delta")
val_ready = spark.read.load("/FileStore/data/val_ready", format="delta")

# Run both splits through the slicer
train_topk, val_topk = (
    slicer_model.transform(split) for split in (train_ready, val_ready)
)

# Light undersampling: keep every row with label == 1 and sample the
# remaining rows without replacement at a fraction of 0.8 (fixed seed)
minority_df = train_topk.where(col("label") == 1)
majority_df = train_topk.where(col("label") != 1)
majority_sampled = majority_df.sample(withReplacement=False, fraction=0.8, seed=42)
train_balanced = majority_sampled.union(minority_df)

# Show the per-label row counts of the resulting training set
print("\nLabel distribution after balancing:")
label_distribution = train_balanced.groupBy("label").count()
label_distribution.show()

# Derive one weight per label: total row count divided by that label's count,
# so labels with fewer rows get proportionally larger weights
label_counts = train_balanced.groupBy("label").count().collect()
label_dict = dict((row["label"], row["count"]) for row in label_counts)
total = sum(label_dict.values())
class_weights = {label: total / label_dict[label] for label in label_dict}

# Create UDF to assign weights
def get_weight(label):
    """Return the weight stored for *label* in the module-level ``class_weights``.

    Args:
        label: Key to look up in ``class_weights``.

    Returns:
        The stored weight converted to ``float``.

    Raises:
        KeyError: If *label* is not a key of ``class_weights``.
    """
    weight = class_weights[label]
    return float(weight)

# Wrap get_weight as a Spark UDF that returns a double
weight_udf = udf(get_weight, returnType=DoubleType())

# Attach each row's weight, looked up from its label, as "classWeightCol"
train_weighted = train_balanced.withColumn(
    "classWeightCol",
    weight_udf(F.col("label")),
)

# Settings for the class-weighted logistic regression
lr_params = {
    "labelCol": "label",
    "featuresCol": "features",
    "weightCol": "classWeightCol",
    "maxIter": 100,
    "regParam": 0.01,
    "elasticNetParam": 0.0,  # Ridge regularization
}
lr = LogisticRegression(**lr_params)

# Fit on the weighted training data
model = lr.fit(train_weighted)

# Score the validation split with the fitted model
val_preds = model.transform(val_topk)

# Define function to apply a custom threshold
def apply_threshold(df, threshold):
    """Add an ``adjusted_prediction`` column obtained by thresholding ``probability``.

    Args:
        df: DataFrame with a ``probability`` column; the element at index 1 of
            each value is compared against *threshold* (presumably the class-1
            probability -- confirm against the model that produced ``df``).
        threshold: Cut-off value; rows whose ``probability[1]`` is strictly
            greater than it get 1.0, all others get 0.0.

    Returns:
        ``df`` with an extra double column named ``adjusted_prediction``.
    """
    def classify(prob):
        if prob[1] > threshold:
            return 1.0
        return 0.0

    to_prediction = udf(classify, DoubleType())
    return df.withColumn("adjusted_prediction", to_prediction(col("probability")))

# Grid search for the decision threshold that maximises the F1-score of class 1.
# NOTE(review): the threshold is chosen on val_preds, the same data the final
# evaluation further down is computed on, so those scores may be optimistic.
best_f1 = 0
best_threshold = 0.5

# Each threshold triggers a full Spark job. Cache the only two columns the
# search needs so the upstream lineage of val_preds is evaluated once instead
# of once per threshold.
threshold_input = val_preds.select("probability", "label").cache()

print("\n Threshold Search (for F1-score of class 1):")
try:
    for t in [x / 100.0 for x in range(5, 95, 5)]:
        adjusted_df = apply_threshold(threshold_input, t)
        # MulticlassMetrics consumes an RDD of (prediction, label) float pairs
        rdd = adjusted_df.select("adjusted_prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
        metrics = MulticlassMetrics(rdd)
        f1_class1 = metrics.fMeasure(1.0)
        print(f"Threshold = {t:.2f} | F1 (class 1): {f1_class1:.4f}")
        # Strict comparison: on ties the lowest threshold seen first is kept
        if f1_class1 > best_f1:
            best_f1 = f1_class1
            best_threshold = t
finally:
    # Release the cached data even if a Spark job fails mid-search
    threshold_input.unpersist()

print(f"\n Best Threshold Found: {best_threshold:.2f} with F1-score (class 1) = {best_f1:.4f}")

# Apply the best threshold
val_preds_adjusted = apply_threshold(val_preds, best_threshold)

# Final evaluation on (prediction, label) float pairs
final_rdd = val_preds_adjusted.select("adjusted_prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
metrics = MulticlassMetrics(final_rdd)

labels = [0.0, 1.0]

# Count rows per true label in a single pass over the data, instead of
# launching one filter+count Spark job for every label
support_by_label = final_rdd.map(lambda r: r[1]).countByValue()

# Prepare report dictionary: per-label metrics plus support-weighted sums
report = {}
total_support = 0
weighted_sum = {"precision": 0, "recall": 0, "f1": 0}

for label in labels:
    precision = metrics.precision(label)
    recall = metrics.recall(label)
    f1 = metrics.fMeasure(label)
    # A label that never occurs in the data has a support of 0
    support = support_by_label.get(label, 0)
    report[label] = {"precision": precision, "recall": recall, "f1-score": f1, "support": support}
    total_support += support
    weighted_sum["precision"] += precision * support
    weighted_sum["recall"] += recall * support
    weighted_sum["f1"] += f1 * support

# Unweighted mean over the labels, reusing the values already stored in report
macro_avg = {
    "precision": sum(report[l]["precision"] for l in labels) / len(labels),
    "recall": sum(report[l]["recall"] for l in labels) / len(labels),
    "f1-score": sum(report[l]["f1-score"] for l in labels) / len(labels),
    "support": total_support
}

# Print confusion matrix
print("\nConfusion Matrix:")
print(metrics.confusionMatrix().toArray().astype(int))

# Width of the first (row label) column. It must fit the longest row label,
# "Weighted Avg" (12 characters); a narrower field pushes that row's numbers
# out of line with the header.
label_width = 14

# Print classification report like sklearn output
print("\nClassification Report:")
print(f"{'Class':<{label_width}}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
for label in labels:
    vals = report[label]
    print(f"{str(int(label)):<{label_width}}{vals['precision']:10.4f}{vals['recall']:10.4f}{vals['f1-score']:10.4f}{vals['support']:10d}")

# Accuracy is a single number, shown in the F1-Score column with the other cells left blank
print(f"\n{'Accuracy':<{label_width}}{'':>10}{'':>10}{metrics.accuracy:10.4f}{total_support:10d}")
print(f"{'Macro Avg':<{label_width}}{macro_avg['precision']:10.4f}{macro_avg['recall']:10.4f}{macro_avg['f1-score']:10.4f}{macro_avg['support']:10d}")

# Avoid ZeroDivisionError when there are no rows; the weighted sums are 0 then
weighted_denominator = total_support or 1
print(f"{'Weighted Avg':<{label_width}}{(weighted_sum['precision']/weighted_denominator):10.4f}{(weighted_sum['recall']/weighted_denominator):10.4f}{(weighted_sum['f1']/weighted_denominator):10.4f}{total_support:10d}")

# Persist the fitted model, overwriting whatever is already stored at this path
model_writer = model.write().overwrite()
model_writer.save("/FileStore/models/lr_top10_weighted_model")


Label distribution after balancing:
+-----+-------+
|label|  count|
+-----+-------+
|  0.0|2699450|
|  1.0| 125441|
+-----+-------+


 Threshold Search (for F1-score of class 1):
Threshold = 0.05 | F1 (class 1): 0.0696
Threshold = 0.10 | F1 (class 1): 0.0696
Threshold = 0.15 | F1 (class 1): 0.0696
Threshold = 0.20 | F1 (class 1): 0.0696
Threshold = 0.25 | F1 (class 1): 0.0696
Threshold = 0.30 | F1 (class 1): 0.0696
Threshold = 0.35 | F1 (class 1): 0.0696
Threshold = 0.40 | F1 (class 1): 0.0696
Threshold = 0.45 | F1 (class 1): 0.0702
Threshold = 0.50 | F1 (class 1): 0.0727
Threshold = 0.55 | F1 (class 1): 0.0645
Threshold = 0.60 | F1 (class 1): 0.0282
Threshold = 0.65 | F1 (class 1): 0.0095
Threshold = 0.70 | F1 (class 1): 0.0025
Threshold = 0.75 | F1 (class 1): 0.0004
Threshold = 0.80 | F1 (class 1): 0.0001
Threshold = 0.85 | F1 (class 1): 0.0000
Threshold = 0.90 | F1 (class 1): 0.0000

 Best Threshold Found: 0.50 with F1-score (class 1) = 0.0727

Confusion Matrix:
[[371688 342233]
 [