In [0]:
from pyspark.ml import PipelineModel
from pyspark.ml.classification import RandomForestClassifier
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col, udf
from pyspark.sql.types import DoubleType
from pyspark.sql import functions as F
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

# Load the persisted feature-slicer pipeline and the preprocessed Delta datasets
slicer_model = PipelineModel.load("/FileStore/models/slicer_top10")
train_ready = spark.read.load("/FileStore/data/train_ready", format="delta")
val_ready = spark.read.load("/FileStore/data/val_ready", format="delta")

# Run both splits through the slicer pipeline
train_topk = slicer_model.transform(train_ready)
val_topk = slicer_model.transform(val_ready)

# Part 1: keep every label == 1 row and a 10% sample (without replacement) of the rest.
# Rows with a null label match neither predicate and are dropped from both frames.
minority_df = train_topk.where(F.col("label") == 1)
majority_df = train_topk.where(F.col("label") != 1)
majority_sample_10 = majority_df.sample(withReplacement=False, fraction=0.1, seed=42)
train_balanced_10 = majority_sample_10.union(minority_df)

# Inverse-frequency class weights: weight(label) = total rows / rows carrying that label
label_counts = train_balanced_10.groupBy("label").count().collect()
label_dict = dict((row["label"], row["count"]) for row in label_counts)
total = sum(label_dict.values())
class_weights = {lbl: total / n for lbl, n in label_dict.items()}

# Define UDF to assign weights
def get_weight(label):
    """Return the weight stored for ``label`` in the global ``class_weights`` as a float.

    Raises:
        KeyError: if ``label`` is not a key of ``class_weights``.
    """
    weight = class_weights[label]
    return float(weight)
# Expose the Python lookup to Spark as a UDF that returns a double
weight_udf = F.udf(get_weight, returnType=DoubleType())

# Attach the per-row class weight derived from the label
label_weight_col = weight_udf(F.col("label"))
train_weighted_10 = train_balanced_10.withColumn("classWeightCol", label_weight_col)

# Configure the Random Forest (the same estimator object is reused for later fits)
rf = RandomForestClassifier().setParams(
    labelCol="label",
    featuresCol="features",
    weightCol="classWeightCol",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# Fit on the weighted 10% sample
model_10 = rf.fit(train_weighted_10)

# Score the validation set
val_preds_10 = model_10.transform(val_topk)

# Function to apply custom threshold
def apply_threshold(df, threshold):
    """Append an ``adjusted_prediction`` column by thresholding ``probability``.

    Args:
        df: DataFrame with a ``probability`` column; the element at index 1
            of each value is compared against ``threshold``.
        threshold: Cutoff value. Rows strictly above it get 1.0, all others 0.0.

    Returns:
        ``df`` with an additional double column ``adjusted_prediction``.
    """
    def _to_label(prob):
        # Strict comparison: a score exactly equal to the threshold maps to 0.0
        return 1.0 if prob[1] > threshold else 0.0

    predict_udf = udf(_to_label, DoubleType())
    thresholded_col = predict_udf(col("probability"))
    return df.withColumn("adjusted_prediction", thresholded_col)

# Grid search to find best threshold based on F1-score for class 1.
# Thresholds run from 0.05 to 0.90 in steps of 0.05; ties keep the earliest (lowest) threshold
# because the comparison below is strict. If no threshold yields F1 > 0, the 0.5 default is kept.
best_f1 = 0
best_threshold = 0.5

# Materialize the model scores once. Without caching, every threshold in the loop
# re-runs the forest over the full validation set before applying the cutoff.
val_scored_10 = val_preds_10.select("probability", "label").cache()

print("\nThreshold Search (for F1 class 1):")
try:
    for t in [x / 100.0 for x in range(5, 95, 5)]:
        adjusted_df = apply_threshold(val_scored_10, t)
        # MulticlassMetrics expects an RDD of (prediction, label) pairs of doubles
        rdd = adjusted_df.select("adjusted_prediction", "label").rdd.map(lambda r: (float(r[0]), float(r[1])))
        metrics = MulticlassMetrics(rdd)
        f1_class1 = metrics.fMeasure(1.0)
        print(f"Threshold = {t:.2f} | F1 (class 1): {f1_class1:.4f}")
        if f1_class1 > best_f1:
            best_f1 = f1_class1
            best_threshold = t
finally:
    # Release the cached scores whether or not the search completed
    val_scored_10.unpersist()

print(f"\nBest Threshold Found: {best_threshold:.2f} with F1 for class 1 = {best_f1:.4f}")


Threshold Search (for F1 class 1):
Threshold = 0.05 | F1 (class 1): 0.0696
Threshold = 0.10 | F1 (class 1): 0.0696
Threshold = 0.15 | F1 (class 1): 0.0696
Threshold = 0.20 | F1 (class 1): 0.0696
Threshold = 0.25 | F1 (class 1): 0.0696
Threshold = 0.30 | F1 (class 1): 0.0696
Threshold = 0.35 | F1 (class 1): 0.0696
Threshold = 0.40 | F1 (class 1): 0.0696
Threshold = 0.45 | F1 (class 1): 0.0764
Threshold = 0.50 | F1 (class 1): 0.0817
Threshold = 0.55 | F1 (class 1): 0.0000
Threshold = 0.60 | F1 (class 1): 0.0000
Threshold = 0.65 | F1 (class 1): 0.0000
Threshold = 0.70 | F1 (class 1): 0.0000
Threshold = 0.75 | F1 (class 1): 0.0000
Threshold = 0.80 | F1 (class 1): 0.0000
Threshold = 0.85 | F1 (class 1): 0.0000
Threshold = 0.90 | F1 (class 1): 0.0000

Best Threshold Found: 0.50 with F1 for class 1 = 0.0817


In [0]:
# Part 2: keep every minority row and a 40% sample (without replacement) of the majority rows;
# the threshold chosen in Part 1 is applied later to this model's predictions
majority_sample_40 = majority_df.sample(withReplacement=False, fraction=0.4, seed=42)
train_balanced_40 = majority_sample_40.union(minority_df)

# Inverse-frequency class weights for the 40% sample: total rows / rows carrying that label
label_counts_40 = train_balanced_40.groupBy("label").count().collect()
label_dict_40 = dict((row["label"], row["count"]) for row in label_counts_40)
total_40 = sum(label_dict_40.values())
class_weights_40 = {lbl: total_40 / n for lbl, n in label_dict_40.items()}

# Define UDF to assign weights for 40% sample
def get_weight_40(label):
    """Return the weight stored for ``label`` in the global ``class_weights_40`` as a float.

    Raises:
        KeyError: if ``label`` is not a key of ``class_weights_40``.
    """
    weight = class_weights_40[label]
    return float(weight)
weight_udf_40 = F.udf(get_weight_40, returnType=DoubleType())

# Attach the per-row class weight derived from the label
label_weight_col_40 = weight_udf_40(F.col("label"))
train_weighted_40 = train_balanced_40.withColumn("classWeightCol", label_weight_col_40)

# Fit a new model on the 40% sample, reusing the estimator configured for Part 1
model_40 = rf.fit(train_weighted_40)

# Score the validation set with the new model
val_preds_40 = model_40.transform(val_topk)

# NOTE(review): best_threshold was selected on model_10's validation scores and is
# reused here for model_40 without re-tuning.
val_preds_adjusted_40 = apply_threshold(df=val_preds_40, threshold=best_threshold)

# Final evaluation
# The (prediction, label) pairs feed MulticlassMetrics and several count() actions.
# Cache them so model inference and the threshold UDF are not re-run for every action.
final_rdd_40 = (
    val_preds_adjusted_40
    .select("adjusted_prediction", "label")
    .rdd.map(lambda r: (float(r[0]), float(r[1])))
    .cache()
)
metrics_40 = MulticlassMetrics(final_rdd_40)

labels = [0.0, 1.0]

# Per-class report plus running totals for the support-weighted averages
report_40 = {}
total_support_40 = 0
weighted_sum_40 = {"precision": 0, "recall": 0, "f1": 0}

for label in labels:
    precision = metrics_40.precision(label)
    recall = metrics_40.recall(label)
    f1 = metrics_40.fMeasure(label)
    # Support = number of validation rows whose true label is this class
    support = final_rdd_40.filter(lambda r, target=label: r[1] == target).count()
    report_40[label] = {"precision": precision, "recall": recall, "f1-score": f1, "support": support}
    total_support_40 += support
    weighted_sum_40["precision"] += precision * support
    weighted_sum_40["recall"] += recall * support
    weighted_sum_40["f1"] += f1 * support

# Unweighted mean over classes, built from the values already stored in report_40
macro_avg_40 = {
    "precision": sum(report_40[l]["precision"] for l in labels) / len(labels),
    "recall": sum(report_40[l]["recall"] for l in labels) / len(labels),
    "f1-score": sum(report_40[l]["f1-score"] for l in labels) / len(labels),
    "support": total_support_40
}

# Width of the first (row label) column. "Weighted Avg" is 12 characters, so a width of 10
# overflowed and pushed that row's numbers out of alignment with the other rows.
label_width = 14

# Print confusion matrix
print("\nConfusion Matrix (40% undersampling):")
print(metrics_40.confusionMatrix().toArray().astype(int))

# Print classification report
print("\nClassification Report (40% undersampling):")
print(f"{'Class':<{label_width}}{'Precision':>10}{'Recall':>10}{'F1-Score':>10}{'Support':>10}")
for label in labels:
    vals = report_40[label]
    print(f"{str(int(label)):<{label_width}}{vals['precision']:10.4f}{vals['recall']:10.4f}{vals['f1-score']:10.4f}{vals['support']:10d}")

# Accuracy (correct predictions / all predictions) is already available from the metrics
# object, so no additional passes over the prediction RDD are needed.
accuracy_40 = metrics_40.accuracy
print(f"\n{'Accuracy':<{label_width}}{'':>10}{'':>10}{accuracy_40:10.4f}{total_support_40:10d}")
print(f"{'Macro Avg':<{label_width}}{macro_avg_40['precision']:10.4f}{macro_avg_40['recall']:10.4f}{macro_avg_40['f1-score']:10.4f}{macro_avg_40['support']:10d}")

# Support-weighted averages: each class metric weighted by its share of the validation rows
weighted_precision_40 = weighted_sum_40['precision'] / total_support_40
weighted_recall_40 = weighted_sum_40['recall'] / total_support_40
weighted_f1_40 = weighted_sum_40['f1'] / total_support_40
print(f"{'Weighted Avg':<{label_width}}{weighted_precision_40:10.4f}{weighted_recall_40:10.4f}{weighted_f1_40:10.4f}{total_support_40:10d}")

# Persist the model fitted on the 40% sample, replacing any model already stored at this path
model_writer_40 = model_40.write().overwrite()
model_writer_40.save("/FileStore/models/rf_top10_weighted_model_40pct")


Confusion Matrix (40% undersampling):
[[411943 301978]
 [ 12738  13960]]

Classification Report (40% undersampling):
Class      Precision    Recall  F1-Score   Support
0             0.9700    0.5770    0.7236    713921
1             0.0442    0.5229    0.0815     26698

Accuracy                          0.5751    740619
Macro Avg     0.5071    0.5500    0.4025    740619
Weighted Avg    0.9366    0.5751    0.7004    740619
