In [0]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorSlicer
from pyspark.ml import Pipeline
import numpy as np

# Read the three preprocessed splits (train / validation / test) from their
# Delta locations; the source format is passed straight to `load`.
train_ready = spark.read.load("/FileStore/data/train_ready", format="delta")
val_ready = spark.read.load("/FileStore/data/val_ready", format="delta")
test_ready = spark.read.load("/FileStore/data/test_ready", format="delta")

In [0]:
# --- Train Random Forest on preprocessed training data ----------
# Estimator settings are gathered in one mapping so they can be read (and
# tuned) in a single place before being handed to the constructor.
rf_params = {
    "featuresCol": "features",
    "labelCol": "label",
    "weightCol": "weight",  # Remove if you don't have class weights
    "numTrees": 100,
    "maxDepth": 6,
    "seed": 42,
}
rf = RandomForestClassifier(**rf_params)

# Fit on the training split only.
rf_model = rf.fit(train_ready)

In [0]:
# --- Extract top-10 feature indices -----------------------------
top_k = 10

# Importances come back as a Spark vector; convert to a NumPy array so the
# slots can be ranked.
imp = rf_model.featureImportances.toArray()

# argsort ranks ascending, so reverse it and keep the first `top_k` slots.
ascending_order = np.argsort(imp)
descending_order = ascending_order[::-1]
top_idx = descending_order[:top_k].tolist()

print("Top-10 feature indices:", top_idx)

Top-10 feature indices: [10, 4, 8, 9, 3, 0, 1, 7, 1335, 6]


In [0]:
# --- Optional: Map indices to names -----------------------------
# Spark ML keeps the per-slot attributes of a vector column in the column
# metadata under meta["ml_attr"]["attrs"], grouped by attribute type.
meta = train_ready.schema["features"].metadata
attr_groups = meta.get("ml_attr", {}).get("attrs", {})

# Spark files categorical attributes under "nominal"; "categorical" is kept
# only so metadata written with that custom key is still picked up.
attrs = []
for group in ("numeric", "binary", "nominal", "categorical"):
    attrs += attr_groups.get(group, [])

# Key each name by the attribute's own "idx" rather than by its position in a
# sorted list: when some vector slots carry no attribute, positional lookup
# would shift every later name onto the wrong index.
name_by_index = {
    attr["idx"]: attr["name"]
    for attr in attrs
    if "idx" in attr and "name" in attr
}

# Slots with no recorded name fall back to the "<hashed_i>" placeholder.
for i in top_idx:
    name = name_by_index.get(i, f"<hashed_{i}>")
    print(f"Index {i}: {name}")

Index 10: <hashed_10>
Index 4: <hashed_4>
Index 8: <hashed_8>
Index 9: <hashed_9>
Index 3: <hashed_3>
Index 0: <hashed_0>
Index 1: <hashed_1>
Index 7: <hashed_7>
Index 1335: <hashed_1335>
Index 6: <hashed_6>


In [0]:
# --- Create and save VectorSlicer pipeline ----------------------
# The slicer copies only the selected `top_idx` slots of "features" into a
# new "features_topK" column.
slicer = VectorSlicer(
    inputCol="features",
    outputCol="features_topK",
    indices=top_idx,
)

# Wrap the slicer in a single-stage pipeline and fit it on the training split.
slicer_pipe = Pipeline(stages=[slicer])
slicer_model = slicer_pipe.fit(train_ready)

# Persist the fitted pipeline, replacing anything already stored at the path.
slicer_writer = slicer_model.write().overwrite()
slicer_writer.save("/FileStore/models/slicer_top10")