Day2-1교시: 트리 기반 분류 (RF / GBT)
- RandomForestClassifier, GBTClassifier, feature importance
- 산출물: 모델별 성능 비교표

In [None]:
import os
import sys
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import SparkSession

# Environment detection: when the google.colab module is loaded, data files are
# looked up under /content; otherwise under the current working directory.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    BASE = "/content"
else:
    BASE = os.getcwd()

# Input CSV location and the single seed shared by the split and both models.
CSV_PATH = os.path.join(BASE, "TestData", "Social_Network_Ads.csv")
SEED = 42

# Reuse an active session if one exists, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Day2_RF_GBT")
    .getOrCreate()
)

In [None]:
# Load the ads dataset; the header row supplies column names and Spark infers
# the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(CSV_PATH)
)

# Preprocessing stages: index Gender -> one-hot encode -> assemble -> standardize.
# handleInvalid="keep" sends Gender values not seen at fit time to an extra
# index instead of failing at transform time, which matters now that the
# pipeline is fitted on the training split only.
indexer = StringIndexer(inputCol="Gender", outputCol="Gender_idx").setHandleInvalid("keep")
encoder = OneHotEncoder(inputCols=["Gender_idx"], outputCols=["Gender_ohe"])
assembler = VectorAssembler(inputCols=["Age", "EstimatedSalary", "Gender_ohe"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler])

# Split the RAW rows first and fit the preprocessing on the training split
# only. Fitting on the full dataset (as this cell previously did) lets the
# test rows influence the learned label mapping and scaling statistics, i.e.
# leaks test information into training.
raw_train, raw_test = df.randomSplit([0.8, 0.2], seed=SEED)
prep_model = pipeline.fit(raw_train)


def _to_model_input(frame):
    """Apply the train-fitted preprocessing and keep only features + label.

    The scaled vector is exposed under the name ``features`` so the
    classifiers can use it via ``featuresCol="features"``.
    """
    return (
        prep_model.transform(frame)
        .select("scaled_features", "Purchased")
        .withColumnRenamed("scaled_features", "features")
    )


# Full transformed dataset, kept under its original names for inspection.
# Its statistics now come from the training split only.
df_ready = prep_model.transform(df)
data = _to_model_input(df)

train_data = _to_model_input(raw_train)
test_data = _to_model_input(raw_test)

In [None]:
# Random forest on the "features" vector with "Purchased" as the label;
# seeded with SEED so repeated runs use the same random state.
rf = RandomForestClassifier(
    seed=SEED,
    labelCol="Purchased",
    featuresCol="features",
)
# Fit on the training split, then score the held-out split.
rf_model = rf.fit(train_data)
rf_preds = rf_model.transform(test_data)

In [None]:
# Gradient-boosted trees configured with the same columns and seed as the
# random forest so the two models are compared on equal footing.
gbt = GBTClassifier(
    seed=SEED,
    labelCol="Purchased",
    featuresCol="features",
)
# Fit on the training split, then score the held-out split.
gbt_model = gbt.fit(train_data)
gbt_preds = gbt_model.transform(test_data)

In [None]:
# Two metrics, both scored against the "Purchased" label:
#   - area under the ROC curve, read from the rawPrediction column
#   - accuracy, read from the prediction column
auc_eval = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    rawPredictionCol="rawPrediction",
    labelCol="Purchased",
)
acc_eval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="Purchased",
)

# Score each model's test-set predictions with both evaluators.
rf_auc, rf_acc = auc_eval.evaluate(rf_preds), acc_eval.evaluate(rf_preds)
gbt_auc, gbt_acc = auc_eval.evaluate(gbt_preds), acc_eval.evaluate(gbt_preds)

print("RF  - AUC:", rf_auc, " Accuracy:", rf_acc)
print("GBT - AUC:", gbt_auc, " Accuracy:", gbt_acc)

In [None]:
# Per-feature importance vector reported by the fitted random forest.
rf_importances = rf_model.featureImportances
print("RF feature importances:", rf_importances)

In [None]:
# Final deliverable: a CSV-style comparison table (model, AUC, accuracy) with
# both metrics formatted to four decimal places.
report_rows = (
    ("RandomForestClassifier", rf_auc, rf_acc),
    ("GBTClassifier", gbt_auc, gbt_acc),
)
print("=== 모델별 성능 비교 (산출물에 기록) ===")
print("model,auc,accuracy")
print("\n".join(f"{model},{auc:.4f},{acc:.4f}" for model, auc, acc in report_rows))

# Release the Spark session now that all results have been printed.
spark.stop()