Day2-6교시: 성능-비용-복잡도 트레이드오프 분석
- 모델 선택 기준: 학습 시간 vs 성능, 해석 가능성
- 산출물: 최종 모델 선정 근거표 (templates/model_selection_justification.md)

In [None]:
import os
import sys
import time

from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import SparkSession

# True when the google.colab package has already been imported into this process.
IN_COLAB = "google.colab" in sys.modules

# Base directory: /content on Colab, the current working directory otherwise.
if IN_COLAB:
    BASE = "/content"
else:
    BASE = os.getcwd()

# Input CSV, resolved relative to BASE.
CSV_PATH = os.path.join(BASE, "TestData", "Social_Network_Ads.csv")

# Seed shared by the random train/test split and the seeded estimators.
SEED = 42

# Reuse the active Spark session when one exists; otherwise create it.
spark = (
    SparkSession.builder
    .appName("Day2_Tradeoff")
    .getOrCreate()
)

In [None]:
# Load the CSV with a header row and let Spark infer the column types.
df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load(CSV_PATH)
)

# Split the RAW rows before fitting any preprocessing stage. Fitting the
# pipeline on the full dataset would let test rows influence the fitted
# stages (e.g. the scaler's standard deviation) and bias the test AUC.
train_raw, test_raw = df.randomSplit([0.8, 0.2], seed=SEED)

# Gender (string) -> numeric index -> one-hot vector. handleInvalid="keep"
# sends values the fitted indexer does not know to an extra index instead of
# raising, which matters now that the indexer only sees the training rows.
indexer = StringIndexer(inputCol="Gender", outputCol="Gender_idx").setHandleInvalid("keep")
encoder = OneHotEncoder(inputCols=["Gender_idx"], outputCols=["Gender_ohe"])
assembler = VectorAssembler(inputCols=["Age", "EstimatedSalary", "Gender_ohe"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
prep = Pipeline(stages=[indexer, encoder, assembler, scaler])

# Fit every preprocessing stage on the training split only.
prep_model = prep.fit(train_raw)


def _to_model_input(frame):
    """Apply the fitted preprocessing and keep only the feature vector and label.

    The scaled vector is renamed to "features" so the estimators can use a
    single, uniform features column name.
    """
    return (
        prep_model.transform(frame)
        .select("scaled_features", "Purchased")
        .withColumnRenamed("scaled_features", "features")
    )


# Kept so that any later cell still referring to these names keeps working.
# Both are now produced by the train-fitted pipeline and are lazy DataFrames,
# so defining them triggers no Spark job.
df_ready = prep_model.transform(df)
data = _to_model_input(df)

train_data = _to_model_input(train_raw)
test_data = _to_model_input(test_raw)

# Area under the ROC curve, computed from each model's rawPrediction column.
evaluator = BinaryClassificationEvaluator(labelCol="Purchased", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

In [None]:
# (model name, test AUC) pairs, in the order the candidates are trained.
results = []
# Wall-clock seconds spent in fit() per model name. This is the "training
# time" side of the time-vs-performance trade-off this notebook analyses.
train_seconds = {}

candidates = [
    ("LR", LogisticRegression(featuresCol="features", labelCol="Purchased")),
    ("RF", RandomForestClassifier(featuresCol="features", labelCol="Purchased", seed=SEED)),
    ("GBT", GBTClassifier(featuresCol="features", labelCol="Purchased", seed=SEED)),
]

for name, est in candidates:
    # Time only the fit() call; scoring the test split is excluded.
    # NOTE(review): the first candidate may also absorb one-off start-up
    # cost, so rerun the cell before comparing timings seriously.
    fit_started = time.perf_counter()
    model = est.fit(train_data)
    train_seconds[name] = time.perf_counter() - fit_started

    preds = model.transform(test_data)
    auc = evaluator.evaluate(preds)
    results.append((name, auc))
    print(f"{name}: AUC={auc:.4f}, train_time={train_seconds[name]:.2f}s")

In [None]:
# Interpretability note for each model, in the order the rows are printed.
interpretability_notes = {
    "LR": "높음(계수)",
    "RF": "중간(feature importance)",
    "GBT": "중간(feature importance)",
}

# Look AUC up by model name instead of by list position, so the table stays
# correct if the training order changes. Only the first two fields of each
# result row are read.
auc_by_model = {row[0]: row[1] for row in results}

print("=== 의사결정 테이블 (산출물: model_selection_justification.md) ===")
print("model,auc,해석가능성")
for model_name, note in interpretability_notes.items():
    # Build each row as one string: multi-argument print() would insert
    # spaces around the AUC and break the comma-separated format of the header.
    print(f"{model_name},{auc_by_model[model_name]},{note}")

# Release the Spark session now that the table has been emitted.
spark.stop()