Day1-6교시: 교차검증 및 하이퍼파라미터 튜닝
- ParamGridBuilder, CrossValidator, BestModel 선택
- 산출물: 튜닝 전/후 성능 비교표, 최적 파라미터 기록

In [None]:
import os
import sys
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession

# Fixed seed shared by the train/test split and the cross-validator.
SEED = 42

# Google Colab registers the "google.colab" module; use that to pick the data root.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    BASE = "/content"
else:
    BASE = os.getcwd()

# The dataset is expected under <BASE>/TestData/.
CSV_PATH = os.path.join(BASE, "TestData", "Social_Network_Ads.csv")

# Reuse an already-running session if there is one, otherwise create it.
spark = SparkSession.builder.appName("Day1_CrossValidator").getOrCreate()

In [None]:
# Load the raw CSV; the header row supplies the column names and Spark infers
# the column types.
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(CSV_PATH)

# Split the raw rows BEFORE fitting any preprocessing stage. StringIndexer,
# OneHotEncoder and StandardScaler are estimators that learn from the rows they
# are fitted on; fitting them on the full dataset would let the held-out test
# rows influence the features used for training (data leakage).
raw_train, raw_test = df.randomSplit([0.8, 0.2], seed=SEED)

indexer = StringIndexer(inputCol="Gender", outputCol="Gender_idx").setHandleInvalid("keep")
encoder = OneHotEncoder(inputCols=["Gender_idx"], outputCols=["Gender_ohe"])
assembler = VectorAssembler(inputCols=["Age", "EstimatedSalary", "Gender_ohe"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler])

# Fit the preprocessing on the training split only, then apply that single
# fitted model to every split so all rows are encoded and scaled the same way.
prep_model = pipeline.fit(raw_train)


def _select_model_columns(frame):
    """Keep the scaled feature vector (renamed to ``features``) and the label."""
    return frame.select("scaled_features", "Purchased").withColumnRenamed("scaled_features", "features")


train_data = _select_model_columns(prep_model.transform(raw_train))
test_data = _select_model_columns(prep_model.transform(raw_test))

# Full dataset transformed with the train-fitted preprocessing; kept under the
# original names so anything that inspects `df_ready` / `data` still works.
df_ready = prep_model.transform(df)
data = _select_model_columns(df_ready)

In [None]:
# Estimator to tune: logistic regression on the "features" vector with
# "Purchased" as the label.
lr = LogisticRegression(labelCol="Purchased", featuresCol="features")

# Candidates are scored by area under the ROC curve of the raw predictions.
evaluator = BinaryClassificationEvaluator(
    metricName="areaUnderROC",
    labelCol="Purchased",
    rawPredictionCol="rawPrediction",
)

# 2 x 2 grid: two regularisation strengths times two elastic-net mixing values.
param_grid = (
    ParamGridBuilder()
    .addGrid(lr.regParam, [0.01, 0.1])
    .addGrid(lr.elasticNetParam, [0.0, 0.5])
    .build()
)

# 3-fold cross-validation over the grid, seeded so the fold assignment is repeatable.
cv = CrossValidator(
    numFolds=3,
    seed=SEED,
    evaluator=evaluator,
    estimatorParamMaps=param_grid,
    estimator=lr,
)

In [None]:
# Run the grid search with cross-validation on the training split.
cv_model = cv.fit(train_data)

# Score the selected model on the held-out test split using the same evaluator.
predictions = cv_model.transform(test_data)
auc_tuned = evaluator.evaluate(predictions)

# Keep a handle on the winning model so its parameters can be reported.
best_lr = cv_model.bestModel

In [None]:
# Report the selected hyperparameters and the test AUC, one item per line.
print(
    "Best model params:",
    f"  regParam: {best_lr.getRegParam()}",
    f"  elasticNetParam: {best_lr.getElasticNetParam()}",
    f"  Test AUC: {auc_tuned:.4f}",
    sep="\n",
)

In [None]:
# Summary lines intended to be copied into the course deliverable: the
# baseline is referenced from the earlier exercise, the tuned result is
# taken from this run.
print(
    "=== 튜닝 결과 (산출물에 기록) ===",
    "튜닝 전: baseline LR (040_MLlib_Classification_Baseline.py 결과 참고)",
    sep="\n",
)
print(f"튜닝 후: AUC={auc_tuned:.4f}, regParam={best_lr.getRegParam()}, elasticNetParam={best_lr.getElasticNetParam()}")

# Release the Spark session now that the results have been printed.
spark.stop()