Day2-2교시: 분류 모델 튜닝 및 성능 비교
- RF/GBT 튜닝, MLflow 비교
- 산출물: 최종 분류 모델 선정 보고서

In [None]:
import os
import sys
from pathlib import Path

import mlflow
import mlflow.spark
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql import SparkSession

# Environment detection: when the Colab runtime module is loaded, use /content
# as the base directory; otherwise use the current working directory.
IN_COLAB = "google.colab" in sys.modules
BASE = "/content" if IN_COLAB else os.getcwd()

# Input dataset and local MLflow store, both located under BASE.
CSV_PATH = os.path.join(BASE, "TestData", "Social_Network_Ads.csv")
MLFLOW_DIR = os.path.join(BASE, "mlruns_day2")

# Single seed shared by the train/test split, the estimators and cross-validation.
SEED = 42

# Build the file:// URI with pathlib instead of string concatenation:
# "file://" + path yields a malformed URI on Windows (file://C:\...) and leaves
# spaces / non-ASCII characters unescaped. Path.as_uri() handles both cases.
mlflow.set_tracking_uri(Path(os.path.abspath(MLFLOW_DIR)).as_uri())
mlflow.set_experiment("ncs_spark_day2_classification")

# Reuse an existing Spark session if one is already running, else create one.
spark = SparkSession.builder.appName("Day2_Classification_Tuning").getOrCreate()

In [None]:
# Load the raw CSV; the header row supplies column names and types are inferred.
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(CSV_PATH)

# Split BEFORE fitting any preprocessing so that the StringIndexer vocabulary and
# the StandardScaler statistics are learned from training rows only. Fitting the
# pipeline on the full dataset would leak test-set information into the features.
train_raw, test_raw = df.randomSplit([0.8, 0.2], seed=SEED)

# Gender -> index -> one-hot; handleInvalid="keep" lets labels that were not seen
# while fitting be transformed instead of raising.
indexer = StringIndexer(inputCol="Gender", outputCol="Gender_idx").setHandleInvalid("keep")
encoder = OneHotEncoder(inputCols=["Gender_idx"], outputCols=["Gender_ohe"])
assembler = VectorAssembler(inputCols=["Age", "EstimatedSalary", "Gender_ohe"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler])

# Preprocessing is fitted on the training split only.
prep_model = pipeline.fit(train_raw)


def _to_model_input(frame):
    """Apply the fitted preprocessing and keep only the feature vector and label."""
    return (
        prep_model.transform(frame)
        .select("scaled_features", "Purchased")
        .withColumnRenamed("scaled_features", "features")
    )


train_data = _to_model_input(train_raw)
test_data = _to_model_input(test_raw)

# Kept for any later cell that refers to these names: the full dataset
# transformed with the train-fitted preprocessing (lazy, not materialised here).
df_ready = prep_model.transform(df)
data = _to_model_input(df)

# Area under the ROC curve, computed from the rawPrediction column against the Purchased label.
evaluator = BinaryClassificationEvaluator(labelCol="Purchased", rawPredictionCol="rawPrediction", metricName="areaUnderROC")

In [None]:
# Random forest: 3-fold cross-validated grid search over numTrees x maxDepth.
rf = RandomForestClassifier(featuresCol="features", labelCol="Purchased", seed=SEED)
rf_grid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [20, 50])
    .addGrid(rf.maxDepth, [3, 5])
    .build()
)
rf_cv = CrossValidator(estimator=rf, estimatorParamMaps=rf_grid, evaluator=evaluator, numFolds=3, seed=SEED)

with mlflow.start_run(run_name="rf_tuned"):
    rf_cv_model = rf_cv.fit(train_data)
    rf_best = rf_cv_model.bestModel

    # Score the selected model on the held-out split.
    rf_preds = rf_cv_model.transform(test_data)
    rf_auc = evaluator.evaluate(rf_preds)

    mlflow.log_param("model", "RandomForestClassifier")
    # On a fitted tree-ensemble model `getNumTrees` is a property, so calling
    # `getNumTrees()` raises "TypeError: 'int' object is not callable".
    # Read the tuned Param value directly instead.
    mlflow.log_param("numTrees", str(rf_best.getOrDefault("numTrees")))
    mlflow.log_param("maxDepth", str(rf_best.getMaxDepth()))
    mlflow.log_metric("test_auc", rf_auc)
    mlflow.spark.log_model(rf_best, "model")

In [None]:
# Gradient-boosted trees: 3-fold cross-validated grid search over maxDepth x maxIter.
gbt = GBTClassifier(featuresCol="features", labelCol="Purchased", seed=SEED)
gbt_grid = (
    ParamGridBuilder()
    .addGrid(gbt.maxDepth, [3, 5])
    .addGrid(gbt.maxIter, [20, 50])
    .build()
)
gbt_cv = CrossValidator(
    estimator=gbt,
    estimatorParamMaps=gbt_grid,
    evaluator=evaluator,
    numFolds=3,
    seed=SEED,
)

with mlflow.start_run(run_name="gbt_tuned"):
    # Fit the grid search, then score the selected model on the held-out split.
    gbt_cv_model = gbt_cv.fit(train_data)
    gbt_preds = gbt_cv_model.transform(test_data)
    gbt_auc = evaluator.evaluate(gbt_preds)

    # Record the winning hyper-parameters, the test metric and the model itself.
    gbt_best = gbt_cv_model.bestModel
    mlflow.log_param("model", "GBTClassifier")
    mlflow.log_param("maxDepth", str(gbt_best.getMaxDepth()))
    mlflow.log_param("maxIter", str(gbt_best.getMaxIter()))
    mlflow.log_metric("test_auc", gbt_auc)
    mlflow.spark.log_model(gbt_best, "model")

In [None]:
# Print the held-out AUC of both tuned models, one per line.
for _label, _auc in (("RF tuned AUC:", rf_auc), ("GBT tuned AUC:", gbt_auc)):
    print(_label, _auc)

# Reminder of the deliverable: pick the higher-AUC model and document the rationale.
print("최종 분류 모델 선정: AUC가 높은 모델을 선택하고, templates/model_selection_justification.md에 근거 기록.")

# Shut down the Spark session now that all results have been reported.
spark.stop()