Day1-5교시: 분류 베이스라인 구축 (Logistic Regression)
- Train/Test 분리 (seed 고정), 평가 지표: AUC(ROC), AUC(PR), Accuracy, Confusion Matrix
- 산출물: baseline 결과표 (templates/tuning_before_after.csv 참고)

In [None]:
import os
import sys
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.sql import SparkSession

# Fixed seed so the train/test split is reproducible between runs.
SEED = 42

# When the google.colab module is loaded the data root is /content;
# otherwise the dataset is resolved relative to the current working directory.
IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    BASE = "/content"
else:
    BASE = os.getcwd()
CSV_PATH = os.path.join(BASE, "TestData", "Social_Network_Ads.csv")

# getOrCreate() reuses an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Day1_Classification_Baseline")
    .getOrCreate()
)

In [None]:
df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(CSV_PATH)

indexer = StringIndexer(inputCol="Gender", outputCol="Gender_idx").setHandleInvalid("keep")
encoder = OneHotEncoder(inputCols=["Gender_idx"], outputCols=["Gender_ohe"])
assembler = VectorAssembler(inputCols=["Age", "EstimatedSalary", "Gender_ohe"], outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
pipeline = Pipeline(stages=[indexer, encoder, assembler, scaler])
df_ready = pipeline.fit(df).transform(df)
data = df_ready.select("scaled_features", "Purchased").withColumnRenamed("scaled_features", "features")

In [None]:
train_data, test_data = data.randomSplit([0.8, 0.2], seed=SEED)

In [None]:
# Baseline model: logistic regression with default hyperparameters, trained on
# the training split only.
lr = LogisticRegression(featuresCol="features", labelCol="Purchased")
lr_model = lr.fit(train_data)

# Score the held-out split. The result is consumed by several independent
# Spark actions further down (three evaluators and the confusion matrix), so
# cache it to avoid recomputing the whole lineage for each of them.
predictions = lr_model.transform(test_data).cache()

In [None]:
# Both binary evaluators read the same label and raw-score columns and differ
# only in the metric they compute.
_binary_eval_args = {"labelCol": "Purchased", "rawPredictionCol": "rawPrediction"}
auc_eval = BinaryClassificationEvaluator(metricName="areaUnderROC", **_binary_eval_args)
pr_eval = BinaryClassificationEvaluator(metricName="areaUnderPR", **_binary_eval_args)

# Accuracy is computed from the predicted label in the "prediction" column.
accuracy_eval = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Purchased",
    predictionCol="prediction",
)

# Evaluate all three metrics on the test-set predictions.
auc = auc_eval.evaluate(predictions)
pr_area = pr_eval.evaluate(predictions)
accuracy = accuracy_eval.evaluate(predictions)

In [None]:
# Human-readable summary of the test-set metrics, four decimals each.
report_lines = [
    "Baseline LR - Test metrics:",
    f"  AUC (ROC): {auc:.4f}",
    f"  AUC (PR):  {pr_area:.4f}",
    f"  Accuracy:  {accuracy:.4f}",
]
print("\n".join(report_lines))

In [None]:
# Confusion matrix as a long table: one row per (actual label, predicted label)
# pair that occurs in the test predictions, with its row count.
label_and_prediction = ["Purchased", "prediction"]
confusion_counts = predictions.groupBy(*label_and_prediction).count()
confusion_counts.orderBy(*label_and_prediction).show()

산출물: baseline 결과표에 위 지표 기록

In [None]:
# Print the baseline metrics as CSV rows (header + one row per metric) so they
# can be copied into the results table.
print("=== baseline 결과 (산출물에 기록) ===")
# Plain string: there is nothing to interpolate in the header row.
print("model,stage,metric_name,metric_value")
for metric_name, metric_value in (
    ("auc", auc),
    ("areaUnderPR", pr_area),
    ("accuracy", accuracy),
):
    print(f"LogisticRegression,baseline,{metric_name},{metric_value:.4f}")

# Release the Spark session now that all results have been reported.
spark.stop()