Day2-4교시: 회귀 개선 (RandomForestRegressor / GBTRegressor)
- 산출물: baseline vs 개선 모델 비교

In [None]:
import os
import sys
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession

# Seed shared by the train/test split and the tree-based regressors below.
SEED = 42

# Treat the session as Google Colab when the `google.colab` module is loaded;
# in that case data lives under /content, otherwise under the working directory.
if "google.colab" in sys.modules:
    IN_COLAB = True
    BASE = "/content"
else:
    IN_COLAB = False
    BASE = os.getcwd()

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("Day2_Regression_RF_GBT")
    .getOrCreate()
)

In [None]:
# Load the regression dataset. The preferred source is scikit-learn's
# California housing data; if anything on that path fails (missing package,
# download error, pandas -> Spark conversion error) fall back to the local CSV.
try:
    from sklearn.datasets import fetch_california_housing
    import pandas as pd
    housing = fetch_california_housing()
    pdf = pd.DataFrame(housing.data, columns=housing.feature_names)
    pdf["PRICE"] = housing.target
    spark_df = spark.createDataFrame(pdf)
except Exception as exc:
    csv_path = os.path.join(BASE, "TestData", "retail_sales_dataset.csv")
    # The fallback is deliberately best-effort, but it must not be silent:
    # the reported metrics depend on which dataset was actually used.
    print(
        f"[warn] California housing load failed ({type(exc).__name__}: {exc}); "
        f"falling back to {csv_path}",
        file=sys.stderr,
    )
    spark_df = (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csv_path)
    )
    if "PRICE" not in spark_df.columns and "target" in spark_df.columns:
        spark_df = spark_df.withColumnRenamed("target", "PRICE")

# Fail early with a readable message instead of an obscure Spark error later.
if "PRICE" not in spark_df.columns:
    raise ValueError(
        f"Label column 'PRICE' not found; available columns: {spark_df.columns}"
    )

# Features are every non-label column whose Spark type is double or int.
feature_cols = [
    c
    for c in spark_df.columns
    if c != "PRICE"
    and spark_df.schema[c].dataType.simpleString() in ("double", "int")
]
if not feature_cols:
    raise ValueError("No double/int feature columns found besides 'PRICE'.")

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
scaler = StandardScaler(inputCol="features", outputCol="scaled_features")
prep = Pipeline(stages=[assembler, scaler])

# Split BEFORE fitting the preprocessing pipeline. Fitting the scaler on the
# full dataset would let test-set statistics leak into the features used for
# training, which biases the held-out metrics.
train_raw, test_raw = spark_df.randomSplit([0.8, 0.2], seed=SEED)
prep_model = prep.fit(train_raw)


def _to_model_input(df):
    """Apply the train-fitted pipeline and keep only the columns the models use.

    Returns a DataFrame with the scaled vector exposed as ``features`` plus
    the ``PRICE`` label.
    """
    return (
        prep_model.transform(df)
        .select("scaled_features", "PRICE")
        .withColumnRenamed("scaled_features", "features")
    )


train_data = _to_model_input(train_raw)
test_data = _to_model_input(test_raw)

# Whole-dataset views kept under their original names for any cell that still
# references them. These are lazy transformations, so defining them runs no job.
df_ready = prep_model.transform(spark_df)
data = _to_model_input(spark_df)

# Shared evaluators: every model is scored on R^2 and RMSE against PRICE.
r2_eval = RegressionEvaluator(labelCol="PRICE", predictionCol="prediction", metricName="r2")
rmse_eval = RegressionEvaluator(labelCol="PRICE", predictionCol="prediction", metricName="rmse")

In [None]:
# Baseline model: LinearRegression with its default hyperparameters.
lr = LinearRegression(labelCol="PRICE", featuresCol="features")
lr_model = lr.fit(train_data)

# Predict on the held-out split and score it with both shared evaluators.
lr_preds = lr_model.transform(test_data)
lr_rmse, lr_r2 = (
    evaluator.evaluate(lr_preds) for evaluator in (rmse_eval, r2_eval)
)

In [None]:
# Improvement candidate 1: random forest with default hyperparameters;
# only the seed is pinned so repeated runs use the same randomness.
rf = RandomForestRegressor(
    labelCol="PRICE",
    featuresCol="features",
    seed=SEED,
)
rf_model = rf.fit(train_data)

# Predict on the held-out split and score it with both shared evaluators.
rf_preds = rf_model.transform(test_data)
rf_rmse, rf_r2 = (
    evaluator.evaluate(rf_preds) for evaluator in (rmse_eval, r2_eval)
)

In [None]:
# Improvement candidate 2: gradient-boosted trees with default hyperparameters;
# only the seed is pinned so repeated runs use the same randomness.
gbt = GBTRegressor(
    labelCol="PRICE",
    featuresCol="features",
    seed=SEED,
)
gbt_model = gbt.fit(train_data)

# Predict on the held-out split and score it with both shared evaluators.
gbt_preds = gbt_model.transform(test_data)
gbt_rmse, gbt_r2 = (
    evaluator.evaluate(gbt_preds) for evaluator in (rmse_eval, r2_eval)
)

In [None]:
# Emit the comparison as CSV-style text: a banner, a header row, then one row
# per model with RMSE and R^2 rounded to four decimals.
print("=== baseline vs 개선 (산출물에 기록) ===")
print("model,rmse,r2")
print(
    "\n".join(
        f"{model_name},{rmse:.4f},{r2:.4f}"
        for model_name, rmse, r2 in (
            ("LinearRegression", lr_rmse, lr_r2),
            ("RandomForestRegressor", rf_rmse, rf_r2),
            ("GBTRegressor", gbt_rmse, gbt_r2),
        )
    )
)

# Shut the Spark session down now that all metrics have been reported.
spark.stop()