In [None]:
# Spark 기반 몹 드롭 가치 예측 모델 (선형 회귀 with 파이프라인 전체 구성)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum as spark_sum
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml import Pipeline

# 1. Spark 세션 생성
spark = SparkSession.builder.appName("MobDropValueRegression").getOrCreate()

# 2. 데이터 불러오기
mobs = spark.read.csv('../learning_spark_data/minecraft/Mobs.csv', header=True, inferSchema=True)
food = spark.read.csv('../learning_spark_data/minecraft/Food.csv', header=True, inferSchema=True)
mob_food = spark.read.csv('../learning_spark_data/minecraft/MobFoodDrops.csv', header=True, inferSchema=True)

# 3. 드롭 가치 계산: foodID 기준으로 hunger 합산
drop_value = mob_food.join(food.withColumnRenamed("ID", "foodID").select("foodID", "hunger"), on="foodID", how="left")
value_per_mob = drop_value.groupBy("mobID").agg(spark_sum("hunger").alias("totalDropValue"))

# 4. mobs와 join하여 전체 mob 정보에 드롭 가치 추가
mob_all = mobs.join(value_per_mob, mobs["ID"] == value_per_mob["mobID"], how="left")
mob_all = mob_all.withColumn("totalDropValue", when(col("totalDropValue").isNull(), 0).otherwise(col("totalDropValue")))

# 5. 피처 엔지니어링: 필요 컬럼 선택
features = ["healthPoints", "maxDamage"]

# 6. Null 값 처리 (0으로 대체)
for f in features:
    mob_all = mob_all.withColumn(f, when(col(f).isNull(), 0).otherwise(col(f)))

# 7. VectorAssembler + 표준화 (StandardScaler)
assembler = VectorAssembler(inputCols=features, outputCol="assembled")
scaler = StandardScaler(inputCol="assembled", outputCol="features")

# 8. 선형 회귀 모델 정의
lr = LinearRegression(featuresCol="features", labelCol="totalDropValue")

# 9. 파이프라인 구성
stages = [assembler, scaler, lr]
pipeline = Pipeline(stages=stages)

# 10. 데이터 분할 (train/test)
train_data, test_data = mob_all.randomSplit([0.8, 0.2], seed=42)

# 11. 파이프라인 학습 및 변환
fitted_transform = pipeline.fit(train_data)
vtrain_data = fitted_transform.transform(train_data)
vtrain_data.printSchema()

# 12. 예측 및 평가
predictions = fitted_transform.transform(test_data)
final_model = fitted_transform.stages[-1]  # LinearRegressionModel
summary = final_model.evaluate(predictions)

# 13. 결과 출력
print("\n📊 모델 성능 평가 (테스트 데이터 기준)")
print(f"- 회귀 계수 (slope): {final_model.coefficients}")
print(f"- 절편 (intercept): {final_model.intercept:.4f}")
print(f"- R^2 (설명력): {summary.r2:.4f}")
print(f"- RMSE (평균 제곱근 오차): {summary.rootMeanSquaredError:.4f}")

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum as spark_sum
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml import Pipeline

# 1. Spark 세션 생성
spark = SparkSession.builder.appName("MobDropValueRegression").getOrCreate()

# 2. 데이터 불러오기
mobs = spark.read.csv('../learning_spark_data/minecraft/Mobs.csv', header=True, inferSchema=True)
food = spark.read.csv('../learning_spark_data/minecraft/Food.csv', header=True, inferSchema=True)
mob_food = spark.read.csv('../learning_spark_data/minecraft/MobFoodDrops.csv', header=True, inferSchema=True)

In [1]:
spark.stop()

NameError: name 'spark' is not defined