In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, sum as spark_sum
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, StandardScaler
from pyspark.ml import Pipeline

# 1. Spark 세션 생성
spark = SparkSession.builder.appName("MobDropValueRegression").getOrCreate()

# 2. 데이터 불러오기
mobs = spark.read.csv('../learning_spark_data/minecraft/Mobs.csv', header=True, inferSchema=True)
food = spark.read.csv('../learning_spark_data/minecraft/Food.csv', header=True, inferSchema=True)
mob_food = spark.read.csv('../learning_spark_data/minecraft/MobFoodDrops.csv', header=True, inferSchema=True)

In [2]:
# 필요없는 데이터 컬럼 drop
mobs = mobs.drop("behaviorTypes", "spawnBehavior", "debutDate", "minecraftVersion", "reproductiveRequirement")
food = food.drop("debutDate", "minecraftVersion")

In [3]:
# mobID → 몹 ID, foodID → 음식 ID로 연결

# mob_food + food 조인 (foodID 기준)
mob_food_value = mob_food.join(food, mob_food["foodID"] == food["ID"], how="left") \
                         .select(mob_food["mobID"], food["hunger"])

# 몹별 총 음식 드롭 가치 합산
mob_drop_value = mob_food_value.groupBy("mobID").agg(
    spark_sum("hunger").alias("totalDropValue")
)

mob_drop_value.show(10)

+-----+--------------+
|mobID|totalDropValue|
+-----+--------------+
|   53|             3|
|   78|             8|
|   34|             3|
|   28|             2|
|   76|             8|
|   26|             2|
|   22|             3|
|   52|             4|
|    6|             2|
|   54|             3|
+-----+--------------+
only showing top 10 rows



In [4]:
# NULL 처리 및 캐스팅
mobs = mobs.withColumn("healthPoints", col("healthPoints").cast("double"))
mobs = mobs.withColumn("maxDamage", when(col("maxDamage").isNull(), 0).otherwise(col("maxDamage")).cast("double"))
mobs = mobs.withColumn("difficultyScore", col("healthPoints") + col("maxDamage") * 2)

food = food.withColumn("hunger", col("hunger").cast("double"))  # 드롭 가치 = hunger

In [5]:
from pyspark.sql.functions import col, when, format_number

# mobs.ID ↔ mob_drop_value.mobID 연결
mob_efficiency = mobs.join(mob_drop_value, mobs["ID"] == mob_drop_value["mobID"], how="left")

# NULL 처리: 드롭 없으면 0
mob_efficiency = mob_efficiency.withColumn(
    "totalDropValue",
    when(col("totalDropValue").isNull(), 0).otherwise(col("totalDropValue"))
)

# 가성비 계산: 드롭 가치 / (사냥 난이도 + 1)
mob_efficiency = mob_efficiency.withColumn(
    "efficiencyScore",
    col("totalDropValue") / (col("difficultyScore") + 1)
)

# totalDropValue가 0인 행 제거
mob_efficiency = mob_efficiency.filter(col("totalDropValue") > 0)

# 소수점 둘째자리로 포맷 (출력용)
mob_efficiency = mob_efficiency.withColumn(
    "efficiencyScore", format_number("efficiencyScore", 2)
)

# 출력
mob_efficiency.select(
    "name", "healthPoints", "maxDamage", "difficultyScore", "totalDropValue", "efficiencyScore"
).orderBy(col("efficiencyScore").desc()).show(30, truncate=False)

+----------------+------------+---------+---------------+--------------+---------------+
|name            |healthPoints|maxDamage|difficultyScore|totalDropValue|efficiencyScore|
+----------------+------------+---------+---------------+--------------+---------------+
|salmon          |3.0         |0.0      |3.0            |2             |0.50           |
|cod             |3.0         |0.0      |3.0            |2             |0.50           |
|chicken         |4.0         |0.0      |4.0            |2             |0.40           |
|zombie_villager |20.0        |3.0      |26.0           |8             |0.30           |
|husk            |20.0        |3.0      |26.0           |8             |0.30           |
|zombie          |20.0        |3.0      |26.0           |8             |0.30           |
|mooshroom       |10.0        |0.0      |10.0           |3             |0.27           |
|pig             |10.0        |0.0      |10.0           |3             |0.27           |
|cow             |10.

In [6]:
from pyspark.sql.functions import col

mob_efficiency = mob_efficiency.withColumn("efficiencyScore", col("efficiencyScore").cast("double"))


In [7]:
# 5. 피처 엔지니어링: 필요 컬럼 선택
features = ["difficultyScore", "totalDropValue"]


In [8]:
stages = []

In [9]:
from pyspark.ml.feature import StandardScaler, VectorAssembler

num_assembler = VectorAssembler(inputCols=["difficultyScore", "totalDropValue"], outputCol= 'feature_vector')
stages += [num_assembler]

stages

[VectorAssembler_89c139c1558e]

In [10]:
train_df, test_df = mob_efficiency.randomSplit([0.8,0.2], seed=1)

In [11]:
from pyspark.ml import Pipeline
pipeline = Pipeline(stages=stages)
fitted_transform = pipeline.fit(train_df)
vtrain_df = fitted_transform.transform(train_df)
vtrain_df.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- healthPoints: double (nullable = true)
 |-- maxDamage: double (nullable = true)
 |-- difficultyScore: double (nullable = true)
 |-- mobID: integer (nullable = true)
 |-- totalDropValue: long (nullable = true)
 |-- efficiencyScore: double (nullable = true)
 |-- feature_vector: vector (nullable = true)



In [12]:
vtrain_df.select('feature_vector', 'efficiencyScore').show(2)

+--------------+---------------+
|feature_vector|efficiencyScore|
+--------------+---------------+
|     [4.0,2.0]|            0.4|
|    [16.0,2.0]|           0.12|
+--------------+---------------+
only showing top 2 rows



In [13]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=50, solver='normal', 
                 labelCol='efficiencyScore', featuresCol='feature_vector')

In [14]:
model = lr.fit(vtrain_df)

In [15]:
#테스트데이터도 변환
vtest_df = fitted_transform.transform(test_df)
#테스트데이터로 예측
pred = model.transform(vtest_df)

In [16]:
pred.select('efficiencyScore', 'prediction').show()

+---------------+-------------------+
|efficiencyScore|         prediction|
+---------------+-------------------+
|           0.09| 0.1717937842865192|
|           0.11|0.19880663086235328|
|           0.15|0.26358818147514457|
|           0.15|0.26358818147514457|
|           0.09| 0.1717937842865192|
+---------------+-------------------+



In [17]:
model.summary.r2, model.summary.rootMeanSquaredError

(0.6146803976545887, 0.09029380624055988)

In [None]:
spark.stop()