# Decision Tree, Random Forest Classifier, Gradient Boost

Titanic 데이터셋의을 이용하여 분류 모델을 작성합니다.

---

**컬럼 설명**

    1. **`PassengerId`**: 승객 고유 식별할 수 있는 ID 번호.
    2. **`Survived`**: 0 - 사망, 1 - 생존
    3. **`Pclass`**:  승객의 좌석 등급(Class)
    4. **`Name`**: 승객 이름
    5. **`Sex`**: 성별
    6. **`Age`**: 승객의 나이
    7. **`SibSp`**: 함께 탑승한 **형제자매(Siblings)** 및 **배우자(Spouse)**의 수
    8. **`Parch`**: 함께 탑승한 **부모(Parents)** 및 **자녀(Children)**의 수
    9. **`Ticket`**:  티켓 고유 번호
    10. **`Fare`**: 운임 요금
    11. **`Cabin`**: 객실 번호.
    12. **`Embarked`**: 탑승 항구

In [1]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("SparkML").config("spark.sql.repl.eagerEval.enabled", True).getOrCreate()

In [3]:
from google.colab import drive

# Google Drive 마운트
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import VectorAssembler

In [6]:
df = spark.read.csv("/content/drive/Othercomputers/내 컴퓨터/BigDataDatasets/titanic.csv", header="true", inferSchema="true")
df.limit(5)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
1,0,3,"Braund, Mr. Owen ...",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. Joh...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. ...",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Ja...",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. Willia...",male,35.0,0,0,373450,8.05,,S


### Simple EDA

In [7]:
# df 의 전체 행(row) 개수
df.count()

891

In [8]:
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [9]:
## 불필요한 column 삭제
df = df.drop("PassengerId", "Name", "Ticket", "Cabin")
df.show(5)

+--------+------+------+----+-----+-----+-------+--------+
|Survived|Pclass|   Sex| Age|SibSp|Parch|   Fare|Embarked|
+--------+------+------+----+-----+-----+-------+--------+
|       0|     3|  male|22.0|    1|    0|   7.25|       S|
|       1|     1|female|38.0|    1|    0|71.2833|       C|
|       1|     3|female|26.0|    0|    0|  7.925|       S|
|       1|     1|female|35.0|    1|    0|   53.1|       S|
|       0|     3|  male|35.0|    0|    0|   8.05|       S|
+--------+------+------+----+-----+-----+-------+--------+
only showing top 5 rows



### Missing Value 처리

In [10]:
df.describe()

summary,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
count,891.0,891.0,891,714.0,891.0,891.0,891.0,889
mean,0.3838383838383838,2.308641975308642,,29.69911764705882,0.5230078563411896,0.3815937149270482,32.2042079685746,
stddev,0.4865924542648575,0.8360712409770491,,14.526497332334037,1.1027434322934315,0.8060572211299488,49.69342859718089,
min,0.0,1.0,female,0.42,0.0,0.0,0.0,C
max,1.0,3.0,male,80.0,8.0,6.0,512.3292,S


In [12]:
from pyspark.sql.functions import count

# 데이터프레임에서 각 컬럼의 데이터 개수를 계산하여 출력
# - count(c): 각 컬럼(c)에 대해 데이터 개수를 계산 (결측값은 제외)
# df.select([...])는 데이터프레임의 특정 컬럼 또는 계산 결과를 선택
df.select([count(c) for c in df.columns])

count(Survived),count(Pclass),count(Sex),count(Age),count(SibSp),count(Parch),count(Fare),count(Embarked)
891,891,891,714,891,891,891,889


In [15]:
from pyspark.sql.functions import col, count, when

# 각 컬럼의 결측값(null) 개수를 계산
# 전체 row 수df.cout() 와 column의 데이터 개수 차이
null_counts = df.select([
    (df.count() - count(c)).alias(f"null_{c}") for c in df.columns
])

null_counts

null_Survived,null_Pclass,null_Sex,null_Age,null_SibSp,null_Parch,null_Fare,null_Embarked
0,0,0,177,0,0,0,2


`Age`는 결측값이 많으므로 평균으로 채워넣고 `Embarked`는 2 개 뿐이므로 drop 시킵니다.

In [13]:
from pyspark.ml.feature import Imputer

# 결측값(null)을 평균으로 대체하는 Imputer 생성
# - setStrategy('mean'): 결측값을 대체하는 전략을 'mean'(평균)으로 설정
imputer = Imputer(inputCols=['Age'], outputCols=['Age']).setStrategy('mean')

# Imputer 모델을 학습하고 데이터프레임(df)에 적용하여 결측값을 대체
df_cleaned = imputer.fit(df).transform(df)

df_cleaned.limit(5)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
1,3,female,26.0,0,0,7.925,S
1,1,female,35.0,1,0,53.1,S
0,3,male,35.0,0,0,8.05,S


In [16]:
# Embarked는 2개 밖에 안되므로 drop
# - na.drop(how='any'): 결측값이 하나라도 있는 행을 제거
df_cleaned = df_cleaned.na.drop(how='any')
df_cleaned.limit(5)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
1,3,female,26.0,0,0,7.925,S
1,1,female,35.0,1,0,53.1,S
0,3,male,35.0,0,0,8.05,S


In [20]:
# df_cleaned에서 결측치가 모두 제거되 것을 확인
df_cleaned.select([
    (df_cleaned.count() - count(c)).alias(f"null_{c}") for c in df_cleaned.columns
])

null_Survived,null_Pclass,null_Sex,null_Age,null_SibSp,null_Parch,null_Fare,null_Embarked
0,0,0,0,0,0,0,0


In [23]:
# 주요 특성별로 데이터의 분포 분석
df_cleaned.groupBy("Survived").count().show()
df_cleaned.groupBy("Pclass").count().show()
df_cleaned.groupBy("Sex").count().show()
df_cleaned.groupBy("Embarked").count().show()

+--------+-----+
|Survived|count|
+--------+-----+
|       1|  340|
|       0|  549|
+--------+-----+

+------+-----+
|Pclass|count|
+------+-----+
|     1|  214|
|     3|  491|
|     2|  184|
+------+-----+

+------+-----+
|   Sex|count|
+------+-----+
|female|  312|
|  male|  577|
+------+-----+

+--------+-----+
|Embarked|count|
+--------+-----+
|       Q|   77|
|       C|  168|
|       S|  644|
+--------+-----+



### 범주형 컬럼의 One Hot Encoding

In [24]:
## 범주형(feature) 데이터 변환
from pyspark.ml.feature import StringIndexer

# - inputCols: 인덱싱할 입력 컬럼들의 리스트 (여기서는 'Pclass', 'Sex', 'Embarked')
# - outputCols: 변환된 인덱스를 저장할 출력 컬럼들의 리스트 (여기서는 'Pclass_', 'Sex_', 'Embarked_')
# - fit: 데이터프레임(df_cleaned)을 기반으로 StringIndexer 모델을 학습

indexer = StringIndexer(
    inputCols=['Pclass', 'Sex', 'Embarked'],
    outputCols=['Pclass_', 'Sex_', 'Embarked_']
).fit(df_cleaned)

# 학습된 StringIndexer 모델을 사용하여 데이터프레임(df_cleaned)에 변환을 적용
# 변환된 데이터는 새로운 컬럼('Pclass_', 'Sex_', 'Embarked_')이 추가된 데이터프레임(df_r)에 저장됩니다.
df_r = indexer.transform(df_cleaned)
df_r.limit(5)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Pclass_,Sex_,Embarked_
0,3,male,22.0,1,0,7.25,S,0.0,0.0,0.0
1,1,female,38.0,1,0,71.2833,C,1.0,1.0,1.0
1,3,female,26.0,0,0,7.925,S,0.0,1.0,0.0
1,1,female,35.0,1,0,53.1,S,1.0,1.0,0.0
0,3,male,35.0,0,0,8.05,S,0.0,0.0,0.0


In [25]:
## 원-핫 인코딩(OneHot Encoding)
from pyspark.ml.feature import OneHotEncoder

# - inputCols: 원-핫 인코딩을 적용할 입력 컬럼들의 리스트 (여기서는 'Pclass_', 'Sex_', 'Embarked_').
# - outputCols: 변환된 원-핫 벡터를 저장할 출력 컬럼들의 리스트 (여기서는 'Pclass_ohe', 'Sex_ohe', 'Embarked_ohe').
ohe = OneHotEncoder(
    inputCols=['Pclass_', 'Sex_', 'Embarked_'],
    outputCols=['Pclass_ohe', 'Sex_ohe', 'Embarked_ohe']
).fit(df_r)

# 학습된 OneHotEncoder 모델을 사용하여 데이터프레임(df_r)에 변환을 적용
# 변환된 데이터는 새로운 컬럼('Pclass_ohe', 'Sex_ohe', 'Embarked_ohe')이 추가된 데이터프레임(df_ohe)에 저장됩니다.
df_ohe = ohe.transform(df_r)
df_ohe.limit(5)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Pclass_,Sex_,Embarked_,Pclass_ohe,Sex_ohe,Embarked_ohe
0,3,male,22.0,1,0,7.25,S,0.0,0.0,0.0,"(2,[0],[1.0])","(1,[0],[1.0])","(2,[0],[1.0])"
1,1,female,38.0,1,0,71.2833,C,1.0,1.0,1.0,"(2,[1],[1.0])","(1,[],[])","(2,[1],[1.0])"
1,3,female,26.0,0,0,7.925,S,0.0,1.0,0.0,"(2,[0],[1.0])","(1,[],[])","(2,[0],[1.0])"
1,1,female,35.0,1,0,53.1,S,1.0,1.0,0.0,"(2,[1],[1.0])","(1,[],[])","(2,[0],[1.0])"
0,3,male,35.0,0,0,8.05,S,0.0,0.0,0.0,"(2,[0],[1.0])","(1,[0],[1.0])","(2,[0],[1.0])"


In [26]:
## 벡터 특징(Vector Feature) 생성
from pyspark.ml.feature import VectorAssembler

#  여러 개의 입력 컬럼을 하나의 벡터로 결합하여 머신러닝 모델에서 사용할 수 있는 형식으로 변환
# - inputCols: 벡터로 결합할 입력 컬럼들의 리스트
# - outputCol: 결합된 벡터를 저장할 출력 컬럼의 이름
assembler = VectorAssembler(
    inputCols=["Age", "SibSp", "Parch", "Fare", "Pclass_ohe", "Sex_ohe", "Embarked_ohe"],
    outputCol="features"
)

# 학습된 VectorAssembler를 사용하여 데이터프레임(df_ohe)에 변환을 적용
output = assembler.transform(df_ohe)
output.limit(5)

Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Pclass_,Sex_,Embarked_,Pclass_ohe,Sex_ohe,Embarked_ohe,features
0,3,male,22.0,1,0,7.25,S,0.0,0.0,0.0,"(2,[0],[1.0])","(1,[0],[1.0])","(2,[0],[1.0])","[22.0,1.0,0.0,7.2..."
1,1,female,38.0,1,0,71.2833,C,1.0,1.0,1.0,"(2,[1],[1.0])","(1,[],[])","(2,[1],[1.0])","[38.0,1.0,0.0,71...."
1,3,female,26.0,0,0,7.925,S,0.0,1.0,0.0,"(2,[0],[1.0])","(1,[],[])","(2,[0],[1.0])","(9,[0,3,4,7],[26...."
1,1,female,35.0,1,0,53.1,S,1.0,1.0,0.0,"(2,[1],[1.0])","(1,[],[])","(2,[0],[1.0])","[35.0,1.0,0.0,53...."
0,3,male,35.0,0,0,8.05,S,0.0,0.0,0.0,"(2,[0],[1.0])","(1,[0],[1.0])","(2,[0],[1.0])","[35.0,0.0,0.0,8.0..."


In [27]:
output.first()

Row(Survived=0, Pclass=3, Sex='male', Age=22.0, SibSp=1, Parch=0, Fare=7.25, Embarked='S', Pclass_=0.0, Sex_=0.0, Embarked_=0.0, Pclass_ohe=SparseVector(2, {0: 1.0}), Sex_ohe=SparseVector(1, {0: 1.0}), Embarked_ohe=SparseVector(2, {0: 1.0}), features=DenseVector([22.0, 1.0, 0.0, 7.25, 1.0, 0.0, 1.0, 1.0, 0.0]))

이제 모든 열이 단일 기능 벡터로 변환되었으므로 Feature Scaling을 합니다.

In [28]:
## 특징 스케일링(Feature Scaling)
from pyspark.ml.feature import StandardScaler

# 데이터를 표준화(Standardization)하여 평균이 0이고 분산이 1이 되도록 변환합니다.
scaler = StandardScaler(inputCol='features', outputCol='standardized')

data_scaled = scaler.fit(output).transform(output)
data_scaled.show(truncate=False)

+--------+------+------+-----------------+-----+-----+-------+--------+-------+----+---------+-------------+-------------+-------------+------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|Survived|Pclass|Sex   |Age              |SibSp|Parch|Fare   |Embarked|Pclass_|Sex_|Embarked_|Pclass_ohe   |Sex_ohe      |Embarked_ohe |features                                        |standardized                                                                                                                                 |
+--------+------+------+-----------------+-----+-----+-------+--------+-------+----+---------+-------------+-------------+-------------+------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------+
|0       |3     

In [30]:
data = data_scaled.select("standardized", "Survived")
data

standardized,Survived
[1.69643573256440...,0
[2.93020717442942...,1
"(9,[0,3,4,7],[2.0...",1
[2.69887502907973...,1
[2.69887502907973...,0
"(9,[0,3,4,6],[2.2...",0
[4.16397861629444...,0
[0.15422143023312...,0
[2.08198930814722...,1
"(9,[0,1,3,8],[1.0...",1


In [32]:
## train/test split
train_data, test_data = data.randomSplit([0.75, 0.25])
train_data.count(), test_data.count()

(658, 231)

In [33]:
## 머신러닝 모델 작성 및 학습
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

# 1. 의사결정나무(Decision Tree) 분류 모델
# - maxDepth: 의사결정나무의 최대 깊이
dt = DecisionTreeClassifier(
    featuresCol="standardized",
    labelCol="Survived",
    maxDepth=5,
    seed=42
).fit(train_data)

# 2. 랜덤 포레스트(Random Forest) 분류 모델
# - numTrees: 랜덤 포레스트에서 사용할 의사결정나무의 개수 (여기서는 100으로 설정).
# - maxDepth: 각 의사결정나무의 최대 깊이
rf = RandomForestClassifier(
    featuresCol="standardized",
    labelCol="Survived",
    numTrees=100,
    maxDepth=5,
    seed=42
).fit(train_data)

# 3. 그레디언트 부스팅 트리(Gradient-Boosted Tree) 분류 모델
gb = GBTClassifier(
    featuresCol="standardized",
    labelCol="Survived",
    seed=42
).fit(train_data)

In [34]:
# feature importance -  희소 벡터 형식 출력
# 희소 벡터는 (전체 특징 개수, [활성화된 인덱스], [각 특징의 중요도])로 구성
print("DT", dt.featureImportances)
print("RF", rf.featureImportances)
print("GB", gb.featureImportances)

DT (9,[0,1,2,3,4,5,6,7],[0.12454969107433443,0.06606932774185169,0.016448509719065137,0.053800521805894785,0.14387491143627096,0.04992368108130126,0.530474073165098,0.014859283976183708])
RF (9,[0,1,2,3,4,5,6,7,8],[0.11824145521968703,0.05469757753037316,0.03320133173345825,0.12533622018950302,0.11018956316708349,0.06303020824982218,0.46079953109814614,0.022774213042715534,0.01172989976921133])
GB (9,[0,1,2,3,4,5,6,7,8],[0.32219084903162115,0.06231933241224164,0.031854524785253725,0.2737107489751033,0.06636352940390587,0.031147178639385136,0.17691718438781712,0.01610782111276195,0.01938883125191011])


In [36]:
## test dataset을 이용한 예측
pred_dt = dt.transform(test_data)
pred_rf = rf.transform(test_data)
pred_gb = gb.transform(test_data)
pred_dt.show(5)
pred_rf.show(5)
pred_gb.show(5)

+--------------------+--------+-------------+--------------------+----------+
|        standardized|Survived|rawPrediction|         probability|prediction|
+--------------------+--------+-------------+--------------------+----------+
|(9,[0,1,3,4],[2.2...|       1|  [10.0,30.0]|         [0.25,0.75]|       1.0|
|(9,[0,1,3,7],[1.8...|       1|  [8.0,110.0]|[0.06779661016949...|       1.0|
|(9,[0,1,3,7],[2.1...|       1|  [8.0,110.0]|[0.06779661016949...|       1.0|
|(9,[0,1,3,7],[2.2...|       1|  [8.0,110.0]|[0.06779661016949...|       1.0|
|(9,[0,2,3,7],[1.0...|       1|  [8.0,110.0]|[0.06779661016949...|       1.0|
+--------------------+--------+-------------+--------------------+----------+
only showing top 5 rows

+--------------------+--------+--------------------+--------------------+----------+
|        standardized|Survived|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|(9,[0,1,3,4],[2.2

### 각 모델의 성능 측정

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# 1. 의사결정나무(Decision Tree) 모델의 AUC 점수 계산
dt_auc = BinaryClassificationEvaluator(labelCol="Survived").evaluate(pred_dt)

# 2. 랜덤 포레스트(Random Forest) 모델의 AUC 점수 계산
rf_auc = BinaryClassificationEvaluator(labelCol="Survived").evaluate(pred_rf)

# 3. 그레디언트 부스팅 트리(Gradient-Boosted Tree) 모델의 AUC 점수 계산
gb_auc = BinaryClassificationEvaluator(labelCol="Survived").evaluate(pred_gb)

# 계산된 AUC 점수를 출력
dt_auc, rf_auc, gb_auc

(0.7330337235228538, 0.8783444816053512, 0.8715510033444811)