In [1]:
# Start the Spark session
from pyspark.sql import SparkSession

# Single memory setting applied to both the driver and the executors below
MAX_MEMORY = '8g'

spark = (
    SparkSession.builder
    .appName("accident-severity-prediction")
    .config('spark.driver.memory', MAX_MEMORY)
    .config('spark.executor.memory', MAX_MEMORY)
    .getOrCreate()
)

In [2]:
# Load the accident and vehicle CSV files into Spark DataFrames
import os

cwd = os.getcwd()


def _local_file_uri(path):
    """Return a ``file:///`` URI string for a local filesystem path.

    The OS-specific separator is replaced with forward slashes so the
    resulting string has the same shape on every platform.
    """
    return f"file:///{path.replace(os.sep, '/')}"


# Both files live under <cwd>/learning_spark_data/accident/
a_data_path = os.path.join(cwd, 'learning_spark_data', 'accident', 'Accident_Information.csv')
v_data_path = os.path.join(cwd, 'learning_spark_data', 'accident', 'Vehicle_Information.csv')

a_file_path = _local_file_uri(a_data_path)
v_file_path = _local_file_uri(v_data_path)

accident_df = spark.read.csv(a_file_path, inferSchema=True, header=True)
vehicle_df = spark.read.csv(v_file_path, inferSchema=True, header=True)

# printSchema() prints the schema and returns None. Calling each one as its
# own statement keeps the cell from also displaying a useless (None, None) tuple.
accident_df.printSchema()
vehicle_df.printSchema()

root
 |-- Accident_Index: string (nullable = true)
 |-- 1st_Road_Class: string (nullable = true)
 |-- 1st_Road_Number: string (nullable = true)
 |-- 2nd_Road_Class: string (nullable = true)
 |-- 2nd_Road_Number: string (nullable = true)
 |-- Accident_Severity: string (nullable = true)
 |-- Carriageway_Hazards: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Day_of_Week: string (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: string (nullable = true)
 |-- Junction_Control: string (nullable = true)
 |-- Junction_Detail: string (nullable = true)
 |-- Latitude: string (nullable = true)
 |-- Light_Conditions: string (nullable = true)
 |-- Local_Authority_(District): string (nullable = true)
 |-- Local_Authority_(Highway): string (nullable = true)
 |-- Location_Easting_OSGR: string (nullable = true)
 |-- Location_Northing_OSGR: string (nullable = true)
 |-- Longitude: string (nullable = true)
 |-- LSOA_of_Accident_Location: string (nullable = true)
 |-- Num

(None, None)

In [8]:
# Unify the column name: drop the ".CC." suffix from the engine-capacity column
vehicle_df = vehicle_df.withColumnRenamed(
    existing="Engine_Capacity_.CC.",
    new="Engine_Capacity",
)

In [9]:
# Choose the columns to keep from each table, then join the two tables

# Columns taken from the accident table
accident_cols = [
    "Accident_Index",
    "1st_Road_Class",
    "Date",
    "Time",
    "Day_of_Week",
    "Accident_Severity",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Road_Type",
    "Speed_limit",
    "Urban_or_Rural_Area",
    "Light_Conditions",
    "Weather_Conditions",
]

# Columns taken from the vehicle table
vehicle_cols = [
    "Accident_Index",
    "Age_Band_of_Driver",
    "Sex_of_Driver",
    "Vehicle_Type",
    "Age_of_Vehicle",
    "Engine_Capacity",
    "Driver_Home_Area_Type",
]

# Project each table down to the chosen columns
accident_df_sel = accident_df.select(accident_cols)
vehicle_df_sel = vehicle_df.select(vehicle_cols)

# Inner join on the shared accident key
join_df = accident_df_sel.join(vehicle_df_sel, "Accident_Index", "inner")


In [10]:
# Cast numeric columns, impute missing values, and derive the label and
# simplified categorical features
from pyspark.sql.functions import col, hour, mean, when

# 1. Columns to convert to a numeric type
cols_to_cast = ["Speed_limit", "Age_of_Vehicle", "Engine_Capacity"]

# 2. Cast each of them to double
for numeric_col in cols_to_cast:
    join_df = join_df.withColumn(numeric_col, col(numeric_col).cast("double"))

# 3. Compute the mean of every numeric column in one aggregation
mean_exprs = [mean(name).alias(name) for name in cols_to_cast]
mean_row = join_df.select(mean_exprs).collect()[0]
mean_values = mean_row.asDict()

# 4. Replace missing values with the column mean
join_df = join_df.na.fill(mean_values)

# 5-6. Binary severity label: 'Fatal' / 'Serious' -> dangerous (1), everything else -> safe (0)
is_severe = col("Accident_Severity").isin("Fatal", "Serious")
label_expr = when(is_severe, 1).otherwise(0)

# 7. Time-of-day band built from the hour of the "Time" column
hour_col = col("Hour")
time_band_expr = (
    when((hour_col >= 7) & (hour_col < 10), "Morning_Rush")
    .when((hour_col >= 17) & (hour_col < 20), "Evening_Rush")
    .when((hour_col >= 0) & (hour_col < 6), "Late_Night")
    .otherwise("Other")
)

# 8. Lighting (Light_Conditions -> Light_Simple): "Day" when the text contains "Daylight"
light_expr = when(col("Light_Conditions").contains("Daylight"), "Day").otherwise("Night")

# 9. Weather (Weather_Conditions -> Weather_Simple): "Clear" when the text contains "Fine"
weather_expr = when(col("Weather_Conditions").contains("Fine"), "Clear").otherwise("Bad")

# Append the derived columns in the same order as before:
# label, Hour, Time_Band, Light_Simple, Weather_Simple
join_df = (
    join_df
    .withColumn("label", label_expr)
    .withColumn("Hour", hour("Time"))
    .withColumn("Time_Band", time_band_expr)
    .withColumn("Light_Simple", light_expr)
    .withColumn("Weather_Simple", weather_expr)
)

In [11]:
# Print the schema of the joined DataFrame to check the cast and derived columns
join_df.printSchema()

root
 |-- Accident_Index: string (nullable = true)
 |-- 1st_Road_Class: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Time: timestamp (nullable = true)
 |-- Day_of_Week: string (nullable = true)
 |-- Accident_Severity: string (nullable = true)
 |-- Did_Police_Officer_Attend_Scene_of_Accident: string (nullable = true)
 |-- Road_Type: string (nullable = true)
 |-- Speed_limit: double (nullable = false)
 |-- Urban_or_Rural_Area: string (nullable = true)
 |-- Light_Conditions: string (nullable = true)
 |-- Weather_Conditions: string (nullable = true)
 |-- Age_Band_of_Driver: string (nullable = true)
 |-- Sex_of_Driver: string (nullable = true)
 |-- Vehicle_Type: string (nullable = true)
 |-- Age_of_Vehicle: double (nullable = false)
 |-- Engine_Capacity: double (nullable = false)
 |-- Driver_Home_Area_Type: string (nullable = true)
 |-- label: integer (nullable = false)
 |-- Hour: integer (nullable = true)
 |-- Time_Band: string (nullable = false)
 |-- Light_Simple: strin

In [17]:
# Define the columns that feed the encoders

# Categorical columns (to be passed through StringIndexer + OneHotEncoder).
# The derived "Time_Band", "Light_Simple" and "Weather_Simple" columns are included.
cat_columns = [
    "1st_Road_Class",
    "Day_of_Week",
    "Did_Police_Officer_Attend_Scene_of_Accident",
    "Road_Type",
    "Urban_or_Rural_Area",
    "Age_Band_of_Driver",
    "Sex_of_Driver",
    "Vehicle_Type",
    "Driver_Home_Area_Type",
    "Time_Band",
    "Light_Simple",
    "Weather_Simple",
]

# Numeric columns
num_columns = [
    "Speed_limit",
    "Age_of_Vehicle",
    "Engine_Capacity",
]

In [18]:
# Build the feature-encoding + GBT classification pipeline

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.classification import GBTClassifier
from pyspark.ml import Pipeline

# Encoders: one StringIndexer + OneHotEncoder pair per categorical column.
# handleInvalid="keep" routes nulls and categories that were not seen while
# fitting into an extra index bucket, so transforming the test split does not
# fail with the default handleInvalid="error".
indexers = [
    StringIndexer(inputCol=c, outputCol=c + "_Index", handleInvalid="keep")
    for c in cat_columns
]
encoders = [OneHotEncoder(inputCol=c + "_Index", outputCol=c + "_Vec") for c in cat_columns]

# Features: the one-hot vectors followed by the numeric columns.
# num_columns is reused so the numeric feature list is defined in one place only.
feature_columns = [c + "_Vec" for c in cat_columns] + list(num_columns)
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# Model
gbt = GBTClassifier(labelCol="label", featuresCol="features", maxIter=30)

# Pipeline: all indexers, then all encoders, then the assembler and the classifier
pipeline = Pipeline(stages=indexers + encoders + [assembler, gbt])

In [20]:
from pyspark.sql.functions import col

# 1. Split the rows by class
minor_df = join_df.filter(col("label") == 1)  # dangerous
major_df = join_df.filter(col("label") == 0)  # safe

# 2. Undersample the label-0 rows down to roughly the number of label-1 rows
minor_count = minor_df.count()
major_count = major_df.count()
if major_count == 0:
    # Fail with a clear message instead of a ZeroDivisionError below
    raise ValueError("No rows with label == 0; cannot undersample the majority class.")

# sample() without replacement only accepts a fraction in [0, 1], so clamp the
# ratio in case the label-1 rows outnumber the label-0 rows.
sample_fraction = min(1.0, minor_count / major_count)
major_sample_df = major_df.sample(fraction=sample_fraction, seed=42)

# 3. Combine both classes into the balanced DataFrame
balanced_df = minor_df.union(major_sample_df)

# 4. Train/test split (80% / 20%, fixed seed)
train_df, test_df = balanced_df.randomSplit([0.8, 0.2], seed=42)


In [21]:
# Train the model: fit the pipeline on train_df, then apply the fitted model to test_df

model = pipeline.fit(train_df)
predictions = model.transform(test_df)


In [22]:
# Check AUC

from pyspark.ml.evaluation import BinaryClassificationEvaluator

# The "probability" column is passed as the raw-prediction input of the evaluator
evaluator_params = {
    "labelCol": "label",
    "rawPredictionCol": "probability",
    "metricName": "areaUnderROC",
}
evaluator = BinaryClassificationEvaluator(**evaluator_params)

auc = evaluator.evaluate(predictions)
print("AUC:", auc)


AUC: 0.674229418206051


In [23]:
# threshold 조정

from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def adjust_threshold(probability, threshold=0.2):
    """Turn a class-probability vector into a 0.0 / 1.0 prediction.

    Args:
        probability: Indexable sequence whose element at index 1 is compared
            with the threshold.
        threshold: Cut-off; the comparison is strict (``>``).

    Returns:
        1.0 when ``probability[1]`` is strictly greater than ``threshold``,
        otherwise 0.0.
    """
    positive_prob = probability[1]
    if positive_prob > threshold:
        return 1.0
    return 0.0

# Wrap the thresholding function as a Spark UDF that returns a double
adjust_threshold_udf = udf(
    lambda prob_vector: adjust_threshold(prob_vector, threshold=0.2),
    DoubleType(),
)

# Add the re-thresholded prediction next to the model's own output columns
adjusted_prediction_col = adjust_threshold_udf(col("probability"))
predictions = predictions.withColumn("adjusted_prediction", adjusted_prediction_col)


In [24]:
# Confusion matrix: number of rows for each (label, adjusted_prediction) pair
cm = predictions.groupBy("label", "adjusted_prediction").count()
cm.show()

+-----+-------------------+-----+
|label|adjusted_prediction|count|
+-----+-------------------+-----+
|    1|                0.0|  425|
|    1|                1.0|58081|
|    0|                0.0| 2268|
|    0|                1.0|56380|
+-----+-------------------+-----+



In [25]:
# Precision / recall / F1-score computed from the confusion matrix
# The four counts are read from `cm` rather than typed in by hand, so they stay
# correct when the data, the seed or the threshold changes.
cell_counts = {
    (int(row["label"]), int(row["adjusted_prediction"])): row["count"]
    for row in cm.collect()
}

# A (label, prediction) combination that never occurs has no row in cm -> count 0
TP = cell_counts.get((1, 1), 0)
FP = cell_counts.get((0, 1), 0)
FN = cell_counts.get((1, 0), 0)
TN = cell_counts.get((0, 0), 0)

# Metrics, each guarded against a zero denominator
precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1 = 2 * precision * recall / (precision + recall) if (precision + recall) != 0 else 0

print(f"정밀도: {precision:.4f}")
print(f"재현율: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

정밀도: 0.5074
재현율: 0.9927
F1-score: 0.6716


# GBTClassifier 결과 해석
AUC: 0.67 → 의미 있는 패턴은 잡지만, 강력하지는 않음

## 혼동행렬


  
|label|adjusted_prediction|count|해석|
|---|---|---|---|
|1|0.0|425|X False Negative: 위험인데 안전으로 예측|
|1|1.0|58081|O True Positive: 위험을 잘 맞춤|
|0|0.0|2268|O True Negative: 안전을 잘 맞춤|
|0|1.0|56380|X False Positive: 안전인데 위험으로 예측|

- probability 컬럼: [p0, p1]
  - p0: label=0 (안전일 확률)
  - p1: label=1 (위험일 확률)
- (임계값 조정 전에는 모든 샘플의 p1이 0.5 미만 → prediction이 0.0으로만 나옴)


## 원인 가능성
- 라벨 불균형 (0:1 비율이 너무 차이남)
- 임계값(threshold)이 조정 안 됨
- 모델이 너무 보수적 (risk 회피 성향)


In [26]:
# Stop the Spark session now that the analysis is finished
spark.stop()