In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
import shap
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

# 1. Load the data
# The workbook must be in the same folder the code is run from.
df = pd.read_excel(io="example.xlsx")

# 2. Build the composite indicators
# Required columns: 사망자수, 중상자수, 경상자수 (casualty counts; must end up numeric).
# Entries that cannot be parsed as numbers become NaN and are then counted as 0.
df = df.assign(**{
    count_col: pd.to_numeric(df[count_col], errors='coerce').fillna(0)
    for count_col in ('사망자수', '중상자수', '경상자수')
})

# Later keyword arguments of assign() can read the columns created by earlier ones,
# so total_risk is derived from the two scores defined just above it.
df = df.assign(
    # 1.0 for every row, +0.7 when 중상자수 > 0, +0.3 when 경상자수 > 0.
    incident_score=lambda d: 1.0 + 0.7 * (d['중상자수'] > 0) + 0.3 * (d['경상자수'] > 0),
    # Counts weighted 1.0 / 0.7 / 0.3.
    severity_score=lambda d: d['사망자수'] * 1.0 + d['중상자수'] * 0.7 + d['경상자수'] * 0.3,
    # Weighted blend: 0.4 * incident_score + 0.6 * severity_score.
    total_risk=lambda d: d['incident_score'] * 0.4 + d['severity_score'] * 0.6,
)

# 3. Define features & targets
# NOTE(review): total_risk is computed directly from 사망자수 / 중상자수 / 경상자수 when
# the scores are built, and those same columns are used as features here, so the
# models can reconstruct the target from their inputs (target leakage). Confirm this
# is intended before reading the metrics below as predictive performance.
feature_cols = ['사망자수', '중상자수', '경상자수', '부상신고자수']
# 부상신고자수 is not cleaned together with the other three count columns, so every
# feature is coerced to numeric here with the same rule (unparseable -> NaN -> 0).
# Without this, text or blank cells in that column can make the scaler / linear
# models raise during fit.
X = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
y_reg = df['total_risk']
# Binary classification target: 1 when total_risk is above its median, otherwise 0.
y_clf = (df['total_risk'] > df['total_risk'].median()).astype(int)

# 4. Train/test split
# One call splits the features and both targets with the same shuffled row indices
# (80% train / 20% test, fixed seed).
(X_train, X_test,
 y_train_reg, y_test_reg,
 y_train_clf, y_test_clf) = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42
)

# 5. Standardisation (needed by the linear models)
# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6-1. Linear Regression (regression with interpretable coefficients)
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_reg)
y_pred_lr = lr.predict(X_test_scaled)
# RMSE is computed as sqrt(MSE). The `squared=False` keyword of mean_squared_error
# was deprecated in scikit-learn 1.4 and removed in 1.6, where it raises TypeError;
# taking the root explicitly works on old and new releases alike.
print("Linear Regression RMSE:", np.sqrt(mean_squared_error(y_test_reg, y_pred_lr)))
# Coefficients are on the standardised feature scale (the model was fit on X_train_scaled).
print("회귀 계수 (선형모델 해석):", dict(zip(feature_cols, lr.coef_)))

# 6-2. Random Forest Regressor (regression that can capture non-linear relations)
# Trained on the unscaled features; the seed is fixed for reproducibility.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train_reg)
y_pred_rf = rf.predict(X_test)
# RMSE is computed as sqrt(MSE) because `squared=False` was deprecated in
# scikit-learn 1.4 and removed in 1.6 (TypeError on current releases).
print("Random Forest Regressor RMSE:", np.sqrt(mean_squared_error(y_test_reg, y_pred_rf)))

# 6-3. Permutation importance of the random-forest regressor, measured on the test split
perm = permutation_importance(
    estimator=rf,
    X=X_test,
    y=y_test_reg,
    n_repeats=10,
    random_state=42,
)
print("Permutation Importance (Reg):")
# argsort() is ascending, so iterate it in reverse to list the largest mean importance first.
for i in reversed(perm.importances_mean.argsort()):
    print(f"{feature_cols[i]}: {perm.importances_mean[i]:.4f}")

# 6-4. SHAP values for the random-forest regressor
# X_train is handed to the explainer as its masker/background data;
# the explanations themselves are computed for the test rows.
explainer = shap.Explainer(model=rf, masker=X_train)
shap_values = explainer(X_test)

# Summary plot drawn as a bar chart (overall feature importance)
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)

# 6-5. Partial dependence plots (effect of a single feature on the prediction)
# features=[0, 1] selects the first two entries of feature_cols.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 4))
PartialDependenceDisplay.from_estimator(
    estimator=rf,
    X=X_test,
    features=[0, 1],
    feature_names=feature_cols,
    ax=ax,
)
plt.tight_layout()
plt.show()

# 7-1. Logistic Regression (binary classification with interpretable coefficients)
# fit() returns the estimator itself, so construction and training are chained.
logr = LogisticRegression().fit(X_train_scaled, y_train_clf)
y_pred_logr = logr.predict(X_test_scaled)
print("Logistic Regression Accuracy:", accuracy_score(y_true=y_test_clf, y_pred=y_pred_logr))
# The first row of coef_ is paired with feature_cols, one weight per feature.
print("회귀 계수 (로지스틱):", {name: weight for name, weight in zip(feature_cols, logr.coef_[0])})

# 7-2. Random Forest Classifier (non-linear classifier)
# Trained on the unscaled features with a fixed seed.
rfc = RandomForestClassifier(random_state=42).fit(X_train, y_train_clf)
y_pred_rfc = rfc.predict(X_test)
print(
    "Random Forest Classifier Accuracy:",
    accuracy_score(y_true=y_test_clf, y_pred=y_pred_rfc),
)
print(classification_report(y_true=y_test_clf, y_pred=y_pred_rfc))


셀별 실행: 위 전체 스크립트를 단계별 셀로 나누어 실행하는 버전입니다. 아래 셀들은 맨 위 셀의 import(pd, plt 등)에 의존하므로, 그 import를 먼저 실행해야 합니다.

In [None]:
# The workbook must be in the same folder as this code file.
df = pd.read_excel(io="example.xlsx")


In [None]:
# Clean the three casualty-count columns: unparseable entries become NaN, then 0.
df = df.assign(**{
    count_col: pd.to_numeric(df[count_col], errors='coerce').fillna(0)
    for count_col in ('사망자수', '중상자수', '경상자수')
})

# Later keyword arguments of assign() can read the columns created by earlier ones,
# so total_risk is derived from the two scores defined just above it.
df = df.assign(
    # 1.0 for every row, +0.7 when 중상자수 > 0, +0.3 when 경상자수 > 0.
    incident_score=lambda d: 1.0 + 0.7 * (d['중상자수'] > 0) + 0.3 * (d['경상자수'] > 0),
    # Counts weighted 1.0 / 0.7 / 0.3.
    severity_score=lambda d: d['사망자수'] * 1.0 + d['중상자수'] * 0.7 + d['경상자수'] * 0.3,
    # Weighted blend: 0.4 * incident_score + 0.6 * severity_score.
    total_risk=lambda d: d['incident_score'] * 0.4 + d['severity_score'] * 0.6,
)


In [None]:
# NOTE(review): total_risk is computed directly from 사망자수 / 중상자수 / 경상자수 in the
# score-building cell, and those same columns are used as features here, so the
# models can reconstruct the target from their inputs (target leakage). Confirm this
# is intended before reading the metrics as predictive performance.
feature_cols = ['사망자수', '중상자수', '경상자수', '부상신고자수']
# 부상신고자수 is not cleaned together with the other three count columns, so every
# feature is coerced to numeric here with the same rule (unparseable -> NaN -> 0).
# Without this, text or blank cells in that column can make the scaler / linear
# models raise during fit.
X = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
y_reg = df['total_risk']
# Binary classification target: 1 when total_risk is above its median, otherwise 0.
y_clf = (df['total_risk'] > df['total_risk'].median()).astype(int)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# One call splits the features and both targets with the same shuffled row indices
# (80% train / 20% test, fixed seed).
(X_train, X_test,
 y_train_reg, y_test_reg,
 y_train_clf, y_test_clf) = train_test_split(
    X, y_reg, y_clf, test_size=0.2, random_state=42
)

# The scaler is fitted on the training rows only and then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Regression models: linear baseline, random forest, and permutation importance.
# Everything this cell calls is imported here so the cell does not depend on names
# imported by an earlier cell.
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train_reg)
# RMSE is computed as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, where it raises TypeError.
print("Linear RMSE:", np.sqrt(mean_squared_error(y_test_reg, lr.predict(X_test_scaled))))
print(dict(zip(feature_cols, lr.coef_)))  # coefficients on the standardised feature scale

# Random Forest Regression
# Seed fixed so repeated runs give the same forest (matches the full-script cell).
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train_reg)
# The original printed plain MSE under the "RF RMSE" label; take the root so the
# number is an RMSE and is comparable with the linear model's RMSE above.
print("RF RMSE:", np.sqrt(mean_squared_error(y_test_reg, rf.predict(X_test))))

# Permutation Importance (random-forest regressor, measured on the test split)
perm = permutation_importance(rf, X_test, y_test_reg, n_repeats=10, random_state=42)
print("Permutation Importance:")
# argsort() is ascending; [::-1] lists the largest mean importance first.
for i in perm.importances_mean.argsort()[::-1]:
    print(f"{feature_cols[i]}: {perm.importances_mean[i]:.4f}")


In [None]:
import shap

# X_train is handed to the explainer as its masker/background data;
# the explanations themselves are computed for the test rows.
explainer = shap.Explainer(model=rf, masker=X_train)
shap_values = explainer(X_test)

# Bar-style summary plot: overall feature importance.
shap.summary_plot(
    shap_values,
    features=X_test,
    plot_type="bar",
)


In [None]:
from sklearn.inspection import PartialDependenceDisplay

# Partial dependence of the random-forest regressor on the first two entries of
# feature_cols (features=[0, 1]); no axes are passed, so the display builds its own figure.
PartialDependenceDisplay.from_estimator(
    estimator=rf,
    X=X_test,
    features=[0, 1],
    feature_names=feature_cols,
)
plt.tight_layout()
plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Logistic Regression (interpretable binary classifier), fit on the standardised features
logr = LogisticRegression()
logr.fit(X_train_scaled, y_train_clf)
print("Logistic Accuracy:", accuracy_score(y_test_clf, logr.predict(X_test_scaled)))
# The first row of coef_ is paired with feature_cols, one weight per feature.
print(dict(zip(feature_cols, logr.coef_[0])))

# Random Forest Classifier (performance-oriented binary classifier)
# Seed fixed so the reported scores are reproducible across runs, matching the
# full-script cell, which also uses random_state=42.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train_clf)
# Predict once and reuse the result for both reports.
y_pred_rfc = rfc.predict(X_test)
print("RFC Accuracy:", accuracy_score(y_test_clf, y_pred_rfc))
print(classification_report(y_test_clf, y_pred_rfc))
