In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

- 심사 기준     : **RMSLE(Root Mean Squared Logarithmic Error)** of ECLO  
- ECLO         : 인명피해 심각도(Equivalent Casualty Loss Only)  
- **ECLO = 사망자수 * 10 + 중상자수 * 5 + 경상자수 * 3 + 부상자수 * 1**
- 다른 유형의 사고들을 부상자 기준으로 환산하여 사고의 심각 정도와 위험도를 표현하는 방법  
- 부상자       : 교통사고로 인하여 5일 미만의 치료를 요하는 부상을 입은 경우   
- Public Score : 전체 테스트 데이터 중 30%  
- Private Score: 전체 테스트 데이터 중 70%  

In [2]:
train = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/train.csv')
test = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/test.csv')
sample = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/sample_submission.csv')
accident = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/countrywide_accident.csv')
cctv = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 CCTV 정보.csv', encoding='euc-kr')
light = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 보안등 정보.csv', encoding='euc-kr')
kid = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 어린이 보호 구역 정보.csv', encoding='euc-kr')
park = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 주차장 정보.csv', encoding='euc-kr')

In [3]:
def calculate_rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error (RMSLE) 계산
    
    Parameters:
    - y_true: 실제 값 배열
    - y_pred: 예측 값 배열
    
    Returns:
    - rmsle_score: RMSLE 점수
    """
    assert len(y_true) == len(y_pred), "입력 배열의 길이가 같아야 합니다."
    
    # 각 값에 1을 더하고 로그 취하기
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    
    # 제곱하기
    squared_log_diff = np.square(log_diff)
    
    # 평균 구하기
    mean_squared_log_diff = np.mean(squared_log_diff)
    
    # 제곱근 취하기
    rmsle_score = np.sqrt(mean_squared_log_diff)
    
    return rmsle_score

In [4]:
train = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/train.csv')
test = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/test.csv')
sample = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/sample_submission.csv')
accident = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/countrywide_accident.csv')
cctv = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 CCTV 정보.csv', encoding='euc-kr')
light = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 보안등 정보.csv', encoding='euc-kr')
kid = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 어린이 보호 구역 정보.csv', encoding='euc-kr')
park = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 주차장 정보.csv', encoding='euc-kr')

In [5]:
train.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
test.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
accident.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
train = train.dropna()
accident = accident.dropna()
train = train[train['피해운전자 연령'] != '미분류']
train = train[train['가해운전자 연령'] != '미분류']
accident = accident[accident['피해운전자 연령'] != '미분류']
accident = accident[accident['가해운전자 연령'] != '미분류']

In [6]:
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '98' if x == '98세 이상' else x)
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '98' if x == '98 이상' else x)
train['가해운전자 연령'] = train['가해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
train['가해운전자 연령'] = train['가해운전자 연령'].apply(lambda x: '90' if x == '98세 이상' else x)
accident['가해운전자 연령'] = accident['가해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
accident['가해운전자 연령'] = accident['가해운전자 연령'].apply(lambda x: '90' if x == '98세 이상' else x)
accident['피해운전자 연령'] = accident['피해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
accident['피해운전자 연령'] = accident['피해운전자 연령'].apply(lambda x: '98' if x == '98세 이상' else x)

In [7]:
train['피해운전자 연령'] = [i.replace('세', '') for i in train['피해운전자 연령']]
train['피해운전자 연령'] = train['피해운전자 연령'].astype(int)

train['가해운전자 연령'] = [i.replace('세', '') for i in train['가해운전자 연령']]
train['가해운전자 연령'] = train['가해운전자 연령'].astype(int)

accident['피해운전자 연령'] = [i.replace('세', '') for i in accident['피해운전자 연령']]
accident['피해운전자 연령'] = accident['피해운전자 연령'].astype(int)

accident['가해운전자 연령'] = [i.replace('세', '') for i in accident['가해운전자 연령']]
accident['가해운전자 연령'] = accident['가해운전자 연령'].astype(int)

In [8]:
train['가해운전자 성별'] = train['가해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
train['피해운전자 성별'] = train['피해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
accident['피해운전자 성별'] = accident['피해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
accident['가해운전자 성별'] = accident['가해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)

In [9]:
train['가해운전자 상해정도'] = train['가해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
train['피해운전자 상해정도'] = train['피해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
accident['피해운전자 상해정도'] = accident['피해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
accident['가해운전자 상해정도'] = accident['가해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)

In [10]:
accident = accident[accident['노면상태'] != '해빙']
accident = accident[accident['도로형태'] != '단일로 - 철길건널목']
accident = accident[accident['사고유형 - 세부분류'] != '철길건널목']
accident = accident[accident['사고유형'] != '철길건널목']

In [11]:
accident['사고유형'] = accident['사고유형'].apply(lambda x: 0 if x == '차대차' else 1 if '차대사람' else 2)
train['사고유형'] = train['사고유형'].apply(lambda x: 0 if x == '차대차' else 1  if'차대사람' else 2)
test['사고유형'] = test['사고유형'].apply(lambda x: 0 if x == '차대차' else 1 if '차대사람' else 2)

---

In [38]:
from sklearn.model_selection import train_test_split, GridSearchCV

acc_dummy = pd.get_dummies(accident[['기상상태', '도로형태', '사고유형', '요일', '노면상태']])
train_dummy = pd.get_dummies(train[['기상상태', '도로형태', '사고유형', '요일', '노면상태']])
test_dummy = pd.get_dummies(test[['기상상태', '도로형태', '사고유형', '요일', '노면상태']])

y_acc = accident[['사망자수', '중상자수', '경상자수', '부상자수']]
y_tra = train[['사망자수', '중상자수', '경상자수', '부상자수']]

train_dummy = train_dummy.drop('기상상태_안개', axis=1)
acc_dummy = acc_dummy.drop('기상상태_안개', axis=1)




In [39]:
from sklearn.preprocessing import StandardScaler

# X_train, X_test, y_train, y_test = train_test_split(train_dummy, y_tra, test_size=0.2)
# scaler = StandardScaler()
# X_train_sc = scaler.fit_transform(X_train)
# X_test_sc = scaler.transform(X_test)
# test_sc = scaler.transform(test_dummy)

# X_train, X_test, y_train, y_test = train_test_split(acc_dummy, y_acc, test_size=0.2)
scaler_2 = StandardScaler()
X2_train_sc = scaler_2.fit_transform(acc_dummy)
X2_test_sc = scaler_2.transform(train_dummy)
test_sc = scaler_2.transform(test_dummy)
y2_train = y_acc.copy()
y2_test = y_tra.copy()

In [16]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import time

strat = time.time()

# XGBoost 회귀 모델 초기화
xgb_model = XGBRegressor()

# 다중 출력 회귀를 위한 래퍼
multioutput_model = MultiOutputRegressor(xgb_model)

# 그리드 서치를 위한 파라미터 설정
parameters = {
    'estimator__n_estimators': [100, 300, 500],
    'estimator__learning_rate': [0.001, 0.01, 0.1],
    'estimator__max_depth': [3, 5, 7]
}

# 그리드 서치 객체 생성
grid_search = GridSearchCV(multioutput_model, parameters, cv=3, scoring='neg_mean_squared_error')

# 그리드 서치를 통한 모델 훈련
grid_search.fit(X2_train_sc, y2_train)

# 최적의 파라미터와 점수 출력
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_parameters}")
print(f"Best Score: {best_score}")

# 최적의 파라미터를 가진 모델을 사용하여 테스트 데이터에 대한 예측
best_model = grid_search.best_estimator_
predictions = best_model.predict(X2_test_sc)

end = time.time()
print(f"Time taken: {end - strat:.2f} seconds")

Best Parameters: {'estimator__learning_rate': 0.01, 'estimator__max_depth': 3, 'estimator__n_estimators': 500}
Best Score: -0.33218866258048413
Time taken: 182.02 seconds


In [40]:
xgb = MultiOutputRegressor(XGBRegressor(learning_rate=0.01, max_depth=3, n_estimators=500))
xgb.fit(X2_train_sc, y2_train)
predictions = xgb.predict(X2_test_sc)
calculate_rmsle(y2_test, predictions)

0.2963762096518575

In [41]:
pred_test = xgb.predict(test_sc)
pred_test = pd.DataFrame(pred_test)
pred_test = pred_test.rename(columns={0:'사망자수', 1:'중상자수', 2:'경상자수', 3:'부상자수'})
pred_test['ECLO'] = (pred_test['사망자수']*10) + (pred_test['중상자수']*5) + (pred_test['경상자수']*3) + (pred_test['부상자수']*1)
pred_test

Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
0,0.028722,0.445947,0.571899,0.047743,4.280390
1,0.031110,0.381270,0.597757,0.065764,4.076479
2,0.009522,0.308217,1.263781,0.098719,5.526371
3,0.012303,0.257018,1.331500,0.107679,5.510296
4,0.009522,0.308217,1.263781,0.098719,5.526371
...,...,...,...,...,...
10958,0.022926,0.347062,1.860155,0.162095,7.707131
10959,0.012303,0.257018,1.331500,0.107679,5.510296
10960,0.012303,0.257018,1.331500,0.107679,5.510296
10961,0.007837,0.227020,1.354996,0.096675,5.375132


In [42]:
sample.drop('ECLO', axis=1, inplace=True)
sample['ECLO'] = pred_test['ECLO']
sample.to_csv('submission9_xgb4_2.csv', index=False)

In [18]:
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 회귀 모델 초기화
rf_model = RandomForestRegressor()

# 다중 출력 회귀를 위한 래퍼
multioutput_model = MultiOutputRegressor(rf_model)

# 그리드 서치를 위한 파라미터 설정
parameters = {
    'estimator__n_estimators': [100, 300, 500],
    'estimator__max_depth': [3, 5, 7],
    'estimator__min_samples_split': [1 ,2, 5]
}

# 그리드 서치 객체 생성
grid_search = GridSearchCV(multioutput_model, parameters, cv=3, scoring='neg_mean_squared_error')

# 그리드 서치를 통한 모델 훈련
grid_search.fit(X_train_sc, y_train)

# 최적의 파라미터와 점수 출력
best_parameters = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_parameters}")
print(f"Best Score: {best_score}")

# 최적의 파라미터를 가진 모델을 사용하여 테스트 데이터에 대한 예측
best_model = grid_search.best_estimator_
predictions = best_model.predict(X_test_sc)


Best Parameters: {'estimator__max_depth': 5, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 100}
Best Score: -0.3321994321074865


In [37]:
xgb = MultiOutputRegressor(RandomForestRegressor(n_estimators=500, max_depth=5, min_samples_split=2))
xgb.fit(X2_train_sc, y2_train)
predictions = xgb.predict(X2_test_sc)
calculate_rmsle(y2_test, predictions)

0.2966231445989728

In [100]:
pred_test = xgb.predict(test_sc)
pred_test = pd.DataFrame(pred_test)
pred_test = pred_test.rename(columns={0:'사망자수', 1:'중상자수', 2:'경상자수', 3:'부상자수'})
pred_test['ECLO'] = (pred_test['사망자수']*10) + (pred_test['중상자수']*5) + (pred_test['경상자수']*3) + (pred_test['부상자수']*1)
pred_test

Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
0,0.026509,0.396197,0.576100,0.053055,4.027424
1,0.026209,0.352960,0.588185,0.089077,3.880525
2,0.004164,0.310099,1.206833,0.131921,5.344553
3,0.004243,0.217613,1.244224,0.120983,4.984154
4,0.004164,0.310099,1.206833,0.131921,5.344553
...,...,...,...,...,...
10958,0.003322,0.228429,1.458153,0.103750,5.653575
10959,0.004243,0.217613,1.244224,0.120983,4.984154
10960,0.004243,0.217613,1.244224,0.120983,4.984154
10961,0.002879,0.216920,1.250912,0.126391,4.992524


In [89]:
sample.drop('ECLO', axis=1, inplace=True)
sample['ECLO'] = pred_test['ECLO']
sample.to_csv('submission7_rf4_.csv', index=False)