In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
plt.rcParams['font.family'] = 'Malgun Gothic'
plt.rcParams['axes.unicode_minus'] = False

- 심사 기준     : **RMSLE(Root Mean Squared Logarithmic Error)** of ECLO  
- ECLO         : 인명피해 심각도(Equivalent Casualty Loss Only)  
- **ECLO = 사망자수 * 10 + 중상자수 * 5 + 경상자수 * 3 + 부상자수 * 1**
- 다른 유형의 사고들을 부상자 기준으로 환산하여 사고의 심각 정도와 위험도를 표현하는 방법  
- 부상자       : 교통사고로 인하여 5일 미만의 치료를 요하는 부상을 입은 경우   
- Public Score : 전체 테스트 데이터 중 30%  
- Private Score: 전체 테스트 데이터 중 70%  

In [2]:
train = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/train.csv')
test = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/test.csv')
sample = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/sample_submission.csv')
accident = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/countrywide_accident.csv')
cctv = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 CCTV 정보.csv', encoding='euc-kr')
light = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 보안등 정보.csv', encoding='euc-kr')
kid = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 어린이 보호 구역 정보.csv', encoding='euc-kr')
park = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 주차장 정보.csv', encoding='euc-kr')

In [3]:
def calculate_rmsle(y_true, y_pred):
    """
    Root Mean Squared Logarithmic Error (RMSLE) 계산
    
    Parameters:
    - y_true: 실제 값 배열
    - y_pred: 예측 값 배열
    
    Returns:
    - rmsle_score: RMSLE 점수
    """
    assert len(y_true) == len(y_pred), "입력 배열의 길이가 같아야 합니다."
    
    # 각 값에 1을 더하고 로그 취하기
    log_diff = np.log1p(y_pred) - np.log1p(y_true)
    
    # 제곱하기
    squared_log_diff = np.square(log_diff)
    
    # 평균 구하기
    mean_squared_log_diff = np.mean(squared_log_diff)
    
    # 제곱근 취하기
    rmsle_score = np.sqrt(mean_squared_log_diff)
    
    return rmsle_score

In [4]:
train = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/train.csv')
test = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/test.csv')
sample = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/sample_submission.csv')
accident = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/countrywide_accident.csv')
cctv = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 CCTV 정보.csv', encoding='euc-kr')
light = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 보안등 정보.csv', encoding='euc-kr')
kid = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 어린이 보호 구역 정보.csv', encoding='euc-kr')
park = pd.read_csv('/users/jjong/desktop/vscode/Dacon_src/Daegu/external_open/대구 주차장 정보.csv', encoding='euc-kr')

In [5]:
train.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
test.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
accident.drop(['ID', '사고일시', '시군구'], axis=1, inplace=True)
train = train.dropna()
accident = accident.dropna()
train = train[train['피해운전자 연령'] != '미분류']
train = train[train['가해운전자 연령'] != '미분류']
accident = accident[accident['피해운전자 연령'] != '미분류']
accident = accident[accident['가해운전자 연령'] != '미분류']

In [6]:
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '98' if x == '98세 이상' else x)
train['피해운전자 연령'] = train['피해운전자 연령'].apply(lambda x: '98' if x == '98 이상' else x)
train['가해운전자 연령'] = train['가해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
train['가해운전자 연령'] = train['가해운전자 연령'].apply(lambda x: '90' if x == '98세 이상' else x)
accident['가해운전자 연령'] = accident['가해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
accident['가해운전자 연령'] = accident['가해운전자 연령'].apply(lambda x: '90' if x == '98세 이상' else x)
accident['피해운전자 연령'] = accident['피해운전자 연령'].apply(lambda x: '90' if x == '90세 이상' else x)
accident['피해운전자 연령'] = accident['피해운전자 연령'].apply(lambda x: '98' if x == '98세 이상' else x)

In [7]:
train['피해운전자 연령'] = [i.replace('세', '') for i in train['피해운전자 연령']]
train['피해운전자 연령'] = train['피해운전자 연령'].astype(int)

train['가해운전자 연령'] = [i.replace('세', '') for i in train['가해운전자 연령']]
train['가해운전자 연령'] = train['가해운전자 연령'].astype(int)

accident['피해운전자 연령'] = [i.replace('세', '') for i in accident['피해운전자 연령']]
accident['피해운전자 연령'] = accident['피해운전자 연령'].astype(int)

accident['가해운전자 연령'] = [i.replace('세', '') for i in accident['가해운전자 연령']]
accident['가해운전자 연령'] = accident['가해운전자 연령'].astype(int)

In [8]:
train['가해운전자 성별'] = train['가해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
train['피해운전자 성별'] = train['피해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
accident['피해운전자 성별'] = accident['피해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)
accident['가해운전자 성별'] = accident['가해운전자 성별'].apply(lambda x: 0 if x == '남' else 1 if x == '여' else 2)

In [9]:
train['가해운전자 상해정도'] = train['가해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
train['피해운전자 상해정도'] = train['피해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
accident['피해운전자 상해정도'] = accident['피해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)
accident['가해운전자 상해정도'] = accident['가해운전자 상해정도'].apply(lambda x: 0 if x == '상해없음' else 1 if x == '부상신고' or x == '기타불명' else 3 if x == '경상' else 5 if x == '중상' else 10)

In [10]:
accident = accident[accident['노면상태'] != '해빙']
accident = accident[accident['도로형태'] != '단일로 - 철길건널목']
accident = accident[accident['사고유형 - 세부분류'] != '철길건널목']
accident = accident[accident['사고유형'] != '철길건널목']

In [11]:
accident['사고유형'] = accident['사고유형'].apply(lambda x: 0 if x == '차대차' else 1 if '차대사람' else 2)
train['사고유형'] = train['사고유형'].apply(lambda x: 0 if x == '차대차' else 1  if'차대사람' else 2)
test['사고유형'] = test['사고유형'].apply(lambda x: 0 if x == '차대차' else 1 if '차대사람' else 2)

---

In [95]:
from sklearn.model_selection import train_test_split, GridSearchCV

acc_dummy = pd.get_dummies(accident[['기상상태', '도로형태', '사고유형']])
train_dummy = pd.get_dummies(train[['기상상태', '도로형태', '사고유형']])
test_dummy = pd.get_dummies(test[['기상상태', '도로형태', '사고유형']])

y_acc = accident[['사망자수', '중상자수', '경상자수', '부상자수']]
y_tra = train[['사망자수', '중상자수', '경상자수', '부상자수']]

train_dummy = train_dummy.drop('기상상태_안개', axis=1)
acc_dummy = acc_dummy.drop('기상상태_안개', axis=1)

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(train_dummy, y_tra, test_size=0.2)
# X_train, X_test, y_train, y_test = train_test_split(acc_dummy, y_acc, test_size=0.2)

In [97]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

std = StandardScaler()
X_train_sc = std.fit_transform(X_train)
X_test_sc = std.transform(X_test)
test_sc = std.transform(test_dummy)

mm = MinMaxScaler()

X_train_mm = mm.fit_transform(X_train)
X_test_mm = mm.transform(X_test)
test_mm = mm.transform(test_dummy)

In [98]:
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# 1. 기본 모델 훈련
input_shape = (X_train_sc.shape[1],)
output_shape = (4,)

# 기본 모델 구성
input_layer = Input(shape=input_shape)
x = Dense(256, activation='relu')(input_layer)
x = BatchNormalization()(x) 
x = Dropout(0.5)(x)  

x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(32, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(16, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(8, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(4, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(4, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(8, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(16, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(32, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(64, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(8, activation='relu')(x)
x = BatchNormalization()(x) 
x = Dropout(0.4)(x)  

x = Dense(4, activation='relu')(x)
x = BatchNormalization()(x) 
output_layer = Dense(output_shape[0], activation='linear')(x)

model = Model(inputs=input_layer, outputs=output_layer)
optimizer = Adam(learning_rate=0.001)

# 모델 컴파일 및 훈련
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics='mse')
model.summary()

Model: "model_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_23 (InputLayer)       [(None, 17)]              0         
                                                                 
 dense_236 (Dense)           (None, 256)               4608      
                                                                 
 batch_normalization_207 (B  (None, 256)               1024      
 atchNormalization)                                              
                                                                 
 dropout_155 (Dropout)       (None, 256)               0         
                                                                 
 dense_237 (Dense)           (None, 128)               32896     
                                                                 
 batch_normalization_208 (B  (None, 128)               512       
 atchNormalization)                                       

In [50]:
# from tensorflow.keras.callbacks import EarlyStopping

# # EarlyStopping 콜백 설정
# # early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# # 간소화된 모델 구성
# input_layer = Input(shape=input_shape)
# x = Dense(64, activation='relu')(input_layer)
# x = BatchNormalization()(x) 
# x = Dense(32, activation='relu')(x)
# x = BatchNormalization()(x) 
# output_layer = Dense(output_shape[0], activation='linear')(x)

# model = Model(inputs=input_layer, outputs=output_layer)

# # 모델 컴파일 및 훈련
# model.compile(optimizer='adam', loss='mean_squared_error')
# model.summary()



In [99]:
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
model.fit(X_train_sc, y_train, validation_split=0.2, batch_size=128, epochs=100, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100


<keras.src.callbacks.History at 0x2ea89053f10>

In [100]:
pred = model.predict(X_test_sc)
pred = pd.DataFrame(pred)
pred = pred.rename(columns={0:'사망자수', 1:'중상자수', 2:'경상자수', 3:'부상자수'})
pred['ECLO'] = (pred['사망자수']*10) + (pred['중상자수']*5) + (pred['경상자수']*3) + (pred['부상자수']*1)
calculate_rmsle(y_test, pred)



0.31671899056149133

In [57]:
pred.shape, sample.shape

((7514, 5), (10963, 2))

In [60]:
pred = model.predict(test_sc)
pred = pd.DataFrame(pred)
pred = pred.rename(columns={0:'사망자수', 1:'중상자수', 2:'경상자수', 3:'부상자수'})
pred['ECLO'] = (pred['사망자수']*10) + (pred['중상자수']*5) + (pred['경상자수']*3) + (pred['부상자수']*1)

sample.drop('ECLO', axis=1, inplace=True)
sample['ECLO'] = pred['ECLO']
sample.to_csv('submission8_DL4_.csv', index=False)

