In [None]:
import os
import random
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

import warnings
warnings.filterwarnings('ignore')

In [None]:
train_df=pd.read_csv("/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/train.csv")
test_df=pd.read_csv("/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/test.csv")

In [None]:
light_df = pd.read_csv('/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소','설치형태']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

light_df = light_df.groupby(['도시', '구', '동']).agg({'설치개수': 'sum','설치형태':'first'}).reset_index()

light_df.reset_index(inplace=True, drop=True)

In [None]:
child_area_df = pd.read_csv('/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['CCTV설치대수','소재지지번주소','보호구역도로폭','CCTV설치여부']]
child_area_df['cnt'] = 1

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['도시', '구', '동']).agg({'CCTV설치대수': 'sum','보호구역도로폭':'first','CCTV설치여부':'first'}).reset_index()
child_area_df.reset_index(inplace=True, drop=True)

In [None]:
child_area_df.fillna(0, inplace=True)

In [None]:
child_area_df['CCTV설치여부'] = child_area_df['CCTV설치여부'].replace({'Y': 1, 'N': 0})

In [None]:
parking_df = pd.read_csv('/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

In [None]:
cctv_df = pd.read_csv('/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/대구 CCTV 정보.csv', encoding='cp949')[['소재지지번주소', '단속구분','제한속도']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

cctv_df[['도시', '구', '동', '번지']] = cctv_df['소재지지번주소'].str.extract(location_pattern)
cctv_df = cctv_df.drop(columns=['소재지지번주소', '번지'])

cctv_df = cctv_df.groupby(['도시', '구', '동']).agg({'단속구분': 'first','제한속도':'first'}).reset_index()
cctv_df.reset_index(inplace=True, drop=True)

In [None]:
cctv_df

Unnamed: 0,도시,구,동,단속구분,제한속도
0,대구,달성군,가창면,1,30
1,대구,달성군,구지면,1,30
2,대구,달성군,다사읍,1,30
3,대구,달성군,옥포읍,2,30
4,대구,달성군,유가읍,2,30
...,...,...,...,...,...
176,대구광역시,중구,인교동,4,0
177,대구광역시,중구,종로1가,4,0
178,대구광역시,중구,종로2가,4,0
179,대구광역시,중구,태평로1가,4,0


In [None]:
location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_df['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_df['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

In [None]:
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})'

train_df[['연', '월', '일', '시간']] = train_df['사고일시'].str.extract(time_pattern)
train_df[['연', '월', '일', '시간']] = train_df[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다
train_df = train_df.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다

# 해당 과정을 test_x에 대해서도 반복해줍니다
test_df[['연', '월', '일', '시간']] = test_df['사고일시'].str.extract(time_pattern)
test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
test_df = test_df.drop(columns=['사고일시'])

In [None]:
road_pattern = r'(.+) - (.+)'

train_df[['도로형태1', '도로형태2']] = train_df['도로형태'].str.extract(road_pattern)
train_df = train_df.drop(columns=['도로형태'])

test_df[['도로형태1', '도로형태2']] = test_df['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

In [None]:
# train_df와 test_df에, light_df와 child_area_df, parking_df를 merge하세요.
train_df = pd.merge(train_df, light_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, cctv_df, how='left', on=['도시', '구', '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, cctv_df, how='left', on=['도시', '구', '동'])

In [None]:
train_df = train_df.drop(columns=['도시','구','ID'])

In [None]:
test_df = test_df.drop(columns=['도시','구','ID'])

In [None]:
train_df

NameError: ignored

In [None]:
#train_df=pd.get_dummies(train_df.drop(['요일','ID','가해운전자 연령','피해운전자 연령'], axis=1))

In [None]:
import pandas as pd
train_df=pd.read_csv("/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/train_df.csv")
test_df=pd.read_csv("/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/test_df.csv")

In [None]:
train_df.fillna(0, inplace=True)
test_df.fillna(0, inplace=True)

In [None]:
X_train=train_df.drop(columns='ECLO')
y_train=train_df['ECLO']
X_test=train_df.drop(columns='ECLO')

In [None]:
X_train = X_train.drop(columns=['군_남구','군_달서구','군_달성군','군_동구','군_북구','군_서구','군_수성구','군_중구'])
X_test = X_test.drop(columns=['군_남구','군_달서구','군_달성군','군_동구','군_북구','군_서구','군_수성구','군_중구'])

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.3


In [None]:
from sklearn.preprocessing import LabelEncoder
from category_encoders.target_encoder import TargetEncoder

categorical_features = list(X_train.dtypes[X_train.dtypes == "object"].index)
# 추출된 문자열 변수 확인
display(categorical_features)

for i in categorical_features:
    le = TargetEncoder(cols=[i])
    X_train[i] = le.fit_transform(X_train[i], y_train)
    X_test[i] = le.transform(X_test[i])

display(X_train.head())
display(X_test.head())

['ID',
 '시군구',
 '사고유형 - 세부분류',
 '법규위반',
 '가해운전자 차종',
 '가해운전자 성별',
 '가해운전자 연령',
 '가해운전자 상해정도',
 '피해운전자 차종',
 '피해운전자 성별',
 '피해운전자 연령',
 '피해운전자 상해정도']

Unnamed: 0,ID,요일,기상상태,시군구,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,...,사고 발생 시간대,요일 가중치,도로형태 가중치,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3,사고발생횟수,season
0,4.762262,4.627926,4.712888,4.282449,4.712878,3.81765,3.152034,4.484001,4.912151,4.790513,...,5.232928,1.0,1.0,391.0,2.0,11.0,0.0,0.0,131,4.659111
1,4.502045,4.627926,4.77915,4.738938,4.712878,3.81765,3.78626,4.131089,4.912151,4.773165,...,5.232928,1.0,1.0,932.0,0.0,0.0,1.0,3.0,678,4.659111
2,4.502045,4.627926,4.712888,4.842715,4.712878,3.81765,3.907468,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,473.0,5.0,0.0,0.0,0.0,604,4.659111
3,4.762262,4.627926,4.712888,4.20892,4.712878,4.944597,5.572812,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,534.0,11.0,0.0,9.0,5.0,426,4.659111
4,4.502045,4.627926,4.712888,4.549091,4.712878,4.944597,5.572812,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,2057.0,0.0,0.0,1.0,0.0,825,4.659111


Unnamed: 0,ID,요일,기상상태,시군구,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,...,사고 발생 시간대,요일 가중치,도로형태 가중치,설치개수,cnt,급지구분_1,급지구분_2,급지구분_3,사고발생횟수,season
0,4.762262,4.627926,4.712888,4.282449,4.712878,3.81765,3.152034,4.484001,4.912151,4.790513,...,5.232928,1.0,1.0,391.0,2.0,11.0,0.0,0.0,131,4.659111
1,4.502045,4.627926,4.77915,4.738938,4.712878,3.81765,3.78626,4.131089,4.912151,4.773165,...,5.232928,1.0,1.0,932.0,0.0,0.0,1.0,3.0,678,4.659111
2,4.502045,4.627926,4.712888,4.842715,4.712878,3.81765,3.907468,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,473.0,5.0,0.0,0.0,0.0,604,4.659111
3,4.762262,4.627926,4.712888,4.20892,4.712878,4.944597,5.572812,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,534.0,11.0,0.0,9.0,5.0,426,4.659111
4,4.502045,4.627926,4.712888,4.549091,4.712878,4.944597,5.572812,4.484001,4.912151,4.773165,...,5.232928,1.0,1.0,2057.0,0.0,0.0,1.0,0.0,825,4.659111


In [None]:
!pip install category_encoders



In [None]:
train_x['시간'].unique
# 오전 오후 나눠도 될 듯

<bound method Series.unique of 0         0
1         0
2         1
3         2
4         4
         ..
39604    19
39605    19
39606    21
39607    22
39608    23
Name: 시간, Length: 39609, dtype: int64>

In [None]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

In [None]:
import tensorflow as tf

def rmsle(y_true, y_pred):
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)

    squared_error = tf.square(tf.math.log1p(y_pred) - tf.math.log1p(y_true))

    return tf.sqrt(tf.reduce_mean(squared_error))

def loss_fn(y_true, y_pred):
    return rmsle(y_true, y_pred)

def metric_fn(y_true, y_pred):
    return rmsle(y_true, y_pred)

In [None]:
callbacks_list = [
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=30, verbose=2, mode='min',restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=3, min_lr=0.00001),
    tf.keras.callbacks.TerminateOnNaN()
]

In [None]:
def create_model():

    input_layer = tf.keras.Input(shape=(len(train_x.columns), ))
    x = tf.keras.layers.BatchNormalization(epsilon=0.00001)(input_layer)
    x = tf.keras.layers.Dense(16, activation='relu')(x)

    x = tf.keras.layers.Dense(32, activation='relu')(x)

    x = tf.keras.layers.Dense(48, activation='relu')(x)
    output_layer = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer)

    model.compile(optimizer=tf.keras.optimizers.Adam(0.001),
                  loss=loss_fn,
                  metrics=[metric_fn]
                  )

    return model

model = create_model()
history = model.fit(train_x.astype('float32'), train_y.astype('float32'),
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_split=0.1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
test_x

Unnamed: 0,요일,기상상태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,가해운전자 상해정도,피해운전자 차종,피해운전자 성별,피해운전자 연령,피해운전자 상해정도,사망자수,중상자수,경상자수,부상자수,동,연,월,일,시간,도로형태1,도로형태2,설치개수,설치형태,CCTV설치대수,보호구역도로폭,CCTV설치여부,급지구분_1,급지구분_2,급지구분_3,단속구분,제한속도
0,화요일,맑음,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,상해없음,보행자,여,70세,중상,0,1,0,0,대신동,2019,1,1,0,단일로,기타,391.0,한전주,13.0,4~32,1.0,11.0,0.0,0.0,4.0,0.0
1,화요일,흐림,건조,차대사람,보도통행중,기타,승용,남,39세,상해없음,보행자,남,61세,경상,0,0,1,0,감삼동,2019,1,1,0,단일로,기타,932.0,0,0.0,0,0.0,0.0,1.0,3.0,4.0,0.0
2,화요일,맑음,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,상해없음,보행자,남,38세,경상,0,0,1,0,두산동,2019,1,1,1,단일로,기타,473.0,건축물,0.0,0,1.0,0.0,0.0,0.0,1.0,60.0
3,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,49세,상해없음,승용,남,36세,중상,0,1,0,0,복현동,2019,1,1,2,단일로,기타,534.0,0,32.0,10.0,1.0,0.0,9.0,5.0,2.0,50.0
4,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,30세,상해없음,승용,남,52세,경상,0,0,1,0,신암동,2019,1,1,4,단일로,기타,2057.0,한전주,0.0,0,0.0,0.0,1.0,0.0,2.0,50.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,금요일,맑음,건조,차대차,측면충돌,신호위반,승용,여,52세,상해없음,이륜,남,28세,경상,0,0,1,0,수성동3가,2021,12,31,19,교차로,교차로안,0.0,0,0.0,0,1.0,0.0,0.0,0.0,2.0,40.0
39605,금요일,맑음,건조,차대차,측면충돌,안전거리미확보,승용,여,60세,상해없음,승용,남,52세,경상,0,0,1,0,상인동,2021,12,31,19,단일로,기타,843.0,0,0.0,0,0.0,0.0,0.0,5.0,4.0,0.0
39606,금요일,맑음,건조,차대차,측면충돌,교차로운행방법위반,승용,남,60세,중상,승용,남,73세,중상,0,2,0,0,월성동,2021,12,31,21,교차로,교차로안,164.0,0,0.0,0,0.0,0.0,1.0,0.0,4.0,0.0
39607,금요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,40세,상해없음,승용,여,57세,경상,0,0,1,0,장동,2021,12,31,22,기타,기타,210.0,0,0.0,0,0.0,0.0,0.0,1.0,1.0,60.0


In [None]:
sample_submission = pd.read_csv('/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/sample_submission.csv')
sample_submission["ECLO"] = model.predict(test_x.astype('float32'))

sample_submission.to_csv("/content/drive/MyDrive/대구 교통사고 피해 예측 경진대회/tensorflow_ss77.csv", index=False)



ValueError: ignored