In [1]:
import os
import random
import numpy as np


def seed_everything(seed):
    """Seed the random sources used by this notebook so reruns are repeatable.

    Writes ``PYTHONHASHSEED`` into ``os.environ`` and seeds both the stdlib
    ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


# Fix the seed once, at the top of the notebook.
seed_everything(42)

In [2]:

import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

plt.rc('font', family='Malgun Gothic')  # set the plotting font
plt.rc('axes', unicode_minus=False)  # minus-sign rendering setting for axis text
%config InlineBackend.figure_format = 'retina'  # sharper text in inline figures

In [3]:
# Load the raw train and test tables from the shared data directory.
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [4]:
# Keep only the columns that the test set also provides, plus the target.
target_col = ['ECLO']
use_col = [*df_test.columns, *target_col]

In [5]:
# Restrict the training frame to the shared feature columns + target.
df_train = df_train.loc[:, use_col]

# Column counts of train vs. test (train carries the extra target column).
print(df_train.shape[1], df_test.shape[1])

9 8


# ID

In [6]:
# Drop the ID column from both frames, in place.
for frame in (df_train, df_test):
    frame.drop(columns='ID', inplace=True)

# 사고일시
    - 달 : month
    - 시간 : time

In [7]:
# Derive numeric 'month' and 'time' features from '사고일시' in both frames.
# NOTE(review): assumes values look like 'YYYY-MM-DD HH' — confirm against the raw data.
for frame in (df_train, df_test):
    # Whitespace split: column 0 is the date part, column 1 the time part.
    date_and_time = frame['사고일시'].str.split(expand=True)

    # The second '-'-separated token of the date part is the month.
    frame['month'] = date_and_time[0].str.split('-', expand=True)[1]
    frame['time'] = date_and_time[1]

    frame.drop(columns=['사고일시'], inplace=True)

    # Both features are still strings here; cast them to float.
    frame[['month', 'time']] = frame[['month', 'time']].astype('float')

In [8]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale 'month' and 'time'.
# The scaler is fitted on the training data only and then *applied* to the
# test data. Calling fit_transform on the test frame would re-learn min/max
# from the test set, so the same raw value could map to different scaled
# values in train and test.
mms = MinMaxScaler()

df_train[['month', 'time']] = mms.fit_transform(df_train[['month', 'time']])
df_test[['month', 'time']] = mms.transform(df_test[['month', 'time']])

In [9]:
# Column counts of train vs. test after the datetime features.
print(df_train.shape[1], df_test.shape[1])

9 8


# 요일
    - 원-핫

In [10]:
# One-hot encode '요일' and append the indicator columns to each frame.
weekday_train = pd.get_dummies(df_train['요일'])
weekday_test = pd.get_dummies(df_test['요일'])

df_train = pd.concat([df_train, weekday_train], axis=1)
df_test = pd.concat([df_test, weekday_test], axis=1)

In [11]:
# The original '요일' column is no longer needed once it is one-hot encoded.
for frame in (df_train, df_test):
    frame.drop(columns='요일', inplace=True)

In [12]:
# Preview the first three rows of the last nine columns.
df_train.head(3).iloc[:, -9:]

Unnamed: 0,month,time,금요일,목요일,수요일,월요일,일요일,토요일,화요일
0,0.0,0.0,0,0,0,0,0,0,1
1,0.0,0.0,0,0,0,0,0,0,1
2,0.0,0.043478,0,0,0,0,0,0,1


In [13]:
# Column counts of train vs. test after encoding '요일'.
print(df_train.shape[1], df_test.shape[1])

15 14


# 기상상태
    - 방법 1: train에서 '안개'가 포함된 행을 삭제한 뒤 원-핫 인코딩 -> 데이터 개수가 줄어듦
    - 방법 2: 원-핫 인코딩 후 '안개' 컬럼만 삭제 -> '안개' 행의 정보가 데이터에 그대로 남음
    - '기타' 범주는 삭제하지 않고 그대로 사용

In [14]:
# Remove training rows whose '기상상태' is '안개', then renumber the index.
is_fog = df_train['기상상태'] == '안개'
not_use_col_train = df_train.index[is_fog].tolist()

df_train.drop(index=not_use_col_train, inplace=True)
df_train.reset_index(drop=True, inplace=True)

In [15]:
# One-hot encode '기상상태' and append the indicator columns to each frame.
weather_train = pd.get_dummies(df_train['기상상태'])
weather_test = pd.get_dummies(df_test['기상상태'])

df_train = pd.concat([df_train, weather_train], axis=1)
df_test = pd.concat([df_test, weather_test], axis=1)

In [16]:
# Drop the original '기상상태' column now that it is one-hot encoded.
for frame in (df_train, df_test):
    frame.drop(columns='기상상태', inplace=True)

In [17]:
# Preview the first three rows of the last fourteen columns.
df_train.head(3).iloc[:, -14:]

Unnamed: 0,month,time,금요일,목요일,수요일,월요일,일요일,토요일,화요일,기타,눈,맑음,비,흐림
0,0.0,0.0,0,0,0,0,0,0,1,0,0,1,0,0
1,0.0,0.0,0,0,0,0,0,0,1,0,0,0,0,1
2,0.0,0.043478,0,0,0,0,0,0,1,0,0,1,0,0


In [18]:
# Column counts of train vs. test after encoding '기상상태'.
print(df_train.shape[1], df_test.shape[1])

19 18


In [19]:
# Give the generic '기타' indicator a name that records which feature it came from.
weather_other_name = {'기타': "기상상태-기타"}
for frame in (df_train, df_test):
    frame.rename(columns=weather_other_name, inplace=True)

# 시군구

In [20]:
# Build '군구' from the 2nd and 3rd whitespace-separated tokens of '시군구'.
# The split is done once per frame and reused for both tokens.
sigungu_train = df_train['시군구'].str.split(expand=True)
sigungu_test = df_test['시군구'].str.split(expand=True)

df_train['군구'] = sigungu_train[1] + ' ' + sigungu_train[2]
df_test['군구'] = sigungu_test[1] + ' ' + sigungu_test[2]

In [21]:
# Drop the original '시군구' column; '군구' replaces it.
for frame in (df_train, df_test):
    frame.drop(columns='시군구', inplace=True)

In [22]:
# Distinct '군구' values of each frame, in value_counts() order.
gu_train, gu_test = (
    frame['군구'].value_counts().index.tolist()
    for frame in (df_train, df_test)
)

In [23]:
# '군구' values that occur in train but never in test (order of gu_train kept).
not_gu_use_col = [gu for gu in gu_train if gu not in gu_test]

In [24]:
# Display the '군구' values found in train but not in test.
not_gu_use_col

['동구 신무동', '동구 둔산동', '동구 내동', '중구 서야동', '중구 장관동', '중구 도원동', '북구 도남동']

In [25]:
# Drop every training row whose '군구' does not occur in the test set, so
# that one-hot encoding '군구' yields the same categories for both frames.
# A single isin() mask replaces the per-category loop.
not_use_col = df_train.index[df_train['군구'].isin(not_gu_use_col)].tolist()
df_train.drop(index=not_use_col, inplace=True)

# reset_index returns a new frame unless inplace=True; the previous call
# discarded its result, so the index was never actually renumbered.
df_train.reset_index(drop=True, inplace=True)

In [26]:
# Number of distinct '군구' values in train and test.
df_train['군구'].nunique(), df_test['군구'].nunique()

(192, 192)

In [27]:
# Replace '군구' with its one-hot indicator columns (appended at the end).
gungu_train = pd.get_dummies(df_train['군구'])
gungu_test = pd.get_dummies(df_test['군구'])

df_train = pd.concat([df_train.drop(columns=['군구']), gungu_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['군구']), gungu_test], axis=1)

In [28]:
# Column counts of train vs. test after encoding '군구'.
print(df_train.shape[1], df_test.shape[1])

210 209


# 도로형태

In [29]:
# Replace '도로형태' with its one-hot indicator columns (appended at the end).
road_type_train = pd.get_dummies(df_train['도로형태'])
road_type_test = pd.get_dummies(df_test['도로형태'])

df_train = pd.concat([df_train.drop(columns=['도로형태']), road_type_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['도로형태']), road_type_test], axis=1)

In [30]:
# Column counts of train vs. test after encoding '도로형태'.
print(df_train.shape[1], df_test.shape[1])

220 219


# 노면상태

In [31]:
# Replace '노면상태' with its one-hot indicator columns (appended at the end).
surface_train = pd.get_dummies(df_train['노면상태'])
surface_test = pd.get_dummies(df_test['노면상태'])

df_train = pd.concat([df_train.drop(columns=['노면상태']), surface_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['노면상태']), surface_test], axis=1)

In [32]:
# Column counts of train vs. test after encoding '노면상태'.
print(df_train.shape[1], df_test.shape[1])

225 224


In [33]:
# Give the generic '기타' indicator a name that records which feature it came from.
surface_other_name = {'기타': "노면상태-기타"}
for frame in (df_train, df_test):
    frame.rename(columns=surface_other_name, inplace=True)

# 사고유형

In [34]:
# Replace '사고유형' with its one-hot indicator columns (appended at the end).
accident_train = pd.get_dummies(df_train['사고유형'])
accident_test = pd.get_dummies(df_test['사고유형'])

df_train = pd.concat([df_train.drop(columns=['사고유형']), accident_train], axis=1)
df_test = pd.concat([df_test.drop(columns=['사고유형']), accident_test], axis=1)

In [35]:
# Column counts of train vs. test after encoding '사고유형'.
print(df_train.shape[1], df_test.shape[1])

227 226


# 데이터 저장

In [36]:
# Persist the preprocessed frames as UTF-8 CSV files, without the index column.
df_train.to_csv("../data/df_train_005.csv", index=False, encoding='utf-8')
df_test.to_csv("../data/df_test_005.csv", index=False, encoding='utf-8')