## Import data

In [3]:
import pandas as pd 
import os
import random
import numpy as np
import matplotlib.pyplot as plt


path = "/mnt/d/data/accident/"

train_org = pd.read_csv(path + 'train.csv') 
test_org = pd.read_csv(path + 'test.csv')

sample_submission = pd.read_csv(path+"sample_submission.csv")

## Set seed

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

## train, test 데이터 기간 확인

In [5]:
display(f"train : {train_org.iloc[0]['사고일시']} ~ {train_org.iloc[-1]['사고일시']}")
display(f"test : {test_org.iloc[0]['사고일시']} ~ {test_org.iloc[-1]['사고일시']}")     

'train : 2019-01-01 00 ~ 2021-12-31 23'

'test : 2022-01-01 01 ~ 2022-12-31 21'

# **데이터 전처리**  

In [6]:
train_df = train_org.copy()
test_df = test_org.copy()

time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 

train_df[['연', '월', '일', '시간']] = train_org['사고일시'].str.extract(time_pattern)
train_df[['연', '월', '일', '시간']] = train_df[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다 
train_df = train_df.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다 

# 해당 과정을 test_x에 대해서도 반복해줍니다 
test_df[['연', '월', '일', '시간']] = test_org['사고일시'].str.extract(time_pattern)
test_df[['연', '월', '일', '시간']] = test_df[['연', '월', '일', '시간']].apply(pd.to_numeric)
test_df = test_df.drop(columns=['사고일시'])

location_pattern = r'(\S+) (\S+) (\S+)'

train_df[['도시', '구', '동']] = train_org['시군구'].str.extract(location_pattern)
train_df = train_df.drop(columns=['시군구'])

test_df[['도시', '구', '동']] = test_org['시군구'].str.extract(location_pattern)
test_df = test_df.drop(columns=['시군구'])

road_pattern = r'(.+) - (.+)'

train_df[['도로형태1', '도로형태2']] = train_org['도로형태'].str.extract(road_pattern)
train_df = train_df.drop(columns=['도로형태'])

test_df[['도로형태1', '도로형태2']] = test_org['도로형태'].str.extract(road_pattern)
test_df = test_df.drop(columns=['도로형태'])

In [7]:
train_org.corr()

  train_org.corr()


Unnamed: 0,사망자수,중상자수,경상자수,부상자수,ECLO
사망자수,1.0,-0.004368,-0.060607,-0.022771,0.218507
중상자수,-0.004368,1.0,-0.325585,-0.105167,0.46452
경상자수,-0.060607,-0.325585,1.0,-0.165157,0.63637
부상자수,-0.022771,-0.105167,-0.165157,1.0,-0.118713
ECLO,0.218507,0.46452,0.63637,-0.118713,1.0


## Use additional data

In [18]:
light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)

  light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]


In [19]:
child_area_df = pd.read_csv(os.path.join(path, "external_open/child.csv"), encoding='cp949')[['CCTV설치대수', '소재지지번주소']]
child_area_df['보호구역수'] = 1

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()
child_area_df.reset_index(inplace=True, drop=True)

In [20]:
parking_df = pd.read_csv(os.path.join(path, "external_open/parking.csv"), encoding='cp949')[['소재지지번주소', '급지구분', "주차구획수"]]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

In [21]:
train_df["구"].unique()

array(['중구', '달서구', '수성구', '북구', '동구', '서구', '달성군', '남구'], dtype=object)

In [22]:
train_df

Unnamed: 0,ID,요일,기상상태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,...,ECLO,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,ACCIDENT_00000,화요일,맑음,건조,차대사람,길가장자리구역통행중,안전운전불이행,승용,여,51세,...,5,2019,1,1,0,대구광역시,중구,대신동,단일로,기타
1,ACCIDENT_00001,화요일,흐림,건조,차대사람,보도통행중,기타,승용,남,39세,...,3,2019,1,1,0,대구광역시,달서구,감삼동,단일로,기타
2,ACCIDENT_00002,화요일,맑음,건조,차대사람,차도통행중,안전운전불이행,승용,남,70세,...,3,2019,1,1,1,대구광역시,수성구,두산동,단일로,기타
3,ACCIDENT_00003,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,49세,...,5,2019,1,1,2,대구광역시,북구,복현동,단일로,기타
4,ACCIDENT_00004,화요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,30세,...,3,2019,1,1,4,대구광역시,동구,신암동,단일로,기타
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,ACCIDENT_39604,금요일,맑음,건조,차대차,측면충돌,신호위반,승용,여,52세,...,3,2021,12,31,19,대구광역시,수성구,수성동3가,교차로,교차로안
39605,ACCIDENT_39605,금요일,맑음,건조,차대차,측면충돌,안전거리미확보,승용,여,60세,...,3,2021,12,31,19,대구광역시,달서구,상인동,단일로,기타
39606,ACCIDENT_39606,금요일,맑음,건조,차대차,측면충돌,교차로운행방법위반,승용,남,60세,...,10,2021,12,31,21,대구광역시,달서구,월성동,교차로,교차로안
39607,ACCIDENT_39607,금요일,맑음,건조,차대차,추돌,안전운전불이행,승용,남,40세,...,3,2021,12,31,22,대구광역시,달서구,장동,기타,기타


### Merge with original data

In [23]:
train_df = pd.merge(train_df, light_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, child_area_df, how='left', on=['도시', '구', '동'])
train_df = pd.merge(train_df, parking_df, how='left', on=['도시', '구', '동'])

test_df = pd.merge(test_df, light_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, child_area_df, how='left', on=['도시', '구', '동'])
test_df = pd.merge(test_df, parking_df, how='left', on=['도시', '구', '동'])

## Drop labels not included in test_x

In [24]:
test_x = test_df.drop(columns=['ID']).copy()
train_x = train_df[test_x.columns].copy()
train_y = train_df['ECLO'].copy()
train_ys = train_df[['중상자수', '경상자수', '부상자수']]

## **범주형(Categorical) 변수, 수치형 변수로 변환하기**

모델 학습을 위해 train_x의 문자열 타입의 컬럼들을 추출하고, LabelEncoder를 활용하여 이 컬럼들을 모두 수치형 변수로 변환해 보겠습니다

In [25]:
from sklearn.preprocessing import LabelEncoder

categorical_features = list(train_x.dtypes[train_x.dtypes == "object"].index)
# 추출된 문자열 변수 확인
display(categorical_features)

for i in categorical_features:
    le = LabelEncoder()
    le=le.fit(train_x[i]) 
    train_x[i]=le.transform(train_x[i])
    
    test_x[i]=le.transform(test_x[i])

['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']

In [26]:
train_x.fillna(0, inplace=True)
test_x.fillna(0, inplace=True)

# Model Train & Prediction

## Make dataset

In [27]:
train_x.columns

Index(['요일', '기상상태', '노면상태', '사고유형', '연', '월', '일', '시간', '도시', '구', '동',
       '도로형태1', '도로형태2', '설치개수', 'CCTV설치대수', '보호구역수', '주차구획수', '급지구분_1',
       '급지구분_2', '급지구분_3'],
      dtype='object')

In [28]:
train_x

Unnamed: 0,요일,기상상태,노면상태,사고유형,연,월,일,시간,도시,구,동,도로형태1,도로형태2,설치개수,CCTV설치대수,보호구역수,주차구획수,급지구분_1,급지구분_2,급지구분_3
0,6,2,0,0,2019,1,1,0,0,7,40,2,5,391.0,13.0,2.0,500.0,11.0,0.0,0.0
1,6,5,0,0,2019,1,1,0,0,1,4,2,5,932.0,0.0,0.0,114.0,0.0,1.0,3.0
2,6,2,0,0,2019,1,1,1,0,6,66,2,5,473.0,0.0,5.0,0.0,0.0,0.0,0.0
3,6,2,0,1,2019,1,1,2,0,4,79,2,5,534.0,32.0,11.0,374.0,0.0,9.0,5.0
4,6,2,0,1,2019,1,1,4,0,3,129,2,5,2057.0,0.0,0.0,63.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,2,0,1,2021,12,31,19,0,6,118,0,3,0.0,0.0,1.0,0.0,0.0,0.0,0.0
39605,0,2,0,1,2021,12,31,19,0,1,103,2,5,843.0,0.0,0.0,466.0,0.0,0.0,5.0
39606,0,2,0,1,2021,12,31,21,0,1,144,0,3,164.0,0.0,0.0,32.0,0.0,1.0,0.0
39607,0,2,0,1,2021,12,31,22,0,1,158,1,5,210.0,0.0,0.0,188.0,0.0,0.0,1.0


In [17]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset

In [18]:
class CustomDataset(Dataset):
    def __init__(self, x, y):
        super(CustomDataset, self).__init__()
        self.x = x
        self.y = y
        # 텐서 변환
        self.x = torch.tensor(self.x.values).float()
        self.y = torch.tensor(self.y).float()
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        y = self.y[idx]
        return x, y

In [19]:
dataset = CustomDataset(train_x, train_y)

In [20]:
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [20]:
def rmsle(pred, target):
    pred = pred.contiguous()
    target = target.contiguous()
    loss = torch.square(torch.log1p(pred) - torch.log1p(target))

    return(torch.sqrt(loss.mean()))

In [71]:
def loss_fn(pred, target):
    return rmsle(pred, target)

In [74]:
print ("PyTorch version:[%s]."%(torch.__version__))
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print ("device:[%s]."%(device))

PyTorch version:[1.13.1+cu117].
device:[cuda:0].


In [90]:
class DenseModel(nn.Module):
    def __init__(self, input_dim = 10):
        super(DenseModel, self).__init__()
        self.input_dim = input_dim


        self.layers = []
        self.layers.append(nn.BatchNorm1d(input_dim))
        self.layers.append(nn.Linear(input_dim, 16, bias=True))
        self.layers.append(nn.ReLU(True))
        self.layers.append(nn.Linear(16, 32, bias=True))
        self.layers.append(nn.ReLU(True))
        self.layers.append(nn.Linear(32, 1, bias=True))

        self.net = nn.Sequential(*self.layers)


    def forward(self,x):
        return self.net(x)     
    
model = DenseModel(len(train_x.columns)).to(device)
optm = optim.Adam(model.parameters(),lr=1e-3)

In [92]:
# 최대 반복 횟수 정의
num_epoch = 50
# loss 기록하기 위한 list 정의
losses = []
for epoch in range(num_epoch):
    # loss 초기화
    running_loss = 0
    for x, y in dataloader:
        # x, y 데이터를 device 에 올립니다. (cuda:0 혹은 cpu)
        x = x.to(device)
        y = y.to(device)
    
        # 그라디언트 초기화 (초기화를 수행하지 않으면 계산된 그라디언트는 누적됩니다.)
        optm.zero_grad()

        # output 계산: model의 __call__() 함수 호출
        y_hat =  model(x)

        # 손실(loss) 계산
        loss = loss_fn(y, y_hat)

        # 미분 계산
        loss.backward()

        # 경사하강법 계산 및 적용
        optm.step()

        # 배치별 loss 를 누적합산 합니다.
        running_loss += loss.item()
        
    # 누적합산된 배치별 loss값을 배치의 개수로 나누어 Epoch당 loss를 산출합니다.
    loss = running_loss / len(dataloader)
    losses.append(loss)

    # 20번의 Epcoh당 출력합니다.
    if epoch % 20 == 0:
        print("{0:05d} loss = {1:.5f}".format(epoch, loss))
    
print("----" * 15)
print("{0:05d} loss = {1:.5f}".format(epoch, loss))

00000 loss = 0.52055
00020 loss = 0.44816
00040 loss = 0.44816
------------------------------------------------------------
00049 loss = 0.44785


In [103]:
np.unique(model(x).detach().cpu().numpy())

array([4.0955486, 4.095549 , 4.095565 , 4.0955806, 4.095581 ],
      dtype=float32)

In [93]:
class TestDataset(Dataset):
    def __init__(self, x):
        super(TestDataset, self).__init__()
        self.x = x
        # 텐서 변환
        self.x = torch.tensor(self.x.values).float()
        
    def __len__(self):
        return len(self.x)
    
    def __getitem__(self, idx):
        x = self.x[idx]
        return x

In [94]:
testset = TestDataset(test_x)
testloader = DataLoader(testset, batch_size=32, shuffle=True)

In [95]:
predictions = []
model.eval()
with torch.no_grad():
    for batch in testloader:
        batch_in = batch.to(device)
        pred = model(batch_in)

        predictions.append(pred.cpu().numpy())
all_predictions = np.concatenate(predictions, axis=0)

In [96]:
all_predictions

array([[4.0955334],
       [4.0955334],
       [4.0955334],
       ...,
       [4.095533 ],
       [4.095534 ],
       [4.095533 ]], dtype=float32)

In [97]:
np.unique(all_predictions)

array([4.095533 , 4.0955334, 4.095534 , 4.0955343], dtype=float32)

## **Submission 양식 확인**

sample_submission.csv 화일 데이터(sample_submission)를 그대로 복사한 후, 
양식의 'ECLO' 컬럼에 test_x에 대한 ECLO(y) 예측값을 입력합니다 

In [84]:
baseline_submission = sample_submission.copy()
baseline_submission['ECLO'] = all_predictions
baseline_submission 

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.058453
1,ACCIDENT_39610,4.058453
2,ACCIDENT_39611,4.058453
3,ACCIDENT_39612,4.058453
4,ACCIDENT_39613,4.058453
...,...,...
10958,ACCIDENT_50567,4.058453
10959,ACCIDENT_50568,4.058453
10960,ACCIDENT_50569,4.058453
10961,ACCIDENT_50570,4.058453


## **답안지 저장 및 제출하기**

In [85]:
baseline_submission.to_csv('baseline_submit.csv', index=False)