## Import data

In [21]:
import pandas as pd 
import os
import random
import numpy as np
import matplotlib.pyplot as plt


# Root folder of the competition files (local absolute path; adjust per machine).
path = "/mnt/d/data/accident/"

# Competition splits, kept unmodified under the *_org names.
train_org = pd.read_csv(os.path.join(path, 'train.csv'))
test_org = pd.read_csv(os.path.join(path, 'test.csv'))

# Submission template that is filled in at the end of the notebook.
sample_submission = pd.read_csv(os.path.join(path, "sample_submission.csv"))

## Set seed

In [22]:
def seed_everything(seed):
    """Seed the random sources used by this notebook.

    Exports ``PYTHONHASHSEED`` to the process environment and seeds both
    Python's ``random`` module and NumPy's global generator with the same value.

    Args:
        seed: Seed value; it is stringified for the environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(42)

## train, test 데이터 기간 확인

In [23]:
# Show the '사고일시' value of the first and last row of each split.
# NOTE(review): this is the covered period only if rows are in chronological order — confirm.
train_first, train_last = train_org.iloc[0]['사고일시'], train_org.iloc[-1]['사고일시']
test_first, test_last = test_org.iloc[0]['사고일시'], test_org.iloc[-1]['사고일시']

display(f"train : {train_first} ~ {train_last}")
display(f"test : {test_first} ~ {test_last}")

'train : 2019-01-01 00 ~ 2021-12-31 23'

'test : 2022-01-01 01 ~ 2022-12-31 21'

# import pretrain dataset

In [24]:
# Load the country-wide accident table from the external_open folder under `path`.
data = pd.read_csv(os.path.join(path, "external_open/countrywide_accident.csv"))

In [25]:
# List the columns of the competition training split.
train_org.columns

Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형',
       '사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
       '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도', '사망자수', '중상자수',
       '경상자수', '부상자수', 'ECLO'],
      dtype='object')

In [26]:
# List the columns of the country-wide table for comparison with the training split.
data.columns

Index(['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형',
       '사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도',
       '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도', '사망자수', '중상자수',
       '경상자수', '부상자수', 'ECLO'],
      dtype='object')

In [27]:
# Work on a copy so the raw country-wide frame `data` is left as loaded.
pre_df = data.copy()

# Regexes that break the composite text columns into their parts.
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 
location_pattern = r'(\S+) (\S+) (\S+)'
road_pattern = r'(.+) - (.+)'

# '사고일시' -> year / month / day / hour, converted from text to numbers.
pre_df[['연', '월', '일', '시간']] = data['사고일시'].str.extract(time_pattern).apply(pd.to_numeric)

# '시군구' -> three whitespace-separated address tokens.
pre_df[['도시', '구', '동']] = data['시군구'].str.extract(location_pattern)

# '도로형태' -> the parts before and after ' - '.
pre_df[['도로형태1', '도로형태2']] = data['도로형태'].str.extract(road_pattern)

# The source columns are no longer needed once their parts are extracted.
pre_df = pre_df.drop(columns=['사고일시', '시군구', '도로형태'])

In [28]:
# Display the country-wide frame after the column splits.
pre_df

Unnamed: 0,ID,요일,기상상태,노면상태,사고유형,사고유형 - 세부분류,법규위반,가해운전자 차종,가해운전자 성별,가해운전자 연령,...,ECLO,연,월,일,시간,도시,구,동,도로형태1,도로형태2
0,COUNTRYWIDE_ACCIDENT_000000,화요일,맑음,건조,차대사람,횡단중,보행자보호의무위반,승용,남,26세,...,3,2019,1,1,0,서울특별시,강서구,방화동,교차로,교차로횡단보도내
1,COUNTRYWIDE_ACCIDENT_000001,화요일,맑음,건조,차대차,기타,직진우회전진행방해,승용,남,54세,...,3,2019,1,1,0,경기도,포천시,소흘읍,교차로,교차로안
2,COUNTRYWIDE_ACCIDENT_000002,화요일,맑음,건조,차대차,측면충돌,신호위반,승용,여,57세,...,5,2019,1,1,0,경기도,양주시,고암동,교차로,교차로안
3,COUNTRYWIDE_ACCIDENT_000003,화요일,맑음,건조,차대사람,횡단중,보행자보호의무위반,승용,여,65세,...,10,2019,1,1,0,경기도,남양주시,다산동,기타,기타
4,COUNTRYWIDE_ACCIDENT_000004,화요일,맑음,건조,차대차,측면충돌,신호위반,승용,여,74세,...,7,2019,1,1,0,인천광역시,부평구,부평동,교차로,교차로안
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
602770,COUNTRYWIDE_ACCIDENT_602770,금요일,흐림,건조,차대차,기타,신호위반,이륜,남,31세,...,8,2021,12,31,23,부산광역시,해운대구,우동,교차로,교차로안
602771,COUNTRYWIDE_ACCIDENT_602771,금요일,맑음,건조,차대차,측면충돌,안전거리미확보,승용,남,25세,...,6,2021,12,31,23,서울특별시,성동구,행당동,단일로,기타
602772,COUNTRYWIDE_ACCIDENT_602772,금요일,맑음,건조,차대차,측면충돌,안전운전불이행,승용,남,49세,...,9,2021,12,31,23,대전광역시,유성구,원신흥동,교차로,교차로안
602773,COUNTRYWIDE_ACCIDENT_602773,금요일,맑음,젖음/습기,차량단독,기타,안전운전불이행,원동기,남,32세,...,1,2021,12,31,23,경기도,화성시,서신면,기타,기타


# **데이터 전처리**  

In [29]:
# Regexes that break the composite text columns into their parts.
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 
location_pattern = r'(\S+) (\S+) (\S+)'
road_pattern = r'(.+) - (.+)'


def _split_composite_columns(frame, time_re=time_pattern, location_re=location_pattern, road_re=road_pattern):
    """Return a copy of ``frame`` with its composite text columns expanded.

    The patterns are bound as defaults at definition time, so a later
    reassignment of the module-level pattern names does not change what this
    helper does.

    Args:
        frame: DataFrame holding the '사고일시', '시군구' and '도로형태' columns.
        time_re: Regex with four groups (year, month, day, hour).
        location_re: Regex with three groups (address tokens).
        road_re: Regex with two groups (road type before / after ' - ').

    Returns:
        A new DataFrame in which the three source columns are replaced by
        '연', '월', '일', '시간' (numeric), '도시', '구', '동', '도로형태1'
        and '도로형태2', appended in that order. Rows where a pattern does not
        match get missing values in the corresponding new columns.
    """
    out = frame.copy()

    # The extracted date parts are strings; convert them to numbers.
    out[['연', '월', '일', '시간']] = frame['사고일시'].str.extract(time_re).apply(pd.to_numeric)
    out[['도시', '구', '동']] = frame['시군구'].str.extract(location_re)
    out[['도로형태1', '도로형태2']] = frame['도로형태'].str.extract(road_re)

    # The source columns are dropped once their information has been extracted.
    return out.drop(columns=['사고일시', '시군구', '도로형태'])


# Apply the identical transformation to both splits so their feature columns agree.
train_df = _split_composite_columns(train_org)
test_df = _split_composite_columns(test_org)

## Use additional data

In [30]:
# Street-light records: keep only the installation count and the lot-number address.
light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]

# Four whitespace-separated address tokens; rows with fewer tokens do not match
# and get missing values in the new columns.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])

# Sum the counts per (도시, 구, 동). reset_index() already yields a fresh
# 0..n-1 index, so no further in-place index reset is needed.
light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()

  light_df = pd.read_csv(os.path.join(path, "external_open/light.csv"), encoding='cp949')[['설치개수', '소재지지번주소']]


In [31]:
# Child-protection-zone records: keep the CCTV count and the lot-number address.
child_area_df = pd.read_csv(os.path.join(path, "external_open/child.csv"), encoding='cp949')[['CCTV설치대수', '소재지지번주소']]
# Constant 1 per record, so the group sum below counts records per area.
child_area_df['보호구역수'] = 1

# Four whitespace-separated address tokens; rows with fewer tokens do not match
# and get missing values in the new columns.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

# Sum per (도시, 구, 동). reset_index() already yields a fresh 0..n-1 index,
# so no further in-place index reset is needed.
child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()

In [32]:
# Parking-lot records: lot-number address, grade category and number of parking slots.
parking_df = pd.read_csv(os.path.join(path, "external_open/parking.csv"), encoding='cp949')[['소재지지번주소', '급지구분', "주차구획수"]]
# One indicator column per '급지구분' value, so the group sum below counts lots per grade.
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

# Four whitespace-separated address tokens; rows with fewer tokens do not match
# and get missing values in the new columns.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

# Sum per (도시, 구, 동). reset_index() already yields a fresh 0..n-1 index,
# so no further in-place index reset is needed.
parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()

In [33]:
# Display the street-light counts aggregated per (도시, 구, 동).
light_df

Unnamed: 0,도시,구,동,설치개수
0,대구광역시,남구,대명동,5377
1,대구광역시,남구,봉덕동,1424
2,대구광역시,남구,이천동,556
3,대구광역시,달서구,갈산동,349
4,대구광역시,달서구,감삼동,932
...,...,...,...,...
223,대구광역시,중구,태평로2가,38
224,대구광역시,중구,태평로3가,47
225,대구광역시,중구,포정동,18
226,대구광역시,중구,향촌동,28


In [34]:
# Display the child-protection-zone aggregates per (도시, 구, 동).
child_area_df

Unnamed: 0,도시,구,동,CCTV설치대수,보호구역수
0,대구광역시,남구,대명동,106.0,26
1,대구광역시,남구,봉덕동,34.0,8
2,대구광역시,남구,이천동,22.0,6
3,대구광역시,달성군,가창면,0.0,8
4,대구광역시,달성군,구지면,4.0,6
...,...,...,...,...,...
66,대구광역시,중구,봉산동,18.0,2
67,대구광역시,중구,삼덕동2가,9.0,1
68,대구광역시,중구,삼덕동3가,11.0,1
69,대구광역시,중구,서문로1가,8.0,1


In [35]:
# Display the parking aggregates per (도시, 구, 동).
parking_df

Unnamed: 0,도시,구,동,주차구획수,급지구분_1,급지구분_2,급지구분_3
0,대구광역시,남구,대명동,709,20,1,0
1,대구광역시,남구,봉덕동,527,9,3,0
2,대구광역시,남구,이천동,43,3,0,0
3,대구광역시,달서구,갈산동,242,0,0,4
4,대구광역시,달서구,감삼동,114,0,1,3
...,...,...,...,...,...,...,...
131,대구광역시,중구,태평로2가,178,5,0,0
132,대구광역시,중구,태평로3가,10,1,0,0
133,대구광역시,중구,포정동,127,4,0,0
134,대구광역시,중구,향촌동,80,2,0,0


### Merge with original data

In [36]:
# Left-join every area-level table onto both splits; accident rows whose area
# has no match keep missing values in the joined columns.
area_keys = ['도시', '구', '동']

for area_table in (light_df, child_area_df, parking_df):
    train_df = train_df.merge(area_table, how='left', on=area_keys)
    test_df = test_df.merge(area_table, how='left', on=area_keys)

## Drop columns not included in test_x

In [37]:
# Features are every test column except the identifier; the training matrix is
# restricted to exactly those columns so both splits share one schema.
feature_columns = test_df.columns.drop('ID')

test_x = test_df[feature_columns].copy()
train_x = train_df[feature_columns].copy()

# Regression target.
train_y = train_df['ECLO'].copy()

## **범주형(Categorical) 변수, 수치형 변수로 변환하기**

모델 학습을 위해 train_x의 문자열 타입의 컬럼들을 추출하고, LabelEncoder를 활용하여 이 컬럼들을 모두 수치형 변수로 변환해 보겠습니다

In [38]:
from sklearn.preprocessing import LabelEncoder

# Columns stored as object dtype are treated as categorical.
categorical_features = [name for name, dtype in train_x.dtypes.items() if dtype == "object"]
# Show which string columns were picked up.
display(categorical_features)

# Fit one encoder per column on the training values and reuse it for the test values.
# NOTE(review): transform is expected to fail on a test value never seen in train — confirm against the data.
for column in categorical_features:
    encoder = LabelEncoder().fit(train_x[column])
    train_x[column] = encoder.transform(train_x[column])
    test_x[column] = encoder.transform(test_x[column])

['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']

In [39]:
# Replace the remaining missing values with 0 in both feature matrices, in place.
for feature_matrix in (train_x, test_x):
    feature_matrix.fillna(0, inplace=True)

# Model Train & Prediction

In [40]:
from sklearn.ensemble import RandomForestRegressor

# Give the forest its own seed so the fit does not depend on how much of
# NumPy's global random stream earlier cells consumed. Train the trees on all
# cores: n_jobs only affects wall-clock time, not the fitted model.
model = RandomForestRegressor(random_state=42, n_jobs=-1)
model.fit(train_x, train_y)

# Baseline predictions for the test features.
prediction = model.predict(test_x)

KeyboardInterrupt: 

In [None]:
from autogluon.tabular import TabularDataset, TabularPredictor

# NOTE: `train_x` is rebound here and from now on also carries the target
# under the column name "label".
train_x = TabularDataset(train_x)
train_x["label"] = train_y

# Regression on "label", scored with RMSE.
predictor = TabularPredictor(
    label="label",
    eval_metric="root_mean_squared_error",
    problem_type="regression",
).fit(train_x)

No path specified. Models will be saved in: "AutogluonModels/ag-20231127_133048"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20231127_133048"
AutoGluon Version:  0.8.2
Python Version:     3.10.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Thu Oct 5 21:02:42 UTC 2023
Disk Space Avail:   1938.07 GB / 2000.40 GB (96.9%)
Train Data Rows:    39609
Train Data Columns: 20
Label Column: label
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    10158.44 MB
	Train Data (Original)  Memory Usage: 6.34 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatur

In [None]:
# Show the per-model leaderboard of the fitted predictor.
predictor.leaderboard()

                  model  score_val  pred_time_val   fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -3.154248       0.101439  19.812445                0.000222           0.146062            2       True         12
1              CatBoost  -3.154663       0.001944   0.615816                0.001944           0.615816            1       True          6
2              LightGBM  -3.161942       0.001652   0.176849                0.001652           0.176849            1       True          4
3               XGBoost  -3.162130       0.004154   0.377868                0.004154           0.377868            1       True          9
4       NeuralNetFastAI  -3.168732       0.018366  15.781750                0.018366          15.781750            1       True          8
5            LightGBMXT  -3.171463       0.002525   0.572522                0.002525           0.572522            1       True          3
6         LightGBMLarge  -3

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-3.154248,0.101439,19.812445,0.000222,0.146062,2,True,12
1,CatBoost,-3.154663,0.001944,0.615816,0.001944,0.615816,1,True,6
2,LightGBM,-3.161942,0.001652,0.176849,0.001652,0.176849,1,True,4
3,XGBoost,-3.16213,0.004154,0.377868,0.004154,0.377868,1,True,9
4,NeuralNetFastAI,-3.168732,0.018366,15.78175,0.018366,15.78175,1,True,8
5,LightGBMXT,-3.171463,0.002525,0.572522,0.002525,0.572522,1,True,3
6,LightGBMLarge,-3.174114,0.001978,0.354304,0.001978,0.354304,1,True,11
7,NeuralNetTorch,-3.203236,0.008327,11.732957,0.008327,11.732957,1,True,10
8,RandomForestMSE,-3.273199,0.076752,2.890948,0.076752,2.890948,1,True,5
9,ExtraTreesMSE,-3.287109,0.066056,1.411693,0.066056,1.411693,1,True,7


In [None]:
# Predict the target for the test features with the fitted predictor.
prediction = predictor.predict(test_x)
# Preview the predictions just computed. This cell used to reference `y_pred`,
# a name that no cell in the notebook defines.
prediction.head()

  self._init_pool(data, label, cat_features, text_features, embedding_features, pairs, weight,


0    4.459158
1    4.119506
2    5.926070
3    5.229726
4    5.027195
Name: label, dtype: float32

## **Submission 양식 확인**

sample_submission.csv 파일 데이터(sample_submission)를 그대로 복사한 후,
양식의 'ECLO' 컬럼에 test_x에 대한 ECLO(y) 예측값을 입력합니다.

In [None]:
# Copy the submission template and put the predictions into its 'ECLO' column.
baseline_submission = sample_submission.assign(ECLO=prediction)
baseline_submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.459158
1,ACCIDENT_39610,4.119506
2,ACCIDENT_39611,5.926070
3,ACCIDENT_39612,5.229726
4,ACCIDENT_39613,5.027195
...,...,...
10958,ACCIDENT_50567,5.593705
10959,ACCIDENT_50568,4.927447
10960,ACCIDENT_50569,5.318663
10961,ACCIDENT_50570,5.268946


## **답안지 저장 및 제출하기**

In [None]:
# Write the submission to the working directory without the index column.
baseline_submission.to_csv('baseline_submit.csv', index=False)