https://dacon.io/competitions/official/236193/codeshare/9472?page=1&dtype=recent

In [1]:
import warnings
# Suppress every warning for the rest of the session so cell output stays readable.
# Note that this also hides deprecation notices and pandas warnings.
warnings.filterwarnings("ignore")

In [2]:
#hiddencell
# from pbl_tools import *

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm

# Register the MaruBuri font file with matplotlib and make it the default family.
# The file path is relative, so the .otf must be in the working directory.
# NOTE(review): presumably needed so Korean labels render in plots - confirm.
fe = fm.FontEntry(fname = 'MaruBuri-Regular.otf', name = 'MaruBuri')
fm.fontManager.ttflist.insert(0, fe)  # insert at the front of matplotlib's font list
plt.rc('font', family='MaruBuri')

In [3]:
import os
import random
import numpy as np
import pandas as pd 

def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Args:
        seed: Integer seed. It is applied to Python's ``random`` module and to
            NumPy's global generator, and its string form is stored in the
            ``PYTHONHASHSEED`` environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(77)

In [4]:
# Load the competition tables: training data, test data and the submission template.
# NOTE(review): these are absolute Windows paths - adjust them when running elsewhere.
train_org = pd.read_csv('D:/Daegu_Data/train.csv') 
test_org = pd.read_csv('D:/Daegu_Data/test.csv')
sample_submission = pd.read_csv("D:/Daegu_Data/sample_submission.csv")

In [5]:
# Drop the row identifier from the raw tables. errors='ignore' keeps this cell
# re-runnable: on a second run 'ID' is already gone and a plain drop would raise
# KeyError.
train_org.drop(columns=['ID'], errors='ignore', inplace=True)
test_org.drop(columns=['ID'], errors='ignore', inplace=True)

# '사고일시' (accident timestamp): "<year>-<month>-<day> <hour>"
time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})'
# '시군구' (district): three whitespace-separated parts -> city, gu, dong
location_pattern = r'(\S+) (\S+) (\S+)'
# '도로형태' (road type): "<major type> - <minor type>"
road_pattern = r'(.+) - (.+)'


def _expand_raw_columns(raw_df):
    """Return a copy of ``raw_df`` with its composite text columns split up.

    The timestamp column '사고일시' becomes the numeric columns
    '연', '월', '일', '시간'; '시군구' becomes '도시', '구', '동'; and
    '도로형태' becomes '도로형태1', '도로형태2'. The three source columns are
    removed from the result. Rows whose text does not match a pattern get NaN
    in the corresponding new columns.

    Args:
        raw_df: DataFrame holding the '사고일시', '시군구' and '도로형태' columns.

    Returns:
        A new DataFrame; ``raw_df`` itself is not modified.
    """
    expanded = raw_df.copy()

    time_cols = ['연', '월', '일', '시간']
    expanded[time_cols] = expanded['사고일시'].str.extract(time_pattern)
    # The extracted pieces are strings; convert them to numbers.
    expanded[time_cols] = expanded[time_cols].apply(pd.to_numeric)

    expanded[['도시', '구', '동']] = expanded['시군구'].str.extract(location_pattern)
    expanded[['도로형태1', '도로형태2']] = expanded['도로형태'].str.extract(road_pattern)

    # The source columns are no longer needed once their parts are extracted.
    return expanded.drop(columns=['사고일시', '시군구', '도로형태'])


# Apply the same expansion to train and test so both keep the same derived columns.
train_df = _expand_raw_columns(train_org)
test_df = _expand_raw_columns(test_org)

#### Add External Data

In [6]:
# Load three external Daegu tables (cp949-encoded CSVs), keeping only the columns used below:
#   light_df      - security lights: installed count + lot-number address
#   parking_df    - parking lots: lot-number address + '급지구분' column
#   child_area_df - child protection zones: de-duplicated rows, lot-number address only
light_df = pd.read_csv('D:/Daegu_Data/external_open/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소']]
parking_df = pd.read_csv('D:/Daegu_Data/external_open/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분']]
child_area_df = pd.read_csv('D:/Daegu_Data/external_open/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소']]

In [7]:
# Lot-number addresses are split into four whitespace-separated parts:
# city, gu, dong and the lot number.
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'


def _sum_by_dong(frame):
    """Collapse an external table to one summed row per (도시, 구, 동).

    The '소재지지번주소' address is split with ``location_pattern``; the address
    and lot-number columns are then discarded and the remaining columns are
    summed within each city/gu/dong group. The split columns are written onto
    ``frame`` itself before the grouped result is built.
    """
    address_parts = frame['소재지지번주소'].str.extract(location_pattern)
    frame[['도시', '구', '동', '번지']] = address_parts
    trimmed = frame.drop(columns=['소재지지번주소', '번지'])
    return trimmed.groupby(['도시', '구', '동']).sum().reset_index()


# Security lights: total installed count per dong.
light_df = _sum_by_dong(light_df)

# Parking lots: one-hot encode '급지구분', then count each class per dong.
parking_df = _sum_by_dong(pd.get_dummies(parking_df, columns=['급지구분']))

# Child protection zones: each row counts as 1, summed per dong.
child_area_df['child'] = 1
child_area_df = _sum_by_dong(child_area_df)

In [8]:
def _attach_external_features(accidents):
    """Left-join the per-dong external aggregates onto an accident table.

    ``light_df``, ``child_area_df`` and ``parking_df`` are merged in that order
    on ('도시', '구', '동'). Rows with no match in an external table get 0 in
    the columns that table contributes. Only those newly added columns are
    filled: calling ``fillna(0)`` on the whole merged frame would also overwrite
    missing values in unrelated columns (for example putting the integer 0
    into string columns).

    Args:
        accidents: DataFrame containing the '도시', '구' and '동' key columns.

    Returns:
        A new DataFrame with the external feature columns appended.
    """
    merge_keys = ['도시', '구', '동']
    merged = accidents
    for external in (light_df, child_area_df, parking_df):
        known_columns = set(merged.columns)
        merged = pd.merge(merged, external, how='left', on=merge_keys)
        added = [col for col in merged.columns if col not in known_columns]
        if added:
            # No match in the external table means "none recorded" -> 0.
            merged[added] = merged[added].fillna(0)
    return merged


train_df = _attach_external_features(train_df)
test_df = _attach_external_features(test_df)

Seasonal Variable

In [9]:
def group_Season(df):
    """Label each row of ``df`` with a season name derived from its month.

    The labels are written into ``df['Season']`` in place, based on the month
    number in ``df['월']``: 3-5 -> '봄' (spring), 6-8 -> '여름' (summer),
    9-11 -> '가을' (autumn), 12/1/2 -> '겨울' (winter). Rows whose month matches
    none of these are left untouched.

    Args:
        df: DataFrame with a '월' (month) column.

    Returns:
        The ``df['Season']`` column.
    """
    months_by_season = {
        '봄': [3, 4, 5],
        '여름': [6, 7, 8],
        '가을': [9, 10, 11],
        '겨울': [12, 1, 2],
    }
    for season_name, months in months_by_season.items():
        df.loc[df['월'].isin(months), 'Season'] = season_name
    return df['Season']

# Add the same two derived features to train and test:
#   Cosine_Time - cosine of the hour mapped onto a 24-hour cycle
#   Season      - season label from group_Season
for frame in (train_df, test_df):
    frame['Cosine_Time'] = np.cos(2 * np.pi * frame['시간'] / 24)
    frame['Season'] = group_Season(frame)

#### Scaling & Labeling

In [10]:
# Target: the ECLO column of the training table.
y_train = train_df['ECLO'].copy()

# Features: the test table defines the feature set, and X_train is restricted to
# exactly those columns so both frames share the same schema.
X_test = test_df.copy()
X_train = train_df.loc[:, X_test.columns].copy()

In [None]:
# Show the test features, training features and target, in that order.
display(X_test, X_train, y_train)

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
# Object-typed columns of X_train at this point:
# ['요일', '기상상태', '노면상태', '사고유형', '도시', '구', '동', '도로형태1', '도로형태2']
# '도시' (city) is dropped because every row is 대구광역시 (Daegu).
# '연' (year) is dropped because the year-over-year trend was judged unimportant.
dropped_columns = ['도시', '연']
X_train = X_train.drop(columns=dropped_columns)
X_test = X_test.drop(columns=dropped_columns)

categorical_features = ['요일', '기상상태', '노면상태', '사고유형', '구', '동', '도로형태1', '도로형태2', 'Season']

# Encode train and test together so both end up with the same dummy columns.
n_train = len(X_train)
data = pd.concat([X_train, X_test])

# One-hot encode every categorical column: each original column is removed and
# its '<column>_<value>' dummies are appended after the remaining columns.
data = pd.get_dummies(data, columns=categorical_features)

# Split the stacked frame back into train and test by row position.
X_train = data.iloc[:n_train]
X_test = data.iloc[n_train:]

In [12]:
# List every feature column after encoding.
list(X_train.columns)

['월',
 '일',
 '시간',
 '설치개수',
 'child',
 '급지구분_1',
 '급지구분_2',
 '급지구분_3',
 'Cosine_Time',
 '요일_금요일',
 '요일_목요일',
 '요일_수요일',
 '요일_월요일',
 '요일_일요일',
 '요일_토요일',
 '요일_화요일',
 '기상상태_기타',
 '기상상태_눈',
 '기상상태_맑음',
 '기상상태_비',
 '기상상태_안개',
 '기상상태_흐림',
 '노면상태_건조',
 '노면상태_기타',
 '노면상태_서리/결빙',
 '노면상태_적설',
 '노면상태_젖음/습기',
 '노면상태_침수',
 '사고유형_차대사람',
 '사고유형_차대차',
 '사고유형_차량단독',
 '구_남구',
 '구_달서구',
 '구_달성군',
 '구_동구',
 '구_북구',
 '구_서구',
 '구_수성구',
 '구_중구',
 '동_가창면',
 '동_가천동',
 '동_각산동',
 '동_갈산동',
 '동_감삼동',
 '동_검단동',
 '동_검사동',
 '동_계산동1가',
 '동_계산동2가',
 '동_고모동',
 '동_고성동1가',
 '동_고성동2가',
 '동_고성동3가',
 '동_공평동',
 '동_관음동',
 '동_괴전동',
 '동_교동',
 '동_구암동',
 '동_구지면',
 '동_국우동',
 '동_금강동',
 '동_금호동',
 '동_남산동',
 '동_남성로',
 '동_남일동',
 '동_내당동',
 '동_내동',
 '동_노곡동',
 '동_노변동',
 '동_노원동1가',
 '동_노원동2가',
 '동_노원동3가',
 '동_논공읍',
 '동_능성동',
 '동_다사읍',
 '동_달성동',
 '동_대곡동',
 '동_대림동',
 '동_대명동',
 '동_대봉동',
 '동_대신동',
 '동_대안동',
 '동_대천동',
 '동_대현동',
 '동_대흥동',
 '동_덕곡동',
 '동_덕산동',
 '동_도남동',
 '동_도동',
 '동_도원동',
 '동_도학동',
 '동_동내동',
 '동_동문동',
 '동_동변동',
 '동_동산동',
 '동_동성로1가

#### Modeling

AutoML

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the training rows for validation. random_state is pinned to the
# notebook seed (77) so the split is identical every time this cell runs, instead
# of depending on the current state of NumPy's global generator.
X_tra, X_val, y_tra, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=77)

In [None]:
from supervised.automl import AutoML
# AutoML configuration for the exploratory run: a regression task scored by RMSE,
# in "Compete" mode, limited to four tree-based algorithms.
# NOTE(review): n_jobs=-1 presumably means "use all CPU cores" - confirm in the mljar docs.
automl = AutoML(
    ml_task="regression",
    eval_metric="rmse",
    mode="Compete",
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
    n_jobs=-1,
)

In [None]:
# Exploratory fit on the 80% training split only.
automl.fit(X_tra, y_tra)

#### Prediction

In [13]:
from supervised.automl import AutoML
# Fresh AutoML instance for the final fit, with the same settings as the
# exploratory run: a regression task scored by RMSE, in "Compete" mode, limited to
# four tree-based algorithms.
# NOTE(review): n_jobs=-1 presumably means "use all CPU cores" - confirm in the mljar docs.
automl = AutoML(
    ml_task="regression",
    eval_metric="rmse",
    mode="Compete",
    algorithms=['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost'],
    n_jobs=-1,
)

In [15]:
# Fit the final model on the full training set (all rows of X_train / y_train,
# with no manual hold-out).
automl.fit(X_train, y_train)

AutoML directory: AutoML_5
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'LightGBM', 'Xgboost', 'CatBoost']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 3.22834 trained in 1.67 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 4 models
1_Default_LightGBM rmse 3.244074 trained in 15.99 seconds
2_Default_Xgboost rmse 3.244729 trained in 17.32 seconds
3_Default_CatBoost rmse 3.240977 trained in 78.09 seconds

In [16]:
import joblib

# Save the fitted AutoML object as a pickle file in the working directory.
# Only load this file from a trusted source: unpickling can execute arbitrary code.
joblib.dump(automl, 'automl_model.pkl')

['automl_model.pkl']

In [18]:
# Predict ECLO for every test row with the fitted AutoML model, then echo the array.
prediction = automl.predict(X_test)
prediction

array([4.0408382, 3.5411594, 4.962862 , ..., 4.190703 , 4.294964 ,
       4.294793 ], dtype=float32)

#### submission

In [19]:
# Re-read the submission template (the same file was already loaded in the data-loading cell).
sample_submission = pd.read_csv("D:/Daegu_Data/sample_submission.csv")

In [20]:
# Fill the template's ECLO column with the predictions (assigned by row position),
# then show the smallest predicted value as a sanity check.
submit = sample_submission.copy()
submit['ECLO'] = prediction
min(submit['ECLO'])
# Disabled clamp of negative predictions to 0 (the minimum shown below is positive):
# submit.loc[submit['ECLO'] < 0.0, 'ECLO'] = 0.0

2.530893564224243

In [21]:
# Preview the submission table.
submit

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.040838
1,ACCIDENT_39610,3.541159
2,ACCIDENT_39611,4.962862
3,ACCIDENT_39612,4.485795
4,ACCIDENT_39613,4.412438
...,...,...
10958,ACCIDENT_50567,4.958821
10959,ACCIDENT_50568,4.192525
10960,ACCIDENT_50569,4.190703
10961,ACCIDENT_50570,4.294964


In [22]:
# Write the submission file to the working directory, without the DataFrame index column.
submit.to_csv('automl.csv', index=False)