In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 
import random 
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler,LabelEncoder,OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error,mean_squared_log_error,make_scorer
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import PolynomialFeatures
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import bisect
pd.set_option('display.max_columns', None)  # 모든 열을 출력하도록 설정
pd.set_option('display.expand_frame_repr', False)  # 너비 제한 없이 출력
plt.rcParams['font.family'] = 'Malgun Gothic'


In [37]:
def make_xtrain_ytrain_xtest(train, test):
    x_test = test.drop(columns=['ID']).copy()
    y_train =train['ECLO'].copy()
    x_train = train[x_test.columns].copy()
    return x_train,y_train,x_test

In [39]:
def encoding_function(x_train, x_test, cols_to_onehot) :
    xx_train = pd.get_dummies(x_train,columns=cols_to_onehot).copy()
    xx_test  = pd.get_dummies(x_test,columns=cols_to_onehot).copy()
    
    cat_features = []
    for col in xx_train.columns:
        if xx_train[col].dtype == 'object':
            cat_features.append(col)
    encoders={}
    for feature in cat_features:
        le = LabelEncoder()
        xx_train[feature] = le.fit_transform(xx_train[feature].astype(str))
        le_classes_set = set(le.classes_)
        xx_test[feature] = xx_test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
        le_classes = le.classes_.tolist()
        bisect.insort_left(le_classes, '-1')
        le.classes_ = np.array(le_classes)
        xx_test[feature] = le.transform(xx_test[feature].astype(str))
        encoders[feature] = le
        
    return xx_train,xx_test 

In [40]:
# x_test, x_train 생성하기 
def evaluate_regr(y,pred):
    mae = mean_absolute_error(y,pred)
    rmse = np.sqrt(mean_squared_error(y,pred))
    log_y = np.log1p(y)
    log_pred = np.log1p(pred)
    msle = np.mean((log_y - log_pred) ** 2)
    rmsle = np.sqrt(msle)    
    return rmsle 

def test_function(x_train,y_train,cols_to_del):
    # train.drop(columns=cols_to_del , inplace = True)
    # X = train.drop('ECLO',axis=1).values
    # y = train['ECLO'].values
    # for i in range(0,7):  X[:,i] = LabelEncoder().fit_transform(X[:,i])
    # X_standard = StandardScaler().fit_transform(X)
    x_train,x_test,y_train,y_test = train_test_split(x_train,y_train,test_size=0.2, random_state= 42 )
    gbr = CatBoostRegressor(random_state= 42 ,silent=True)
    result_arr = []
    
    for model in [gbr] :
        model.fit(x_train,y_train) 
        pred = model.predict(x_test)
        rmsle =evaluate_regr(y_test, pred) 
        result_arr.append((rmsle, model))
    result_arr.sort(key = lambda x : x[0])
    for i in range(len(result_arr)) :
        score,model = result_arr[i] 
        print(f"제거된 열 :{cols_to_del}의 점수는 {score}")


# 전처리  +  특성 생성하기 

In [41]:
# Seed 고정 및 데이터 불러오기 

def seed_everything(seed):
    
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) 

train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

def data_preprocess(data):
    time_pattern = r'(\d{4})-(\d{1,2})-(\d{1,2}) (\d{1,2})' 
    data[['연', '월', '일', '시간']] = data['사고일시'].str.extract(time_pattern)
    data[['연', '월', '일', '시간']] = data[['연', '월', '일', '시간']].apply(pd.to_numeric) # 추출된 문자열을 수치화해줍니다 
    data = data.drop(columns=['사고일시']) # 정보 추출이 완료된 '사고일시' 컬럼은 제거합니다 
    location_pattern = r'(\S+) (\S+) (\S+)'
    data[['도시', '구', '동']] = data['시군구'].str.extract(location_pattern)
    data = data.drop(columns=['시군구'])
    road_pattern = r'(.+) - (.+)'
    data[['도로형태1', '도로형태2']] = data['도로형태'].str.extract(road_pattern)
    data = data.drop(columns=['도로형태'])
    data = data.drop(columns=['도시'])
    return data 

train = data_preprocess(train)
test = data_preprocess(test)

In [42]:
# 이상치 제거하기 
to_delete = [10155 ,37536, 32591,12632]
train.drop(to_delete, axis=0 ,inplace=True)

In [43]:
# 시간 그룹화 , 휴일 데이터 
def make_time_data(data) :
    data['요일']= data['요일'].astype(str)
    data['weekend'] = np.where(data['요일'].isin(['토요일','일요일']),1,0)
    data['time_group'] = data['시간'].apply(convert_hour)
    return data 

def convert_hour(time) :
    if time>=0 and  time<6 :
        return 0 
    elif time>=6 and  time<12 :
        return 1
    elif time>=12 and  time<18 :
        return 2 
    else :
        return 3
train, test  = make_time_data(train) , make_time_data(test)

In [46]:
x_train,y_train,x_test= make_xtrain_ytrain_xtest(train,test)
x_train,x_test = encoding_function(x_train,x_test,['구','사고유형'])
# test_function(x_train, y_train, [])
x_train

Unnamed: 0,요일,기상상태,노면상태,연,월,일,시간,동,도로형태1,도로형태2,weekend,time_group,구_남구,구_달서구,구_달성군,구_동구,구_북구,구_서구,구_수성구,구_중구,사고유형_차대사람,사고유형_차대차,사고유형_차량단독
0,6,2,0,2019,1,1,0,40,2,5,0,0,0,0,0,0,0,0,0,1,1,0,0
1,6,5,0,2019,1,1,0,4,2,5,0,0,0,1,0,0,0,0,0,0,1,0,0
2,6,2,0,2019,1,1,1,66,2,5,0,0,0,0,0,0,0,0,1,0,1,0,0
3,6,2,0,2019,1,1,2,79,2,5,0,0,0,0,0,0,1,0,0,0,0,1,0
4,6,2,0,2019,1,1,4,129,2,5,0,0,0,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39604,0,2,0,2021,12,31,19,118,0,3,0,3,0,0,0,0,0,0,1,0,0,1,0
39605,0,2,0,2021,12,31,19,103,2,5,0,3,0,1,0,0,0,0,0,0,0,1,0
39606,0,2,0,2021,12,31,21,144,0,3,0,3,0,1,0,0,0,0,0,0,0,1,0
39607,0,2,0,2021,12,31,22,158,1,5,0,3,0,1,0,0,0,0,0,0,0,1,0


In [5]:
#ECLO의 평균 , 분산을 특성으로 생성하기 
# serious는 동으로 혹은 구로만 하는 것 결정 
train['serious'] = (1.2*train['사망자수']+train['중상자수']) / (train['부상자수']+ train['경상자수']+0.2)
serious_group3 = train.groupby(['구','동','time_group', '사고유형'])['serious'].mean().reset_index().sort_values(by = 'serious').rename(columns={'serious':'serious_type_time'})

test = test.merge(serious_group3, on=['구','동','time_group','사고유형'], how='left')
train = train.merge(serious_group3, on=['구','동','time_group','사고유형'], how='left')

eclo_group_avg = train.groupby(['구','time_group'])['ECLO'].mean().reset_index().sort_values(by = 'ECLO').rename(columns={"ECLO":"eclo_avg1"})
eclo_group_std = train.groupby(['구','time_group'])['ECLO'].std().reset_index().sort_values(by = 'ECLO').rename(columns={"ECLO":"eclo_std1"})

test = test.merge(eclo_group_avg, on=['구','time_group'], how='left')
train = train.merge(eclo_group_avg,on=['구','time_group'], how='left')
test = test.merge(eclo_group_std, on=['구','time_group'], how='left')
train = train.merge(eclo_group_std,on=['구','time_group'], how='left')

eclo_group_avg2 = train.groupby(['구','time_group', '요일'])['ECLO'].mean().reset_index().sort_values(by = 'ECLO').rename(columns={"ECLO":"eclo_avg2"})
eclo_group_std2 = train.groupby(['구','time_group','요일'])['ECLO'].std().reset_index().sort_values(by = 'ECLO').rename(columns={"ECLO":"eclo_std2"})

test = test.merge(eclo_group_avg2, on=['구','time_group','요일'], how='left')
train = train.merge(eclo_group_avg2,on=['구','time_group','요일'], how='left')
test = test.merge(eclo_group_std2, on=['구','time_group','요일'], how='left')
train = train.merge(eclo_group_std2,on=['구','time_group','요일'], how='left')


# 외부 데이터 추출하기 

In [6]:
# 외부데이터 (보안등, 어린이 보호구역 , 대구 주차장 , cctv )
light_df = pd.read_csv('./data/external_open/대구 보안등 정보.csv', encoding='cp949')[['설치개수', '소재지지번주소']]
location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
light_df[['도시', '구', '동', '번지']] = light_df['소재지지번주소'].str.extract(location_pattern)
light_df = light_df.drop(columns=['소재지지번주소', '번지'])
light_df = light_df.groupby(['도시', '구', '동']).sum().reset_index()
light_df.reset_index(inplace=True, drop=True)
# light_df

#어린이 보호 구역 정보
child_area_df = pd.read_csv('./data/external_open/대구 어린이 보호 구역 정보.csv', encoding='cp949').drop_duplicates()[['소재지지번주소']]
child_area_df['kid_area'] = 1

child_area_df[['도시', '구', '동', '번지']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
child_area_df = child_area_df.drop(columns=['소재지지번주소', '번지'])

child_area_df = child_area_df.groupby(['도시', '구', '동']).sum().reset_index()
child_area_df.reset_index(inplace=True, drop=True)

# 대구 주차장 정보
parking_df = pd.read_csv('./data/external_open/대구 주차장 정보.csv', encoding='cp949')[['소재지지번주소', '급지구분','주차구획수']]
parking_df = pd.get_dummies(parking_df, columns=['급지구분'])

parking_df[['도시', '구', '동', '번지']] = parking_df['소재지지번주소'].str.extract(location_pattern)
parking_df = parking_df.drop(columns=['소재지지번주소', '번지'])

parking_df = parking_df.groupby(['도시', '구', '동']).sum().reset_index()
parking_df.reset_index(inplace=True, drop=True)

# 대구 cctv정보 
cctv_df = pd.read_csv('./data/external_open/대구 CCTV 정보.csv', encoding='cp949')
cctv_df['cctv_cnt'] = 1 
# location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
cctv_df[['도시', '구', '동', '번지']] = cctv_df['소재지지번주소'].str.extract(location_pattern)
cctv_df = cctv_df.drop(columns=['소재지지번주소', '번지'])

cctv_df = cctv_df.groupby(['도시', '구', '동']).sum().reset_index()
cctv_df.reset_index(inplace=True, drop=True)
cctv_df = cctv_df[['도시', '구', '동','cctv_cnt','제한속도']]

In [7]:
for one_data in [light_df, child_area_df,parking_df, cctv_df ]:
    test = test.merge(one_data,on = ['도시', '구', '동'],how = 'left')
    train= train.merge(one_data,on = ['도시', '구', '동'],how = 'left')
    

In [8]:
# 보행자 사고 데이터 
walker_df = pd.read_csv('./data/보행자사고지표.csv')
walker_df = walker_df.drop(columns=['치사율', '지점'])
walker_df['eclo_walker'] = (1.5*walker_df['사망'] +1.2*walker_df['중상'])/(1+ 1*walker_df['경상']+walker_df['부상']) 
walker_df = walker_df.groupby(['구','동']).median().reset_index()
walker_df['eclo_walker_avg'] = walker_df['eclo_walker']/walker_df['건수']
walker_df = walker_df.sort_values(by = 'eclo_walker_avg')
walker_df = walker_df[['구', '동', 'eclo_walker_avg']]
walker_df.tail(10)

# 노약자 사고 데이터 
olders_df = pd.read_csv('./data/노약자사고지표.csv').drop(columns=['지역'])
olders_df['eclo_older']= 10*olders_df['사망'] +5*olders_df['중상']+3*olders_df['경상']
olders_df = olders_df.groupby(['구','동']).median().reset_index()
olders_df['eclo_older_avg'] = olders_df['eclo_older']/olders_df['건수']
olders_df =olders_df.sort_values(by = 'eclo_older_avg')
olders_df = olders_df[['구', '동', 'eclo_older_avg']]
olders_df.tail(12)

for one_data in [walker_df , olders_df] :
    test = test.merge(one_data,on=['구', '동'], how = 'left')
    train = train.merge(one_data ,on =['구', '동'],how = 'left')
    
    
# for col in ['eclo_walker_avg', 'eclo_older_avg'] : 
#     test[col] = test[col].fillna(0)
#     train[col] = train[col].fillna(0)

# # df['a'].fillna(df.groupby('d')['a'].transform('mean'), inplace=True)
# for col in ['주차구획수','급지구분_1','급지구분_2','급지구분_3','cctv_cnt','제한속도']:
#     test[col] = test[col].fillna(test.groupby('구')[col].transform('mean'))
#     train[col] = train[col].fillna(train.groupby('구')[col].transform('mean'))

# test['serious_type_time'] = test['serious_type_time'].fillna(test.groupby('구')['serious_type_time'].transform('mean'))

# for col in ['설치개수','kid_area']:
#     test[col] = test[col].fillna(test[col].mean())
#     train[col] = train[col].fillna(train[col].mean())

In [9]:
population_cnt =pd.read_csv('./data/대구광역시_주민등록인구 통계현황.csv',encoding='cp949')
car_cnt = pd.read_csv('./data/대구광역시_읍면동별 자동차 등록현황_20211031.csv',encoding='cp949')
olders = pd.read_csv('./data/노인요양시설.csv', encoding='cp949')

# 구별 인구 데이터 생성 
population_cnt.drop(population_cnt.iloc[0].name, inplace = True )
population_cnt['구'] = population_cnt['행정구역'].astype(str).str.slice(6,9)
population_cnt['구'] = population_cnt['구'].str.strip()
population_cnt = population_cnt[['구','2021년05월_총인구수','2021년05월_남자 인구수']]

train = train.merge(population_cnt , on ='구',how = 'left')
test = test.merge(population_cnt , on ='구',how = 'left')

#구별 노인 수용 가능 인원 데이터 
olders_grouped = olders.groupby('구분')['입소정원'].agg(['sum']).reset_index()
olders_grouped=olders_grouped.rename(columns={'구분': '구', 'sum':'노인수용가능'})

train = train.merge(olders_grouped , on = '구',how ='left')
test = test.merge(olders_grouped , on = '구',how ='left')

#지역별 등록된 자동차 수 
car_cnt = car_cnt.rename(columns={'구군':'구', '읍면동':'동'})
train = train.merge(car_cnt, on =['구', '동'],how='left')
test = test.merge(car_cnt, on =['구', '동'],how='left')

In [10]:
x_test = test.drop(columns=['ID']).copy()
y_train =train['ECLO'].copy()
x_train = train[x_test.columns].copy()


In [327]:
cat_features = []
for col in x_train.columns:
    if x_train[col].dtype == 'object':
        cat_features.append(col)
        

In [329]:
import bisect
encoders={}
for feature in cat_features:
    le = LabelEncoder()
    x_train[feature] = le.fit_transform(x_train[feature].astype(str))
    le_classes_set = set(le.classes_)
    x_test[feature] = x_test[feature].map(lambda s: '-1' if s not in le_classes_set else s)
    le_classes = le.classes_.tolist()
    bisect.insort_left(le_classes, '-1')
    le.classes_ = np.array(le_classes)
    x_test[feature] = le.transform(x_test[feature].astype(str))
    encoders[feature] = le

In [333]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(x_train)
test_scaled = scaler.transform(x_test)

In [11]:
x_train = pd.DataFrame(x_train, columns = x_train.columns)
x_test = pd.DataFrame(x_test, columns = x_test.columns)

In [12]:
from supervised.automl import AutoML

def rmsle_metric(true, predicted, sample_weight =None) :
    true = np.array(true)
    predicted = np.array(predicted) 
    log_true = np.log1p(true)
    log_predicted = np.log1p(predicted) 
    difference = log_predicted-log_true 
    difference = np.square(difference) 
    mean_diff = difference.mean() 
    score = np.sqrt(mean_diff) 
    return score 
from sklearn.metrics import mean_squared_log_error
def rmsle(y_true, y_predicted, sample_weight=None):
    val = mean_squared_log_error(y_true, y_predicted, sample_weight=sample_weight)
    return np.sqrt(val) if val > 0 else -np.Inf

automl = AutoML(
    
    mode="Compete",
    ml_task="regression",
    eval_metric='rmse',
    random_state=42,
    total_time_limit=60*60*2,
    model_time_limit=None
)


In [13]:
automl.fit(x_train, y_train)
pred = automl.predict(x_test)

Linear algorithm was disabled.
AutoML directory: AutoML_2
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Decision Tree', 'Random Forest', 'Extra Trees', 'LightGBM', 'Xgboost', 'CatBoost', 'Neural Network', 'Nearest Neighbors']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'mix_encoding', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 3.037163 trained in 0.5 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 10-fold CV Shuffle
* Step simple_algorithms will try to check up to 3 models
1_DecisionTree rmse 3.095144 trained in 4.04 seconds
2_DecisionTree rmse 3.107493 trained in 3.66 seconds
3_DecisionTree rmse 3.1074

In [15]:
sub= pd.read_csv('./data/sample_submission.csv')
sub['ECLO']=pred

In [17]:
sub.to_csv('./data/submission/mljar_newstart.csv',index=False)