- gungu, dong 으로 segment -> test에만 있는 값이 있는지 확인

# Library setting

In [1]:
import sys
sys.path.append('/Volumes/KHJ/Github/hyuckjinkim/lib-python')

from base import gc_collect_all, setdiff
from filesystem_utils import mkdir
from environment import LocalFontSetting, ColabInstallFont, ColabFontSetting
from graph import abline, actual_prediction_scatterplot
from data_prepare import (
    get_holiday, reduce_mem_usage, delete_unique_columns,
    TypeController, CategoricalQuantileCalculator,
    GroupScaler, OneHotEncoder, InteractionTerm, TargetTransform,
)

In [2]:
gc_collect_all()
LocalFontSetting()

In [3]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.reset_option('display')
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', None)
pd.set_option('mode.chained_assignment', None)

import datetime
import glob

import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
def return_unique_columns(data):
    unique_info = data.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    return unique_cols

In [5]:
class CFG:
    SEED = 42
    SUBSET_DEPTH = 3
    N_SPLITS = 5
    TARGET = ['사망자수','중상자수','경상자수','부상자수'] # 'ECLO'
    TARGET_TRANSFORMATION = 'log'

<br></br>

# Data

## Data load

In [6]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

# cctv_df = pd.read_csv('./data/external_open/대구 CCTV 정보.csv',encoding='cp949')
# security_light_df = pd.read_csv('./data/external_open/대구 보안등 정보.csv',encoding='cp949',low_memory=False)
# child_area_df = pd.read_csv('./data/external_open/대구 어린이 보호 구역 정보.csv',encoding='cp949')
# parking_df = pd.read_csv('./data/external_open/대구 주차장 정보.csv',encoding='cp949')

<br>

## Preprocessing

In [7]:
def preprocessing(data):
    d = data.copy()
    
    # (1) test data에 없는 컬럼 제거
    no_columns_in_test = ['사고유형 - 세부분류','법규위반','가해운전자 차종','가해운전자 성별','가해운전자 연령',
                          '가해운전자 상해정도','피해운전자 차종','피해운전자 성별','피해운전자 연령','피해운전자 상해정도']
    no_columns_in_test = list(set(d.columns)&set(no_columns_in_test))
    d.drop(no_columns_in_test,axis=1,inplace=True)
    
    # (1) 시군구: 도시 / 구 / 동
    location_pattern = r'(\S+) (\S+) (\S+)'
    d[['도시','구','동']] = d['시군구'].str.extract(location_pattern)
    d.drop(['시군구','도시'],axis=1,inplace=True)
    
    # (2) 도로형태
    d[['도로구분','도로형태상세']] = np.stack(d['도로형태'].str.split(' - '))
    d['주차장여부'] = d['도로구분'].map({'단일로':0,'교차로':0,'기타':np.nan,'주차장':1,'미분류':np.nan})
    d['도로여부']    = d['도로구분'].map({'단일로':1,'교차로':1,'기타':np.nan,'주차장':np.nan,'미분류':np.nan})
    d.drop('도로형태',axis=1,inplace=True)
    for col in ['주차장여부','도로여부']:
        d[col] = d[col].fillna(0)
        
    # (3) 기상상태
    d['기상상태맑음여부'] = np.where(d['기상상태']=='맑음',1,0)
    d['노면상태건조여부'] = np.where(d['노면상태']=='건조',1,0)

    return d

def derived_features(data):
    d = data.copy()
    date = d['사고일시'].apply(lambda x: datetime.datetime.strptime(str(x),'%Y-%m-%d %H'))
    
    # (1) date columns
    d['year']       = date.dt.year
    d['month']      = date.dt.month
    d['day']        = date.dt.day
    d['hour']       = date.dt.hour
    d['dayofweek']  = date.dt.dayofweek
    d['weekend']    = date.dt.dayofweek.isin([5,6]).astype(int)
    d['week']       = [t.isocalendar()[1] for t in date]
    d['season']     = d['month'].map({1:0,2:0,3:1,4:1,5:1,6:2,7:2,8:2,9:3,10:3,11:3,12:3})
    
    # 저어어어어엉님 코드 (https://dacon.io/competitions/official/236176/codeshare/9381?page=1&dtype=recent)
    # 주차 누적값
    week_list=[]
    for i in range(len(d)) :
        if d['year'][i] == 2019 :
            week_list.append(int(d['week'][i]))
        elif d['year'][i] == 2020 :
            week_list.append(int(d['week'][i])+52)
        elif d['year'][i] == 2021 :
            week_list.append(int(d['week'][i])+52+53)
        elif d['year'][i] == 2022 :
            week_list.append(int(d['week'][i])+52+53+53)
        else:
            raise ValueError('Unknown year value')
    d['week_num']= week_list
    
    # datetime 패키지에서 19년 12월 마지막주가 첫째주로 들어가는거 발견하여 수정
    d.loc[(d['year']==2019) & (d['month']==12) & (d['day']==30), 'week_num'] = 52
    d.loc[(d['year']==2019) & (d['month']==12) & (d['day']==31), 'week_num'] = 52

    # (2) is holiday & is dayoff
    holiday_list = get_holiday(d['year'].unique())
    d['is_holiday'] = date.isin(holiday_list).astype(int)
    d['is_dayoff']  = ((d.is_holiday==1) | (d.weekend==1)).astype(int)
    
    # (3) unuse features
    unuse_features = ['ID','사고일시','요일'] # '사망자수','중상자수','경상자수','부상자수'
    unuse_features = list(set(d.columns)&set(unuse_features))
    d.drop(columns=unuse_features,inplace=True)
    
    num_to_cat = ['year','month','day','hour','dayofweek','week','season']
    d[num_to_cat] = d[num_to_cat].astype(str)
    
    return d

In [8]:
train_df = preprocessing(train_df)
train_df = derived_features(train_df)

test_df = preprocessing(test_df)
test_df = derived_features(test_df)

In [9]:
train_df.head()

Unnamed: 0,기상상태,노면상태,사고유형,사망자수,중상자수,경상자수,부상자수,ECLO,구,동,도로구분,도로형태상세,주차장여부,도로여부,기상상태맑음여부,노면상태건조여부,year,month,day,hour,dayofweek,weekend,week,season,week_num,is_holiday,is_dayoff
0,맑음,건조,차대사람,0,1,0,0,5,중구,대신동,단일로,기타,0.0,1.0,1,1,2019,1,1,0,1,0,1,0,1,1,1
1,흐림,건조,차대사람,0,0,1,0,3,달서구,감삼동,단일로,기타,0.0,1.0,0,1,2019,1,1,0,1,0,1,0,1,1,1
2,맑음,건조,차대사람,0,0,1,0,3,수성구,두산동,단일로,기타,0.0,1.0,1,1,2019,1,1,1,1,0,1,0,1,0,0
3,맑음,건조,차대차,0,1,0,0,5,북구,복현동,단일로,기타,0.0,1.0,1,1,2019,1,1,2,1,0,1,0,1,0,0
4,맑음,건조,차대차,0,0,1,0,3,동구,신암동,단일로,기타,0.0,1.0,1,1,2019,1,1,4,1,0,1,0,1,0,0


<br>

## Merge addtional information

In [10]:
def frequency_merge(data,info_data,group,target,fillna_method):
    assert fillna_method in ['zero','min','max','avg'], \
        "fillna_method must be one of ['zero','min','max','avg']"
    prefix = f'{target}_'
    
    d = data.copy()
    info_d = info_data.copy()
    
    freq_data = info_data\
        .groupby(group)[target]\
        .apply(lambda x: x.value_counts())\
        .reset_index()\
        .pivot_table(index=group,columns=f'level_{len(group)}',values=target)\
        .add_prefix(prefix)\
        .reset_index()
    d = pd.merge(d,freq_data,how='left',on=group)
    
    cols = info_data[target].dropna().unique()
    cols = [f'{target}_{col}' for col in cols]
    cols = list(set(cols)&set(d.columns))
    for col in cols:
        if fillna_method=='zero':
            fillna_value = 0
        elif fillna_method=='min':
            fillna_value = d[col].min()
        elif fillna_method=='max':
            fillna_value = d[col].max()
        elif fillna_method=='avg':
            fillna_value = d[col].mean()
        d[col].fillna(fillna_value,inplace=True)
        
    assert len(data)==len(d), \
        "duplicated"
        
    return d

In [11]:
def agg_merge(data,info_data,group,target,agg,fillna_method=None):
    assert fillna_method in [None,'zero','min','max','avg'], \
        "fillna_method must be one of ['zero','min','max','avg']"
    assert agg in ['min','max','avg','sum'], \
        "agg must be one of ['min','max','avg','sum']"
    
    if agg=='min':
        aggfunc = np.nanmin
    elif agg=='max':
        aggfunc = np.nanmax
    elif agg=='avg':
        aggfunc = np.nanmean
    elif agg=='sum':
        aggfunc = np.nansum
        
    d = data.copy()
    info_d = info_data.copy()
        
    agg_df = info_d\
        .groupby(group)[target]\
        .apply(lambda x: aggfunc(x))\
        .reset_index()\
        .rename(columns={target:f'{target}_{agg}'})
    d = pd.merge(d,agg_df,how='left',on=group)
    
    if fillna_method is not None:
        if fillna_method=='zero':
            fillna_value = 0
        elif fillna_method=='min':
            fillna_value = d[f'{target}_{agg}'].min()
        elif fillna_method=='max':
            fillna_value = d[f'{target}_{agg}'].max()
        elif fillna_method=='avg':
            fillna_value = d[f'{target}_{agg}'].mean()
        d[f'{target}_{agg}'].fillna(fillna_value,inplace=True)
    
    assert len(data)==len(d), \
        "duplicated"
    
    return d

<br>

### CCTV information

In [12]:
def add_cctv_info(data):
    d = data.copy()
    cctv_df = pd.read_csv('./data/external_open/대구 CCTV 정보.csv',encoding='cp949')

    # 소재지지번주소 -> 도시 / 구 / 동 / 지번
    location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
    cctv_df[['도시','구','동','지번']] = cctv_df['소재지지번주소'].str.extract(location_pattern)
    cctv_df.drop(['소재지지번주소','도시','지번'],axis=1,inplace=True)
    
    # 단속구분별 cctv 수
    tmp = cctv_df[cctv_df['단속구분']!=99]
    d = frequency_merge(d,tmp,group=['구','동'],target='단속구분',fillna_method='zero')

    # 도로노선방향별 cctv 수
    d = frequency_merge(d,cctv_df,group=['구','동'],target='도로노선방향',fillna_method='zero')

    # 제한속도 평균
    tmp = cctv_df[cctv_df['제한속도']!=0]
    d = agg_merge(d,tmp,group=['구','동'],target='제한속도',agg='avg',fillna_method='max')

    # 설치연도 평균
    d = agg_merge(d,cctv_df,group=['구','동'],target='설치연도',agg='avg',fillna_method='min')

    # 보호구역구분: 1,2인 경우 보호구역으로 보고, 나머지(99,nan)는 보호구역이 아닌 것으로 봄
    cctv_df['보호구역여부'] = np.where(cctv_df['보호구역구분'].isin([1,2]),1,0)
    d = agg_merge(d,cctv_df,group=['구','동'],target='보호구역여부',agg='sum',fillna_method='zero')
    
    # 위도,경도
    d = agg_merge(d,cctv_df,group=['구','동'],target='위도',agg='avg',fillna_method=None)
    d = agg_merge(d,cctv_df,group=['구','동'],target='경도',agg='avg',fillna_method=None)
    
    use_cols = setdiff(d.columns,data.columns)
    rename_dict = {col:'CCTV_'+col for col in use_cols}
    d = d.rename(columns=rename_dict)
    
    assert len(data)==len(d), \
        "duplicated"
    
    return d

In [13]:
train_df = add_cctv_info(train_df)
test_df  = add_cctv_info(test_df)

<br>

### Light information

In [14]:
def add_light_info(data):
    d = data.copy()
    light_df = pd.read_csv('./data/external_open/대구 보안등 정보.csv',encoding='cp949',low_memory=False)
    
    # 소재지지번주소 -> 도시 / 구 / 동 / 지번
    location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
    light_df[['도시','구','동','지번']] = light_df['소재지지번주소'].str.extract(location_pattern)
    light_df.drop(['소재지지번주소','도시','지번'],axis=1,inplace=True)
    
    # 보안등 설치수
    d = agg_merge(d,light_df,group=['구','동'],target='설치개수',agg='sum',fillna_method='zero')
    
    # 보안등 설치연도
    d = agg_merge(d,light_df,group=['구','동'],target='설치연도',agg='avg',fillna_method='min')
    
    # 보안등 설치형태의 빈도
    d = frequency_merge(d,light_df,group=['구','동'],target='설치형태',fillna_method='zero')
    
    # 위도,경도
    d = agg_merge(d,light_df,group=['구','동'],target='위도',agg='avg',fillna_method=None)
    d = agg_merge(d,light_df,group=['구','동'],target='경도',agg='avg',fillna_method=None)
    
    use_cols = setdiff(d.columns,data.columns)
    rename_dict = {col:'보안등_'+col for col in use_cols}
    d = d.rename(columns=rename_dict)
    
    assert len(data)==len(d), \
        "duplicated"
    
    return d

In [15]:
train_df = add_light_info(train_df)
test_df  = add_light_info(test_df)

  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\


<br>

### Child area information

In [16]:
def add_child_area_info(data):
    d = data.copy()
    child_area_df = pd.read_csv('./data/external_open/대구 어린이 보호 구역 정보.csv',encoding='cp949')

    # 소재지지번주소 -> 도시 / 구 / 동 / 지번
    location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
    child_area_df[['도시','구','동','지번']] = child_area_df['소재지지번주소'].str.extract(location_pattern)
    child_area_df.drop(['소재지지번주소','도시','지번'],axis=1,inplace=True)

    # 시설종류의 빈도
    d = frequency_merge(d,child_area_df,group=['구','동'],target='시설종류',fillna_method='zero')

    # 관할경찰서명의 빈도
    d = frequency_merge(d,child_area_df,group=['구','동'],target='관할경찰서명',fillna_method='zero')

    # CCTV설치여부의 빈도
    d = frequency_merge(d,child_area_df,group=['구','동'],target='CCTV설치여부',fillna_method='zero')

    # CCTV설치대수
    d = agg_merge(d,child_area_df,group=['구','동'],target='CCTV설치대수',agg='sum',fillna_method='zero')

    # 보호구역도로폭
    child_area_df['보호구역도로폭'] = child_area_df['보호구역도로폭'].apply(lambda x: sum([float(s) for s in str(x).split('~')]))
    d = agg_merge(d,child_area_df,group=['구','동'],target='보호구역도로폭',agg='avg',fillna_method='min')
    
    # 위도,경도
    d = agg_merge(d,child_area_df,group=['구','동'],target='위도',agg='avg',fillna_method=None)
    d = agg_merge(d,child_area_df,group=['구','동'],target='경도',agg='avg',fillna_method=None)
    
    use_cols = setdiff(d.columns,data.columns)
    rename_dict = {col:'어린이보호구역_'+col for col in use_cols}
    d = d.rename(columns=rename_dict)
    
    assert len(data)==len(d), \
        "duplicated"

    return d

In [17]:
train_df = add_child_area_info(train_df)
test_df  = add_child_area_info(test_df)

  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\


<br>

### Parking information

In [18]:
def add_parking_info(data):
    d = data.copy()
    parking_df = pd.read_csv('./data/external_open/대구 주차장 정보.csv',encoding='cp949')

    # 소재지지번주소 -> 도시 / 구 / 동 / 지번
    location_pattern = r'(\S+) (\S+) (\S+) (\S+)'
    parking_df[['도시','구','동','지번']] = parking_df['소재지지번주소'].str.extract(location_pattern)
    parking_df.drop(['소재지지번주소','도시','지번'],axis=1,inplace=True)

    # 주차장구분의 빈도
    d = frequency_merge(d,parking_df,group=['구','동'],target='주차장구분',fillna_method='zero')

    # 주차장유형의 빈도
    d = frequency_merge(d,parking_df,group=['구','동'],target='주차장유형',fillna_method='zero')

    # 급지구분의 빈도
    d = frequency_merge(d,parking_df,group=['구','동'],target='급지구분',fillna_method='zero')

    # 요금정보의 빈도
    d = frequency_merge(d,parking_df,group=['구','동'],target='요금정보',fillna_method='zero')

    # 주차구획수
    d = agg_merge(d,parking_df,group=['구','동'],target='주차구획수',agg='avg',fillna_method='zero')

    # 월정기권요금 (0제거)
    parking_df['월정기권요금'] = parking_df['월정기권요금'].replace(0,np.nan)
    d = agg_merge(d,parking_df,group=['구','동'],target='월정기권요금',agg='avg',fillna_method='zero')

    # 평일/토요일/공휴일 운영시간
    for daytype in ['평일','토요일','공휴일']:
        new_col = f'{daytype}운영시간'
        starttime_col = f'{daytype}운영시작시각'
        endtime_col = f'{daytype}운영종료시각'

        parking_df[new_col] = [round((datetime.datetime.strptime(b,'%H:%M')-datetime.datetime.strptime(a,'%H:%M')).seconds/3600,0)
                               for a,b in parking_df[[starttime_col,endtime_col]].values]
        parking_df[new_col] = [24 if t==0 else t for t in parking_df[new_col]]
        parking_df[new_col] = [t if w.find(daytype)>=0 else 0 for t,w in parking_df[[new_col,'운영요일']].values]

        d = agg_merge(d,parking_df,group=['구','동'],target=new_col,agg='avg',fillna_method='zero')
        
    use_cols = setdiff(d.columns,data.columns)
    rename_dict = {col:'주차장_'+col for col in use_cols}
    d = d.rename(columns=rename_dict)
        
    assert len(data)==len(d), \
        "duplicated"

    return d

In [19]:
train_df = add_parking_info(train_df)
test_df  = add_parking_info(test_df)

  .apply(lambda x: aggfunc(x))\
  .apply(lambda x: aggfunc(x))\


<br>

## Feature engineering

### fill N/A values by averaging

In [20]:
def coordinates_fillna(values):
    x = np.array(values).flatten()
    if sum(np.isnan(x)) == len(x):
        return [np.nan]*len(x)
    else:
        fillna_value = np.mean(x[~np.isnan(x)])
        return [fillna_value if np.isnan(v) else v for v in x]

In [21]:
latitude_cols  = train_df.columns[train_df.columns.str.contains('위도')].tolist()
longitude_cols = train_df.columns[train_df.columns.str.contains('경도')].tolist()

train_df[latitude_cols] = np.stack(train_df[latitude_cols].apply(coordinates_fillna,axis=1))
test_df [latitude_cols] = np.stack(test_df [latitude_cols].apply(coordinates_fillna,axis=1))

train_df[longitude_cols] = np.stack(train_df[longitude_cols].apply(coordinates_fillna,axis=1))
test_df [longitude_cols] = np.stack(test_df [longitude_cols].apply(coordinates_fillna,axis=1))

<br>

### Get missing latitude and longitude via geocode

In [22]:
# (참조) https://medium.com/@hazallgultekin/convert-address-to-latitude-longitude-using-python-21844da3d032
from geopy.geocoders import Nominatim
def get_geocode(address):
    # calling the Nominatim tool and create Nominatim class
    loc = Nominatim(user_agent="Geopy Library")

    # entering the location name
    getLoc = loc.geocode(address)

    return getLoc

In [23]:
col = latitude_cols[0]
pd.crosstab(
    np.where(train_df[col].isnull(),1,0),
    np.where(train_df[col].isnull(),1,0),
)

col_0,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,39286,0
1,0,323


In [24]:
# 위도/경도가 Null인 구/동을 모두 모음
gu_dong_data = pd.concat([
    train_df[['구','동']][train_df[col].isnull()],
    test_df [['구','동']][test_df [col].isnull()],
],axis=0).drop_duplicates().values

# 위의 구/동으로부터 위도/경도를 search (7개)
coord_info = []
for gu,dong in tqdm(gu_dong_data):
    address = f'대구광역시 {gu} {dong}'
    info = get_geocode(address)
    if info is not None:
        coord_info.append([gu,dong,info.latitude,info.longitude])
    else:
        coord_info.append([gu,dong,None,None])
coord_info = pd.DataFrame(coord_info,columns=['구','동','위도','경도'])
display(coord_info.isnull().sum())

100%|█████████████████████████████████████████████| 7/7 [00:06<00:00,  1.10it/s]


구     0
동     0
위도    0
경도    0
dtype: int64

In [25]:
def coordinates_fillna_with_geocode_values(values):
    x = np.array(values).flatten()
    if np.isnan(x[-1]):
        return x[:-1]
    else:
        return [x[-1]]*(len(x)-1)

In [26]:
train_df = pd.merge(train_df,coord_info,how='left',on=['구','동'])
train_df[latitude_cols]  = np.stack(train_df[latitude_cols +['위도']].apply(coordinates_fillna_with_geocode_values,axis=1))
train_df[longitude_cols] = np.stack(train_df[longitude_cols+['위도']].apply(coordinates_fillna_with_geocode_values,axis=1))
train_df.drop(['위도','경도'],axis=1,inplace=True)

test_df = pd.merge(test_df,coord_info,how='left',on=['구','동'])
test_df[latitude_cols]  = np.stack(test_df[latitude_cols +['위도']].apply(coordinates_fillna_with_geocode_values,axis=1))
test_df[longitude_cols] = np.stack(test_df[longitude_cols+['위도']].apply(coordinates_fillna_with_geocode_values,axis=1))
test_df .drop(['위도','경도'],axis=1,inplace=True)

In [27]:
train_df.isnull().sum().sum(),test_df.isnull().sum().sum()

(0, 0)

In [28]:
train_df.shape

(39609, 78)

<br>

## Target transformation

In [29]:
target_transform = TargetTransform(func=CFG.TARGET_TRANSFORMATION, offset=1)
train_df[CFG.TARGET] = target_transform.fit_transform(train_df[CFG.TARGET])

In [30]:
train_df.head(1)

Unnamed: 0,기상상태,노면상태,사고유형,사망자수,중상자수,경상자수,부상자수,ECLO,구,동,도로구분,도로형태상세,주차장여부,도로여부,기상상태맑음여부,노면상태건조여부,year,month,day,hour,dayofweek,weekend,week,season,week_num,is_holiday,is_dayoff,CCTV_단속구분_1,CCTV_단속구분_2,CCTV_단속구분_4,CCTV_도로노선방향_1,CCTV_도로노선방향_2,CCTV_도로노선방향_3,CCTV_제한속도_avg,CCTV_설치연도_avg,CCTV_보호구역여부_sum,CCTV_위도_avg,CCTV_경도_avg,보안등_설치개수_sum,보안등_설치연도_avg,보안등_설치형태_건축물,보안등_설치형태_전용주,보안등_설치형태_한전주,보안등_위도_avg,보안등_경도_avg,어린이보호구역_시설종류_어린이집,어린이보호구역_시설종류_유치원,어린이보호구역_시설종류_초등학교,어린이보호구역_시설종류_특수학교,어린이보호구역_시설종류_학원,어린이보호구역_관할경찰서명_강북경찰서,어린이보호구역_관할경찰서명_남부경찰서,어린이보호구역_관할경찰서명_달성경찰서,어린이보호구역_관할경찰서명_대구광역시 중부경찰서,어린이보호구역_관할경찰서명_북부경찰서,어린이보호구역_관할경찰서명_서부경찰서,어린이보호구역_관할경찰서명_수성경찰서,어린이보호구역_CCTV설치여부_N,어린이보호구역_CCTV설치여부_Y,어린이보호구역_CCTV설치대수_sum,어린이보호구역_보호구역도로폭_avg,어린이보호구역_위도_avg,어린이보호구역_경도_avg,주차장_주차장구분_공영,주차장_주차장구분_민영,주차장_주차장유형_노상,주차장_주차장유형_노외,주차장_급지구분_1,주차장_급지구분_2,주차장_급지구분_3,주차장_요금정보_무료,주차장_요금정보_유료,주차장_요금정보_혼합,주차장_주차구획수_avg,주차장_월정기권요금_avg,주차장_평일운영시간_avg,주차장_토요일운영시간_avg,주차장_공휴일운영시간_avg
0,맑음,건조,차대사람,0.0,0.693147,0.0,0.0,5,중구,대신동,단일로,기타,0.0,1.0,1,1,2019,1,1,0,1,0,1,0,1,1,1,0.0,1.0,4.0,0.0,1.0,4.0,40.0,2013.0,2.0,35.868237,128.580886,391.0,2007.076046,177.0,30.0,135.0,35.867981,128.579156,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,13.0,26.0,35.868541,128.581033,0.0,11.0,0.0,11.0,11.0,0.0,0.0,0.0,11.0,0.0,45.454545,0.0,12.636364,12.636364,2.181818


<br>

## Quantile values of target

In [31]:
cat_features = train_df.columns[train_df.dtypes=='object'].tolist()
dummy_features = ['주차장여부','도로여부','기상상태맑음여부','노면상태건조여부','weekend','is_holiday','is_dayoff']
num_features = setdiff(train_df.columns,cat_features+dummy_features+[CFG.TARGET])

In [32]:
# # feature engineering
# calculator = CategoricalQuantileCalculator()
# calculator.fit(
#     data=train_df,
#     test_data=test_df,
#     target_feature=CFG.TARGET,
#     cat_features=cat_features,
#     subset_depth=CFG.SUBSET_DEPTH,
# )
# train_df = calculator.transform(train_df)
# test_df  = calculator.transform(test_df)

In [33]:
train_df.shape

(39609, 78)

<br>

## TargetEncoder

In [34]:
# target값이 높은 category에 높은 숫자를 부여
from category_encoders.target_encoder import TargetEncoder

te = TargetEncoder(cols=cat_features)
train_df[cat_features] = te.fit_transform(train_df[cat_features],train_df['ECLO'])
test_df [cat_features] = te.transform(test_df[cat_features])

In [35]:
train_df.head()

Unnamed: 0,기상상태,노면상태,사고유형,사망자수,중상자수,경상자수,부상자수,ECLO,구,동,도로구분,도로형태상세,주차장여부,도로여부,기상상태맑음여부,노면상태건조여부,year,month,day,hour,dayofweek,weekend,week,season,week_num,is_holiday,is_dayoff,CCTV_단속구분_1,CCTV_단속구분_2,CCTV_단속구분_4,CCTV_도로노선방향_1,CCTV_도로노선방향_2,CCTV_도로노선방향_3,CCTV_제한속도_avg,CCTV_설치연도_avg,CCTV_보호구역여부_sum,CCTV_위도_avg,CCTV_경도_avg,보안등_설치개수_sum,보안등_설치연도_avg,보안등_설치형태_건축물,보안등_설치형태_전용주,보안등_설치형태_한전주,보안등_위도_avg,보안등_경도_avg,어린이보호구역_시설종류_어린이집,어린이보호구역_시설종류_유치원,어린이보호구역_시설종류_초등학교,어린이보호구역_시설종류_특수학교,어린이보호구역_시설종류_학원,어린이보호구역_관할경찰서명_강북경찰서,어린이보호구역_관할경찰서명_남부경찰서,어린이보호구역_관할경찰서명_달성경찰서,어린이보호구역_관할경찰서명_대구광역시 중부경찰서,어린이보호구역_관할경찰서명_북부경찰서,어린이보호구역_관할경찰서명_서부경찰서,어린이보호구역_관할경찰서명_수성경찰서,어린이보호구역_CCTV설치여부_N,어린이보호구역_CCTV설치여부_Y,어린이보호구역_CCTV설치대수_sum,어린이보호구역_보호구역도로폭_avg,어린이보호구역_위도_avg,어린이보호구역_경도_avg,주차장_주차장구분_공영,주차장_주차장구분_민영,주차장_주차장유형_노상,주차장_주차장유형_노외,주차장_급지구분_1,주차장_급지구분_2,주차장_급지구분_3,주차장_요금정보_무료,주차장_요금정보_유료,주차장_요금정보_혼합,주차장_주차구획수_avg,주차장_월정기권요금_avg,주차장_평일운영시간_avg,주차장_토요일운영시간_avg,주차장_공휴일운영시간_avg
0,4.712888,4.712878,3.81765,0.0,0.693147,0.0,0.0,5,4.54161,4.282449,4.671841,4.599599,0.0,1.0,1,1,4.842185,4.661716,4.814509,5.071256,4.627926,0,4.702492,4.723309,1,1,1,0.0,1.0,4.0,0.0,1.0,4.0,40.0,2013.0,2.0,35.868237,128.580886,391.0,2007.076046,177.0,30.0,135.0,35.867981,128.579156,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,13.0,26.0,35.868541,128.581033,0.0,11.0,0.0,11.0,11.0,0.0,0.0,0.0,11.0,0.0,45.454545,0.0,12.636364,12.636364,2.181818
1,4.77915,4.712878,3.81765,0.0,0.0,0.693147,0.0,3,4.618441,4.738938,4.671841,4.599599,0.0,1.0,0,1,4.842185,4.661716,4.814509,5.071256,4.627926,0,4.702492,4.723309,1,1,1,1.0,3.0,8.0,3.0,1.0,8.0,52.5,2015.333333,2.0,35.849099,128.540606,932.0,1999.889485,0.0,0.0,0.0,35.849927,128.54216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,35.849513,128.541383,4.0,0.0,1.0,3.0,0.0,1.0,3.0,2.0,2.0,0.0,28.5,80000.0,18.5,18.5,18.5
2,4.712888,4.712878,3.81765,0.0,0.0,0.693147,0.0,3,4.7273,4.842715,4.671841,4.599599,0.0,1.0,1,1,4.842185,4.661716,4.814509,5.251121,4.627926,0,4.702492,4.723309,1,0,0,2.0,2.0,0.0,3.0,1.0,0.0,55.0,2018.25,0.0,35.834183,128.621395,473.0,2015.334395,14.0,31.0,425.0,35.834061,128.621224,3.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,4.0,35.833939,128.621053,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4.712888,4.712878,4.944597,0.0,0.693147,0.0,0.0,5,4.687669,4.20892,4.671841,4.599599,0.0,1.0,1,1,4.842185,4.661716,4.814509,5.407692,4.627926,0,4.702492,4.723309,1,0,0,2.0,8.0,0.0,7.0,3.0,0.0,38.0,2018.3,4.0,35.899975,128.619733,534.0,1990.285714,0.0,0.0,0.0,35.895712,128.619904,3.0,4.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,11.0,0.0,0.0,2.0,9.0,32.0,9.636364,35.897687,128.622803,14.0,0.0,11.0,3.0,0.0,9.0,5.0,13.0,1.0,0.0,26.714286,0.0,23.142857,23.142857,23.142857
4,4.712888,4.712878,4.944597,0.0,0.0,0.693147,0.0,3,4.889534,4.549091,4.671841,4.599599,0.0,1.0,1,1,4.842185,4.661716,4.814509,5.128065,4.627926,0,4.702492,4.723309,1,0,0,2.0,10.0,0.0,9.0,3.0,0.0,40.0,2018.25,7.0,35.883077,128.620268,2057.0,1990.285714,540.0,57.0,1396.0,35.884415,128.623264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,35.883746,128.621766,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,63.0,80000.0,10.0,10.0,10.0


<br>

## Group scaler

In [36]:
# num_features = train_df.drop(cat_features,axis=1)
# num_features = setdiff(num_features,[CFG.TARGET])

In [37]:
# from sklearn.preprocessing import MinMaxScaler
# scaler = GroupScaler(scaler=MinMaxScaler())
# scaler.fit(
#     data=train_df,
#     group='gungu',
#     num_features=num_features,
# )
# train_df = scaler.transform(train_df)
# test_df  = scaler.transform(test_df)

<br>

## Memory reduction

In [38]:
train_df, _ = reduce_mem_usage(train_df,verbose=False)
test_df , _ = reduce_mem_usage(test_df ,verbose=False)

In [39]:
train_df.to_parquet('./out/train_data_log_target4.parquet')
test_df .to_parquet('./out/test_data_log_target4.parquet')

<br></br>

# EDA

In [40]:
# d = train_df.copy()

# for i,col in enumerate(num_features):
#     str_i = str(i+1).zfill(len(str(len(num_features))))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=d[col],y=d[CFG.TARGET])
#     plt.grid()
#     plt.title('[{}/{}] {}'.format(str_i,len(num_features),col))
#     plt.show()

# categorical_features = cat_features+dummy_features
# for i,col in enumerate(categorical_features):
#     str_i = str(i+1).zfill(len(str(len(categorical_features))))
#     plt.figure(figsize=(15,7))
#     sns.boxplot(x=d[col],y=d[CFG.TARGET])
#     plt.grid()
#     plt.title('[{}/{}] {}'.format(str_i,len(categorical_features),col))
#     plt.show()

<br></br>

# Modeling

In [41]:
# https://dacon.io/en/codeshare/6499

In [42]:
from copy import deepcopy
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV, MultiTaskElasticNetCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
import time
import pickle
import dill

def RMSE(y_true, y_pred):
    return mean_squared_error(y_true=y_true,y_pred=y_pred)**0.5

def RMSLE(y_true, y_pred):
    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)
    squared_error = (log_true-log_pred)**2
    return np.sqrt(np.mean(squared_error))

In [43]:
# 2) 업데이트버전
#  (1) stacking 추가
#  (2) LGBM에 sample_weight 추가
class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 hyperparameters,
                 weight='balanced',
                 inverse_transform=None,
                 eval_metric=None,
                 method='ensemble',
                 use_weightedsum_in_stacking=True):
        super().__init__()
        
        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        assert method in ['ensemble','stacking'], \
            "method must be one of ['ensemble','stacking']"
        
        self.hyperparameters = hyperparameters
        self.weight = weight
        self.inverse_transform = inverse_transform
        self.eval_metric = RMSE if eval_metric is None else eval_metric
        self.method = method
        self.use_weightedsum_in_stacking = use_weightedsum_in_stacking
        
        self._get_regressors()
        self._get_regressors_name()
        
        if use_weightedsum_in_stacking:
            self.stacking_feature = 'pred'
        else:
            self.stacking_feature = [f'pred{i+1}' for i in range(len(self.regressors))]
            
    def _get_regressors(self):
        max_depth = 9
        n_jobs = -1
        cat_loss_function = 'RMSE' # 'RMSE','MAE','MultiRMSE'
        cat_eval_metric = 'RMSE'   # 'RMSE','MAE','MultiRMSE'
        lgb_metric = 'rmse' # 'rmse','mean_absolute_error'
        cv = RepeatedKFold(n_splits=3, n_repeats=2, random_state=self.hyperparameters['random_state'])
        
        params_ridge = {
            'alphas' : [1e-5, 1e-3, 1e-1, 1.0, 10.0, 100.0],
            'cv' : cv,
        }
        
        params_lasso = {
            'alphas' : [1e-5, 1e-3, 1e-1, 1.0, 10.0, 100.0],
            'cv' : cv,
            'n_jobs' : n_jobs,
            #'max_iter' : 30000,
            'tol' : 0.001,
        }
        
        params_elasticnet = {
            'l1_ratio' : np.arange(0.1, 1, 0.1),
            'alphas' : [1e-5, 1e-3, 1e-1, 1.0, 10.0, 100.0],
            'cv' : cv,
            'n_jobs' : n_jobs,
            #'max_iter' : 30000,
            'tol' : 0.001,
        }
        
        params_catboost1 = {
            'random_state' : self.hyperparameters['random_state'],
            'iterations' : self.hyperparameters['iterations'],
            'early_stopping_rounds' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'loss_function' : cat_loss_function,
            #'loss_function' : cat_loss_function, 'eval_metric' : cat_eval_metric,
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth' : self.hyperparameters['max_depth'],
            #'l2_leaf_reg' : 1,
        }
        
        params_catboost2 = {
            'random_state' : self.hyperparameters['random_state'],
            'iterations' : self.hyperparameters['iterations'],
            'early_stopping_rounds' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'loss_function' : cat_loss_function,
            #'loss_function' : cat_loss_function, 'eval_metric' : cat_eval_metric,
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            #'max_depth' : self.hyperparameters['max_depth'],
            'l2_leaf_reg' : 3,
        }
        
        params_catboost3 = {
            'random_state' : self.hyperparameters['random_state'],
            'iterations' : self.hyperparameters['iterations'],
            'early_stopping_rounds' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'loss_function' : cat_loss_function,
            #'loss_function' : cat_loss_function, 'eval_metric' : cat_eval_metric,
            'grow_policy' : 'SymmetricTree', # 'Lossguide','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            #'max_depth' : self.hyperparameters['max_depth'],
            'l2_leaf_reg' : 1,
        }
        
        params_catboost4 = {
            'random_state' : self.hyperparameters['random_state'],
            'iterations' : self.hyperparameters['iterations'],
            'early_stopping_rounds' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'loss_function' : cat_loss_function,
            #'loss_function' : cat_loss_function, 'eval_metric' : cat_eval_metric,
            'grow_policy' : 'Depthwise', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth' : self.hyperparameters['max_depth'],
            'l2_leaf_reg' : 3,
        }
        
        params_lightgbm1 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_lambda' : 1,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm2 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_lambda' : 3,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm3 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 1,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm4 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 3,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm5 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 1,
            'reg_lambda' : 1,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm6 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 1,
            'reg_lambda' : 3,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm7 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 3,
            'reg_lambda' : 1,
            'n_jobs' : n_jobs,
        }
        
        params_lightgbm8 = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['iterations'],
            'early_stopping_round' : self.hyperparameters['early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['learning_rate'],
            'objective' : 'regression',
            'metric' : lgb_metric,
            'verbosity' : -1,
            'max_depth': self.hyperparameters['max_depth'],
            'reg_alpha' : 3,
            'reg_lambda' : 3,
            'n_jobs' : n_jobs,
        }
        
        params_xgboost = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['xgb_iterations'],
            'early_stopping_rounds' : self.hyperparameters['xgb_early_stopping_rounds'],
            'learning_rate' : self.hyperparameters['xgb_learning_rate'],
            'objective' : 'reg:squarederror',#'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': self.hyperparameters['max_depth'],
            'n_jobs' : n_jobs,
        }
        
        # params_extratrees = {
        #     'random_state' : self.hyperparameters['random_state'],
        #     'n_estimators' : self.hyperparameters['extratrees_iterations'],
        #     'criterion' : 'absolute_error',
        #     'verbose' : 0,
        #     'max_depth' : self.hyperparameters['max_depth'],
        #     'n_jobs' : n_jobs,
        # }
        
        params_hgb = {
            'random_state' : self.hyperparameters['random_state'],
            'max_iter' : self.hyperparameters['hgb_iterations'],
            'verbose' : 0,
            'max_depth' : self.hyperparameters['max_depth'],
        }
        
        params_rf = {
            'random_state' : self.hyperparameters['random_state'],
            'n_estimators' : self.hyperparameters['rf_iterations'],
            'verbose' : 0,
            'max_depth' : self.hyperparameters['max_depth'],
        }
        
        self.regressors = [
            # LinearRegression(),
            # RidgeCV(**params_ridge),
            # LassoCV(**params_lasso),
            # ElasticNetCV(**params_elasticnet),
            CatBoostRegressor(**params_catboost4),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lightgbm8),
            #RandomForestRegressor(**params_rf),
            #ExtraTreesRegressor(**params_extratrees),
            #HistGradientBoostingRegressor(**params_hgb),
        ]
        
        self.stacking_regressors = [
            # LinearRegression(),
            # RidgeCV(**params_ridge),
            # LassoCV(**params_lasso),
            # ElasticNetCV(**params_elasticnet),
            CatBoostRegressor(**params_catboost4),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lightgbm8),
            #RandomForestRegressor(**params_rf),
            #ExtraTreesRegressor(**params_extratrees),
            #HistGradientBoostingRegressor(**params_hgb),
        ]
        
    def _get_regressors_name(self):
        self.regressors_name = [type(r).__name__ for r in self.regressors]
        self.stacking_regressors_name = [type(r).__name__ for r in self.stacking_regressors]
        
    def _get_ohe(self,X,cat_features):
        ohe = OneHotEncoder()
        ohe.fit(X,cat_features,remove_first=False)
        return ohe
        
    def _set_zero_to_minimum(self,pred,minimum_value):
        pred = np.array(pred).flatten()
        if np.where(pred<0,1,0).sum()>0:
            pred = [x if x>0 else minimum_value for x in pred]
        pred = np.array(pred).flatten()
        return pred
    
    def _set_inf_to_maximum(self,pred,maximum_value):
        pred = np.array(pred).flatten()
        if np.where(pred==np.inf,1,0).sum()>0:
            pred = [x if x!=np.inf else maximum_value for x in pred]
        pred = np.array(pred).flatten()
        return pred
    
    def _preprocess(self,pred):
        # pred = self._set_zero_to_minimum(pred,self.minimum_value)
        # pred = self._set_inf_to_maximum(pred,self.maximum_value)
        return pred
    
    def _fit_regressor(self,
                       regressor,regressor_name,
                       features,oh_features,
                       X,X_oh,X_val,X_val_oh,y,y_val,cat_features,
                       sample_weight,eval_sample_weight):
        X = X[features]
        X_val = X_val[features]
        X_oh = X_oh[oh_features]
        X_val_oh = X_val_oh[oh_features]
        
        if (regressor_name in ['LinearRegression','RidgeCV','LassoCV']) or\
            (regressor_name.find('ExtraTreesRegressor')>=0) or\
            (regressor_name.find('RandomForestRegressor')>=0) or\
            (regressor_name.find('HistGradientBoostingRegressor')>=0) or\
            (regressor_name.find('ElasticNetCV')>=0):
            warnings.filterwarnings("ignore", category=UserWarning)
            # fitting
            regressor.fit(X_oh,y)
            # prediction
            tr_pred = self._preprocess(regressor.predict(X_oh))
            va_pred = self._preprocess(regressor.predict(X_val_oh))

        elif regressor_name.find('XGBRegressor')>=0:
            # fitting
            regressor.fit(
                X_oh,y,
                eval_set=[(X_val_oh,y_val)],
                verbose=0,
            )
            # prediction
            tr_pred = self._preprocess(regressor.predict(X_oh))
            va_pred = self._preprocess(regressor.predict(X_val_oh))

        elif regressor_name.find('CatBoostRegressor')>=0:
            # dataset
            train_dataset = Pool(X    ,y    ,cat_features=cat_features)
            val_dataset   = Pool(X_val,y_val,cat_features=cat_features)
            # fitting
            regressor.fit(
                train_dataset,
                eval_set=val_dataset,
                #metric_period=self.hyperparameters['iterations']//50, verbose=True,
                verbose=False,
            )
            # prediction
            tr_pred = self._preprocess(regressor.predict(train_dataset))
            va_pred = self._preprocess(regressor.predict(val_dataset))

        elif regressor_name.find('LGBMRegressor')>=0:
            warnings.filterwarnings("ignore", category=UserWarning)
            # astype category
            for col in cat_features:
                X[col]     = X[col]    .astype('category')
                X_val[col] = X_val[col].astype('category')
            # fitting
            regressor.fit(
                X,y,
                eval_set=[(X_val,y_val)],
                sample_weight=sample_weight,
                eval_sample_weight=eval_sample_weight,
                categorical_feature=cat_features,
                verbose=-1,
            )
            tr_pred = self._preprocess(regressor.predict(X))
            va_pred = self._preprocess(regressor.predict(X_val))
            # # fitting
            # regressor.fit(
            #     X_oh,y,
            #     eval_set=[(X_val_oh,y_val)],
            #     sample_weight=sample_weight,
            #     eval_sample_weight=eval_sample_weight,
            #     #categorical_feature=cat_features,
            #     verbose=-1,
            # )
            # tr_pred = self._preprocess(regressor.predict(X_oh))
            # va_pred = self._preprocess(regressor.predict(X_val_oh))
            
        else:
            raise ValueError('Unknown Regressor: {}'.format(regressor_name))
            
        return regressor, tr_pred, va_pred
            
    def _get_prediction_values(self,X,X_oh,method,regressors_name,regressors,weights,return_weighted):
        if method=='ensemble':
            features    = self.features
            oh_features = self.oh_features
        elif method=='stacking':
            stacking_feature = [self.stacking_feature] if isinstance(self.stacking_feature,str) else self.stacking_feature
            features    = self.features + stacking_feature
            oh_features = self.oh_features + stacking_feature
        
        # (1) 예측값생성
        pred_list = []
        for regressor_name,regressor in zip(regressors_name,regressors):
            if (regressor_name in ['LinearRegression','RidgeCV','LassoCV','ElasticNetCV','RandomForestRegressor']) or\
                (regressor_name.find('ExtraTreesRegressor')>=0) or\
                (regressor_name.find('RandomForestRegressor')>=0) or\
                (regressor_name.find('XGBRegressor')>=0) or\
                (regressor_name.find('HistGradientBoostingRegressor')>=0) or\
                (regressor_name.find('ElasticNetCV')>=0):
                dataset = X_oh[oh_features]
            elif regressor_name.find('CatBoostRegressor')>=0:
                dataset = Pool(X[features],cat_features=self.cat_features)
            elif regressor_name.find('LGBMRegressor')>=0:
                # dataset = X_oh[oh_features]
                dataset = X[features].copy()
                for col in self.cat_features:
                    dataset[col] = dataset[col].astype('category')
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))
            
            y_pred = self._preprocess(regressor.predict(dataset))
            pred_list.append(y_pred)
        
        # (2) return weighted or original value
        if return_weighted:
            final_pred = []
            for pred,weight in zip(pred_list,weights):
                p = np.array(pred)*weight
                final_pred.append(p)
            final_pred = np.sum(final_pred,axis=0)
            if self.inverse_transform is not None:
                final_pred = self.inverse_transform(np.array(final_pred))
                final_pred = self._preprocess(final_pred)
        else:
            final_pred = np.array(pred_list).T
            
        return final_pred
        
    def _predict(self,X,method,return_weighted=True):
        if len(self.cat_features)>0:
            X_oh = self.ohe.transform(X)
        else:
            X_oh = X.copy()
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"
        
        # (1) ensemble
        pred_list = self._get_prediction_values(
            X,X_oh,
            'ensemble',
            self.regressors_name,self.regressors,
            self.ensemble_weights,return_weighted,
        )
        
        if method=='ensemble':
            return pred_list
        
        elif method=='stacking':
            # (2) stacking
            columns = [self.stacking_feature] if isinstance(self.stacking_feature,str) else self.stacking_feature
            pred_df = pd.DataFrame(pred_list,columns=columns,index=X.index)
            
            X    = pd.concat([X   ,pred_df],axis=1)
            X_oh = pd.concat([X_oh,pred_df],axis=1)

            pred_list = self._get_prediction_values(
                X,X_oh,
                'stacking',
                self.stacking_regressors_name,self.stacking_regressors,
                self.stacking_weights,return_weighted,
            )
            return pred_list
        
    def get_feature_importance(self):
        # message_print = warnings.warn
        message_print = print
        supported_models = self.regressors_name
        
        # # feature_importances_를 지원하는 모델들
        # supported_models = ['CatBoostRegressor','XGBRegressor','LGBMRegressor','ExtraTreesRegressor',
        #                     'ExtraTreesRegressor', 'RandomForestRegressor', 'HistGradientBoostingRegressor']

        # # 지원하지않는 모델이 있는 경우 warning message
        # not_supported_1 = len([name for name in self.regressors_name if name not in supported_models])
        # not_supported_2 = 0
        # for name in self.regressors_name:
        #     for m in supported_models:
        #         k = 0 if name.find(m)>=0 else 1
        #         not_supported_2 += k

        # if not_supported_1 + not_supported_2 > 0:
        #     message_print("not support model")

        # get weighted feature importance by using ensemble_weights
        feature_importance_df = pd.DataFrame(self.features,columns=['feature'])
        for i,(regressor,regressor_name,weight) in enumerate(zip(self.regressors,self.regressors_name,self.ensemble_weights)):
            if regressor_name in supported_models:
                feature_importance = regressor.feature_importances_
                fi_list = []
                for feature in self.features:
                    fi = feature_importance[np.where(np.array(self.features)==feature)[0]]
                    fi_list.append([feature,sum(fi)])

                imp_col = f'importance{i}'
                fi_df = pd.DataFrame(fi_list,columns=['feature',imp_col]).sort_values(imp_col,ascending=False)
                fi_df[imp_col] = 100 * fi_df[imp_col] / fi_df[imp_col].sum()
                fi_df[imp_col] *= weight

                feature_importance_df = pd.merge(feature_importance_df,fi_df,how='left',on='feature')

        feature_importance_df = feature_importance_df.fillna(0)
        feature_importance_df['importance'] = feature_importance_df.drop('feature',axis=1).sum(axis=1)
        feature_importance_df = feature_importance_df[['feature','importance']]
        
        return feature_importance_df
        
    def plot_feature_importance(self):
        feature_importance_df = self.get_feature_importance()
        feature_importance_df.sort_values('importance',ascending=True,inplace=True)
        plt.figure(figsize=(15,7))
        plt.barh(feature_importance_df['feature'],feature_importance_df['importance'])
        plt.show()
            
    def fit(self,
            X,y,eval_set,cat_features=None,
            sample_weight=None,eval_sample_weight=None,verbose=1):
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        
        if len(self.regressors)!=len(self.regressors_name):
            self._get_regressors_name()
        
        self.sample_weight = sample_weight
        self.eval_sample_weight = eval_sample_weight
        self.cat_features = [] if cat_features is None else cat_features
        if cat_features is None:
            self.enable_categorical = [False]*X.shape[1]
        else:
            self.enable_categorical = [True if col in cat_features else False for col in X.columns]
        
        #----------------------------------------------------------------------------------------#
        # prepare dataset
        #----------------------------------------------------------------------------------------#
        X_val, y_val = eval_set[0]
        
        del_cols = return_unique_columns(X)
        X.drop(del_cols,axis=1,inplace=True)
        X_val.drop(del_cols,axis=1,inplace=True)
        self.cat_features = list(set(self.cat_features)-set(del_cols))
        
        if len(self.cat_features)>0:
            self.ohe = self._get_ohe(X,cat_features)
            X_oh = self.ohe.transform(X)
            X_val_oh = self.ohe.transform(X_val)
        else:
            X_oh = X.copy()
            X_val_oh = X_val.copy()
        
        del_oh_cols = return_unique_columns(X_oh)
        X_oh.drop(del_oh_cols,axis=1,inplace=True)
        X_val_oh.drop(del_oh_cols,axis=1,inplace=True)
        
        #----------------------------------------------------------------------------------------#
        # save feature names
        #----------------------------------------------------------------------------------------#
        self.features    = X   .columns.tolist()
        self.oh_features = X_oh.columns.tolist()
        
        #----------------------------------------------------------------------------------------#
        # true value
        #----------------------------------------------------------------------------------------#
        tr_true = np.array(y)
        va_true = np.array(y_val)
        if self.inverse_transform is not None:
            tr_true = self.inverse_transform(tr_true)
            va_true = self.inverse_transform(va_true)
        
        #----------------------------------------------------------------------------------------#
        # set min,max value
        #----------------------------------------------------------------------------------------#
        self.minimum_value = min(np.nanmin(y),np.nanmin(y_val))
        self.maximum_value = max(np.nanmax(y),np.nanmax(y_val))
        
        #----------------------------------------------------------------------------------------#
        # (1) ensemble fitting
        #----------------------------------------------------------------------------------------#
        # prepare ensemble fitting
        self.ensemble_scores = []
        self.ensemble_weights = []
        self.ensemble_fitting_elapsed = []
        ensemble_pbar = zip(self.regressors_name,self.regressors)

        # fitting
        if self.method=='stacking':
            if verbose:
                print('\n########  <Step1> Ensemble  ########')
        for fit_iter,(regressor_name,regressor) in enumerate(ensemble_pbar):
            s = time.time()
                
            # fit
            regressor, tr_pred, va_pred = self._fit_regressor(
                regressor, regressor_name,
                self.features,self.oh_features,
                X, X_oh, X_val, X_val_oh, y, y_val, self.cat_features,
                sample_weight, eval_sample_weight,
            )
            self.regressors[fit_iter] = regressor
            
            # progress
            if self.inverse_transform is not None:
                tr_pred = self.inverse_transform(tr_pred)
                tr_pred = self._preprocess(tr_pred)
                va_pred = self.inverse_transform(va_pred)
                va_pred = self._preprocess(va_pred)
            
            tr_score = self.eval_metric(y_pred=tr_pred,y_true=tr_true)
            va_score = self.eval_metric(y_pred=va_pred,y_true=va_true)
            
            e = time.time()
            self.ensemble_scores.append(va_score)
            self.ensemble_weights.append(1/va_score)
            self.ensemble_fitting_elapsed.append(e-s)
            
            if verbose:
                blank = ' '*(11-len(regressor_name))
                fit_progress = '[{}/{}] {}{}: loss={:.3f}, val_loss={:.3f}, elasped={:.1f}s'\
                    .format(fit_iter+1,len(self.regressors),regressor_name,blank,tr_score,va_score,e-s)
                print(fit_progress)
            
        # get weighted prediction & score
        if self.weight=='equal':
            self.ensemble_weights = np.array([1.0 for _ in self.regressors])
        self.ensemble_weights /= sum(self.ensemble_weights)
        
        tr_pred = self._predict(X,method='ensemble',return_weighted=True)
        va_pred = self._predict(X_val,method='ensemble',return_weighted=True)
        
        ## -> self.predict에서 inverse_transform 해줌
        # if self.inverse_transform is not None:
        #     tr_pred = self.inverse_transform(tr_pred)
        #     va_pred = self.inverse_transform(va_pred)
        
        ens_tr_score = self.eval_metric(y_true=tr_true,y_pred=tr_pred)
        ens_va_score = self.eval_metric(y_true=va_true,y_pred=va_pred)
        
        if verbose:
            ens_fit_progress = "<Weighted Ensemble(weight='{}')> loss={:.3f}, val_loss={:.3f}, elasped={:.1f}s"\
                .format(self.weight,ens_tr_score,ens_va_score,sum(self.ensemble_fitting_elapsed))
            print(ens_fit_progress)
        
        if self.method=='ensemble':
            self.total_score = ens_va_score
            
        elif self.method=='stacking':
            #----------------------------------------------------------------------------------------#
            # (2) stacking fitting
            #----------------------------------------------------------------------------------------#
            tr_pred = self._predict(X,method='ensemble',return_weighted=self.use_weightedsum_in_stacking)
            va_pred = self._predict(X_val,method='ensemble',return_weighted=self.use_weightedsum_in_stacking)

            stacking_columns = [self.stacking_feature] if isinstance(self.stacking_feature,str) else self.stacking_feature
            tr_pred_df = pd.DataFrame(tr_pred,columns=stacking_columns,index=X.index)
            va_pred_df = pd.DataFrame(va_pred,columns=stacking_columns,index=X_val.index)
            
            X        = pd.concat([X       ,tr_pred_df],axis=1)
            X_oh     = pd.concat([X_oh    ,tr_pred_df],axis=1)
            X_val    = pd.concat([X_val   ,va_pred_df],axis=1)
            X_val_oh = pd.concat([X_val_oh,va_pred_df],axis=1)

            # prepare stacking fitting
            self.stacking_scores = []
            self.stacking_weights = []
            self.stacking_fitting_elapsed = []

            stacking_regressors = deepcopy(self.stacking_regressors)
            stacking_pbar = zip(self.stacking_regressors_name,stacking_regressors)

            if verbose:
                print('\n########  <Step2> Stacking  ########')
            self.stacking_regressors = []
            for fit_iter,(regressor_name,regressor) in enumerate(stacking_pbar):
                s = time.time()

                # fitting
                stacking_regressor, tr_pred, va_pred = self._fit_regressor(
                    regressor, regressor_name,
                    self.features+stacking_columns,self.oh_features+stacking_columns,
                    X, X_oh, X_val, X_val_oh, y, y_val, self.cat_features,
                    sample_weight, eval_sample_weight,
                )
                self.stacking_regressors.append(stacking_regressor)

                # progress
                if self.inverse_transform is not None:
                    tr_pred = self.inverse_transform(tr_pred)
                    tr_pred = self._preprocess(tr_pred)
                    va_pred = self.inverse_transform(va_pred)
                    va_pred = self._preprocess(va_pred)

                tr_score = self.eval_metric(y_pred=tr_pred,y_true=tr_true)
                va_score = self.eval_metric(y_pred=va_pred,y_true=va_true)

                e = time.time()
                self.stacking_scores.append(va_score)
                self.stacking_weights.append(1/va_score)
                self.stacking_fitting_elapsed.append(e-s)

                if verbose:
                    blank = ' '*(11-len(regressor_name))
                    iter_str = str(fit_iter+1).zfill(len(str(len(stacking_regressors))))
                    fit_progress = '[{}/{}] {}{}: loss={:.3f}, val_loss={:.3f}, elasped={:.1f}s'\
                        .format(iter_str,len(stacking_regressors),regressor_name,blank,tr_score,va_score,e-s)
                    print(fit_progress)

            # get weighted prediction & score
            if self.weight=='equal':
                self.stacking_weights = np.array([1.0 for _ in self.stacking_regressors])
            self.stacking_weights /= sum(self.stacking_weights)

            tr_pred = self._predict(
                X.drop(self.stacking_feature,axis=1),
                method='stacking',
                return_weighted=True,
            )
            va_pred = self._predict(
                X_val.drop(self.stacking_feature,axis=1),
                method='stacking',
                return_weighted=True,
            )

            ## -> self.predict에서 inverse_transform 해줌
            # if self.inverse_transform is not None:
            #     tr_pred = self.inverse_transform(tr_pred)
            #     va_pred = self.inverse_transform(va_pred)

            stacking_tr_score = self.eval_metric(y_true=tr_true,y_pred=tr_pred)
            stacking_va_score = self.eval_metric(y_true=va_true,y_pred=va_pred)

            if verbose:
                stacking_fit_progress = "<Weighted Stacking(weight='{}')> loss={:.3f}, val_loss={:.3f}, elasped={:.1f}s"\
                    .format(self.weight,stacking_tr_score,stacking_va_score,sum(self.stacking_fitting_elapsed))
                print(stacking_fit_progress)

            self.total_score = stacking_va_score
            
        #self.feature_importances_ = self.get_feature_importance()['importance'].values.tolist()

    def predict(self,X,method=None):
        if method is None:
            method = self.method
        if (self.method=='ensemble') & (method=='stacking'):
            raise ValueError("The training method is 'ensemble', so 'stacking' prediction is not possible")
        return self._predict(X,method=self.method,return_weighted=self.use_weightedsum_in_stacking)
        
    def save(self,path):
        save_dict = {
            'ohe' : self.ohe,
            'cat_features' : self.cat_features,
            'minimum_value' : self.minimum_value,
            'maximum_value' : self.maximum_value,
            'features' : self.features,
            'oh_features' : self.oh_features,
            'hyperparameters' : self.hyperparameters,
            'inverse_transform' : self.inverse_transform,
            'sample_weight' : self.sample_weight,
            'eval_sample_weight' : self.eval_sample_weight,
            'regressors' : self.regressors,
            'ensemble_weights' : self.ensemble_weights,
            'ensemble_fitting_elapsed' : self.ensemble_fitting_elapsed,
            'ensemble_scores' : self.ensemble_scores,
            'total_score' : self.total_score,
            #'feature_importances_' : self.feature_importances_,
        }
        if self.method=='stacking':
            additional_save_dict = {
                'stacking_regressors' : self.stacking_regressors,
                'stacking_weights' : self.stacking_weights,
                'stacking_fitting_elapsed' : self.stacking_fitting_elapsed,
                'stacking_scores' : self.stacking_scores,
            }
            save_dict = {**save_dict,**additional_save_dict}
        with open(path, 'wb') as f:
            #pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
            dill.dump(save_dict, f)
            
    def load(self,path):
        with open(path, 'rb') as f:
            #save_dict = pickle.load(f)
            save_dict = dill.load(f)
            self.ohe = save_dict['ohe']
            self.cat_features = save_dict['cat_features']
            self.minimum_value = save_dict['minimum_value']
            self.maximum_value = save_dict['maximum_value']
            self.features = save_dict['features']
            self.oh_features = save_dict['oh_features']
            self.hyperparameters = save_dict['hyperparameters']
            self.inverse_transform = save_dict['inverse_transform']
            self.sample_weight = save_dict['sample_weight']
            self.eval_sample_weight = save_dict['eval_sample_weight']
            self.regressors = save_dict['regressors']
            self.ensemble_weights = save_dict['ensemble_weights']
            self.ensemble_fitting_elapsed = save_dict['ensemble_fitting_elapsed']
            self.ensemble_scores = save_dict['ensemble_scores']
            self.total_score = save_dict['total_score']
            #self.feature_importances_ = save_dict['feature_importances_']
            
            if self.method=='stacking':
                self.stacking_regressors = save_dict['stacking_regressors']
                self.stacking_weights = save_dict['stacking_weights']
                self.stacking_fitting_elapsed = save_dict['stacking_fitting_elapsed']
                self.stacking_scores = save_dict['stacking_scores']

In [44]:
class KfoldWeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,
                 hyperparameters,
                 method='ensemble',
                 weight='balanced',
                 use_weightedsum_in_stacking=True,
                 inverse_transform=None,
                 eval_metric=None,
                 n_splits=5,
                 random_state=0):
        super().__init__()
        
        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        
        self.hyperparameters = hyperparameters
        self.method = method
        self.weight = weight
        self.use_weightedsum_in_stacking = use_weightedsum_in_stacking
        self.inverse_transform = inverse_transform
        self.eval_metric = RMSE if eval_metric is None else eval_metric
        self.n_splits = n_splits
        self.random_state = random_state
        
    def get_feature_importance(self):
        fs = [m.features for m in self.base_models]
        fs = list(set(item for sublist in fs for item in sublist))
        feature_importance_df = pd.DataFrame(fs,columns=['feature'])

        for i,(base_model,base_feature_importance) in enumerate(zip(self.base_models,self.base_feature_importances)):
            imp_col = f'imp{i}'
            imp_df = pd.DataFrame({
                'feature' : base_model.features,
                imp_col : base_feature_importance,
            })
            feature_importance_df = pd.merge(feature_importance_df,imp_df,how='left',on='feature')

        feature_importance_df.fillna(0,inplace=True)
        feature_importance_df['importance'] = feature_importance_df.drop('feature',axis=1).sum(axis=1)
        feature_importance_df['importance'] = 100 * feature_importance_df['importance'] / feature_importance_df['importance'].sum()
        
        return feature_importance_df
        
    def plot_feature_importance(self):
        feature_importance_df = self.get_feature_importance()
        feature_importance_df.sort_values('importance',ascending=True,inplace=True)

        plt.figure(figsize=(15,7))
        plt.barh(feature_importance_df.feature,feature_importance_df.importance)
        plt.show()
        
    def fit(self,X,y,cat_features=None,sample_weight=None,verbose=True):
        self.cat_features = [] if cat_features is None else cat_features
        self.sample_weight = sample_weight
        self.features = X.columns.tolist()

        self.base_models = []
        self.base_scores = []
        self.base_feature_importances = []
        kf = KFold(n_splits=self.n_splits,random_state=self.random_state,shuffle=True)

        progress_fmt = '> KFold: {}/{}'
        for k, (tr_idx, val_idx) in enumerate(kf.split(X)):
            k_str = str(k+1).zfill(len(str(self.n_splits)))
            print('')
            print('-'*80)
            print(progress_fmt.format(k_str,self.n_splits))
            print('-'*80)
            
            X_tr, X_va = X.iloc[tr_idx], X.iloc[val_idx]
            y_tr, y_va = y.iloc[tr_idx], y.iloc[val_idx]
            
            if self.sample_weight is None:
                sample_weight = None
                eval_sample_weight = None
            else:
                sample_weight = self.sample_weight[tr_idx]
                eval_sample_weight = self.sample_weight[val_idx]

            #------------------------------------------------------------------------------------#
            # (1) base model
            #------------------------------------------------------------------------------------#
            # define the base model
            base_model = WeightedEnsembleRegressor(
                hyperparameters,
                weight='balanced', # 'equal','balanced',
                inverse_transform=self.inverse_transform,
                eval_metric=self.eval_metric,
                method=self.method, # 'ensemble','stacking'
                use_weightedsum_in_stacking=self.use_weightedsum_in_stacking,
            )
            # fit the model
            base_model.fit(
                X_tr,y_tr,
                eval_set=[(X_va,y_va)],
                cat_features=cat_features,
                sample_weight=sample_weight,
                eval_sample_weight=[eval_sample_weight],
                verbose=verbose,
            )
            
            # prediction
            y_pred = base_model.predict(X_va)
            if self.inverse_transform is not None:
                y_true = self.inverse_transform(y_va.values)
            else:
                y_true = y_va.values
            
            # caculate score
            score = mean_squared_error(y_true=y_true,y_pred=y_pred)**0.5

            # append inner loop
            self.base_models.append(base_model)
            self.base_scores.append([k+1,len(X_tr),len(X_va),score])
            
            # # plot feature importance
            # self.base_feature_importances.append(base_model.feature_importances_)
            # base_model.plot_feature_importance()
        
        self.base_score = pd.DataFrame(self.base_scores,columns=['k','n_train','n_val','rmse'])
        self.validation_score = self.base_score.rmse.mean()
        
        #self.plot_feature_importance()
        
    def predict(self,X):
        pred = [base_model.predict(X) for base_model in self.base_models]
        pred = np.mean(pred,axis=0)
        return pred
    
    def save(self,path):
        save_dict = {
            'hyperparameters' : self.hyperparameters,
            'weight' : self.weight,
            'n_splits' : self.n_splits,
            'random_state' : self.random_state,
            'inverse_transform' : self.inverse_transform,
            'cat_features' : self.cat_features,
            'sample_weight' : self.sample_weight,
            'base_models' : self.base_models,
            'base_scores' : self.base_scores,
            'base_score' : self.base_score,
            'validation_score' : self.validation_score,
            'base_feature_importances' : self.base_feature_importances,
        }
        with open(path, 'wb') as f:
            #pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
            dill.dump(save_dict, f)
            
    def load(self,path):
        with open(path, 'rb') as f:
            #save_dict = pickle.load(f)
            save_dict = dill.load(f)
            
            self.hyperparameters = save_dict['hyperparameters']
            self.weight = save_dict['weight']
            self.n_splits = save_dict['n_splits']
            self.random_state = save_dict['random_state']
            self.inverse_transform = save_dict['inverse_transform']
            self.cat_features = save_dict['cat_features']
            self.sample_weight = save_dict['sample_weight']
            self.base_models = save_dict['base_models']
            self.base_scores = save_dict['base_scores']
            self.base_score = save_dict['base_score']
            self.validation_score = save_dict['validation_score']
            self.base_feature_importances = save_dict['base_feature_importances']

In [45]:
hyperparameters = {
    'max_depth' : 9,
    'random_state' : CFG.SEED,
    'learning_rate' : 0.05,
    'iterations' : 30000,
    'early_stopping_rounds' : 300,
    'xgb_learning_rate' : 0.3,         # default=0.3
    'xgb_iterations' : 3000,           # default=100
    'xgb_early_stopping_rounds' : 100,
    'rf_iterations' : 100,
    'hgb_iterations' : 100,            # default=100
    #'extratrees_iterations' : 100,    # default=100
}

In [51]:
mc_path_fmt = './mc/weiens_model_log_target{}.pkl'

In [55]:
%%time

models = []
for i,target in enumerate(CFG.TARGET):
    print('#'*80)
    print('### [{}/{}] {}'.format(i+1,len(CFG.TARGET),target))
    print('#'*80)
    
    X = train_df.drop(CFG.TARGET,axis=1)
    y = train_df[target]
    X_test = test_df.copy()

    unique_info = X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    unuse_cols = ['ECLO']

    drop_cols = unique_cols+unuse_cols
    X     .drop(list(set(X.columns)&set(drop_cols)),axis=1,inplace=True)
    X_test.drop(list(set(X_test.columns)&set(drop_cols)),axis=1,inplace=True)
    
    model = KfoldWeightedEnsembleRegressor(
        hyperparameters,
        method='stacking', #'ensemble',
        weight='balanced',
        use_weightedsum_in_stacking=True,
        inverse_transform=target_transform.inverse_transform,
        eval_metric=RMSE,
        n_splits=CFG.N_SPLITS,
        random_state=CFG.SEED,
    )
    model.fit(
        X,y,
        cat_features=None,
        sample_weight=None,
        verbose=True,
    )
    mc_path = mc_path_fmt.format(i+1)
    model.save(mc_path)
    models.append(model)

################################################################################
### [1/4] 사망자수
################################################################################

--------------------------------------------------------------------------------
> KFold: 1/5
--------------------------------------------------------------------------------

########  <Step1> Ensemble  ########
[1/3] CatBoostRegressor: loss=0.083, val_loss=0.090, elasped=7.5s
[2/3] XGBRegressor: loss=0.061, val_loss=0.093, elasped=4.0s
[3/3] LGBMRegressor: loss=0.089, val_loss=0.090, elasped=1.5s
<Weighted Ensemble(weight='balanced')> loss=0.078, val_loss=0.090, elasped=12.9s

########  <Step2> Stacking  ########
[1/3] CatBoostRegressor: loss=0.088, val_loss=0.090, elasped=7.3s
[2/3] XGBRegressor: loss=0.019, val_loss=0.126, elasped=2.1s
[3/3] LGBMRegressor: loss=0.088, val_loss=0.090, elasped=0.9s
<Weighted Stacking(weight='balanced')> loss=0.072, val_loss=0.092, elasped=10.3s

------------------------------

In [69]:
tr_prediction = []
te_prediction = []
for i,target in enumerate(CFG.TARGET):
    X = train_df.drop(CFG.TARGET,axis=1)
    y = train_df[target]
    X_test = test_df.copy()

    unique_info = X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    unuse_cols = ['ECLO']

    drop_cols = unique_cols+unuse_cols
    X     .drop(list(set(X.columns)&set(drop_cols)),axis=1,inplace=True)
    X_test.drop(list(set(X_test.columns)&set(drop_cols)),axis=1,inplace=True)
    
    model = models[i]
    
    tr_prediction.append(model.predict(X).flatten().tolist())
    te_prediction.append(model.predict(X_test).flatten().tolist())
    
tr_prediction = np.array(tr_prediction).T
te_prediction = np.array(te_prediction).T

In [79]:
for i in range(len(CFG.TARGET)):
    pred = tr_prediction[:,i]
    true = target_transform.inverse_transform(train_df[CFG.TARGET[i]])
    rmse = mean_squared_error(pred,true)**0.5
    print('Target: {}, RMSE: {:.3f}'.format(CFG.TARGET[i],rmse))

Target: 사망자수, RMSE: 0.075
Target: 중상자수, RMSE: 0.458
Target: 경상자수, RMSE: 0.943
Target: 부상자수, RMSE: 0.360


In [89]:
tr_pred = tr_prediction @ [10,5,3,1]
tr_true = train_df['ECLO'].values
print('RMSE: {:.3f}, RMSLE: {:.3f}'.format(RMSE(tr_true,tr_pred),RMSLE(tr_true,tr_pred)))

RMSE: 3.121, RMSLE: 0.418


In [88]:
te_pred = te_prediction @ [10,5,3,1]

In [90]:
# 0.436056003
submit = pd.read_csv('./data/sample_submission.csv')
submit['ECLO'] = te_pred
submit.to_csv('./out/15_weiens_stacking_merge_additional_data_target4.csv',index=False)
submit.head()

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.321914
1,ACCIDENT_39610,3.200378
2,ACCIDENT_39611,3.964339
3,ACCIDENT_39612,4.110498
4,ACCIDENT_39613,4.153589
