# Library Settings

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [2]:
class CFG:
    """Notebook-wide configuration constants read by the cells below."""
    # Random seed passed to KFold and to each regressor's random_state.
    SEED = 0
    
    # Largest number of categorical columns combined into one group when the
    # target-quantile features are built.
    SUBSET_DEPTH = 3
    # Pairwise interaction terms are generated only when this is True.
    INTERACTION = False
    
    # Number of folds used by KFold.
    N_SPLITS = 5
    
    # Learning rate, iteration cap and early-stopping patience used for CatBoost and LightGBM.
    LR = 0.03
    EPOCHS = 10000
    ES = 300
    # Learning rate, number of estimators and early-stopping patience used for XGBoost.
    XGB_LR = 0.3     # default
    XGB_EPOCHS = 100 # default
    XGB_ES = 10

<br></br>

# Data

## Data Load

In [3]:
# Load the training and test tables from the local ./data directory.
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [4]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [5]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Resetting Column Types

In [6]:
class TypeResetting:
    """Sorts the columns of a frame into roles and casts each role to one dtype.

    Roles: target, unused, dummy (0/1 fuel-type flags), categorical and numerical.
    Numerical columns are whatever is left once the other roles are removed.
    """

    def __init__(self):
        # Columns treated as categorical from the start.
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']

    def add_categorical_features(self,cat_features):
        """Register extra categorical column names (the internal list is extended in place)."""
        self.cat_features.extend(cat_features)

    def fit(self,data):
        """Fix the role of every column of ``data``; call before ``transform``."""
        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']
        assigned = self.target_feature+self.unuse_features+self.dummy_features+self.cat_features
        # Anything without an explicit role is numerical.
        self.num_features = [name for name in data.columns if name not in assigned]

    def transform(self,data):
        """Return a copy of ``data`` with dtypes cast per role and unused columns removed."""
        out = data.copy()
        # (columns, dtype that needs no cast, dtype to cast to) -- applied in this order.
        cast_plan = (
            (self.dummy_features, int,    int),
            (self.cat_features,   object, str),
            (self.num_features,   float,  float),
        )
        for names,current,wanted in cast_plan:
            for name in names:
                if out[name].dtypes!=current:
                    out[name] = out[name].astype(wanted)
        # Unused columns may be absent, so only drop the ones that exist.
        to_drop = [name for name in self.unuse_features if name in out.columns]
        for name in to_drop:
            out.drop(name,axis=1,inplace=True)
        return out

    def fit_transform(self,data):
        """``fit`` followed by ``transform`` on the same frame."""
        self.fit(data)
        return self.transform(data)

    def get_feature_type(self):
        """Publish the five role lists as module-level globals of the same name."""
        namespace = globals()
        for role in ('target_feature','unuse_features','dummy_features','cat_features','num_features'):
            namespace[role] = getattr(self,role)

In [7]:
# Determine the role of every column from the training frame and publish the resulting
# lists (target_feature, unuse_features, dummy_features, cat_features, num_features) as globals.
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

# Apply the same dtype casts to both datasets; the 'ID' column is dropped.
train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [8]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    """Report, per column, how many distinct values occur in only one of the two frames.

    Prints the counts for every column of ``cat_features`` and returns the columns
    for which the test frame has no value that is missing from the training frame.
    """
    total = len(cat_features)
    safe_columns = []
    for position,col in enumerate(cat_features,start=1):
        print('[{}/{}] {}'.format(position,total,col))

        train_values = set(train[col].unique())
        test_values  = set(test[col].unique())
        only_train = list(train_values-test_values)
        only_test  = list(test_values-train_values)
        print(' - Only Train:',len(only_train))
        print(' - Only Test :',len(only_test))
        if only_test:
            # The test frame has values never seen in training for this column.
            print('******Warning******')
        else:
            safe_columns.append(col)
        print('')
    return safe_columns

In [9]:
# Brand, car model name, sales region, model release year
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
# Keep only the categorical columns; the fuel-type dummy flags are removed from the result again.
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [10]:
not_test_only_features

['브랜드', '차량모델명', '모델출시년도', '판매구역']

In [11]:
# Number of training rows per (brand, model, fuel-type flags) segment, smallest segments first.
seg_df = (
    train_df
    .groupby(['브랜드','차량모델명']+dummy_features)
    .size()
    .reset_index(name='cnt')
    .sort_values('cnt')
)
print(seg_df.shape)
seg_df.head()

(430, 8)


Unnamed: 0,브랜드,차량모델명,압축천연가스(CNG),액화석유가스(LPG),경유,가솔린,하이브리드,cnt
214,mercedes-benz,gle-klasa,0,0,0,1,0,1
42,bmw,seria-5,0,0,0,0,1,1
63,bmw,x6,0,0,0,0,1,1
250,nissan,patrol,0,0,0,1,0,1
81,fiat,doblo,1,0,0,0,0,1


<br></br>

# New Features

In [12]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


In [13]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    """Return every subset of ``ss`` as a list of tuples, ordered by size (empty tuple first)."""
    subsets = []
    for size in range(len(ss)+1):
        subsets.extend(combinations(ss, size))
    return subsets

class FeatureEngineering:
    """Adds derived columns and per-group target-quantile features.

    ``fit`` learns, for every combination of up to ``subset_depth`` categorical columns,
    the 0/25/50/75/100 % quantiles of the target column '가격' within each group.
    ``transform`` adds the derived columns and left-merges the learned quantile tables.
    """

    # Brand -> country of origin (looked up by hand); brands not listed map to NaN.
    _BRAND_COUNTRY = {
        'skoda':'체코',
        'toyota':'일본','nissan':'일본','mazda':'일본','honda':'일본','mitsubishi':'일본',
        'mercedes-benz':'독일','audi':'독일','volkswagen':'독일','bmw':'독일','opel':'독일',
        'fiat':'이탈리아',
        'renault':'프랑스','citroen':'프랑스','peugeot':'프랑스',
        'ford':'미국',
        'kia':'한국','hyundai':'한국',
        'seat':'스페인',
        'volvo':'스웨덴',
    }

    # Country -> continent; countries not listed (including NaN) map to NaN.
    _COUNTRY_CONTINENT = {
        '체코':'유럽','독일':'유럽','이탈리아':'유럽','프랑스':'유럽','스페인':'유럽','스웨덴':'유럽',
        '일본':'아시아','한국':'아시아',
        '미국':'아메리카',
    }

    def __init__(self):
        pass

    def _get_quantile(self,x,col):
        """Return a one-row frame with the 0/25/50/75/100 % quantiles of the non-null values of ``x``.

        Columns are named ``f'{col}_Q{q}'``.
        """
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]
        return pd.DataFrame(
            {f'{col}_Q{q}': np.quantile(x,q/100) for q in [0,25,50,75,100]},
            index=[0],
        )

    def _derived_features(self,data):
        """Return a copy of ``data`` with the five derived columns added."""
        d = data.copy()
        produced = d['생산년도'].astype(float)
        released = d['모델출시년도'].astype(float)

        # (1) Was the car produced in the year the model was released?
        d['출시년도생산여부'] = np.where(produced==released,1,0)

        # (2) Number of years between the model release and the production of the car
        d['출시이후생산년수'] = produced-released

        # (3) Was the car produced before the model release year?
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) Country of origin of the brand
        d['브랜드국적'] = d['브랜드'].map(self._BRAND_COUNTRY)

        # (5) Continent of the brand's country
        d['브랜드대륙명'] = d['브랜드국적'].map(self._COUNTRY_CONTINENT)
        return d

    def fit(self,data,cat_features,subset_depth=1):
        """Learn the target quantiles for every group of 1..``subset_depth`` categorical columns.

        Parameters
        ----------
        data : DataFrame containing the target column '가격' and all ``cat_features``.
        cat_features : list of column names to group by.
        subset_depth : maximum number of columns combined into one grouping.

        Raises
        ------
        AssertionError if '가격' is missing or ``subset_depth`` exceeds ``len(cat_features)``.
        """
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'

        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명']

        # (6) Quantiles of the price for every combination of categorical columns
        all_subset_list = [subset for subset in all_subsets(cat_features)
                           if 1<=len(subset)<=subset_depth]

        self.agg_dict = {}
        for subset in tqdm(all_subset_list,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset = list(subset)
            subset_name = '_'.join(subset)
            agg_fn = data.groupby(subset)['가격'].apply(lambda x: self._get_quantile(x,subset_name)).reset_index()
            # reset_index() exposes the inner index of the one-row frames as 'level_N';
            # remove it, but never a real grouping column that happens to contain 'level_'.
            drop_cols = [col for col in agg_fn if col not in subset and col.find('level_')>=0]
            agg_fn.drop(columns=drop_cols,inplace=True)
            self.agg_dict[subset_name] = agg_fn

    def transform(self,data):
        """Return ``data`` with the derived columns and all learned quantile columns added."""
        data = self._derived_features(data)
        for key,agg_fn in self.agg_dict.items():
            # Every column of the table that is not one of its quantile columns is a merge key.
            # Splitting ``key`` on '_' instead would break for column names containing '_'.
            quantile_prefix = f'{key}_Q'
            merge_keys = [col for col in agg_fn.columns if not col.startswith(quantile_prefix)]
            data = pd.merge(data,agg_fn,how='left',on=merge_keys)
        return data

    def fit_transform(self,data,cat_features,subset_depth=1):
        """``fit`` on ``data`` and return ``transform(data)``."""
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [14]:
# Learn the per-group price quantiles from the training frame, grouping only by the
# categorical columns returned by check_only_oneside.
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
# Add the derived columns and merge the learned quantile tables onto both datasets.
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=3): 100%|██████████| 14/14 [00:16<00:00,  1.17s/it]


In [15]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명']

In [16]:
# Re-derive the column roles now that new columns exist; the derived columns are
# registered as categorical before fitting, and the global feature lists are refreshed.
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [17]:
train_df3.shape

(57920, 89)

<br></br>

# EDA

In [18]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [19]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    """Absolute value of the Pearson correlation coefficient between ``x`` and ``y``."""
    corr_matrix = np.corrcoef(x,y)
    return np.abs(corr_matrix[0,1])

class InteractionTerm:
    """Selects pairwise products of numerical columns and adds them as new columns.

    A product ``a*b`` is kept only when its absolute correlation with both ``a`` and ``b``
    stays below the cut-off.
    """

    def __init__(self):
        pass

    def fit(self,data,num_features,corr_cutoff=0.7):
        """Decide which products to create; the result is stored in ``interaction_list``.

        Each entry is the string ``'<col_i>*<col_j>'`` where ``col_i`` comes later in
        ``num_features`` than ``col_j``.
        """
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

        frame = data.copy()
        self.interaction_list = []
        for i in range(len(num_features)):
            # Only pairs with j < i, so every unordered pair is visited once.
            for j in range(i):
                col_i,col_j = num_features[i],num_features[j]
                product = frame[col_i]*frame[col_j]

                # No interaction is created when the correlation reaches the cut-off.
                redundant = (get_abs_corr(product,frame[col_i])>=corr_cutoff) | (get_abs_corr(product,frame[col_j])>=corr_cutoff)
                if not redundant:
                    self.interaction_list.append(f'{col_i}*{col_j}')

    def transform(self,data):
        """Return a copy of ``data`` with one product column per selected interaction."""
        out = data.copy()
        for name in self.interaction_list:
            left,right = name.split('*')
            out[name] = out[left]*out[right]
        return out

    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        """``fit`` on ``data`` and return ``transform(data)``."""
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [20]:
# Stage 4 starts as a plain copy of stage 3, so the later cells work whether or not
# interaction terms are enabled.
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    # Choose the interaction terms on the training frame only, then add them to both frames.
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.7,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    # Refresh the column roles (and the global feature lists) for the new columns.
    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

<br></br>

# Feature Selection

In [21]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [22]:
# Significance level that the p-values in the feature-selection cells are compared against.
alpha = 0.05

In [23]:
def log_offset(x):
    """Return ``log(x + offset)`` with an offset that keeps every argument positive.

    * all values positive      -> offset 0
    * minimum equal to zero    -> offset 1e-3
    * negative minimum         -> offset ``-min(x) + 1e-3`` (the minimum is also printed)

    ``x`` must support ``min(x)`` and element-wise addition (NumPy array or pandas Series).
    """
    minimum = min(x)
    if minimum>0:
        offset = 0
    elif minimum==0:
        offset = 1e-3
    else:
        # Shift up by the magnitude of the minimum. Adding the (negative) minimum itself
        # pushes the values further below zero and the log becomes NaN.
        offset = -minimum+1e-3
        print('minimum = {:.3f}'.format(minimum))
    return np.log(x+offset)

<br></br>

## Categorical Features

In [24]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [25]:
# Only categorical columns with at most 100 distinct values are tested.
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) Run an ANOVA of the price on each feature to find those with a p-value above 0.05
pvalue_list = []
for col in tqdm(check_cat_features):
    # The column is renamed to 'feature' so the same formula string works for every column.
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    # First row, last column of the ANOVA table is read as the p-value of the feature term.
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 9/9 [00:03<00:00,  2.36it/s]


In [26]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
# (2) Features that were not significant in (1) are re-tested after a log transform;
#     the next cell drops those that are still not significant.
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>alpha].feature.tolist()
for col in tqdm(unsignificant_features):
    # The target column of this dataset is '가격'; there is no 'target' column, so selecting
    # it raised a KeyError as soon as the list above was non-empty.
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    # NOTE(review): TypeResetting casts categorical columns to str and log_offset needs
    # numeric input -- confirm how string-valued features should be handled here.
    d['feature'] = log_offset(d['feature'])
    
    model = ols('feature ~ C(가격)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list2.append([col,pvalue])
    
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [27]:
# Features that stay above alpha after the log transform are deleted; the ones that fall
# to alpha or below are kept in log-transformed form.
delete_features = pvalue_df2[pvalue_df2.pvalue> alpha].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=alpha].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

# Apply the same drop / log steps to the training frame ...
train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
# ... and to the test frame.
# NOTE(review): log_offset derives its offset from the frame it is given, so train and
# test can end up with different offsets for the same column -- confirm this is intended.
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [28]:
# Import the submodule explicitly: a bare ``import scipy`` does not guarantee that
# ``scipy.stats`` is loaded, so the attribute access below could fail on its own.
import scipy.stats

# p-value of the Pearson correlation between the price and every numerical feature.
pvalue_list = []
for col in num_features:
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])

In [29]:
# Tabulate the p-values, least significant feature first, and show the top rows.
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

pvalue_df.round(4).head()

Unnamed: 0,feature,pvalue
17,판매구역_Q0,0.3518
21,판매구역_Q100,0.0
2,브랜드_Q0,0.0
18,판매구역_Q25,0.0
6,브랜드_Q100,0.0


<br></br>

# Modeling

In [30]:
import os
def mkdir(paths):
    """Create the given directory or directories if they do not exist yet.

    ``paths`` is a single path (``str`` or ``os.PathLike``) or an iterable of paths.
    Missing parent directories are created as well. A message is printed for every
    directory that had to be created; existing directories are left untouched.
    """
    if isinstance(paths,(str,os.PathLike)):
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            # makedirs also handles nested paths, which os.mkdir rejects.
            os.makedirs(path,exist_ok=True)

In [31]:
mkdir('./model_checkpoints')

<br>

## CatBoost
- public score : 6.2625421575

In [32]:
from sklearn.model_selection import train_test_split

In [None]:
# X = train_df5.drop(target_feature,axis=1)
# y = train_df5[target_feature]

# X_train, X_val, y_train, y_val = train_test_split(X,y,test_size=0.2,random_state=CFG.SEED)

In [33]:
gc.collect()

0

In [None]:
%%time
# Takes about 22 minutes.

# These names are otherwise first imported in the "Weighted Ensemble" section further
# down, so without the imports here a fresh "Restart & Run All" stops with a NameError.
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

X = train_df5.drop(target_feature,axis=1)
y = train_df5[target_feature]

kf = KFold(n_splits=CFG.N_SPLITS,random_state=CFG.SEED,shuffle=True)
models = []
scores = []

# One CatBoost model per fold; each is checkpointed to disk and scored on its validation part.
for k,(train_idx, valid_idx) in enumerate(tqdm(kf.split(X,y),total=CFG.N_SPLITS),start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    train_dataset = Pool(X_train,y_train,cat_features=cat_features)
    valid_dataset = Pool(X_valid,y_valid,cat_features=cat_features)

    model = CatBoostRegressor(
        loss_function='MAE',
        random_state=CFG.SEED,
        iterations=CFG.EPOCHS,
        learning_rate=CFG.LR,
        allow_writing_files=False,
    )
    model.fit(
        train_dataset,
        eval_set=valid_dataset,
        metric_period=int(CFG.EPOCHS/5),
        early_stopping_rounds=CFG.ES,
    )
    model.save_model(f'./model_checkpoints/kfold_model_{k}.cbm')

    y_pred = model.predict(valid_dataset).flatten()
    y_true = y_valid.values
    score = mean_absolute_error(y_true=y_true,y_pred=y_pred)

    print('K-Fold {}, MAE: {:.4f}'.format(k,score))

    models.append(model)
    scores.append(score)

In [None]:
import pickle
# Persist the fitted fold models and their validation scores next to the checkpoints.
with open('./model_checkpoints/kfold_models.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/kfold_scores.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Weight every fold model by the inverse of its validation MAE, normalised to sum to 1.
weights = 1/np.array(scores)
weights /= sum(weights)

In [None]:
# train
# Predict the full training set with every fold model and blend with the fold weights.
X = train_df5.drop(target_feature,axis=1)
y = train_df5[target_feature]
dataset = Pool(X,y,cat_features=cat_features)

prediction = np.zeros(len(X))
for w,m in zip(weights,models):
    prediction += m.predict(dataset) * w

In [None]:
mean_absolute_error(y_pred=prediction,y_true=y.values.flatten())

In [None]:
# test
# Predict the test set with every fold model and blend with the fold weights.
X_test = test_df5
dataset = Pool(X_test,cat_features=cat_features)

prediction = np.zeros(len(X_test))
for w,m in zip(weights,models):
    prediction += m.predict(dataset) * w

In [None]:
prediction

In [None]:
# Put the blended test predictions into the sample-submission template and save it.
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = prediction
# to_csv does not create missing directories, so make sure ./out exists first.
mkdir('./out')
submit.to_csv('./out/1_catboost_kfold.csv',index=False)

<br>

## Weighted Ensemble
- public score : 6.5505142995

In [34]:
import pandas as pd
import warnings

class OneHotEncoder:
    """Minimal one-hot encoder that drops the first (smallest) category of each column.

    ``fit`` records the categories per column; ``transform`` adds one 0/1 column named
    ``f'{col}_{value}'`` for every recorded category except the first and removes the
    encoded source columns. Values not seen during ``fit`` get 0 in every indicator column.
    """

    def __init__(self):
        pass

    def fit(self,data,columns):
        """Record, for each of ``columns``, every sorted unique value except the first."""
        # Remember the encoded columns themselves (de-duplicated, order kept) so that
        # transform can remove them even when a column produced no indicator at all.
        self.columns = list(dict.fromkeys(columns))
        self.transform_list = []
        for col in self.columns:
            for value in sorted(data[col].unique())[1:]:
                self.transform_list.append([col,value])

    def transform(self,data):
        """Return a copy of ``data`` with the indicator columns added and the source columns removed."""
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        new_data = data.copy()
        for col,value in self.transform_list:
            new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)
        # Drop every encoded column. Deriving this list from transform_list left
        # single-valued columns in place and crashed when transform_list was empty.
        new_data.drop(columns=self.columns,inplace=True)
        return new_data

In [35]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    """Weighted blend of a CatBoost, an XGBoost and a LightGBM regressor.

    Every regressor is fitted against the same validation set. Its blending weight is the
    inverse of its validation MAE, and the weights are normalised to sum to 1.
    CatBoost is fed the raw frame together with the categorical column names; XGBoost and
    LightGBM are fed the one-hot encoded frame.
    """

    def __init__(self):
        super().__init__()
        self._get_regressors()

    def _get_regressors(self):
        """Instantiate the three regressors from the CFG settings (stored in ``self.regressors``)."""
        max_depth = 10
        n_jobs = -1

        params_catboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'iterations' : CFG.EPOCHS,
            'loss_function': 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth': max_depth,
            'l2_leaf_reg' : 1,
        }

        params_xgboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'n_estimators' : CFG.XGB_EPOCHS,
            'objective': 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        params_lgb = {
            'objective': 'regression',
            'random_state':CFG.SEED,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'n_estimators' : CFG.EPOCHS,
            'metric': 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        self.regressors = [
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lgb),
        ]
        # Same order as self.regressors; fit/predict dispatch on these names.
        self.regressors_name = ['CatBoost','XGBoost','LightGBM']

    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        """Fit all regressors and derive their blending weights.

        Parameters
        ----------
        X, y : training features (raw, with categorical columns) and target.
        eval_set : list holding exactly one ``(X_val, y_val)`` tuple.
        oh_set : list holding exactly one ``(X_oh, X_val_oh)`` tuple, the one-hot encoded
            versions of ``X`` and ``X_val``.
        cat_features : names of the categorical columns of ``X`` (passed to CatBoost).
        verbose : when truthy, a header is printed and a progress bar is shown.

        Sets ``self.weights`` (array, one weight per regressor, summing to 1) and
        ``self.fitting_elapsed`` (fit time in seconds per regressor).

        Raises
        ------
        AssertionError if ``eval_set`` or ``oh_set`` does not have length 1.
        """
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]

        self.cat_features = cat_features

        if verbose:
            print('> (2) Fitting Model')
        self.weights = []
        self.fitting_elapsed = []
        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)
        for name,regressor in pbar:
            s = time.time()
            if verbose:
                pbar.set_description(name)
            if name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=self.cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                val_pred = regressor.predict(val_dataset)
            elif name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                val_pred = regressor.predict(X_val_oh)
            elif name=='LightGBM':
                warnings.filterwarnings("ignore", category=UserWarning)
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=-1,
                )
                val_pred = regressor.predict(X_val_oh)
            else:
                raise ValueError('Unknown Regressor: {}'.format(name))

            score = mean_absolute_error(y_pred=val_pred,y_true=y_val)
            e = time.time()

            # Lower validation MAE -> larger weight.
            self.weights.append(1/score)
            self.fitting_elapsed.append(e-s)

        # Normalise on an array: ``list /= scalar`` only works when the scores happen to be
        # NumPy scalars and raises TypeError for plain Python floats.
        weights = np.asarray(self.weights,dtype=float)
        self.weights = weights/weights.sum()

    def predict(self,X,X_oh):
        """Return the weighted sum of the three regressors' predictions as a 1-D array.

        ``X`` is the raw frame (used by CatBoost) and ``X_oh`` its one-hot encoded
        counterpart (used by XGBoost and LightGBM); both must have the same length.
        """
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"

        pred_list = []
        for name,regressor in zip(self.regressors_name,self.regressors):
            if name=='CatBoost':
                dataset = Pool(X,cat_features=self.cat_features)
            else:
                dataset = X_oh

            y_pred = regressor.predict(dataset)
            y_pred = np.array(y_pred).flatten()
            pred_list.append(y_pred)

        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += np.array(pred)*weight

        return final_pred



In [36]:
from sklearn.model_selection import KFold

In [37]:
gc.collect()

0

In [38]:
%%time
# k=5 : about 12 minutes per iteration

X = train_df5.drop(target_feature,axis=1)
y = train_df5[target_feature]

# One-hot encode the categorical columns once on the full training frame; the folds
# below slice both the raw and the encoded frame with the same row indices.
ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

kf = KFold(n_splits=CFG.N_SPLITS,random_state=CFG.SEED,shuffle=True)
models = []
scores = []

# One ensemble per fold, scored by MAE on that fold's validation rows.
k=0
for tr_idx, val_idx in tqdm(kf.split(X,y),total=CFG.N_SPLITS):
    k+=1
    X_train, y_train = X.iloc[tr_idx] , y.iloc[tr_idx]
    X_val  , y_val   = X.iloc[val_idx], y.iloc[val_idx]
    X_train_oh = X_oh.iloc[tr_idx]
    X_val_oh   = X_oh.iloc[val_idx]

    ensemble_model = WeightedEnsembleRegressor()
    ensemble_model.fit(
        X_train,y_train,
        eval_set=[(X_val,y_val)],
        oh_set=[(X_train_oh,X_val_oh)],
        cat_features=cat_features,
        verbose=1,
    )
    
    y_pred = ensemble_model.predict(X_val,X_val_oh).flatten()
    y_true = y_val.values
    score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
    print('K-Fold {}, MAE: {:.4f}'.format(k,score))
    
    models.append(ensemble_model)
    scores.append(score)

  0%|          | 0/5 [00:00<?, ?it/s]

> (2) Fitting Model



  0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:   0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:  33%|███▎      | 1/3 [06:15<12:30, 375.11s/it][A
XGBoost:  33%|███▎      | 1/3 [06:15<12:30, 375.11s/it] [A
XGBoost:  67%|██████▋   | 2/3 [10:44<05:13, 313.13s/it][A
LightGBM:  67%|██████▋   | 2/3 [10:44<05:13, 313.13s/it][A
LightGBM: 100%|██████████| 3/3 [11:54<00:00, 238.30s/it][A
 20%|██        | 1/5 [12:00<48:03, 720.93s/it]

K-Fold 1, MAE: 5.9480
> (2) Fitting Model



  0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:   0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:  33%|███▎      | 1/3 [06:18<12:36, 378.10s/it][A
XGBoost:  33%|███▎      | 1/3 [06:18<12:36, 378.10s/it] [A
XGBoost:  67%|██████▋   | 2/3 [10:46<05:13, 313.75s/it][A
LightGBM:  67%|██████▋   | 2/3 [10:46<05:13, 313.75s/it][A
LightGBM: 100%|██████████| 3/3 [12:12<00:00, 244.09s/it][A
 40%|████      | 2/5 [24:16<36:28, 729.57s/it]

K-Fold 2, MAE: 5.9895
> (2) Fitting Model



  0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:   0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:  33%|███▎      | 1/3 [06:20<12:41, 380.91s/it][A
XGBoost:  33%|███▎      | 1/3 [06:20<12:41, 380.91s/it] [A
XGBoost:  67%|██████▋   | 2/3 [09:52<04:41, 281.28s/it][A
LightGBM:  67%|██████▋   | 2/3 [09:52<04:41, 281.28s/it][A
LightGBM: 100%|██████████| 3/3 [10:46<00:00, 215.38s/it][A
 60%|██████    | 3/5 [35:05<23:05, 692.95s/it]

K-Fold 3, MAE: 5.8836
> (2) Fitting Model



  0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:   0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:  33%|███▎      | 1/3 [05:58<11:57, 358.73s/it][A
XGBoost:  33%|███▎      | 1/3 [05:58<11:57, 358.73s/it] [A
XGBoost:  67%|██████▋   | 2/3 [09:30<04:32, 272.45s/it][A
LightGBM:  67%|██████▋   | 2/3 [09:30<04:32, 272.45s/it][A
LightGBM: 100%|██████████| 3/3 [10:31<00:00, 210.46s/it][A
 80%|████████  | 4/5 [45:40<11:09, 669.89s/it]

K-Fold 4, MAE: 5.9810
> (2) Fitting Model



  0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:   0%|          | 0/3 [00:00<?, ?it/s][A
CatBoost:  33%|███▎      | 1/3 [06:01<12:02, 361.31s/it][A
XGBoost:  33%|███▎      | 1/3 [06:01<12:02, 361.31s/it] [A
XGBoost:  67%|██████▋   | 2/3 [10:01<04:50, 290.26s/it][A
LightGBM:  67%|██████▋   | 2/3 [10:01<04:50, 290.26s/it][A
LightGBM: 100%|██████████| 3/3 [11:06<00:00, 222.09s/it][A
100%|██████████| 5/5 [56:49<00:00, 682.00s/it]

K-Fold 5, MAE: 6.0434
CPU times: user 4h 23min 16s, sys: 26min 24s, total: 4h 49min 40s
Wall time: 57min 3s





In [39]:
# Weight every fold ensemble by the inverse of its validation MAE, normalised to sum to 1.
weights = 1/np.array(scores)
weights /= sum(weights)

In [40]:
# train
# Predict the full training set with every fold ensemble and blend with the fold weights.
X = train_df5.drop(target_feature,axis=1)
y = train_df5[target_feature]

# The encoder is re-fitted on the same frame and columns as in the K-fold cell.
ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

prediction = np.zeros(len(X))
for w,m in zip(weights,models):
    prediction += m.predict(X,X_oh) * w

In [41]:
mean_absolute_error(y_pred=prediction,y_true=y.values.flatten())

5.128814618764771

In [42]:
# test
# Encode the test set with the encoder fitted on the training frame, then blend the
# fold ensembles' predictions with the fold weights.
X_test = test_df5
X_test_oh = ohe.transform(X_test)

prediction = np.zeros(len(X_test))
for w,m in zip(weights,models):
    prediction += m.predict(X_test,X_test_oh) * w

In [43]:
prediction

array([82.39274694, 26.53542805, 82.84549379, ..., 95.80957018,
       49.68591462, 43.09463356])

In [44]:
# Put the blended test predictions into the sample-submission template and save it.
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = prediction
# to_csv does not create missing directories, so make sure ./out exists first.
mkdir('./out')
submit.to_csv('./out/2_ensemble_kfold.csv',index=False)

<br>

## 참조 pycaret

In [None]:
# from pycaret import regression

In [None]:
# %%time
# regression.setup(data=train_df5,target='가격',remove_outliers=True,verbose=True)
# best = regression.compare_models(n_select=5,fold=5)