# Library Setting

In [63]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [64]:
class CFG:
    SEED = 0
    
    SUBSET_DEPTH = 1
    INTERACTION = True
    FS_ALPHA = 0.01
    
    N_SPLITS = 5
    TARGET_TRANSFORMATION = True
    
    LR = 0.003
    EPOCHS = 30000
    ES = 300
    XGB_LR = 0.01     # default=0.3
    XGB_EPOCHS = 1000 # default=100
    XGB_ES = 100
    XTRATREES_EPOCHS = 100 #default=100

<br></br>

# Data

## Data Load

In [65]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [66]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [67]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Target Transformation

In [68]:
if CFG.TARGET_TRANSFORMATION:
    train_df['가격'] = np.log(train_df['가격'])

<br>

## Resetting Columns Type

In [69]:
class TypeResetting:
    def __init__(self):
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']
        self.seg_features = []
        
    def add_categorical_features(self,cat_features):
        self.cat_features += cat_features
        
    def delete_categorical_features(self,cat_features):
        self.cat_features = [col for col in self.cat_features if col not in cat_features]
        
    def add_segment_features(self,segment_features):
        self.seg_features = ['segment']
        self.cat_features = [col for col in self.cat_features if col not in segment_features]
        
    def fit(self,data):
        if (len(self.seg_features)>0) & ('segment' not in data.columns):
            raise ValueError("segment column name must be 'segment'")
        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']
        self.num_features   = [col for col in data.columns
                               if col not in self.target_feature+self.unuse_features+self.dummy_features+self.cat_features+self.seg_features]
        
    def transform(self,data):
        d = data.copy()
        for col in self.dummy_features:
            if d[col].dtypes!=int:
                d[col] = d[col].astype(int)
        for col in self.cat_features:
            if d[col].dtypes!=object:
                d[col] = d[col].astype(str)
        for col in self.num_features:
            if d[col].dtypes!=float:
                d[col] = d[col].astype(float)
        for col in self.seg_features:
            if d[col].dtypes!=object:
                d[col] = d[col].astype(str)
        for col in self.unuse_features:
            if col in d.columns:
                d.drop(col,axis=1,inplace=True)
        return d
    
    def fit_transform(self,data):
        self.fit(data)
        return self.transform(data)
    
    def get_feature_type(self):
        globals()['target_feature'] = self.target_feature
        globals()['unuse_features'] = self.unuse_features
        globals()['dummy_features'] = self.dummy_features
        globals()['cat_features']   = self.cat_features
        globals()['num_features']   = self.num_features

In [70]:
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [71]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    not_test_only_features = []
    for iter,col in enumerate(cat_features):
        print('[{}/{}] {}'.format(iter+1,len(cat_features),col))
        
        only_train = list(set(train[col].unique())-set(test[col].unique()))
        only_test  = list(set(test[col].unique())-set(train[col].unique()))
        print(' - Only Train:',len(only_train))
        print(' - Only Test :',len(only_test))
        if len(only_test)>0:
            print('******Warning******')
        else:
            not_test_only_features.append(col)
        print('')
    return not_test_only_features

In [72]:
# 브랜드, 차량모델명, 판매구역, 모델출시년도
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [73]:
not_test_only_features

['모델출시년도', '브랜드', '차량모델명', '판매구역']

<br></br>

# New Features

In [74]:
# pd.Series([str(round(int(year)/100,1)) for year in train_df6['생산년도']]).value_counts()

In [75]:
train_df2.head()

Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054


In [76]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    return list(chain(*map(lambda x: combinations(ss, x), range(0, len(ss)+1))))

class FeatureEngineering:
    def __init__(self):
        pass
    
    def _get_quantile(self,x,col):
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]

        agg_df = pd.DataFrame(index=[0])
        for q in [0,25,50,75,100]:
            agg_df[f'{col}_Q{q}'] = np.quantile(x,q/100)

        return agg_df
    
    def _derived_features(self,data):
        d = data.copy()

        # (1) 모델출시년도에 생산된 차량인지
        d['출시년도생산여부'] = np.where(d['생산년도'].astype(int)==d['모델출시년도'].astype(int),1,0)

        # (2) 모델출시 이후에 몇년 지나서 생산됬는지
        d['출시이후생산년수'] = d['생산년도'].astype(int)-d['모델출시년도'].astype(int)

        # (3) 출시 이전에 생산되었는지
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) 브랜드의 국적 (구글링)
        d['브랜드국적'] = ['체코' if brand=='skoda' else
                        '일본' if brand in ['toyota','nissan','mazda','honda','mitsubishi'] else
                        '독일' if brand in ['mercedes-benz','audi','volkswagen','bmw','opel'] else
                        '이탈리아' if brand=='fiat' else
                        '프랑스' if brand in ['renault','citroen','peugeot'] else
                        '미국' if brand=='ford' else
                        '한국' if brand in ['kia','hyundai'] else
                        '스페인' if brand=='seat' else
                        '스웨덴' if brand=='volvo' else
                        np.nan for brand in d['브랜드']]

        # (5) 브랜드 국적의 대륙명
        d['브랜드대륙명'] = ['유럽' if country in ['체코','독일','이탈리아','프랑스','스페인','스웨덴'] else
                          '아시아' if country in ['일본','한국'] else
                          '아메리카' if country in ['미국'] else
                          np.nan for country in d['브랜드국적']]
        
        # (6) 판매도시,판매구역 동일여부
        d['판매도시구역동일여부'] = np.where(d['판매도시']==d['판매구역'],1,0)
        
        return d
    
    def fit(self,data,cat_features,subset_depth=1):
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'
        
        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명','판매도시구역동일여부']
        
        # (6) 카테고리 변수에 따른 가격의 Quantile값
        all_subset_list = all_subsets(cat_features)
        all_subset_list = [subset for subset in all_subset_list if (len(subset)<=subset_depth) & (len(subset)>=1)]
        
        self.agg_dict = {}
        for subset in tqdm(all_subset_list,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset = list(subset)
            subset_name = '_'.join(subset)
            agg_fn = data.groupby(subset)['가격'].apply(lambda x: self._get_quantile(x,subset_name)).reset_index()
            drop_cols = [col for col in agg_fn if col.find('level_')>=0]
            agg_fn.drop(columns=drop_cols,inplace=True)
            self.agg_dict[subset_name] = agg_fn
            
    def transform(self,data):
        data = self._derived_features(data)
        for key,agg_fn in self.agg_dict.items():
            data = pd.merge(data,agg_fn,how='left',on=key.split('_'))
        return data
    
    def fit_transform(self,data,cat_features,subset_depth=1):
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [97]:
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=1): 100%|██████████| 4/4 [00:00<00:00, 13.31it/s]


In [98]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명', '판매도시구역동일여부']

In [99]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [100]:
print(train_df3.shape)
train_df3.head()

(57920, 40)


Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,판매도시구역동일여부,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0,0.157004,4.019486,4.355041,4.64314,5.049856,0.727549,3.393501,4.057853,4.509072,5.049856,0.792993,2.933059,3.712352,4.065687,4.830312,0.262364,3.206803,3.77391,4.355041,5.049856
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,0,1.358409,3.089678,3.348851,3.785779,5.049856,0.157004,3.353407,4.000034,4.435212,5.049856,2.687167,3.594569,4.112512,4.387075,4.761062,1.095273,3.113071,3.520461,4.066802,5.049856
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,0,0.955511,2.453588,2.826722,3.353407,5.049022,1.095273,3.152736,3.785779,4.354655,5.049856,2.250239,3.147165,3.440418,3.627069,4.866534,0.482426,3.089678,3.707577,4.296605,5.049856
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,0,0.955511,2.341806,2.738903,3.095125,4.886356,1.050822,3.558201,3.923359,4.405499,5.049022,2.164472,4.368303,4.575844,4.761062,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,0,1.697449,3.24921,3.626206,3.948741,5.049856,1.111858,2.865054,3.529985,4.174387,4.969049,2.134166,2.894253,3.065725,3.21072,3.660223,0.732368,3.201526,3.755837,4.305416,5.049856


<br></br>

# EDA

In [101]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [102]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    return np.abs(np.corrcoef(x,y))[0,1]

class InteractionTerm:
    def __init__(self):
        pass
    
    def fit(self,data,num_features,corr_cutoff=0.7):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        
        d = data.copy()
        self.interaction_list = []
        for i in range(len(num_features)):
            for j in range(len(num_features)):
                if i>j:
                    col_i = num_features[i]
                    col_j = num_features[j]
                    
                    # 상관계수가 cutoff보다 큰 경우에는 interaction을 생성하지 않음
                    if (get_abs_corr(d[col_i]*d[col_j],d[col_i])>=corr_cutoff) | (get_abs_corr(d[col_i]*d[col_j],d[col_j])>=corr_cutoff):
                        pass
                    else:
                        self.interaction_list.append(f'{col_i}*{col_j}')
    
    def transform(self,data):
        d = data.copy()
        for interaction in self.interaction_list:
            col_i,col_j = interaction.split('*')
            d[interaction] = d[col_i]*d[col_j]
        return d
    
    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [103]:
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.9,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

In [104]:
print('VIF: {:.3f}'.format(1/(1-0.9**2)))

VIF: 5.263


In [105]:
train_df3.shape, train_df4.shape

((57920, 40), (57920, 78))

<br></br>

# Feature Selection

In [106]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [107]:
def log_offset(x):
    if min(x)>0:
        offset = 0
    elif min(x)==0:
        offset = 1e-3
    else:
        offset = min(x)+1e-3
        print('minimum = {:.3f}'.format(min(x)))
    return np.log(x+offset)

<br></br>

## Categorical Features

In [108]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [109]:
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) ANOVA를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in tqdm(check_cat_features):
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 10/10 [00:03<00:00,  2.58it/s]


In [110]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    d = train_df4[[col,'target']].rename(columns={col:'feature'})
    d['feature'] = log_offset(d['feature'])
    
    model = ols(f'feature ~ C(target)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list2.append([col,pvalue])
    
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [111]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [112]:
import scipy

In [113]:
# (1) corr test를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in num_features:
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df.round(4).head()

In [114]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],log_offset(train_df5[col]))
    pvalue_list2.append([col,pvalue])
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df2.round(4).head()

0it [00:00, ?it/s]


In [115]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df6 = train_df5.copy()
train_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df6[col] = log_offset(train_df6[col])
    
test_df6 = test_df5.copy()
test_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df6[col] = log_offset(test_df6[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br></br>

# Make Segment

In [116]:
def make_segment(data,segment: list):
    d = data.copy()
    d['segment'] = d[segment].apply(lambda x: '___'.join(x),axis=1)
    d.drop(columns=segment,inplace=True)
    return d

In [117]:
segment = ['브랜드']
train_df7 = make_segment(train_df6,segment)
test_df7  = make_segment(test_df6 ,segment)

In [118]:
test_only = list(set(test_df7.segment.unique())-set(train_df7.segment.unique()))
assert len(test_only)==0, \
    "Segment exists only in the test set ({})".format(len(test_only))

In [119]:
train_only = list(set(train_df7['segment'].unique())-set(test_df7['segment'].unique()))

n_asis = len(train_df7)
n_tobe = len(train_df7[~train_df7.segment.isin(train_only)])
train_df7 = train_df7[~train_df7.segment.isin(train_only)]
print('> Train에만 존재하는 Segment 제거')
print(' - 데이터수 : {:,} -> {:,}'.format(n_asis,n_tobe))
print(' - 세그먼트수 : {:,}'.format(train_df7['segment'].nunique()))

> Train에만 존재하는 Segment 제거
 - 데이터수 : 57,920 -> 57,920
 - 세그먼트수 : 20


In [120]:
vc = train_df7['segment'].value_counts().sort_values()
display(vc.head())
print('...')
display(vc.tail())

segment
mitsubishi     556
peugeot        793
citroen       1129
fiat          1164
volvo         1352
Name: count, dtype: int64

...


segment
bmw           5262
audi          5597
volkswagen    5693
ford          5819
opel          6651
Name: count, dtype: int64

In [121]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.add_segment_features(segment)
type_resetor.fit(train_df7)
type_resetor.get_feature_type()

train_df7 = type_resetor.transform(train_df7)
test_df7  = type_resetor.transform(test_df7)

In [122]:
cat_features

['차량모델명',
 '판매도시',
 '판매구역',
 '생산년도',
 '모델출시년도',
 '출시년도생산여부',
 '출시이후생산년수',
 '출시이전생산여부',
 '브랜드국적',
 '브랜드대륙명',
 '판매도시구역동일여부']

In [123]:
print(train_df7.shape)
train_df7.head()

(57920, 78)


Unnamed: 0,생산년도,모델출시년도,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,판매도시구역동일여부,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,배기량*주행거리,모델출시년도_Q0*주행거리,모델출시년도_Q0*배기량,모델출시년도_Q25*배기량,모델출시년도_Q50*배기량,모델출시년도_Q75*배기량,브랜드_Q0*주행거리,브랜드_Q0*배기량,브랜드_Q0*모델출시년도_Q0,브랜드_Q75*모델출시년도_Q100,차량모델명_Q0*주행거리,차량모델명_Q0*배기량,차량모델명_Q0*모델출시년도_Q0,차량모델명_Q0*브랜드_Q0,차량모델명_Q25*모델출시년도_Q25,차량모델명_Q25*모델출시년도_Q50,차량모델명_Q25*모델출시년도_Q75,차량모델명_Q50*모델출시년도_Q50,차량모델명_Q50*모델출시년도_Q75,차량모델명_Q75*모델출시년도_Q75,차량모델명_Q75*브랜드_Q25,차량모델명_Q100*브랜드_Q25,차량모델명_Q100*브랜드_Q50,차량모델명_Q100*브랜드_Q75,판매구역_Q0*주행거리,판매구역_Q0*배기량,판매구역_Q0*모델출시년도_Q0,판매구역_Q0*모델출시년도_Q25,판매구역_Q0*브랜드_Q0,판매구역_Q0*차량모델명_Q0,판매구역_Q25*모델출시년도_Q100,판매구역_Q25*브랜드_Q50,판매구역_Q25*브랜드_Q75,판매구역_Q25*차량모델명_Q100,판매구역_Q50*모델출시년도_Q100,판매구역_Q50*브랜드_Q50,판매구역_Q50*브랜드_Q75,판매구역_Q75*모델출시년도_Q100,segment
0,2018,2014,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0,0.157004,4.019486,4.355041,4.64314,5.049856,0.727549,3.393501,4.057853,4.509072,5.049856,0.792993,2.933059,3.712352,4.065687,4.830312,0.262364,3.206803,3.77391,4.355041,5.049856,85145769.0,13381.586515,156.846745,4015.466312,4350.685729,4638.496977,62009.695347,726.821059,0.114228,22.770164,67587.545091,792.199523,0.124503,0.576941,11.789388,12.77359,13.618602,16.167443,17.23697,18.877555,13.796914,16.391669,19.600697,21.780223,22361.568625,262.1019,0.041192,1.054569,0.190883,0.208053,16.193895,13.012738,14.459707,15.489859,19.057701,15.313973,17.016831,21.992329,skoda
1,2010,2006,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,0,1.358409,3.089678,3.348851,3.785779,5.049856,0.157004,3.353407,4.000034,4.435212,5.049856,2.687167,3.594569,4.112512,4.387075,4.761062,1.095273,3.113071,3.520461,4.066802,5.049856,215730000.0,183385.23628,2170.737834,4937.305262,5351.46374,6049.675288,21195.506089,250.891991,0.213275,22.397181,362767.543675,4294.09285,3.650272,0.421895,11.10606,12.037675,13.608244,13.772189,15.569062,16.608499,14.711648,15.965778,19.04441,21.11632,147861.907299,1750.246873,1.487829,3.384042,0.171962,2.943182,15.720559,12.452389,13.807128,14.821524,17.77782,14.081962,15.61399,20.536765,toyota
2,2002,2002,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,0,0.955511,2.453588,2.826722,3.353407,5.049022,1.095273,3.152736,3.785779,4.354655,5.049856,2.250239,3.147165,3.440418,3.627069,4.866534,0.482426,3.089678,3.707577,4.296605,5.049856,458380508.0,243868.497534,1716.098555,4406.643988,5076.792229,6022.718465,279538.959753,1967.111004,1.046546,21.986752,574312.649429,4041.428548,2.150129,2.464626,7.721846,8.89616,10.553724,9.725105,11.537121,12.163039,11.435192,15.342897,18.423623,21.192078,123126.249089,866.437364,0.460964,1.183675,0.528389,1.085574,15.599853,11.696839,13.454482,15.036022,18.71964,14.036069,16.145221,21.693654,mercedes-benz
3,2006,2001,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,0,0.955511,2.341806,2.738903,3.095125,4.886356,1.050822,3.558201,3.923359,4.405499,5.049022,2.164472,4.368303,4.575844,4.761062,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856,519792000.0,227411.723917,2086.836996,5114.503881,5981.763419,6759.753038,250095.54671,2294.994429,1.004072,21.526837,515144.286226,4727.206391,2.068178,2.274474,10.229716,11.964355,13.520442,12.532792,14.16281,14.736083,16.940817,17.965437,19.809125,22.243463,114817.42352,1053.61871,0.460964,1.129748,0.506944,1.044198,15.097267,12.121914,13.611573,15.599853,18.116543,14.546155,16.333728,20.994742,nissan
4,2007,2007,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,0,1.697449,3.24921,3.626206,3.948741,5.049856,1.111858,2.865054,3.529985,4.174387,4.969049,2.134166,2.894253,3.065725,3.21072,3.660223,0.732368,3.201526,3.755837,4.305416,5.049856,479410000.0,426059.646229,3242.127188,6205.991763,6926.053192,7542.094524,279076.23637,2123.647854,1.887321,21.080055,535675.776784,4076.257903,3.622638,2.372889,9.404037,10.495158,11.428655,11.116949,12.105751,12.678299,9.198885,10.486736,12.920531,15.279187,183824.341322,1398.822677,1.243157,2.379617,0.814289,1.562995,16.167246,11.301339,13.36441,11.718299,18.966436,13.258048,15.678318,21.741728,fiat


<br></br>

# Modeling

In [124]:
import os
def mkdir(paths):
    if type(paths)==str:
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            os.mkdir(path)

In [125]:
## dummy_features는 한가지만 속함
# X[dummy_features].apply(lambda x: np.sum(x),axis=1).value_counts()

def add_fuel_type(data,dummy_features):
    d = data.copy()
    d['fuel_type'] = d[dummy_features].apply(
        lambda x: dummy_features[np.where(x==1)[0][0]],axis=1)
    d.drop(columns=dummy_features,inplace=True)
    return d

In [126]:
mkdir('./model_checkpoints')
mkdir('./model_checkpoints/segment_catboost')
mkdir('./model_checkpoints/segment_weightedensemble')

In [127]:
def check_null_cnt(data):
    null_cnt = data.isnull().sum()
    null_cnt = len(null_cnt[null_cnt!=0])
    return null_cnt

check_null_cnt(train_df7),check_null_cnt(test_df7)

(0, 0)

<br>

## CatBoost
- public score : 5.7393705826

In [53]:
gc.collect()

0

In [54]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool



In [55]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [56]:
%%time
# 1시간

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

X = add_fuel_type(X,dummy_features)
new_cat_features = cat_features + ['fuel_type']

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

s_i = 0
for segment in pbar:
    s_i+=1
    
    # segment에 해당하는 데이터추출
    _X = X[X.segment==segment].drop('segment',axis=1)
    _y = y[X.segment==segment]
    
    # kfold
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # unique인 컬럼 제외
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
        
    # categorical feature에서 unique인 컬럼을 제외
    fixed_cat_features = [col for col in new_cat_features if col in _X.columns]
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        
        # kfold dataset
        X_tr, X_va = _X.iloc[tr_idx], _X.iloc[val_idx]
        y_tr, y_va = _y.iloc[tr_idx], _y.iloc[val_idx]

        # progress
        progress = 'Segment: [{}], Size: [{:,}], KFold: [{}/{}]'\
            .format(segment,len(_X),k,CFG.N_SPLITS)
        pbar.set_description(progress)

        # dataset
        train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
        valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)

        # define the model
        model = CatBoostRegressor(
            loss_function='MAE',
            random_state=CFG.SEED,
            iterations=CFG.EPOCHS,
            learning_rate=CFG.LR,
            allow_writing_files=False,
        )

        # fit the model
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=CFG.ES,
            verbose=0,
            #metric_period=CFG.EPOCHS//5,
        )

        # save the model
        model.save_model(f'./model_checkpoints/segment_catboost/{segment}_k{k}.cbm')

        # prediction
        y_pred = model.predict(valid_dataset).flatten()
        y_true = y_va.values
        
        # inverse transform
        if CFG.TARGET_TRANSFORMATION:
            y_pred = np.exp(y_pred)
            y_true = np.exp(y_true)
            
        # calculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {'cat_features':fixed_cat_features,'features':_X.columns.tolist()}
    
    # score report
    mean_score_report = pd.Series(np.array(_scores)[:,-1]).astype(float).mean()
    print('Segment: {}'.format(segment))
    print("MAE's for {}-Fold: [{}]".format(CFG.N_SPLITS,np.array(pd.Series(np.array(_scores)[:,-1]).astype(float).values)))
    print("Mean of MAE's for {}-Fold: [{:.4f}]".format(CFG.N_SPLITS,mean_score_report))

Segment: [toyota], Size: [3,259], KFold: [1/5]:   5%|▌         | 1/20 [04:03<1:17:05, 243.45s/it]

Segment: skoda
MAE's for 5-Fold: [[6.5506249  6.30894093 6.08481638 6.44539008 6.50300828]]
Mean of MAE's for 5-Fold: [6.3786]


Segment: [mercedes-benz], Size: [2,899], KFold: [1/5]:  10%|█         | 2/20 [07:28<1:06:12, 220.72s/it]

Segment: toyota
MAE's for 5-Fold: [[5.57086587 5.49931207 5.10804655 5.43505902 4.75775299]]
Mean of MAE's for 5-Fold: [5.2742]


Segment: [nissan], Size: [2,129], KFold: [1/5]:  15%|█▌        | 3/20 [10:15<55:33, 196.09s/it]         

Segment: mercedes-benz
MAE's for 5-Fold: [[7.78865002 8.40132348 8.5596944  8.51332216 8.5425204 ]]
Mean of MAE's for 5-Fold: [8.3611]


Segment: [fiat], Size: [1,164], KFold: [1/5]:  20%|██        | 4/20 [12:01<42:50, 160.68s/it]  

Segment: nissan
MAE's for 5-Fold: [[5.14328921 5.28161989 5.13300196 4.74891644 5.06874316]]
Mean of MAE's for 5-Fold: [5.0751]


Segment: [audi], Size: [5,597], KFold: [1/5]:  25%|██▌       | 5/20 [13:30<33:43, 134.89s/it]

Segment: fiat
MAE's for 5-Fold: [[5.47603115 5.27386875 5.90098473 4.83299591 4.56521702]]
Mean of MAE's for 5-Fold: [5.2098]


Segment: [renault], Size: [3,853], KFold: [1/5]:  30%|███       | 6/20 [18:03<42:25, 181.81s/it]

Segment: audi
MAE's for 5-Fold: [[6.2132715  6.31445952 6.31198552 6.34739529 6.5382491 ]]
Mean of MAE's for 5-Fold: [6.3451]


Segment: [volkswagen], Size: [5,693], KFold: [1/5]:  35%|███▌      | 7/20 [22:37<45:54, 211.90s/it]

Segment: renault
MAE's for 5-Fold: [[4.96451759 4.9274676  4.59458365 5.06903728 4.79527106]]
Mean of MAE's for 5-Fold: [4.8702]


Segment: [citroen], Size: [1,129], KFold: [1/5]:  40%|████      | 8/20 [27:46<48:34, 242.90s/it]   

Segment: volkswagen
MAE's for 5-Fold: [[6.14279316 6.27862938 5.85141312 6.34629198 6.2195092 ]]
Mean of MAE's for 5-Fold: [6.1677]


Segment: [bmw], Size: [5,262], KFold: [1/5]:  45%|████▌     | 9/20 [29:16<35:46, 195.11s/it]    

Segment: citroen
MAE's for 5-Fold: [[3.88196664 4.14054084 4.26621983 3.7978728  3.93495667]]
Mean of MAE's for 5-Fold: [4.0043]


Segment: [opel], Size: [6,651], KFold: [1/5]:  50%|█████     | 10/20 [34:48<39:33, 237.33s/it]

Segment: bmw
MAE's for 5-Fold: [[7.86819646 7.7872699  7.43200761 7.61261937 7.51386646]]
Mean of MAE's for 5-Fold: [7.6428]


Segment: [ford], Size: [5,819], KFold: [1/5]:  55%|█████▌    | 11/20 [42:33<46:03, 307.00s/it]

Segment: opel
MAE's for 5-Fold: [[3.83937551 3.87926515 4.01055329 4.13425215 4.17060895]]
Mean of MAE's for 5-Fold: [4.0068]


Segment: [mazda], Size: [1,572], KFold: [1/5]:  60%|██████    | 12/20 [48:28<42:51, 321.48s/it]

Segment: ford
MAE's for 5-Fold: [[5.3932776  4.98676905 5.40616092 4.99034371 5.04632365]]
Mean of MAE's for 5-Fold: [5.1646]


Segment: [honda], Size: [1,545], KFold: [1/5]:  65%|██████▌   | 13/20 [50:31<30:31, 261.58s/it]

Segment: mazda
MAE's for 5-Fold: [[6.27186846 5.64062307 6.19107976 5.53813085 5.28899285]]
Mean of MAE's for 5-Fold: [5.7861]


Segment: [kia], Size: [2,034], KFold: [1/5]:  70%|███████   | 14/20 [52:48<22:23, 223.85s/it]  

Segment: honda
MAE's for 5-Fold: [[5.34235947 5.073683   5.88718764 5.31366508 5.47124713]]
Mean of MAE's for 5-Fold: [5.4176]


Segment: [seat], Size: [1,628], KFold: [1/5]:  75%|███████▌  | 15/20 [55:15<16:42, 200.57s/it]

Segment: kia
MAE's for 5-Fold: [[5.39397946 5.42465459 5.70504147 6.12794158 6.17143332]]
Mean of MAE's for 5-Fold: [5.7646]


Segment: [volvo], Size: [1,352], KFold: [1/5]:  80%|████████  | 16/20 [57:51<12:28, 187.25s/it]

Segment: seat
MAE's for 5-Fold: [[5.20594439 4.87333931 5.07364986 4.22765869 5.16304323]]
Mean of MAE's for 5-Fold: [4.9087]


Segment: [peugeot], Size: [793], KFold: [1/5]:  85%|████████▌ | 17/20 [1:00:38<09:03, 181.07s/it]

Segment: volvo
MAE's for 5-Fold: [[8.46512571 8.55485225 7.52772911 8.04010273 9.05802743]]
Mean of MAE's for 5-Fold: [8.3292]


Segment: [hyundai], Size: [1,855], KFold: [1/5]:  90%|█████████ | 18/20 [1:01:51<04:57, 148.72s/it]

Segment: peugeot
MAE's for 5-Fold: [[5.43218579 5.4948816  6.70494362 5.46392018 5.46005331]]
Mean of MAE's for 5-Fold: [5.7112]


Segment: [mitsubishi], Size: [556], KFold: [1/5]:  95%|█████████▌| 19/20 [1:04:36<02:33, 153.69s/it]

Segment: hyundai
MAE's for 5-Fold: [[5.45909766 5.46215539 5.59439802 6.0343424  5.88502506]]
Mean of MAE's for 5-Fold: [5.6870]


Segment: [mitsubishi], Size: [556], KFold: [5/5]: 100%|██████████| 20/20 [1:05:49<00:00, 197.47s/it]

Segment: mitsubishi
MAE's for 5-Fold: [[6.19889841 5.86706994 6.32588564 5.05584104 7.6078453 ]]
Mean of MAE's for 5-Fold: [6.2111]
CPU times: user 3h 13min 39s, sys: 41min 37s, total: 3h 55min 17s
Wall time: 1h 5min 52s





In [57]:
import pickle
with open('./model_checkpoints/segment_cat_models_brand_kf.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_feature_info_brand_kf.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_scores_brand_kf.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [58]:
score_df = pd.DataFrame(
    np.array(scores).reshape(len(scores)*5,5),
    columns=['segment','k','n_tr','n_val','score']
)

score_df.sort_values(['segment','k']).head(10)

Unnamed: 0,segment,k,n_tr,n_val,score
25,audi,1,4477,1120,6.213271499763773
26,audi,2,4477,1120,6.314459516004917
27,audi,3,4478,1119,6.311985523019734
28,audi,4,4478,1119,6.3473952894233365
29,audi,5,4478,1119,6.538249104990782
45,bmw,1,4209,1053,7.868196463019282
46,bmw,2,4209,1053,7.787269897026184
47,bmw,3,4210,1052,7.432007608050447
48,bmw,4,4210,1052,7.6126193683086045
49,bmw,5,4210,1052,7.513866458347687


In [59]:
# inference
X = train_fn.drop(target_feature,axis=1)
X = add_fuel_type(X,dummy_features)
y = train_fn[target_feature]

X_test = add_fuel_type(test_fn,dummy_features)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data = X[X.segment==segment][feature_info[segment]['features']]
    train_dataset = Pool(train_data,cat_features=feature_info[segment]['cat_features'])
    # (2) test
    test_data = X_test[X_test.segment==segment][feature_info[segment]['features']]
    test_dataset = Pool(test_data,cat_features=feature_info[segment]['cat_features'])
    
    ## model
    kfold_models = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'segment':segment,
        'true':y[X.segment==segment].values.flatten(),
        'pred':np.mean([model.predict(train_dataset) for model in kfold_models],axis=0),
        #'pred':np.sum([weight*model.predict(train_dataset) for weight,model in zip(kfold_weights,kfold_models)],axis=0),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'segment':segment,
        'pred':np.mean([model.predict(test_dataset) for model in kfold_models],axis=0),
        #'pred':np.sum([weight*model.predict(test_dataset) for weight,model in zip(kfold_weights,kfold_models)],axis=0),
    })
    te_pred_df.index = test_data.index
    
    ## Target Transformation
    if CFG.TARGET_TRANSFORMATION:
        tr_pred_df['true'] = np.exp(tr_pred_df['true'])
        tr_pred_df['pred'] = np.exp(tr_pred_df['pred'])
        te_pred_df['pred'] = np.exp(te_pred_df['pred'])
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:05<00:00,  3.88it/s]


In [60]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

4.787905481043531

In [61]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

Unnamed: 0,segment,pred
0,mazda,85.190652
1,ford,27.228487
2,volkswagen,89.687762
3,renault,122.284885
4,volvo,50.932252


In [62]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
submit.to_csv('./out/12_catboost_segment_브랜드_kfold_logy_interaction.csv',index=False)

<br>

## Weighted Ensemble
- public score : 5.7381807069

In [128]:
import pandas as pd
import warnings

class OneHotEncoder:
    def __init__(self):
        pass
    
    def fit(self,data,columns):
        self.transform_list = []
        for col in columns:
            for i,value in enumerate(sorted(data[col].unique())):
                if i>0:
                    self.transform_list.append([col,value])
        
    def transform(self,data):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        new_data = data.copy()
        for col,value in self.transform_list:
            new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)
        drop_columns = pd.unique(np.array(self.transform_list)[:,0])
        new_data.drop(columns=drop_columns,inplace=True)
        return new_data

In [129]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time
import pickle

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,weight=['equal','balanced'],target_transformation=False):
        super().__init__()
        
        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        assert isinstance(target_transformation,bool), \
            "target_transformation must be bool type"
        
        self.weight = weight
        self.target_transformation = target_transformation
        self._get_regressors()
    
    def _get_regressors(self):
        max_depth = 10
        n_jobs = -1
        
        params_elasticnet = {
            'l1_ratio' : np.arange(0.1, 1, 0.1),
            'alphas' : [1e-5, 1e-3, 1e-1, 0.0, 1.0, 10.0, 100.0],
            'cv' : RepeatedKFold(n_splits=CFG.N_SPLITS, n_repeats=3, random_state=CFG.SEED),
            'n_jobs' : n_jobs,
            #'max_iter' : 50000,
            'tol' : 0.001,
        }
        
        params_catboost = {
            'random_state' : CFG.SEED,
            'iterations' : CFG.EPOCHS,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'loss_function' : 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth' : max_depth,
            #'l2_leaf_reg' : 1,
        }
    
        params_xgboost = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XGB_EPOCHS,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'objective' : 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
    
        params_lightgbm = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.EPOCHS,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'objective' : 'regression',
            'metric' : 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
        
        params_extratrees = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XTRATREES_EPOCHS,
            'criterion' : 'absolute_error',
            'verbose' : 0,
            'max_depth' : max_depth,
            'n_jobs' : n_jobs,
        }
        
        self.regressors = [
            ElasticNetCV(**params_elasticnet),
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lightgbm),
            ExtraTreesRegressor(**params_extratrees),
        ]
        self.regressors_name = ['ElasticNet','CatBoost','XGBoost','LightGBM','ExtraTrees']
        
    def _adjust_prediction(self,pred):
        pred = np.array(pred).flatten()
        if np.where(pred<0,1,0).sum()>0:
            pred = [x if x>0 else self.minimum_value for x in pred]
        pred = np.exp(np.array(pred).flatten())
        if np.where(pred==np.inf,1,0).sum()>0:
            pred = [x if x!=np.inf else self.maximum_value for x in pred]
        pred = np.array(pred).flatten()
        return pred
    
    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]
        
        self.cat_features = cat_features
        self.weights = []
        self.fitting_elapsed = []
        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)
            
        fit_iter = 0
        for regressor_name,regressor in pbar:
            fit_iter+=1
            s = time.time()
            
            if verbose:
                pbar.set_description(name)
                
            if regressor_name=='ElasticNet':
                warnings.filterwarnings("ignore", category=UserWarning)
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                tr_pred = regressor.predict(train_dataset)
                va_pred = regressor.predict(val_dataset)
            elif regressor_name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='LightGBM':
                warnings.filterwarnings("ignore", category=UserWarning)
                X_tmp = X.copy()
                X_val_tmp = X_val.copy()
                for col in cat_features:
                    X_tmp[col]     = X_tmp[col]    .astype('category')
                    X_val_tmp[col] = X_val_tmp[col].astype('category')
                regressor.fit(
                    X_tmp,y,
                    eval_set=[(X_val_tmp,y_val)],
                    verbose=-1,
                )
                tr_pred = regressor.predict(X_tmp)
                va_pred = regressor.predict(X_val_tmp)
            elif regressor_name=='ExtraTrees':
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))
            
            if self.target_transformation:
                tr_true = np.exp(np.array(y)    .flatten())
                va_true = np.exp(np.array(y_val).flatten())
                self.minimum_value = min(np.nanmin(tr_true),np.nanmin(va_true))
                self.maximum_value = max(np.nanmax(tr_true),np.nanmax(va_true))
                
                tr_pred = self._adjust_prediction(tr_pred)
                va_pred = self._adjust_prediction(va_pred)
            else:
                tr_true = np.array(y).flatten()
                va_true = np.array(y_val).flatten()
                tr_pred = np.array(tr_pred).flatten()
                va_pred = np.array(va_pred).flatten()
            tr_score = mean_absolute_error(y_pred=tr_pred,y_true=tr_true)
            va_score = mean_absolute_error(y_pred=va_pred,y_true=va_true)
            e = time.time()
            self.weights.append(1/va_score)
            self.fitting_elapsed.append(e-s)
            
            blank = ' '*(11-len(regressor_name))
            fit_progress = '  [{}/{}] {}{}: score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
                .format(fit_iter,len(self.regressors),regressor_name,blank,tr_score,va_score,e-s)
            print(fit_progress)
        
        if self.weight=='equal':
            self.weights = np.array([1.0 for _ in self.regressors])
        self.weights /= sum(self.weights)
        
        tr_pred = self.predict(X,X_oh)
        va_pred = self.predict(X_val,X_val_oh)
        ens_tr_score = mean_absolute_error(y_true=np.exp(np.array(y)    .flatten()),y_pred=tr_pred)
        ens_va_score = mean_absolute_error(y_true=np.exp(np.array(y_val).flatten()),y_pred=va_pred)
        
        total_fit_progress = '  Total({}): score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
            .format(self.weight,ens_tr_score,ens_va_score,sum(self.fitting_elapsed))
        print(total_fit_progress)
        
    def predict(self,X,X_oh):
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"
        
        pred_list = []
        for regressor_name,regressor in zip(self.regressors_name,self.regressors):
            if regressor_name in ['ElasticNet','XGBoost','ExtraTrees']:
                dataset = X_oh.copy()
            elif regressor_name=='CatBoost':
                dataset = Pool(X,cat_features=self.cat_features)
            elif regressor_name=='LightGBM':
                dataset = X.copy()
                for col in self.cat_features:
                    dataset[col] = dataset[col].astype('category')
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))
            
            y_pred = regressor.predict(dataset)
            y_pred = self._adjust_prediction(y_pred)
            
            pred_list.append(y_pred)
            
        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += np.array(pred)*weight
            
        return final_pred
    
    def save_model(self,path):
        save_dict = {
            'cat_features' : self.cat_features,
            'weights' : self.weights,
            'target_transformation' : self.target_transformation,
            'fitting_elapsed' : self.fitting_elapsed,
            'regressors' : self.regressors,
        }
        with open(path, 'wb') as f:
            pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
            
    def load_model(self,path):
        with open(path, 'rb') as f:
            save_dict = pickle.load(f)
            self.cat_features = save_dict['cat_features']
            self.weights = save_dict['weights']
            self.target_transformation = save_dict['target_transformation'],
            self.fitting_elapsed = save_dict['fitting_elapsed']
            self.regressors = save_dict['regressors']

In [130]:
from sklearn.model_selection import KFold

In [131]:
gc.collect()

0

In [132]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [133]:
%%time
# 6시간

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]
y = pd.Series(y.values.flatten())

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

s_i = 0
for segment in pbar:
    s_i+=1
    
    # segment에 해당하는 데이터추출
    _X    = X   [X   .segment==segment].drop('segment',axis=1)
    _X_oh = X_oh[X_oh.segment==segment].drop('segment',axis=1)
    _y    = y   [X   .segment==segment]
    
    # kfold
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # unique인 컬럼 제외
    # (1) X
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
    # (2) X_oh
    unique_info = _X_oh.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X_oh = _X_oh.drop(unique_cols,axis=1)
        
    # categorical feature에서 unique인 컬럼을 제외
    fixed_cat_features = [col for col in cat_features if col in _X.columns]
    
    # progress
    progress = 'Segment: {}, Length: {}'\
        .format(segment,len(_X))
    pbar.set_description(progress)
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        print('> [K-Fold] {}/{}'.format(k,CFG.N_SPLITS))
        
        # kfold dataset
        X_tr   , X_va    = _X   .iloc[tr_idx], _X   .iloc[val_idx]
        X_tr_oh, X_va_oh = _X_oh.iloc[tr_idx], _X_oh.iloc[val_idx]
        y_tr   , y_va    = _y   .iloc[tr_idx], _y   .iloc[val_idx]

        # define the model
        ensemble_model = WeightedEnsembleRegressor(
            weight='balanced',
            target_transformation=CFG.TARGET_TRANSFORMATION,
        )

        # fit the model
        ensemble_model.fit(
            X_tr,y_tr,
            eval_set=[(X_va,y_va)],
            oh_set=[(X_tr_oh,X_va_oh)],
            cat_features=fixed_cat_features,
            verbose=0,
        )

        # save the model
        ensemble_model.save_model(f'./model_checkpoints/segment_weightedensemble/{segment}_k{k}.pickle')

        # prediction
        y_pred = ensemble_model.predict(X_va,X_va_oh).flatten()
        y_true = y_va.values
        
        # caculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(ensemble_model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {
        'cat_features':fixed_cat_features,
        'features':_X.columns.tolist(),
        'oh_features':_X_oh.columns.tolist(),
    }

Segment: skoda, Length: 3130:   0%|          | 0/20 [00:01<?, ?it/s]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.344, val_score=6.908, elasped=17.9s
  [2/5] CatBoost   : score=4.598, val_score=6.621, elasped=75.4s
  [3/5] XGBoost    : score=1.437, val_score=7.591, elasped=15.9s
  [4/5] LightGBM   : score=5.183, val_score=6.941, elasped=38.0s
  [5/5] ExtraTrees : score=4.879, val_score=7.409, elasped=47.1s
  Total(balanced): score=4.295, val_score=6.657, elasped=194.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.341, val_score=6.787, elasped=18.8s
  [2/5] CatBoost   : score=4.724, val_score=6.350, elasped=69.6s
  [3/5] XGBoost    : score=1.625, val_score=7.303, elasped=14.7s
  [4/5] LightGBM   : score=4.792, val_score=6.528, elasped=48.0s
  [5/5] ExtraTrees : score=4.988, val_score=6.989, elasped=48.2s
  Total(balanced): score=4.289, val_score=6.403, elasped=199.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.430, val_score=6.395, elasped=17.7s
  [2/5] CatBoost   : score=4.378, val_score=6.160, elasped=99.2s
  [3/5] XGBoost    : score=0.953, val_score=7.4

Segment: toyota, Length: 3259:   5%|▌         | 1/20 [16:38<5:15:48, 997.26s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.569, val_score=6.276, elasped=22.3s
  [2/5] CatBoost   : score=3.118, val_score=5.681, elasped=135.8s
  [3/5] XGBoost    : score=0.822, val_score=6.549, elasped=14.0s
  [4/5] LightGBM   : score=3.561, val_score=6.139, elasped=55.7s
  [5/5] ExtraTrees : score=3.953, val_score=6.299, elasped=56.8s
  Total(balanced): score=3.085, val_score=5.744, elasped=284.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.375, val_score=6.181, elasped=21.8s
  [2/5] CatBoost   : score=3.341, val_score=5.365, elasped=97.5s
  [3/5] XGBoost    : score=0.841, val_score=6.000, elasped=14.2s
  [4/5] LightGBM   : score=3.840, val_score=5.661, elasped=46.4s
  [5/5] ExtraTrees : score=3.891, val_score=6.113, elasped=58.3s
  Total(balanced): score=3.129, val_score=5.439, elasped=238.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.557, val_score=5.729, elasped=22.3s
  [2/5] CatBoost   : score=3.896, val_score=5.154, elasped=57.4s
  [3/5] XGBoost    : score=0.747, val_score=5.

Segment: mercedes-benz, Length: 2899:  10%|█         | 2/20 [36:40<5:35:20, 1117.78s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=8.563, val_score=8.845, elasped=29.9s
  [2/5] CatBoost   : score=5.360, val_score=7.898, elasped=72.6s
  [3/5] XGBoost    : score=1.113, val_score=8.770, elasped=19.9s
  [4/5] LightGBM   : score=6.378, val_score=8.226, elasped=32.9s
  [5/5] ExtraTrees : score=5.487, val_score=8.802, elasped=52.7s
  Total(balanced): score=5.014, val_score=7.852, elasped=207.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.620, val_score=9.361, elasped=29.7s
  [2/5] CatBoost   : score=5.152, val_score=8.366, elasped=79.7s
  [3/5] XGBoost    : score=0.807, val_score=9.728, elasped=16.7s
  [4/5] LightGBM   : score=5.764, val_score=8.477, elasped=39.5s
  [5/5] ExtraTrees : score=5.293, val_score=9.452, elasped=54.2s
  Total(balanced): score=4.802, val_score=8.448, elasped=219.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.533, val_score=9.941, elasped=29.6s
  [2/5] CatBoost   : score=4.647, val_score=8.513, elasped=112.1s
  [3/5] XGBoost    : score=1.186, val_score=9.

Segment: nissan, Length: 2129:  15%|█▌        | 3/20 [54:29<5:10:21, 1095.39s/it]       

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.749, val_score=5.471, elasped=15.8s
  [2/5] CatBoost   : score=3.444, val_score=5.237, elasped=62.6s
  [3/5] XGBoost    : score=0.403, val_score=6.278, elasped=11.5s
  [4/5] LightGBM   : score=4.032, val_score=5.827, elasped=29.4s
  [5/5] ExtraTrees : score=3.280, val_score=5.871, elasped=25.4s
  Total(balanced): score=2.986, val_score=5.345, elasped=144.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.159, val_score=5.647, elasped=14.8s
  [2/5] CatBoost   : score=4.215, val_score=5.273, elasped=29.2s
  [3/5] XGBoost    : score=0.546, val_score=6.115, elasped=10.1s
  [4/5] LightGBM   : score=3.886, val_score=5.350, elasped=32.5s
  [5/5] ExtraTrees : score=3.217, val_score=5.685, elasped=26.3s
  Total(balanced): score=3.219, val_score=5.171, elasped=113.0s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.917, val_score=5.378, elasped=16.1s
  [2/5] CatBoost   : score=3.532, val_score=5.118, elasped=61.2s
  [3/5] XGBoost    : score=0.418, val_score=6.1

Segment: fiat, Length: 1164:  20%|██        | 4/20 [1:06:14<4:11:00, 941.26s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.092, val_score=6.061, elasped=7.0s
  [2/5] CatBoost   : score=3.199, val_score=5.460, elasped=62.1s
  [3/5] XGBoost    : score=1.311, val_score=5.908, elasped=5.3s
  [4/5] LightGBM   : score=3.044, val_score=5.311, elasped=37.9s
  [5/5] ExtraTrees : score=3.137, val_score=5.285, elasped=8.0s
  Total(balanced): score=2.923, val_score=5.249, elasped=120.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.238, val_score=5.602, elasped=6.6s
  [2/5] CatBoost   : score=3.789, val_score=5.124, elasped=29.8s
  [3/5] XGBoost    : score=1.193, val_score=6.036, elasped=6.0s
  [4/5] LightGBM   : score=4.109, val_score=5.304, elasped=22.0s
  [5/5] ExtraTrees : score=3.009, val_score=5.610, elasped=7.2s
  Total(balanced): score=3.265, val_score=5.214, elasped=71.6s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.557, val_score=6.944, elasped=7.0s
  [2/5] CatBoost   : score=3.308, val_score=5.959, elasped=45.1s
  [3/5] XGBoost    : score=0.811, val_score=6.388, elas

Segment: audi, Length: 5597:  25%|██▌       | 5/20 [1:14:26<3:14:48, 779.23s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.532, val_score=6.671, elasped=63.6s
  [2/5] CatBoost   : score=4.955, val_score=6.281, elasped=92.8s
  [3/5] XGBoost    : score=1.929, val_score=6.830, elasped=43.2s
  [4/5] LightGBM   : score=5.253, val_score=6.562, elasped=52.7s
  [5/5] ExtraTrees : score=5.246, val_score=7.054, elasped=223.3s
  Total(balanced): score=4.553, val_score=6.334, elasped=475.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.487, val_score=6.892, elasped=62.6s
  [2/5] CatBoost   : score=5.070, val_score=6.389, elasped=75.9s
  [3/5] XGBoost    : score=1.987, val_score=7.265, elasped=51.6s
  [4/5] LightGBM   : score=5.485, val_score=6.688, elasped=39.0s
  [5/5] ExtraTrees : score=5.282, val_score=7.309, elasped=223.3s
  Total(balanced): score=4.661, val_score=6.538, elasped=452.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.518, val_score=6.728, elasped=62.5s
  [2/5] CatBoost   : score=4.751, val_score=6.423, elasped=111.4s
  [3/5] XGBoost    : score=2.000, val_score=

Segment: renault, Length: 3853:  30%|███       | 6/20 [1:53:34<5:06:14, 1312.48s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.074, val_score=5.281, elasped=29.2s
  [2/5] CatBoost   : score=3.228, val_score=4.962, elasped=113.5s
  [3/5] XGBoost    : score=1.101, val_score=5.510, elasped=20.5s
  [4/5] LightGBM   : score=3.870, val_score=5.160, elasped=40.0s
  [5/5] ExtraTrees : score=3.807, val_score=5.691, elasped=86.8s
  Total(balanced): score=3.195, val_score=4.978, elasped=290.1s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.029, val_score=5.553, elasped=28.8s
  [2/5] CatBoost   : score=2.913, val_score=4.771, elasped=185.9s
  [3/5] XGBoost    : score=1.012, val_score=5.366, elasped=23.0s
  [4/5] LightGBM   : score=3.242, val_score=4.984, elasped=66.4s
  [5/5] ExtraTrees : score=3.952, val_score=5.654, elasped=87.0s
  Total(balanced): score=2.957, val_score=4.857, elasped=391.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.094, val_score=5.299, elasped=29.0s
  [2/5] CatBoost   : score=3.488, val_score=4.535, elasped=78.9s
  [3/5] XGBoost    : score=1.207, val_score=5

Segment: volkswagen, Length: 5693:  35%|███▌      | 7/20 [2:19:21<5:01:01, 1389.31s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.471, val_score=6.844, elasped=57.6s
  [2/5] CatBoost   : score=4.772, val_score=6.166, elasped=94.0s
  [3/5] XGBoost    : score=1.954, val_score=7.065, elasped=37.0s
  [4/5] LightGBM   : score=4.838, val_score=6.186, elasped=58.0s
  [5/5] ExtraTrees : score=5.068, val_score=6.974, elasped=211.9s
  Total(balanced): score=4.402, val_score=6.236, elasped=458.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.482, val_score=6.734, elasped=57.6s
  [2/5] CatBoost   : score=4.821, val_score=6.338, elasped=87.0s
  [3/5] XGBoost    : score=1.794, val_score=6.982, elasped=42.4s
  [4/5] LightGBM   : score=4.997, val_score=6.364, elasped=48.2s
  [5/5] ExtraTrees : score=5.082, val_score=6.977, elasped=208.2s
  Total(balanced): score=4.401, val_score=6.336, elasped=443.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.517, val_score=6.652, elasped=57.2s
  [2/5] CatBoost   : score=4.595, val_score=5.915, elasped=122.5s
  [3/5] XGBoost    : score=1.898, val_score=

Segment: citroen, Length: 1129:  40%|████      | 8/20 [2:59:03<5:41:02, 1705.19s/it]   

> [K-Fold] 1/5
  [1/5] ElasticNet : score=3.587, val_score=3.974, elasped=7.7s
  [2/5] CatBoost   : score=2.497, val_score=3.825, elasped=43.1s
  [3/5] XGBoost    : score=0.481, val_score=4.669, elasped=5.8s
  [4/5] LightGBM   : score=2.470, val_score=3.987, elasped=30.5s
  [5/5] ExtraTrees : score=2.441, val_score=4.872, elasped=8.9s
  Total(balanced): score=2.181, val_score=3.886, elasped=96.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=3.799, val_score=3.901, elasped=7.4s
  [2/5] CatBoost   : score=2.656, val_score=4.173, elasped=32.3s
  [3/5] XGBoost    : score=1.058, val_score=4.500, elasped=4.8s
  [4/5] LightGBM   : score=3.122, val_score=4.641, elasped=20.7s
  [5/5] ExtraTrees : score=2.700, val_score=4.691, elasped=9.1s
  Total(balanced): score=2.509, val_score=4.128, elasped=74.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=3.421, val_score=4.474, elasped=7.4s
  [2/5] CatBoost   : score=2.543, val_score=4.291, elasped=41.1s
  [3/5] XGBoost    : score=0.513, val_score=5.246, elasp

Segment: bmw, Length: 5262:  45%|████▌     | 9/20 [3:07:08<4:02:41, 1323.73s/it]    

> [K-Fold] 1/5
  [1/5] ElasticNet : score=8.023, val_score=8.708, elasped=57.8s
  [2/5] CatBoost   : score=6.217, val_score=7.934, elasped=65.5s
  [3/5] XGBoost    : score=2.721, val_score=8.755, elasped=26.9s
  [4/5] LightGBM   : score=6.095, val_score=7.884, elasped=51.8s
  [5/5] ExtraTrees : score=6.142, val_score=8.523, elasped=204.2s
  Total(balanced): score=5.559, val_score=7.953, elasped=406.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=7.949, val_score=8.578, elasped=56.0s
  [2/5] CatBoost   : score=6.031, val_score=7.801, elasped=72.7s
  [3/5] XGBoost    : score=2.696, val_score=8.200, elasped=29.9s
  [4/5] LightGBM   : score=6.600, val_score=8.014, elasped=35.3s
  [5/5] ExtraTrees : score=6.183, val_score=8.404, elasped=201.6s
  Total(balanced): score=5.588, val_score=7.831, elasped=395.5s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.061, val_score=8.359, elasped=56.5s
  [2/5] CatBoost   : score=5.543, val_score=7.506, elasped=132.2s
  [3/5] XGBoost    : score=2.453, val_score=

Segment: opel, Length: 6651:  50%|█████     | 10/20 [3:42:32<4:21:47, 1570.74s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.202, val_score=4.088, elasped=58.5s
  [2/5] CatBoost   : score=3.097, val_score=3.831, elasped=117.6s
  [3/5] XGBoost    : score=1.427, val_score=4.248, elasped=57.0s
  [4/5] LightGBM   : score=3.347, val_score=3.966, elasped=55.8s
  [5/5] ExtraTrees : score=3.533, val_score=4.490, elasped=310.7s
  Total(balanced): score=2.960, val_score=3.901, elasped=599.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.184, val_score=4.201, elasped=58.4s
  [2/5] CatBoost   : score=2.847, val_score=3.828, elasped=189.9s
  [3/5] XGBoost    : score=1.320, val_score=4.356, elasped=56.0s
  [4/5] LightGBM   : score=3.094, val_score=3.898, elasped=69.0s
  [5/5] ExtraTrees : score=3.485, val_score=4.528, elasped=312.5s
  Total(balanced): score=2.827, val_score=3.921, elasped=685.9s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.155, val_score=4.364, elasped=57.9s
  [2/5] CatBoost   : score=2.902, val_score=3.988, elasped=155.9s
  [3/5] XGBoost    : score=1.207, val_scor

Segment: ford, Length: 5819:  55%|█████▌    | 11/20 [4:35:53<5:10:27, 2069.69s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.425, val_score=5.953, elasped=46.2s
  [2/5] CatBoost   : score=3.763, val_score=5.354, elasped=108.3s
  [3/5] XGBoost    : score=1.560, val_score=5.794, elasped=38.1s
  [4/5] LightGBM   : score=3.944, val_score=5.406, elasped=55.8s
  [5/5] ExtraTrees : score=4.584, val_score=6.176, elasped=224.1s
  Total(balanced): score=3.608, val_score=5.419, elasped=472.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.520, val_score=5.392, elasped=45.4s
  [2/5] CatBoost   : score=3.520, val_score=4.999, elasped=177.5s
  [3/5] XGBoost    : score=1.404, val_score=5.331, elasped=41.6s
  [4/5] LightGBM   : score=3.801, val_score=5.110, elasped=60.2s
  [5/5] ExtraTrees : score=4.608, val_score=5.920, elasped=222.5s
  Total(balanced): score=3.506, val_score=4.972, elasped=547.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.334, val_score=5.990, elasped=47.2s
  [2/5] CatBoost   : score=3.603, val_score=5.398, elasped=146.4s
  [3/5] XGBoost    : score=1.290, val_scor

Segment: mazda, Length: 1572:  60%|██████    | 12/20 [5:17:16<4:52:45, 2195.73s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.652, val_score=6.414, elasped=10.9s
  [2/5] CatBoost   : score=3.646, val_score=6.326, elasped=50.8s
  [3/5] XGBoost    : score=1.269, val_score=7.197, elasped=6.8s
  [4/5] LightGBM   : score=4.820, val_score=6.650, elasped=25.3s
  [5/5] ExtraTrees : score=3.353, val_score=7.286, elasped=13.0s
  Total(balanced): score=3.609, val_score=6.419, elasped=106.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.873, val_score=6.410, elasped=10.2s
  [2/5] CatBoost   : score=3.922, val_score=5.550, elasped=39.0s
  [3/5] XGBoost    : score=0.861, val_score=6.172, elasped=7.2s
  [4/5] LightGBM   : score=4.027, val_score=6.102, elasped=31.2s
  [5/5] ExtraTrees : score=3.507, val_score=6.228, elasped=13.4s
  Total(balanced): score=3.447, val_score=5.701, elasped=101.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.482, val_score=6.243, elasped=10.0s
  [2/5] CatBoost   : score=4.142, val_score=6.144, elasped=32.3s
  [3/5] XGBoost    : score=0.734, val_score=7.031

Segment: honda, Length: 1545:  65%|██████▌   | 13/20 [5:26:38<3:18:24, 1700.59s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.350, val_score=5.567, elasped=10.2s
  [2/5] CatBoost   : score=3.115, val_score=5.295, elasped=61.6s
  [3/5] XGBoost    : score=0.645, val_score=6.750, elasped=7.7s
  [4/5] LightGBM   : score=3.434, val_score=5.370, elasped=38.8s
  [5/5] ExtraTrees : score=3.032, val_score=6.462, elasped=12.6s
  Total(balanced): score=2.949, val_score=5.350, elasped=131.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.343, val_score=5.901, elasped=9.8s
  [2/5] CatBoost   : score=3.391, val_score=5.101, elasped=47.9s
  [3/5] XGBoost    : score=0.595, val_score=6.463, elasped=6.9s
  [4/5] LightGBM   : score=3.647, val_score=5.212, elasped=30.6s
  [5/5] ExtraTrees : score=3.067, val_score=6.364, elasped=13.0s
  Total(balanced): score=3.011, val_score=5.253, elasped=108.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.189, val_score=6.419, elasped=10.1s
  [2/5] CatBoost   : score=3.236, val_score=5.766, elasped=50.7s
  [3/5] XGBoost    : score=0.857, val_score=6.875,

Segment: kia, Length: 2034:  70%|███████   | 14/20 [5:36:29<2:16:32, 1365.48s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.875, val_score=5.585, elasped=13.4s
  [2/5] CatBoost   : score=4.341, val_score=5.329, elasped=45.1s
  [3/5] XGBoost    : score=0.730, val_score=6.252, elasped=9.3s
  [4/5] LightGBM   : score=4.655, val_score=5.675, elasped=31.7s
  [5/5] ExtraTrees : score=3.678, val_score=5.630, elasped=22.2s
  Total(balanced): score=3.678, val_score=5.255, elasped=121.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.615, val_score=5.635, elasped=12.8s
  [2/5] CatBoost   : score=4.335, val_score=5.417, elasped=41.0s
  [3/5] XGBoost    : score=0.749, val_score=6.047, elasped=9.5s
  [4/5] LightGBM   : score=4.597, val_score=5.808, elasped=29.1s
  [5/5] ExtraTrees : score=3.658, val_score=5.516, elasped=22.4s
  Total(balanced): score=3.585, val_score=5.272, elasped=114.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.787, val_score=6.156, elasped=12.5s
  [2/5] CatBoost   : score=4.392, val_score=5.660, elasped=38.9s
  [3/5] XGBoost    : score=0.796, val_score=6.481

Segment: seat, Length: 1628:  75%|███████▌  | 15/20 [5:47:13<1:35:40, 1148.15s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.595, val_score=5.634, elasped=13.0s
  [2/5] CatBoost   : score=2.922, val_score=5.319, elasped=65.8s
  [3/5] XGBoost    : score=0.444, val_score=6.274, elasped=8.9s
  [4/5] LightGBM   : score=3.363, val_score=5.699, elasped=30.8s
  [5/5] ExtraTrees : score=2.984, val_score=6.305, elasped=17.8s
  Total(balanced): score=2.661, val_score=5.439, elasped=136.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.594, val_score=5.551, elasped=12.3s
  [2/5] CatBoost   : score=2.969, val_score=4.867, elasped=57.4s
  [3/5] XGBoost    : score=0.521, val_score=5.632, elasped=9.0s
  [4/5] LightGBM   : score=3.580, val_score=5.303, elasped=27.8s
  [5/5] ExtraTrees : score=3.194, val_score=5.810, elasped=17.2s
  Total(balanced): score=2.700, val_score=4.924, elasped=123.7s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.087, val_score=5.155, elasped=12.1s
  [2/5] CatBoost   : score=2.587, val_score=4.835, elasped=85.9s
  [3/5] XGBoost    : score=0.745, val_score=6.095

Segment: volvo, Length: 1352:  80%|████████  | 16/20 [5:58:32<1:07:06, 1006.71s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=7.617, val_score=8.311, elasped=9.8s
  [2/5] CatBoost   : score=4.986, val_score=8.710, elasped=44.4s
  [3/5] XGBoost    : score=0.382, val_score=10.691, elasped=8.9s
  [4/5] LightGBM   : score=4.416, val_score=8.980, elasped=39.0s
  [5/5] ExtraTrees : score=5.457, val_score=9.546, elasped=12.8s
  Total(balanced): score=4.397, val_score=8.604, elasped=114.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=7.768, val_score=8.940, elasped=9.6s
  [2/5] CatBoost   : score=4.734, val_score=8.384, elasped=53.4s
  [3/5] XGBoost    : score=0.992, val_score=10.253, elasped=6.4s
  [4/5] LightGBM   : score=4.222, val_score=8.581, elasped=40.0s
  [5/5] ExtraTrees : score=5.452, val_score=10.306, elasped=13.4s
  Total(balanced): score=4.372, val_score=8.441, elasped=122.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=7.907, val_score=7.853, elasped=9.6s
  [2/5] CatBoost   : score=5.863, val_score=7.313, elasped=32.5s
  [3/5] XGBoost    : score=0.577, val_score=9.252

Segment: peugeot, Length: 793:  85%|████████▌ | 17/20 [6:08:58<44:36, 892.30s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.030, val_score=5.826, elasped=5.0s
  [2/5] CatBoost   : score=3.793, val_score=5.616, elasped=25.9s
  [3/5] XGBoost    : score=1.161, val_score=6.404, elasped=4.5s
  [4/5] LightGBM   : score=4.042, val_score=6.063, elasped=20.3s
  [5/5] ExtraTrees : score=2.843, val_score=5.760, elasped=5.3s
  Total(balanced): score=3.159, val_score=5.597, elasped=61.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.578, val_score=5.293, elasped=4.9s
  [2/5] CatBoost   : score=3.922, val_score=5.546, elasped=26.8s
  [3/5] XGBoost    : score=0.288, val_score=6.107, elasped=4.9s
  [4/5] LightGBM   : score=4.298, val_score=5.753, elasped=18.3s
  [5/5] ExtraTrees : score=2.683, val_score=5.359, elasped=4.4s
  Total(balanced): score=3.187, val_score=5.130, elasped=59.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.715, val_score=6.452, elasped=4.8s
  [2/5] CatBoost   : score=2.380, val_score=6.314, elasped=92.4s
  [3/5] XGBoost    : score=0.351, val_score=7.515, elasp

Segment: hyundai, Length: 1855:  90%|█████████ | 18/20 [6:15:25<24:41, 740.51s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.774, val_score=6.176, elasped=9.7s
  [2/5] CatBoost   : score=3.812, val_score=5.388, elasped=69.0s
  [3/5] XGBoost    : score=1.047, val_score=6.062, elasped=7.0s
  [4/5] LightGBM   : score=4.333, val_score=5.642, elasped=35.1s
  [5/5] ExtraTrees : score=3.546, val_score=5.803, elasped=17.7s
  Total(balanced): score=3.472, val_score=5.398, elasped=138.5s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.721, val_score=6.015, elasped=10.4s
  [2/5] CatBoost   : score=3.926, val_score=5.393, elasped=51.6s
  [3/5] XGBoost    : score=0.836, val_score=6.522, elasped=7.2s
  [4/5] LightGBM   : score=4.447, val_score=5.727, elasped=28.0s
  [5/5] ExtraTrees : score=3.364, val_score=5.529, elasped=17.6s
  Total(balanced): score=3.454, val_score=5.431, elasped=114.7s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.804, val_score=6.053, elasped=9.8s
  [2/5] CatBoost   : score=3.567, val_score=5.598, elasped=68.4s
  [3/5] XGBoost    : score=1.035, val_score=5.942, 

Segment: mitsubishi, Length: 556:  95%|█████████▌| 19/20 [6:25:32<11:40, 700.38s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.127, val_score=6.739, elasped=69.5s
  [2/5] CatBoost   : score=4.170, val_score=6.531, elasped=28.4s
  [3/5] XGBoost    : score=1.245, val_score=7.767, elasped=4.2s
  [4/5] LightGBM   : score=5.194, val_score=6.828, elasped=11.5s
  [5/5] ExtraTrees : score=3.037, val_score=6.923, elasped=2.3s
  Total(balanced): score=3.413, val_score=6.386, elasped=116.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.187, val_score=6.077, elasped=72.0s
  [2/5] CatBoost   : score=3.790, val_score=6.199, elasped=32.6s
  [3/5] XGBoost    : score=0.801, val_score=8.256, elasped=4.7s
  [4/5] LightGBM   : score=4.810, val_score=6.777, elasped=13.1s
  [5/5] ExtraTrees : score=3.000, val_score=7.939, elasped=2.4s
  Total(balanced): score=3.324, val_score=6.254, elasped=124.9s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.362, val_score=6.408, elasped=71.3s
  [2/5] CatBoost   : score=4.038, val_score=6.423, elasped=29.1s
  [3/5] XGBoost    : score=2.846, val_score=7.288, 

Segment: mitsubishi, Length: 556: 100%|██████████| 20/20 [6:35:49<00:00, 1187.50s/it]

CPU times: user 1d 6h 21min 37s, sys: 3h 1min 3s, total: 1d 9h 22min 41s
Wall time: 6h 36min 5s





In [134]:
import pickle
with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [135]:
# import pickle
# with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'rb') as f:
# 	models = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'rb') as f:
# 	feature_info = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'rb') as f:
# 	scores = pickle.load(f)

In [136]:
# pd.DataFrame(
#     np.array(scores).reshape(100,5),
#     columns=['segment','k','n_tr','n_val','score']
# ).sort_values(['segment','k'])

In [137]:
# inference
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

X_test = test_fn.copy()

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)
X_test_oh = ohe.transform(X_test)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data    = X   [X   .segment==segment][feature_info[segment]['features']]
    train_data_oh = X_oh[X_oh.segment==segment][feature_info[segment]['oh_features']]
    # (2) test
    test_data     = X_test   [X_test   .segment==segment][feature_info[segment]['features']]
    test_data_oh  = X_test_oh[X_test_oh.segment==segment][feature_info[segment]['oh_features']]
    
    ## model
    kfold_models = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'true':y[X.segment==segment].values.flatten(),
        'pred':np.mean([model.predict(train_data,train_data_oh) for model in kfold_models],axis=0),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'pred':np.mean([model.predict(test_data,test_data_oh) for model in kfold_models],axis=0),
    })
    te_pred_df.index = test_data.index
    if CFG.TARGET_TRANSFORMATION:
        tr_pred_df['true'] = np.exp(tr_pred_df['true'])
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:52<00:00,  2.65s/it]


In [138]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

4.174799692173191

In [139]:
# def abline(intercept,slope,**kwargs):
#     axes = plt.gca()
#     x_vals = np.array(axes.get_xlim())
#     y_vals = intercept + slope * x_vals
#     plt.plot(x_vals, y_vals, '--',**kwargs)

# offset = 0.05
# min_value = min(tr_pred_df.true.min(),tr_pred_df.pred.min())*(1-offset)
# max_value = min(tr_pred_df.true.max(),tr_pred_df.pred.max())*(1+offset)

# plt.figure(figsize=(15,7))
# sns.scatterplot(x=tr_pred_df.true,y=tr_pred_df.pred)
# plt.xlim(min_value,max_value)
# plt.ylim(min_value,max_value)
# abline(0,1,color='red',linestyle='--')
# plt.show()

In [140]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

Unnamed: 0,pred
0,84.277319
1,26.037368
2,89.219354
3,125.427724
4,50.114399


In [141]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
submit.to_csv('./out/13_ensemble_segment_브랜드_kfold_logy_interaction.csv',index=False)

<br>

## 참조 pycaret

In [None]:
# from pycaret import regression

In [None]:
# %%time

# data = train_fn[train_fn.segment==segment_list[0]]
# # data['가격'] = np.exp(data['가격'])
# print(len(data))

# regression.setup(data=data,target='가격',remove_outliers=True,verbose=True)
# best = regression.compare_models(n_select=5,fold=5)