# Library Setting

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [2]:
class CFG:
    # Central configuration for the notebook; later cells read these as CFG.<NAME>.

    # Random seed passed as random_state to the CV splitters and the models.
    SEED = 0
    
    # Passed to FeatureEngineering.fit as subset_depth: the largest number of
    # categorical columns grouped together when computing target quantiles.
    SUBSET_DEPTH = 1
    # When True, the "Add the Interaction Term" cell builds pairwise products
    # of numerical features with InteractionTerm.
    INTERACTION = False
    # p-value threshold used by the feature-selection cells.
    FS_ALPHA = 0.01
    
    # Number of folds used for K-fold cross-validation.
    N_SPLITS = 5
    # When True, the target column is replaced by its natural log before
    # training and predictions are mapped back with np.exp.
    TARGET_TRANSFORMATION = True
    
    # Learning rate, iteration cap and early-stopping rounds used for CatBoost
    # (the same three values are reused in the LightGBM parameter dict).
    LR = 0.003
    EPOCHS = 30000
    ES = 300
    # XGBoost counterparts of the three settings above.
    XGB_LR = 0.01     # default=0.3
    XGB_EPOCHS = 1000 # default=100
    XGB_ES = 100
    # NOTE(review): not referenced in the visible part of the notebook;
    # presumably the number of trees for ExtraTrees - confirm against the usage.
    XTRATREES_EPOCHS = 100 #default=100

<br></br>

# Data

## Data Load

In [3]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [4]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [5]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Target Transformation

In [6]:
if CFG.TARGET_TRANSFORMATION:
    train_df['가격'] = np.log(train_df['가격'])

<br>

## Resetting Columns Type

In [7]:
class TypeResetting:
    """Assign every column a role and cast it to the dtype that role expects.

    Roles: target, unused (dropped), dummy (int flags), categorical (str),
    segment (str) and numerical (float; every column not claimed by another role).
    """

    def __init__(self):
        self.seg_features = []
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']

    def add_categorical_features(self,cat_features):
        """Append extra column names to the categorical role."""
        self.cat_features.extend(cat_features)

    def delete_categorical_features(self,cat_features):
        """Remove the given column names from the categorical role."""
        self.cat_features = list(filter(lambda name: name not in cat_features, self.cat_features))

    def add_segment_features(self,segment_features):
        """Enable the 'segment' role and release the columns it was built from."""
        self.seg_features = ['segment']
        self.cat_features = list(filter(lambda name: name not in segment_features, self.cat_features))

    def fit(self,data):
        """Derive the role lists from the columns of `data`.

        Raises:
            ValueError: the segment role is enabled but `data` has no 'segment' column.
        """
        if self.seg_features and 'segment' not in data.columns:
            raise ValueError("segment column name must be 'segment'")
        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']
        # Whatever no other role has claimed is treated as numerical.
        claimed = (self.target_feature + self.unuse_features + self.dummy_features
                   + self.cat_features + self.seg_features)
        self.num_features = [name for name in data.columns if name not in claimed]

    def transform(self,data):
        """Return a copy of `data` with role dtypes applied and unused columns dropped."""
        out = data.copy()
        # (columns, dtype that needs no cast, dtype to cast to), applied in this order.
        cast_plan = (
            (self.dummy_features, int,    int),
            (self.cat_features,   object, str),
            (self.num_features,   float,  float),
            (self.seg_features,   object, str),
        )
        for names, already, wanted in cast_plan:
            for name in names:
                if out[name].dtypes!=already:
                    out[name] = out[name].astype(wanted)
        removable = [name for name in self.unuse_features if name in out.columns]
        return out.drop(columns=removable)

    def fit_transform(self,data):
        """Run fit followed by transform on the same frame."""
        self.fit(data)
        return self.transform(data)

    def get_feature_type(self):
        """Publish the role lists as module-level globals (segment role excluded)."""
        globals().update({
            'target_feature': self.target_feature,
            'unuse_features': self.unuse_features,
            'dummy_features': self.dummy_features,
            'cat_features':   self.cat_features,
            'num_features':   self.num_features,
        })

In [8]:
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [9]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    """Report, per column, how many distinct values occur in only one of the two frames.

    Returns the columns for which the test frame has no value unseen in train.
    """
    total = len(cat_features)
    safe_columns = []
    for position,col in enumerate(cat_features,start=1):
        print('[{}/{}] {}'.format(position,total,col))

        train_values = set(train[col].unique())
        test_values  = set(test[col].unique())
        only_train = list(train_values-test_values)
        only_test  = list(test_values-train_values)
        print(' - Only Train:',len(only_train))
        print(' - Only Test :',len(only_test))
        if only_test:
            # Values the model never saw during training.
            print('******Warning******')
        else:
            safe_columns.append(col)
        print('')
    return safe_columns

In [10]:
# 브랜드, 차량모델명, 판매구역, 모델출시년도
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [11]:
not_test_only_features

['차량모델명', '브랜드', '모델출시년도', '판매구역']

<br></br>

# New Features

In [12]:
# pd.Series([str(round(int(year)/100,1)) for year in train_df6['생산년도']]).value_counts()

In [13]:
train_df2.head()

Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054


In [14]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    """Return every subset of `ss` as a list of tuples, smallest subsets first."""
    subsets = []
    for size in range(len(ss)+1):
        subsets.extend(combinations(ss, size))
    return subsets

class FeatureEngineering:
    """Derived year/brand features plus per-category quantiles of the target.

    `fit` learns, for every group of categorical columns up to `subset_depth`,
    the 0/25/50/75/100 % quantiles of the target column. `transform` adds the
    derived columns and left-joins those quantile tables onto the data.
    """

    # Quantile levels (in percent) computed for every categorical group.
    _QUANTILES = (0,25,50,75,100)

    # Brand -> country of origin (looked up manually on the web).
    _BRAND_COUNTRY = {
        'skoda':'체코',
        'toyota':'일본','nissan':'일본','mazda':'일본','honda':'일본','mitsubishi':'일본',
        'mercedes-benz':'독일','audi':'독일','volkswagen':'독일','bmw':'독일','opel':'독일',
        'fiat':'이탈리아',
        'renault':'프랑스','citroen':'프랑스','peugeot':'프랑스',
        'ford':'미국',
        'kia':'한국','hyundai':'한국',
        'seat':'스페인',
        'volvo':'스웨덴',
    }

    # Country -> continent.
    _COUNTRY_CONTINENT = {
        '체코':'유럽','독일':'유럽','이탈리아':'유럽','프랑스':'유럽','스페인':'유럽','스웨덴':'유럽',
        '일본':'아시아','한국':'아시아',
        '미국':'아메리카',
    }

    def __init__(self):
        pass

    def _get_quantile(self,x,col):
        """Return a one-row frame with the quantiles of the non-null values of `x`.

        Columns are named '<col>_Q<percent>'.
        """
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]
        return pd.DataFrame(
            {f'{col}_Q{q}': np.quantile(x,q/100) for q in self._QUANTILES},
            index=[0],
        )

    def _derived_features(self,data):
        """Return a copy of `data` with the five hand-crafted columns added."""
        d = data.copy()
        produced = d['생산년도'].astype(int)
        released = d['모델출시년도'].astype(int)

        # (1) Was the car produced in the year its model was released?
        d['출시년도생산여부'] = np.where(produced==released,1,0)

        # (2) Years between the model release and the production of this car.
        d['출시이후생산년수'] = produced-released

        # (3) Was the car produced before the recorded model release year?
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) Country of the brand; brands missing from the table become NaN.
        d['브랜드국적'] = [self._BRAND_COUNTRY.get(brand,np.nan) for brand in d['브랜드']]

        # (5) Continent of that country; unknown countries become NaN.
        d['브랜드대륙명'] = [self._COUNTRY_CONTINENT.get(country,np.nan) for country in d['브랜드국적']]
        return d

    def fit(self,data,cat_features,subset_depth=1):
        """Learn the target-quantile tables from the training data.

        Args:
            data: training frame; must contain the target column '가격'.
            cat_features: categorical columns to group by.
            subset_depth: largest number of columns combined into one group.
        """
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'

        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명']

        # (6) Quantiles of the target per categorical group.
        # Only groups of size 1..subset_depth are generated, instead of building
        # all 2**n subsets and discarding most of them.
        all_subset_list = [
            list(subset)
            for size in range(1,subset_depth+1)
            for subset in combinations(cat_features,size)
        ]

        self.agg_dict = {}
        # Exact grouping columns per table. The joined name cannot be split back
        # reliably because column names may themselves contain '_'.
        self.agg_keys = {}
        for subset in tqdm(all_subset_list,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset_name = '_'.join(subset)
            quantile_cols = [f'{subset_name}_Q{q}' for q in self._QUANTILES]
            agg_fn = data.groupby(subset)['가격'].apply(lambda x: self._get_quantile(x,subset_name)).reset_index()
            # Keep the keys and the quantiles explicitly; this discards the helper
            # 'level_N' index column without touching features named like it.
            agg_fn = agg_fn[subset+quantile_cols]
            self.agg_dict[subset_name] = agg_fn
            self.agg_keys[subset_name] = subset

    def transform(self,data):
        """Add the derived columns and left-join every learned quantile table."""
        data = self._derived_features(data)
        agg_keys = getattr(self,'agg_keys',{})
        for name,agg_fn in self.agg_dict.items():
            # Fall back to splitting the name for objects fitted before agg_keys existed.
            keys = agg_keys.get(name) or name.split('_')
            data = pd.merge(data,agg_fn,how='left',on=keys)
        return data

    def fit_transform(self,data,cat_features,subset_depth=1):
        """Run fit followed by transform on the same frame."""
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [15]:
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=1): 100%|██████████| 4/4 [00:00<00:00, 12.86it/s]


In [16]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명']

In [17]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [18]:
print(train_df3.shape)
train_df3.head()

(57920, 39)


Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0.792993,2.933059,3.712352,4.065687,4.830312,0.727549,3.393501,4.057853,4.509072,5.049856,0.157004,4.019486,4.355041,4.64314,5.049856,0.262364,3.206803,3.77391,4.355041,5.049856
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,2.687167,3.594569,4.112512,4.387075,4.761062,0.157004,3.353407,4.000034,4.435212,5.049856,1.358409,3.089678,3.348851,3.785779,5.049856,1.095273,3.113071,3.520461,4.066802,5.049856
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,2.250239,3.147165,3.440418,3.627069,4.866534,1.095273,3.152736,3.785779,4.354655,5.049856,0.955511,2.453588,2.826722,3.353407,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,2.164472,4.368303,4.575844,4.761062,5.049022,1.050822,3.558201,3.923359,4.405499,5.049022,0.955511,2.341806,2.738903,3.095125,4.886356,0.482426,3.089678,3.707577,4.296605,5.049856
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,2.134166,2.894253,3.065725,3.21072,3.660223,1.111858,2.865054,3.529985,4.174387,4.969049,1.697449,3.24921,3.626206,3.948741,5.049856,0.732368,3.201526,3.755837,4.305416,5.049856


<br></br>

# EDA

In [19]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [20]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    """Return the absolute Pearson correlation between `x` and `y`."""
    return np.abs(np.corrcoef(x,y))[0,1]

class InteractionTerm:
    """Select and build pairwise product features of numerical columns.

    A product `a*b` is kept only when it is not strongly correlated with
    either of its operands (absolute correlation below `corr_cutoff`).
    """

    def __init__(self):
        pass

    def fit(self,data,num_features,corr_cutoff=0.7):
        """Decide which pairwise products to create.

        Sets `interaction_list` (names of the form '<col_i>*<col_j>') and
        `interaction_pairs` (name -> (col_i, col_j)).
        """
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

        self.interaction_list = []
        # Operands are stored explicitly so transform does not have to parse the
        # name, which breaks when a column name itself contains '*'.
        self.interaction_pairs = {}
        for i in range(len(num_features)):
            for j in range(i):
                col_i = num_features[i]
                col_j = num_features[j]
                product = data[col_i]*data[col_j]

                # Skip the product when it is highly correlated with an operand.
                if (get_abs_corr(product,data[col_i])>=corr_cutoff) or (get_abs_corr(product,data[col_j])>=corr_cutoff):
                    continue

                name = f'{col_i}*{col_j}'
                self.interaction_list.append(name)
                self.interaction_pairs[name] = (col_i,col_j)

    def transform(self,data):
        """Return a copy of `data` with every selected product column added."""
        d = data.copy()
        pairs = getattr(self,'interaction_pairs',{})
        for interaction in self.interaction_list:
            # Fall back to parsing the name for objects fitted before the
            # operands were stored, or for names added by hand.
            col_i,col_j = pairs[interaction] if interaction in pairs else interaction.split('*')
            d[interaction] = d[col_i]*d[col_j]
        return d

    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        """Run fit followed by transform on the same frame."""
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [21]:
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.7,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

<br></br>

# Feature Selection

In [22]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [23]:
def log_offset(x):
    """Natural log of `x`, shifted so that the smallest value is strictly positive.

    - all values > 0: no shift;
    - minimum == 0: shift by 1e-3;
    - minimum < 0: shift by `-minimum + 1e-3` (the minimum is also printed).

    NaN entries are ignored when locating the minimum and stay NaN in the result.
    The shift is derived from `x` itself, so two arrays transformed separately
    may receive different shifts.
    """
    lowest = np.nanmin(x)
    if lowest>0:
        offset = 0
    elif lowest==0:
        offset = 1e-3
    elif lowest<0:
        # Shift upwards by the magnitude of the minimum. Adding the negative
        # minimum itself would push the data further below zero.
        offset = -lowest+1e-3
        print('minimum = {:.3f}'.format(lowest))
    else:
        # Only reached when every value is NaN; nothing sensible to shift by.
        offset = 0
    return np.log(x+offset)

<br></br>

## Categorical Features

In [24]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [25]:
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) ANOVA를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in tqdm(check_cat_features):
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 9/9 [00:03<00:00,  2.49it/s]


In [26]:
# (2) Categorical features that were not significant in (1) are dropped.
# The numerical branch re-tests such features after a log transform, but a log
# is not defined for the string-typed categorical columns. The previous version
# of this cell also selected a non-existent 'target' column, so it raised as
# soon as any feature was insignificant. The ANOVA p-values from (1) are carried
# forward instead, which makes the next cell drop every insignificant feature
# and leaves its log_features list empty.
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
pvalue_lookup = pvalue_df.set_index('feature')['pvalue']
for col in tqdm(unsignificant_features):
    pvalue_list2.append([col,pvalue_lookup[col]])

pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [27]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [28]:
import scipy

In [29]:
# (1) corr test를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in num_features:
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df.round(4).head()

In [30]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],log_offset(train_df5[col]))
    pvalue_list2.append([col,pvalue])
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df2.round(4).head()

0it [00:00, ?it/s]


In [31]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df6 = train_df5.copy()
train_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df6[col] = log_offset(train_df6[col])
    
test_df6 = test_df5.copy()
test_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df6[col] = log_offset(test_df6[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br></br>

# Make Segment

In [32]:
def make_segment(data,segment: list):
    """Collapse the `segment` columns into one 'segment' column.

    The values of each row are joined with '___'; the source columns are
    removed from the returned copy.
    """
    d = data.copy()
    row_values = d[segment].itertuples(index=False,name=None)
    d['segment'] = ['___'.join(values) for values in row_values]
    return d.drop(columns=segment)

In [33]:
segment = ['브랜드']
train_df7 = make_segment(train_df6,segment)
test_df7  = make_segment(test_df6 ,segment)

In [34]:
test_only = list(set(test_df7.segment.unique())-set(train_df7.segment.unique()))
assert len(test_only)==0, \
    "Segment exists only in the test set ({})".format(len(test_only))

In [35]:
train_only = list(set(train_df7['segment'].unique())-set(test_df7['segment'].unique()))

n_asis = len(train_df7)
n_tobe = len(train_df7[~train_df7.segment.isin(train_only)])
train_df7 = train_df7[~train_df7.segment.isin(train_only)]
print('> Train에만 존재하는 Segment 제거')
print(' - 데이터수 : {:,} -> {:,}'.format(n_asis,n_tobe))
print(' - 세그먼트수 : {:,}'.format(train_df7['segment'].nunique()))

> Train에만 존재하는 Segment 제거
 - 데이터수 : 57,920 -> 57,920
 - 세그먼트수 : 20


In [36]:
vc = train_df7['segment'].value_counts().sort_values()
display(vc.head())
print('...')
display(vc.tail())

segment
mitsubishi     556
peugeot        793
citroen       1129
fiat          1164
volvo         1352
Name: count, dtype: int64

...


segment
bmw           5262
audi          5597
volkswagen    5693
ford          5819
opel          6651
Name: count, dtype: int64

In [37]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.add_segment_features(segment)
type_resetor.fit(train_df7)
type_resetor.get_feature_type()

train_df7 = type_resetor.transform(train_df7)
test_df7  = type_resetor.transform(test_df7)

In [38]:
cat_features

['차량모델명',
 '판매도시',
 '판매구역',
 '생산년도',
 '모델출시년도',
 '출시년도생산여부',
 '출시이후생산년수',
 '출시이전생산여부',
 '브랜드국적',
 '브랜드대륙명']

In [39]:
print(train_df7.shape)
train_df7.head()

(57920, 39)


Unnamed: 0,생산년도,모델출시년도,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,segment
0,2018,2014,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0.792993,2.933059,3.712352,4.065687,4.830312,0.727549,3.393501,4.057853,4.509072,5.049856,0.157004,4.019486,4.355041,4.64314,5.049856,0.262364,3.206803,3.77391,4.355041,5.049856,skoda
1,2010,2006,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,2.687167,3.594569,4.112512,4.387075,4.761062,0.157004,3.353407,4.000034,4.435212,5.049856,1.358409,3.089678,3.348851,3.785779,5.049856,1.095273,3.113071,3.520461,4.066802,5.049856,toyota
2,2002,2002,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,2.250239,3.147165,3.440418,3.627069,4.866534,1.095273,3.152736,3.785779,4.354655,5.049856,0.955511,2.453588,2.826722,3.353407,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856,mercedes-benz
3,2006,2001,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,2.164472,4.368303,4.575844,4.761062,5.049022,1.050822,3.558201,3.923359,4.405499,5.049022,0.955511,2.341806,2.738903,3.095125,4.886356,0.482426,3.089678,3.707577,4.296605,5.049856,nissan
4,2007,2007,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,2.134166,2.894253,3.065725,3.21072,3.660223,1.111858,2.865054,3.529985,4.174387,4.969049,1.697449,3.24921,3.626206,3.948741,5.049856,0.732368,3.201526,3.755837,4.305416,5.049856,fiat


<br></br>

# Modeling

In [40]:
import os
def mkdir(paths):
    """Create each folder in `paths` if it does not exist yet.

    Args:
        paths: a single path (str or os.PathLike) or an iterable of paths.

    Missing parent folders are created as well. A message is printed for
    every folder that had to be created.
    """
    if isinstance(paths,(str,os.PathLike)):
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            # exist_ok avoids a failure if the folder appears between the
            # check above and this call.
            os.makedirs(path,exist_ok=True)

In [41]:
## dummy_features는 한가지만 속함
# X[dummy_features].apply(lambda x: np.sum(x),axis=1).value_counts()

def add_fuel_type(data,dummy_features):
    """Collapse the one-hot fuel flags into a single 'fuel_type' column.

    For every row, 'fuel_type' is the name of the first column in
    `dummy_features` whose value equals 1. The flag columns are removed from
    the returned copy.

    Raises:
        IndexError: at least one row has none of the flags set to 1.
    """
    d = data.copy()
    names = list(dummy_features)
    flags = d[names].to_numpy()==1

    has_flag = flags.any(axis=1)
    if not has_flag.all():
        raise IndexError(
            '{} row(s) have none of the fuel flags set: {}'.format(int((~has_flag).sum()),names)
        )

    # argmax over booleans returns the position of the first True in each row.
    first_hit = flags.argmax(axis=1) if flags.size else np.zeros(0,dtype=int)
    d['fuel_type'] = np.asarray(names,dtype=object)[first_hit]
    return d.drop(columns=names)

In [42]:
mkdir('./model_checkpoints')
mkdir('./model_checkpoints/segment_catboost')
mkdir('./model_checkpoints/segment_weightedensemble')

In [43]:
def check_null_cnt(data):
    """Return how many columns of `data` contain at least one null value."""
    columns_with_null = data.isnull().any()
    return int(columns_with_null.sum())

check_null_cnt(train_df7),check_null_cnt(test_df7)

(0, 0)

<br>

## CatBoost
- public score : 

In [None]:
gc.collect()

In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool

In [None]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [None]:
%%time
# 1시간

# Train one set of K-fold CatBoost models per segment and collect the
# validation MAE of every fold.
# Results: models[segment]       -> list of fitted models, one per fold
#          feature_info[segment] -> columns / categorical columns used for that segment
#          scores                -> per segment, a list of [segment, k, n_train, n_valid, MAE]
# target_feature is a one-element list, so y is a single-column DataFrame.
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

# Replace the one-hot fuel flags by a single categorical 'fuel_type' column.
X = add_fuel_type(X,dummy_features)
new_cat_features = cat_features + ['fuel_type']

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

# NOTE(review): the loop variable `segment` rebinds the notebook-level `segment`
# list assigned in the "Make Segment" section; re-running earlier cells that
# read that list after this loop will see a string instead.
s_i = 0
for segment in pbar:
    s_i+=1
    
    # Select the rows that belong to this segment.
    _X = X[X.segment==segment].drop('segment',axis=1)
    _y = y[X.segment==segment]
    
    # K-fold splitter; the seed is offset by the segment counter so every
    # segment gets a different shuffle.
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # Drop columns that hold a single distinct value within this segment.
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
        
    # Keep only the categorical features that survived the drop above.
    fixed_cat_features = [col for col in new_cat_features if col in _X.columns]
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        
        # kfold dataset
        X_tr, X_va = _X.iloc[tr_idx], _X.iloc[val_idx]
        y_tr, y_va = _y.iloc[tr_idx], _y.iloc[val_idx]

        # progress
        progress = 'Segment: [{}], Size: [{:,}], KFold: [{}/{}]'\
            .format(segment,len(_X),k,CFG.N_SPLITS)
        pbar.set_description(progress)

        # dataset
        train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
        valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)

        # define the model
        model = CatBoostRegressor(
            loss_function='MAE',
            random_state=CFG.SEED,
            iterations=CFG.EPOCHS,
            learning_rate=CFG.LR,
            allow_writing_files=False,
        )

        # fit the model
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=CFG.ES,
            verbose=0,
            #metric_period=CFG.EPOCHS//5,
        )

        # save the model
        model.save_model(f'./model_checkpoints/segment_catboost/{segment}_k{k}.cbm')

        # prediction
        y_pred = model.predict(valid_dataset).flatten()
        y_true = y_va.values
        
        # inverse transform: the model was trained on log(target), so the MAE
        # is computed back on the original scale
        if CFG.TARGET_TRANSFORMATION:
            y_pred = np.exp(y_pred)
            y_true = np.exp(y_true)
            
        # calculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {'cat_features':fixed_cat_features,'features':_X.columns.tolist()}
    
    # score report (np.array turns the mixed records into strings, hence the
    # cast of the last column back to float)
    mean_score_report = pd.Series(np.array(_scores)[:,-1]).astype(float).mean()
    print('Segment: {}'.format(segment))
    print("MAE's for {}-Fold: [{}]".format(CFG.N_SPLITS,np.array(pd.Series(np.array(_scores)[:,-1]).astype(float).values)))
    print("Mean of MAE's for {}-Fold: [{:.4f}]".format(CFG.N_SPLITS,mean_score_report))

In [None]:
import pickle
with open('./model_checkpoints/segment_cat_models_brand_kf.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_feature_info_brand_kf.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_scores_brand_kf.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Flatten the per-segment lists of fold records into one row per (segment, fold).
# Building the frame from the records works for any CFG.N_SPLITS (the previous
# reshape assumed exactly 5 folds) and keeps the numeric columns numeric
# instead of turning every field into a string.
score_df = pd.DataFrame(
    [record for segment_scores in scores for record in segment_scores],
    columns=['segment','k','n_tr','n_val','score']
)

score_df.sort_values(['segment','k']).head(10)

In [None]:
# inference
X = train_fn.drop(target_feature,axis=1)
X = add_fuel_type(X,dummy_features)
y = train_fn[target_feature]

X_test = add_fuel_type(test_fn,dummy_features)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data = X[X.segment==segment][feature_info[segment]['features']]
    train_dataset = Pool(train_data,cat_features=feature_info[segment]['cat_features'])
    # (2) test
    test_data = X_test[X_test.segment==segment][feature_info[segment]['features']]
    test_dataset = Pool(test_data,cat_features=feature_info[segment]['cat_features'])
    
    ## model
    kfold_models = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'segment':segment,
        'true':y[X.segment==segment].values.flatten(),
        'pred':np.mean([model.predict(train_dataset) for model in kfold_models],axis=0),
        #'pred':np.sum([weight*model.predict(train_dataset) for weight,model in zip(kfold_weights,kfold_models)],axis=0),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'segment':segment,
        'pred':np.mean([model.predict(test_dataset) for model in kfold_models],axis=0),
        #'pred':np.sum([weight*model.predict(test_dataset) for weight,model in zip(kfold_weights,kfold_models)],axis=0),
    })
    te_pred_df.index = test_data.index
    
    ## Target Transformation
    if CFG.TARGET_TRANSFORMATION:
        tr_pred_df['true'] = np.exp(tr_pred_df['true'])
        tr_pred_df['pred'] = np.exp(tr_pred_df['pred'])
        te_pred_df['pred'] = np.exp(te_pred_df['pred'])
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

In [None]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

In [None]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

In [None]:
# The output folder is not created anywhere else in the notebook, so make sure
# it exists before writing the submission file.
mkdir('./out')

submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
submit.to_csv('./out/10_catboost_segment_브랜드_kfold_logy.csv',index=False)

<br>

## Weighted Ensemble
- public score : 

In [44]:
import pandas as pd
import warnings

class OneHotEncoder:
    """Drop-first one-hot encoder for the given columns.

    For every fitted column the smallest level (in sorted order) is the
    reference level and gets no indicator; every other level becomes a 0/1
    column named '<column>_<level>'.
    """

    def __init__(self):
        pass

    def fit(self,data,columns):
        """Learn the levels of each column in `columns` from `data`."""
        # Remember the encoded columns so transform can drop them even when a
        # column has a single level and therefore yields no indicator at all.
        self.columns = list(columns)
        self.transform_list = []
        for col in self.columns:
            levels = sorted(data[col].unique())
            # Skip the first level: it is the reference category.
            self.transform_list.extend([col,value] for value in levels[1:])

    def transform(self,data):
        """Return a copy of `data` with indicators added and source columns removed."""
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        new_data = data.copy()
        for col,value in self.transform_list:
            new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)

        drop_columns = getattr(self,'columns',None)
        if drop_columns is None:
            # Object fitted before `columns` was stored: recover the names from
            # the learned (column, level) pairs, preserving order.
            drop_columns = list(dict.fromkeys(col for col,_ in self.transform_list))
        return new_data.drop(columns=drop_columns)

In [45]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time
import pickle

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    """Weighted average of five regressors fitted on the same train/validation split.

    The members are ElasticNetCV, CatBoost, XGBoost, LightGBM and ExtraTrees.
    ElasticNet, XGBoost and ExtraTrees are fed the one-hot encoded frame;
    CatBoost and LightGBM are fed the raw frame with categorical columns.

    Parameters
    ----------
    weight : {'equal', 'balanced'}, default='balanced'
        'equal' gives every member the same weight. 'balanced' weights each
        member by ``1 / validation MAE``. Weights are normalised to sum to 1.
    target_transformation : bool, default=False
        When True the target passed to ``fit`` is treated as ``log(target)``:
        scores are computed after ``exp`` and ``predict`` returns values on the
        original scale. When False nothing is back-transformed.
    """
    def __init__(self,weight='balanced',target_transformation=False):
        # The previous default was the list ['equal','balanced'], which could
        # never pass the assertion below; 'balanced' is a usable default.
        super().__init__()

        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        assert isinstance(target_transformation,bool), \
            "target_transformation must be bool type"

        self.weight = weight
        self.target_transformation = target_transformation
        self._get_regressors()

    def _get_regressors(self):
        """Instantiate the member regressors and their display names.

        Hyper-parameters are read from the notebook-level ``CFG`` class.
        """
        max_depth = 10
        n_jobs = -1

        params_elasticnet = {
            'l1_ratio' : np.arange(0.1, 1, 0.1),
            'alphas' : [1e-5, 1e-3, 1e-1, 0.0, 1.0, 10.0, 100.0],
            'cv' : RepeatedKFold(n_splits=CFG.N_SPLITS, n_repeats=3, random_state=CFG.SEED),
            'n_jobs' : n_jobs,
            #'max_iter' : 50000,
            'tol' : 0.001,
        }

        params_catboost = {
            'random_state' : CFG.SEED,
            'iterations' : CFG.EPOCHS,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'loss_function' : 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth' : max_depth,
            #'l2_leaf_reg' : 1,
        }

        params_xgboost = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XGB_EPOCHS,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'objective' : 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        params_lightgbm = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.EPOCHS,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'objective' : 'regression',
            'metric' : 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        params_extratrees = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XTRATREES_EPOCHS,
            'criterion' : 'absolute_error',
            'verbose' : 0,
            'max_depth' : max_depth,
            'n_jobs' : n_jobs,
        }

        # Order matters: self.weights is aligned with this list by position.
        self.regressors = [
            ElasticNetCV(**params_elasticnet),
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lightgbm),
            ExtraTreesRegressor(**params_extratrees),
        ]
        self.regressors_name = ['ElasticNet','CatBoost','XGBoost','LightGBM','ExtraTrees']

    def _adjust_prediction(self,pred):
        """Back-transform log-scale predictions and repair out-of-range values.

        Non-positive log predictions are replaced by the smallest target seen
        in ``fit`` and overflowed (``inf``) values by the largest one. Both
        substitutions are made on the original scale; previously the
        original-scale minimum was inserted in log space and then
        exponentiated. If the bounds are unknown (e.g. a checkpoint written
        before they were persisted) the corresponding values are left as is.

        Returns a flat float ndarray on the original target scale.
        """
        log_pred = np.asarray(pred,dtype=float).flatten()
        # Overflow to inf is handled explicitly below, so silence the warning.
        with np.errstate(over='ignore'):
            adjusted = np.exp(log_pred)

        lower = getattr(self,'minimum_value',None)
        upper = getattr(self,'maximum_value',None)
        if lower is not None:
            adjusted[log_pred<=0] = lower
        if upper is not None:
            adjusted[np.isinf(adjusted)] = upper
        return adjusted

    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        """Fit every member on (X, y) and derive the ensemble weights.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features with raw categorical columns.
        y : array-like
            Training target (log-scale when ``target_transformation`` is True).
        eval_set : list
            Exactly one ``(X_val, y_val)`` tuple used for early stopping and
            for the validation score that drives the 'balanced' weights.
        oh_set : list
            Exactly one ``(X_oh, X_val_oh)`` tuple: the one-hot encoded
            counterparts of ``X`` and ``X_val``.
        cat_features : list
            Categorical column names in ``X`` / ``X_val``.
        verbose : int, default=1
            Truthy values wrap the member loop in a ``tqdm`` progress bar.
            The per-member and total score lines are printed regardless.

        Returns
        -------
        self
        """
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]

        self.cat_features = cat_features
        self.weights = []
        self.fitting_elapsed = []

        # Ground truth on the scale used for scoring. It does not depend on the
        # member being fitted, so compute it (and the clipping bounds) once.
        if self.target_transformation:
            tr_true = np.exp(np.array(y)    .flatten())
            va_true = np.exp(np.array(y_val).flatten())
            self.minimum_value = min(np.nanmin(tr_true),np.nanmin(va_true))
            self.maximum_value = max(np.nanmax(tr_true),np.nanmax(va_true))
        else:
            tr_true = np.array(y).flatten()
            va_true = np.array(y_val).flatten()

        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)

        for fit_iter,(regressor_name,regressor) in enumerate(pbar,start=1):
            s = time.time()

            if verbose:
                # Was `name`, which is undefined and raised NameError.
                pbar.set_description(regressor_name)

            if regressor_name=='ElasticNet':
                warnings.filterwarnings("ignore", category=UserWarning)
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                tr_pred = regressor.predict(train_dataset)
                va_pred = regressor.predict(val_dataset)
            elif regressor_name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='LightGBM':
                warnings.filterwarnings("ignore", category=UserWarning)
                # Work on copies so the caller's frames keep their dtypes.
                X_tmp = X.copy()
                X_val_tmp = X_val.copy()
                for col in cat_features:
                    X_tmp[col]     = X_tmp[col]    .astype('category')
                    X_val_tmp[col] = X_val_tmp[col].astype('category')
                # NOTE(review): LightGBM >= 4.0 appears to have removed the
                # `verbose` argument of fit() -- confirm the installed version.
                regressor.fit(
                    X_tmp,y,
                    eval_set=[(X_val_tmp,y_val)],
                    verbose=-1,
                )
                tr_pred = regressor.predict(X_tmp)
                va_pred = regressor.predict(X_val_tmp)
            elif regressor_name=='ExtraTrees':
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))

            if self.target_transformation:
                tr_pred = self._adjust_prediction(tr_pred)
                va_pred = self._adjust_prediction(va_pred)
            else:
                tr_pred = np.array(tr_pred).flatten()
                va_pred = np.array(va_pred).flatten()
            tr_score = mean_absolute_error(y_pred=tr_pred,y_true=tr_true)
            va_score = mean_absolute_error(y_pred=va_pred,y_true=va_true)
            e = time.time()
            # 'balanced' weighting: lower validation MAE -> larger weight.
            self.weights.append(1/va_score)
            self.fitting_elapsed.append(e-s)

            blank = ' '*(11-len(regressor_name))
            fit_progress = '  [{}/{}] {}{}: score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
                .format(fit_iter,len(self.regressors),regressor_name,blank,tr_score,va_score,e-s)
            print(fit_progress)

        if self.weight=='equal':
            self.weights = np.array([1.0 for _ in self.regressors])
        else:
            # Convert explicitly: dividing a Python list in place only works
            # when the metric happens to return a NumPy scalar.
            self.weights = np.asarray(self.weights,dtype=float)
        self.weights = self.weights/self.weights.sum()

        # Ensemble score on the same scale as the per-member scores above.
        tr_pred = self.predict(X,X_oh)
        va_pred = self.predict(X_val,X_val_oh)
        ens_tr_score = mean_absolute_error(y_true=tr_true,y_pred=tr_pred)
        ens_va_score = mean_absolute_error(y_true=va_true,y_pred=va_pred)

        total_fit_progress = '  Total({}): score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
            .format(self.weight,ens_tr_score,ens_va_score,sum(self.fitting_elapsed))
        print(total_fit_progress)

        return self

    def predict(self,X,X_oh):
        """Return the weighted average of the member predictions.

        Parameters
        ----------
        X : pandas.DataFrame
            Features with raw categorical columns (CatBoost, LightGBM).
        X_oh : pandas.DataFrame
            One-hot encoded counterpart of ``X`` (ElasticNet, XGBoost,
            ExtraTrees). Must have the same number of rows as ``X``.

        Returns
        -------
        numpy.ndarray
            Flat array of predictions; on the original target scale when
            ``target_transformation`` is True, otherwise on the scale of ``y``.
        """
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"

        pred_list = []
        for regressor_name,regressor in zip(self.regressors_name,self.regressors):
            if regressor_name in ['ElasticNet','XGBoost','ExtraTrees']:
                dataset = X_oh.copy()
            elif regressor_name=='CatBoost':
                dataset = Pool(X,cat_features=self.cat_features)
            elif regressor_name=='LightGBM':
                dataset = X.copy()
                for col in self.cat_features:
                    dataset[col] = dataset[col].astype('category')
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))

            y_pred = regressor.predict(dataset)
            # Only undo the log transform when one was applied to the target.
            if self.target_transformation:
                y_pred = self._adjust_prediction(y_pred)
            else:
                y_pred = np.array(y_pred).flatten()

            pred_list.append(y_pred)

        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += np.array(pred)*weight

        return final_pred

    def save_model(self,path):
        """Pickle the fitted state (members, weights, bounds) to ``path``.

        Missing parent directories are created so that a long fit is not lost
        to a FileNotFoundError at save time.
        """
        import os

        save_dict = {
            'cat_features' : self.cat_features,
            'weights' : self.weights,
            'target_transformation' : self.target_transformation,
            'fitting_elapsed' : self.fitting_elapsed,
            'regressors' : self.regressors,
            'regressors_name' : self.regressors_name,
            # Needed by _adjust_prediction after a reload; absent when the
            # model was fitted without target transformation.
            'minimum_value' : getattr(self,'minimum_value',None),
            'maximum_value' : getattr(self,'maximum_value',None),
        }
        parent_dir = os.path.dirname(path)
        if parent_dir:
            os.makedirs(parent_dir,exist_ok=True)
        with open(path, 'wb') as f:
            pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self,path):
        """Restore the state written by ``save_model`` into this instance.

        Checkpoints written before the bounds and member names were persisted
        are still accepted; the missing entries fall back to ``None`` and to
        the names set in ``__init__`` respectively.
        """
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # load checkpoints produced by this notebook.
        with open(path, 'rb') as f:
            save_dict = pickle.load(f)
        self.cat_features = save_dict['cat_features']
        self.weights = save_dict['weights']
        # A trailing comma used to turn this into a 1-tuple.
        self.target_transformation = save_dict['target_transformation']
        self.fitting_elapsed = save_dict['fitting_elapsed']
        self.regressors = save_dict['regressors']
        self.regressors_name = save_dict.get('regressors_name',self.regressors_name)
        self.minimum_value = save_dict.get('minimum_value')
        self.maximum_value = save_dict.get('maximum_value')



In [46]:
from sklearn.model_selection import KFold

In [47]:
# Force a garbage-collection pass before the long training cell below;
# the cell output is the value returned by gc.collect().
gc.collect()

0

In [48]:
# Work on copies so the training cell below cannot modify train_df7 / test_df7.
train_fn, test_fn = train_df7.copy(), test_df7.copy()

In [49]:
%%time
# Takes about 6 hours to run

# Per-segment K-fold training of the weighted ensemble.
# Outputs: `models` (segment -> list of fitted ensembles, one per fold),
# `scores` (one list of [segment, fold, n_train, n_val, MAE] rows per segment)
# and `feature_info` (segment -> columns actually used for that segment).
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]
y = pd.Series(y.values.flatten())   # NOTE: index is reset to 0..n-1 here

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

for s_i,segment in enumerate(pbar,start=1):

    # Extract the rows belonging to this segment. A positional (ndarray) mask is
    # used because `y` was re-indexed above: an index-aligned boolean Series
    # would fail or misalign whenever train_fn does not have a default index.
    seg_mask = (X['segment']==segment).to_numpy()
    _X    = X   [seg_mask].drop('segment',axis=1)
    _X_oh = X_oh[seg_mask].drop('segment',axis=1)
    _y    = y   [seg_mask]

    # kfold (a different but reproducible shuffle per segment)
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)

    # Drop columns that are constant within the segment
    # (1) X
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
    # (2) X_oh
    unique_info = _X_oh.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X_oh = _X_oh.drop(unique_cols,axis=1)

    # Keep only the categorical features that survived the constant-column drop
    fixed_cat_features = [col for col in cat_features if col in _X.columns]

    # progress
    progress = 'Segment: {}, Length: {}'\
        .format(segment,len(_X))
    pbar.set_description(progress)

    _models = []
    _scores = []
    for k,(tr_idx,val_idx) in enumerate(kf.split(_X,_y),start=1):
        print('> [K-Fold] {}/{}'.format(k,CFG.N_SPLITS))

        # kfold dataset
        X_tr   , X_va    = _X   .iloc[tr_idx], _X   .iloc[val_idx]
        X_tr_oh, X_va_oh = _X_oh.iloc[tr_idx], _X_oh.iloc[val_idx]
        y_tr   , y_va    = _y   .iloc[tr_idx], _y   .iloc[val_idx]

        # define the model
        ensemble_model = WeightedEnsembleRegressor(
            weight='balanced',
            target_transformation=CFG.TARGET_TRANSFORMATION,
        )

        # fit the model
        ensemble_model.fit(
            X_tr,y_tr,
            eval_set=[(X_va,y_va)],
            oh_set=[(X_tr_oh,X_va_oh)],
            cat_features=fixed_cat_features,
            verbose=0,
        )

        # save the model
        ensemble_model.save_model(f'./model_checkpoints/segment_weightedensemble/{segment}_k{k}.pickle')

        # prediction
        y_pred = ensemble_model.predict(X_va,X_va_oh).flatten()
        y_true = y_va.values
        if CFG.TARGET_TRANSFORMATION:
            # predict() returns original-scale values while y_va is still
            # log-transformed; bring the truth back to the same scale.
            y_true = np.exp(y_true)

        # calculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)

        # append inner loop
        _models.append(ensemble_model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {
        'cat_features':fixed_cat_features,
        'features':_X.columns.tolist(),
        'oh_features':_X_oh.columns.tolist(),
    }

Segment: skoda, Length: 3130:   0%|          | 0/20 [00:01<?, ?it/s]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.525, val_score=7.005, elasped=14.9s
  [2/5] CatBoost   : score=4.599, val_score=6.463, elasped=90.9s
  [3/5] XGBoost    : score=1.947, val_score=7.500, elasped=16.4s
  [4/5] LightGBM   : score=5.303, val_score=6.917, elasped=23.7s
  [5/5] ExtraTrees : score=5.506, val_score=7.616, elasped=46.4s
  Total(balanced): score=4.505, val_score=6.604, elasped=192.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.528, val_score=6.876, elasped=15.0s
  [2/5] CatBoost   : score=4.774, val_score=6.194, elasped=79.7s
  [3/5] XGBoost    : score=1.838, val_score=6.789, elasped=14.4s
  [4/5] LightGBM   : score=5.420, val_score=6.503, elasped=27.7s
  [5/5] ExtraTrees : score=5.568, val_score=7.158, elasped=44.4s
  Total(balanced): score=4.523, val_score=6.213, elasped=181.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.586, val_score=6.675, elasped=16.0s
  [2/5] CatBoost   : score=4.739, val_score=5.987, elasped=75.8s
  [3/5] XGBoost    : score=1.372, val_score=7.2

Segment: toyota, Length: 3259:   5%|▌         | 1/20 [15:12<4:48:24, 910.75s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.646, val_score=6.306, elasped=21.0s
  [2/5] CatBoost   : score=3.600, val_score=5.534, elasped=88.6s
  [3/5] XGBoost    : score=0.931, val_score=6.337, elasped=17.1s
  [4/5] LightGBM   : score=4.390, val_score=5.987, elasped=30.9s
  [5/5] ExtraTrees : score=4.196, val_score=6.238, elasped=53.5s
  Total(balanced): score=3.402, val_score=5.646, elasped=211.1s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.453, val_score=6.253, elasped=19.6s
  [2/5] CatBoost   : score=3.694, val_score=5.346, elasped=74.5s
  [3/5] XGBoost    : score=0.837, val_score=5.896, elasped=16.8s
  [4/5] LightGBM   : score=3.599, val_score=5.546, elasped=50.8s
  [5/5] ExtraTrees : score=4.167, val_score=6.236, elasped=53.8s
  Total(balanced): score=3.178, val_score=5.365, elasped=215.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.611, val_score=5.796, elasped=19.4s
  [2/5] CatBoost   : score=3.836, val_score=5.062, elasped=62.1s
  [3/5] XGBoost    : score=1.299, val_score=5.6

Segment: mercedes-benz, Length: 2899:  10%|█         | 2/20 [35:41<5:29:23, 1097.96s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=9.001, val_score=8.997, elasped=31.1s
  [2/5] CatBoost   : score=5.855, val_score=7.779, elasped=64.6s
  [3/5] XGBoost    : score=1.409, val_score=8.587, elasped=40.4s
  [4/5] LightGBM   : score=6.467, val_score=8.040, elasped=144.4s
  [5/5] ExtraTrees : score=5.825, val_score=9.190, elasped=48.8s
  Total(balanced): score=5.257, val_score=7.746, elasped=329.2s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.911, val_score=9.948, elasped=29.5s
  [2/5] CatBoost   : score=5.751, val_score=8.184, elasped=64.3s
  [3/5] XGBoost    : score=1.084, val_score=9.527, elasped=21.9s
  [4/5] LightGBM   : score=6.214, val_score=8.423, elasped=55.4s
  [5/5] ExtraTrees : score=5.739, val_score=9.449, elasped=49.0s
  Total(balanced): score=5.104, val_score=8.266, elasped=220.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.591, val_score=10.241, elasped=24.9s
  [2/5] CatBoost   : score=5.501, val_score=8.339, elasped=72.3s
  [3/5] XGBoost    : score=1.470, val_score=9

Segment: nissan, Length: 2129:  15%|█▌        | 3/20 [54:11<5:12:45, 1103.85s/it]       

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.903, val_score=5.564, elasped=12.5s
  [2/5] CatBoost   : score=3.768, val_score=5.122, elasped=42.6s
  [3/5] XGBoost    : score=0.466, val_score=6.067, elasped=9.9s
  [4/5] LightGBM   : score=3.858, val_score=5.775, elasped=28.2s
  [5/5] ExtraTrees : score=3.758, val_score=6.065, elasped=24.2s
  Total(balanced): score=3.128, val_score=5.325, elasped=117.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.152, val_score=5.286, elasped=12.5s
  [2/5] CatBoost   : score=4.344, val_score=5.227, elasped=22.6s
  [3/5] XGBoost    : score=0.820, val_score=5.869, elasped=8.7s
  [4/5] LightGBM   : score=4.359, val_score=5.422, elasped=22.8s
  [5/5] ExtraTrees : score=3.756, val_score=5.645, elasped=24.1s
  Total(balanced): score=3.485, val_score=5.076, elasped=90.7s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.759, val_score=5.324, elasped=12.8s
  [2/5] CatBoost   : score=3.817, val_score=5.061, elasped=55.6s
  [3/5] XGBoost    : score=0.608, val_score=5.751,

Segment: fiat, Length: 1164:  20%|██        | 4/20 [1:04:17<4:01:57, 907.32s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.243, val_score=5.969, elasped=5.9s
  [2/5] CatBoost   : score=3.581, val_score=5.378, elasped=48.1s
  [3/5] XGBoost    : score=1.077, val_score=5.557, elasped=7.1s
  [4/5] LightGBM   : score=3.525, val_score=5.073, elasped=34.3s
  [5/5] ExtraTrees : score=3.417, val_score=5.434, elasped=7.8s
  Total(balanced): score=3.096, val_score=5.103, elasped=103.1s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.357, val_score=5.603, elasped=5.9s
  [2/5] CatBoost   : score=4.044, val_score=5.114, elasped=23.3s
  [3/5] XGBoost    : score=1.198, val_score=5.841, elasped=6.7s
  [4/5] LightGBM   : score=3.840, val_score=5.364, elasped=27.2s
  [5/5] ExtraTrees : score=3.302, val_score=5.629, elasped=6.6s
  Total(balanced): score=3.309, val_score=5.088, elasped=69.7s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.669, val_score=6.972, elasped=5.9s
  [2/5] CatBoost   : score=3.797, val_score=6.030, elasped=26.8s
  [3/5] XGBoost    : score=0.974, val_score=6.232, elas

Segment: audi, Length: 5597:  25%|██▌       | 5/20 [1:11:28<3:03:50, 735.34s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.597, val_score=6.748, elasped=59.5s
  [2/5] CatBoost   : score=5.104, val_score=6.258, elasped=86.2s
  [3/5] XGBoost    : score=2.487, val_score=6.833, elasped=28.1s
  [4/5] LightGBM   : score=5.372, val_score=6.530, elasped=23.4s
  [5/5] ExtraTrees : score=5.831, val_score=7.227, elasped=228.0s
  Total(balanced): score=4.806, val_score=6.314, elasped=425.1s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.575, val_score=6.878, elasped=55.2s
  [2/5] CatBoost   : score=5.173, val_score=6.350, elasped=71.0s
  [3/5] XGBoost    : score=2.393, val_score=7.006, elasped=31.9s
  [4/5] LightGBM   : score=5.555, val_score=6.618, elasped=20.0s
  [5/5] ExtraTrees : score=5.876, val_score=7.471, elasped=228.9s
  Total(balanced): score=4.844, val_score=6.508, elasped=407.0s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.618, val_score=6.806, elasped=107.0s
  [2/5] CatBoost   : score=4.862, val_score=6.298, elasped=132.8s
  [3/5] XGBoost    : score=2.143, val_score

Segment: renault, Length: 3853:  30%|███       | 6/20 [1:50:21<4:58:18, 1278.44s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.168, val_score=5.371, elasped=25.5s
  [2/5] CatBoost   : score=3.638, val_score=4.968, elasped=68.6s
  [3/5] XGBoost    : score=1.486, val_score=5.472, elasped=14.7s
  [4/5] LightGBM   : score=3.905, val_score=5.080, elasped=27.3s
  [5/5] ExtraTrees : score=4.224, val_score=5.627, elasped=79.6s
  Total(balanced): score=3.421, val_score=4.947, elasped=215.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.120, val_score=5.664, elasped=25.3s
  [2/5] CatBoost   : score=3.619, val_score=4.838, elasped=74.0s
  [3/5] XGBoost    : score=1.415, val_score=5.324, elasped=16.4s
  [4/5] LightGBM   : score=3.694, val_score=4.918, elasped=34.0s
  [5/5] ExtraTrees : score=4.355, val_score=6.017, elasped=80.3s
  Total(balanced): score=3.338, val_score=4.858, elasped=230.0s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.210, val_score=5.340, elasped=25.0s
  [2/5] CatBoost   : score=3.608, val_score=4.478, elasped=84.2s
  [3/5] XGBoost    : score=1.350, val_score=5.1

Segment: volkswagen, Length: 5693:  35%|███▌      | 7/20 [2:10:26<4:31:48, 1254.46s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.622, val_score=6.926, elasped=54.7s
  [2/5] CatBoost   : score=5.091, val_score=6.172, elasped=67.8s
  [3/5] XGBoost    : score=2.466, val_score=6.936, elasped=30.1s
  [4/5] LightGBM   : score=5.199, val_score=6.194, elasped=36.1s
  [5/5] ExtraTrees : score=5.632, val_score=7.243, elasped=215.8s
  Total(balanced): score=4.741, val_score=6.279, elasped=404.5s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.609, val_score=6.916, elasped=53.1s
  [2/5] CatBoost   : score=5.107, val_score=6.375, elasped=63.3s
  [3/5] XGBoost    : score=2.333, val_score=6.906, elasped=36.3s
  [4/5] LightGBM   : score=5.067, val_score=6.431, elasped=38.1s
  [5/5] ExtraTrees : score=5.569, val_score=7.202, elasped=207.0s
  Total(balanced): score=4.654, val_score=6.348, elasped=397.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.677, val_score=6.702, elasped=53.2s
  [2/5] CatBoost   : score=5.148, val_score=5.907, elasped=66.9s
  [3/5] XGBoost    : score=2.237, val_score=6

Segment: citroen, Length: 1129:  40%|████      | 8/20 [2:45:01<5:03:09, 1515.83s/it]   

> [K-Fold] 1/5
  [1/5] ElasticNet : score=3.603, val_score=3.925, elasped=5.7s
  [2/5] CatBoost   : score=2.781, val_score=3.836, elasped=36.4s
  [3/5] XGBoost    : score=0.603, val_score=4.650, elasped=4.6s
  [4/5] LightGBM   : score=3.369, val_score=3.933, elasped=14.9s
  [5/5] ExtraTrees : score=2.748, val_score=4.854, elasped=9.3s
  Total(balanced): score=2.510, val_score=3.852, elasped=70.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=3.728, val_score=3.860, elasped=6.3s
  [2/5] CatBoost   : score=2.777, val_score=4.171, elasped=30.1s
  [3/5] XGBoost    : score=0.650, val_score=4.427, elasped=4.0s
  [4/5] LightGBM   : score=3.006, val_score=4.594, elasped=15.3s
  [5/5] ExtraTrees : score=2.947, val_score=4.917, elasped=8.6s
  Total(balanced): score=2.467, val_score=4.118, elasped=64.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=3.411, val_score=4.439, elasped=5.9s
  [2/5] CatBoost   : score=2.626, val_score=4.313, elasped=41.1s
  [3/5] XGBoost    : score=0.574, val_score=5.034, elasp

Segment: bmw, Length: 5262:  45%|████▌     | 9/20 [2:51:31<3:33:22, 1163.88s/it]    

> [K-Fold] 1/5
  [1/5] ElasticNet : score=8.464, val_score=8.989, elasped=55.1s
  [2/5] CatBoost   : score=6.394, val_score=7.886, elasped=53.6s
  [3/5] XGBoost    : score=3.004, val_score=8.347, elasped=24.0s
  [4/5] LightGBM   : score=6.621, val_score=7.925, elasped=23.4s
  [5/5] ExtraTrees : score=6.767, val_score=8.692, elasped=211.7s
  Total(balanced): score=5.849, val_score=7.846, elasped=367.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.386, val_score=9.046, elasped=51.1s
  [2/5] CatBoost   : score=6.233, val_score=7.815, elasped=61.0s
  [3/5] XGBoost    : score=2.716, val_score=8.092, elasped=30.8s
  [4/5] LightGBM   : score=6.665, val_score=8.138, elasped=21.6s
  [5/5] ExtraTrees : score=6.664, val_score=8.608, elasped=216.7s
  Total(balanced): score=5.705, val_score=7.895, elasped=381.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.527, val_score=8.731, elasped=51.0s
  [2/5] CatBoost   : score=5.898, val_score=7.441, elasped=121.0s
  [3/5] XGBoost    : score=2.971, val_score=

Segment: opel, Length: 6651:  50%|█████     | 10/20 [3:25:59<4:00:29, 1442.95s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.210, val_score=4.100, elasped=54.7s
  [2/5] CatBoost   : score=3.111, val_score=3.788, elasped=135.2s
  [3/5] XGBoost    : score=1.722, val_score=4.207, elasped=33.9s
  [4/5] LightGBM   : score=3.071, val_score=3.943, elasped=50.4s
  [5/5] ExtraTrees : score=3.862, val_score=4.622, elasped=308.2s
  Total(balanced): score=2.998, val_score=3.868, elasped=582.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.202, val_score=4.162, elasped=54.7s
  [2/5] CatBoost   : score=3.252, val_score=3.860, elasped=82.9s
  [3/5] XGBoost    : score=1.506, val_score=4.430, elasped=37.4s
  [4/5] LightGBM   : score=3.190, val_score=3.948, elasped=42.3s
  [5/5] ExtraTrees : score=3.841, val_score=4.738, elasped=305.8s
  Total(balanced): score=3.016, val_score=3.950, elasped=523.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.162, val_score=4.395, elasped=53.9s
  [2/5] CatBoost   : score=3.208, val_score=3.996, elasped=94.6s
  [3/5] XGBoost    : score=1.579, val_score=

Segment: ford, Length: 5819:  55%|█████▌    | 11/20 [4:13:33<4:41:11, 1874.66s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.418, val_score=6.029, elasped=45.4s
  [2/5] CatBoost   : score=3.976, val_score=5.408, elasped=98.7s
  [3/5] XGBoost    : score=2.111, val_score=5.745, elasped=29.2s
  [4/5] LightGBM   : score=4.124, val_score=5.476, elasped=39.1s
  [5/5] ExtraTrees : score=4.951, val_score=6.451, elasped=232.6s
  Total(balanced): score=3.833, val_score=5.460, elasped=445.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.533, val_score=5.372, elasped=45.0s
  [2/5] CatBoost   : score=4.157, val_score=5.104, elasped=82.0s
  [3/5] XGBoost    : score=2.102, val_score=5.290, elasped=28.1s
  [4/5] LightGBM   : score=4.231, val_score=5.243, elasped=26.5s
  [5/5] ExtraTrees : score=5.010, val_score=6.222, elasped=224.5s
  Total(balanced): score=3.919, val_score=5.030, elasped=406.0s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.383, val_score=6.029, elasped=43.0s
  [2/5] CatBoost   : score=4.029, val_score=5.448, elasped=105.2s
  [3/5] XGBoost    : score=1.934, val_score=

Segment: mazda, Length: 1572:  60%|██████    | 12/20 [4:48:30<4:18:58, 1942.28s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.752, val_score=6.395, elasped=8.5s
  [2/5] CatBoost   : score=3.924, val_score=6.135, elasped=44.7s
  [3/5] XGBoost    : score=1.279, val_score=7.103, elasped=5.1s
  [4/5] LightGBM   : score=4.680, val_score=6.480, elasped=17.5s
  [5/5] ExtraTrees : score=3.819, val_score=7.651, elasped=10.9s
  Total(balanced): score=3.736, val_score=6.390, elasped=86.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.593, val_score=6.419, elasped=8.0s
  [2/5] CatBoost   : score=4.551, val_score=5.628, elasped=22.8s
  [3/5] XGBoost    : score=1.320, val_score=6.146, elasped=5.1s
  [4/5] LightGBM   : score=4.396, val_score=5.975, elasped=18.2s
  [5/5] ExtraTrees : score=4.116, val_score=6.578, elasped=11.8s
  Total(balanced): score=3.779, val_score=5.747, elasped=65.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.399, val_score=6.376, elasped=8.1s
  [2/5] CatBoost   : score=4.278, val_score=6.176, elasped=31.0s
  [3/5] XGBoost    : score=1.233, val_score=6.555, ela

Segment: honda, Length: 1545:  65%|██████▌   | 13/20 [4:55:19<2:52:25, 1477.89s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.645, val_score=6.008, elasped=8.8s
  [2/5] CatBoost   : score=3.549, val_score=5.295, elasped=49.9s
  [3/5] XGBoost    : score=1.061, val_score=6.366, elasped=5.1s
  [4/5] LightGBM   : score=3.444, val_score=5.513, elasped=24.6s
  [5/5] ExtraTrees : score=3.642, val_score=6.338, elasped=12.3s
  Total(balanced): score=3.156, val_score=5.365, elasped=100.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.620, val_score=5.941, elasped=8.0s
  [2/5] CatBoost   : score=3.542, val_score=5.108, elasped=49.1s
  [3/5] XGBoost    : score=0.981, val_score=5.972, elasped=5.5s
  [4/5] LightGBM   : score=3.653, val_score=5.173, elasped=25.6s
  [5/5] ExtraTrees : score=3.598, val_score=6.672, elasped=12.2s
  Total(balanced): score=3.160, val_score=5.199, elasped=100.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.529, val_score=7.172, elasped=9.5s
  [2/5] CatBoost   : score=3.509, val_score=5.776, elasped=53.8s
  [3/5] XGBoost    : score=1.129, val_score=6.890, e

Segment: kia, Length: 2034:  70%|███████   | 14/20 [5:03:32<1:58:03, 1180.53s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.828, val_score=5.519, elasped=11.4s
  [2/5] CatBoost   : score=4.658, val_score=5.334, elasped=33.6s
  [3/5] XGBoost    : score=1.058, val_score=5.904, elasped=7.8s
  [4/5] LightGBM   : score=4.802, val_score=5.590, elasped=19.9s
  [5/5] ExtraTrees : score=4.176, val_score=5.989, elasped=22.0s
  Total(balanced): score=3.888, val_score=5.247, elasped=94.7s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.812, val_score=5.628, elasped=10.8s
  [2/5] CatBoost   : score=4.569, val_score=5.460, elasped=32.6s
  [3/5] XGBoost    : score=0.952, val_score=5.839, elasped=8.7s
  [4/5] LightGBM   : score=4.946, val_score=5.777, elasped=17.6s
  [5/5] ExtraTrees : score=4.149, val_score=5.670, elasped=21.6s
  Total(balanced): score=3.841, val_score=5.249, elasped=91.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.759, val_score=6.095, elasped=10.5s
  [2/5] CatBoost   : score=4.328, val_score=5.712, elasped=45.5s
  [3/5] XGBoost    : score=0.749, val_score=6.193, 

Segment: seat, Length: 1628:  75%|███████▌  | 15/20 [5:12:50<1:22:44, 992.81s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.645, val_score=5.728, elasped=10.5s
  [2/5] CatBoost   : score=3.314, val_score=5.284, elasped=47.3s
  [3/5] XGBoost    : score=0.356, val_score=6.093, elasped=9.0s
  [4/5] LightGBM   : score=3.309, val_score=5.584, elasped=23.3s
  [5/5] ExtraTrees : score=3.449, val_score=6.615, elasped=16.8s
  Total(balanced): score=2.771, val_score=5.410, elasped=107.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.635, val_score=5.542, elasped=10.0s
  [2/5] CatBoost   : score=3.547, val_score=4.983, elasped=29.3s
  [3/5] XGBoost    : score=0.483, val_score=5.533, elasped=8.5s
  [4/5] LightGBM   : score=3.576, val_score=5.307, elasped=19.2s
  [5/5] ExtraTrees : score=3.696, val_score=5.940, elasped=16.1s
  Total(balanced): score=2.903, val_score=5.026, elasped=83.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.902, val_score=5.083, elasped=9.9s
  [2/5] CatBoost   : score=3.232, val_score=4.818, elasped=49.2s
  [3/5] XGBoost    : score=0.612, val_score=5.858, 

Segment: volvo, Length: 1352:  80%|████████  | 16/20 [5:22:03<57:21, 860.33s/it] 

> [K-Fold] 1/5
  [1/5] ElasticNet : score=7.945, val_score=8.559, elasped=8.2s
  [2/5] CatBoost   : score=6.004, val_score=8.560, elasped=25.8s
  [3/5] XGBoost    : score=0.538, val_score=9.668, elasped=8.0s
  [4/5] LightGBM   : score=5.323, val_score=8.967, elasped=20.9s
  [5/5] ExtraTrees : score=6.069, val_score=9.066, elasped=12.3s
  Total(balanced): score=4.894, val_score=8.392, elasped=75.2s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.072, val_score=9.080, elasped=8.1s
  [2/5] CatBoost   : score=5.609, val_score=8.292, elasped=38.4s
  [3/5] XGBoost    : score=0.436, val_score=10.233, elasped=9.0s
  [4/5] LightGBM   : score=5.144, val_score=8.570, elasped=23.3s
  [5/5] ExtraTrees : score=5.893, val_score=10.481, elasped=12.0s
  Total(balanced): score=4.766, val_score=8.568, elasped=90.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.207, val_score=8.238, elasped=8.3s
  [2/5] CatBoost   : score=5.779, val_score=7.313, elasped=35.0s
  [3/5] XGBoost    : score=0.381, val_score=8.690, e

Segment: peugeot, Length: 793:  85%|████████▌ | 17/20 [5:30:14<37:28, 749.38s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.120, val_score=5.589, elasped=3.8s
  [2/5] CatBoost   : score=3.845, val_score=5.434, elasped=25.2s
  [3/5] XGBoost    : score=1.923, val_score=6.975, elasped=3.4s
  [4/5] LightGBM   : score=4.670, val_score=5.951, elasped=10.4s
  [5/5] ExtraTrees : score=3.564, val_score=6.323, elasped=4.9s
  Total(balanced): score=3.612, val_score=5.607, elasped=47.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.242, val_score=5.187, elasped=3.7s
  [2/5] CatBoost   : score=4.102, val_score=5.485, elasped=25.7s
  [3/5] XGBoost    : score=0.322, val_score=6.030, elasped=4.0s
  [4/5] LightGBM   : score=4.138, val_score=5.268, elasped=15.0s
  [5/5] ExtraTrees : score=3.266, val_score=6.151, elasped=4.0s
  Total(balanced): score=3.244, val_score=5.095, elasped=52.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.756, val_score=6.300, elasped=3.7s
  [2/5] CatBoost   : score=3.358, val_score=6.366, elasped=43.7s
  [3/5] XGBoost    : score=0.685, val_score=6.998, elasp

Segment: hyundai, Length: 1855:  90%|█████████ | 18/20 [5:34:41<20:08, 604.42s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.148, val_score=6.143, elasped=8.1s
  [2/5] CatBoost   : score=4.097, val_score=5.505, elasped=55.4s
  [3/5] XGBoost    : score=0.648, val_score=6.029, elasped=9.1s
  [4/5] LightGBM   : score=4.158, val_score=5.857, elasped=23.2s
  [5/5] ExtraTrees : score=4.034, val_score=6.257, elasped=15.6s
  Total(balanced): score=3.515, val_score=5.474, elasped=111.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.130, val_score=6.146, elasped=8.4s
  [2/5] CatBoost   : score=4.077, val_score=5.349, elasped=49.4s
  [3/5] XGBoost    : score=1.239, val_score=6.128, elasped=5.6s
  [4/5] LightGBM   : score=4.778, val_score=5.561, elasped=18.0s
  [5/5] ExtraTrees : score=3.909, val_score=5.970, elasped=15.5s
  Total(balanced): score=3.741, val_score=5.408, elasped=96.9s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.130, val_score=6.325, elasped=8.0s
  [2/5] CatBoost   : score=4.118, val_score=5.593, elasped=45.4s
  [3/5] XGBoost    : score=1.273, val_score=6.079, el

Segment: mitsubishi, Length: 556:  95%|█████████▌| 19/20 [5:42:34<09:25, 565.07s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.151, val_score=6.399, elasped=3.0s
  [2/5] CatBoost   : score=4.265, val_score=6.467, elasped=27.0s
  [3/5] XGBoost    : score=0.675, val_score=7.590, elasped=4.3s
  [4/5] LightGBM   : score=5.407, val_score=6.827, elasped=7.6s
  [5/5] ExtraTrees : score=3.563, val_score=7.184, elasped=1.9s
  Total(balanced): score=3.449, val_score=6.113, elasped=43.8s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.194, val_score=6.164, elasped=3.1s
  [2/5] CatBoost   : score=3.896, val_score=6.015, elasped=30.5s
  [3/5] XGBoost    : score=2.120, val_score=7.642, elasped=3.1s
  [4/5] LightGBM   : score=4.963, val_score=6.923, elasped=9.2s
  [5/5] ExtraTrees : score=2.755, val_score=7.849, elasped=1.9s
  Total(balanced): score=3.510, val_score=6.157, elasped=47.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.350, val_score=6.297, elasped=2.9s
  [2/5] CatBoost   : score=4.809, val_score=6.288, elasped=16.8s
  [3/5] XGBoost    : score=2.079, val_score=6.856, elasped

Segment: mitsubishi, Length: 556: 100%|██████████| 20/20 [5:46:03<00:00, 1038.19s/it]

CPU times: user 1d 2h 28min 39s, sys: 2h 44min 14s, total: 1d 5h 12min 53s
Wall time: 5h 46min 17s





<br>

(2) ensemble regressor : 5.6351285819

In [50]:
# Base-model inference: for every segment, collect each fold model's
# prediction on the train and test rows (one `pred_<fold>` column per model).
y = train_fn[target_feature]
X = train_fn.drop(columns=target_feature)
X_test = test_fn.copy()

# The encoder is fitted on the training rows and then applied to both sets.
ohe = OneHotEncoder()
ohe.fit(X, cat_features)
X_oh, X_test_oh = ohe.transform(X), ohe.transform(X_test)

segment_list = X['segment'].unique()

tr_pred_list, te_pred_list = [], []
for segment in tqdm(segment_list):
    seg_info = feature_info[segment]
    kfold_models = models[segment]

    # Raw and one-hot views of this segment's rows, restricted to the
    # columns recorded for the segment in `feature_info`.
    train_data    = X.loc[X['segment'] == segment, seg_info['features']]
    train_data_oh = X_oh.loc[X_oh['segment'] == segment, seg_info['oh_features']]
    test_data     = X_test.loc[X_test['segment'] == segment, seg_info['features']]
    test_data_oh  = X_test_oh.loc[X_test_oh['segment'] == segment, seg_info['oh_features']]

    # Train: ground truth plus one prediction column per fold model.
    tr_pred_df = pd.DataFrame({'true': y[X['segment'] == segment].values.flatten()})
    for fold_no in range(1, len(kfold_models) + 1):
        tr_pred_df[f'pred_{fold_no}'] = kfold_models[fold_no - 1].predict(train_data, train_data_oh)
    tr_pred_df.index = train_data.index

    # Test: prediction columns only.
    te_pred_df = pd.DataFrame()
    for fold_no in range(1, len(kfold_models) + 1):
        te_pred_df[f'pred_{fold_no}'] = kfold_models[fold_no - 1].predict(test_data, test_data_oh)
    te_pred_df.index = test_data.index

    # Only the ground truth is exponentiated here; the `pred_*` columns are
    # stored exactly as the models returned them.
    if CFG.TARGET_TRANSFORMATION:
        tr_pred_df['true'] = np.exp(tr_pred_df['true'])

    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:59<00:00,  2.97s/it]


In [51]:
# Stack the per-segment frames (train and test) and sort by index so the rows
# are back in index order.
tr_pred_df, te_pred_df = (
    pd.concat(segment_frames, axis='index').sort_index()
    for segment_frames in (tr_pred_list, te_pred_list)
)

In [52]:
# Persist both base-model prediction frames as parquet files.
for pred_frame, parquet_path in (
    (tr_pred_df, './out/stacking/tr_pred_df_segment브랜드_ensemble_logy.parquet'),
    (te_pred_df, './out/stacking/te_pred_df_segment브랜드_ensemble_logy.parquet'),
):
    pred_frame.to_parquet(parquet_path)

<br>

Fitting

In [55]:
# Categorical feature list for the stacking stage: the existing categorical
# features plus the derived 'fuel_type' column.
new_cat_features = [*cat_features, 'fuel_type']

In [56]:
# Stacking inputs: feature frames with the fuel-type column added, then the
# base-model prediction columns appended.
y = train_fn[target_feature]
X = add_fuel_type(train_fn.drop(columns=target_feature), dummy_features)
X_test = add_fuel_type(test_fn, dummy_features)

segment_list = X['segment'].unique()

# Column-wise concat (aligned on the index). The 'true' column of the train
# prediction frame is dropped so only the prediction columns are added.
X      = pd.concat([X, tr_pred_df.drop(columns='true')], axis='columns')
X_test = pd.concat([X_test, te_pred_df], axis='columns')

In [57]:
# Train one CatBoost regressor per (segment, fold) on the stacking inputs and
# record the validation MAE of every fold.
stacking_models = {}
stacking_feature_info = {}
stacking_scores = []
pbar = tqdm(segment_list)

# s_i is the 1-based segment counter; it only feeds the KFold seed.
for s_i, segment in enumerate(pbar, start=1):

    # Extract the rows belonging to this segment.
    in_segment = X['segment'] == segment
    _X = X[in_segment].drop(columns='segment')
    _y = y[in_segment]

    # Each segment gets its own shuffle seed derived from its position.
    kf = KFold(n_splits=CFG.N_SPLITS, shuffle=True, random_state=1000*s_i+CFG.SEED)

    # Drop columns that hold a single distinct value within the segment.
    unique_info = _X.nunique()
    unique_cols = [col for col, n_distinct in unique_info.items() if n_distinct == 1]
    if unique_cols:
        _X = _X.drop(columns=unique_cols)

    # Keep only the categorical features that survived the drop above.
    fixed_cat_features = [col for col in new_cat_features if col in _X.columns]

    _models, _scores = [], []
    for k, (tr_idx, val_idx) in enumerate(kf.split(_X, _y), start=1):

        # Fold split.
        X_tr, y_tr = _X.iloc[tr_idx], _y.iloc[tr_idx]
        X_va, y_va = _X.iloc[val_idx], _y.iloc[val_idx]

        # Show the current segment / fold on the progress bar.
        pbar.set_description(
            'Segment: [{}], Size: [{:,}], KFold: [{}/{}]'.format(segment, len(_X), k, CFG.N_SPLITS)
        )

        # CatBoost datasets.
        train_dataset = Pool(X_tr, y_tr, cat_features=fixed_cat_features)
        valid_dataset = Pool(X_va, y_va, cat_features=fixed_cat_features)

        # Model definition and fit, early-stopped on the validation fold.
        model = CatBoostRegressor(
            loss_function='MAE',
            random_state=CFG.SEED,
            iterations=CFG.EPOCHS,
            learning_rate=CFG.LR,
            allow_writing_files=False,
        )
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=CFG.ES,
            verbose=0,
        )

        # Checkpoint the fitted model.
        model.save_model(f'./model_checkpoints/segment_catboost/{segment}_k{k}.cbm')

        # Validation predictions; both sides are exponentiated before scoring
        # when the target transformation flag is on.
        y_pred = model.predict(valid_dataset).flatten()
        y_true = y_va.values
        if CFG.TARGET_TRANSFORMATION:
            y_pred, y_true = np.exp(y_pred), np.exp(y_true)

        score = mean_absolute_error(y_true=y_true, y_pred=y_pred)

        _models.append(model)
        _scores.append([segment, k, len(X_tr), len(X_va), score])

    # Per-segment bookkeeping.
    stacking_models[segment] = _models
    stacking_scores.append(_scores)
    stacking_feature_info[segment] = {'cat_features': fixed_cat_features, 'features': _X.columns.tolist()}

    # Score report: last column of the fold records, converted to float.
    fold_maes = pd.Series(np.array(_scores)[:, -1]).astype(float)
    mean_score_report = fold_maes.mean()
    print('Segment: {}'.format(segment))
    print("MAE's for {}-Fold: [{}]".format(CFG.N_SPLITS, np.array(fold_maes.values)))
    print("Mean of MAE's for {}-Fold: [{:.4f}]".format(CFG.N_SPLITS, mean_score_report))

Segment: [toyota], Size: [3,259], KFold: [1/5]:   5%|▌         | 1/20 [01:31<28:53, 91.26s/it]

Segment: skoda
MAE's for 5-Fold: [[5.27930571 4.90646996 4.95128465 5.0047389  5.17197243]]
Mean of MAE's for 5-Fold: [5.0628]


Segment: [mercedes-benz], Size: [2,899], KFold: [1/5]:  10%|█         | 2/20 [02:54<25:55, 86.43s/it]

Segment: toyota
MAE's for 5-Fold: [[4.1059641  4.24671789 3.75450311 3.96244045 3.66441929]]
Mean of MAE's for 5-Fold: [3.9468]


Segment: [nissan], Size: [2,129], KFold: [1/5]:  15%|█▌        | 3/20 [03:49<20:23, 71.97s/it]       

Segment: mercedes-benz
MAE's for 5-Fold: [[5.73692497 6.14591915 6.48865678 6.39429582 6.10391945]]
Mean of MAE's for 5-Fold: [6.1739]


Segment: [fiat], Size: [1,164], KFold: [1/5]:  20%|██        | 4/20 [05:06<19:42, 73.93s/it]  

Segment: nissan
MAE's for 5-Fold: [[4.12319566 3.73352697 3.90533664 3.67839168 3.7707488 ]]
Mean of MAE's for 5-Fold: [3.8422]


Segment: [audi], Size: [5,597], KFold: [1/5]:  25%|██▌       | 5/20 [06:00<16:42, 66.83s/it]

Segment: fiat
MAE's for 5-Fold: [[3.93272142 3.91055964 4.55429733 3.56624039 3.23648   ]]
Mean of MAE's for 5-Fold: [3.8401]


Segment: [renault], Size: [3,853], KFold: [1/5]:  30%|███       | 6/20 [07:42<18:25, 78.96s/it]

Segment: audi
MAE's for 5-Fold: [[5.11681304 5.13208235 5.3393382  5.23660145 5.42451701]]
Mean of MAE's for 5-Fold: [5.2499]


Segment: [volkswagen], Size: [5,693], KFold: [1/5]:  35%|███▌      | 7/20 [09:06<17:27, 80.56s/it]

Segment: renault
MAE's for 5-Fold: [[3.77193083 3.7974503  3.63512774 4.05999892 3.65510841]]
Mean of MAE's for 5-Fold: [3.7839]


Segment: [citroen], Size: [1,129], KFold: [1/5]:  40%|████      | 8/20 [10:54<17:52, 89.34s/it]   

Segment: volkswagen
MAE's for 5-Fold: [[4.9663644  5.13881689 4.78717406 5.28666474 5.11191357]]
Mean of MAE's for 5-Fold: [5.0582]


Segment: [bmw], Size: [5,262], KFold: [1/5]:  45%|████▌     | 9/20 [12:09<15:32, 84.75s/it]    

Segment: citroen
MAE's for 5-Fold: [[2.71719183 2.84118522 3.30059667 2.91212396 2.93065417]]
Mean of MAE's for 5-Fold: [2.9404]


Segment: [opel], Size: [6,651], KFold: [1/5]:  50%|█████     | 10/20 [13:33<14:06, 84.64s/it]

Segment: bmw
MAE's for 5-Fold: [[6.21987335 6.48583752 6.10878835 6.21201522 6.12111121]]
Mean of MAE's for 5-Fold: [6.2295]


Segment: [ford], Size: [5,819], KFold: [1/5]:  55%|█████▌    | 11/20 [15:25<13:56, 92.90s/it]

Segment: opel
MAE's for 5-Fold: [[3.17475194 3.18970058 3.22454318 3.4379252  3.29834055]]
Mean of MAE's for 5-Fold: [3.2651]


Segment: [mazda], Size: [1,572], KFold: [1/5]:  60%|██████    | 12/20 [17:07<12:45, 95.73s/it]

Segment: ford
MAE's for 5-Fold: [[4.38357754 4.02076849 4.40466381 4.11414551 4.03365568]]
Mean of MAE's for 5-Fold: [4.1914]


Segment: [honda], Size: [1,545], KFold: [1/5]:  65%|██████▌   | 13/20 [18:05<09:50, 84.34s/it]

Segment: mazda
MAE's for 5-Fold: [[4.5910505  4.53268008 4.70581671 4.39916804 4.11595546]]
Mean of MAE's for 5-Fold: [4.4689]


Segment: [kia], Size: [2,034], KFold: [1/5]:  70%|███████   | 14/20 [19:15<07:58, 79.83s/it]  

Segment: honda
MAE's for 5-Fold: [[3.88189596 3.818      4.25205451 4.03519047 3.74412169]]
Mean of MAE's for 5-Fold: [3.9463]


Segment: [seat], Size: [1,628], KFold: [1/5]:  75%|███████▌  | 15/20 [20:21<06:18, 75.77s/it]

Segment: kia
MAE's for 5-Fold: [[3.87497878 4.04101242 4.46024407 4.66019195 4.59666032]]
Mean of MAE's for 5-Fold: [4.3266]


Segment: [volvo], Size: [1,352], KFold: [1/5]:  80%|████████  | 16/20 [21:17<04:39, 69.91s/it]

Segment: seat
MAE's for 5-Fold: [[3.89360661 3.54385652 3.43622616 3.01267999 3.51267578]]
Mean of MAE's for 5-Fold: [3.4798]


Segment: [peugeot], Size: [793], KFold: [1/5]:  85%|████████▌ | 17/20 [22:10<03:14, 64.82s/it]

Segment: volvo
MAE's for 5-Fold: [[6.02914565 5.78762835 5.60118503 5.87817962 6.77987783]]
Mean of MAE's for 5-Fold: [6.0152]


Segment: [hyundai], Size: [1,855], KFold: [1/5]:  90%|█████████ | 18/20 [22:53<01:56, 58.15s/it]

Segment: peugeot
MAE's for 5-Fold: [[3.54094552 3.70324212 4.39508426 3.94308584 4.20264149]]
Mean of MAE's for 5-Fold: [3.9570]


Segment: [mitsubishi], Size: [556], KFold: [1/5]:  95%|█████████▌| 19/20 [24:05<01:02, 62.43s/it]

Segment: hyundai
MAE's for 5-Fold: [[4.18520388 4.05813739 4.11325083 4.11431106 4.2249236 ]]
Mean of MAE's for 5-Fold: [4.1392]


Segment: [mitsubishi], Size: [556], KFold: [5/5]: 100%|██████████| 20/20 [25:14<00:00, 75.72s/it]

Segment: mitsubishi
MAE's for 5-Fold: [[4.61996615 4.34172456 4.27423513 3.79216799 5.54718853]]
Mean of MAE's for 5-Fold: [4.5151]





In [58]:
import pickle

# Serialize the stacking artifacts (models, per-segment feature info, fold
# scores), one pickle file each.
checkpoint_targets = {
    './model_checkpoints/segment_stacking_models_brand_kf.pkl': stacking_models,
    './model_checkpoints/segment_stacking_feature_info_brand_kf.pkl': stacking_feature_info,
    './model_checkpoints/segment_stacking_scores_brand_kf.pkl': stacking_scores,
}
for pkl_path, artifact in checkpoint_targets.items():
    with open(pkl_path, 'wb') as f:
        pickle.dump(artifact, f, protocol=pickle.HIGHEST_PROTOCOL)

In [59]:
# Flatten the nested fold records (one list of rows per segment) into a
# single table with one row per (segment, fold).
#
# The frame is built straight from the records rather than through
# np.array(...).reshape(n*5, 5): that route hard-coded the fold count and
# turned every column into strings because each record mixes a str with
# numbers. Here k / n_tr / n_val / score keep their numeric dtypes, so
# sorting on `k` is numeric, and any number of folds per segment works.
score_records = [
    fold_record
    for segment_records in stacking_scores
    for fold_record in segment_records
]
score_df = pd.DataFrame(
    score_records,
    columns=['segment','k','n_tr','n_val','score'],
)

score_df.sort_values(['segment','k']).head(10)

Unnamed: 0,segment,k,n_tr,n_val,score
25,audi,1,4477,1120,5.116813041523893
26,audi,2,4477,1120,5.132082346353567
27,audi,3,4478,1119,5.33933819749306
28,audi,4,4478,1119,5.236601447964262
29,audi,5,4478,1119,5.424517007060739
45,bmw,1,4209,1053,6.219873353909377
46,bmw,2,4209,1053,6.485837518865727
47,bmw,3,4210,1052,6.108788354488365
48,bmw,4,4210,1052,6.212015217673424
49,bmw,5,4210,1052,6.121111208745558


In [60]:
# Rebuild the stacking inputs for inference: feature frames with the
# fuel-type column added, then the base-model prediction columns appended.
y = train_fn[target_feature]
X = add_fuel_type(train_fn.drop(columns=target_feature), dummy_features)
X_test = add_fuel_type(test_fn, dummy_features)

segment_list = X['segment'].unique()

# Column-wise concat (aligned on the index). The 'true' column of the train
# prediction frame is dropped so only the prediction columns are added.
X      = pd.concat([X, tr_pred_df.drop(columns='true')], axis='columns')
X_test = pd.concat([X_test, te_pred_df], axis='columns')

In [62]:
# Stacking inference: for every segment, average the fold models' predictions
# on the train and test rows.
tr_pred_list, te_pred_list = [], []
for segment in tqdm(segment_list):
    seg_info = stacking_feature_info[segment]
    seg_features, seg_cat_features = seg_info['features'], seg_info['cat_features']

    # Segment rows restricted to the columns the segment's models were fit on.
    train_data = X.loc[X['segment'] == segment, seg_features]
    train_dataset = Pool(train_data, cat_features=seg_cat_features)
    test_data = X_test.loc[X_test['segment'] == segment, seg_features]
    test_dataset = Pool(test_data, cat_features=seg_cat_features)

    kfold_models = stacking_models[segment]

    # Train frame: ground truth next to the mean prediction over fold models.
    tr_fold_preds = [model.predict(train_dataset) for model in kfold_models]
    tr_pred_df = pd.DataFrame({
        'segment': segment,
        'true': y[X['segment'] == segment].values.flatten(),
        'pred': np.mean(tr_fold_preds, axis=0),
    })
    tr_pred_df.index = train_data.index

    # Test frame: mean prediction only.
    te_fold_preds = [model.predict(test_dataset) for model in kfold_models]
    te_pred_df = pd.DataFrame({
        'segment': segment,
        'pred': np.mean(te_fold_preds, axis=0),
    })
    te_pred_df.index = test_data.index

    # Exponentiate truth and predictions when the target transformation flag
    # is on (averaging above happens before this step).
    if CFG.TARGET_TRANSFORMATION:
        tr_pred_df[['true', 'pred']] = np.exp(tr_pred_df[['true', 'pred']])
        te_pred_df['pred'] = np.exp(te_pred_df['pred'])

    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:02<00:00,  8.31it/s]


In [63]:
# MAE of the averaged stacking predictions on the training rows. These are
# the rows the models were fitted on, so this is an in-sample figure.
tr_pred_df = pd.concat(tr_pred_list, axis='index').sort_index()
mean_absolute_error(y_true=tr_pred_df['true'], y_pred=tr_pred_df['pred'])

3.9968174582656557

In [64]:
# Combine the per-segment test predictions and sort them by index.
te_pred_df = pd.concat(te_pred_list, axis='index').sort_index()
te_pred_df.head()

Unnamed: 0,segment,pred
0,mazda,81.641591
1,ford,25.985464
2,volkswagen,88.979396
3,renault,122.918797
4,volvo,49.500418


In [65]:
# Fill the sample submission with the test predictions and write it out.
# The values are assigned by position, not by index alignment.
submit = pd.read_csv('./data/sample_submission.csv').assign(
    **{'가격': te_pred_df['pred'].to_numpy()}
)
submit.to_csv('./out/16_ensemble_stacking_segment_브랜드_kfold_logy.csv',index=False)