# Library Setting

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [40]:
class CFG:
    """Notebook-wide settings, read by later cells as ``CFG.<NAME>``."""
    # Seed used for KFold (offset per segment) and for CatBoost's random_state.
    SEED = 0
    
    # Maximum number of categorical columns combined per group in FeatureEngineering.fit.
    SUBSET_DEPTH = 1
    # When True, the "Add the Interaction Term" cell builds pairwise product features.
    INTERACTION = False
    # p-value threshold used by the feature-selection cells.
    FS_ALPHA = 0.01
    
    # Number of KFold splits used for every segment.
    N_SPLITS = 5
    # When True, the target column is replaced by its natural log and predictions are exp()-ed back.
    TARGET_TRANSFORMATION = True
    
    # CatBoost learning rate, iteration cap and early-stopping rounds.
    LR = 0.003
    EPOCHS = 30000
    ES = 300
    # The XGB_* and XTRATREES_* settings are not referenced in the visible part of
    # the notebook; presumably they belong to later model sections (TODO confirm).
    XGB_LR = 0.01     # default=0.3
    XGB_EPOCHS = 1000 # default=100
    XGB_ES = 100
    XTRATREES_EPOCHS = 100 #default=100

<br></br>

# Data

## Data Load

In [41]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [42]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [43]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Target Transformation

In [44]:
if CFG.TARGET_TRANSFORMATION:
    train_df['가격'] = np.log(train_df['가격'])

<br>

## Resetting Columns Type

In [45]:
class TypeResetting:
    """Sorts the dataset's columns into roles and casts each role to one dtype.

    Roles: target, unused (dropped), dummy (int), categorical (str),
    segment (str) and numerical (float; every column not claimed by another role).
    """

    def __init__(self):
        self.seg_features = []
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']

    def add_categorical_features(self,cat_features):
        """Append extra column names to the categorical role (in place)."""
        self.cat_features.extend(cat_features)

    def delete_categorical_features(self,cat_features):
        """Remove the given column names from the categorical role."""
        kept = []
        for name in self.cat_features:
            if name not in cat_features:
                kept.append(name)
        self.cat_features = kept

    def add_segment_features(self,segment_features):
        """Register the 'segment' column and un-register the columns it replaced."""
        self.seg_features = ['segment']
        remaining = []
        for name in self.cat_features:
            if name not in segment_features:
                remaining.append(name)
        self.cat_features = remaining

    def fit(self,data):
        """Fix the role lists; anything not claimed by another role becomes numerical.

        Raises:
            ValueError: a segment was registered but `data` has no 'segment' column.
        """
        segment_expected = len(self.seg_features)>0
        segment_missing = 'segment' not in data.columns
        if segment_expected and segment_missing:
            raise ValueError("segment column name must be 'segment'")

        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']

        claimed = (self.target_feature+self.unuse_features+self.dummy_features
                   +self.cat_features+self.seg_features)
        self.num_features = [name for name in data.columns if name not in claimed]

    def transform(self,data):
        """Return a copy of `data` with every role cast and the unused columns removed."""
        d = data.copy()

        # (columns, dtype that needs no cast, dtype to cast to) — applied in this order
        cast_plan = (
            (self.dummy_features, int,    int),
            (self.cat_features,   object, str),
            (self.num_features,   float,  float),
            (self.seg_features,   object, str),
        )
        for columns, already_ok, cast_to in cast_plan:
            for name in columns:
                if d[name].dtypes!=already_ok:
                    d[name] = d[name].astype(cast_to)

        droppable = [name for name in self.unuse_features if name in d.columns]
        if droppable:
            d = d.drop(columns=droppable)
        return d

    def fit_transform(self,data):
        """Shorthand for fit(data) followed by transform(data)."""
        self.fit(data)
        return self.transform(data)

    def get_feature_type(self):
        """Publish the role lists as module-level globals of the same names."""
        module_namespace = globals()
        for attr in ('target_feature','unuse_features','dummy_features','cat_features','num_features'):
            module_namespace[attr] = getattr(self, attr)

In [46]:
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [47]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    """Report, per column, how many distinct values occur in only one of the two frames.

    Prints a short report for each column and returns the columns for which
    the test frame holds no value that is absent from the train frame.
    """
    total = len(cat_features)
    safe_columns = []
    for position,col in enumerate(cat_features, start=1):
        print('[{}/{}] {}'.format(position,total,col))

        train_values = set(train[col].unique())
        test_values  = set(test[col].unique())
        only_train = list(train_values-test_values)
        only_test  = list(test_values-train_values)

        print(' - Only Train:',len(only_train))
        print(' - Only Test :',len(only_test))
        if only_test:
            print('******Warning******')
        else:
            safe_columns.append(col)
        print('')
    return safe_columns

In [48]:
# 브랜드, 차량모델명, 판매구역, 모델출시년도
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [49]:
not_test_only_features

['차량모델명', '판매구역', '브랜드', '모델출시년도']

<br></br>

# New Features

In [50]:
# pd.Series([str(round(int(year)/100,1)) for year in train_df6['생산년도']]).value_counts()

In [51]:
train_df2.head()

Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054


In [52]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    """Return every subset of `ss` as a tuple, ordered by size (empty tuple first)."""
    sizes = range(len(ss)+1)
    return list(chain.from_iterable(combinations(ss, size) for size in sizes))

class FeatureEngineering:
    """Adds hand-crafted features and per-category target quantiles.

    `fit` learns, for every combination of up to `subset_depth` categorical
    columns, the 0/25/50/75/100 % quantiles of the target column '가격'.
    `transform` adds the derived columns and left-joins the learned quantiles.
    """

    # Quantile levels, in percent, stored as '<group>_Q<level>' columns.
    _QUANTILES = (0, 25, 50, 75, 100)

    # Brand -> country of origin (looked up manually); unknown brands become NaN.
    _BRAND_COUNTRY = {
        'skoda': '체코',
        'toyota': '일본', 'nissan': '일본', 'mazda': '일본', 'honda': '일본', 'mitsubishi': '일본',
        'mercedes-benz': '독일', 'audi': '독일', 'volkswagen': '독일', 'bmw': '독일', 'opel': '독일',
        'fiat': '이탈리아',
        'renault': '프랑스', 'citroen': '프랑스', 'peugeot': '프랑스',
        'ford': '미국',
        'kia': '한국', 'hyundai': '한국',
        'seat': '스페인',
        'volvo': '스웨덴',
    }

    # Country -> continent; unknown countries (including NaN) become NaN.
    _COUNTRY_CONTINENT = {
        '체코': '유럽', '독일': '유럽', '이탈리아': '유럽', '프랑스': '유럽', '스페인': '유럽', '스웨덴': '유럽',
        '일본': '아시아', '한국': '아시아',
        '미국': '아메리카',
    }

    def __init__(self):
        pass

    def _get_quantile(self,x,col):
        """Return a one-row frame holding the quantiles of the non-null values of `x`.

        No longer called by `fit`; kept so existing callers keep working.
        """
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]

        agg_df = pd.DataFrame(index=[0])
        for q in self._QUANTILES:
            agg_df[f'{col}_Q{q}'] = np.quantile(x,q/100)

        return agg_df

    def _derived_features(self,data):
        """Return a copy of `data` with the five derived columns added."""
        d = data.copy()
        production_year = d['생산년도'].astype(int)
        release_year    = d['모델출시년도'].astype(int)

        # (1) Was the car produced in the model's release year?
        d['출시년도생산여부'] = np.where(production_year==release_year,1,0)

        # (2) Years between the model's release and the car's production
        d['출시이후생산년수'] = production_year-release_year

        # (3) Was the car produced before the model's release year?
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) Country of the brand
        d['브랜드국적'] = [self._BRAND_COUNTRY.get(brand, np.nan) for brand in d['브랜드']]

        # (5) Continent of the brand's country
        d['브랜드대륙명'] = [self._COUNTRY_CONTINENT.get(country, np.nan) for country in d['브랜드국적']]
        return d

    def fit(self,data,cat_features,subset_depth=1):
        """Learn the target quantiles for every group of 1..subset_depth categorical columns.

        Args:
            data: training frame; must contain the target column '가격'.
            cat_features: categorical column names to group by.
            subset_depth: largest number of columns combined into one group key.
        """
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'

        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명']

        # (6) Quantiles of the target per categorical group.
        # Only sizes 1..subset_depth are generated, instead of building the full
        # power set and filtering it afterwards.
        subsets = [list(combo)
                   for size in range(1, subset_depth+1)
                   for combo in combinations(cat_features, size)]

        self.agg_dict = {}
        # name -> list of key columns, so transform never has to parse the name
        self.agg_keys = {}
        levels = [q/100 for q in self._QUANTILES]
        for subset in tqdm(subsets,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset_name = '_'.join(subset)
            # one row per group, one column per quantile level (NaN targets are skipped)
            quantiles = data.groupby(subset)['가격'].quantile(levels).unstack()
            quantiles.columns = [f'{subset_name}_Q{q}' for q in self._QUANTILES]
            self.agg_dict[subset_name] = quantiles.reset_index()
            self.agg_keys[subset_name] = subset

    def transform(self,data):
        """Add the derived columns, then left-join every learned quantile table."""
        data = self._derived_features(data)
        key_columns = getattr(self, 'agg_keys', {})
        for name,agg_fn in self.agg_dict.items():
            # Fall back to parsing the name only for objects fitted by the older
            # implementation, which did not record the key columns.
            on = key_columns.get(name, name.split('_'))
            data = pd.merge(data,agg_fn,how='left',on=on)
        return data

    def fit_transform(self,data,cat_features,subset_depth=1):
        """Shorthand for fit(...) followed by transform(data)."""
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [53]:
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=1): 100%|██████████| 4/4 [00:00<00:00, 12.15it/s]


In [54]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명']

In [55]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [56]:
print(train_df3.shape)
train_df3.head()

(57920, 39)


Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0.792993,2.933059,3.712352,4.065687,4.830312,0.262364,3.206803,3.77391,4.355041,5.049856,0.727549,3.393501,4.057853,4.509072,5.049856,0.157004,4.019486,4.355041,4.64314,5.049856
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,2.687167,3.594569,4.112512,4.387075,4.761062,1.095273,3.113071,3.520461,4.066802,5.049856,0.157004,3.353407,4.000034,4.435212,5.049856,1.358409,3.089678,3.348851,3.785779,5.049856
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,2.250239,3.147165,3.440418,3.627069,4.866534,0.482426,3.089678,3.707577,4.296605,5.049856,1.095273,3.152736,3.785779,4.354655,5.049856,0.955511,2.453588,2.826722,3.353407,5.049022
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,2.164472,4.368303,4.575844,4.761062,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856,1.050822,3.558201,3.923359,4.405499,5.049022,0.955511,2.341806,2.738903,3.095125,4.886356
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,2.134166,2.894253,3.065725,3.21072,3.660223,0.732368,3.201526,3.755837,4.305416,5.049856,1.111858,2.865054,3.529985,4.174387,4.969049,1.697449,3.24921,3.626206,3.948741,5.049856


<br></br>

# EDA

In [58]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [59]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    """Return the absolute value of the Pearson correlation between `x` and `y`."""
    pearson_r = np.corrcoef(x,y)[0,1]
    return np.abs(pearson_r)

class InteractionTerm:
    """Builds pairwise product features that are not near-copies of their factors.

    `fit` keeps the product of two numerical columns only when its absolute
    correlation with each factor is below `corr_cutoff`; `transform` adds the
    kept products as columns named '<col_i>*<col_j>'.
    """

    def __init__(self):
        pass

    def fit(self,data,num_features,corr_cutoff=0.7):
        """Choose the column pairs whose product will become a feature.

        Args:
            data: frame containing every column in `num_features`.
            num_features: names of the numerical columns to pair up.
            corr_cutoff: a product is rejected when its absolute correlation
                with either factor is at or above this value.
        """
        # Kept from the original implementation: silences pandas' PerformanceWarning process-wide.
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

        self.interaction_list = []
        # Same order as interaction_list; the pairs are stored so that transform
        # never has to split a name on '*', which a column name may contain.
        self.interaction_pairs = []
        for i in range(len(num_features)):
            col_i = num_features[i]
            for j in range(i):
                col_j = num_features[j]
                product = data[col_i]*data[col_j]

                # Skip the interaction when the product is highly correlated with a factor.
                if (get_abs_corr(product,data[col_i])>=corr_cutoff) or (get_abs_corr(product,data[col_j])>=corr_cutoff):
                    continue
                self.interaction_list.append(f'{col_i}*{col_j}')
                self.interaction_pairs.append((col_i,col_j))

    def transform(self,data):
        """Return a copy of `data` with one product column per fitted interaction."""
        d = data.copy()

        pairs = getattr(self,'interaction_pairs',None)
        if pairs is None or len(pairs)!=len(self.interaction_list):
            # Object fitted by the older implementation: recover the pairs from the names.
            pairs = [tuple(name.split('*')) for name in self.interaction_list]

        new_columns = {}
        for name,(col_i,col_j) in zip(self.interaction_list,pairs):
            product = d[col_i]*d[col_j]
            if name in d.columns:
                d[name] = product                     # overwrite in place, as before
            else:
                new_columns[name] = product.to_numpy()

        if new_columns:
            # One concat instead of many single-column inserts avoids frame fragmentation.
            d = pd.concat([d,pd.DataFrame(new_columns,index=d.index)],axis=1)
        return d

    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        """Shorthand for fit(...) followed by transform(data)."""
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [60]:
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.7,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

<br></br>

# Feature Selection

In [61]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [62]:
def log_offset(x):
    """Natural log of `x`, shifted when needed so that every value is strictly positive.

    - min(x) > 0  : no shift.
    - min(x) == 0 : shift by 1e-3.
    - min(x) < 0  : shift by -min(x) + 1e-3, so the smallest value maps to
                    log(1e-3); the minimum is printed.

    The earlier version added min(x) + 1e-3 in the negative case, which pushed
    the values further below zero and made np.log return NaN.
    """
    minimum = np.min(x)
    if minimum>0:
        offset = 0
    elif minimum==0:
        offset = 1e-3
    else:
        offset = -minimum+1e-3
        print('minimum = {:.3f}'.format(minimum))
    return np.log(x+offset)

<br></br>

## Categorical Features

In [63]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [64]:
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) ANOVA를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in tqdm(check_cat_features):
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 9/9 [00:03<00:00,  2.48it/s]


In [65]:
# (2) Categorical features that were not significant in step (1) are marked for removal.
#     The previous version of this cell selected a non-existent 'target' column and
#     tried to log-transform string-valued categories, so it raised as soon as any
#     feature was non-significant. A log transform is undefined for categories, so
#     the step-(1) p-values are carried over unchanged: every such feature ends up
#     in `delete_features` in the next cell and `log_features` stays empty.
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
pvalue_list2 = pvalue_df.loc[
    pvalue_df.feature.isin(unsignificant_features),['feature','pvalue']
].values.tolist()

pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [66]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [67]:
import scipy

In [68]:
# (1) corr test를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in num_features:
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df.round(4).head()

In [69]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],log_offset(train_df5[col]))
    pvalue_list2.append([col,pvalue])
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df2.round(4).head()

0it [00:00, ?it/s]


In [70]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df6 = train_df5.copy()
train_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df6[col] = log_offset(train_df6[col])
    
test_df6 = test_df5.copy()
test_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df6[col] = log_offset(test_df6[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br></br>

# Make Segment

In [71]:
def make_segment(data,segment: list):
    """Return a copy of `data` where the `segment` columns are merged into one 'segment' column.

    The values of the listed columns are joined row-wise with '___'; the
    listed columns themselves are then removed.
    """
    separator = '___'
    joined = data[segment].apply(separator.join,axis=1)
    with_segment = data.assign(segment=joined)
    return with_segment.drop(columns=segment)

In [72]:
segment = ['브랜드']
train_df7 = make_segment(train_df6,segment)
test_df7  = make_segment(test_df6 ,segment)

In [73]:
test_only = list(set(test_df7.segment.unique())-set(train_df7.segment.unique()))
assert len(test_only)==0, \
    "Segment exists only in the test set ({})".format(len(test_only))

In [74]:
train_only = list(set(train_df7['segment'].unique())-set(test_df7['segment'].unique()))

n_asis = len(train_df7)
n_tobe = len(train_df7[~train_df7.segment.isin(train_only)])
train_df7 = train_df7[~train_df7.segment.isin(train_only)]
print('> Train에만 존재하는 Segment 제거')
print(' - 데이터수 : {:,} -> {:,}'.format(n_asis,n_tobe))
print(' - 세그먼트수 : {:,}'.format(train_df7['segment'].nunique()))

> Train에만 존재하는 Segment 제거
 - 데이터수 : 57,920 -> 57,920
 - 세그먼트수 : 20


In [75]:
vc = train_df7['segment'].value_counts().sort_values()
display(vc.head())
print('...')
display(vc.tail())

segment
mitsubishi     556
peugeot        793
citroen       1129
fiat          1164
volvo         1352
Name: count, dtype: int64

...


segment
bmw           5262
audi          5597
volkswagen    5693
ford          5819
opel          6651
Name: count, dtype: int64

In [76]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.add_segment_features(segment)
type_resetor.fit(train_df7)
type_resetor.get_feature_type()

train_df7 = type_resetor.transform(train_df7)
test_df7  = type_resetor.transform(test_df7)

In [77]:
cat_features

['차량모델명',
 '판매도시',
 '판매구역',
 '생산년도',
 '모델출시년도',
 '출시년도생산여부',
 '출시이후생산년수',
 '출시이전생산여부',
 '브랜드국적',
 '브랜드대륙명']

In [78]:
print(train_df7.shape)
train_df7.head()

(57920, 39)


Unnamed: 0,생산년도,모델출시년도,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,segment
0,2018,2014,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,3.946231,0,4,0,체코,유럽,0.792993,2.933059,3.712352,4.065687,4.830312,0.262364,3.206803,3.77391,4.355041,5.049856,0.727549,3.393501,4.057853,4.509072,5.049856,0.157004,4.019486,4.355041,4.64314,5.049856,skoda
1,2010,2006,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,3.72497,0,4,0,일본,아시아,2.687167,3.594569,4.112512,4.387075,4.761062,1.095273,3.113071,3.520461,4.066802,5.049856,0.157004,3.353407,4.000034,4.435212,5.049856,1.358409,3.089678,3.348851,3.785779,5.049856,toyota
2,2002,2002,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,2.87976,1,0,0,독일,유럽,2.250239,3.147165,3.440418,3.627069,4.866534,0.482426,3.089678,3.707577,4.296605,5.049856,1.095273,3.152736,3.785779,4.354655,5.049856,0.955511,2.453588,2.826722,3.353407,5.049022,mercedes-benz
3,2006,2001,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,2.901422,0,5,0,일본,아시아,2.164472,4.368303,4.575844,4.761062,5.049022,0.482426,3.089678,3.707577,4.296605,5.049856,1.050822,3.558201,3.923359,4.405499,5.049022,0.955511,2.341806,2.738903,3.095125,4.886356,nissan
4,2007,2007,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,2.865054,1,0,0,이탈리아,유럽,2.134166,2.894253,3.065725,3.21072,3.660223,0.732368,3.201526,3.755837,4.305416,5.049856,1.111858,2.865054,3.529985,4.174387,4.969049,1.697449,3.24921,3.626206,3.948741,5.049856,fiat


<br></br>

# Modeling

In [79]:
import os
def mkdir(paths):
    """Create each directory in `paths` if it does not exist yet.

    Args:
        paths: a single path (str or os.PathLike) or an iterable of paths.

    Missing parent directories are created as well, and a directory that
    appears between the check and the creation is not an error.
    """
    if isinstance(paths,(str,os.PathLike)):
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            os.makedirs(path,exist_ok=True)

In [80]:
## dummy_features는 한가지만 속함
# X[dummy_features].apply(lambda x: np.sum(x),axis=1).value_counts()

def add_fuel_type(data,dummy_features):
    """Collapse the one-hot fuel columns into a single 'fuel_type' column.

    Args:
        data: frame containing every column in `dummy_features`.
        dummy_features: names of the 0/1 indicator columns.

    Returns:
        A copy of `data` without the indicator columns and with 'fuel_type'
        set to the name of the first indicator equal to 1 in each row.

    Raises:
        IndexError: a row has no indicator equal to 1.
    """
    d = data.copy()
    flags = d[dummy_features].to_numpy()==1

    has_flag = flags.any(axis=1)
    if not has_flag.all():
        bad_rows = d.index[~has_flag].tolist()
        raise IndexError(
            'no column of {} equals 1 for row(s): {}'.format(list(dummy_features),bad_rows[:10]))

    # argmax returns the position of the first True in each row
    names = np.asarray(list(dummy_features),dtype=object)
    d['fuel_type'] = names[flags.argmax(axis=1)]
    return d.drop(columns=dummy_features)

In [81]:
mkdir('./model_checkpoints')
mkdir('./model_checkpoints/segment_catboost')
mkdir('./model_checkpoints/segment_weightedensemble')

In [86]:
def check_null_cnt(data):
    """Return the number of columns of `data` that contain at least one null value."""
    nulls_per_column = data.isnull().sum()
    return sum(1 for count in nulls_per_column if count!=0)

check_null_cnt(train_df7),check_null_cnt(test_df7)

(0, 0)

<br>

## CatBoost
- public score : 5.7424655998

In [82]:
gc.collect()

0

In [83]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool



In [87]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [88]:
%%time
# 1시간

# Train one set of CFG.N_SPLITS CatBoost models per segment (K-fold inside each segment).
# Results are collected in three containers:
#   models[segment]       -> list of fitted models, one per fold
#   feature_info[segment] -> columns (and categorical columns) used for that segment
#   scores                -> one list per segment of [segment, k, n_train, n_valid, MAE]
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

# Replace the one-hot fuel columns with a single categorical 'fuel_type' column.
X = add_fuel_type(X,dummy_features)
new_cat_features = cat_features + ['fuel_type']

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

s_i = 0
# NOTE(review): the loop variable rebinds the global name `segment`, which an earlier
# cell set to the list ['브랜드']; re-running that earlier section after this cell will
# see a string instead of the list.
for segment in pbar:
    s_i+=1
    
    # Select the rows that belong to this segment
    _X = X[X.segment==segment].drop('segment',axis=1)
    _y = y[X.segment==segment]
    
    # K-fold splitter; the seed is offset by the segment counter so each segment is shuffled differently
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # Drop columns that hold a single distinct value within this segment
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
        
    # Keep only the categorical columns that survived the drop above
    fixed_cat_features = [col for col in new_cat_features if col in _X.columns]
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        
        # Train / validation split for this fold
        X_tr, X_va = _X.iloc[tr_idx], _X.iloc[val_idx]
        y_tr, y_va = _y.iloc[tr_idx], _y.iloc[val_idx]

        # Show the current segment and fold in the progress bar
        progress = 'Segment: [{}], Size: [{:,}], KFold: [{}/{}]'\
            .format(segment,len(_X),k,CFG.N_SPLITS)
        pbar.set_description(progress)

        # CatBoost datasets
        train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
        valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)

        # define the model
        model = CatBoostRegressor(
            loss_function='MAE',
            random_state=CFG.SEED,
            iterations=CFG.EPOCHS,
            learning_rate=CFG.LR,
            allow_writing_files=False,
        )

        # fit the model; the validation fold is the early-stopping set
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=CFG.ES,
            verbose=0,
            #metric_period=CFG.EPOCHS//5,
        )

        # save the model (one file per segment and fold)
        model.save_model(f'./model_checkpoints/segment_catboost/{segment}_k{k}.cbm')

        # prediction on the validation fold
        y_pred = model.predict(valid_dataset).flatten()
        y_true = y_va.values
        
        # inverse transform: undo the log applied to the target so the MAE is on the price scale
        if CFG.TARGET_TRANSFORMATION:
            y_pred = np.exp(y_pred)
            y_true = np.exp(y_true)
            
        # calculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {'cat_features':fixed_cat_features,'features':_X.columns.tolist()}
    
    # score report; the last column of each row is the fold MAE, cast back to float after np.array()
    mean_score_report = pd.Series(np.array(_scores)[:,-1]).astype(float).mean()
    print('Segment: {}'.format(segment))
    print("MAE's for {}-Fold: [{}]".format(CFG.N_SPLITS,np.array(pd.Series(np.array(_scores)[:,-1]).astype(float).values)))
    print("Mean of MAE's for {}-Fold: [{:.4f}]".format(CFG.N_SPLITS,mean_score_report))

Segment: [toyota], Size: [3,259], KFold: [1/5]:   5%|▌         | 1/20 [03:45<1:11:20, 225.29s/it]

MAE's for 5-Fold: [[6.4846382  6.29849737 6.09677369 6.37160989 6.5596939 ]]
Mean of MAE's for 5-Fold: [6.3622]


Segment: [mercedes-benz], Size: [2,899], KFold: [1/5]:  10%|█         | 2/20 [07:02<1:02:33, 208.51s/it]

MAE's for 5-Fold: [[5.50277705 5.4283477  4.97532247 5.41614929 4.79000968]]
Mean of MAE's for 5-Fold: [5.2225]


Segment: [nissan], Size: [2,129], KFold: [1/5]:  15%|█▌        | 3/20 [09:37<52:13, 184.34s/it]         

MAE's for 5-Fold: [[7.90876873 8.31644407 8.72588954 8.60516257 8.48442461]]
Mean of MAE's for 5-Fold: [8.4081]


Segment: [fiat], Size: [1,164], KFold: [1/5]:  20%|██        | 4/20 [11:14<39:59, 149.98s/it]  

MAE's for 5-Fold: [[5.13766126 5.41384356 5.13735885 4.73588891 5.10349861]]
Mean of MAE's for 5-Fold: [5.1057]


Segment: [audi], Size: [5,597], KFold: [1/5]:  25%|██▌       | 5/20 [12:22<30:01, 120.10s/it]

MAE's for 5-Fold: [[5.54728136 5.18640719 6.10060993 4.78896115 4.5949196 ]]
Mean of MAE's for 5-Fold: [5.2436]


Segment: [renault], Size: [3,853], KFold: [1/5]:  30%|███       | 6/20 [16:41<39:05, 167.53s/it]

MAE's for 5-Fold: [[6.23900216 6.32118158 6.26027663 6.31529324 6.56140697]]
Mean of MAE's for 5-Fold: [6.3394]


Segment: [volkswagen], Size: [5,693], KFold: [1/5]:  35%|███▌      | 7/20 [20:12<39:21, 181.67s/it]

MAE's for 5-Fold: [[5.04308546 4.96334157 4.6168621  5.20506756 4.84420244]]
Mean of MAE's for 5-Fold: [4.9345]


Segment: [citroen], Size: [1,129], KFold: [1/5]:  40%|████      | 8/20 [24:59<43:02, 215.17s/it]   

MAE's for 5-Fold: [[6.21581662 6.28777199 5.81087536 6.3734847  6.29000547]]
Mean of MAE's for 5-Fold: [6.1956]


Segment: [bmw], Size: [5,262], KFold: [1/5]:  45%|████▌     | 9/20 [26:37<32:44, 178.62s/it]    

MAE's for 5-Fold: [[3.89753784 4.04415935 4.2654294  3.73446017 3.95632655]]
Mean of MAE's for 5-Fold: [3.9796]


Segment: [opel], Size: [6,651], KFold: [1/5]:  50%|█████     | 10/20 [30:33<32:44, 196.45s/it]

MAE's for 5-Fold: [[7.83360871 7.85673303 7.43370159 7.66156593 7.58397222]]
Mean of MAE's for 5-Fold: [7.6739]


Segment: [ford], Size: [5,819], KFold: [1/5]:  55%|█████▌    | 11/20 [37:59<40:53, 272.61s/it]

MAE's for 5-Fold: [[3.82272987 3.87317548 4.01455074 4.17666697 4.20037761]]
Mean of MAE's for 5-Fold: [4.0175]


Segment: [mazda], Size: [1,572], KFold: [1/5]:  60%|██████    | 12/20 [43:50<39:32, 296.56s/it]

MAE's for 5-Fold: [[5.42188801 5.07013624 5.39536204 4.97861164 5.04049399]]
Mean of MAE's for 5-Fold: [5.1813]


Segment: [honda], Size: [1,545], KFold: [1/5]:  65%|██████▌   | 13/20 [45:33<27:44, 237.82s/it]

MAE's for 5-Fold: [[6.09866766 5.78864227 6.21624264 5.43542285 5.33064598]]
Mean of MAE's for 5-Fold: [5.7739]


Segment: [kia], Size: [2,034], KFold: [1/5]:  70%|███████   | 14/20 [48:11<21:22, 213.75s/it]  

MAE's for 5-Fold: [[5.35104056 5.16282199 5.97330917 5.37972528 5.2727702 ]]
Mean of MAE's for 5-Fold: [5.4279]


Segment: [seat], Size: [1,628], KFold: [1/5]:  75%|███████▌  | 15/20 [50:19<15:39, 187.97s/it]

MAE's for 5-Fold: [[5.34367253 5.48476565 5.81173857 6.09031243 6.16285997]]
Mean of MAE's for 5-Fold: [5.7787]


Segment: [volvo], Size: [1,352], KFold: [1/5]:  80%|████████  | 16/20 [52:28<11:20, 170.20s/it]

MAE's for 5-Fold: [[5.28285608 4.99382496 5.03537545 4.17782385 5.02775829]]
Mean of MAE's for 5-Fold: [4.9035]


Segment: [peugeot], Size: [793], KFold: [1/5]:  85%|████████▌ | 17/20 [54:05<07:24, 148.02s/it]

MAE's for 5-Fold: [[8.28700688 8.66633904 7.56193344 7.98586088 9.50440423]]
Mean of MAE's for 5-Fold: [8.4011]


Segment: [hyundai], Size: [1,855], KFold: [1/5]:  90%|█████████ | 18/20 [54:58<03:58, 119.48s/it]

MAE's for 5-Fold: [[5.388665   5.49042047 6.7541546  5.51941456 5.54787835]]
Mean of MAE's for 5-Fold: [5.7401]


Segment: [mitsubishi], Size: [556], KFold: [1/5]:  95%|█████████▌| 19/20 [57:14<02:04, 124.52s/it]

MAE's for 5-Fold: [[5.64782553 5.52425378 5.66301542 5.97046398 5.89874045]]
Mean of MAE's for 5-Fold: [5.7409]


Segment: [mitsubishi], Size: [556], KFold: [5/5]: 100%|██████████| 20/20 [58:24<00:00, 175.22s/it]

MAE's for 5-Fold: [[6.30852768 5.82252881 6.21202783 5.0792416  7.65777788]]
Mean of MAE's for 5-Fold: [6.2160]
CPU times: user 2h 32min 57s, sys: 39min 47s, total: 3h 12min 44s
Wall time: 58min 27s





In [89]:
import pickle

# Persist the `models`, `feature_info` and `scores` objects, one pickle file each.
checkpoint_payloads = {
    './model_checkpoints/segment_cat_models_brand_kf.pkl': models,
    './model_checkpoints/segment_cat_feature_info_brand_kf.pkl': feature_info,
    './model_checkpoints/segment_cat_scores_brand_kf.pkl': scores,
}
for checkpoint_path, payload in checkpoint_payloads.items():
    with open(checkpoint_path, 'wb') as f:
        pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [90]:
# `scores` holds one list per segment, each containing one
# [segment, k, n_tr, n_val, score] row per fold. Flatten it row by row instead
# of going through `np.array(...).reshape(len(scores)*5, 5)`: that hard-coded
# the number of folds and coerced every value (including the numeric ones) to
# a string.
score_df = pd.DataFrame(
    [fold_row for segment_rows in scores for fold_row in segment_rows],
    columns=['segment','k','n_tr','n_val','score']
)

score_df.sort_values(['segment','k']).head(10)

Unnamed: 0,segment,k,n_tr,n_val,score
25,audi,1,4477,1120,6.239002156100294
26,audi,2,4477,1120,6.321181576622522
27,audi,3,4478,1119,6.260276629787235
28,audi,4,4478,1119,6.315293236485838
29,audi,5,4478,1119,6.561406965430303
45,bmw,1,4209,1053,7.833608708294394
46,bmw,2,4209,1053,7.856733027009104
47,bmw,3,4210,1052,7.433701589265777
48,bmw,4,4210,1052,7.6615659345350675
49,bmw,5,4210,1052,7.58397221642056


In [91]:
# Inference: for every segment, average the predictions of its K-fold models
# on both the training rows and the test rows of that segment.
X = add_fuel_type(train_fn.drop(target_feature,axis=1),dummy_features)
y = train_fn[target_feature]

X_test = add_fuel_type(test_fn,dummy_features)

segment_list = X['segment'].unique()


def _mean_over_folds(fold_models,dataset):
    """Element-wise mean of the predictions of all fold models on `dataset`."""
    return np.mean([fold_model.predict(dataset) for fold_model in fold_models],axis=0)


tr_pred_list, te_pred_list = [], []
for segment in tqdm(segment_list):
    # Feature lists recorded for this segment at training time
    seg_info = feature_info[segment]
    seg_features = seg_info['features']
    seg_cat_features = seg_info['cat_features']
    is_train_segment = X.segment==segment

    ## data load
    # (1) train
    train_data = X.loc[is_train_segment,seg_features]
    train_dataset = Pool(train_data,cat_features=seg_cat_features)
    # (2) test
    test_data = X_test.loc[X_test.segment==segment,seg_features]
    test_dataset = Pool(test_data,cat_features=seg_cat_features)

    ## model
    kfold_models = models[segment]

    ## prediction (frames keep the row index of the data they were predicted from)
    # (1) train
    tr_pred_df = pd.DataFrame(
        {
            'segment':segment,
            'true':y[is_train_segment].values.flatten(),
            'pred':_mean_over_folds(kfold_models,train_dataset),
        },
        index=train_data.index,
    )
    # (2) test
    te_pred_df = pd.DataFrame(
        {
            'segment':segment,
            'pred':_mean_over_folds(kfold_models,test_dataset),
        },
        index=test_data.index,
    )

    ## Target Transformation: undo the log applied to the target
    if CFG.TARGET_TRANSFORMATION:
        for col in ('true','pred'):
            tr_pred_df[col] = np.exp(tr_pred_df[col])
        te_pred_df['pred'] = np.exp(te_pred_df['pred'])

    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:04<00:00,  4.70it/s]


In [92]:
# Training-set MAE: stack the per-segment frames, restore the original row
# order, then compare the `true` and `pred` columns.
tr_pred_df = pd.concat(tr_pred_list).sort_index()
mean_absolute_error(y_true=tr_pred_df['true'],y_pred=tr_pred_df['pred'])

4.958517151776045

In [93]:
# Stack the per-segment test predictions and restore the original row order.
te_pred_df = pd.concat(te_pred_list).sort_index()
te_pred_df.head(5)

Unnamed: 0,segment,pred
0,mazda,85.436333
1,ford,26.748791
2,volkswagen,94.951675
3,renault,122.831476
4,volvo,52.454936


In [94]:
# Fill the sample submission with the test predictions (positional assignment,
# so `te_pred_df` must already be in submission row order) and write it out.
submit = pd.read_csv('./data/sample_submission.csv')
submit = submit.assign(**{'가격': te_pred_df['pred'].to_numpy()})
submit.to_csv('./out/10_catboost_segment_브랜드_kfold_logy.csv',index=False)

- weighted sum으로 바꾸기
- cat, xgb, lgb 비교해서 필요없는거 하나 빼기?

<br>

## Weighted Ensemble
- public score : 5.7185798322

In [95]:
import pandas as pd
import warnings

class OneHotEncoder:
    """Minimal drop-first one-hot encoder for pandas DataFrames.

    `fit` records, for every requested column, all of its sorted unique values
    except the first one. `transform` adds one 0/1 indicator column named
    ``f'{col}_{value}'`` per recorded pair and removes the encoded source
    columns. Values that were not seen during `fit` get 0 in every indicator.
    """
    def __init__(self):
        pass

    def fit(self,data,columns):
        """Record the (column, value) pairs that will become indicator columns.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame whose unique values define the categories.
        columns : iterable
            Labels of the columns to encode.
        """
        self.transform_list = []
        for col in columns:
            for i,value in enumerate(sorted(data[col].unique())):
                # drop-first: the smallest value is the implicit baseline
                if i>0:
                    self.transform_list.append([col,value])

    def transform(self,data):
        """Return a copy of `data` with the fitted columns one-hot encoded.

        The input frame is not modified. A fitted column that produced no
        indicator (a single unique value at fit time) is left untouched.
        """
        new_data = data.copy()
        # Adding many columns one by one triggers pandas' PerformanceWarning;
        # silence it only inside this block instead of changing the
        # process-wide warning filters.
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
            for col,value in self.transform_list:
                new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)
        # Collect the encoded source columns in first-seen order without going
        # through a NumPy array: that would stringify non-string labels and
        # fails with IndexError when nothing was fitted.
        drop_columns = list(dict.fromkeys(col for col,_ in self.transform_list))
        new_data = new_data.drop(columns=drop_columns)
        return new_data

In [96]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time
import pickle

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    """Weighted average of ElasticNet, CatBoost, XGBoost, LightGBM and ExtraTrees.

    Every base regressor is fitted on the same training fold and scored (MAE)
    on one validation fold. The ensemble prediction is the weighted mean of
    the individual predictions.

    Parameters
    ----------
    weight : {'equal', 'balanced'}, default='equal'
        'equal' gives every regressor the same weight; 'balanced' weights each
        regressor by the inverse of its validation MAE. (The previous default,
        the list ``['equal','balanced']``, could never pass the validation
        below, so replacing it cannot break existing callers.)
    target_transformation : bool, default=False
        If True the target passed to `fit` is taken to be on a log scale:
        scores are computed and predictions are returned after `np.exp`.
    """
    def __init__(self,weight='equal',target_transformation=False):
        super().__init__()

        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        assert isinstance(target_transformation,bool), \
            "target_transformation must be bool type"

        self.weight = weight
        self.target_transformation = target_transformation
        self._get_regressors()

    def _get_regressors(self):
        """Instantiate the base regressors; hyper-parameters come from the global `CFG`."""
        max_depth = 10
        n_jobs = -1

        params_elasticnet = {
            'l1_ratio' : np.arange(0.1, 1, 0.1),
            'alphas' : [1e-5, 1e-3, 1e-1, 0.0, 1.0, 10.0, 100.0],
            'cv' : RepeatedKFold(n_splits=CFG.N_SPLITS, n_repeats=3, random_state=CFG.SEED),
            'n_jobs' : n_jobs,
            #'max_iter' : 50000,
            'tol' : 0.001,
        }

        params_catboost = {
            'random_state' : CFG.SEED,
            'iterations' : CFG.EPOCHS,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'loss_function' : 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth' : max_depth,
            #'l2_leaf_reg' : 1,
        }

        params_xgboost = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XGB_EPOCHS,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'objective' : 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        params_lightgbm = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.EPOCHS,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'objective' : 'regression',
            'metric' : 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }

        params_extratrees = {
            'random_state' : CFG.SEED,
            'n_estimators' : CFG.XTRATREES_EPOCHS,
            'criterion' : 'absolute_error',
            'verbose' : 0,
            'max_depth' : max_depth,
            'n_jobs' : n_jobs,
        }

        # `regressors` and `regressors_name` are parallel lists; `fit` and
        # `predict` dispatch on the name to choose the input representation.
        self.regressors = [
            ElasticNetCV(**params_elasticnet),
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lightgbm),
            ExtraTreesRegressor(**params_extratrees),
        ]
        self.regressors_name = ['ElasticNet','CatBoost','XGBoost','LightGBM','ExtraTrees']

    def _adjust_prediction(self,pred):
        """Exponentiate log-scale predictions and patch out-of-range values.

        Requires `self.minimum_value` / `self.maximum_value` (set by `fit`,
        restored by `load_model`) whenever a value actually has to be patched.
        """
        pred = np.array(pred).flatten()
        # NOTE(review): if any raw prediction is negative, every non-positive
        # raw (pre-exp) prediction is replaced by `minimum_value`, which is
        # measured on the exponentiated scale, and is then exponentiated again
        # below. This looks like a log/original scale mix-up; behavior is kept
        # as-is because changing it alters the model's predictions -- confirm.
        if np.where(pred<0,1,0).sum()>0:
            pred = [x if x>0 else self.minimum_value for x in pred]
        pred = np.exp(np.array(pred).flatten())
        # exp() overflow: fall back to the largest target seen during fit
        if np.where(pred==np.inf,1,0).sum()>0:
            pred = [x if x!=np.inf else self.maximum_value for x in pred]
        pred = np.array(pred).flatten()
        return pred

    def _to_output_scale(self,pred):
        """Flatten raw regressor output and undo the target transformation if enabled."""
        if self.target_transformation:
            return self._adjust_prediction(pred)
        return np.array(pred).flatten()

    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        """Fit every base regressor and derive the ensemble weights.

        Parameters
        ----------
        X, y : training features (categoricals not encoded) and target.
        eval_set : list holding exactly one ``(X_val, y_val)`` tuple.
        oh_set : list holding exactly one ``(X_oh, X_val_oh)`` tuple, the
            one-hot encoded counterparts of `X` and `X_val`.
        cat_features : categorical column names of `X`.
        verbose : truthy to show a tqdm progress bar. The per-regressor score
            lines are printed regardless of this flag.

        Returns
        -------
        self
        """
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]

        self.cat_features = cat_features
        self.weights = []
        self.fitting_elapsed = []

        # Ground truth on the scale the scores are reported on. Computed once,
        # so the final ensemble score uses the same scale as the per-regressor
        # scores even when target_transformation is False.
        if self.target_transformation:
            tr_true = np.exp(np.array(y)    .flatten())
            va_true = np.exp(np.array(y_val).flatten())
            self.minimum_value = min(np.nanmin(tr_true),np.nanmin(va_true))
            self.maximum_value = max(np.nanmax(tr_true),np.nanmax(va_true))
        else:
            tr_true = np.array(y).flatten()
            va_true = np.array(y_val).flatten()

        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)

        fit_iter = 0
        for regressor_name,regressor in pbar:
            fit_iter+=1
            s = time.time()

            if verbose:
                # was `set_description(name)`: `name` is undefined (NameError)
                pbar.set_description(regressor_name)

            if regressor_name=='ElasticNet':
                warnings.filterwarnings("ignore", category=UserWarning)
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                tr_pred = regressor.predict(train_dataset)
                va_pred = regressor.predict(val_dataset)
            elif regressor_name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            elif regressor_name=='LightGBM':
                warnings.filterwarnings("ignore", category=UserWarning)
                # LightGBM receives the categorical columns as pandas `category` dtype
                X_tmp = X.copy()
                X_val_tmp = X_val.copy()
                for col in cat_features:
                    X_tmp[col]     = X_tmp[col]    .astype('category')
                    X_val_tmp[col] = X_val_tmp[col].astype('category')
                regressor.fit(
                    X_tmp,y,
                    eval_set=[(X_val_tmp,y_val)],
                    verbose=-1,
                )
                tr_pred = regressor.predict(X_tmp)
                va_pred = regressor.predict(X_val_tmp)
            elif regressor_name=='ExtraTrees':
                regressor.fit(X_oh,y)
                tr_pred = regressor.predict(X_oh)
                va_pred = regressor.predict(X_val_oh)
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))

            tr_pred = self._to_output_scale(tr_pred)
            va_pred = self._to_output_scale(va_pred)
            tr_score = mean_absolute_error(y_pred=tr_pred,y_true=tr_true)
            va_score = mean_absolute_error(y_pred=va_pred,y_true=va_true)
            e = time.time()
            # 'balanced' weighting: inverse validation MAE
            self.weights.append(1/va_score)
            self.fitting_elapsed.append(e-s)

            blank = ' '*(11-len(regressor_name))
            fit_progress = '  [{}/{}] {}{}: score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
                .format(fit_iter,len(self.regressors),regressor_name,blank,tr_score,va_score,e-s)
            print(fit_progress)

        # Normalise the weights to sum to 1. Convert to an ndarray explicitly:
        # `list /= scalar` only worked when the scores were NumPy scalars.
        if self.weight=='equal':
            self.weights = np.ones(len(self.regressors))
        else:
            self.weights = np.asarray(self.weights,dtype=float)
        self.weights = self.weights/self.weights.sum()

        tr_pred = self.predict(X,X_oh)
        va_pred = self.predict(X_val,X_val_oh)
        ens_tr_score = mean_absolute_error(y_true=tr_true,y_pred=tr_pred)
        ens_va_score = mean_absolute_error(y_true=va_true,y_pred=va_pred)

        total_fit_progress = '  Total({}): score={:.3f}, val_score={:.3f}, elasped={:.1f}s'\
            .format(self.weight,ens_tr_score,ens_va_score,sum(self.fitting_elapsed))
        print(total_fit_progress)

        return self

    def predict(self,X,X_oh):
        """Return the weighted ensemble prediction as a 1-D array.

        Parameters
        ----------
        X : features with raw categorical columns (used by CatBoost, LightGBM).
        X_oh : one-hot encoded counterpart of `X` with the same number of rows
            (used by ElasticNet, XGBoost, ExtraTrees).

        With `target_transformation` enabled the result is on the
        exponentiated scale, otherwise on the scale of the fitted target.
        """
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"

        pred_list = []
        for regressor_name,regressor in zip(self.regressors_name,self.regressors):
            if regressor_name in ['ElasticNet','XGBoost','ExtraTrees']:
                dataset = X_oh.copy()
            elif regressor_name=='CatBoost':
                dataset = Pool(X,cat_features=self.cat_features)
            elif regressor_name=='LightGBM':
                dataset = X.copy()
                for col in self.cat_features:
                    dataset[col] = dataset[col].astype('category')
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))

            y_pred = regressor.predict(dataset)
            # previously exp() was applied even with target_transformation=False
            y_pred = self._to_output_scale(y_pred)

            pred_list.append(y_pred)

        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += np.array(pred)*weight

        return final_pred

    def save_model(self,path):
        """Pickle the fitted state (regressors, weights, settings, clamp bounds) to `path`."""
        save_dict = {
            'cat_features' : self.cat_features,
            'weights' : self.weights,
            'target_transformation' : self.target_transformation,
            'fitting_elapsed' : self.fitting_elapsed,
            'regressors' : self.regressors,
            # needed by _adjust_prediction after a reload; None when never set
            'minimum_value' : getattr(self,'minimum_value',None),
            'maximum_value' : getattr(self,'maximum_value',None),
        }
        with open(path, 'wb') as f:
            pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

    def load_model(self,path):
        """Restore the state written by `save_model`.

        WARNING: `pickle.load` can execute arbitrary code; only load files
        from a trusted source.
        """
        with open(path, 'rb') as f:
            save_dict = pickle.load(f)
            self.cat_features = save_dict['cat_features']
            self.weights = save_dict['weights']
            # a stray trailing comma used to turn this flag into a 1-tuple
            self.target_transformation = save_dict['target_transformation']
            self.fitting_elapsed = save_dict['fitting_elapsed']
            self.regressors = save_dict['regressors']
            # absent in checkpoints written before these keys were saved
            for bound in ('minimum_value','maximum_value'):
                if save_dict.get(bound) is not None:
                    setattr(self,bound,save_dict[bound])

In [97]:
from sklearn.model_selection import KFold

In [98]:
# Run a full garbage-collection pass before the long training cell below;
# the value returned by gc.collect() is shown as the cell output.
gc.collect()

0

In [99]:
# Work on copies of train_df7 / test_df7 so that changes made to
# train_fn / test_fn in later cells do not modify those frames.
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [100]:
%%time
# Takes roughly 6 hours.

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]
# Flatten the target to a 1-D Series that shares X's index, so the boolean
# masks built from X below always align with it.
y = pd.Series(y.values.flatten(),index=X.index)

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

segment_list = X['segment'].unique()

models = {}        # segment -> list of fitted fold models
feature_info = {}  # segment -> feature lists actually used for that segment
scores = []        # one list per segment of [segment, k, n_tr, n_val, score] rows
pbar = tqdm(segment_list)

for s_i,segment in enumerate(pbar,start=1):

    # extract the rows that belong to this segment
    _X    = X   [X   .segment==segment].drop('segment',axis=1)
    _X_oh = X_oh[X_oh.segment==segment].drop('segment',axis=1)
    _y    = y   [X   .segment==segment]

    # kfold (a different but reproducible shuffle per segment)
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)

    # drop columns that hold a single unique value within this segment
    # (1) X
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
    # (2) X_oh
    unique_info = _X_oh.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X_oh = _X_oh.drop(unique_cols,axis=1)

    # keep only the categorical features that survived the drop above
    fixed_cat_features = [col for col in cat_features if col in _X.columns]

    # progress
    progress = 'Segment: {}, Length: {}'\
        .format(segment,len(_X))
    pbar.set_description(progress)

    _models = []
    _scores = []
    for k,(tr_idx,val_idx) in enumerate(kf.split(_X,_y),start=1):
        print('> [K-Fold] {}/{}'.format(k,CFG.N_SPLITS))

        # kfold dataset
        X_tr   , X_va    = _X   .iloc[tr_idx], _X   .iloc[val_idx]
        X_tr_oh, X_va_oh = _X_oh.iloc[tr_idx], _X_oh.iloc[val_idx]
        y_tr   , y_va    = _y   .iloc[tr_idx], _y   .iloc[val_idx]

        # define the model
        ensemble_model = WeightedEnsembleRegressor(
            weight='balanced',
            target_transformation=CFG.TARGET_TRANSFORMATION,
        )

        # fit the model
        ensemble_model.fit(
            X_tr,y_tr,
            eval_set=[(X_va,y_va)],
            oh_set=[(X_tr_oh,X_va_oh)],
            cat_features=fixed_cat_features,
            verbose=0,
        )

        # save the model
        ensemble_model.save_model(f'./model_checkpoints/segment_weightedensemble/{segment}_k{k}.pickle')

        # prediction
        y_pred = ensemble_model.predict(X_va,X_va_oh).flatten()
        # With the target transformation enabled, predict() returns
        # exponentiated values while y_va is still on the log scale; bring the
        # truth to the same scale before scoring (the raw log-scale values
        # were compared against exponentiated predictions before).
        y_true = np.exp(y_va.values) if CFG.TARGET_TRANSFORMATION else y_va.values

        # calculate score
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)

        # append inner loop
        _models.append(ensemble_model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {
        'cat_features':fixed_cat_features,
        'features':_X.columns.tolist(),
        'oh_features':_X_oh.columns.tolist(),
    }

Segment: skoda, Length: 3130:   0%|          | 0/20 [00:01<?, ?it/s]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.525, val_score=7.005, elasped=15.0s
  [2/5] CatBoost   : score=4.708, val_score=6.512, elasped=78.3s
  [3/5] XGBoost    : score=1.789, val_score=7.460, elasped=18.9s
  [4/5] LightGBM   : score=5.303, val_score=6.917, elasped=22.4s
  [5/5] ExtraTrees : score=5.468, val_score=7.655, elasped=45.8s
  Total(balanced): score=4.492, val_score=6.612, elasped=180.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.528, val_score=6.876, elasped=15.9s
  [2/5] CatBoost   : score=4.869, val_score=6.207, elasped=74.1s
  [3/5] XGBoost    : score=1.590, val_score=6.827, elasped=19.2s
  [4/5] LightGBM   : score=5.420, val_score=6.503, elasped=22.6s
  [5/5] ExtraTrees : score=5.568, val_score=7.195, elasped=44.7s
  Total(balanced): score=4.499, val_score=6.230, elasped=176.5s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.586, val_score=6.675, elasped=15.3s
  [2/5] CatBoost   : score=4.698, val_score=6.004, elasped=90.1s
  [3/5] XGBoost    : score=1.560, val_score=7.4

Segment: toyota, Length: 3259:   5%|▌         | 1/20 [15:12<4:48:28, 910.98s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.646, val_score=6.306, elasped=19.5s
  [2/5] CatBoost   : score=3.519, val_score=5.545, elasped=99.0s
  [3/5] XGBoost    : score=0.926, val_score=6.257, elasped=12.0s
  [4/5] LightGBM   : score=4.390, val_score=5.987, elasped=20.8s
  [5/5] ExtraTrees : score=4.201, val_score=6.301, elasped=55.1s
  Total(balanced): score=3.377, val_score=5.658, elasped=206.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.453, val_score=6.253, elasped=19.4s
  [2/5] CatBoost   : score=3.866, val_score=5.413, elasped=57.1s
  [3/5] XGBoost    : score=0.664, val_score=5.804, elasped=16.8s
  [4/5] LightGBM   : score=3.599, val_score=5.546, elasped=33.7s
  [5/5] ExtraTrees : score=4.130, val_score=6.169, elasped=53.2s
  Total(balanced): score=3.171, val_score=5.340, elasped=180.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.611, val_score=5.796, elasped=19.6s
  [2/5] CatBoost   : score=3.882, val_score=5.073, elasped=58.2s
  [3/5] XGBoost    : score=0.885, val_score=5.6

Segment: mercedes-benz, Length: 2899:  10%|█         | 2/20 [31:24<4:44:08, 947.12s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=9.001, val_score=8.997, elasped=26.0s
  [2/5] CatBoost   : score=6.157, val_score=7.849, elasped=43.3s
  [3/5] XGBoost    : score=0.922, val_score=8.739, elasped=21.6s
  [4/5] LightGBM   : score=6.467, val_score=8.040, elasped=21.4s
  [5/5] ExtraTrees : score=5.903, val_score=9.269, elasped=51.6s
  Total(balanced): score=5.255, val_score=7.781, elasped=164.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.911, val_score=9.948, elasped=26.2s
  [2/5] CatBoost   : score=5.813, val_score=8.273, elasped=59.8s
  [3/5] XGBoost    : score=1.082, val_score=9.362, elasped=20.4s
  [4/5] LightGBM   : score=6.214, val_score=8.423, elasped=21.9s
  [5/5] ExtraTrees : score=5.827, val_score=9.462, elasped=51.9s
  Total(balanced): score=5.115, val_score=8.219, elasped=180.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.591, val_score=10.241, elasped=26.0s
  [2/5] CatBoost   : score=5.632, val_score=8.371, elasped=69.1s
  [3/5] XGBoost    : score=1.418, val_score=9.

Segment: nissan, Length: 2129:  15%|█▌        | 3/20 [45:50<4:17:53, 910.21s/it]       

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.903, val_score=5.564, elasped=13.9s
  [2/5] CatBoost   : score=3.771, val_score=5.154, elasped=44.1s
  [3/5] XGBoost    : score=0.448, val_score=6.014, elasped=8.9s
  [4/5] LightGBM   : score=3.858, val_score=5.775, elasped=25.3s
  [5/5] ExtraTrees : score=3.755, val_score=6.051, elasped=24.2s
  Total(balanced): score=3.123, val_score=5.324, elasped=116.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.152, val_score=5.286, elasped=12.6s
  [2/5] CatBoost   : score=4.355, val_score=5.279, elasped=23.2s
  [3/5] XGBoost    : score=0.693, val_score=5.888, elasped=9.3s
  [4/5] LightGBM   : score=4.359, val_score=5.422, elasped=17.5s
  [5/5] ExtraTrees : score=3.715, val_score=5.592, elasped=24.3s
  Total(balanced): score=3.456, val_score=5.085, elasped=87.0s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.759, val_score=5.324, elasped=12.2s
  [2/5] CatBoost   : score=3.735, val_score=5.014, elasped=55.0s
  [3/5] XGBoost    : score=0.466, val_score=5.822,

Segment: fiat, Length: 1164:  20%|██        | 4/20 [55:00<3:24:46, 767.94s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.243, val_score=5.969, elasped=5.4s
  [2/5] CatBoost   : score=3.531, val_score=5.314, elasped=45.4s
  [3/5] XGBoost    : score=1.091, val_score=5.524, elasped=4.7s
  [4/5] LightGBM   : score=3.525, val_score=5.073, elasped=18.5s
  [5/5] ExtraTrees : score=3.439, val_score=5.426, elasped=6.4s
  Total(balanced): score=3.083, val_score=5.090, elasped=80.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.357, val_score=5.603, elasped=5.1s
  [2/5] CatBoost   : score=4.055, val_score=5.038, elasped=19.6s
  [3/5] XGBoost    : score=1.198, val_score=5.841, elasped=5.3s
  [4/5] LightGBM   : score=3.840, val_score=5.364, elasped=13.6s
  [5/5] ExtraTrees : score=3.292, val_score=5.621, elasped=6.7s
  Total(balanced): score=3.315, val_score=5.077, elasped=50.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.669, val_score=6.972, elasped=5.4s
  [2/5] CatBoost   : score=3.654, val_score=5.991, elasped=29.1s
  [3/5] XGBoost    : score=1.173, val_score=6.411, elasp

Segment: audi, Length: 5597:  25%|██▌       | 5/20 [1:00:37<2:33:03, 612.20s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.597, val_score=6.748, elasped=55.7s
  [2/5] CatBoost   : score=5.107, val_score=6.251, elasped=78.8s
  [3/5] XGBoost    : score=2.371, val_score=6.829, elasped=28.9s
  [4/5] LightGBM   : score=5.372, val_score=6.530, elasped=23.4s
  [5/5] ExtraTrees : score=5.775, val_score=7.157, elasped=221.8s
  Total(balanced): score=4.775, val_score=6.285, elasped=408.6s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.575, val_score=6.878, elasped=55.6s
  [2/5] CatBoost   : score=5.006, val_score=6.338, elasped=91.6s
  [3/5] XGBoost    : score=2.371, val_score=6.947, elasped=33.7s
  [4/5] LightGBM   : score=5.555, val_score=6.618, elasped=22.5s
  [5/5] ExtraTrees : score=5.873, val_score=7.429, elasped=228.4s
  Total(balanced): score=4.800, val_score=6.484, elasped=431.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.618, val_score=6.806, elasped=57.5s
  [2/5] CatBoost   : score=4.836, val_score=6.308, elasped=119.9s
  [3/5] XGBoost    : score=2.216, val_score=

Segment: renault, Length: 3853:  30%|███       | 6/20 [1:35:29<4:20:17, 1115.54s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.168, val_score=5.371, elasped=25.3s
  [2/5] CatBoost   : score=3.738, val_score=5.001, elasped=55.4s
  [3/5] XGBoost    : score=1.137, val_score=5.417, elasped=15.9s
  [4/5] LightGBM   : score=3.905, val_score=5.080, elasped=19.5s
  [5/5] ExtraTrees : score=4.259, val_score=5.615, elasped=80.8s
  Total(balanced): score=3.372, val_score=4.933, elasped=196.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.120, val_score=5.664, elasped=24.8s
  [2/5] CatBoost   : score=3.410, val_score=4.807, elasped=113.4s
  [3/5] XGBoost    : score=1.459, val_score=5.307, elasped=14.0s
  [4/5] LightGBM   : score=3.694, val_score=4.918, elasped=24.4s
  [5/5] ExtraTrees : score=4.318, val_score=5.951, elasped=81.1s
  Total(balanced): score=3.289, val_score=4.831, elasped=257.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.210, val_score=5.340, elasped=24.9s
  [2/5] CatBoost   : score=3.733, val_score=4.488, elasped=62.8s
  [3/5] XGBoost    : score=1.454, val_score=5.

Segment: volkswagen, Length: 5693:  35%|███▌      | 7/20 [1:53:32<3:59:24, 1104.97s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.622, val_score=6.926, elasped=51.7s
  [2/5] CatBoost   : score=4.982, val_score=6.131, elasped=77.3s
  [3/5] XGBoost    : score=2.364, val_score=6.860, elasped=27.8s
  [4/5] LightGBM   : score=5.199, val_score=6.194, elasped=22.5s
  [5/5] ExtraTrees : score=5.638, val_score=7.259, elasped=205.6s
  Total(balanced): score=4.689, val_score=6.256, elasped=384.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.609, val_score=6.916, elasped=51.2s
  [2/5] CatBoost   : score=5.065, val_score=6.383, elasped=62.6s
  [3/5] XGBoost    : score=2.230, val_score=6.991, elasped=31.0s
  [4/5] LightGBM   : score=5.067, val_score=6.431, elasped=23.5s
  [5/5] ExtraTrees : score=5.574, val_score=7.184, elasped=203.8s
  Total(balanced): score=4.633, val_score=6.364, elasped=372.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.677, val_score=6.702, elasped=51.2s
  [2/5] CatBoost   : score=5.060, val_score=5.910, elasped=69.2s
  [3/5] XGBoost    : score=2.127, val_score=6

Segment: citroen, Length: 1129:  40%|████      | 8/20 [2:26:27<4:36:24, 1382.04s/it]   

> [K-Fold] 1/5
  [1/5] ElasticNet : score=3.603, val_score=3.925, elasped=5.7s
  [2/5] CatBoost   : score=2.765, val_score=3.830, elasped=33.0s
  [3/5] XGBoost    : score=0.603, val_score=4.649, elasped=4.2s
  [4/5] LightGBM   : score=3.369, val_score=3.933, elasped=11.0s
  [5/5] ExtraTrees : score=2.748, val_score=4.859, elasped=8.5s
  Total(balanced): score=2.505, val_score=3.856, elasped=62.4s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=3.728, val_score=3.860, elasped=5.4s
  [2/5] CatBoost   : score=2.775, val_score=4.169, elasped=26.4s
  [3/5] XGBoost    : score=0.650, val_score=4.430, elasped=4.0s
  [4/5] LightGBM   : score=3.006, val_score=4.594, elasped=12.6s
  [5/5] ExtraTrees : score=2.963, val_score=4.918, elasped=9.2s
  Total(balanced): score=2.470, val_score=4.116, elasped=57.5s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=3.411, val_score=4.439, elasped=5.6s
  [2/5] CatBoost   : score=2.561, val_score=4.266, elasped=42.8s
  [3/5] XGBoost    : score=0.574, val_score=5.033, elasp

Segment: bmw, Length: 5262:  45%|████▌     | 9/20 [2:32:15<3:14:03, 1058.46s/it]    

> [K-Fold] 1/5
  [1/5] ElasticNet : score=8.464, val_score=8.989, elasped=50.7s
  [2/5] CatBoost   : score=6.373, val_score=7.892, elasped=55.6s
  [3/5] XGBoost    : score=3.097, val_score=8.409, elasped=23.2s
  [4/5] LightGBM   : score=6.621, val_score=7.925, elasped=19.1s
  [5/5] ExtraTrees : score=6.744, val_score=8.695, elasped=209.6s
  Total(balanced): score=5.860, val_score=7.865, elasped=358.2s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.386, val_score=9.046, elasped=50.2s
  [2/5] CatBoost   : score=6.335, val_score=7.812, elasped=51.5s
  [3/5] XGBoost    : score=2.702, val_score=8.205, elasped=37.0s
  [4/5] LightGBM   : score=6.665, val_score=8.138, elasped=17.8s
  [5/5] ExtraTrees : score=6.728, val_score=8.666, elasped=203.4s
  Total(balanced): score=5.739, val_score=7.916, elasped=359.8s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.527, val_score=8.731, elasped=50.1s
  [2/5] CatBoost   : score=5.978, val_score=7.424, elasped=98.2s
  [3/5] XGBoost    : score=2.866, val_score=8

Segment: opel, Length: 6651:  50%|█████     | 10/20 [3:03:49<3:39:23, 1316.33s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.210, val_score=4.100, elasped=51.6s
  [2/5] CatBoost   : score=3.174, val_score=3.791, elasped=109.2s
  [3/5] XGBoost    : score=1.486, val_score=4.207, elasped=32.5s
  [4/5] LightGBM   : score=3.071, val_score=3.943, elasped=38.7s
  [5/5] ExtraTrees : score=3.857, val_score=4.624, elasped=300.1s
  Total(balanced): score=2.964, val_score=3.872, elasped=532.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.202, val_score=4.162, elasped=56.9s
  [2/5] CatBoost   : score=3.282, val_score=3.886, elasped=73.7s
  [3/5] XGBoost    : score=1.515, val_score=4.416, elasped=34.7s
  [4/5] LightGBM   : score=3.190, val_score=3.948, elasped=33.3s
  [5/5] ExtraTrees : score=3.803, val_score=4.726, elasped=302.7s
  Total(balanced): score=3.021, val_score=3.957, elasped=501.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.162, val_score=4.395, elasped=51.9s
  [2/5] CatBoost   : score=3.230, val_score=3.992, elasped=82.3s
  [3/5] XGBoost    : score=1.602, val_score=

Segment: ford, Length: 5819:  55%|█████▌    | 11/20 [3:47:41<4:17:52, 1719.18s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.418, val_score=6.029, elasped=42.0s
  [2/5] CatBoost   : score=4.007, val_score=5.397, elasped=90.6s
  [3/5] XGBoost    : score=2.069, val_score=5.753, elasped=31.0s
  [4/5] LightGBM   : score=4.124, val_score=5.476, elasped=39.1s
  [5/5] ExtraTrees : score=4.950, val_score=6.484, elasped=236.3s
  Total(balanced): score=3.831, val_score=5.466, elasped=438.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.533, val_score=5.372, elasped=41.3s
  [2/5] CatBoost   : score=4.181, val_score=5.099, elasped=86.3s
  [3/5] XGBoost    : score=2.148, val_score=5.318, elasped=36.2s
  [4/5] LightGBM   : score=4.231, val_score=5.243, elasped=39.2s
  [5/5] ExtraTrees : score=5.009, val_score=6.230, elasped=233.5s
  Total(balanced): score=3.937, val_score=5.035, elasped=436.5s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.383, val_score=6.029, elasped=42.4s
  [2/5] CatBoost   : score=4.057, val_score=5.448, elasped=97.7s
  [3/5] XGBoost    : score=2.150, val_score=6

Segment: mazda, Length: 1572:  60%|██████    | 12/20 [4:25:06<4:10:32, 1879.10s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.752, val_score=6.395, elasped=8.5s
  [2/5] CatBoost   : score=3.967, val_score=6.190, elasped=46.1s
  [3/5] XGBoost    : score=0.732, val_score=7.069, elasped=7.0s
  [4/5] LightGBM   : score=4.680, val_score=6.480, elasped=22.9s
  [5/5] ExtraTrees : score=3.809, val_score=7.626, elasped=11.5s
  Total(balanced): score=3.633, val_score=6.382, elasped=96.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.593, val_score=6.419, elasped=9.5s
  [2/5] CatBoost   : score=4.660, val_score=5.633, elasped=22.2s
  [3/5] XGBoost    : score=1.122, val_score=6.116, elasped=6.0s
  [4/5] LightGBM   : score=4.396, val_score=5.975, elasped=31.0s
  [5/5] ExtraTrees : score=4.108, val_score=6.535, elasped=12.5s
  Total(balanced): score=3.762, val_score=5.788, elasped=81.2s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.399, val_score=6.376, elasped=7.9s
  [2/5] CatBoost   : score=4.265, val_score=6.138, elasped=32.4s
  [3/5] XGBoost    : score=1.172, val_score=6.470, ela

Segment: honda, Length: 1545:  65%|██████▌   | 13/20 [4:33:12<2:49:59, 1457.11s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.645, val_score=6.008, elasped=8.6s
  [2/5] CatBoost   : score=3.373, val_score=5.242, elasped=72.3s
  [3/5] XGBoost    : score=0.720, val_score=6.386, elasped=6.7s
  [4/5] LightGBM   : score=3.444, val_score=5.513, elasped=39.7s
  [5/5] ExtraTrees : score=3.628, val_score=6.283, elasped=13.1s
  Total(balanced): score=3.056, val_score=5.340, elasped=140.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.620, val_score=5.941, elasped=8.1s
  [2/5] CatBoost   : score=3.646, val_score=5.107, elasped=46.4s
  [3/5] XGBoost    : score=1.238, val_score=5.869, elasped=6.5s
  [4/5] LightGBM   : score=3.653, val_score=5.173, elasped=33.0s
  [5/5] ExtraTrees : score=3.590, val_score=6.622, elasped=12.1s
  Total(balanced): score=3.217, val_score=5.196, elasped=106.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.529, val_score=7.172, elasped=8.2s
  [2/5] CatBoost   : score=3.661, val_score=5.806, elasped=44.8s
  [3/5] XGBoost    : score=0.493, val_score=6.818, e

Segment: kia, Length: 2034:  70%|███████   | 14/20 [4:42:00<1:57:39, 1176.55s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.828, val_score=5.519, elasped=11.1s
  [2/5] CatBoost   : score=4.692, val_score=5.320, elasped=35.0s
  [3/5] XGBoost    : score=0.781, val_score=5.697, elasped=9.1s
  [4/5] LightGBM   : score=4.802, val_score=5.590, elasped=25.7s
  [5/5] ExtraTrees : score=4.133, val_score=5.994, elasped=21.6s
  Total(balanced): score=3.809, val_score=5.166, elasped=102.5s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.812, val_score=5.628, elasped=11.0s
  [2/5] CatBoost   : score=4.510, val_score=5.438, elasped=36.4s
  [3/5] XGBoost    : score=1.004, val_score=5.881, elasped=8.1s
  [4/5] LightGBM   : score=4.946, val_score=5.777, elasped=20.6s
  [5/5] ExtraTrees : score=4.117, val_score=5.683, elasped=22.3s
  Total(balanced): score=3.841, val_score=5.276, elasped=98.3s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.759, val_score=6.095, elasped=10.7s
  [2/5] CatBoost   : score=4.464, val_score=5.746, elasped=38.7s
  [3/5] XGBoost    : score=0.740, val_score=6.144,

Segment: seat, Length: 1628:  75%|███████▌  | 15/20 [4:51:54<1:23:24, 1000.91s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=4.645, val_score=5.728, elasped=10.3s
  [2/5] CatBoost   : score=3.284, val_score=5.270, elasped=47.5s
  [3/5] XGBoost    : score=0.699, val_score=6.084, elasped=7.0s
  [4/5] LightGBM   : score=3.309, val_score=5.584, elasped=23.0s
  [5/5] ExtraTrees : score=3.458, val_score=6.519, elasped=17.1s
  Total(balanced): score=2.836, val_score=5.420, elasped=104.9s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=4.635, val_score=5.542, elasped=10.5s
  [2/5] CatBoost   : score=3.436, val_score=4.997, elasped=33.0s
  [3/5] XGBoost    : score=1.230, val_score=5.667, elasped=6.6s
  [4/5] LightGBM   : score=3.576, val_score=5.307, elasped=20.0s
  [5/5] ExtraTrees : score=3.688, val_score=5.957, elasped=17.3s
  Total(balanced): score=3.039, val_score=5.069, elasped=87.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.902, val_score=5.083, elasped=9.8s
  [2/5] CatBoost   : score=3.092, val_score=4.780, elasped=54.9s
  [3/5] XGBoost    : score=1.117, val_score=5.926, 

Segment: volvo, Length: 1352:  80%|████████  | 16/20 [5:01:01<57:37, 864.34s/it]  

> [K-Fold] 1/5
  [1/5] ElasticNet : score=7.945, val_score=8.559, elasped=8.3s
  [2/5] CatBoost   : score=5.615, val_score=8.538, elasped=34.6s
  [3/5] XGBoost    : score=0.733, val_score=9.404, elasped=6.7s
  [4/5] LightGBM   : score=5.323, val_score=8.967, elasped=22.7s
  [5/5] ExtraTrees : score=6.061, val_score=9.126, elasped=12.2s
  Total(balanced): score=4.818, val_score=8.397, elasped=84.5s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=8.072, val_score=9.080, elasped=8.0s
  [2/5] CatBoost   : score=5.519, val_score=8.253, elasped=40.1s
  [3/5] XGBoost    : score=1.154, val_score=10.179, elasped=5.4s
  [4/5] LightGBM   : score=5.144, val_score=8.570, elasped=24.9s
  [5/5] ExtraTrees : score=6.012, val_score=10.671, elasped=12.7s
  Total(balanced): score=4.896, val_score=8.659, elasped=91.1s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=8.207, val_score=8.238, elasped=8.5s
  [2/5] CatBoost   : score=5.726, val_score=7.264, elasped=37.1s
  [3/5] XGBoost    : score=1.120, val_score=8.878, e

Segment: peugeot, Length: 793:  85%|████████▌ | 17/20 [5:08:56<37:22, 747.35s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.120, val_score=5.589, elasped=3.5s
  [2/5] CatBoost   : score=4.108, val_score=5.542, elasped=21.2s
  [3/5] XGBoost    : score=0.667, val_score=6.787, elasped=3.4s
  [4/5] LightGBM   : score=4.670, val_score=5.951, elasped=9.1s
  [5/5] ExtraTrees : score=3.579, val_score=6.368, elasped=5.0s
  Total(balanced): score=3.445, val_score=5.587, elasped=42.3s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.242, val_score=5.187, elasped=3.6s
  [2/5] CatBoost   : score=4.112, val_score=5.596, elasped=25.8s
  [3/5] XGBoost    : score=0.397, val_score=6.046, elasped=3.6s
  [4/5] LightGBM   : score=4.138, val_score=5.268, elasped=15.2s
  [5/5] ExtraTrees : score=3.273, val_score=6.202, elasped=3.7s
  Total(balanced): score=3.263, val_score=5.115, elasped=51.9s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=4.756, val_score=6.300, elasped=3.6s
  [2/5] CatBoost   : score=3.390, val_score=6.336, elasped=42.2s
  [3/5] XGBoost    : score=0.553, val_score=6.964, elaspe

Segment: hyundai, Length: 1855:  90%|█████████ | 18/20 [5:13:17<20:01, 600.99s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=6.148, val_score=6.143, elasped=8.9s
  [2/5] CatBoost   : score=4.166, val_score=5.525, elasped=51.9s
  [3/5] XGBoost    : score=0.846, val_score=6.006, elasped=7.1s
  [4/5] LightGBM   : score=4.158, val_score=5.857, elasped=22.3s
  [5/5] ExtraTrees : score=4.057, val_score=6.267, elasped=15.5s
  Total(balanced): score=3.571, val_score=5.531, elasped=105.8s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=6.130, val_score=6.146, elasped=7.9s
  [2/5] CatBoost   : score=4.326, val_score=5.372, elasped=36.0s
  [3/5] XGBoost    : score=0.694, val_score=6.012, elasped=7.6s
  [4/5] LightGBM   : score=4.778, val_score=5.561, elasped=17.0s
  [5/5] ExtraTrees : score=3.875, val_score=5.978, elasped=15.0s
  Total(balanced): score=3.670, val_score=5.386, elasped=83.4s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=6.130, val_score=6.325, elasped=7.7s
  [2/5] CatBoost   : score=4.182, val_score=5.605, elasped=42.4s
  [3/5] XGBoost    : score=0.825, val_score=5.906, el

Segment: mitsubishi, Length: 556:  95%|█████████▌| 19/20 [5:21:06<09:21, 561.55s/it]

> [K-Fold] 1/5
  [1/5] ElasticNet : score=5.151, val_score=6.399, elasped=3.1s
  [2/5] CatBoost   : score=4.423, val_score=6.453, elasped=23.4s
  [3/5] XGBoost    : score=0.974, val_score=7.735, elasped=3.6s
  [4/5] LightGBM   : score=5.407, val_score=6.827, elasped=9.0s
  [5/5] ExtraTrees : score=3.516, val_score=7.225, elasped=1.9s
  Total(balanced): score=3.553, val_score=6.162, elasped=41.0s
> [K-Fold] 2/5
  [1/5] ElasticNet : score=5.194, val_score=6.164, elasped=3.0s
  [2/5] CatBoost   : score=4.162, val_score=5.959, elasped=25.7s
  [3/5] XGBoost    : score=1.324, val_score=7.611, elasped=3.5s
  [4/5] LightGBM   : score=4.963, val_score=6.923, elasped=11.3s
  [5/5] ExtraTrees : score=2.753, val_score=7.932, elasped=2.0s
  Total(balanced): score=3.424, val_score=6.148, elasped=45.5s
> [K-Fold] 3/5
  [1/5] ElasticNet : score=5.350, val_score=6.297, elasped=3.0s
  [2/5] CatBoost   : score=4.744, val_score=6.226, elasped=17.0s
  [3/5] XGBoost    : score=2.499, val_score=6.829, elaspe

Segment: mitsubishi, Length: 556: 100%|██████████| 20/20 [5:24:36<00:00, 973.85s/it]

CPU times: user 1d 2h 48min 47s, sys: 2h 54min 34s, total: 1d 5h 43min 22s
Wall time: 5h 24min 55s





In [101]:
import os
import pickle

# Persist the artifacts produced by the per-segment training loop so that the
# inference cells can be re-run later without refitting:
#   models       - indexed by segment; each entry is the collection of K-fold models
#   feature_info - indexed by segment; holds the 'features' / 'oh_features' column lists
#   scores       - validation scores collected during training
# The directory is created up front: open(..., 'wb') does not create missing
# parent directories and would otherwise raise FileNotFoundError after training.
os.makedirs('./model_checkpoints', exist_ok=True)

with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'wb') as f:
    pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'wb') as f:
    pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'wb') as f:
    pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [102]:
# import pickle
# with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'rb') as f:
# 	models = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'rb') as f:
# 	feature_info = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'rb') as f:
# 	scores = pickle.load(f)

In [103]:
# pd.DataFrame(
#     np.array(scores).reshape(100,5),
#     columns=['segment','k','n_tr','n_val','score']
# ).sort_values(['segment','k'])

In [116]:
# Inference: every segment is scored by its own K-fold models, and the fold
# predictions are averaged into a single prediction per row.
X = train_fn.drop(target_feature, axis=1)
y = train_fn[target_feature]

X_test = test_fn.copy()

# The encoder is fitted on the training features only, then applied to both splits.
ohe = OneHotEncoder()
ohe.fit(X, cat_features)
X_oh = ohe.transform(X)
X_test_oh = ohe.transform(X_test)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    # Columns recorded for this segment (plain and one-hot variants).
    seg_features = feature_info[segment]['features']
    seg_oh_features = feature_info[segment]['oh_features']

    # Rows of this segment, train split.
    train_data = X.loc[X['segment'] == segment, seg_features]
    train_data_oh = X_oh.loc[X_oh['segment'] == segment, seg_oh_features]
    # Rows of this segment, test split.
    test_data = X_test.loc[X_test['segment'] == segment, seg_features]
    test_data_oh = X_test_oh.loc[X_test_oh['segment'] == segment, seg_oh_features]

    # One fitted model per fold.
    kfold_models = models[segment]

    # Train split: ground truth next to the fold-averaged prediction,
    # keyed by the original row index so the frames can be re-sorted later.
    seg_true = y[X['segment'] == segment].values.flatten()
    tr_fold_preds = [model.predict(train_data, train_data_oh) for model in kfold_models]
    tr_pred_df = pd.DataFrame({
        'true': seg_true,
        'pred': np.mean(tr_fold_preds, axis=0),
    })
    tr_pred_df.index = train_data.index
    if CFG.TARGET_TRANSFORMATION:
        # The stored target was log-transformed; only the 'true' column is mapped back here.
        tr_pred_df['true'] = np.exp(tr_pred_df['true'])

    # Test split: fold-averaged prediction only.
    te_fold_preds = [model.predict(test_data, test_data_oh) for model in kfold_models]
    te_pred_df = pd.DataFrame({
        'pred': np.mean(te_fold_preds, axis=0),
    })
    te_pred_df.index = test_data.index

    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [01:16<00:00,  3.84s/it]


In [117]:
# Train-set check: stack the per-segment frames, restore the original row
# order, and report the mean absolute error of the averaged predictions.
tr_pred_df = pd.concat(tr_pred_list, axis=0)
tr_pred_df = tr_pred_df.sort_index()
mean_absolute_error(
    y_true=tr_pred_df['true'],
    y_pred=tr_pred_df['pred'],
)

4.328599687923594

In [118]:
# def abline(intercept,slope,**kwargs):
#     axes = plt.gca()
#     x_vals = np.array(axes.get_xlim())
#     y_vals = intercept + slope * x_vals
#     plt.plot(x_vals, y_vals, '--',**kwargs)

# offset = 0.05
# min_value = min(tr_pred_df.true.min(),tr_pred_df.pred.min())*(1-offset)
# max_value = min(tr_pred_df.true.max(),tr_pred_df.pred.max())*(1+offset)

# plt.figure(figsize=(15,7))
# sns.scatterplot(x=tr_pred_df.true,y=tr_pred_df.pred)
# plt.xlim(min_value,max_value)
# plt.ylim(min_value,max_value)
# abline(0,1,color='red',linestyle='--')
# plt.show()

In [119]:
# Stack the per-segment test predictions and restore the original row order.
te_pred_df = pd.concat(te_pred_list, axis=0)
te_pred_df = te_pred_df.sort_index()
te_pred_df.head()

Unnamed: 0,pred
0,83.212244
1,25.85396
2,88.990835
3,126.141661
4,51.476805


In [120]:
import os

# Fill the sample submission with the test predictions and write it out.
submit = pd.read_csv('./data/sample_submission.csv')

# Predictions are assigned by position, so there must be exactly one per
# submission row. The inference loop only iterates over segments found in the
# training data, so test rows from any other segment receive no prediction and
# show up here as a row-count mismatch.
if len(te_pred_df) != len(submit):
    raise ValueError(
        f'prediction count ({len(te_pred_df)}) does not match submission rows '
        f'({len(submit)}); check for test segments that are absent from the training data'
    )
submit['가격'] = te_pred_df.pred.values

# to_csv does not create missing parent directories.
os.makedirs('./out', exist_ok=True)
submit.to_csv('./out/11_ensemble_segment_브랜드_kfold_logy.csv',index=False)

<br>

## 참조 pycaret

In [None]:
# from pycaret import regression

In [None]:
# %%time

# data = train_fn[train_fn.segment==segment_list[0]]
# # data['가격'] = np.exp(data['가격'])
# print(len(data))

# regression.setup(data=data,target='가격',remove_outliers=True,verbose=True)
# best = regression.compare_models(n_select=5,fold=5)