# Library Setting

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [2]:
class CFG:
    """Notebook-wide configuration constants (seed, feature-engineering and training settings)."""
    # Random seed, passed as random_state to the train/validation split and to the models.
    SEED = 0
    
    # Maximum number of categorical features combined per group in FeatureEngineering.fit.
    SUBSET_DEPTH = 3
    # When True, the interaction-term cell adds pairwise product features.
    INTERACTION = False
    # p-value cutoff used by the feature-selection cells.
    FS_ALPHA = 0.01
    
    # Not referenced in the visible part of the notebook — presumably a CV fold count; TODO confirm.
    N_SPLITS = 5
    
    # Learning rate, iteration cap and early-stopping patience used for CatBoost and LightGBM.
    LR = 0.003
    EPOCHS = 30000
    ES = 300
    # XGBoost-specific learning rate, number of estimators and early-stopping patience.
    XGB_LR = 0.3     # default
    XGB_EPOCHS = 1000 # default
    XGB_ES = 10

<br></br>

# Data

## Data Load

In [3]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [4]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [5]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Resetting Columns Type

In [6]:
class TypeResetting:
    """Assigns every column to a feature group and casts it to that group's dtype.

    Groups: target ('가격'), unused ('ID'), dummy (fuel-type indicator flags),
    categorical, segment and numerical (every remaining column).
    """

    def __init__(self):
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']
        self.seg_features = []

    def add_categorical_features(self,cat_features):
        """Register additional columns as categorical (extends the list in place)."""
        self.cat_features.extend(cat_features)

    def delete_categorical_features(self,cat_features):
        """Remove the given columns from the categorical group."""
        self.cat_features = list(filter(lambda name: name not in cat_features, self.cat_features))

    def add_segment_features(self,segment_features):
        """Enable the 'segment' column and stop treating its source columns as categorical."""
        self.seg_features = ['segment']
        self.cat_features = list(filter(lambda name: name not in segment_features, self.cat_features))

    def fit(self,data):
        """Derive the feature groups from the columns of ``data``.

        Raises:
            ValueError: if segment features were enabled but ``data`` has no 'segment' column.
        """
        segment_enabled = len(self.seg_features)>0
        if segment_enabled and ('segment' not in data.columns):
            raise ValueError("segment column name must be 'segment'")
        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']
        # Every column that belongs to no other group is treated as numerical.
        reserved = (self.target_feature+self.unuse_features+self.dummy_features
                    +self.cat_features+self.seg_features)
        self.num_features = [name for name in data.columns if name not in reserved]

    def transform(self,data):
        """Return a copy of ``data`` with group dtypes applied and unused columns dropped."""
        d = data.copy()
        # (columns, dtype that needs no cast, dtype to cast to) — processed in this order.
        casting_plan = (
            (self.dummy_features, int,    int),
            (self.cat_features,   object, str),
            (self.num_features,   float,  float),
            (self.seg_features,   object, str),
        )
        for columns, current_type, wanted_type in casting_plan:
            for name in columns:
                if d[name].dtypes!=current_type:
                    d[name] = d[name].astype(wanted_type)
        # The unused columns may be absent, so only drop the ones that exist.
        present_unused = [name for name in self.unuse_features if name in d.columns]
        d = d.drop(columns=present_unused)
        return d

    def fit_transform(self,data):
        """Run ``fit`` followed by ``transform`` on the same frame."""
        self.fit(data)
        return self.transform(data)

    def get_feature_type(self):
        """Publish the fitted feature groups (except the segment group) as module-level globals."""
        globals().update({
            'target_feature': self.target_feature,
            'unuse_features': self.unuse_features,
            'dummy_features': self.dummy_features,
            'cat_features':   self.cat_features,
            'num_features':   self.num_features,
        })

In [7]:
# Derive the feature groups from the training columns and publish them as globals
# (target_feature, unuse_features, dummy_features, cat_features, num_features).
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

# Apply the same dtype casting to both sets; the 'ID' column is dropped here.
train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [8]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    """Report, per column, how many distinct values occur in only one of the two sets.

    Prints the counts for every column in ``cat_features`` and returns the list of
    columns for which the test set contains no value unseen in the training set.
    """
    safe_columns = []
    total = len(cat_features)
    for position,col in enumerate(cat_features,start=1):
        print('[{}/{}] {}'.format(position,total,col))

        train_values = set(train[col].unique())
        test_values  = set(test[col].unique())
        train_exclusive = list(train_values-test_values)
        test_exclusive  = list(test_values-train_values)
        print(' - Only Train:',len(train_exclusive))
        print(' - Only Test :',len(test_exclusive))
        if len(test_exclusive)==0:
            safe_columns.append(col)
        else:
            print('******Warning******')
        print('')
    return safe_columns

In [9]:
# Columns whose test values all appear in train: 브랜드, 차량모델명, 판매구역, 모델출시년도
# The dummy (fuel flag) columns are checked too, then removed from the result below.
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
# NOTE: going through set() makes the order of the resulting list arbitrary.
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [10]:
not_test_only_features

['브랜드', '모델출시년도', '판매구역', '차량모델명']

In [11]:
# Count the training rows per (브랜드, 차량모델명, fuel flags) combination, smallest groups first.
seg_df = train_df.groupby(['브랜드','차량모델명']+dummy_features).size().reset_index().rename(columns={0:'cnt'}).sort_values('cnt')
print(seg_df.shape)
seg_df.head()

(430, 8)


Unnamed: 0,브랜드,차량모델명,압축천연가스(CNG),액화석유가스(LPG),경유,가솔린,하이브리드,cnt
214,mercedes-benz,gle-klasa,0,0,0,1,0,1
42,bmw,seria-5,0,0,0,0,1,1
63,bmw,x6,0,0,0,0,1,1
250,nissan,patrol,0,0,0,1,0,1
81,fiat,doblo,1,0,0,0,0,1


<br></br>

# New Features

In [12]:
# pd.Series([str(round(int(year)/100,1)) for year in train_df6['생산년도']]).value_counts()

In [13]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


In [14]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    """Return every subset of ``ss`` as a list of tuples, ordered by size (empty tuple first)."""
    subsets = []
    for size in range(len(ss)+1):
        subsets.extend(combinations(ss, size))
    return subsets

class FeatureEngineering:
    """Adds derived year/brand features and target-quantile features.

    ``fit`` builds, for every combination of up to ``subset_depth`` categorical
    features, a table with the 0/25/50/75/100 % quantiles of the target ('가격')
    per group. ``transform`` adds the derived features and left-merges every table.

    Note: when ``transform`` is applied to the same frame that was passed to
    ``fit``, each row's quantile features are computed from a group that
    includes that row's own target value.
    """

    # Quantile levels (in percent) computed for every group.
    _QUANTILES = (0,25,50,75,100)

    # Brand -> country of origin (looked up manually); unknown brands map to NaN.
    _BRAND_COUNTRY = {
        'skoda':'체코',
        'toyota':'일본','nissan':'일본','mazda':'일본','honda':'일본','mitsubishi':'일본',
        'mercedes-benz':'독일','audi':'독일','volkswagen':'독일','bmw':'독일','opel':'독일',
        'fiat':'이탈리아',
        'renault':'프랑스','citroen':'프랑스','peugeot':'프랑스',
        'ford':'미국',
        'kia':'한국','hyundai':'한국',
        'seat':'스페인',
        'volvo':'스웨덴',
    }

    # Country -> continent; unknown countries map to NaN.
    _COUNTRY_CONTINENT = {
        '체코':'유럽','독일':'유럽','이탈리아':'유럽','프랑스':'유럽','스페인':'유럽','스웨덴':'유럽',
        '일본':'아시아','한국':'아시아',
        '미국':'아메리카',
    }

    def __init__(self):
        pass

    def _get_quantile(self,x,col):
        """Return a one-row frame holding the quantiles of the non-null values of ``x``.

        Columns are named ``{col}_Q{q}`` for q in 0, 25, 50, 75, 100.
        """
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]

        agg_df = pd.DataFrame(index=[0])
        for q in self._QUANTILES:
            agg_df[f'{col}_Q{q}'] = np.quantile(x,q/100)

        return agg_df

    def _derived_features(self,data):
        """Return a copy of ``data`` with the five derived columns added."""
        d = data.copy()

        production_year = d['생산년도'].astype(float)
        release_year    = d['모델출시년도'].astype(float)

        # (1) Was the car produced in the year its model was released?
        d['출시년도생산여부'] = np.where(production_year==release_year,1,0)

        # (2) Number of years between the model release and production.
        d['출시이후생산년수'] = production_year-release_year

        # (3) Was the car produced before the recorded release year?
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) Country of origin of the brand.
        d['브랜드국적'] = [self._BRAND_COUNTRY.get(brand,np.nan) for brand in d['브랜드']]

        # (5) Continent of the brand's country.
        d['브랜드대륙명'] = [self._COUNTRY_CONTINENT.get(country,np.nan) for country in d['브랜드국적']]
        return d

    def fit(self,data,cat_features,subset_depth=1):
        """Build the per-group target-quantile tables from the training data.

        Args:
            data: training frame; must contain the target column '가격'.
            cat_features: categorical columns to group by.
            subset_depth: maximum number of columns combined in one grouping.

        Raises:
            AssertionError: if '가격' is missing or ``subset_depth`` exceeds
                the number of categorical features.
        """
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'

        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명']

        # (6) Quantiles of the target per categorical grouping.
        # Only combinations of size 1..subset_depth are generated, instead of
        # enumerating all 2^n subsets and filtering afterwards.
        all_subset_list = [
            list(subset)
            for size in range(1,subset_depth+1)
            for subset in combinations(cat_features,size)
        ]

        self.agg_dict = {}
        for subset in tqdm(all_subset_list,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset_name = '_'.join(subset)
            agg_fn = data.groupby(subset)['가격'].apply(lambda x: self._get_quantile(x,subset_name)).reset_index()
            # Keep exactly the group keys and the quantile columns. This discards the
            # helper index level added by apply() without relying on a name pattern
            # that could also match a real feature.
            quantile_cols = [f'{subset_name}_Q{q}' for q in self._QUANTILES]
            agg_fn = agg_fn[subset+quantile_cols]
            self.agg_dict[subset_name] = agg_fn

    def transform(self,data):
        """Add the derived features and left-merge every fitted quantile table.

        Groups unseen during ``fit`` get NaN quantiles. The returned frame has a
        fresh index because it is the result of ``pd.merge``.
        """
        data = self._derived_features(data)
        for key,agg_fn in self.agg_dict.items():
            # Take the merge keys from the table itself rather than splitting the
            # dictionary key on '_', which breaks for feature names containing '_'.
            quantile_prefix = f'{key}_Q'
            merge_keys = [col for col in agg_fn.columns if not str(col).startswith(quantile_prefix)]
            data = pd.merge(data,agg_fn,how='left',on=merge_keys)
        return data

    def fit_transform(self,data,cat_features,subset_depth=1):
        """Run ``fit`` and then ``transform`` on the same frame."""
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [15]:
# Fit the target-quantile tables on the training set only, using the categorical
# columns whose test values all occur in train, then add the features to both sets.
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=3): 100%|██████████| 14/14 [00:19<00:00,  1.36s/it]


In [16]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명']

In [17]:
# Re-derive the feature groups now that the engineered columns exist: the five derived
# columns are registered as categorical, the quantile columns fall into num_features.
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

# NOTE: train_df3 / test_df3 are rebound here, so re-running this cell alone operates
# on the already-cast frames.
train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [18]:
train_df3.shape

(57920, 89)

<br></br>

# EDA

In [19]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [20]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    """Absolute Pearson correlation coefficient between ``x`` and ``y``."""
    corr_matrix = np.corrcoef(x,y)
    return np.abs(corr_matrix)[0,1]

class InteractionTerm:
    """Selects and builds pairwise product features between numerical columns.

    A product ``a*b`` is kept only when its absolute correlation with both
    ``a`` and ``b`` is below the cutoff.
    """

    def __init__(self):
        pass

    def fit(self,data,num_features,corr_cutoff=0.7):
        """Choose the products to create and store their names in ``interaction_list``.

        Names have the form ``'{col_i}*{col_j}'`` where ``col_i`` comes later in
        ``num_features`` than ``col_j``. Also silences pandas PerformanceWarning globally.
        """
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)

        frame = data.copy()
        self.interaction_list = []
        for later in range(len(num_features)):
            for earlier in range(later):
                col_i = num_features[later]
                col_j = num_features[earlier]

                product = frame[col_i]*frame[col_j]
                # A product that is highly correlated with either parent is skipped.
                redundant_with_i = get_abs_corr(product,frame[col_i])>=corr_cutoff
                redundant_with_j = get_abs_corr(product,frame[col_j])>=corr_cutoff
                if not (redundant_with_i | redundant_with_j):
                    self.interaction_list.append(f'{col_i}*{col_j}')

    def transform(self,data):
        """Return a copy of ``data`` with every selected product column added."""
        result = data.copy()
        for name in self.interaction_list:
            left,right = name.split('*')
            result[name] = result[left]*result[right]
        return result

    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        """Run ``fit`` and then ``transform`` on the same frame."""
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [21]:
# Without interactions, the *_df4 frames are plain copies of the *_df3 frames.
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    # Select the product features on the training set, then create them on both sets.
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.7,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    # Refresh the global feature groups so the new product columns are in num_features.
    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

<br></br>

# Feature Selection

In [22]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [23]:
def log_offset(x):
    """Natural log of ``x`` after shifting it so that every value is strictly positive.

    The shift depends on the smallest non-NaN value of ``x``:
      * minimum > 0  -> no shift
      * minimum == 0 -> shift by 1e-3
      * minimum < 0  -> shift by ``-minimum + 1e-3`` (the minimum maps to 1e-3);
                        the minimum is printed in this case.

    Args:
        x: numeric array-like supporting ``x + scalar`` (e.g. ndarray or Series).

    Returns:
        ``np.log(x + offset)`` with the same type/shape as ``x + offset``.
    """
    # NaN-aware minimum: the builtin min() gives order-dependent results with NaN.
    minimum = np.nanmin(np.asarray(x,dtype=float))
    if minimum>0:
        offset = 0
    elif minimum==0:
        offset = 1e-3
    else:
        # The shift must be positive. Adding the (negative) minimum itself would
        # push the values further below zero and make the log return NaN.
        offset = -minimum+1e-3
        print('minimum = {:.3f}'.format(minimum))
    return np.log(x+offset)

<br></br>

## Categorical Features

In [24]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [25]:
# Only categorical features with at most 100 distinct values are tested.
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) One-way ANOVA of the target on each categorical feature; collect the p-values.
#     The significance cutoff applied in the following cells is CFG.FS_ALPHA.
pvalue_list = []
for col in tqdm(check_cat_features):
    # Rename to a fixed name so the formula does not depend on the column name.
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    # First row, last column of the ANOVA table: the p-value of the C(feature) term.
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 9/9 [00:03<00:00,  2.31it/s]


In [26]:
# (2) Features that were not significant in (1) are re-tested after a log transform;
#     the next cell drops those that are still not significant.
# NOTE(review): this loop has only ever run with zero iterations (the output shows
#   `0it`). As written it would fail on the first non-significant feature:
#   - train_df4 has no 'target' column (the target column is '가격') -> KeyError;
#   - log_offset() is applied to a categorical (string) feature;
#   - the formula regresses the feature on C(target), the reverse of step (1).
#   It looks copied from a pipeline with a categorical target — confirm the intended
#   test before relying on this cell.
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    d = train_df4[[col,'target']].rename(columns={col:'feature'})
    d['feature'] = log_offset(d['feature'])
    
    model = ols(f'feature ~ C(target)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list2.append([col,pvalue])
    
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [27]:
# Split the re-tested features: still not significant -> drop, significant after log -> log-transform.
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

# Apply the same drop / log steps to train and test.
# NOTE(review): log_offset() derives its offset from the frame it is given, so train
#   and test can be shifted by different amounts. These are categorical (string)
#   columns, for which a log transform is not defined — both lists are empty in the
#   recorded run, so this path has not been exercised.
train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [28]:
import scipy

In [29]:
# (1) Pearson correlation test between the target and each numerical feature;
#     collect the p-values (the cutoff applied in the next cell is CFG.FS_ALPHA).
pvalue_list = []
for col in num_features:
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df.round(4).head()

In [30]:
# (2) Features that were not significant in (1) are re-tested after a log transform;
#     the next cell drops those that are still not significant.
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],log_offset(train_df5[col]))
    pvalue_list2.append([col,pvalue])
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df2.round(4).head()

100%|██████████| 1/1 [00:00<00:00, 61.03it/s]


In [31]:
# Split the re-tested features: still not significant -> drop, significant after log -> log-transform.
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

# Apply the same drop / log steps to train and test.
# NOTE(review): log_offset() derives its offset from the frame it is given, so train
#   and test are shifted by the same amount only when their minima fall in the same case.
train_df6 = train_df5.copy()
train_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df6[col] = log_offset(train_df6[col])
    
test_df6 = test_df5.copy()
test_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df6[col] = log_offset(test_df6[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 1
  - feature_name : ['판매구역_Q0']


<br></br>

# Make Segment

In [32]:
def make_segment(data,segment: list):
    """Return a copy of ``data`` where the ``segment`` columns are merged into one.

    The values of the given columns are joined row-wise with '___' into a new
    'segment' column and the source columns are removed.
    """
    joined = data[segment].apply(lambda row: '___'.join(row),axis=1)
    result = data.copy()
    result['segment'] = joined
    return result.drop(columns=segment)

In [33]:
# One model is trained per (브랜드, 차량모델명) pair; both columns are folded into 'segment'.
# NOTE: the training loop later reuses the name `segment` as its loop variable, so this
# list is overwritten once that cell has run.
segment = ['브랜드','차량모델명']
train_df7 = make_segment(train_df6,segment)
test_df7  = make_segment(test_df6 ,segment)

In [34]:
# Every test segment must also exist in train; otherwise there would be no model for it.
test_only = list(set(test_df7.segment.unique())-set(train_df7.segment.unique()))
assert len(test_only)==0, \
    "Segment exists only in the test set ({})".format(len(test_only))

In [35]:
# Segments that never occur in the test set are removed from the training data.
train_only = list(set(train_df7['segment'].unique())-set(test_df7['segment'].unique()))

n_asis = len(train_df7)
n_tobe = len(train_df7[~train_df7.segment.isin(train_only)])
# NOTE: train_df7 is rebound here; re-running this cell alone reports equal before/after counts.
train_df7 = train_df7[~train_df7.segment.isin(train_only)]
print('> Train에만 존재하는 Segment 제거')
print(' - 데이터수 : {:,} -> {:,}'.format(n_asis,n_tobe))
print(' - 세그먼트수 : {:,}'.format(train_df7['segment'].nunique()))

> Train에만 존재하는 Segment 제거
 - 데이터수 : 57,920 -> 57,917
 - 세그먼트수 : 141


In [36]:
vc = train_df7['segment'].value_counts().sort_values()
display(vc.head())
print('...')
display(vc.tail())

segment
audi___s3                  7
opel___frontera           14
opel___omega              17
nissan___patrol           20
ford___transit-connect    20
Name: count, dtype: int64

...


segment
ford___focus         1647
volkswagen___golf    1910
bmw___seria-3        1998
audi___a4            2082
opel___astra         2537
Name: count, dtype: int64

In [37]:
# Refresh the global feature groups for the segmented frames: the derived columns are
# categorical, and the columns folded into 'segment' are removed from cat_features.
# NOTE: this relies on `segment` still being the list of column names defined above.
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.add_segment_features(segment)
type_resetor.fit(train_df7)
type_resetor.get_feature_type()

train_df7 = type_resetor.transform(train_df7)
test_df7  = type_resetor.transform(test_df7)

In [38]:
cat_features

['판매도시',
 '판매구역',
 '생산년도',
 '모델출시년도',
 '출시년도생산여부',
 '출시이후생산년수',
 '출시이전생산여부',
 '브랜드국적',
 '브랜드대륙명']

In [39]:
train_df7.head()

Unnamed: 0,생산년도,모델출시년도,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,브랜드_모델출시년도_Q0,브랜드_모델출시년도_Q25,브랜드_모델출시년도_Q50,브랜드_모델출시년도_Q75,브랜드_모델출시년도_Q100,브랜드_판매구역_Q0,브랜드_판매구역_Q25,브랜드_판매구역_Q50,브랜드_판매구역_Q75,브랜드_판매구역_Q100,브랜드_차량모델명_Q0,브랜드_차량모델명_Q25,브랜드_차량모델명_Q50,브랜드_차량모델명_Q75,브랜드_차량모델명_Q100,모델출시년도_판매구역_Q0,모델출시년도_판매구역_Q25,모델출시년도_판매구역_Q50,모델출시년도_판매구역_Q75,모델출시년도_판매구역_Q100,모델출시년도_차량모델명_Q0,모델출시년도_차량모델명_Q25,모델출시년도_차량모델명_Q50,모델출시년도_차량모델명_Q75,모델출시년도_차량모델명_Q100,판매구역_차량모델명_Q0,판매구역_차량모델명_Q25,판매구역_차량모델명_Q50,판매구역_차량모델명_Q75,판매구역_차량모델명_Q100,브랜드_모델출시년도_판매구역_Q0,브랜드_모델출시년도_판매구역_Q25,브랜드_모델출시년도_판매구역_Q50,브랜드_모델출시년도_판매구역_Q75,브랜드_모델출시년도_판매구역_Q100,브랜드_모델출시년도_차량모델명_Q0,브랜드_모델출시년도_차량모델명_Q25,브랜드_모델출시년도_차량모델명_Q50,브랜드_모델출시년도_차량모델명_Q75,브랜드_모델출시년도_차량모델명_Q100,브랜드_판매구역_차량모델명_Q0,브랜드_판매구역_차량모델명_Q25,브랜드_판매구역_차량모델명_Q50,브랜드_판매구역_차량모델명_Q75,브랜드_판매구역_차량모델명_Q100,모델출시년도_판매구역_차량모델명_Q0,모델출시년도_판매구역_차량모델명_Q25,모델출시년도_판매구역_차량모델명_Q50,모델출시년도_판매구역_차량모델명_Q75,모델출시년도_판매구역_차량모델명_Q100,segment
0,2018,2014,KAT,SLA,85231.0,999.0,0,0,1,0,0,51.74,0,4.0,0,체코,유럽,2.07,29.77,57.85,90.8375,156.0,1.17,55.6725,77.87,103.87,156.0,0.262364,24.7,43.55,77.87,156.0,2.21,18.785,40.95,58.305,125.25,12.87,46.67,55.25,64.87,118.17,3.12,32.37,55.77,92.3,156.0,2.21,18.785,40.95,58.305,125.25,26.78,53.76,75.4,101.27,156.0,12.87,46.67,55.25,64.87,118.17,3.12,20.005,45.37,57.2,114.59,33.8,47.6775,52.585,61.5875,103.48,12.87,46.67,55.25,64.87,118.17,3.12,20.005,45.37,57.2,114.59,33.8,47.6775,52.585,61.5875,103.48,skoda___fabia
1,2010,2006,RKO,SWI,135000.0,1598.0,0,0,1,0,0,41.47,0,4.0,0,일본,아시아,1.17,28.6,54.6,84.37,156.0,3.89,21.97,28.47,44.07,156.0,1.095273,22.49,33.8,58.37,156.0,14.69,36.4,61.1,80.405,116.87,14.69,25.87,33.15,44.07,85.67,8.45,28.275,46.67,72.71,155.87,14.69,36.4,61.1,80.405,116.87,9.75,22.815,28.6,43.94,124.8,14.69,24.57,28.6,36.4,64.92,17.55,28.6975,53.885,72.02,109.07,17.55,27.625,36.855,51.025,85.67,14.69,24.57,28.6,36.4,64.92,17.55,28.6975,53.885,72.02,109.07,17.55,23.205,28.275,37.8625,55.77,toyota___auris
2,2002,2002,GNI,WIE,255223.0,1796.0,0,0,1,0,0,17.81,1,0.0,0,독일,유럽,2.99,23.4,44.07,77.84,156.0,2.6,11.63,16.89,28.6,155.87,0.482426,21.97,40.755,73.45,156.0,9.49,23.27,31.2,37.6025,129.87,7.15,23.4,29.77,36.27,155.87,3.06,22.88,42.38,76.27,155.87,9.49,23.27,31.2,37.6025,129.87,3.24,10.53,15.21,25.35,102.7,12.87,25.87,32.49,38.87,129.87,11.05,18.165,28.21,33.73,37.7,12.35,23.92,29.89,33.67,63.7,12.87,25.87,32.49,38.87,129.87,11.05,18.165,28.21,33.73,37.7,12.87,23.355,29.77,35.49,37.7,mercedes-benz___clk-klasa
3,2006,2001,EHX,WIE,238000.0,2184.0,0,1,0,0,0,18.2,0,5.0,0,일본,아시아,2.86,35.1,50.57,81.9,155.87,2.6,10.4,15.47,22.09,132.47,0.482426,21.97,40.755,73.45,156.0,8.71,78.91,97.11,116.87,155.87,3.64,9.62,13.65,19.37,29.77,3.64,32.37,51.74,75.27,140.27,8.71,78.91,97.11,116.87,155.87,3.64,8.97,12.155,16.9,64.87,8.71,17.81,19.37,22.88,29.77,18.2,88.335,106.59,116.935,134.55,3.64,10.53,12.22,18.85,24.57,8.71,17.81,19.37,22.88,29.77,18.2,88.335,106.59,116.935,134.55,18.2,19.5,20.8,22.685,24.57,nissan___x-trail
4,2007,2007,OSW,MAL,251000.0,1910.0,0,1,0,0,0,17.55,1,0.0,0,이탈리아,유럽,3.04,17.55,34.125,65.0,143.89,5.46,25.77,37.57,51.87,156.0,0.732368,24.57,42.77,74.1,156.0,8.45,18.07,21.45,24.7975,38.87,8.45,18.07,21.45,24.7975,38.87,4.81,18.13,35.1,78.26,128.63,8.45,18.07,21.45,24.7975,38.87,9.1,25.09,38.87,55.9,156.0,8.45,18.07,21.45,24.7975,38.87,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,8.45,18.07,21.45,24.7975,38.87,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,fiat___bravo


<br></br>

# Modeling

In [40]:
import os
def mkdir(paths):
    """Create one or more directories if they do not exist yet.

    Args:
        paths: a single path (``str`` or ``os.PathLike``) or an iterable of paths.

    Missing parent directories are created as well. A message is printed for
    every directory that had to be created; existing directories are left alone.
    """
    # A single path is wrapped so that it is not iterated character by character.
    if isinstance(paths,(str,os.PathLike)):
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            # makedirs also creates intermediate directories; os.mkdir would
            # raise FileNotFoundError for a nested path with a missing parent.
            os.makedirs(path,exist_ok=True)

In [41]:
## dummy_features는 한가지만 속함
# X[dummy_features].apply(lambda x: np.sum(x),axis=1).value_counts()

def add_fuel_type(data,dummy_features):
    """Collapse the one-hot fuel columns into a single 'fuel_type' column.

    For every row, 'fuel_type' is the name of the first column in
    ``dummy_features`` whose value equals 1. The indicator columns are removed
    from the returned copy. A row with no flag set raises IndexError.
    """
    def _first_active(row):
        # Position of the first indicator equal to 1 within dummy_features.
        active_positions = np.nonzero((row==1).to_numpy())[0]
        return dummy_features[active_positions[0]]

    fuel_type = data[dummy_features].apply(_first_active,axis=1)
    result = data.copy()
    result['fuel_type'] = fuel_type
    return result.drop(columns=dummy_features)

In [42]:
mkdir('./model_checkpoints')
mkdir('./model_checkpoints/segment_catboost')
mkdir('./model_checkpoints/segment_weightedensemble')

<br>

## CatBoost
- public score : 6.3578576144

In [43]:
gc.collect()

0

In [44]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool



In [45]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [46]:
%%time
# 30분

# Train one CatBoost regressor per segment on an 80/20 split stratified by segment.
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

# Replace the one-hot fuel flags with a single categorical 'fuel_type' column.
X = add_fuel_type(X,dummy_features)
new_cat_features = cat_features + ['fuel_type']

X_train, X_valid, y_train, y_valid = train_test_split(
    X,y,test_size=0.2,random_state=CFG.SEED,stratify=X['segment'])
# Every segment must be present on both sides of the split.
assert X_train.segment.nunique()==X_valid.segment.nunique(), \
    "The number of segments in the training and the validation is different."

segment_list = X['segment'].unique()

models = {}        # segment -> fitted model
feature_info = {}  # segment -> feature / categorical-feature names used by its model
scores = []        # rows of [segment, n_train, n_valid, validation MAE]
pbar = tqdm(segment_list)
# NOTE: the loop variable rebinds the notebook-level name `segment`
# (previously the list of segment column names).
for segment in pbar:
    # Select the rows that belong to this segment.
    X_tr = X_train[X_train.segment==segment].drop('segment',axis=1)
    y_tr = y_train[X_train.segment==segment]
    X_va = X_valid[X_valid.segment==segment].drop('segment',axis=1)
    y_va = y_valid[X_valid.segment==segment]
    pbar.set_description('Segment: {}, Length: Train({}), Validation({})'.format(segment,len(X_tr),len(X_va)))
    
    # Drop the columns that are constant within this segment's training rows.
    unique_info = X_tr.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        X_tr = X_tr.drop(unique_cols,axis=1)
        X_va = X_va.drop(unique_cols,axis=1)
        
    # Keep only the categorical features that survived the constant-column filter.
    fixed_cat_features = [col for col in new_cat_features if col in X_tr.columns]
    train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
    valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)
    
    # define the model
    model = CatBoostRegressor(
        loss_function='MAE',
        random_state=CFG.SEED,
        iterations=CFG.EPOCHS,
        learning_rate=CFG.LR,
        allow_writing_files=False,
    )
    
    # fit the model (early stopping is monitored on this segment's validation rows)
    model.fit(
        train_dataset,
        eval_set=valid_dataset,
        early_stopping_rounds=CFG.ES,
        verbose=0,
        #metric_period=CFG.EPOCHS//5,
    )
    
    # save the model
    model.save_model(f'./model_checkpoints/segment_catboost/{segment}.cbm')

    # calculate the score
    # NOTE: scored on the same validation rows that drove early stopping, so the
    # reported MAE is an optimistic estimate.
    y_pred = model.predict(valid_dataset).flatten()
    y_true = y_va.values
    score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
    
    # append
    models[segment] = model
    feature_info[segment] = {'cat_features':fixed_cat_features,'features':X_tr.columns.tolist()}
    scores.append([segment,len(X_tr),len(X_va),score])

Segment: audi___s3, Length: Train(6), Validation(1): 100%|██████████| 141/141 [29:55<00:00, 12.74s/it]                 

CPU times: user 1h 4min, sys: 16min 26s, total: 1h 20min 26s
Wall time: 29min 58s





In [47]:
# Persist the per-segment models, their feature lists and the validation scores.
# NOTE: pickle files must only be loaded from trusted sources.
import pickle
with open('./model_checkpoints/segment_cat_models.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_feature_info.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_scores.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
pd.DataFrame(scores,columns=['segment','n_tr','n_val','score']).sort_values('n_tr',ascending=False).head()

Unnamed: 0,segment,n_tr,n_val,score
20,opel___astra,2029,508,4.097556
5,audi___a4,1665,417,5.411264
43,bmw___seria-3,1598,400,7.392699
45,volkswagen___golf,1528,382,5.372326
29,ford___focus,1317,330,4.408158


In [49]:
# inference: predict train and test rows with the model of their own segment
X = train_fn.drop(target_feature,axis=1)
X = add_fuel_type(X,dummy_features)
y = train_fn[target_feature]

X_test = add_fuel_type(test_fn,dummy_features)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # Use exactly the feature columns the segment's model was trained on.
    # (1) train
    train_data = X[X.segment==segment][feature_info[segment]['features']]
    train_dataset = Pool(train_data,cat_features=feature_info[segment]['cat_features'])
    # (2) test
    test_data = X_test[X_test.segment==segment][feature_info[segment]['features']]
    test_dataset = Pool(test_data,cat_features=feature_info[segment]['cat_features'])
    
    ## model
    model = models[segment]
    
    ## prediction
    # The original row index is kept so the per-segment frames can be re-sorted later.
    # (1) train (these rows include the ones the model was fitted on)
    tr_pred_df = pd.DataFrame({
        'true':y[X.segment==segment].values.flatten(),
        'pred':model.predict(train_dataset),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'pred':model.predict(test_dataset),
    })
    te_pred_df.index = test_data.index
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 141/141 [00:02<00:00, 48.25it/s]


In [50]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

4.696386819505149

In [51]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

Unnamed: 0,pred
0,79.440798
1,25.731654
2,99.688314
3,126.403475
4,54.655225


In [52]:
# Write the CatBoost submission. The predictions are assigned positionally, which
# relies on te_pred_df being sorted back into the original test row order.
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
# The output folder is not created anywhere else in the notebook, so make sure it
# exists; otherwise to_csv fails on a fresh checkout.
mkdir('./out')
submit.to_csv('./out/3_catboost_segment.csv',index=False)

<br>

## Weighted Ensemble
- public score : 6.5472181851

In [53]:
import pandas as pd
import warnings

class OneHotEncoder:
    """Minimal drop-first one-hot encoder for DataFrame columns.

    ``fit`` records, for every column, all sorted unique values except the first
    one. ``transform`` adds one 0/1 column ``{col}_{value}`` per recorded pair and
    drops the encoded source columns.
    """

    def __init__(self):
        pass

    def fit(self,data,columns):
        """Record the (column, value) pairs that become indicator columns.

        The smallest value of each column is the reference level and gets no
        indicator. A column with a single unique value therefore contributes
        no pair.
        """
        self.transform_list = []
        for col in columns:
            levels = sorted(data[col].unique())
            # Skip the first (reference) level.
            self.transform_list.extend([col,value] for value in levels[1:])

    def transform(self,data):
        """Return a copy of ``data`` with the fitted indicator columns added.

        Source columns that produced at least one indicator are dropped; a column
        that produced none is left untouched. Values unseen during ``fit`` are
        encoded as all zeros. Also silences pandas PerformanceWarning globally.
        """
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        new_data = data.copy()
        for col,value in self.transform_list:
            new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)
        # Ordered, de-duplicated source columns. Building this from the pairs works
        # for an empty transform_list too, where slicing np.array([])[:,0] raised
        # IndexError.
        drop_columns = list(dict.fromkeys(col for col,_ in self.transform_list))
        new_data.drop(columns=drop_columns,inplace=True)
        return new_data

In [54]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time
import pickle

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    """Blend of CatBoost, XGBoost and LightGBM weighted by inverse validation MAE.

    Every base regressor is fitted with early stopping against one validation
    set. Its weight is ``1 / MAE`` on that set, normalised so the weights sum to
    one; ``predict`` returns the weighted sum of the base predictions.

    CatBoost and LightGBM consume the raw frame (categorical columns listed in
    ``cat_features``); XGBoost consumes the one-hot encoded frame.

    Attributes set by ``fit`` / ``load_model``:
        cat_features: categorical column names of the raw frame.
        weights: numpy array of blend weights, ordered like ``regressors``.
        fitting_elapsed: fit time in seconds for each base regressor.
    """
    def __init__(self):
        super().__init__()
        self._get_regressors()
    
    def _get_regressors(self):
        """Instantiate the three base regressors from the notebook-level ``CFG`` settings."""
        max_depth = 10
        n_jobs = -1
        
        params_catboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'iterations' : CFG.EPOCHS,
            'loss_function': 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth': max_depth,
            'l2_leaf_reg' : 1,
        }
    
        params_xgboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'n_estimators' : CFG.XGB_EPOCHS,
            'objective': 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
    
        # NOTE(review): 'regression' is LightGBM's L2 objective, while CatBoost and
        # XGBoost above optimise MAE directly - confirm this difference is intended.
        params_lgb = {
            'objective': 'regression',
            'random_state':CFG.SEED,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'n_estimators' : CFG.EPOCHS,
            'metric': 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
        
        self.regressors = [
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lgb),
        ]
        self.regressors_name = ['CatBoost','XGBoost','LightGBM']
    
    def _as_category(self,data):
        """Return a copy of ``data`` with the categorical columns cast to 'category' dtype."""
        return data.astype({col:'category' for col in self.cat_features})
    
    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        """Fit all base regressors and derive their blend weights.

        Args:
            X: raw training features (DataFrame). Not modified.
            y: training target.
            eval_set: ``[(X_val, y_val)]`` - exactly one raw validation pair.
            oh_set: ``[(X_oh, X_val_oh)]`` - exactly one pair of one-hot encoded
                train / validation features (expected to be row-aligned with
                ``X`` / ``X_val``).
            cat_features: categorical column names present in ``X``.
            verbose: truthy to show a tqdm progress bar over the regressors.

        Returns:
            self

        Raises:
            AssertionError: if ``eval_set`` or ``oh_set`` does not hold exactly one entry.
            ValueError: if an unknown regressor name is encountered.
        """
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]
        
        self.cat_features = cat_features
        self.fitting_elapsed = []
        inverse_scores = []
        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)
        for name,regressor in pbar:
            s = time.time()
            if verbose:
                pbar.set_description(name)
            if name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=self.cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=self.cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                val_pred = regressor.predict(val_dataset)
            elif name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                val_pred = regressor.predict(X_val_oh)
            elif name=='LightGBM':
                # cast on copies so the caller's frames keep their dtypes
                X_lgb     = self._as_category(X)
                X_val_lgb = self._as_category(X_val)
                # silence UserWarning only around the LightGBM calls instead of
                # installing a filter for the rest of the session
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    # NOTE(review): the `verbose` fit argument was dropped in
                    # LightGBM 4.0; this call presumably targets 3.x - confirm.
                    regressor.fit(
                        X_lgb,y,
                        eval_set=[(X_val_lgb,y_val)],
                        verbose=-1,
                    )
                    val_pred = regressor.predict(X_val_lgb)
            else:
                raise ValueError('Unknown Regressor: {}'.format(name))
                
            score = mean_absolute_error(y_pred=val_pred,y_true=y_val)
            e = time.time()
            
            # a perfect validation score would otherwise give an infinite weight
            inverse_scores.append(1/max(score,np.finfo(float).eps))
            self.fitting_elapsed.append(e-s)
        
        # normalise as an array: dividing a plain list only works by accident
        # when the scores happen to be numpy scalars
        inverse_scores = np.asarray(inverse_scores,dtype=float)
        self.weights = inverse_scores/inverse_scores.sum()
        return self
                
    def predict(self,X,X_oh):
        """Return the weighted blend of the base regressors' predictions.

        Args:
            X: raw features (DataFrame) for CatBoost / LightGBM. Not modified.
            X_oh: one-hot encoded features for XGBoost, same length as ``X``.

        Returns:
            1-D numpy array with ``len(X)`` predictions (empty for empty input).

        Raises:
            AssertionError: if ``X`` and ``X_oh`` differ in length.
            ValueError: if an unknown regressor name is encountered.
        """
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"
        if len(X)==0:
            # nothing to score
            return np.zeros(0)
        
        pred_list = []
        for name,regressor in zip(self.regressors_name,self.regressors):
            if name=='CatBoost':
                y_pred = regressor.predict(Pool(X,cat_features=self.cat_features))
            elif name=='XGBoost':
                y_pred = regressor.predict(X_oh)
            elif name=='LightGBM':
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    y_pred = regressor.predict(self._as_category(X))
            else:
                raise ValueError('Unknown Regressor: {}'.format(name))
            
            pred_list.append(np.array(y_pred).flatten())
            
        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += pred*weight
            
        return final_pred
    
    def save_model(self,path):
        """Pickle the fitted state (features, weights, timings, regressors) to ``path``."""
        save_dict = {
            'cat_features' : self.cat_features,
            'weights' : self.weights,
            'fitting_elapsed' : self.fitting_elapsed,
            'regressors' : self.regressors,
        }
        with open(path, 'wb') as f:
            pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
            
    def load_model(self,path):
        """Restore the state written by ``save_model`` from ``path``.

        SECURITY: unpickling executes arbitrary code - only load files you created.
        """
        with open(path, 'rb') as f:
            save_dict = pickle.load(f)
            self.cat_features = save_dict['cat_features']
            self.weights = save_dict['weights']
            self.fitting_elapsed = save_dict['fitting_elapsed']
            self.regressors = save_dict['regressors']

In [None]:
# Run a garbage-collection pass to release unreferenced objects before the training cells below.
gc.collect()

In [55]:
# Work on copies so train_df7 / test_df7 are left untouched by the steps below.
train_fn, test_fn = train_df7.copy(), test_df7.copy()

In [74]:
%%time
# 120분

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

X_train, X_valid, y_train, y_valid = train_test_split(
    X,y,test_size=0.2,random_state=CFG.SEED,stratify=X['segment'])
assert X_train.segment.nunique()==X_valid.segment.nunique(), \
    "The number of segments in the training and the validation is different."

X_train_oh = X_oh.loc[X_train.index]
X_valid_oh = X_oh.loc[X_valid.index]

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)
for segment in pbar:
    # segment에 해당하는 데이터추출
    X_tr = X_train[X_train.segment==segment].drop('segment',axis=1)
    y_tr = y_train[X_train.segment==segment]
    X_va = X_valid[X_valid.segment==segment].drop('segment',axis=1)
    y_va = y_valid[X_valid.segment==segment]
    X_tr_oh = X_train_oh[X_train_oh.segment==segment].drop('segment',axis=1)
    X_va_oh = X_valid_oh[X_valid_oh.segment==segment].drop('segment',axis=1)
    pbar.set_description('Segment: {}, Length: Train({}), Validation({})'.format(segment,len(X_tr),len(X_va)))
    
    # unique인 컬럼 제외
    # (1) X
    unique_info = X_tr.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        X_tr = X_tr.drop(unique_cols,axis=1)
        X_va = X_va.drop(unique_cols,axis=1)
    # (2) X_oh
    unique_info = X_tr_oh.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        X_tr_oh = X_tr_oh.drop(unique_cols,axis=1)
        X_va_oh = X_va_oh.drop(unique_cols,axis=1)
        
    # categorical feature에서 unique인 컬럼을 제외
    fixed_cat_features = [col for col in cat_features if col in X_tr.columns]
    train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
    valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)
    
    # define the model
    ensemble_model = WeightedEnsembleRegressor()
    
    # fit the model
    ensemble_model.fit(
        X_tr,y_tr,
        eval_set=[(X_va,y_va)],
        oh_set=[(X_tr_oh,X_va_oh)],
        cat_features=fixed_cat_features,
        verbose=0,
    )
    
    # save the model
    ensemble_model.save_model(f'./model_checkpoints/segment_weightedensemble/{segment}.pickle')

    # calculate the score
    y_pred = ensemble_model.predict(X_va,X_va_oh).flatten()
    y_true = y_va.values
    score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
    
    # append
    models[segment] = ensemble_model
    feature_info[segment] = {
        'cat_features':fixed_cat_features,
        'features':X_tr.columns.tolist(),
        'oh_features':X_tr_oh.columns.tolist(),
    }
    scores.append([segment,len(X_tr),len(X_va),score])

Segment: audi___s3, Length: Train(6), Validation(1): 100%|██████████| 141/141 [1:57:43<00:00, 50.10s/it]                   

CPU times: user 4h 8min 5s, sys: 1h 26min 24s, total: 5h 34min 29s
Wall time: 1h 57min 59s





In [81]:
import pickle
# Persist the fitted models, their per-segment feature lists and the validation scores.
checkpoint_artifacts = [
    ('./model_checkpoints/segment_weiens_models.pkl', models),
    ('./model_checkpoints/segment_weiens_feature_info.pkl', feature_info),
    ('./model_checkpoints/segment_weiens_scores.pkl', scores),
]
for artifact_path, artifact in checkpoint_artifacts:
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f, protocol=pickle.HIGHEST_PROTOCOL)

In [82]:
# Validation MAE per segment, showing the segments with the most training rows first.
(
    pd.DataFrame(scores, columns=['segment','n_tr','n_val','score'])
    .sort_values('n_tr', ascending=False)
    .head()
)

Unnamed: 0,segment,n_tr,n_val,score
20,opel___astra,2029,508,4.047941
5,audi___a4,1665,417,5.548476
43,bmw___seria-3,1598,400,7.448294
45,volkswagen___golf,1528,382,5.538396
29,ford___focus,1317,330,4.61203


In [83]:
# inference
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

X_test = test_fn.copy()

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)
X_test_oh = ohe.transform(X_test)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data    = X   [X   .segment==segment][feature_info[segment]['features']]
    train_data_oh = X_oh[X_oh.segment==segment][feature_info[segment]['oh_features']]
    # (2) test
    test_data     = X_test   [X_test   .segment==segment][feature_info[segment]['features']]
    test_data_oh  = X_test_oh[X_test_oh.segment==segment][feature_info[segment]['oh_features']]
    
    ## model
    model = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'true':y[X.segment==segment].values.flatten(),
        'pred':model.predict(train_data,train_data_oh),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'pred':model.predict(test_data,test_data_oh),
    })
    te_pred_df.index = test_data.index
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 141/141 [00:39<00:00,  3.59it/s]


In [84]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

3.8522682289581254

In [93]:
# def abline(intercept,slope,**kwargs):
#     axes = plt.gca()
#     x_vals = np.array(axes.get_xlim())
#     y_vals = intercept + slope * x_vals
#     plt.plot(x_vals, y_vals, '--',**kwargs)

# offset = 0.05
# min_value = min(tr_pred_df.true.min(),tr_pred_df.pred.min())*(1-offset)
# max_value = max(tr_pred_df.true.max(),tr_pred_df.pred.max())*(1+offset)

# plt.figure(figsize=(15,7))
# sns.scatterplot(x=tr_pred_df.true,y=tr_pred_df.pred)
# plt.xlim(min_value,max_value)
# plt.ylim(min_value,max_value)
# abline(0,1,color='red',linestyle='--')
# plt.show()

In [79]:
# Combine the ensemble's per-segment test predictions, order them by row index, and preview.
te_pred_df = pd.concat(te_pred_list, axis=0)
te_pred_df = te_pred_df.sort_index()
te_pred_df.head()

Unnamed: 0,pred
0,86.245076
1,27.885631
2,94.027649
3,123.974522
4,52.85984


In [80]:
# Fill the sample submission's price column with the ensemble predictions and write it out.
submit = pd.read_csv('./data/sample_submission.csv').assign(
    **{'가격': te_pred_df.pred.values}
)
submit.to_csv('./out/4_ensemble_segment.csv', index=False)

<br>

## 참조 pycaret

In [None]:
# from pycaret import regression

In [None]:
# %%time
# regression.setup(data=train_df5,target='가격',remove_outliers=True,verbose=True)
# best = regression.compare_models(n_select=5,fold=5)