# Library Setting

In [1]:
from tqdm import tqdm
tqdm.pandas()

import numpy as np
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import gc
gc.collect()

0

<br></br>

# Configuration

In [3]:
class CFG:
    SEED = 0
    
    SUBSET_DEPTH = 3
    INTERACTION = True
    FS_ALPHA = 0.01
    
    N_SPLITS = 5
    
    LR = 0.003
    EPOCHS = 30000
    ES = 300
    XGB_LR = 0.3     # default
    XGB_EPOCHS = 1000 # default
    XGB_ES = 10

<br></br>

# Data

## Data Load

In [35]:
train_df = pd.read_csv('./data/train.csv')
test_df  = pd.read_csv('./data/test.csv')

In [36]:
train_df.shape, test_df.shape

((57920, 15), (14480, 14))

In [37]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


<br>

## Resetting Columns Type

In [38]:
class TypeResetting:
    def __init__(self):
        self.cat_features = ['브랜드','차량모델명','판매도시','판매구역','생산년도','모델출시년도']
        self.seg_features = []
        
    def add_categorical_features(self,cat_features):
        self.cat_features += cat_features
        
    def delete_categorical_features(self,cat_features):
        self.cat_features = [col for col in self.cat_features if col not in cat_features]
        
    def add_segment_features(self,segment_features):
        self.seg_features = ['segment']
        self.cat_features = [col for col in self.cat_features if col not in segment_features]
        
    def fit(self,data):
        if (len(self.seg_features)>0) & ('segment' not in data.columns):
            raise ValueError("segment column name must be 'segment'")
        self.target_feature = ['가격']
        self.unuse_features = ['ID']
        self.dummy_features = ['압축천연가스(CNG)','액화석유가스(LPG)','경유','가솔린','하이브리드']
        self.num_features   = [col for col in data.columns
                               if col not in self.target_feature+self.unuse_features+self.dummy_features+self.cat_features+self.seg_features]
        
    def transform(self,data):
        d = data.copy()
        for col in self.dummy_features:
            if d[col].dtypes!=int:
                d[col] = d[col].astype(int)
        for col in self.cat_features:
            if d[col].dtypes!=object:
                d[col] = d[col].astype(str)
        for col in self.num_features:
            if d[col].dtypes!=float:
                d[col] = d[col].astype(float)
        for col in self.seg_features:
            if d[col].dtypes!=object:
                d[col] = d[col].astype(str)
        for col in self.unuse_features:
            if col in d.columns:
                d.drop(col,axis=1,inplace=True)
        return d
    
    def fit_transform(self,data):
        self.fit(data)
        return self.transform(data)
    
    def get_feature_type(self):
        globals()['target_feature'] = self.target_feature
        globals()['unuse_features'] = self.unuse_features
        globals()['dummy_features'] = self.dummy_features
        globals()['cat_features']   = self.cat_features
        globals()['num_features']   = self.num_features

In [39]:
type_resetor = TypeResetting()
type_resetor.fit(train_df)
type_resetor.get_feature_type()

train_df2 = type_resetor.transform(train_df)
test_df2  = type_resetor.transform(test_df)

In [40]:
import warnings
warnings.simplefilter("always")

def check_only_oneside(train,test,cat_features):
    not_test_only_features = []
    for iter,col in enumerate(cat_features):
        print('[{}/{}] {}'.format(iter+1,len(cat_features),col))
        
        only_train = list(set(train[col].unique())-set(test[col].unique()))
        only_test  = list(set(test[col].unique())-set(train[col].unique()))
        print(' - Only Train:',len(only_train))
        print(' - Only Test :',len(only_test))
        if len(only_test)>0:
            print('******Warning******')
        else:
            not_test_only_features.append(col)
        print('')
    return not_test_only_features

In [41]:
# 브랜드, 차량모델명, 판매구역, 모델출시년도
not_test_only_features = check_only_oneside(train_df2,test_df2,cat_features+dummy_features)
not_test_only_features = list(set(not_test_only_features)-set(dummy_features))

[1/11] 브랜드
 - Only Train: 0
 - Only Test : 0

[2/11] 차량모델명
 - Only Train: 2
 - Only Test : 0

[3/11] 판매도시
 - Only Train: 1750
 - Only Test : 300

[4/11] 판매구역
 - Only Train: 0
 - Only Test : 0

[5/11] 생산년도
 - Only Train: 3
 - Only Test : 1

[6/11] 모델출시년도
 - Only Train: 0
 - Only Test : 0

[7/11] 압축천연가스(CNG)
 - Only Train: 0
 - Only Test : 0

[8/11] 액화석유가스(LPG)
 - Only Train: 0
 - Only Test : 0

[9/11] 경유
 - Only Train: 0
 - Only Test : 0

[10/11] 가솔린
 - Only Train: 0
 - Only Test : 0

[11/11] 하이브리드
 - Only Train: 0
 - Only Test : 0



In [42]:
not_test_only_features

['판매구역', '차량모델명', '모델출시년도', '브랜드']

<br></br>

# New Features

In [43]:
# pd.Series([str(round(int(year)/100,1)) for year in train_df6['생산년도']]).value_counts()

In [44]:
train_df.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


In [45]:
from tqdm import tqdm
from itertools import chain, combinations
def all_subsets(ss):
    return list(chain(*map(lambda x: combinations(ss, x), range(0, len(ss)+1))))

class FeatureEngineering:
    def __init__(self):
        pass
    
    def _get_quantile(self,x,col):
        x = np.array(x).flatten()
        x = x[pd.notnull(x)]

        agg_df = pd.DataFrame(index=[0])
        for q in [0,25,50,75,100]:
            agg_df[f'{col}_Q{q}'] = np.quantile(x,q/100)

        return agg_df
    
    def _derived_features(self,data):
        d = data.copy()

        # (1) 모델출시년도에 생산된 차량인지
        d['출시년도생산여부'] = np.where(d['생산년도'].astype(int)==d['모델출시년도'].astype(int),1,0)

        # (2) 모델출시 이후에 몇년 지나서 생산됬는지
        d['출시이후생산년수'] = d['생산년도'].astype(int)-d['모델출시년도'].astype(int)

        # (3) 출시 이전에 생산되었는지
        d['출시이전생산여부'] = np.where(d['출시이후생산년수']<0,1,0)

        # (4) 브랜드의 국적 (구글링)
        d['브랜드국적'] = ['체코' if brand=='skoda' else
                        '일본' if brand in ['toyota','nissan','mazda','honda','mitsubishi'] else
                        '독일' if brand in ['mercedes-benz','audi','volkswagen','bmw','opel'] else
                        '이탈리아' if brand=='fiat' else
                        '프랑스' if brand in ['renault','citroen','peugeot'] else
                        '미국' if brand=='ford' else
                        '한국' if brand in ['kia','hyundai'] else
                        '스페인' if brand=='seat' else
                        '스웨덴' if brand=='volvo' else
                        np.nan for brand in d['브랜드']]

        # (5) 브랜드 국적의 대륙명
        d['브랜드대륙명'] = ['유럽' if country in ['체코','독일','이탈리아','프랑스','스페인','스웨덴'] else
                          '아시아' if country in ['일본','한국'] else
                          '아메리카' if country in ['미국'] else
                          np.nan for country in d['브랜드국적']]
        return d
    
    def fit(self,data,cat_features,subset_depth=1):
        assert '가격' in data.columns, \
            'Input data must be training dataset'
        assert len(cat_features)>=subset_depth, \
            'len(cat_features) >= subset_depth'
        
        self.cat_features = cat_features
        self.new_cat_features = ['출시년도생산여부','출시이후생산년수','출시이전생산여부','브랜드국적','브랜드대륙명']
        
        # (6) 카테고리 변수에 따른 가격의 Quantile값
        all_subset_list = all_subsets(cat_features)
        all_subset_list = [subset for subset in all_subset_list if (len(subset)<=subset_depth) & (len(subset)>=1)]
        
        self.agg_dict = {}
        for subset in tqdm(all_subset_list,desc=f'Get quantiles of target by categorical features (depth={subset_depth})'):
            subset = list(subset)
            subset_name = '_'.join(subset)
            agg_fn = data.groupby(subset)['가격'].apply(lambda x: self._get_quantile(x,subset_name)).reset_index()
            drop_cols = [col for col in agg_fn if col.find('level_')>=0]
            agg_fn.drop(columns=drop_cols,inplace=True)
            self.agg_dict[subset_name] = agg_fn
            
    def transform(self,data):
        data = self._derived_features(data)
        for key,agg_fn in self.agg_dict.items():
            data = pd.merge(data,agg_fn,how='left',on=key.split('_'))
        return data
    
    def fit_transform(self,data,cat_features,subset_depth=1):
        self.fit(data,cat_features,subset_depth)
        return self.transform(data)

In [46]:
fe = FeatureEngineering()
fe.fit(
    data=train_df2,
    cat_features=not_test_only_features, 
    subset_depth=CFG.SUBSET_DEPTH,
)
train_df3 = fe.transform(train_df2)
test_df3  = fe.transform(test_df2)

Get quantiles of target by categorical features (depth=3): 100%|██████████| 14/14 [00:17<00:00,  1.26s/it]


In [47]:
fe.new_cat_features

['출시년도생산여부', '출시이후생산년수', '출시이전생산여부', '브랜드국적', '브랜드대륙명']

In [54]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.fit(train_df3)
type_resetor.get_feature_type()

train_df3 = type_resetor.transform(train_df3)
test_df3  = type_resetor.transform(test_df3)

In [55]:
print(train_df3.shape)
train_df3.head()

(57920, 89)


Unnamed: 0,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,판매구역_차량모델명_Q0,판매구역_차량모델명_Q25,판매구역_차량모델명_Q50,판매구역_차량모델명_Q75,판매구역_차량모델명_Q100,판매구역_모델출시년도_Q0,판매구역_모델출시년도_Q25,판매구역_모델출시년도_Q50,판매구역_모델출시년도_Q75,판매구역_모델출시년도_Q100,판매구역_브랜드_Q0,판매구역_브랜드_Q25,판매구역_브랜드_Q50,판매구역_브랜드_Q75,판매구역_브랜드_Q100,차량모델명_모델출시년도_Q0,차량모델명_모델출시년도_Q25,차량모델명_모델출시년도_Q50,차량모델명_모델출시년도_Q75,차량모델명_모델출시년도_Q100,차량모델명_브랜드_Q0,차량모델명_브랜드_Q25,차량모델명_브랜드_Q50,차량모델명_브랜드_Q75,차량모델명_브랜드_Q100,모델출시년도_브랜드_Q0,모델출시년도_브랜드_Q25,모델출시년도_브랜드_Q50,모델출시년도_브랜드_Q75,모델출시년도_브랜드_Q100,판매구역_차량모델명_모델출시년도_Q0,판매구역_차량모델명_모델출시년도_Q25,판매구역_차량모델명_모델출시년도_Q50,판매구역_차량모델명_모델출시년도_Q75,판매구역_차량모델명_모델출시년도_Q100,판매구역_차량모델명_브랜드_Q0,판매구역_차량모델명_브랜드_Q25,판매구역_차량모델명_브랜드_Q50,판매구역_차량모델명_브랜드_Q75,판매구역_차량모델명_브랜드_Q100,판매구역_모델출시년도_브랜드_Q0,판매구역_모델출시년도_브랜드_Q25,판매구역_모델출시년도_브랜드_Q50,판매구역_모델출시년도_브랜드_Q75,판매구역_모델출시년도_브랜드_Q100,차량모델명_모델출시년도_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q25,차량모델명_모델출시년도_브랜드_Q50,차량모델명_모델출시년도_브랜드_Q75,차량모델명_모델출시년도_브랜드_Q100
0,2018,2014,skoda,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,51.74,0,4,0,체코,유럽,1.3,24.7,43.55,77.87,156.0,2.21,18.785,40.95,58.305,125.25,1.17,55.6725,77.87,103.87,156.0,2.07,29.77,57.85,90.8375,156.0,3.12,20.005,45.37,57.2,114.59,26.78,53.76,75.4,101.27,156.0,3.12,32.37,55.77,92.3,156.0,12.87,46.67,55.25,64.87,118.17,2.21,18.785,40.95,58.305,125.25,12.87,46.67,55.25,64.87,118.17,33.8,47.6775,52.585,61.5875,103.48,3.12,20.005,45.37,57.2,114.59,33.8,47.6775,52.585,61.5875,103.48,12.87,46.67,55.25,64.87,118.17
1,2010,2006,toyota,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,41.47,0,4,0,일본,아시아,2.99,22.49,33.8,58.37,156.0,14.69,36.4,61.1,80.405,116.87,3.89,21.97,28.47,44.07,156.0,1.17,28.6,54.6,84.37,156.0,17.55,28.6975,53.885,72.02,109.07,9.75,22.815,28.6,43.94,124.8,8.45,28.275,46.67,72.71,155.87,14.69,24.57,28.6,36.4,64.92,14.69,36.4,61.1,80.405,116.87,14.69,25.87,33.15,44.07,85.67,17.55,23.205,28.275,37.8625,55.77,17.55,28.6975,53.885,72.02,109.07,17.55,27.625,36.855,51.025,85.67,14.69,24.57,28.6,36.4,64.92
2,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,17.81,1,0,0,독일,유럽,1.62,21.97,40.755,73.45,156.0,9.49,23.27,31.2,37.6025,129.87,2.6,11.63,16.89,28.6,155.87,2.99,23.4,44.07,77.84,156.0,11.05,18.165,28.21,33.73,37.7,3.24,10.53,15.21,25.35,102.7,3.06,22.88,42.38,76.27,155.87,12.87,25.87,32.49,38.87,129.87,9.49,23.27,31.2,37.6025,129.87,7.15,23.4,29.77,36.27,155.87,12.87,23.355,29.77,35.49,37.7,11.05,18.165,28.21,33.73,37.7,12.35,23.92,29.89,33.67,63.7,12.87,25.87,32.49,38.87,129.87
3,2006,2001,nissan,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,18.2,0,5,0,일본,아시아,1.62,21.97,40.755,73.45,156.0,8.71,78.91,97.11,116.87,155.87,2.6,10.4,15.47,22.09,132.47,2.86,35.1,50.57,81.9,155.87,18.2,88.335,106.59,116.935,134.55,3.64,8.97,12.155,16.9,64.87,3.64,32.37,51.74,75.27,140.27,8.71,17.81,19.37,22.88,29.77,8.71,78.91,97.11,116.87,155.87,3.64,9.62,13.65,19.37,29.77,18.2,19.5,20.8,22.685,24.57,18.2,88.335,106.59,116.935,134.55,3.64,10.53,12.22,18.85,24.57,8.71,17.81,19.37,22.88,29.77
4,2007,2007,fiat,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,17.55,1,0,0,이탈리아,유럽,2.08,24.57,42.77,74.1,156.0,8.45,18.07,21.45,24.7975,38.87,5.46,25.77,37.57,51.87,156.0,3.04,17.55,34.125,65.0,143.89,9.1,17.68,21.97,24.96,32.37,9.1,25.09,38.87,55.9,156.0,4.81,18.13,35.1,78.26,128.63,8.45,18.07,21.45,24.7975,38.87,8.45,18.07,21.45,24.7975,38.87,8.45,18.07,21.45,24.7975,38.87,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,8.45,18.07,21.45,24.7975,38.87


<br></br>

# EDA

In [56]:
# check_num_features = [col for col in num_features if col.find('_Q')<0]

# i=0
# for col in check_num_features:
#     i+=1
#     print('\n({}/{}) {}'.format(i,len(check_num_features),col))
#     plt.figure(figsize=(15,7))
#     sns.scatterplot(x=train_df3['가격'],y=train_df3[col])
#     plt.show()

<br></br>

# Add the Interaction Term

In [57]:
import warnings
from tqdm import trange

def get_abs_corr(x,y):
    return np.abs(np.corrcoef(x,y))[0,1]

class InteractionTerm:
    def __init__(self):
        pass
    
    def fit(self,data,num_features,corr_cutoff=0.7):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        
        d = data.copy()
        self.interaction_list = []
        total_cnt = 0
        add_cnt = 0
        for i in range(len(num_features)):
            for j in range(len(num_features)):
                if i>j:
                    total_cnt+=1
                    col_i = num_features[i]
                    col_j = num_features[j]
                    
                    # 상관계수가 cutoff보다 큰 경우에는 interaction을 생성하지 않음
                    if (get_abs_corr(d[col_i]*d[col_j],d[col_i])>=corr_cutoff) | (get_abs_corr(d[col_i]*d[col_j],d[col_j])>=corr_cutoff):
                        pass
                    else:
                        self.interaction_list.append(f'{col_i}*{col_j}')
                        add_cnt+=1
        print('> Total Interaction Term: {}'.format(total_cnt))
        print('> Added Interaction Term: {}'.format(add_cnt))
    
    def transform(self,data):
        d = data.copy()
        for interaction in self.interaction_list:
            col_i,col_j = interaction.split('*')
            d[interaction] = d[col_i]*d[col_j]
        return d
    
    def fit_transform(self,data,num_features,corr_cutoff=0.7):
        self.fit(data,num_features,corr_cutoff)
        return self.transform(data)

In [58]:
train_df4 = train_df3.copy()
test_df4  = test_df3.copy()

if CFG.INTERACTION:
    interaction_maker = InteractionTerm()
    interaction_maker.fit(
        data=train_df3,
        num_features=num_features,
        corr_cutoff=0.85,
    )
    train_df4 = interaction_maker.transform(train_df4)
    test_df4  = interaction_maker.transform(test_df4)

    type_resetor = TypeResetting()
    type_resetor.add_categorical_features(fe.new_cat_features)
    type_resetor.fit(train_df4)
    type_resetor.get_feature_type()

    train_df4 = type_resetor.transform(train_df4)
    test_df4  = type_resetor.transform(test_df4)

Total Interaction Term: 2556
Added Interaction Term: 391


In [59]:
train_df3.shape,train_df4.shape

((57920, 89), (57920, 480))

<br></br>

# Feature Selection

In [23]:
# k=0
# for i in range(len(num_features)):
#     for j in range(len(num_features)):
#         if i>j:
#             col_i = num_features[i]
#             col_j = num_features[j]
#             corr = np.corrcoef(train_df4[col_i],train_df4[col_j])[0,1]
#             if corr>=0.7:
#                 k+=1
#                 print(k,col_i,col_j,corr)

In [60]:
def log_offset(x):
    if min(x)>0:
        offset = 0
    elif min(x)==0:
        offset = 1e-3
    else:
        offset = min(x)+1e-3
        print('minimum = {:.3f}'.format(min(x)))
    return np.log(x+offset)

<br></br>

## Categorical Features

In [61]:
# import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

In [62]:
check_cat_features = [col for col in cat_features if train_df4[col].nunique()<=100]

# (1) ANOVA를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in tqdm(check_cat_features):
    d = train_df4[[col,'가격']].rename(columns={col:'feature'})
    
    model = ols(f'가격 ~ C(feature)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list.append([col,pvalue])
    
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df[pvalue_df.pvalue>=alpha].round(4)

100%|██████████| 9/9 [00:03<00:00,  2.39it/s]


In [63]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    d = train_df4[[col,'target']].rename(columns={col:'feature'})
    d['feature'] = log_offset(d['feature'])
    
    model = ols(f'feature ~ C(target)',data=d).fit()
    pvalue = anova_lm(model).values[0][-1]
    pvalue_list2.append([col,pvalue])
    
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)

0it [00:00, ?it/s]


In [64]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df5 = train_df4.copy()
train_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df5[col] = log_offset(train_df5[col])
    
test_df5 = test_df4.copy()
test_df5.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df5[col] = log_offset(test_df5[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 0
  - feature_name : []


<br>

## Numerical Features

In [65]:
import scipy

In [69]:
# (1) corr test를 해서 p-value가 0.05보다 높은 것들 확인
pvalue_list = []
for col in tqdm(num_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],train_df5[col])
    pvalue_list.append([col,pvalue])
pvalue_df = pd.DataFrame(pvalue_list,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df.round(4).head()

100%|██████████| 463/463 [00:05<00:00, 89.21it/s]


In [70]:
# (2) (1)에서 유의하지않은 feature들은 log적용 후에도 유의하지 않으면 제외
pvalue_list2 = []
unsignificant_features = pvalue_df[pvalue_df.pvalue>CFG.FS_ALPHA].feature.tolist()
for col in tqdm(unsignificant_features):
    corr,pvalue = scipy.stats.pearsonr(train_df5['가격'],log_offset(train_df5[col]))
    pvalue_list2.append([col,pvalue])
pvalue_df2 = pd.DataFrame(pvalue_list2,columns=['feature','pvalue'])\
    .sort_values('pvalue',ascending=False)
# pvalue_df2.round(4).head()

100%|██████████| 3/3 [00:00<00:00, 59.47it/s]


In [71]:
delete_features = pvalue_df2[pvalue_df2.pvalue> CFG.FS_ALPHA].feature.tolist()
log_features    = pvalue_df2[pvalue_df2.pvalue<=CFG.FS_ALPHA].feature.tolist()
print('> delete_features')
print('  - length : {}'.format(len(delete_features)))
print('  - feature_name : {}'.format(delete_features))
print('')
print('> log_features')
print('  - length : {}'.format(len(log_features)))
print('  - feature_name : {}'.format(log_features))

train_df6 = train_df5.copy()
train_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    train_df6[col] = log_offset(train_df6[col])
    
test_df6 = test_df5.copy()
test_df6.drop(delete_features,axis=1,inplace=True)
for col in log_features:
    test_df6[col] = log_offset(test_df6[col])

> delete_features
  - length : 0
  - feature_name : []

> log_features
  - length : 3
  - feature_name : ['판매구역_Q0', '차량모델명_모델출시년도_Q100*주행거리', '차량모델명_모델출시년도_브랜드_Q100*주행거리']


<br></br>

# Make Segment

In [72]:
def make_segment(data,segment: list):
    d = data.copy()
    d['segment'] = d[segment].apply(lambda x: '___'.join(x),axis=1)
    d.drop(columns=segment,inplace=True)
    return d

In [73]:
segment = ['브랜드']
train_df7 = make_segment(train_df6,segment)
test_df7  = make_segment(test_df6 ,segment)

In [74]:
test_only = list(set(test_df7.segment.unique())-set(train_df7.segment.unique()))
assert len(test_only)==0, \
    "Segment exists only in the test set ({})".format(len(test_only))

In [75]:
train_only = list(set(train_df7['segment'].unique())-set(test_df7['segment'].unique()))

n_asis = len(train_df7)
n_tobe = len(train_df7[~train_df7.segment.isin(train_only)])
train_df7 = train_df7[~train_df7.segment.isin(train_only)]
print('> Train에만 존재하는 Segment 제거')
print(' - 데이터수 : {:,} -> {:,}'.format(n_asis,n_tobe))
print(' - 세그먼트수 : {:,}'.format(train_df7['segment'].nunique()))

> Train에만 존재하는 Segment 제거
 - 데이터수 : 57,920 -> 57,920
 - 세그먼트수 : 20


In [76]:
vc = train_df7['segment'].value_counts().sort_values()
display(vc.head())
print('...')
display(vc.tail())

segment
mitsubishi     556
peugeot        793
citroen       1129
fiat          1164
volvo         1352
Name: count, dtype: int64

...


segment
bmw           5262
audi          5597
volkswagen    5693
ford          5819
opel          6651
Name: count, dtype: int64

In [77]:
type_resetor = TypeResetting()
type_resetor.add_categorical_features(fe.new_cat_features)
type_resetor.add_segment_features(segment)
type_resetor.fit(train_df7)
type_resetor.get_feature_type()

train_df7 = type_resetor.transform(train_df7)
test_df7  = type_resetor.transform(test_df7)

In [78]:
cat_features

['차량모델명',
 '판매도시',
 '판매구역',
 '생산년도',
 '모델출시년도',
 '출시년도생산여부',
 '출시이후생산년수',
 '출시이전생산여부',
 '브랜드국적',
 '브랜드대륙명']

In [79]:
print(train_df7.shape)
train_df7.head()

(57920, 480)


Unnamed: 0,생산년도,모델출시년도,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격,출시년도생산여부,출시이후생산년수,출시이전생산여부,브랜드국적,브랜드대륙명,판매구역_Q0,판매구역_Q25,판매구역_Q50,판매구역_Q75,판매구역_Q100,차량모델명_Q0,차량모델명_Q25,차량모델명_Q50,차량모델명_Q75,차량모델명_Q100,모델출시년도_Q0,모델출시년도_Q25,모델출시년도_Q50,모델출시년도_Q75,모델출시년도_Q100,브랜드_Q0,브랜드_Q25,브랜드_Q50,브랜드_Q75,브랜드_Q100,판매구역_차량모델명_Q0,판매구역_차량모델명_Q25,판매구역_차량모델명_Q50,판매구역_차량모델명_Q75,판매구역_차량모델명_Q100,판매구역_모델출시년도_Q0,판매구역_모델출시년도_Q25,판매구역_모델출시년도_Q50,판매구역_모델출시년도_Q75,판매구역_모델출시년도_Q100,판매구역_브랜드_Q0,판매구역_브랜드_Q25,판매구역_브랜드_Q50,판매구역_브랜드_Q75,판매구역_브랜드_Q100,차량모델명_모델출시년도_Q0,차량모델명_모델출시년도_Q25,차량모델명_모델출시년도_Q50,차량모델명_모델출시년도_Q75,차량모델명_모델출시년도_Q100,차량모델명_브랜드_Q0,차량모델명_브랜드_Q25,차량모델명_브랜드_Q50,차량모델명_브랜드_Q75,차량모델명_브랜드_Q100,모델출시년도_브랜드_Q0,모델출시년도_브랜드_Q25,모델출시년도_브랜드_Q50,모델출시년도_브랜드_Q75,모델출시년도_브랜드_Q100,판매구역_차량모델명_모델출시년도_Q0,판매구역_차량모델명_모델출시년도_Q25,판매구역_차량모델명_모델출시년도_Q50,판매구역_차량모델명_모델출시년도_Q75,판매구역_차량모델명_모델출시년도_Q100,판매구역_차량모델명_브랜드_Q0,판매구역_차량모델명_브랜드_Q25,판매구역_차량모델명_브랜드_Q50,판매구역_차량모델명_브랜드_Q75,판매구역_차량모델명_브랜드_Q100,판매구역_모델출시년도_브랜드_Q0,판매구역_모델출시년도_브랜드_Q25,판매구역_모델출시년도_브랜드_Q50,판매구역_모델출시년도_브랜드_Q75,판매구역_모델출시년도_브랜드_Q100,차량모델명_모델출시년도_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q25,차량모델명_모델출시년도_브랜드_Q50,차량모델명_모델출시년도_브랜드_Q75,차량모델명_모델출시년도_브랜드_Q100,판매구역_Q0*주행거리,차량모델명_Q0*주행거리,차량모델명_Q25*주행거리,차량모델명_Q25*배기량,차량모델명_Q25*판매구역_Q0,차량모델명_Q50*주행거리,차량모델명_Q50*배기량,차량모델명_Q50*판매구역_Q0,차량모델명_Q75*주행거리,차량모델명_Q75*배기량,모델출시년도_Q0*주행거리,모델출시년도_Q0*판매구역_Q0,모델출시년도_Q0*차량모델명_Q0,모델출시년도_Q0*차량모델명_Q25,모델출시년도_Q25*주행거리,모델출시년도_Q25*판매구역_Q0,모델출시년도_Q25*차량모델명_Q25,모델출시년도_Q25*차량모델명_Q50,모델출시년도_Q50*주행거리,모델출시년도_Q50*판매구역_Q0,모델출시년도_Q50*차량모델명_Q25,모델출시년도_Q50*차량모델명_Q50,모델출시년도_Q75*주행거리,모델출시년도_Q75*배기량,모델출시년도_Q75*판매구역_Q0,모델출시년도_Q75*차량모델명_Q25,모델출시년도_Q75*차량모델명_Q50,모델출시년도_Q100*판매구역_Q25,모델출시년도_Q100*판매구역_Q50,모델출시년도_Q100*판매구역_Q75,브랜드_Q0*주행거리,브랜드_Q0*차량모델명_Q0,브랜드_Q0*차량모델명_Q25,브랜드_Q0*차량모델명_Q50,브랜드_Q0*모델출시년도_Q0,브랜드_Q0*모델출시년도_Q25,브랜드_Q0*모델출시년도_Q50,브랜드_Q25*배기량,브랜드_Q25*차량모델명_Q100,브랜드_Q50*차량모델명_Q100,브랜드_Q75*차량모델명_Q100,판매구역_차량모델명_Q0*주행거리,판매구역_차량모델명_Q0*모델출시년도_Q0,판매구역_차량모델명_Q0*모델출시년도_Q25,판매구역_차량모델명_Q0*모델출시년도_Q50,판매구역_차량모델명_Q0*브랜드_Q0,판매구역_차량모델명_Q25*주행거리,판매구역_차량모델명_Q25*배기량,판매구역_차량모델명_Q25*모델출시년도_Q0,판매구역_차량모델명_Q25*모델출시년도_Q25,판매구역_차량모델명_Q25*모델출시년도_Q50,판매구역_차량모델명_Q25*모델출시년도_Q75,판매구역_차량모델명_Q25*브랜드_Q0,판매구역_차량모델명_Q50*주행거리,판매구역_차량모델명_Q50*배기량,판매구역_차량모델명_Q50*모델출시년도_Q25,판매구역_차량모델명_Q50*모델출시년도_Q50,판매구역_차량모델명_Q50*모델출시년도_Q75,판매구역_차량모델명_Q50*브랜드_Q0,판매구역_차량모델명_Q75*주행거리,판매구역_차량모델명_Q75*배기량,판매구역_차량모델명_Q100*배기량,판매구역_차량모델명_Q100*브랜드_Q25,판매구역_차량모델명_Q100*브랜드_Q50,판매구역_모델출시년도_Q0*주행거리,판매구역_모델출시년도_Q0*차량모델명_Q0,판매구역_모델출시년도_Q0*차량모델명_Q25,판매구역_모델출시년도_Q0*모델출시년도_Q0,판매구역_모델출시년도_Q0*브랜드_Q0,판매구역_모델출시년도_Q0*판매구역_차량모델명_Q0,판매구역_모델출시년도_Q0*판매구역_차량모델명_Q25,판매구역_모델출시년도_Q0*판매구역_차량모델명_Q50,판매구역_모델출시년도_Q25*주행거리,판매구역_모델출시년도_Q25*차량모델명_Q25,판매구역_모델출시년도_Q25*차량모델명_Q50,판매구역_모델출시년도_Q25*브랜드_Q0,판매구역_모델출시년도_Q25*판매구역_차량모델명_Q0,판매구역_모델출시년도_Q25*판매구역_차량모델명_Q25,판매구역_모델출시년도_Q25*판매구역_차량모델명_Q50,판매구역_모델출시년도_Q50*주행거리,판매구역_모델출시년도_Q50*판매구역_Q0,판매구역_모델출시년도_Q50*차량모델명_Q25,판매구역_모델출시년도_Q50*차량모델명_Q50,판매구역_모델출시년도_Q50*브랜드_Q0,판매구역_모델출시년도_Q50*판매구역_차량모델명_Q0,판매구역_모델출시년도_Q50*판매구역_차량모델명_Q25,판매구역_모델출시년도_Q50*판매구역_차량모델명_Q50,판매구역_모델출시년도_Q75*주행거리,판매구역_모델출시년도_Q75*배기량,판매구역_모델출시년도_Q75*판매구역_Q0,판매구역_모델출시년도_Q75*차량모델명_Q25,판매구역_모델출시년도_Q75*차량모델명_Q50,판매구역_모델출시년도_Q75*판매구역_차량모델명_Q25,판매구역_모델출시년도_Q75*판매구역_차량모델명_Q50,판매구역_모델출시년도_Q100*주행거리,판매구역_모델출시년도_Q100*배기량,판매구역_모델출시년도_Q100*차량모델명_Q75,판매구역_모델출시년도_Q100*차량모델명_Q100,판매구역_모델출시년도_Q100*브랜드_Q25,판매구역_모델출시년도_Q100*브랜드_Q50,판매구역_모델출시년도_Q100*브랜드_Q75,판매구역_모델출시년도_Q100*판매구역_차량모델명_Q100,판매구역_브랜드_Q0*주행거리,판매구역_브랜드_Q0*판매구역_Q25,판매구역_브랜드_Q0*차량모델명_Q0,판매구역_브랜드_Q0*차량모델명_Q25,판매구역_브랜드_Q0*차량모델명_Q50,판매구역_브랜드_Q0*모델출시년도_Q0,판매구역_브랜드_Q0*모델출시년도_Q25,판매구역_브랜드_Q0*모델출시년도_Q50,판매구역_브랜드_Q0*모델출시년도_Q75,판매구역_브랜드_Q0*판매구역_차량모델명_Q0,판매구역_브랜드_Q0*판매구역_차량모델명_Q25,판매구역_브랜드_Q0*판매구역_차량모델명_Q50,판매구역_브랜드_Q0*판매구역_모델출시년도_Q0,판매구역_브랜드_Q0*판매구역_모델출시년도_Q25,판매구역_브랜드_Q0*판매구역_모델출시년도_Q50,판매구역_브랜드_Q0*판매구역_모델출시년도_Q75,판매구역_브랜드_Q25*배기량,판매구역_브랜드_Q25*판매구역_Q25,판매구역_브랜드_Q25*차량모델명_Q75,판매구역_브랜드_Q25*판매구역_차량모델명_Q75,판매구역_브랜드_Q25*판매구역_차량모델명_Q100,판매구역_브랜드_Q25*판매구역_모델출시년도_Q100,판매구역_브랜드_Q25*판매구역_브랜드_Q0,판매구역_브랜드_Q50*배기량,판매구역_브랜드_Q50*판매구역_차량모델명_Q100,판매구역_브랜드_Q50*판매구역_모델출시년도_Q100,판매구역_브랜드_Q75*차량모델명_Q100,판매구역_브랜드_Q75*판매구역_모델출시년도_Q100,판매구역_브랜드_Q100*판매구역_Q25,판매구역_브랜드_Q100*판매구역_Q50,판매구역_브랜드_Q100*판매구역_Q75,판매구역_브랜드_Q100*모델출시년도_Q100,차량모델명_모델출시년도_Q0*주행거리,차량모델명_모델출시년도_Q0*판매구역_Q0,차량모델명_모델출시년도_Q0*차량모델명_Q25,차량모델명_모델출시년도_Q0*모델출시년도_Q0,차량모델명_모델출시년도_Q0*브랜드_Q0,차량모델명_모델출시년도_Q0*판매구역_차량모델명_Q0,차량모델명_모델출시년도_Q0*판매구역_차량모델명_Q25,차량모델명_모델출시년도_Q0*판매구역_모델출시년도_Q0,차량모델명_모델출시년도_Q0*판매구역_브랜드_Q0,차량모델명_모델출시년도_Q25*주행거리,차량모델명_모델출시년도_Q25*판매구역_Q0,차량모델명_모델출시년도_Q25*브랜드_Q0,차량모델명_모델출시년도_Q25*판매구역_브랜드_Q0,차량모델명_모델출시년도_Q50*주행거리,차량모델명_모델출시년도_Q50*판매구역_Q0,차량모델명_모델출시년도_Q50*브랜드_Q0,차량모델명_모델출시년도_Q50*판매구역_브랜드_Q0,차량모델명_모델출시년도_Q75*주행거리,차량모델명_모델출시년도_Q75*판매구역_Q0,차량모델명_모델출시년도_Q75*브랜드_Q0,차량모델명_모델출시년도_Q75*판매구역_차량모델명_Q50,차량모델명_모델출시년도_Q75*판매구역_브랜드_Q0,차량모델명_모델출시년도_Q100*주행거리,차량모델명_모델출시년도_Q100*배기량,차량모델명_모델출시년도_Q100*브랜드_Q0,차량모델명_모델출시년도_Q100*판매구역_차량모델명_Q75,차량모델명_모델출시년도_Q100*판매구역_브랜드_Q0,차량모델명_모델출시년도_Q100*판매구역_브랜드_Q25,차량모델명_브랜드_Q0*주행거리,차량모델명_브랜드_Q0*모델출시년도_Q0,차량모델명_브랜드_Q0*브랜드_Q0,차량모델명_브랜드_Q0*판매구역_모델출시년도_Q0,차량모델명_브랜드_Q0*판매구역_브랜드_Q0,차량모델명_브랜드_Q25*주행거리,차량모델명_브랜드_Q25*배기량,차량모델명_브랜드_Q25*판매구역_Q0,차량모델명_브랜드_Q25*모델출시년도_Q0,차량모델명_브랜드_Q25*모델출시년도_Q25,차량모델명_브랜드_Q25*모델출시년도_Q50,차량모델명_브랜드_Q25*모델출시년도_Q75,차량모델명_브랜드_Q25*브랜드_Q0,차량모델명_브랜드_Q25*판매구역_모델출시년도_Q0,차량모델명_브랜드_Q25*판매구역_모델출시년도_Q25,차량모델명_브랜드_Q25*판매구역_모델출시년도_Q50,차량모델명_브랜드_Q25*판매구역_모델출시년도_Q75,차량모델명_브랜드_Q25*판매구역_브랜드_Q0,차량모델명_브랜드_Q25*차량모델명_모델출시년도_Q0,차량모델명_브랜드_Q50*주행거리,차량모델명_브랜드_Q50*배기량,차량모델명_브랜드_Q50*판매구역_Q0,차량모델명_브랜드_Q50*모델출시년도_Q25,차량모델명_브랜드_Q50*모델출시년도_Q50,차량모델명_브랜드_Q50*모델출시년도_Q75,차량모델명_브랜드_Q50*브랜드_Q0,차량모델명_브랜드_Q50*판매구역_모델출시년도_Q25,차량모델명_브랜드_Q50*판매구역_모델출시년도_Q50,차량모델명_브랜드_Q50*판매구역_모델출시년도_Q75,차량모델명_브랜드_Q50*판매구역_브랜드_Q0,차량모델명_브랜드_Q75*주행거리,차량모델명_브랜드_Q75*배기량,차량모델명_브랜드_Q75*판매구역_모델출시년도_Q100,차량모델명_브랜드_Q75*판매구역_브랜드_Q25,차량모델명_브랜드_Q100*브랜드_Q25,차량모델명_브랜드_Q100*브랜드_Q50,차량모델명_브랜드_Q100*브랜드_Q75,차량모델명_브랜드_Q100*판매구역_모델출시년도_Q100,차량모델명_브랜드_Q100*판매구역_브랜드_Q75,모델출시년도_브랜드_Q0*주행거리,모델출시년도_브랜드_Q0*판매구역_Q0,모델출시년도_브랜드_Q0*차량모델명_Q25,모델출시년도_브랜드_Q0*모델출시년도_Q0,모델출시년도_브랜드_Q0*브랜드_Q0,모델출시년도_브랜드_Q0*판매구역_차량모델명_Q0,모델출시년도_브랜드_Q0*판매구역_차량모델명_Q25,모델출시년도_브랜드_Q0*판매구역_모델출시년도_Q0,모델출시년도_브랜드_Q0*판매구역_브랜드_Q0,모델출시년도_브랜드_Q0*차량모델명_브랜드_Q25,모델출시년도_브랜드_Q25*주행거리,모델출시년도_브랜드_Q25*판매구역_Q0,모델출시년도_브랜드_Q25*차량모델명_Q25,모델출시년도_브랜드_Q25*브랜드_Q0,모델출시년도_브랜드_Q25*판매구역_차량모델명_Q25,모델출시년도_브랜드_Q25*판매구역_브랜드_Q0,모델출시년도_브랜드_Q25*차량모델명_브랜드_Q25,모델출시년도_브랜드_Q50*주행거리,모델출시년도_브랜드_Q50*판매구역_Q0,모델출시년도_브랜드_Q50*차량모델명_Q25,모델출시년도_브랜드_Q50*브랜드_Q0,모델출시년도_브랜드_Q50*판매구역_차량모델명_Q50,모델출시년도_브랜드_Q50*판매구역_브랜드_Q0,모델출시년도_브랜드_Q50*차량모델명_브랜드_Q25,모델출시년도_브랜드_Q75*주행거리,모델출시년도_브랜드_Q75*판매구역_Q0,모델출시년도_브랜드_Q75*차량모델명_Q50,모델출시년도_브랜드_Q75*브랜드_Q0,모델출시년도_브랜드_Q75*판매구역_차량모델명_Q50,모델출시년도_브랜드_Q75*판매구역_브랜드_Q0,모델출시년도_브랜드_Q75*차량모델명_브랜드_Q50,모델출시년도_브랜드_Q100*주행거리,모델출시년도_브랜드_Q100*배기량,모델출시년도_브랜드_Q100*차량모델명_Q75,모델출시년도_브랜드_Q100*판매구역_차량모델명_Q75,모델출시년도_브랜드_Q100*판매구역_브랜드_Q25,모델출시년도_브랜드_Q100*판매구역_브랜드_Q50,모델출시년도_브랜드_Q100*차량모델명_브랜드_Q75,판매구역_차량모델명_모델출시년도_Q0*주행거리,판매구역_차량모델명_모델출시년도_Q0*판매구역_Q0,판매구역_차량모델명_모델출시년도_Q0*차량모델명_Q25,판매구역_차량모델명_모델출시년도_Q0*브랜드_Q0,판매구역_차량모델명_모델출시년도_Q0*판매구역_차량모델명_Q25,판매구역_차량모델명_모델출시년도_Q0*판매구역_브랜드_Q0,판매구역_차량모델명_모델출시년도_Q0*차량모델명_브랜드_Q25,판매구역_차량모델명_모델출시년도_Q25*주행거리,판매구역_차량모델명_모델출시년도_Q25*판매구역_Q0,판매구역_차량모델명_모델출시년도_Q25*브랜드_Q0,판매구역_차량모델명_모델출시년도_Q25*판매구역_브랜드_Q0,판매구역_차량모델명_모델출시년도_Q50*주행거리,판매구역_차량모델명_모델출시년도_Q50*판매구역_Q0,판매구역_차량모델명_모델출시년도_Q50*브랜드_Q0,판매구역_차량모델명_모델출시년도_Q50*판매구역_브랜드_Q0,판매구역_차량모델명_모델출시년도_Q75*주행거리,판매구역_차량모델명_모델출시년도_Q75*판매구역_Q0,판매구역_차량모델명_모델출시년도_Q75*브랜드_Q0,판매구역_차량모델명_모델출시년도_Q75*판매구역_브랜드_Q0,판매구역_차량모델명_모델출시년도_Q100*주행거리,판매구역_차량모델명_모델출시년도_Q100*배기량,판매구역_차량모델명_모델출시년도_Q100*판매구역_Q0,판매구역_차량모델명_모델출시년도_Q100*차량모델명_Q50,판매구역_차량모델명_모델출시년도_Q100*브랜드_Q0,판매구역_차량모델명_모델출시년도_Q100*판매구역_차량모델명_Q50,판매구역_차량모델명_모델출시년도_Q100*판매구역_브랜드_Q0,판매구역_차량모델명_모델출시년도_Q100*차량모델명_브랜드_Q50,판매구역_차량모델명_브랜드_Q0*주행거리,판매구역_차량모델명_브랜드_Q0*모델출시년도_Q0,판매구역_차량모델명_브랜드_Q0*모델출시년도_Q25,판매구역_차량모델명_브랜드_Q0*모델출시년도_Q50,판매구역_차량모델명_브랜드_Q0*브랜드_Q0,판매구역_차량모델명_브랜드_Q0*판매구역_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q0*판매구역_모델출시년도_Q25,판매구역_차량모델명_브랜드_Q0*판매구역_모델출시년도_Q50,판매구역_차량모델명_브랜드_Q0*판매구역_브랜드_Q0,판매구역_차량모델명_브랜드_Q0*차량모델명_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q0*모델출시년도_브랜드_Q0,판매구역_차량모델명_브랜드_Q25*주행거리,판매구역_차량모델명_브랜드_Q25*배기량,판매구역_차량모델명_브랜드_Q25*모델출시년도_Q0,판매구역_차량모델명_브랜드_Q25*모델출시년도_Q25,판매구역_차량모델명_브랜드_Q25*모델출시년도_Q50,판매구역_차량모델명_브랜드_Q25*모델출시년도_Q75,판매구역_차량모델명_브랜드_Q25*브랜드_Q0,판매구역_차량모델명_브랜드_Q25*판매구역_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q25*판매구역_모델출시년도_Q25,판매구역_차량모델명_브랜드_Q25*판매구역_모델출시년도_Q50,판매구역_차량모델명_브랜드_Q25*판매구역_모델출시년도_Q75,판매구역_차량모델명_브랜드_Q25*판매구역_브랜드_Q0,판매구역_차량모델명_브랜드_Q25*차량모델명_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q25*모델출시년도_브랜드_Q0,판매구역_차량모델명_브랜드_Q25*모델출시년도_브랜드_Q25,판매구역_차량모델명_브랜드_Q25*판매구역_차량모델명_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q50*주행거리,판매구역_차량모델명_브랜드_Q50*배기량,판매구역_차량모델명_브랜드_Q50*모델출시년도_Q25,판매구역_차량모델명_브랜드_Q50*모델출시년도_Q50,판매구역_차량모델명_브랜드_Q50*모델출시년도_Q75,판매구역_차량모델명_브랜드_Q50*브랜드_Q0,판매구역_차량모델명_브랜드_Q50*판매구역_모델출시년도_Q0,판매구역_차량모델명_브랜드_Q50*판매구역_모델출시년도_Q25,판매구역_차량모델명_브랜드_Q50*판매구역_모델출시년도_Q50,판매구역_차량모델명_브랜드_Q50*판매구역_모델출시년도_Q75,판매구역_차량모델명_브랜드_Q50*판매구역_브랜드_Q0,판매구역_차량모델명_브랜드_Q50*차량모델명_모델출시년도_Q75,판매구역_차량모델명_브랜드_Q50*모델출시년도_브랜드_Q50,판매구역_차량모델명_브랜드_Q50*모델출시년도_브랜드_Q75,판매구역_차량모델명_브랜드_Q50*판매구역_차량모델명_모델출시년도_Q100,판매구역_차량모델명_브랜드_Q75*주행거리,판매구역_차량모델명_브랜드_Q75*배기량,판매구역_차량모델명_브랜드_Q75*판매구역_브랜드_Q25,판매구역_차량모델명_브랜드_Q75*차량모델명_모델출시년도_Q100,판매구역_차량모델명_브랜드_Q75*모델출시년도_브랜드_Q100,판매구역_차량모델명_브랜드_Q100*배기량,판매구역_차량모델명_브랜드_Q100*브랜드_Q25,판매구역_차량모델명_브랜드_Q100*브랜드_Q50,판매구역_차량모델명_브랜드_Q100*판매구역_모델출시년도_Q100,판매구역_차량모델명_브랜드_Q100*판매구역_브랜드_Q25,판매구역_차량모델명_브랜드_Q100*판매구역_브랜드_Q50,판매구역_모델출시년도_브랜드_Q0*주행거리,판매구역_모델출시년도_브랜드_Q0*판매구역_Q0,판매구역_모델출시년도_브랜드_Q0*차량모델명_Q25,판매구역_모델출시년도_브랜드_Q0*브랜드_Q0,판매구역_모델출시년도_브랜드_Q0*판매구역_차량모델명_Q0,판매구역_모델출시년도_브랜드_Q0*판매구역_차량모델명_Q25,판매구역_모델출시년도_브랜드_Q0*판매구역_브랜드_Q0,판매구역_모델출시년도_브랜드_Q0*차량모델명_브랜드_Q25,판매구역_모델출시년도_브랜드_Q0*판매구역_차량모델명_브랜드_Q0,판매구역_모델출시년도_브랜드_Q0*판매구역_차량모델명_브랜드_Q25,판매구역_모델출시년도_브랜드_Q25*주행거리,판매구역_모델출시년도_브랜드_Q25*판매구역_Q0,판매구역_모델출시년도_브랜드_Q25*차량모델명_Q25,판매구역_모델출시년도_브랜드_Q25*브랜드_Q0,판매구역_모델출시년도_브랜드_Q25*판매구역_브랜드_Q0,판매구역_모델출시년도_브랜드_Q25*차량모델명_브랜드_Q25,판매구역_모델출시년도_브랜드_Q50*주행거리,판매구역_모델출시년도_브랜드_Q50*판매구역_Q0,판매구역_모델출시년도_브랜드_Q50*차량모델명_Q25,판매구역_모델출시년도_브랜드_Q50*브랜드_Q0,판매구역_모델출시년도_브랜드_Q50*판매구역_브랜드_Q0,판매구역_모델출시년도_브랜드_Q50*차량모델명_브랜드_Q25,판매구역_모델출시년도_브랜드_Q75*주행거리,판매구역_모델출시년도_브랜드_Q75*판매구역_Q0,판매구역_모델출시년도_브랜드_Q75*차량모델명_Q50,판매구역_모델출시년도_브랜드_Q75*브랜드_Q0,판매구역_모델출시년도_브랜드_Q75*판매구역_차량모델명_Q50,판매구역_모델출시년도_브랜드_Q75*판매구역_브랜드_Q0,판매구역_모델출시년도_브랜드_Q75*차량모델명_브랜드_Q50,판매구역_모델출시년도_브랜드_Q75*판매구역_차량모델명_브랜드_Q50,판매구역_모델출시년도_브랜드_Q100*주행거리,판매구역_모델출시년도_브랜드_Q100*배기량,판매구역_모델출시년도_브랜드_Q100*판매구역_Q0,판매구역_모델출시년도_브랜드_Q100*차량모델명_Q50,판매구역_모델출시년도_브랜드_Q100*브랜드_Q0,판매구역_모델출시년도_브랜드_Q100*판매구역_차량모델명_Q50,판매구역_모델출시년도_브랜드_Q100*판매구역_브랜드_Q0,판매구역_모델출시년도_브랜드_Q100*차량모델명_브랜드_Q50,판매구역_모델출시년도_브랜드_Q100*판매구역_차량모델명_브랜드_Q50,차량모델명_모델출시년도_브랜드_Q0*주행거리,차량모델명_모델출시년도_브랜드_Q0*판매구역_Q0,차량모델명_모델출시년도_브랜드_Q0*차량모델명_Q25,차량모델명_모델출시년도_브랜드_Q0*모델출시년도_Q0,차량모델명_모델출시년도_브랜드_Q0*브랜드_Q0,차량모델명_모델출시년도_브랜드_Q0*판매구역_차량모델명_Q0,차량모델명_모델출시년도_브랜드_Q0*판매구역_차량모델명_Q25,차량모델명_모델출시년도_브랜드_Q0*판매구역_모델출시년도_Q0,차량모델명_모델출시년도_브랜드_Q0*판매구역_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q0*차량모델명_브랜드_Q25,차량모델명_모델출시년도_브랜드_Q0*판매구역_차량모델명_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q0*판매구역_차량모델명_브랜드_Q25,차량모델명_모델출시년도_브랜드_Q25*주행거리,차량모델명_모델출시년도_브랜드_Q25*판매구역_Q0,차량모델명_모델출시년도_브랜드_Q25*브랜드_Q0,차량모델명_모델출시년도_브랜드_Q25*판매구역_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q50*주행거리,차량모델명_모델출시년도_브랜드_Q50*판매구역_Q0,차량모델명_모델출시년도_브랜드_Q50*브랜드_Q0,차량모델명_모델출시년도_브랜드_Q50*판매구역_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q75*주행거리,차량모델명_모델출시년도_브랜드_Q75*판매구역_Q0,차량모델명_모델출시년도_브랜드_Q75*브랜드_Q0,차량모델명_모델출시년도_브랜드_Q75*판매구역_차량모델명_Q50,차량모델명_모델출시년도_브랜드_Q75*판매구역_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q75*판매구역_차량모델명_브랜드_Q50,차량모델명_모델출시년도_브랜드_Q100*주행거리,차량모델명_모델출시년도_브랜드_Q100*배기량,차량모델명_모델출시년도_브랜드_Q100*브랜드_Q0,차량모델명_모델출시년도_브랜드_Q100*판매구역_차량모델명_Q75,차량모델명_모델출시년도_브랜드_Q100*판매구역_브랜드_Q0,차량모델명_모델출시년도_브랜드_Q100*판매구역_브랜드_Q25,차량모델명_모델출시년도_브랜드_Q100*판매구역_차량모델명_브랜드_Q75,segment
0,2018,2014,fabia,KAT,SLA,85231.0,999.0,0,0,1,0,0,51.74,0,4,0,체코,유럽,0.262364,24.7,43.55,77.87,156.0,2.21,18.785,40.95,58.305,125.25,1.17,55.6725,77.87,103.87,156.0,2.07,29.77,57.85,90.8375,156.0,3.12,20.005,45.37,57.2,114.59,26.78,53.76,75.4,101.27,156.0,3.12,32.37,55.77,92.3,156.0,12.87,46.67,55.25,64.87,118.17,2.21,18.785,40.95,58.305,125.25,12.87,46.67,55.25,64.87,118.17,33.8,47.6775,52.585,61.5875,103.48,3.12,20.005,45.37,57.2,114.59,33.8,47.6775,52.585,61.5875,103.48,12.87,46.67,55.25,64.87,118.17,110800.3,188360.51,1601064.0,18766.215,24.4205,3490209.45,40909.05,53.235,4969393.0,58246.695,99720.27,1.521,2.5857,21.97845,4745023.0,72.37425,1045.807912,2279.788875,6636937.97,101.231,1462.78795,3188.7765,8852943.97,103766.13,135.031,1951.19795,4253.4765,3853.2,6793.8,12147.72,176428.17,4.5747,38.88495,84.7665,2.4219,115.242075,161.1909,29740.23,3728.6925,7245.7125,11377.396875,265920.72,3.6504,173.6982,242.9544,6.4584,1705046.0,19984.995,23.40585,1113.728363,1557.78935,2077.91935,41.41035,3866930.47,45324.63,2525.861325,3532.9619,4712.5819,93.9159,4875213.2,57142.8,114475.41,3411.3443,6629.0315,2282486.18,59.1838,503.0623,31.3326,55.4346,83.5536,535.7339,1215.0086,4582018.56,1009.8816,2201.472,111.2832,167.7312,1075.4688,2439.0912,6426417.4,98.02,1416.389,3087.63,156.078,235.248,1508.377,3420.898,8631343.37,101168.73,131.651,1902.35695,4147.0065,2025.90635,4594.6199,13296036.0,155844.0,9095.58,19539.0,4644.12,9024.6,14170.65,17876.04,265920.72,77.064,6.8952,58.6092,127.764,3.6504,173.6982,242.9544,324.0744,9.7344,62.4156,141.5544,83.5536,167.7312,235.248,315.9624,32337.63,799.539,1887.33285,1851.564,3709.2783,5049.72,100.9944,55714.23,6390.6843,8700.12,11560.575,14398.8,3853.2,6793.8,12147.72,24336.0,1096922.97,16.731,241.76295,15.0579,26.6409,40.1544,257.46435,344.6586,40.1544,3977730.77,60.671,96.6069,145.6104,4709012.75,71.825,114.3675,172.38,5528934.97,84.331,134.2809,2943.1519,202.3944,16.125245,118051.83,244.6119,6759.324,368.6904,3825.1629,188360.51,2.5857,4.5747,59.1838,6.8952,1601064.0,18766.215,24.4205,21.97845,1045.807912,1462.78795,1951.19795,38.88495,503.0623,1009.8816,1416.389,1902.35695,58.6092,241.76295,3490209.45,40909.05,53.235,2279.788875,3188.7765,4253.4765,84.7665,2201.472,3087.63,4147.0065,127.764,4969393.0,58246.695,9095.58,1887.33285,3728.6925,7245.7125,11377.396875,19539.0,11560.575,1096922.97,16.731,241.76295,15.0579,26.6409,40.1544,257.46435,344.6586,40.1544,241.76295,3977730.77,60.671,876.69595,96.6069,933.63335,145.6104,876.69595,4709012.75,71.825,1037.87125,114.3675,2506.6925,172.38,1037.87125,5528934.97,84.331,2656.4265,134.2809,2943.1519,202.3944,2656.4265,10071747.27,118051.83,6889.90185,6759.324,3825.1629,6590.3409,6889.90185,2880807.8,43.94,634.933,69.966,676.169,105.456,634.933,4063601.0,61.98075,98.692425,148.7538,4481872.135,68.3605,108.85095,164.0652,5249164.0,80.06375,127.486125,192.153,8819703.88,103376.52,134.524,4237.506,214.2036,4694.8876,322.8576,4237.506,265920.72,3.6504,173.6982,242.9544,6.4584,83.5536,167.7312,235.248,9.7344,40.1544,40.1544,1705046.0,19984.995,23.40585,1113.728363,1557.78935,2077.91935,41.41035,535.7339,1075.4688,1508.377,2025.90635,62.4156,257.46435,257.46435,933.63335,676.169,3866930.47,45324.63,2525.861325,3532.9619,4712.5819,93.9159,1215.0086,2439.0912,3420.898,4594.6199,141.5544,2943.1519,2506.6925,2943.1519,4694.8876,4875213.2,57142.8,1851.564,6759.324,6759.324,114475.41,3411.3443,6629.0315,17876.04,3709.2783,6390.6843,2880807.8,43.94,634.933,69.966,105.456,676.169,105.456,634.933,105.456,676.169,4063601.0,61.98075,895.621837,98.692425,148.7538,895.621837,4481872.135,68.3605,987.809225,108.85095,164.0652,987.809225,5249164.0,80.06375,2522.008125,127.486125,2794.224875,192.153,2522.008125,2794.224875,8819703.88,103376.52,134.524,4237.506,214.2036,4694.8876,322.8576,4237.506,4694.8876,1096922.97,16.731,241.76295,15.0579,26.6409,40.1544,257.46435,344.6586,40.1544,241.76295,40.1544,257.46435,3977730.77,60.671,96.6069,145.6104,4709012.75,71.825,114.3675,172.38,5528934.97,84.331,134.2809,2943.1519,202.3944,2943.1519,16.125245,118051.83,244.6119,6759.324,368.6904,3825.1629,6759.324,skoda
1,2010,2006,auris,RKO,SWI,135000.0,1598.0,0,0,1,0,0,41.47,0,4,0,일본,아시아,1.095273,22.49,33.8,58.37,156.0,14.69,36.4,61.1,80.405,116.87,3.89,21.97,28.47,44.07,156.0,1.17,28.6,54.6,84.37,156.0,17.55,28.6975,53.885,72.02,109.07,9.75,22.815,28.6,43.94,124.8,8.45,28.275,46.67,72.71,155.87,14.69,24.57,28.6,36.4,64.92,14.69,36.4,61.1,80.405,116.87,14.69,25.87,33.15,44.07,85.67,17.55,23.205,28.275,37.8625,55.77,17.55,28.6975,53.885,72.02,109.07,17.55,27.625,36.855,51.025,85.67,14.69,24.57,28.6,36.4,64.92,403650.0,1983150.0,4914000.0,58167.2,108.836,8248500.0,97637.8,182.689,10854680.0,128487.19,525150.0,11.6311,57.1441,141.596,2965950.0,65.6903,799.708,1342.367,3843450.0,85.1253,1036.308,1739.517,5949450.0,70423.86,131.7693,1604.148,2692.677,3508.44,5272.8,9105.72,157950.0,17.1873,42.588,71.487,4.5513,25.7049,33.3099,45702.8,3342.482,6381.102,9860.3219,2369250.0,68.2695,385.5735,499.6485,20.5335,3874162.0,45858.605,111.633275,630.484075,817.017825,1264.698825,33.576075,7274475.0,86108.23,1183.85345,1534.10595,2374.71195,63.04545,9722700.0,115087.96,174293.86,3119.402,5955.222,1316250.0,143.2275,354.9,37.9275,11.4075,171.1125,279.800625,525.37875,3080025.0,830.466,1393.9965,26.69355,400.40325,654.733462,1229.386275,3861000.0,85.514,1041.04,1747.46,33.462,501.93,820.7485,1541.111,5931900.0,70216.12,131.3806,1599.416,2684.734,1260.96815,2367.7069,16848000.0,199430.4,10034.544,14585.376,3569.28,6814.08,10529.376,13611.936,1140750.0,190.0405,124.1305,307.58,516.295,32.8705,185.6465,240.5715,372.3915,148.2975,242.493875,455.32825,82.3875,192.78675,241.67,371.293,45183.45,635.90475,2273.451375,2036.3655,3083.95425,3528.72,238.92375,74578.66,5090.2969,5824.416,8497.6177,9074.208,3505.5163,5268.406,9098.1319,24315.72,1983150.0,43.9231,534.716,57.1441,17.1873,257.8095,421.566275,143.2275,124.1305,3316950.0,73.4643,28.7469,207.6165,3861000.0,85.514,33.462,241.67,4914000.0,108.836,42.588,1961.414,307.58,15.986186,103742.16,75.9564,4675.5384,548.574,1835.613,1983150.0,57.1441,17.1873,143.2275,124.1305,4914000.0,58167.2,108.836,141.596,799.708,1036.308,1604.148,42.588,354.9,830.466,1041.04,1599.416,307.58,534.716,8248500.0,97637.8,182.689,1342.367,1739.517,2692.677,71.487,1393.9965,1747.46,2684.734,516.295,10854680.0,128487.19,10034.544,2273.451375,3342.482,6381.102,9860.3219,14585.376,8497.6177,1983150.0,43.9231,534.716,57.1441,17.1873,257.8095,421.566275,143.2275,124.1305,534.716,3492450.0,77.3513,941.668,30.2679,742.404325,218.6015,941.668,4475250.0,99.1185,1206.66,38.7855,1786.28775,280.1175,1206.66,5949450.0,131.7693,2692.677,51.5619,2374.71195,372.3915,2692.677,11565450.0,136900.66,6888.29635,6169.9534,2422.31925,3998.2189,6888.29635,2369250.0,52.4745,638.82,20.5335,503.641125,148.2975,638.82,3132675.0,69.38295,27.14985,196.08225,3817125.0,84.54225,33.08175,238.92375,5111438.0,113.208875,44.299125,319.938125,7528950.0,89120.46,166.7523,3407.547,65.2509,3005.16645,471.2565,3407.547,2369250.0,68.2695,385.5735,499.6485,20.5335,171.1125,400.40325,501.93,148.2975,257.8095,257.8095,3874162.0,45858.605,111.633275,630.484075,817.017825,1264.698825,33.576075,279.800625,654.733462,820.7485,1260.96815,242.493875,421.566275,421.566275,742.404325,503.641125,7274475.0,86108.23,1183.85345,1534.10595,2374.71195,63.04545,525.37875,1229.386275,1541.111,2367.7069,455.32825,1961.414,1786.28775,2374.71195,3005.16645,9722700.0,115087.96,2036.3655,4675.5384,6169.9534,174293.86,3119.402,5955.222,13611.936,3083.95425,5090.2969,2369250.0,52.4745,638.82,20.5335,308.0025,503.641125,148.2975,638.82,308.0025,503.641125,3729375.0,82.59875,1005.55,32.32125,233.43125,1005.55,4975425.0,110.19645,1341.522,43.12035,311.42475,1341.522,6888375.0,152.56475,3117.6275,59.69925,2749.482125,431.16125,3117.6275,2749.482125,11565450.0,136900.66,256.1533,5234.437,100.2339,4616.32795,723.9115,5234.437,4616.32795,1983150.0,43.9231,534.716,57.1441,17.1873,257.8095,421.566275,143.2275,124.1305,534.716,257.8095,421.566275,3316950.0,73.4643,28.7469,207.6165,3861000.0,85.514,33.462,241.67,4914000.0,108.836,42.588,1961.414,307.58,1961.414,15.986186,103742.16,75.9564,4675.5384,548.574,1835.613,4675.5384,toyota
2,2002,2002,clk-klasa,GNI,WIE,255223.0,1796.0,0,0,1,0,0,17.81,1,0,0,독일,유럽,0.482426,21.97,40.755,73.45,156.0,9.49,23.27,31.2,37.6025,129.87,2.6,11.63,16.89,28.6,155.87,2.99,23.4,44.07,77.84,156.0,11.05,18.165,28.21,33.73,37.7,3.24,10.53,15.21,25.35,102.7,3.06,22.88,42.38,76.27,155.87,12.87,25.87,32.49,38.87,129.87,9.49,23.27,31.2,37.6025,129.87,7.15,23.4,29.77,36.27,155.87,12.87,23.355,29.77,35.49,37.7,11.05,18.165,28.21,33.73,37.7,12.35,23.92,29.89,33.67,63.7,12.87,25.87,32.49,38.87,129.87,413461.26,2422066.27,5939039.0,41792.92,37.6974,7962957.6,56035.2,50.544,9597023.0,67534.09,663579.8,4.212,24.674,60.502,2968243.0,18.8406,270.6301,362.856,4310716.47,27.3618,393.0303,526.968,7299377.8,51365.6,46.332,665.522,892.32,3424.4639,6352.48185,11448.6515,763116.77,28.3751,69.5773,93.288,7.774,34.7737,50.5011,42026.4,3038.958,5723.3709,10109.0808,2820214.15,28.73,128.5115,186.6345,33.0395,4636126.0,32624.34,47.229,211.25895,306.80685,519.519,54.31335,7199840.83,50665.16,328.0823,476.4669,806.806,84.3479,8608671.79,60579.08,67709.2,882.18,1661.439,826922.52,30.7476,75.3948,8.424,9.6876,35.802,58.8546,91.4004,2687498.19,245.0331,328.536,31.4847,116.3565,191.27745,297.0513,3881941.83,24.6402,353.9367,474.552,45.4779,168.0705,276.28965,429.0741,6469903.05,45528.6,41.067,589.8945,790.92,460.48275,715.1235,26211402.1,184449.2,3861.77675,13337.649,2403.18,4525.989,7994.168,3871.79,780982.38,67.2282,29.0394,71.2062,95.472,7.956,35.5878,51.6834,87.516,33.813,55.5849,86.3226,9.9144,32.2218,46.5426,77.571,41092.48,502.6736,860.3452,771.7424,862.576,2349.776,70.0128,76114.48,1597.726,4352.426,9905.1849,7832.929,3424.4639,6352.48185,11448.6515,24295.4569,3284720.01,20.8494,299.4849,33.462,38.4813,142.2135,233.78355,41.6988,39.3822,6602619.01,41.9094,77.3513,79.1622,8292195.27,52.6338,97.1451,99.4194,9920518.01,62.9694,116.2213,1096.5227,118.9422,17.316427,233246.52,388.3113,4380.5151,397.4022,2971.4256,2422066.27,24.674,28.3751,30.7476,29.0394,5939039.0,41792.92,37.6974,60.502,270.6301,393.0303,665.522,69.5773,75.3948,245.0331,353.9367,589.8945,71.2062,299.4849,7962957.6,56035.2,50.544,362.856,526.968,892.32,93.288,328.536,474.552,790.92,95.472,9597023.0,67534.09,3861.77675,860.3452,3038.958,5723.3709,10109.0808,13337.649,9905.1849,1824844.45,11.583,166.3805,18.59,21.3785,79.0075,129.87975,23.166,21.879,166.3805,5972218.2,37.908,544.518,69.966,425.061,71.604,544.518,7597988.71,48.2274,692.7479,89.0123,839.8117,91.0962,692.7479,9256938.21,58.7574,1131.624,108.4473,1023.1767,110.9862,1131.624,39781609.01,279942.52,5861.101675,5257.4951,3566.3056,6605.7706,5861.101675,3284720.01,20.8494,299.4849,38.4813,233.78355,39.3822,299.4849,5960733.0,37.8351,69.83145,71.4663,7597988.71,48.2274,89.0123,91.0962,9057864.0,57.4938,106.1151,108.5994,9621907.1,67709.2,61.074,1176.24,112.723,1063.517,115.362,1176.24,2820214.15,28.73,128.5115,186.6345,33.0395,35.802,116.3565,168.0705,33.813,142.2135,79.0075,4636126.0,32624.34,47.229,211.25895,306.80685,519.519,54.31335,58.8546,191.27745,276.28965,460.48275,55.5849,233.78355,129.87975,425.061,233.78355,7199840.83,50665.16,328.0823,476.4669,806.806,84.3479,91.4004,297.0513,429.0741,715.1235,86.3226,1096.5227,839.8117,1023.1767,1063.517,8608671.79,60579.08,771.7424,4380.5151,5257.4951,67709.2,882.18,1661.439,3871.79,862.576,1597.726,3152004.05,20.007,287.3845,36.9265,136.4675,224.33775,37.791,287.3845,136.4675,224.33775,6104934.0,38.7504,556.6184,71.5208,73.1952,556.6184,7628615.47,48.4218,695.5403,89.3711,91.4634,695.5403,8593358.0,54.5454,1050.504,100.6733,949.8307,103.0302,1050.504,949.8307,16257705.1,114405.2,103.194,1987.44,190.463,1796.977,194.922,1987.44,1796.977,3284720.01,20.8494,299.4849,33.462,38.4813,142.2135,233.78355,41.6988,39.3822,299.4849,142.2135,233.78355,6602619.01,41.9094,77.3513,79.1622,8292195.27,52.6338,97.1451,99.4194,9920518.01,62.9694,116.2213,1096.5227,118.9422,1096.5227,17.316427,233246.52,388.3113,4380.5151,397.4022,2971.4256,4380.5151,mercedes-benz
3,2006,2001,x-trail,EHX,WIE,238000.0,2184.0,0,1,0,0,0,18.2,0,5,0,일본,아시아,0.482426,21.97,40.755,73.45,156.0,8.71,78.91,97.11,116.87,155.87,2.6,10.4,15.47,22.09,132.47,2.86,35.1,50.57,81.9,155.87,18.2,88.335,106.59,116.935,134.55,3.64,8.97,12.155,16.9,64.87,3.64,32.37,51.74,75.27,140.27,8.71,17.81,19.37,22.88,29.77,8.71,78.91,97.11,116.87,155.87,3.64,9.62,13.65,19.37,29.77,18.2,19.5,20.8,22.685,24.57,18.2,88.335,106.59,116.935,134.55,3.64,10.53,12.22,18.85,24.57,8.71,17.81,19.37,22.88,29.77,385560.0,2072980.0,18780580.0,172339.44,127.8342,23112180.0,212088.24,157.3182,27815060.0,255244.08,618800.0,4.212,22.646,205.166,2475200.0,16.848,820.664,1009.944,3681860.0,25.0614,1220.7377,1502.2917,5257420.0,48244.56,35.7858,1743.1219,2145.1599,2910.3659,5398.81485,9729.9215,680680.0,24.9106,225.6826,277.7346,7.436,29.744,44.2442,76658.4,5471.037,7882.3459,12765.753,4331600.0,47.32,189.28,281.554,52.052,21023730.0,192923.64,229.671,918.684,1366.54245,1951.32015,252.6381,25368420.0,232792.56,1108.536,1648.9473,2354.5731,304.8474,27830530.0,255386.04,293857.2,4722.705,6804.1935,866320.0,31.7044,287.2324,9.464,10.4104,66.248,321.5394,387.9876,2134860.0,707.8227,871.0767,25.6542,163.254,792.36495,956.1123,2892890.0,19.6911,959.15105,1180.37205,34.7633,221.221,1073.711925,1295.60145,4022200.0,36909.6,27.378,1333.579,1641.159,1492.8615,1801.371,15439060.0,141676.08,7581.3569,10111.2869,2276.937,3280.4759,5312.853,8728.2585,866320.0,79.9708,31.7044,287.2324,353.4804,9.464,37.856,56.3108,80.4076,66.248,321.5394,387.9876,13.2496,32.6508,44.2442,61.516,70696.08,711.1689,3783.0819,3785.18595,4355.3835,2099.8419,117.8268,113000.16,6961.617,3356.3738,11732.3349,4882.7649,3081.7319,5716.70385,10302.8315,18581.5669,2072980.0,14.1102,687.3061,22.646,24.9106,158.522,769.39785,31.7044,31.7044,4238780.0,28.8522,50.9366,64.8284,4610060.0,31.3794,55.3982,70.5068,5445440.0,37.0656,65.4368,2438.7792,83.2832,15.773527,65017.68,85.1422,3481.15495,108.3628,963.6549,2072980.0,22.646,24.9106,31.7044,31.7044,18780580.0,172339.44,127.8342,205.166,820.664,1220.7377,1743.1219,225.6826,287.2324,707.8227,959.15105,1333.579,287.2324,687.3061,23112180.0,212088.24,157.3182,1009.944,1502.2917,2145.1599,277.7346,871.0767,1180.37205,1641.159,353.4804,27815060.0,255244.08,7581.3569,3783.0819,5471.037,7882.3459,12765.753,10111.2869,11732.3349,866320.0,5.8968,287.2324,9.464,10.4104,66.248,321.5394,13.2496,13.2496,287.2324,2289560.0,15.5844,759.1142,27.5132,849.7827,35.0168,759.1142,3248700.0,22.113,1077.1215,39.039,1454.9535,49.686,1077.1215,4610060.0,31.3794,1881.0207,55.3982,2064.6483,70.5068,1881.0207,7085260.0,65017.68,3479.2199,3481.15495,963.6549,1540.2998,3479.2199,4331600.0,29.484,1436.162,52.052,1607.697,66.248,1436.162,4641000.0,31.59,55.77,70.98,4950400.0,33.696,59.488,75.712,5399030.0,36.7497,64.8791,82.5734,5847660.0,53660.88,39.8034,2385.9927,70.2702,2618.9163,89.4348,2385.9927,4331600.0,47.32,189.28,281.554,52.052,66.248,163.254,221.221,66.248,158.522,66.248,21023730.0,192923.64,229.671,918.684,1366.54245,1951.32015,252.6381,321.5394,792.36495,1073.711925,1492.8615,321.5394,769.39785,321.5394,849.7827,1607.697,25368420.0,232792.56,1108.536,1648.9473,2354.5731,304.8474,387.9876,956.1123,1295.60145,1801.371,387.9876,2438.7792,1454.9535,2064.6483,2618.9163,27830530.0,255386.04,3785.18595,3481.15495,3481.15495,293857.2,4722.705,6804.1935,8728.2585,4355.3835,6961.617,866320.0,5.8968,287.2324,10.4104,66.248,321.5394,13.2496,287.2324,66.248,321.5394,2506140.0,17.0586,830.9223,30.1158,38.3292,830.9223,2908360.0,19.7964,964.2802,34.9492,44.4808,964.2802,4486300.0,30.537,1830.5235,53.911,2009.2215,68.614,1830.5235,2009.2215,5847660.0,53660.88,39.8034,2385.9927,70.2702,2618.9163,89.4348,2385.9927,2618.9163,2072980.0,14.1102,687.3061,22.646,24.9106,158.522,769.39785,31.7044,31.7044,687.3061,158.522,769.39785,4238780.0,28.8522,50.9366,64.8284,4610060.0,31.3794,55.3982,70.5068,5445440.0,37.0656,65.4368,2438.7792,83.2832,2438.7792,15.773527,65017.68,85.1422,3481.15495,108.3628,963.6549,3481.15495,nissan
4,2007,2007,bravo,OSW,MAL,251000.0,1910.0,0,1,0,0,0,17.55,1,0,0,이탈리아,유럽,0.732368,24.57,42.77,74.1,156.0,8.45,18.07,21.45,24.7975,38.87,5.46,25.77,37.57,51.87,156.0,3.04,17.55,34.125,65.0,143.89,9.1,17.68,21.97,24.96,32.37,9.1,25.09,38.87,55.9,156.0,4.81,18.13,35.1,78.26,128.63,8.45,18.07,21.45,24.7975,38.87,8.45,18.07,21.45,24.7975,38.87,8.45,18.07,21.45,24.7975,38.87,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,9.1,17.68,21.97,24.96,32.37,8.45,18.07,21.45,24.7975,38.87,522080.0,2120950.0,4535570.0,34513.7,37.5856,5383950.0,40969.5,44.616,6224172.0,47363.225,1370460.0,11.3568,46.137,98.6622,6468270.0,53.6016,465.6639,552.7665,9430070.0,78.1456,678.8899,805.8765,13019370.0,99071.7,107.8896,937.2909,1112.6115,3832.92,6672.12,11559.6,763040.0,25.688,54.9328,65.208,16.5984,78.3408,114.2128,33520.5,682.1685,1326.43875,2526.55,2284100.0,49.686,234.507,341.887,27.664,4437680.0,33768.8,96.5328,455.6136,664.2376,917.0616,53.7472,5514470.0,41962.7,566.1669,825.4129,1139.5839,66.7888,6264960.0,47673.6,61826.7,568.0935,1104.62625,2284100.0,76.895,164.437,49.686,27.664,82.81,160.888,199.927,6297590.0,453.3763,538.1805,76.2736,228.319,443.5912,551.2273,9756370.0,80.8496,702.3809,833.7615,118.1648,353.717,687.2216,853.9739,14030900.0,106769.0,116.272,1010.113,1199.055,988.312,1228.123,39156000.0,297960.0,3868.41,6063.72,2737.8,5323.5,10140.0,5049.72,1207310.0,118.1817,40.6445,86.9167,103.1745,26.2626,123.9537,180.7117,249.4947,43.771,85.0408,105.6757,43.771,120.6829,186.9647,268.879,34628.3,445.4541,449.578675,452.5248,586.8681,2828.28,87.2053,67041.0,1136.187,5475.6,3041.9662,12208.56,3160.4391,5501.5051,9531.483,20066.28,2120950.0,17.576,152.6915,46.137,25.688,76.895,149.396,76.895,40.6445,4535570.0,37.5856,54.9328,86.9167,5383950.0,44.616,65.208,103.1745,6224172.5,51.5788,75.3844,544.801075,119.275975,16.093431,74241.7,118.1648,970.1952,186.9647,704.7131,2120950.0,46.137,25.688,76.895,40.6445,4535570.0,34513.7,37.5856,98.6622,465.6639,678.8899,937.2909,54.9328,164.437,453.3763,702.3809,1010.113,86.9167,152.6915,5383950.0,40969.5,44.616,552.7665,805.8765,1112.6115,65.208,538.1805,833.7615,1199.055,103.1745,6224172.0,47363.225,3868.41,449.578675,682.1685,1326.43875,2526.55,6063.72,3041.9662,2120950.0,17.576,152.6915,46.137,25.688,76.895,149.396,76.895,40.6445,152.6915,4535570.0,37.5856,326.5249,54.9328,319.4776,86.9167,326.5249,5383950.0,44.616,387.6015,65.208,471.2565,103.1745,387.6015,6224172.5,51.5788,531.906375,75.3844,544.801075,119.275975,531.906375,9756370.0,74241.7,963.878825,970.1952,704.7131,1364.337,963.878825,2284100.0,18.928,164.437,27.664,160.888,43.771,164.437,4437680.0,36.7744,53.7472,85.0408,5514470.0,45.6976,66.7888,105.6757,6264960.0,51.9168,75.8784,120.0576,8124870.0,61826.7,67.3296,694.3365,98.4048,711.1689,155.6997,694.3365,2284100.0,49.686,234.507,341.887,27.664,82.81,228.319,353.717,43.771,76.895,76.895,4437680.0,33768.8,96.5328,455.6136,664.2376,917.0616,53.7472,160.888,443.5912,687.2216,988.312,85.0408,149.396,149.396,319.4776,160.888,5514470.0,41962.7,566.1669,825.4129,1139.5839,66.7888,199.927,551.2273,853.9739,1228.123,105.6757,544.801075,471.2565,544.801075,711.1689,6264960.0,47673.6,452.5248,970.1952,970.1952,61826.7,568.0935,1104.62625,5049.72,586.8681,1136.187,2284100.0,18.928,164.437,27.664,82.81,160.888,43.771,164.437,82.81,160.888,4437680.0,36.7744,319.4776,53.7472,85.0408,319.4776,5514470.0,45.6976,396.9979,66.7888,105.6757,396.9979,6264960.0,51.9168,535.392,75.8784,548.3712,120.0576,535.392,548.3712,8124870.0,61826.7,67.3296,694.3365,98.4048,711.1689,155.6997,694.3365,711.1689,2120950.0,17.576,152.6915,46.137,25.688,76.895,149.396,76.895,40.6445,152.6915,76.895,149.396,4535570.0,37.5856,54.9328,86.9167,5383950.0,44.616,65.208,103.1745,6224172.5,51.5788,75.3844,544.801075,119.275975,544.801075,16.093431,74241.7,118.1648,970.1952,186.9647,704.7131,970.1952,fiat


<br></br>

# Modeling

In [80]:
import os
def mkdir(paths):
    if type(paths)==str:
        paths = [paths]
    for path in paths:
        if not os.path.isdir(path):
            print('> Create Folder: {}'.format(path))
            os.mkdir(path)

In [81]:
## dummy_features는 한가지만 속함
# X[dummy_features].apply(lambda x: np.sum(x),axis=1).value_counts()

def add_fuel_type(data,dummy_features):
    d = data.copy()
    d['fuel_type'] = d[dummy_features].apply(
        lambda x: dummy_features[np.where(x==1)[0][0]],axis=1)
    d.drop(columns=dummy_features,inplace=True)
    return d

In [82]:
mkdir('./model_checkpoints')
mkdir('./model_checkpoints/segment_catboost')
mkdir('./model_checkpoints/segment_weightedensemble')

<br>

## CatBoost
- public score : 6.1375542174

In [83]:
gc.collect()

9

In [84]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool



In [85]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [86]:
%%time
# 30분

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

X = add_fuel_type(X,dummy_features)
new_cat_features = cat_features + ['fuel_type']

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

s_i = 0
for segment in pbar:
    s_i+=1
    
    # segment에 해당하는 데이터추출
    _X = X[X.segment==segment].drop('segment',axis=1)
    _y = y[X.segment==segment]
    
    # kfold
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # unique인 컬럼 제외
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
        
    # categorical feature에서 unique인 컬럼을 제외
    fixed_cat_features = [col for col in new_cat_features if col in _X.columns]
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        
        # kfold dataset
        X_tr, X_va = _X.iloc[tr_idx], _X.iloc[val_idx]
        y_tr, y_va = _y.iloc[tr_idx], _y.iloc[val_idx]

        # progress
        progress = 'Segment: {}, Length: Train({}), Validation({}), KFold: {}/{}'\
            .format(segment,len(X_tr),len(X_va),k,CFG.N_SPLITS)
        pbar.set_description(progress)

        # dataset
        train_dataset = Pool(X_tr,y_tr,cat_features=fixed_cat_features)
        valid_dataset = Pool(X_va,y_va,cat_features=fixed_cat_features)

        # define the model
        model = CatBoostRegressor(
            loss_function='MAE',
            random_state=CFG.SEED,
            iterations=CFG.EPOCHS,
            learning_rate=CFG.LR,
            allow_writing_files=False,
        )

        # fit the model
        model.fit(
            train_dataset,
            eval_set=valid_dataset,
            early_stopping_rounds=CFG.ES,
            verbose=0,
            #metric_period=CFG.EPOCHS//5,
        )

        # save the model
        model.save_model(f'./model_checkpoints/segment_catboost/{segment}_k{k}.cbm')

        # calculate the score
        y_pred = model.predict(valid_dataset).flatten()
        y_true = y_va.values
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {'cat_features':fixed_cat_features,'features':_X.columns.tolist()}

Segment: mitsubishi, Length: Train(445), Validation(111), KFold: 5/5: 100%|██████████| 20/20 [4:56:43<00:00, 890.18s/it]    

CPU times: user 18h 50min 13s, sys: 5h 2min 42s, total: 23h 52min 56s
Wall time: 4h 56min 47s





In [87]:
import pickle
with open('./model_checkpoints/segment_cat_models_brand_kf.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_feature_info_brand_kf.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_cat_scores_brand_kf.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [88]:
# pd.DataFrame(
#     np.array(scores).reshape(100,5),
#     columns=['segment','k','n_tr','n_val','score']
# ).sort_values(['segment','k'])

In [89]:
# inference
X = train_fn.drop(target_feature,axis=1)
X = add_fuel_type(X,dummy_features)
y = train_fn[target_feature]

X_test = add_fuel_type(test_fn,dummy_features)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data = X[X.segment==segment][feature_info[segment]['features']]
    train_dataset = Pool(train_data,cat_features=feature_info[segment]['cat_features'])
    # (2) test
    test_data = X_test[X_test.segment==segment][feature_info[segment]['features']]
    test_dataset = Pool(test_data,cat_features=feature_info[segment]['cat_features'])
    
    ## model
    kfold_models = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'segment':segment,
        'true':y[X.segment==segment].values.flatten(),
        'pred':np.mean([model.predict(train_dataset) for model in kfold_models],axis=0),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'segment':segment,
        'pred':np.mean([model.predict(test_dataset) for model in kfold_models],axis=0),
    })
    te_pred_df.index = test_data.index
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

100%|██████████| 20/20 [00:07<00:00,  2.70it/s]


In [90]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

4.527716747927118

In [91]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

Unnamed: 0,segment,pred
0,mazda,80.722174
1,ford,28.22706
2,volkswagen,96.743499
3,renault,123.139001
4,volvo,51.931781


In [92]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
submit.to_csv('./out/7_catboost_segment_브랜드_kfold.csv',index=False)

<br>

## Weighted Ensemble
- public score : 

In [None]:
import pandas as pd
import warnings

class OneHotEncoder:
    def __init__(self):
        pass
    
    def fit(self,data,columns):
        self.transform_list = []
        for col in columns:
            for i,value in enumerate(sorted(data[col].unique())):
                if i>0:
                    self.transform_list.append([col,value])
        
    def transform(self,data):
        warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
        new_data = data.copy()
        for col,value in self.transform_list:
            new_data[f'{col}_{value}'] = np.where(new_data[col]==value,1,0)
        drop_columns = pd.unique(np.array(self.transform_list)[:,0])
        new_data.drop(columns=drop_columns,inplace=True)
        return new_data

In [None]:
from copy import deepcopy
from sklearn.base import BaseEstimator, RegressorMixin
from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
import warnings
import time
import pickle

class WeightedEnsembleRegressor(BaseEstimator, RegressorMixin):
    def __init__(self,weight=['equal','balanced']):
        super().__init__()
        
        assert weight in ['equal','balanced'], \
            "weight must be one of ['equal','balanced']"
        self.weight = weight
        self._get_regressors()
    
    def _get_regressors(self):
        max_depth = 10
        n_jobs = -1
        
        params_catboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.ES,
            'learning_rate' : CFG.LR,
            'iterations' : CFG.EPOCHS,
            'loss_function': 'MAE',
            'grow_policy' : 'Lossguide', # 'SymmetricTree','Depthwise'
            'use_best_model' : True,
            'allow_writing_files' : False,
            'verbose' : 0,
            'max_depth': max_depth,
            'l2_leaf_reg' : 1,
        }
    
        params_xgboost = {
            'random_state':CFG.SEED,
            'early_stopping_rounds' : CFG.XGB_ES,
            'learning_rate' : CFG.XGB_LR,
            'n_estimators' : CFG.XGB_EPOCHS,
            'objective': 'reg:absoluteerror',
            'verbosity' : 0,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
    
        params_lgb = {
            'objective': 'regression',
            'random_state':CFG.SEED,
            'early_stopping_round' : CFG.ES,
            'learning_rate' : CFG.LR,
            'n_estimators' : CFG.EPOCHS,
            'metric': 'mean_absolute_error',
            'verbosity' : -1,
            'max_depth': max_depth,
            'n_jobs' : n_jobs,
        }
        
        self.regressors = [
            CatBoostRegressor(**params_catboost),
            XGBRegressor(**params_xgboost),
            LGBMRegressor(**params_lgb),
        ]
        self.regressors_name = ['CatBoost','XGBoost','LightGBM']
    
    def fit(self,X,y,eval_set,oh_set,cat_features,verbose=1):
        assert len(eval_set)==1, \
            "eval_set length must be 1. len(eval_set)={}".format(len(eval_set))
        assert len(oh_set)==1, \
            "oh_set length must be 1. len(oh_set)={}".format(len(oh_set))
        X_val, y_val = eval_set[0]
        X_oh, X_val_oh = oh_set[0]
        
        self.cat_features = cat_features
        self.weights = []
        self.fitting_elapsed = []
        if verbose:
            pbar = tqdm(zip(self.regressors_name,self.regressors),total=len(self.regressors))
        else:
            pbar = zip(self.regressors_name,self.regressors)
        for regressor_name,regressor in pbar:
            s = time.time()
            if verbose:
                pbar.set_description(name)
            if regressor_name=='CatBoost':
                train_dataset = Pool(X,y,cat_features=cat_features)
                val_dataset   = Pool(X_val,y_val,cat_features=cat_features)
                regressor.fit(
                    train_dataset,
                    eval_set=val_dataset,
                    #metric_period=CFG.EPOCHS//5,
                )
                val_pred = regressor.predict(val_dataset)
            elif regressor_name=='XGBoost':
                regressor.fit(
                    X_oh,y,
                    eval_set=[(X_val_oh,y_val)],
                    verbose=0,
                )
                val_pred = regressor.predict(X_val_oh)
            elif regressor_name=='LightGBM':
                warnings.filterwarnings("ignore", category=UserWarning)
                X_tmp = X.copy()
                X_val_tmp = X_val.copy()
                for col in cat_features:
                    X_tmp[col]     = X_tmp[col]    .astype('category')
                    X_val_tmp[col] = X_val_tmp[col].astype('category')
                regressor.fit(
                    X_tmp,y,
                    eval_set=[(X_val_tmp,y_val)],
                    verbose=-1,
                )
                val_pred = regressor.predict(X_val_tmp)
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))
                
            score = mean_absolute_error(y_pred=val_pred,y_true=y_val)
            e = time.time()
            
            self.weights.append(1/score)
            self.fitting_elapsed.append(e-s)
        
        if self.weight=='equal':
            self.weights = np.array([1.0 for _ in self.regressors])
        self.weights /= sum(self.weights)
                
    def predict(self,X,X_oh):
        assert len(X)==len(X_oh), \
            "X and X_oh must be same length"
        
        pred_list = []
        for regressor_name,regressor in zip(self.regressors_name,self.regressors):
            if regressor_name=='CatBoost':
                dataset = Pool(X,cat_features=self.cat_features)
            elif regressor_name=='XGBoost':
                dataset = X_oh.copy()
            elif regressor_name=='LightGBM':
                dataset = X.copy()
                for col in self.cat_features:
                    dataset[col] = dataset[col].astype('category')
            else:
                raise ValueError('Unknown Regressor: {}'.format(regressor_name))
            
            y_pred = regressor.predict(dataset)
            y_pred = np.array(y_pred).flatten()
            pred_list.append(y_pred)
            
        final_pred = np.zeros(len(X))
        for pred,weight in zip(pred_list,self.weights):
            final_pred += np.array(pred)*weight
            
        return final_pred
    
    def save_model(self,path):
        save_dict = {
            'cat_features' : self.cat_features,
            'weights' : self.weights,
            'fitting_elapsed' : self.fitting_elapsed,
            'regressors' : self.regressors,
        }
        with open(path, 'wb') as f:
            pickle.dump(save_dict, f, protocol=pickle.HIGHEST_PROTOCOL)
            
    def load_model(self,path):
        with open(path, 'rb') as f:
            save_dict = pickle.load(f)
            self.cat_features = save_dict['cat_features']
            self.weights = save_dict['weights']
            self.fitting_elapsed = save_dict['fitting_elapsed']
            self.regressors = save_dict['regressors']

In [None]:
from sklearn.model_selection import KFold

In [None]:
gc.collect()

In [None]:
train_fn = train_df7.copy()
test_fn  = test_df7 .copy()

In [None]:
%%time
# 6시간

X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)

segment_list = X['segment'].unique()

models = {}
feature_info = {}
scores = []
pbar = tqdm(segment_list)

s_i = 0
for segment in pbar:
    s_i+=1
    
    # segment에 해당하는 데이터추출
    _X    = X   [X   .segment==segment].drop('segment',axis=1)
    _X_oh = X_oh[X_oh.segment==segment].drop('segment',axis=1)
    _y    = y   [X   .segment==segment]
    
    # kfold
    kf = KFold(n_splits=CFG.N_SPLITS,random_state=1000*s_i+CFG.SEED,shuffle=True)
    
    # unique인 컬럼 제외
    # (1) X
    unique_info = _X.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X = _X.drop(unique_cols,axis=1)
    # (2) X_oh
    unique_info = _X_oh.nunique()
    unique_cols = unique_info[unique_info==1].index.tolist()
    if len(unique_cols)>0:
        _X_oh = _X_oh.drop(unique_cols,axis=1)
        
    # categorical feature에서 unique인 컬럼을 제외
    fixed_cat_features = [col for col in cat_features if col in _X.columns]
    
    _models = []
    _scores = []
    k=0
    for tr_idx,val_idx in kf.split(_X,_y):
        k+=1
        
        # kfold dataset
        X_tr   , X_va    = _X   .iloc[tr_idx], _X   .iloc[val_idx]
        X_tr_oh, X_va_oh = _X_oh.iloc[tr_idx], _X_oh.iloc[val_idx]
        y_tr   , y_va    = _y   .iloc[tr_idx], _y   .iloc[val_idx]

        # progress
        progress = 'Segment: {}, Length: Train({}), Validation({}), KFold: {}/{}'\
            .format(segment,len(X_tr),len(X_va),k,CFG.N_SPLITS)
        pbar.set_description(progress)

        # define the model
        ensemble_model = WeightedEnsembleRegressor(weight='equal')

        # fit the model
        ensemble_model.fit(
            X_tr,y_tr,
            eval_set=[(X_va,y_va)],
            oh_set=[(X_tr_oh,X_va_oh)],
            cat_features=fixed_cat_features,
            verbose=0,
        )

        # save the model
        ensemble_model.save_model(f'./model_checkpoints/segment_weightedensemble/{segment}_k{k}.pickle')

        # calculate the score
        y_pred = ensemble_model.predict(X_va,X_va_oh).flatten()
        y_true = y_va.values
        score = mean_absolute_error(y_true=y_true,y_pred=y_pred)
        
        # append inner loop
        _models.append(ensemble_model)
        _scores.append([segment,k,len(X_tr),len(X_va),score])

    # append outer loop
    models[segment] = _models
    scores.append(_scores)
    feature_info[segment] = {
        'cat_features':fixed_cat_features,
        'features':_X.columns.tolist(),
        'oh_features':_X_oh.columns.tolist(),
    }

In [None]:
import pickle
with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'wb') as f:
	pickle.dump(models, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'wb') as f:
	pickle.dump(feature_info, f, protocol=pickle.HIGHEST_PROTOCOL)
with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'wb') as f:
	pickle.dump(scores, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# import pickle
# with open('./model_checkpoints/segment_weiens_models_brand.pkl', 'rb') as f:
# 	models = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_feature_info_brand.pkl', 'rb') as f:
# 	feature_info = pickle.load(f)
# with open('./model_checkpoints/segment_weiens_scores_brand.pkl', 'rb') as f:
# 	scores = pickle.load(f)

In [None]:
# pd.DataFrame(
#     np.array(scores).reshape(100,5),
#     columns=['segment','k','n_tr','n_val','score']
# ).sort_values(['segment','k'])

In [None]:
# inference
X = train_fn.drop(target_feature,axis=1)
y = train_fn[target_feature]

X_test = test_fn.copy()

ohe = OneHotEncoder()
ohe.fit(X,cat_features)
X_oh = ohe.transform(X)
X_test_oh = ohe.transform(X_test)

segment_list = X['segment'].unique()

tr_pred_list = []
te_pred_list = []
for segment in tqdm(segment_list):
    ## data load
    # (1) train
    train_data    = X   [X   .segment==segment][feature_info[segment]['features']]
    train_data_oh = X_oh[X_oh.segment==segment][feature_info[segment]['oh_features']]
    # (2) test
    test_data     = X_test   [X_test   .segment==segment][feature_info[segment]['features']]
    test_data_oh  = X_test_oh[X_test_oh.segment==segment][feature_info[segment]['oh_features']]
    
    ## model
    kfold_models = models[segment]
    
    ## prediction
    # (1) train
    tr_pred_df = pd.DataFrame({
        'true':y[X.segment==segment].values.flatten(),
        'pred':np.mean([model.predict(train_data,train_data_oh) for model in kfold_models],axis=0),
    })
    tr_pred_df.index = train_data.index
    # (2) test
    te_pred_df = pd.DataFrame({
        'pred':np.mean([model.predict(test_data,test_data_oh) for model in kfold_models],axis=0),
    })
    te_pred_df.index = test_data.index
    
    ## append
    tr_pred_list.append(tr_pred_df)
    te_pred_list.append(te_pred_df)

In [None]:
# train
tr_pred_df = pd.concat(tr_pred_list,axis=0).sort_index()
mean_absolute_error(y_true=tr_pred_df.true,y_pred=tr_pred_df.pred)

In [None]:
# def abline(intercept,slope,**kwargs):
#     axes = plt.gca()
#     x_vals = np.array(axes.get_xlim())
#     y_vals = intercept + slope * x_vals
#     plt.plot(x_vals, y_vals, '--',**kwargs)

# offset = 0.05
# min_value = min(tr_pred_df.true.min(),tr_pred_df.pred.min())*(1-offset)
# max_value = min(tr_pred_df.true.max(),tr_pred_df.pred.max())*(1+offset)

# plt.figure(figsize=(15,7))
# sns.scatterplot(x=tr_pred_df.true,y=tr_pred_df.pred)
# plt.xlim(min_value,max_value)
# plt.ylim(min_value,max_value)
# abline(0,1,color='red',linestyle='--')
# plt.show()

In [None]:
te_pred_df = pd.concat(te_pred_list,axis=0).sort_index()
te_pred_df.head()

In [None]:
submit = pd.read_csv('./data/sample_submission.csv')
submit['가격'] = te_pred_df.pred.values
submit.to_csv('./out/8_ensemble_segment_브랜드_kfold.csv',index=False)

<br>

## 참조 pycaret

In [None]:
# from pycaret import regression

In [None]:
# %%time

# data = train_fn[train_fn.segment==segment_list[0]]
# print(len(data))

# regression.setup(data=data,target='가격',remove_outliers=True,verbose=True)
# best = regression.compare_models(n_select=5,fold=5)