In [24]:
import numpy as np
import pandas as pd
import random
import os
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from utils import manage_outlier

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.decomposition import PCA

from lightgbm import LGBMRegressor as lgbm
from xgboost import XGBRegressor as xgb
from catboost import CatBoostRegressor as cat
from sklego.linear_model import LADRegression as lad

plt.rc('font', family='Malgun Gothic')

import warnings
warnings.filterwarnings('ignore')

In [15]:
# seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [16]:
train = pd.read_csv('./open/train.csv').drop('ID',axis=1)
test = pd.read_csv('./open/test.csv').drop('ID',axis=1)
submission = pd.read_csv('./open/sample_submission.csv')

In [17]:
# 배기량을 기준으로 차량 구분 컬럼 추가
lst = []
for val in tqdm(train['배기량']):
    if val >= 2000:
        lst.append('대형')
    elif val >= 1600:
        lst.append('중형')
    elif val >= 1000:
        lst.append('소형')
    else:
        lst.append('경형')

train['배기량별 구분'] = lst

lst = []
for val in tqdm(test['배기량']):
    if val >= 2000:
        lst.append('대형')
    elif val >= 1600:
        lst.append('중형')
    elif val >= 1000:
        lst.append('소형')
    else:
        lst.append('경형')

test['배기량별 구분'] = lst

lst = []
for val in tqdm(train['배기량']):
    if val > 3800:
        lst.append('8기통')
    elif val > 2900:
        lst.append('6기통')
    elif val > 1400:
        lst.append('4기통')
    else:
        lst.append('3기통')

train['배기량별 구분2'] = lst

lst = []
for val in tqdm(test['배기량']):
    if val > 3800:
        lst.append('8기통')
    elif val > 2900:
        lst.append('6기통')
    elif val > 1400:
        lst.append('4기통')
    else:
        lst.append('3기통')

test['배기량별 구분2'] = lst

# 배기량, 구분을 기준으로 자동차세 컬럼 추가
train['연간 자동차세'] = 0
train.loc[train['배기량별 구분']=='경형','연간 자동차세'] = train['배기량'] * 90
train.loc[train['배기량별 구분']=='소형','연간 자동차세'] = train['배기량'] * 140
train.loc[np.logical_or(train['배기량별 구분']=='중형', train['배기량별 구분']=='대형'),'연간 자동차세'] = train['배기량'] * 220

test['연간 자동차세'] = 0
test.loc[test['배기량별 구분']=='경형','연간 자동차세'] = test['배기량'] * 90
test.loc[test['배기량별 구분']=='소형','연간 자동차세'] = test['배기량'] * 140
test.loc[np.logical_or(test['배기량별 구분']=='중형', test['배기량별 구분']=='대형'),'연간 자동차세'] = test['배기량'] * 220

train['지방교육세'] = train['연간 자동차세'] * .3
test['지방교육세'] = test['연간 자동차세'] * .3

train['총 자동차세'] = train['연간 자동차세'] + train['지방교육세']
test['총 자동차세'] = test['연간 자동차세'] + test['지방교육세']

# 생산년도, 출시년도를 기준으로 컬럼 추가 및 자동차세 할인 적용
train['생산이후'] = 2023 - train['생산년도']
train['모델출시이후'] = 2023 - train['모델출시년도']

test['생산이후'] = 2023 - test['생산년도']
test['모델출시이후'] = 2023 - test['모델출시년도']

train['자동차세 할인 여부'] = 0
train.loc[train['생산이후']>=3,'자동차세 할인 여부'] = 1

test['자동차세 할인 여부'] = 0
test.loc[test['생산이후']>=3,'자동차세 할인 여부'] = 1

train['할인 후 자동차세'] = 0
train.loc[train['생산이후']>=3,'할인 후 자동차세'] = (train['연간 자동차세'] + train['지방교육세'])*(1 - (train['생산이후']-2)*.05)
train.loc[train['생산이후']>=13,'할인 후 자동차세'] = (train['연간 자동차세'] + train['지방교육세'])*.5

test['할인 후 자동차세'] = 0
test.loc[test['생산이후']>=3,'할인 후 자동차세'] = (test['연간 자동차세'] + test['지방교육세'])*(1 - (test['생산이후']-2)*.05)
test.loc[test['생산이후']>=13,'할인 후 자동차세'] = (test['연간 자동차세'] + test['지방교육세'])*.5

# 생산 이후, 주행거리를 기준으로 일반, 엔진 보증 컬럼 추가
train['일반보증'] = 0
train.loc[np.logical_or(train['생산이후']<=3, train['주행거리']<=60000),'일반보증'] = 1

test['일반보증'] = 0
test.loc[np.logical_or(test['생산이후']<=3, test['주행거리']<=60000),'일반보증'] = 1

train['엔진보증'] = 0
train.loc[np.logical_or(train['생산이후']<=5, train['주행거리']<=100000),'엔진보증'] = 1

test['엔진보증'] = 0
test.loc[np.logical_or(test['생산이후']<=5, test['주행거리']<=100000),'엔진보증'] = 1

# 주행거리별 구분 추가
train['주행거리별 구분'] = 0
train.loc[np.logical_and(train['주행거리']>=0, train['주행거리']<=50000),'주행거리별 구분'] = '양호'
train.loc[np.logical_and(train['주행거리']>50000, train['주행거리']<=100000),'주행거리별 구분'] = '좀많음'
train.loc[np.logical_and(train['주행거리']>100000, train['주행거리']<=200000),'주행거리별 구분'] = '많음'
train.loc[train['주행거리']>200000,'주행거리별 구분'] = '아주많음'

test['주행거리별 구분'] = 0
test.loc[np.logical_and(test['주행거리']>=0, test['주행거리']<=50000),'주행거리별 구분'] = '양호'
test.loc[np.logical_and(test['주행거리']>50000, test['주행거리']<=100000),'주행거리별 구분'] = '좀많음'
test.loc[np.logical_and(test['주행거리']>100000, test['주행거리']<=200000),'주행거리별 구분'] = '많음'
test.loc[test['주행거리']>200000,'주행거리별 구분'] = '아주많음'

# 판매구역 중 특이한 분포를 보이는 MOR 구역 구분 추가
train['Is_MOR'] = 0
train.loc[train['판매구역']=='MOR', 'Is_MOR'] = 1
test['Is_MOR'] = 0
test.loc[test['판매구역']=='MOR', 'Is_MOR'] = 1

100%|██████████| 57920/57920 [00:00<00:00, 4448608.98it/s]
100%|██████████| 14480/14480 [00:00<00:00, 3445678.08it/s]
100%|██████████| 57920/57920 [00:00<00:00, 3612725.12it/s]
100%|██████████| 14480/14480 [00:00<00:00, 4117806.08it/s]


In [18]:
# 브랜드, 차량모델명, 판매도시, 판매구역 별 가격을 기준으로 랭크 인코딩
brand_idx = train[['브랜드','가격']].groupby(['브랜드']).mean().sort_values('가격').index
dict_brand = {}
for i in range(len(brand_idx)):
    key = brand_idx[i]
    value = i
    dict_brand[key] = value

train['브랜드'] = train['브랜드'].map(dict_brand)
test['브랜드'] = test['브랜드'].map(dict_brand)

model_idx = train[['차량모델명','가격']].groupby(['차량모델명']).mean().sort_values('가격').index
dict_model = {}
for i in range(len(model_idx)):
    key = model_idx[i]
    value = i
    dict_model[key] = value

train['차량모델명'] = train['차량모델명'].map(dict_model)
test['차량모델명'] = test['차량모델명'].map(dict_model)

city_idx = train[['판매도시','가격']].groupby(['판매도시']).mean().sort_values('가격').index
dict_city = {}
for i in range(len(city_idx)):
    key = city_idx[i]
    value = i
    dict_city[key] = value

train['판매도시'] = train['판매도시'].map(dict_city)
test['판매도시'] = test['판매도시'].map(dict_city)

sector_idx = train[['판매구역','가격']].groupby(['판매구역']).mean().sort_values('가격').index
dict_sector = {}
for i in range(len(sector_idx)):
    key = sector_idx[i]
    value = i
    dict_sector[key] = value

train['판매구역'] = train['판매구역'].map(dict_sector)
test['판매구역'] = test['판매구역'].map(dict_sector)

In [19]:
test.isna().sum()

생산년도             0
모델출시년도           0
브랜드              0
차량모델명            0
판매도시           324
판매구역             0
주행거리             0
배기량              0
압축천연가스(CNG)      0
경유               0
가솔린              0
하이브리드            0
액화석유가스(LPG)      0
배기량별 구분          0
배기량별 구분2         0
연간 자동차세          0
지방교육세            0
총 자동차세           0
생산이후             0
모델출시이후           0
자동차세 할인 여부       0
할인 후 자동차세        0
일반보증             0
엔진보증             0
주행거리별 구분         0
Is_MOR           0
dtype: int64

In [20]:
test.fillna(0, inplace=True)

In [22]:
# 연료 특징들을 하나의 컬럼으로 통합
train['연료'] = 0
for col in tqdm(['압축천연가스(CNG)','경유','가솔린','하이브리드','액화석유가스(LPG)']):
    train.loc[train[col]==1,'연료'] = col
# train = train.drop(['압축천연가스(CNG)','경유','가솔린','하이브리드','액화석유가스(LPG)'],axis=1)

test['연료'] = 0
for col in tqdm(['압축천연가스(CNG)','경유','가솔린','하이브리드','액화석유가스(LPG)']):
    test.loc[test[col]==1,'연료'] = col
# test = test.drop(['압축천연가스(CNG)','경유','가솔린','하이브리드','액화석유가스(LPG)'],axis=1)

fuel_idx = train[['연료','가격']].groupby(['연료']).mean().sort_values('가격').index
dict_fuel = {}
dict_fuel['압축천연가스(CNG)'] = 4
dict_fuel['경유'] = 1
dict_fuel['가솔린'] = 3
dict_fuel['하이브리드'] = 0
dict_fuel['압축천연가스(LPG)'] = 2

train['연료'] = train['연료'].map(dict_fuel)
test['연료'] = test['연료'].map(dict_fuel)

# 브랜드와 차량모델명을 감안한 브랜드파워 컬럼 추가
train['브랜드파워'] = train['브랜드'] * train['차량모델명']
test['브랜드파워'] = test['브랜드'] * test['차량모델명']

# 연료와 배기량을 감안한 유사 연비 컬럼 추가
train['유사 연비'] = train['연료'] * train['배기량']
test['유사 연비'] = test['연료'] * test['배기량']

train.drop('연료',axis=1, inplace=True)
test.drop('연료',axis=1, inplace=True)

100%|██████████| 5/5 [00:00<00:00, 993.77it/s]
100%|██████████| 5/5 [00:00<00:00, 3519.89it/s]


In [27]:
for col in train.columns:
    print(col,':' ,len(train[col].unique()))

생산년도 : 39
모델출시년도 : 34
브랜드 : 20
차량모델명 : 143
판매도시 : 3224
판매구역 : 17
주행거리 : 20994
배기량 : 347
압축천연가스(CNG) : 2
경유 : 2
가솔린 : 2
하이브리드 : 2
액화석유가스(LPG) : 2
가격 : 2466
배기량별 구분 : 4
배기량별 구분2 : 4
연간 자동차세 : 347
지방교육세 : 347
총 자동차세 : 347
생산이후 : 39
모델출시이후 : 34
자동차세 할인 여부 : 2
할인 후 자동차세 : 1029
일반보증 : 2
엔진보증 : 2
주행거리별 구분 : 4
Is_MOR : 2
브랜드파워 : 134
유사 연비 : 417


In [28]:
categ = ['배기량별 구분', '배기량별 구분2','주행거리별 구분']

In [29]:
# 범주형 컬럼 원-핫 인코딩
encoder = OneHotEncoder(sparse=False)

onehot = pd.DataFrame(encoder.fit_transform(train[categ]), columns=list(encoder.categories_[0])+ list(encoder.categories_[1])+ list(encoder.categories_[2]))
train = pd.concat([train.drop(encoder.feature_names_in_, axis=1), onehot], axis=1)

onehot2 = pd.DataFrame(encoder.transform(test[categ]), columns=list(encoder.categories_[0])+ list(encoder.categories_[1])+ list(encoder.categories_[2]))
test = pd.concat([test.drop(encoder.feature_names_in_, axis=1), onehot2], axis=1)

In [32]:
# 주행거리 컬럼의 이상치 제거
Q3 = np.quantile(train['주행거리'],.75)
Q1 = np.quantile(train['주행거리'],.25)
IQR = Q3 - Q1
maximum = Q3 + (3*IQR)
train['주행거리_이상치'] = 0
train.loc[train['주행거리']>maximum,'주행거리_이상치'] = 1

train = train.drop(index=train[train['주행거리_이상치']==1].index).reset_index().drop(['index','주행거리_이상치'],axis=1)

In [34]:
# 종속변수, 독립변수 분리
train_x = train.drop('가격',axis=1)
train_y = train['가격']
test_x = test

In [35]:
# MinMaxScaler
scaler = MinMaxScaler()

train_sc = pd.DataFrame(scaler.fit_transform(train_x), columns=train_x.columns)
test_sc = pd.DataFrame(scaler.transform(test_x), columns=test_x.columns)

In [38]:
# 주성분 분석을 통한 차원압축
# 생산, 출시년도 관련 컬럼 압축
pca = PCA(n_components=1)
pca.fit(train_sc[['생산년도','모델출시년도','생산이후','모델출시이후']])
print('pca: ', pca.explained_variance_ratio_)

pca:  [0.95048239]


In [39]:
train_sc['생산출시PCA'] = pca.transform(train_sc[['생산년도','모델출시년도','생산이후','모델출시이후']])
train_sc.drop(['생산년도','모델출시년도','생산이후','모델출시이후'],axis=1,inplace=True)

test_sc['생산출시PCA'] = pca.transform(test_sc[['생산년도','모델출시년도','생산이후','모델출시이후']])
test_sc.drop(['생산년도','모델출시년도','생산이후','모델출시이후'],axis=1,inplace=True)

In [40]:
# 배기량 관련 컬럼 압축

pca4 = PCA(n_components=1)
pca4.fit(train_sc[['배기량','연간 자동차세','지방교육세','총 자동차세']])
print('pca_1: ', pca4.explained_variance_ratio_)

pca_1:  [0.98920876]


In [41]:
train_sc['배기량PCA'] = pca4.transform(train_sc[['배기량','연간 자동차세','지방교육세','총 자동차세']])
train_sc.drop(['배기량','연간 자동차세','지방교육세','총 자동차세'],axis=1,inplace=True)

test_sc['배기량PCA'] = pca4.transform(test_sc[['배기량','연간 자동차세','지방교육세','총 자동차세']])
test_sc.drop(['배기량','연간 자동차세','지방교육세','총 자동차세'],axis=1,inplace=True)

In [42]:
display(train_sc.head(2)
,test_sc.head(2))

Unnamed: 0,브랜드,차량모델명,판매도시,판매구역,주행거리,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),...,3기통,4기통,6기통,8기통,많음,아주많음,양호,좀많음,생산출시PCA,배기량PCA
0,0.894737,0.309859,0.843624,0.75,0.156386,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.307572,-0.359188
1,0.789474,0.605634,0.525597,0.0,0.247705,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.085821,-0.152209


Unnamed: 0,브랜드,차량모델명,판매도시,판매구역,주행거리,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),...,3기통,4기통,6기통,8기통,많음,아주많음,양호,좀많음,생산출시PCA,배기량PCA
0,0.473684,0.584507,0.837419,0.5,0.104586,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.258398,0.138852
1,0.315789,0.161972,0.735029,0.25,0.291742,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.008509,-0.239348


In [43]:
tr_sc, val_sc, tr_sc_y, val_sc_y = train_test_split(train_sc, train_y, test_size=.15, random_state=42)

In [44]:
# lgbm 최종
LGBM = lgbm(objective='mae', metric='mae', n_estimators=20000, random_state=42,
             max_depth=11, num_leaves=255, learning_rate=0.01, reg_alpha=.5, reg_lambda=.05, subsample=.4)
LGBM.fit(tr_sc, tr_sc_y, eval_set=[(tr_sc, tr_sc_y),(val_sc, val_sc_y)], early_stopping_rounds=1000, verbose=1000)

[1000]	training's l1: 4.90455	valid_1's l1: 5.8412
[2000]	training's l1: 4.48535	valid_1's l1: 5.74156
[3000]	training's l1: 4.26321	valid_1's l1: 5.71128
[4000]	training's l1: 4.13118	valid_1's l1: 5.69372
[5000]	training's l1: 4.06335	valid_1's l1: 5.68862
[6000]	training's l1: 3.97922	valid_1's l1: 5.68359
[7000]	training's l1: 3.9193	valid_1's l1: 5.67984
[8000]	training's l1: 3.88172	valid_1's l1: 5.67802
[9000]	training's l1: 3.85364	valid_1's l1: 5.67609
[10000]	training's l1: 3.82314	valid_1's l1: 5.67602
[11000]	training's l1: 3.79524	valid_1's l1: 5.67633


In [45]:
print("LGBM Validation MAE : {}".format(mae(val_sc_y, LGBM.predict(val_sc))))

LGBM Validation MAE : 5.675520543107661


In [49]:
import eli5
from eli5.sklearn import PermutationImportance

perm = PermutationImportance(LGBM, scoring='neg_mean_absolute_error', random_state=42).fit(val_sc,val_sc_y)
eli5.show_weights(perm, top=len(val_sc.columns), feature_names=val_sc.columns.tolist())

Weight,Feature
19.2223  ± 0.2698,생산출시PCA
7.7480  ± 0.1332,할인 후 자동차세
4.3715  ± 0.0419,차량모델명
4.0541  ± 0.0752,주행거리
2.7774  ± 0.0652,브랜드
1.7860  ± 0.0772,브랜드파워
1.0086  ± 0.0303,배기량PCA
0.7425  ± 0.0305,유사 연비
0.4652  ± 0.0468,판매도시
0.1298  ± 0.0153,자동차세 할인 여부


In [46]:
pd.DataFrame(LGBM.feature_importances_, index=LGBM.feature_name_, columns=['변수 중요도']).sort_values(by='변수 중요도',ascending=False)

Unnamed: 0,변수 중요도
주행거리,283968
판매도시,277074
생산출시PCA,208662
유사_연비,138989
할인_후_자동차세,133191
판매구역,126151
차량모델명,122754
브랜드파워,83276
브랜드,58686
배기량PCA,58452


In [54]:
# lgbm 변수 선택 중...
LGBM = lgbm(objective='mae', metric='mae', n_estimators=20000, random_state=42,
             max_depth=11, num_leaves=255, learning_rate=0.01, reg_alpha=.5, reg_lambda=.05, subsample=.4)
LGBM.fit(tr_sc.drop(['Is_MOR','아주많음','많음','좀많음','양호'], axis=1), tr_sc_y, eval_set=[(tr_sc.drop(['Is_MOR','아주많음','많음','좀많음','양호'], axis=1), tr_sc_y),(val_sc.drop(['Is_MOR','아주많음','많음','좀많음','양호'], axis=1), val_sc_y)], early_stopping_rounds=1000, verbose=1000)

[1000]	valid_0's l1: 4.90869	valid_1's l1: 5.86838
[2000]	valid_0's l1: 4.50556	valid_1's l1: 5.77748
[3000]	valid_0's l1: 4.29281	valid_1's l1: 5.74336
[4000]	valid_0's l1: 4.14801	valid_1's l1: 5.71793
[5000]	valid_0's l1: 4.04876	valid_1's l1: 5.71071
[6000]	valid_0's l1: 3.97089	valid_1's l1: 5.70199
[7000]	valid_0's l1: 3.92004	valid_1's l1: 5.69838
[8000]	valid_0's l1: 3.87531	valid_1's l1: 5.69673
[9000]	valid_0's l1: 3.84045	valid_1's l1: 5.69565
[10000]	valid_0's l1: 3.82107	valid_1's l1: 5.6944
[11000]	valid_0's l1: 3.79909	valid_1's l1: 5.69336
[12000]	valid_0's l1: 3.77801	valid_1's l1: 5.69232
[13000]	valid_0's l1: 3.74592	valid_1's l1: 5.69153
[14000]	valid_0's l1: 3.72338	valid_1's l1: 5.69083
[15000]	valid_0's l1: 3.694	valid_1's l1: 5.69067
[16000]	valid_0's l1: 3.65662	valid_1's l1: 5.68562
[17000]	valid_0's l1: 3.64386	valid_1's l1: 5.68528


In [55]:
print("LGBM Validation MAE : {}".format(mae(val_sc_y, LGBM.predict(val_sc.drop(['Is_MOR','아주많음','많음','좀많음','양호'], axis=1)))))

LGBM Validation MAE : 5.685039637353458
