In [28]:
import os
import random

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

from sklearn.decomposition import PCA
from sklearn.ensemble import HistGradientBoostingRegressor as hgbr
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, train_test_split
from sklearn.neural_network import MLPRegressor as mlp
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from catboost import CatBoostRegressor as cat
from lightgbm import LGBMRegressor as lgbm
from lightgbm import early_stopping, log_evaluation
from sklego.linear_model import LADRegression as lad
from xgboost import XGBRegressor as xgb

from utils import manage_outlier, plot_boxplot, plot_violinplot

# Default matplotlib font for every figure in this notebook.
# NOTE(review): presumably chosen so Korean labels render; the font must be
# installed on the machine that runs the notebook - confirm.
plt.rc('font', family='Malgun Gothic')

import warnings
# Suppresses ALL warnings for the rest of the session. This also hides
# deprecation notices emitted by pandas and the ML libraries used below.
warnings.filterwarnings('ignore')

In [29]:
# Fix the random seeds used by the notebook
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED.

    Parameters
    ----------
    seed : int
        Value passed to `random.seed` and `np.random.seed`, and stored (as a
        string) in the PYTHONHASHSEED environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(42)

In [31]:
# Read the train/test tables and drop their 'ID' column; the sample
# submission file is read unchanged.
train, test = (
    pd.read_csv(csv_path).drop(columns='ID')
    for csv_path in ('./open/train.csv', './open/test.csv')
)
submission = pd.read_csv('./open/sample_submission.csv')

In [32]:
# Derive displacement-class, tax, vehicle-age and warranty features.
# The same transformation is applied to train and test through one helper so
# the two frames cannot drift apart.

# Year against which production / model-release years are turned into ages.
REFERENCE_YEAR = 2023

# Yearly tax per cc of displacement for each displacement class.
TAX_PER_CC = {'경형': 90, '소형': 140, '중형': 220, '대형': 220}


def add_vehicle_features(df, reference_year=REFERENCE_YEAR):
    """Return a copy of `df` with the engineered vehicle columns appended.

    Reads the columns '배기량', '생산년도', '모델출시년도' and '주행거리' and adds,
    in this order: '배기량별 구분', '연간 자동차세', '지방교육세', '총 자동차세',
    '배기량별 구분2', '생산이후', '모델출시이후', '자동차세 할인 여부',
    '할인 후 자동차세', '일반보증', '엔진보증'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the four source columns listed above.
    reference_year : int, default REFERENCE_YEAR
        Year from which the production and model-release years are subtracted.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    out = df.copy()
    cc = out['배기량']

    # Displacement class. Values that fail every comparison (including NaN)
    # fall through to the smallest class, like the original if/elif chain.
    out['배기량별 구분'] = np.select(
        [cc > 2000, cc > 1600, cc > 1000],
        ['대형', '중형', '소형'],
        default='경형',
    )

    # Tax columns: per-cc rate by class, 30% education surtax, and their sum.
    out['연간 자동차세'] = cc * out['배기량별 구분'].map(TAX_PER_CC)
    out['지방교육세'] = out['연간 자동차세'] * .3
    out['총 자동차세'] = out['연간 자동차세'] + out['지방교육세']

    # Second displacement bucketing, labelled by cylinder count.
    out['배기량별 구분2'] = np.select(
        [cc > 3800, cc > 2900, cc > 1400],
        ['8기통', '6기통', '4기통'],
        default='3기통',
    )

    # Ages relative to the reference year.
    out['생산이후'] = reference_year - out['생산년도']
    out['모델출시이후'] = reference_year - out['모델출시년도']
    age = out['생산이후']

    # Discount flag: set for vehicles produced at least 3 years ago.
    out['자동차세 할인 여부'] = (age >= 3).astype('int64')

    # Discounted tax: 5% per year starting at age 3, fixed at 50% from age 13.
    # Built as one float array so that an int-initialised column never
    # receives floats through .loc (deprecated upcast in pandas >= 2.1).
    # NOTE(review): vehicles younger than 3 years get 0 here, not the
    # undiscounted total - confirm this is intended.
    total_tax = out['연간 자동차세'] + out['지방교육세']
    out['할인 후 자동차세'] = np.select(
        [age >= 13, age >= 3],
        [total_tax * .5, total_tax * (1 - (age - 2) * .05)],
        default=0.0,
    )

    # Warranty flags.
    # NOTE(review): the flags are set when EITHER the age OR the mileage limit
    # is met; if the warranty is meant to require both, this should be `&` -
    # confirm.
    mileage = out['주행거리']
    out['일반보증'] = ((age <= 3) | (mileage <= 60000)).astype('int64')
    out['엔진보증'] = ((age <= 5) | (mileage <= 100000)).astype('int64')

    return out


train = add_vehicle_features(train)
test = add_vehicle_features(test)

100%|██████████| 57920/57920 [00:00<00:00, 1787213.09it/s]
100%|██████████| 14480/14480 [00:00<00:00, 1814728.60it/s]
100%|██████████| 57920/57920 [00:00<00:00, 2821960.20it/s]
100%|██████████| 14480/14480 [00:00<00:00, 1293799.20it/s]


In [33]:
# Rank-encode brand, model name, sales city and sales region by mean price.
def _price_rank(frame, column):
    """Return the categories of `column` ordered by ascending mean '가격',
    together with a {category: rank} dict (rank 0 = lowest mean price)."""
    ordered = frame[[column, '가격']].groupby([column]).mean().sort_values('가격').index
    return ordered, {category: rank for rank, category in enumerate(ordered)}


# The rank tables come from `train` only and are then applied to both frames.
# Series.map leaves NaN for any test category that never occurs in train.
# NOTE(review): the means use every training row, including rows that are
# later split off for validation - check whether that leakage is acceptable.
rank_tables = {}
for column in ('브랜드', '차량모델명', '판매도시', '판매구역'):
    rank_tables[column] = _price_rank(train, column)
    mapping = rank_tables[column][1]
    train[column] = train[column].map(mapping)
    test[column] = test[column].map(mapping)

# Expose the orderings and lookup dicts under their original names.
brand_idx, dict_brand = rank_tables['브랜드']
model_idx, dict_model = rank_tables['차량모델명']
city_idx, dict_city = rank_tables['판매도시']
sector_idx, dict_sector = rank_tables['판매구역']

In [35]:
# One-hot encode the two displacement-class columns.
# The encoder is built with its defaults and the sparse result is densified
# with .toarray(): the `sparse=False` keyword was renamed to `sparse_output`
# in scikit-learn 1.2 and removed in 1.4, whereas this form runs on all of them.
encoder = OneHotEncoder()
onehot_cols = ['배기량별 구분', '배기량별 구분2']

# Fit on train. The new columns are named after the learned categories and
# the frame is aligned on train's own index so the concat cannot misalign rows.
onehot = pd.DataFrame(
    encoder.fit_transform(train[onehot_cols]).toarray(),
    columns=list(encoder.categories_[0]) + list(encoder.categories_[1]),
    index=train.index,
)
train = pd.concat([train.drop(columns=onehot_cols), onehot], axis=1)

# Apply the already-fitted encoder to test.
onehot2 = pd.DataFrame(
    encoder.transform(test[onehot_cols]).toarray(),
    columns=list(encoder.categories_[0]) + list(encoder.categories_[1]),
    index=test.index,
)
test = pd.concat([test.drop(columns=onehot_cols), onehot2], axis=1)

In [36]:
# Remove mileage outliers from the training set.
# Only the upper fence (Q3 + 1.5 * IQR) is applied; test is left untouched.
Q1 = np.quantile(train['주행거리'], .25)
Q3 = np.quantile(train['주행거리'], .75)
IQR = Q3 - Q1
maximum = Q3 + (1.5 * IQR)

# Keep every row that is not strictly above the fence, then renumber the index.
above_fence = train['주행거리'] > maximum
train = train.loc[~above_fence].reset_index(drop=True)

In [10]:
# Interaction features built from the rank-encoded columns:
#   '브랜드 파워'  = brand rank * model-name rank
#   '도시and구역' = sales-city rank * sales-region rank
for frame in (train, test):
    frame['브랜드 파워'] = frame['브랜드'].mul(frame['차량모델명'])
    frame['도시and구역'] = frame['판매도시'].mul(frame['판매구역'])

In [37]:
# Separate the target ('가격') from the predictors
target_col = '가격'
train_y = train[target_col]
train_x = train.drop(columns=[target_col])
test_x = test

In [38]:
# Min-max scale the predictors; the scaler is fitted on train only and then
# applied to both train and test.
scaler2 = MinMaxScaler()
scaler2.fit(train_x)
train_scaled_values = scaler2.transform(train_x)
test_scaled_values = scaler2.transform(test_x)
train_minmax = pd.DataFrame(train_scaled_values, columns=train_x.columns)
test_minmax = pd.DataFrame(test_scaled_values, columns=test_x.columns)

In [39]:
# Fit a 1-component PCA on the four scaled year/age columns (train only)
# and report the share of variance the component explains.
year_cols = ['생산년도', '모델출시년도', '생산이후', '모델출시이후']
pca2 = PCA(n_components=1).fit(train_minmax[year_cols])
print('minmax data pca_1: ', pca2.explained_variance_ratio_)

minmax data pca_1:  [0.95006817]


In [40]:
# Replace the four year/age columns with their single PCA component,
# in both the train and the test frame.
year_cols = ['생산년도', '모델출시년도', '생산이후', '모델출시이후']
for scaled in (train_minmax, test_minmax):
    scaled['생산출시PCA'] = pca2.transform(scaled[year_cols])
    scaled.drop(columns=year_cols, inplace=True)

In [41]:
# Fit a 1-component PCA on the scaled displacement and tax columns
# (train only) and report the share of variance the component explains.
displacement_cols = ['배기량', '연간 자동차세', '지방교육세', '총 자동차세']
pca4 = PCA(n_components=1).fit(train_minmax[displacement_cols])
print('minmax data pca_1: ', pca4.explained_variance_ratio_)

minmax data pca_1:  [0.98926083]


In [42]:
# Replace the displacement and tax columns with their single PCA component,
# in both the train and the test frame.
displacement_cols = ['배기량', '연간 자동차세', '지방교육세', '총 자동차세']
for scaled in (train_minmax, test_minmax):
    scaled['배기량PCA'] = pca4.transform(scaled[displacement_cols])
    scaled.drop(columns=displacement_cols, inplace=True)

In [43]:
# Hold out 15% of the training rows for validation (fixed seed).
tr_minmax, val_minmax, tr_minmax_y, val_minmax_y = train_test_split(
    train_minmax,
    train_y,
    test_size=.15,
    random_state=42,
)

In [44]:
# Print the number of rows in each split.
for label, part in (('train', tr_minmax), ('val', val_minmax), ('test', test_minmax)):
    print(f'{label} data 개수 : {len(part)}개')

train data 개수 : 48823개
val data 개수 : 8616개
test data 개수 : 14480개


In [45]:
# Final LightGBM model (L1 objective, evaluated with MAE).
model1 = lgbm(
    objective='mae', metric='mae', n_estimators=20000, random_state=42,
    max_depth=11, num_leaves=255, learning_rate=0.01, reg_alpha=.5, reg_lambda=.05,
)
# Early stopping and periodic logging are passed as callbacks: the
# `early_stopping_rounds` / `verbose` keywords of fit() were deprecated in
# LightGBM 3.3 and removed in 4.0. `early_stopping` and `log_evaluation`
# come from the notebook's import cell.
# Training stops after 1000 rounds without improvement; metrics are logged
# every 1000 rounds.
model1.fit(
    tr_minmax, tr_minmax_y,
    eval_set=[(tr_minmax, tr_minmax_y), (val_minmax, val_minmax_y)],
    callbacks=[early_stopping(stopping_rounds=1000), log_evaluation(period=1000)],
)

[1000]	training's l1: 4.95784	valid_1's l1: 6.09361
[2000]	training's l1: 4.53157	valid_1's l1: 6.0243
[3000]	training's l1: 4.31379	valid_1's l1: 6.00586
[4000]	training's l1: 4.19515	valid_1's l1: 5.99953
[5000]	training's l1: 4.03382	valid_1's l1: 5.984
[6000]	training's l1: 3.95769	valid_1's l1: 5.98322
[7000]	training's l1: 3.91586	valid_1's l1: 5.98183
[8000]	training's l1: 3.85407	valid_1's l1: 5.97995
[9000]	training's l1: 3.787	valid_1's l1: 5.97957


In [46]:
# Validation MAE of the LightGBM model.
val_pred_lgbm = model1.predict(val_minmax)
mae(val_minmax_y, val_pred_lgbm)

5.979142903400021

In [48]:
# XGBoost (absolute-error objective, evaluated with MAE).
# `early_stopping_rounds` is set on the estimator instead of in fit(): the
# fit() keyword has been deprecated since XGBoost 1.6 and is rejected by
# recent releases. The constructor form is available from 1.6, and the
# 'reg:absoluteerror' objective used here already needs a newer version.
model_xgb = xgb(
    objective='reg:absoluteerror', eval_metric='mae', random_state=42, n_estimators=90000,
    max_depth=10, learning_rate=0.01, gamma=0.6, subsample=0.8, reg_alpha=1, reg_lambda=0.05,
    colsample_bytree=0.5, min_child_weight=10, base_score=100,
    early_stopping_rounds=1000,
)
# Both the training and the validation split are evaluated; metrics are
# logged every 1000 rounds.
model_xgb.fit(
    tr_minmax, tr_minmax_y,
    eval_set=[(tr_minmax, tr_minmax_y), (val_minmax, val_minmax_y)],
    verbose=1000,
)

[0]	validation_0-mae:53.09252	validation_1-mae:52.96561
[1000]	validation_0-mae:45.09242	validation_1-mae:45.13468
[2000]	validation_0-mae:37.45220	validation_1-mae:37.67545
[3000]	validation_0-mae:30.35059	validation_1-mae:30.74095
[4000]	validation_0-mae:23.83506	validation_1-mae:24.36053
[5000]	validation_0-mae:18.00094	validation_1-mae:18.65116
[6000]	validation_0-mae:13.05123	validation_1-mae:13.84022
[7000]	validation_0-mae:9.21517	validation_1-mae:10.15293
[8000]	validation_0-mae:6.75912	validation_1-mae:7.85692
[9000]	validation_0-mae:5.48442	validation_1-mae:6.74112
[10000]	validation_0-mae:4.93332	validation_1-mae:6.31983
[11000]	validation_0-mae:4.67766	validation_1-mae:6.16496
[12000]	validation_0-mae:4.53393	validation_1-mae:6.10438
[13000]	validation_0-mae:4.42731	validation_1-mae:6.07086
[14000]	validation_0-mae:4.33991	validation_1-mae:6.04981
[15000]	validation_0-mae:4.26608	validation_1-mae:6.03302
[16000]	validation_0-mae:4.20226	validation_1-mae:6.02057
[17000]	vali

In [52]:
# CatBoost (MAE loss and metric).
cat_params = dict(
    loss_function='MAE',
    eval_metric='MAE',
    iterations=90000,
    random_state=42,
    max_depth=6,
    learning_rate=.03,
)
CAT = cat(**cat_params)
# Evaluate on the validation split, keep the best iteration, and stop after
# 1000 rounds without improvement; metrics are logged every 1000 iterations.
CAT.fit(
    tr_minmax,
    tr_minmax_y,
    eval_set=(val_minmax, val_minmax_y),
    use_best_model=True,
    early_stopping_rounds=1000,
    verbose=1000,
)

0:	learn: 27.3327744	test: 27.2894518	best: 27.2894518 (0)	total: 16.9ms	remaining: 25m 20s
1000:	learn: 6.3139803	test: 6.6564278	best: 6.6564278 (1000)	total: 11.7s	remaining: 17m 17s
2000:	learn: 5.7393906	test: 6.2444370	best: 6.2444370 (2000)	total: 22.8s	remaining: 16m 43s
3000:	learn: 5.4607088	test: 6.1135599	best: 6.1135599 (3000)	total: 34.1s	remaining: 16m 27s
4000:	learn: 5.2888783	test: 6.0538359	best: 6.0538359 (4000)	total: 45.6s	remaining: 16m 19s
5000:	learn: 5.1614214	test: 6.0216195	best: 6.0213936 (4977)	total: 56.7s	remaining: 16m 4s
6000:	learn: 5.0638578	test: 6.0024434	best: 6.0023131 (5998)	total: 1m 8s	remaining: 15m 52s
7000:	learn: 4.9926605	test: 5.9878083	best: 5.9877080 (6997)	total: 1m 19s	remaining: 15m 39s
8000:	learn: 4.9266702	test: 5.9793868	best: 5.9793257 (7987)	total: 1m 30s	remaining: 15m 31s
9000:	learn: 4.8746906	test: 5.9713096	best: 5.9712997 (8999)	total: 1m 42s	remaining: 15m 19s
10000:	learn: 4.8286828	test: 5.9639203	best: 5.9637955 (999

<catboost.core.CatBoostRegressor at 0x1cf2dfbca90>