In [100]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
import random
import os
import matplotlib.pyplot as plt
# import seaborn as sns
from tqdm import tqdm

from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import StandardScaler as ss, MinMaxScaler as mms, OneHotEncoder as ohe
from sklearn.decomposition import PCA

from sklearn.ensemble import HistGradientBoostingRegressor as hgb
from lightgbm import LGBMRegressor as lgbm
from xgboost import XGBRegressor as xgb
from catboost import CatBoostRegressor as cat
from sklego.linear_model import LADRegression as lad

plt.rc('font', family='Malgun Gothic')

import warnings
warnings.filterwarnings('ignore')

In [2]:
# seed 고정
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42)

In [3]:
train = pd.read_csv('./open/train.csv').drop('ID',axis=1)
test = pd.read_csv('./open/test.csv').drop('ID',axis=1)
submission = pd.read_csv('./open/sample_submission.csv')

In [4]:
# 배기량을 기준으로 컬럼 추가
lst = []
for val in tqdm(train['배기량']):
    if val > 2000:
        lst.append('대형')
    elif val > 1600:
        lst.append('중형')
    elif val > 1000:
        lst.append('소형')
    else:
        lst.append('경형')

train['배기량별 구분'] = lst

lst = []
for val in tqdm(test['배기량']):
    if val > 2000:
        lst.append('대형')
    elif val > 1600:
        lst.append('중형')
    elif val > 1000:
        lst.append('소형')
    else:
        lst.append('경형')

test['배기량별 구분'] = lst

train['연간 자동차세'] = 0
train.loc[train['배기량별 구분']=='경형','연간 자동차세'] = train['배기량'] * 90
train.loc[train['배기량별 구분']=='소형','연간 자동차세'] = train['배기량'] * 140
train.loc[np.logical_or(train['배기량별 구분']=='중형', train['배기량별 구분']=='대형'),'연간 자동차세'] = train['배기량'] * 220

test['연간 자동차세'] = 0
test.loc[test['배기량별 구분']=='경형','연간 자동차세'] = test['배기량'] * 90
test.loc[test['배기량별 구분']=='소형','연간 자동차세'] = test['배기량'] * 140
test.loc[np.logical_or(test['배기량별 구분']=='중형', test['배기량별 구분']=='대형'),'연간 자동차세'] = test['배기량'] * 220

train['지방교육세'] = train['연간 자동차세'] * .3
test['지방교육세'] = test['연간 자동차세'] * .3

train['총 자동차세'] = train['연간 자동차세'] + train['지방교육세']
test['총 자동차세'] = test['연간 자동차세'] + test['지방교육세']

lst = []
for val in tqdm(train['배기량']):
    if val > 3800:
        lst.append('8기통')
    elif val > 2900:
        lst.append('6기통')
    elif val > 1400:
        lst.append('4기통')
    else:
        lst.append('3기통')

train['배기량별 구분2'] = lst

lst = []
for val in tqdm(test['배기량']):
    if val > 3800:
        lst.append('8기통')
    elif val > 2900:
        lst.append('6기통')
    elif val > 1400:
        lst.append('4기통')
    else:
        lst.append('3기통')

test['배기량별 구분2'] = lst

# 생산년도, 출시년도를 기준으로 컬럼 추가 및 자동차세 할인 적용
train['생산이후'] = 2023 - train['생산년도']
train['모델출시이후'] = 2023 - train['모델출시년도']

test['생산이후'] = 2023 - test['생산년도']
test['모델출시이후'] = 2023 - test['모델출시년도']

train['자동차세 할인 여부'] = 0
train.loc[train['생산이후']>=3,'자동차세 할인 여부'] = 1

test['자동차세 할인 여부'] = 0
test.loc[test['생산이후']>=3,'자동차세 할인 여부'] = 1

train['할인 후 자동차세'] = 0
train.loc[train['생산이후']>=3,'할인 후 자동차세'] = (train['연간 자동차세'] + train['지방교육세'])*(1 - (train['생산이후']-2)*.05)
train.loc[train['생산이후']>=13,'할인 후 자동차세'] = (train['연간 자동차세'] + train['지방교육세'])*.5

test['할인 후 자동차세'] = 0
test.loc[test['생산이후']>=3,'할인 후 자동차세'] = (test['연간 자동차세'] + test['지방교육세'])*(1 - (test['생산이후']-2)*.05)
test.loc[test['생산이후']>=13,'할인 후 자동차세'] = (test['연간 자동차세'] + test['지방교육세'])*.5

train['일반보증'] = 0
train.loc[np.logical_or(train['생산이후']<=3, train['주행거리']<=60000),'일반보증'] = 1

test['일반보증'] = 0
test.loc[np.logical_or(test['생산이후']<=3, test['주행거리']<=60000),'일반보증'] = 1

train['엔진보증'] = 0
train.loc[np.logical_or(train['생산이후']<=5, train['주행거리']<=100000),'엔진보증'] = 1

test['엔진보증'] = 0
test.loc[np.logical_or(test['생산이후']<=5, test['주행거리']<=100000),'엔진보증'] = 1

100%|██████████| 57920/57920 [00:00<00:00, 4160613.95it/s]
  0%|          | 0/14480 [00:00<?, ?it/s]

100%|██████████| 14480/14480 [00:00<00:00, 3619399.40it/s]
100%|██████████| 57920/57920 [00:00<00:00, 4449505.25it/s]
100%|██████████| 14480/14480 [00:00<00:00, 3603721.71it/s]


In [5]:
# 브랜드, 차량모델명, 판매도시, 판매구역 가격을 기준으로 랭크 인코딩
brand_idx = train[['브랜드','가격']].groupby(['브랜드']).mean().sort_values('가격').index
dict_brand = {}
for i in range(len(brand_idx)):
    key = brand_idx[i]
    value = i
    dict_brand[key] = value

train['브랜드'] = train['브랜드'].map(dict_brand)
test['브랜드'] = test['브랜드'].map(dict_brand)

model_idx = train[['차량모델명','가격']].groupby(['차량모델명']).mean().sort_values('가격').index
dict_model = {}
for i in range(len(model_idx)):
    key = model_idx[i]
    value = i
    dict_model[key] = value

train['차량모델명'] = train['차량모델명'].map(dict_model)
test['차량모델명'] = test['차량모델명'].map(dict_model)

city_idx = train[['판매도시','가격']].groupby(['판매도시']).mean().sort_values('가격').index
dict_city = {}
for i in range(len(city_idx)):
    key = city_idx[i]
    value = i
    dict_city[key] = value

train['판매도시'] = train['판매도시'].map(dict_city)
test['판매도시'] = test['판매도시'].map(dict_city)

sector_idx = train[['판매구역','가격']].groupby(['판매구역']).mean().sort_values('가격').index
dict_sector = {}
for i in range(len(sector_idx)):
    key = sector_idx[i]
    value = i
    dict_sector[key] = value

train['판매구역'] = train['판매구역'].map(dict_sector)
test['판매구역'] = test['판매구역'].map(dict_sector)

In [6]:
# 범주형 컬럼 원-핫 인코딩
encoder = ohe(sparse=False)

onehot = pd.DataFrame(encoder.fit_transform(train[['배기량별 구분','배기량별 구분2']]), columns=list(encoder.categories_[0])+ list(encoder.categories_[1]))
train = pd.concat([train.drop(encoder.feature_names_in_, axis=1), onehot], axis=1)

onehot2 = pd.DataFrame(encoder.transform(test[['배기량별 구분','배기량별 구분2']]), columns=list(encoder.categories_[0])+ list(encoder.categories_[1]))
test = pd.concat([test.drop(encoder.feature_names_in_, axis=1), onehot2], axis=1)

In [7]:
# 주행거리 컬럼의 이상치 제거
Q3 = np.quantile(train['주행거리'],.75)
Q1 = np.quantile(train['주행거리'],.25)
IQR = Q3 - Q1
maximum = Q3 + (1.5*IQR)
train['주행거리_이상치'] = 0
train.loc[train['주행거리']>maximum,'주행거리_이상치'] = 1

train = train.drop(index=train[train['주행거리_이상치']==1].index).reset_index().drop(['index','주행거리_이상치'],axis=1)

In [8]:
# 상관관계 파악을 위해 가격(target) 컬럼을 맨 후측으로 이동
train['target'] = train['가격']
train.drop('가격', axis=1, inplace=True)

In [9]:
# 종속변수, 독립변수 분리
train_x = train.drop('target',axis=1)
train_y = train['target']
test_x = test

In [10]:
# MinMaxScaler 비교
scaler2 = mms()

train_minmax = pd.DataFrame(scaler2.fit_transform(train_x), columns=train_x.columns)
test_minmax = pd.DataFrame(scaler2.transform(test_x), columns=test_x.columns)

# minmax data
pca2 = PCA(n_components=1)
pca2.fit(train_minmax[['생산년도','모델출시년도','생산이후','모델출시이후']])
print('minmax data pca_1: ', pca2.explained_variance_ratio_)

# minmax data
train_minmax['생산출시PCA'] = pca2.transform(train_minmax[['생산년도','모델출시년도','생산이후','모델출시이후']])
train_minmax.drop(['생산년도','모델출시년도','생산이후','모델출시이후'],axis=1,inplace=True)

test_minmax['생산출시PCA'] = pca2.transform(test_minmax[['생산년도','모델출시년도','생산이후','모델출시이후']])
test_minmax.drop(['생산년도','모델출시년도','생산이후','모델출시이후'],axis=1,inplace=True)

# 배기량 관련 컬럼 압축
# minmax data
pca4 = PCA(n_components=1)
pca4.fit(train_minmax[['배기량','연간 자동차세','지방교육세','총 자동차세']])
print('minmax data pca_1: ', pca4.explained_variance_ratio_)

# minmax data
train_minmax['배기량PCA'] = pca4.transform(train_minmax[['배기량','연간 자동차세','지방교육세','총 자동차세']])
train_minmax.drop(['배기량','연간 자동차세','지방교육세','총 자동차세'],axis=1,inplace=True)

test_minmax['배기량PCA'] = pca4.transform(test_minmax[['배기량','연간 자동차세','지방교육세','총 자동차세']])
test_minmax.drop(['배기량','연간 자동차세','지방교육세','총 자동차세'],axis=1,inplace=True)

minmax data pca_1:  [0.95006817]
minmax data pca_1:  [0.98926083]


In [11]:
display(train_y.head(2))

0    51.74
1    41.47
Name: target, dtype: float64

In [12]:
display(train_minmax.head(2)
,test_minmax.head(2))

Unnamed: 0,브랜드,차량모델명,판매도시,판매구역,주행거리,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),...,경형,대형,소형,중형,3기통,4기통,6기통,8기통,생산출시PCA,배기량PCA
0,0.894737,0.309859,0.843575,0.75,0.222833,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.305346,-0.356368
1,0.789474,0.605634,0.52545,0.0,0.352953,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.088047,-0.149427


Unnamed: 0,브랜드,차량모델명,판매도시,판매구역,주행거리,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),...,경형,대형,소형,중형,3기통,4기통,6기통,8기통,생산출시PCA,배기량PCA
0,0.473684,0.584507,0.837368,0.5,0.149023,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,-0.256172,0.141702
1,0.315789,0.161972,0.734947,0.25,0.415701,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.010738,-0.236522


In [13]:
tr_minmax, val_minmax, tr_minmax_y, val_minmax_y = train_test_split(train_minmax, train_y, test_size=.15, random_state=42)

In [84]:
# LightGBM
model_lgbm = lgbm(
    objective='mae', metric='mae', n_estimators=20000, random_state=42, max_depth=11,
    num_leaves=255, learning_rate=0.01, reg_alpha=.5, reg_lambda=.05, subsample=.4
    )
model_lgbm.fit(tr_minmax, tr_minmax_y, eval_set=(val_minmax, val_minmax_y), early_stopping_rounds=1000, verbose=1000)

[1000]	valid_0's l1: 6.09361
[2000]	valid_0's l1: 6.0243
[3000]	valid_0's l1: 6.00586
[4000]	valid_0's l1: 5.99953
[5000]	valid_0's l1: 5.984
[6000]	valid_0's l1: 5.98322
[7000]	valid_0's l1: 5.98183
[8000]	valid_0's l1: 5.97995
[9000]	valid_0's l1: 5.97957


In [86]:
print("LGBM Validation MAE : {}".format(mae(val_minmax_y, model_lgbm.predict(val_minmax))))

LGBM Validation MAE : 5.979142903400021


In [85]:
# XGBoost
model_xgb = xgb(
    objective='reg:absoluteerror', eval_metric='mae', random_state=42, n_estimators=90000,
    max_depth=10, learning_rate=0.01, gamma=0.6, subsample=0.8, reg_alpha=1, reg_lambda=0.05,
    colsample_bytree=0.5, min_child_weight=10, base_score=100
    )
model_xgb.fit(tr_minmax, tr_minmax_y, eval_set=[(val_minmax, val_minmax_y)], early_stopping_rounds=1000, verbose=1000)

[0]	validation_0-mae:52.96561


[1000]	validation_0-mae:45.13468
[2000]	validation_0-mae:37.67545
[3000]	validation_0-mae:30.74095
[4000]	validation_0-mae:24.36053
[5000]	validation_0-mae:18.65116
[6000]	validation_0-mae:13.84022
[7000]	validation_0-mae:10.15293
[8000]	validation_0-mae:7.85692
[9000]	validation_0-mae:6.74112
[10000]	validation_0-mae:6.31983
[11000]	validation_0-mae:6.16496
[12000]	validation_0-mae:6.10438
[13000]	validation_0-mae:6.07086
[14000]	validation_0-mae:6.04981
[15000]	validation_0-mae:6.03302
[16000]	validation_0-mae:6.02057
[17000]	validation_0-mae:6.01104
[18000]	validation_0-mae:6.00267
[19000]	validation_0-mae:5.99527
[20000]	validation_0-mae:5.98957
[21000]	validation_0-mae:5.98405
[22000]	validation_0-mae:5.97886
[23000]	validation_0-mae:5.97449
[24000]	validation_0-mae:5.96998
[25000]	validation_0-mae:5.96638
[26000]	validation_0-mae:5.96309
[27000]	validation_0-mae:5.95995
[28000]	validation_0-mae:5.95715
[29000]	validation_0-mae:5.95458
[30000]	validation_0-mae:5.95275
[31000]	vali

In [87]:
print("XGB Validation MAE : {}".format(mae(val_minmax_y, model_xgb.predict(val_minmax))))

XGB Validation MAE : 5.943318150312733


In [101]:
cat_dict

{6: 5.9291920078479405,
 7: 5.925082243251693,
 8: 5.933886283573369,
 9: 5.968829214081852,
 10: 5.984683652792931}

In [99]:
# CatBoost
param_lst = [6,7,8,9,10]

cat_dict = {}

for par in param_lst:
    model_cat = cat(
        loss_function='MAE', eval_metric='MAE', random_state=42, iterations=50000,
        max_depth=par
        )
    model_cat.fit(tr_minmax, tr_minmax_y, eval_set=(val_minmax, val_minmax_y), early_stopping_rounds=1000, use_best_model=True, verbose=1000)

    cat_dict[par] = mae(val_minmax_y, model_cat.predict(val_minmax))

"""
param = {
        "learning_rate":trial.suggest_categorical('learning_rate',[0.01, 0.03, 0.1]),
        "depth":trial.suggest_categorical('depth', [6, 10, None]),
        "random_strength":trial.suggest_float('random_strength', 0, 0.5),
        "bagging_temperature":trial.suggest_float('bagging_temperature', 0, 1),
        "l2_leaf_reg":trial.suggest_categorical('l2_leaf_reg', [None, 0.001, 0.01]),
        "border_count":trial.suggest_categorical('border_count', [None, 254]),
        "grow_policy":trial.suggest_categorical('grow_policy', ['Depthwise', 'Lossguide','SymmetricTree']),
        "max_leaves":trial.suggest_int('max_leaves', 31, 64),
        "auto_class_weights":trial.suggest_categorical('auto_class_weights', ['Balanced', 'SqrtBalanced', None]),
}
"""

0:	learn: 27.3327744	test: 27.2894518	best: 27.2894518 (0)	total: 14.8ms	remaining: 12m 19s
1000:	learn: 6.3139803	test: 6.6564278	best: 6.6564278 (1000)	total: 13.8s	remaining: 11m 16s
2000:	learn: 5.7393906	test: 6.2444370	best: 6.2444370 (2000)	total: 27.5s	remaining: 10m 58s
3000:	learn: 5.4607088	test: 6.1135599	best: 6.1135599 (3000)	total: 40.7s	remaining: 10m 38s
4000:	learn: 5.2888783	test: 6.0538359	best: 6.0538359 (4000)	total: 54.9s	remaining: 10m 30s
5000:	learn: 5.1614214	test: 6.0216195	best: 6.0213936 (4977)	total: 1m 9s	remaining: 10m 24s
6000:	learn: 5.0638578	test: 6.0024434	best: 6.0023131 (5998)	total: 1m 32s	remaining: 11m 14s
7000:	learn: 4.9926605	test: 5.9878083	best: 5.9877080 (6997)	total: 1m 52s	remaining: 11m 30s
8000:	learn: 4.9266702	test: 5.9793868	best: 5.9793257 (7987)	total: 2m 6s	remaining: 11m 2s
9000:	learn: 4.8746906	test: 5.9713096	best: 5.9712997 (8999)	total: 2m 20s	remaining: 10m 40s
10000:	learn: 4.8286828	test: 5.9639203	best: 5.9637955 (999

'\nparam = {\n        "learning_rate":trial.suggest_categorical(\'learning_rate\',[0.01, 0.03, 0.1]),\n        "depth":trial.suggest_categorical(\'depth\', [6, 10, None]),\n        "random_strength":trial.suggest_float(\'random_strength\', 0, 0.5),\n        "bagging_temperature":trial.suggest_float(\'bagging_temperature\', 0, 1),\n        "l2_leaf_reg":trial.suggest_categorical(\'l2_leaf_reg\', [None, 0.001, 0.01]),\n        "border_count":trial.suggest_categorical(\'border_count\', [None, 254]),\n        "grow_policy":trial.suggest_categorical(\'grow_policy\', [\'Depthwise\', \'Lossguide\',\'SymmetricTree\']),\n        "max_leaves":trial.suggest_int(\'max_leaves\', 31, 64),\n        "auto_class_weights":trial.suggest_categorical(\'auto_class_weights\', [\'Balanced\', \'SqrtBalanced\', None]),\n}\n'

In [96]:
# submission['가격'] = np.round(model_xgb.predict(test_minmax),1)
# submission.to_csv('./xgb_tune_round1.csv', index=False)