### import

In [1]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

### Fixed RandomSeed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

### Data Load

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,ID,생산년도,모델출시년도,브랜드,차량모델명,판매도시,판매구역,주행거리,배기량,압축천연가스(CNG),경유,가솔린,하이브리드,액화석유가스(LPG),가격
0,TRAIN_00000,2018,2014,skoda,fabia,KAT,SLA,85231,999,0,0,1,0,0,51.74
1,TRAIN_00001,2010,2006,toyota,auris,RKO,SWI,135000,1598,0,0,1,0,0,41.47
2,TRAIN_00002,2002,2002,mercedes-benz,clk-klasa,GNI,WIE,255223,1796,0,0,1,0,0,17.81
3,TRAIN_00003,2006,2001,nissan,x-trail,EHX,WIE,238000,2184,0,1,0,0,0,18.2
4,TRAIN_00004,2007,2007,fiat,bravo,OSW,MAL,251000,1910,0,1,0,0,0,17.55


### Split

In [5]:
train_x = train.drop(['ID', '가격'], axis = 1)
train_y = train['가격']

test_x = test.drop('ID', axis = 1)

### Data Processing

In [6]:
ordinal_features = ['브랜드', '차량모델명', '판매도시', '판매구역']

for feature in ordinal_features:
    le = LabelEncoder()
    le = le.fit(train_x[feature])
    train_x[feature] = le.transform(train_x[feature])

    # train데이터에서 존재하지 않았던 값이 test 데이터에 존재할 수도 있습니다.
    # 따라서 test 데이터를 바로 변형시키지 않고 고유값을 확인후 test 데이터를 변환합니다.
    # Data Leakage를 발생시키지 않기 위함이니, 반드시 주의해주세요.
    for label in np.unique(test_x[feature]):
        if label not in le.classes_:
            le.classes_ = np.append(le.classes_, label)
    test_x[feature] = le.transform(test_x[feature])

### Train-Validation Split

In [7]:
X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, test_size=0.2, random_state=42)

### LightGBM 데이터셋으로 변환

In [8]:
train_data = lgb.Dataset(X_train, label=y_train)

### Set hyper-parameter grid

In [9]:
param_grid = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'reg_alpha': [0, 1, 2],
    'reg_lambda': [0, 1, 2]
}

### LightGBM 모델 생성

In [10]:
lgb_model = lgb.LGBMRegressor()

### 그리드 서치를 사용한 하이퍼파라미터 튜닝

In [11]:
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')
grid_search.fit(X_train, y_train)

### 최적의 하이퍼파라미터 출력

In [None]:
print("Best Hyperparameters:", grid_search.best_params_)

### 최적의 모델로 재학습

In [10]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)



You can set `force_col_wise=true` to remove the overhead.
[1]	training's l2: 1062.4	valid_1's l2: 1064.3
Training until validation scores don't improve for 10 rounds
[2]	training's l2: 907.199	valid_1's l2: 909.491
[3]	training's l2: 784.253	valid_1's l2: 786.375
[4]	training's l2: 677.037	valid_1's l2: 679.805
[5]	training's l2: 588.981	valid_1's l2: 592.103
[6]	training's l2: 516.275	valid_1's l2: 519.51
[7]	training's l2: 456.502	valid_1's l2: 459.773
[8]	training's l2: 406.336	valid_1's l2: 410.4
[9]	training's l2: 365.9	valid_1's l2: 370.195
[10]	training's l2: 331.588	valid_1's l2: 336.469
[11]	training's l2: 302.915	valid_1's l2: 307.799
[12]	training's l2: 278.272	valid_1's l2: 282.772
[13]	training's l2: 257.305	valid_1's l2: 261.883
[14]	training's l2: 240.284	valid_1's l2: 244.998
[15]	training's l2: 225.593	valid_1's l2: 230.703
[16]	training's l2: 213.048	valid_1's l2: 218.346
[17]	training's l2: 202.451	valid_1's l2: 207.596
[18]	training's l2: 193.311	valid_1's l2: 198.4

### 검증 데이터에 대한 예측 수행 및 평가 지표 출력

In [None]:
y_pred = best_model.predict(X_val)

mae = mean_absolute_error(y_val, y_pred)
print("Mean Absolute Error:", mae)

### Inference & Submit

In [10]:
preds = best_model.predict(test_x)

In [12]:
submission = pd.read_csv('sample_submission.csv')
submission['가격'] = preds
submission.to_csv('./baseline_submit.csv', index = False)