In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import optuna

seed = 42
np.random.seed(seed)

### Data Preprocessing
* 중간에 새로운 데이터가 배포되었는데, 높은 자리의 숫자만 나타나있던 기존 데이터에서의 결과가 더 좋게 나왔습니다. 따라서 새로 추가된 낮은 자리 숫자를 모두 지워서 사용하였습니다.

* 조기 종료를 사용하기 위해 valid set을 나누어 사용하였습니다.

In [2]:
train = pd.read_csv('data/train_df.csv', encoding='cp949')
test = pd.read_csv('data/test_df.csv', encoding='cp949')
submission = pd.read_csv('data/sample_submission.csv')

train["SEND_SPG_INNB"] = (train["SEND_SPG_INNB"] // 10000000000) * 10000000000
train["REC_SPG_INNB"] = (train["REC_SPG_INNB"] // 10000000000) * 10000000000
test["SEND_SPG_INNB"] = (test["SEND_SPG_INNB"] // 10000000000) * 10000000000
test["REC_SPG_INNB"] = (test["REC_SPG_INNB"] // 10000000000) * 10000000000

train_X = train.drop('INVC_CONT',axis = 1)
train_X = train_X.drop(['index'], axis=1)
train_y = train['INVC_CONT']

test_X = test.drop(['index'], axis=1)

train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, test_size=0.2, random_state=seed)

### Parameter Search
* optuna를 사용하여 적당한 파라미터를 찾아서 사용하였습니다.
* Search Space를 넓혀서 보다 강력한 파라미터를 찾을 순 있겠지만, 훈련 데이터에 overfitting 될것을 우려하여 과도하게 사용하진 않았습니다. 실제로 valid loss가 더 낮은 파라미터가 더 높은 public score를 보장하는것은 아니었습니다.

In [3]:
EARLY_STOPPING_ROUND = 100
def objective(trial):
    param = {}
    param['learning_rate'] = trial.suggest_categorical("learning_rate", [0.05, 0.1, 0.15])
    param['depth'] = trial.suggest_int('depth', 14, 16)
    param['l2_leaf_reg'] = trial.suggest_discrete_uniform('l2_leaf_reg', 5.0, 6.0, 0.5)
    param['min_child_samples'] = trial.suggest_categorical('min_child_samples', [3, 4, 5])
    param['grow_policy'] = 'Depthwise'
    param['iterations'] = 10000
    param['use_best_model'] = True
    param['eval_metric'] = 'RMSE'
    param['od_type'] = 'iter'
    param['od_wait'] = 20
    param['random_state'] = seed
    param['logging_level'] = 'Silent'

    regressor = CatBoostRegressor(**param)

    regressor.fit(train_X.copy(), train_y.copy(),
                  eval_set=[(valid_X.copy(), valid_y.copy())],
                  early_stopping_rounds=EARLY_STOPPING_ROUND,
                  cat_features=["DL_GD_LCLS_NM", "DL_GD_MCLS_NM"])
    loss = mean_squared_error(valid_y, regressor.predict(valid_X.copy()), squared=False)
    return loss

### Modelling
* optuna를 사용하여 찾은 파라미터로 모델을 훈련하였습니다.

In [4]:
# study = optuna.create_study(study_name=f'catboost-seed{seed}')
# study.optimize(objective, n_trials=10000, timeout=24000)
# print(study.best_value)
# print(study.best_params)


model1 = CatBoostRegressor(learning_rate=0.1, depth=15, l2_leaf_reg=5.5, min_child_samples=4,iterations=10000 , grow_policy="Depthwise", eval_metric="RMSE", random_state=seed)
model1.fit(train_X, train_y, cat_features=["DL_GD_LCLS_NM", "DL_GD_MCLS_NM"], eval_set=[(valid_X, valid_y)], early_stopping_rounds=100)

0:	learn: 5.6212725	test: 6.0025414	best: 6.0025414 (0)	total: 229ms	remaining: 38m 9s
1:	learn: 5.6029617	test: 5.9883453	best: 5.9883453 (1)	total: 294ms	remaining: 24m 31s
2:	learn: 5.5610058	test: 5.9363169	best: 5.9363169 (2)	total: 361ms	remaining: 20m 2s
3:	learn: 5.5245173	test: 5.8921374	best: 5.8921374 (3)	total: 431ms	remaining: 17m 57s
4:	learn: 5.4934126	test: 5.8538468	best: 5.8538468 (4)	total: 503ms	remaining: 16m 44s
5:	learn: 5.4841996	test: 5.8453618	best: 5.8453618 (5)	total: 557ms	remaining: 15m 28s
6:	learn: 5.4741635	test: 5.8410632	best: 5.8410632 (6)	total: 615ms	remaining: 14m 37s
7:	learn: 5.4442860	test: 5.8050592	best: 5.8050592 (7)	total: 690ms	remaining: 14m 21s
8:	learn: 5.4380350	test: 5.8017199	best: 5.8017199 (8)	total: 745ms	remaining: 13m 47s
9:	learn: 5.4113239	test: 5.7714624	best: 5.7714624 (9)	total: 800ms	remaining: 13m 19s
10:	learn: 5.4035837	test: 5.7671703	best: 5.7671703 (10)	total: 850ms	remaining: 12m 51s
11:	learn: 5.3752987	test: 5.737

94:	learn: 4.8781638	test: 5.6017052	best: 5.5408771 (34)	total: 5.97s	remaining: 10m 22s
95:	learn: 4.8779282	test: 5.6017133	best: 5.5408771 (34)	total: 6.01s	remaining: 10m 20s
96:	learn: 4.8753272	test: 5.6047249	best: 5.5408771 (34)	total: 6.06s	remaining: 10m 18s
97:	learn: 4.8739232	test: 5.6046698	best: 5.5408771 (34)	total: 6.12s	remaining: 10m 18s
98:	learn: 4.8723292	test: 5.6049314	best: 5.5408771 (34)	total: 6.17s	remaining: 10m 17s
99:	learn: 4.8710362	test: 5.6053538	best: 5.5408771 (34)	total: 6.46s	remaining: 10m 39s
100:	learn: 4.8695060	test: 5.6080960	best: 5.5408771 (34)	total: 6.8s	remaining: 11m 6s
101:	learn: 4.8681484	test: 5.6105800	best: 5.5408771 (34)	total: 6.9s	remaining: 11m 10s
102:	learn: 4.8649096	test: 5.6124618	best: 5.5408771 (34)	total: 6.97s	remaining: 11m 9s
103:	learn: 4.8645823	test: 5.6127061	best: 5.5408771 (34)	total: 7.15s	remaining: 11m 20s
104:	learn: 4.8627664	test: 5.6132843	best: 5.5408771 (34)	total: 7.34s	remaining: 11m 31s
105:	lear

<catboost.core.CatBoostRegressor at 0x2a7691b03c8>

### Predict Value Postprocessing
* 타겟 데이터는 정수값으로 구성되어있기 때문에 예측값을 모두 반올림하였습니다.
* 또한 0 이상의 값만 존재하기 때문에 0 이하의 값을 추론한 행은 훈련 데이터 중 두 카테고리가 모두 동일한 데이터의 타겟값 중 최빈값으로 대체하였습니다. 다만 CatBoost를 사용한 이후로는 0 이하의 값이 나타나지 않았습니다.

In [5]:
pred = model1.predict(test_X)

submission['INVC_CONT'] = np.around(pred)

# 음수 및 0이 나오는 경우를 최빈값으로 처리함
for _, row in submission.iterrows():
    if row["INVC_CONT"] <= 0:
        LCLS = test[test["index"] == row["index"]]["DL_GD_LCLS_NM"].item()
        MCLS = test[test["index"] == row["index"]]["DL_GD_MCLS_NM"].item()

        train_LCLS = train[train['DL_GD_LCLS_NM'] == LCLS]
        train_LCLS_MCLS = train_LCLS[train_LCLS['DL_GD_MCLS_NM'] == MCLS]
        mode_value = train_LCLS_MCLS["INVC_CONT"].mode()

        submission.loc[submission["index"] == row["index"], "INVC_CONT"] = mode_value.item()

submission.to_csv('baseline.csv',index = False)