# 1 코드 실행환경

Google Colab

런타임 유형 : GPU

Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic

Ubuntu 18.04.6 LTS

Python 3.7.13

GPU : NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2  Tesla T4

In [None]:
# Show the platform string (kernel, architecture, distribution) of the runtime
import platform
platform.platform()

'Linux-5.4.188+-x86_64-with-Ubuntu-18.04-bionic'

In [None]:
# Show the Linux distribution release of the runtime
!cat /etc/issue.net

Ubuntu 18.04.6 LTS


In [None]:
# Show the Python interpreter version
!python --version

Python 3.7.13


In [None]:
# Show the GPU model, driver version and CUDA version
!nvidia-smi

Tue Aug 30 13:25:51 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   69C    P8    10W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

## 1.1 데이터 입/출력 경로 지정

구글 코랩 사용 시: 구글 드라이브를 연결하여 사용 (아래 첫 번째 셀만 실행)

로컬 환경 사용 시: 로컬 경로로 설정 (아래 두 번째 셀만 실행)

In [None]:
# Mount Google Drive
from google.colab import drive

drive.mount('/content/gdrive/')

# Data path for the Colab environment
DATA_PATH = '/content/gdrive/MyDrive/dacon-235927-kops/data/'

Mounted at /content/gdrive/


In [None]:
# Data path for a local environment.
# NOTE: this reassigns DATA_PATH, so running this cell after the Colab cell
# replaces the Google Drive path; run only the cell matching your environment.
DATA_PATH = '/data/'

## 1.2 필요 라이브러리 설치

In [None]:
# Install Optuna (from the GitHub repository's default branch)
# NOTE(review): this is not pinned to a release, so the installed version
# can change between runs.
!pip install --quiet --no-cache-dir git+https://github.com/optuna/optuna

# Install CatBoost
!pip install --quiet catboost

# Reinstall XGBoost (intended to provide the GPU-enabled build)
!pip uninstall --quiet -y xgboost
!pip install --quiet xgboost

# Install the GPU build of LightGBM: clone the sources, build with
# -DUSE_GPU=1, then install the Python package against the precompiled library
! git clone --recursive https://github.com/Microsoft/LightGBM
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 209 kB 31.2 MB/s 
[K     |████████████████████████████████| 81 kB 72.2 MB/s 
[K     |████████████████████████████████| 78 kB 60.0 MB/s 
[K     |████████████████████████████████| 147 kB 79.3 MB/s 
[K     |████████████████████████████████| 49 kB 65.0 MB/s 
[K     |████████████████████████████████| 112 kB 70.8 MB/s 
[?25h  Building wheel for optuna (PEP 517) ... [?25l[?25hdone
  Building wheel for pyperclip (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
[K     |████████████████████████████████| 255.9 MB 47 kB/s 
[?25hCloning into 'LightGBM'...
remote: Enumerating objects: 27400, done.[K
remote: Counting objects: 100% (518/518), done.[K
remote: Compressing objects: 100% (295/295), done.[K
remote: Total 27400 (delta 312), reused 34

## 1.3 라이브러리 로드

In [None]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

import optuna 
from optuna import Trial, visualization
from optuna.samplers import TPESampler

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

import matplotlib.pyplot as plt

from sklearn.decomposition import PCA

import joblib

import warnings

warnings.filterwarnings("ignore")

## 1.4 시드 고정

In [None]:
# Fix the random seeds
SEED = 42


def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value handed to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE: the environment variable is only written here; an interpreter
    # reads PYTHONHASHSEED at start-up, so the running process is unaffected.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(SEED)

# 2 데이터 전처리

In [None]:
# Load the training data
train = pd.read_csv(DATA_PATH + 'train.csv')

# Split the columns by name: those matching 'X' are the inputs,
# those matching 'Y' are the outputs
X_train, Y_train = train.filter(regex='X'), train.filter(regex='Y')

In [None]:
# Remove features with little influence on the result
X_train = X_train.drop(
    columns=['X_04', 'X_23', 'X_47', 'X_48', 'X_10', 'X_11', 'X_02'],
)

In [None]:
# Remove X_33 outliers: rows where X_33 > 6 are dropped from both the
# inputs and the targets, then both frames are renumbered from 0
drop_idx = X_train.index[X_train['X_33'] > 6]

X_train = X_train.drop(index=drop_idx).reset_index(drop=True)
Y_train = Y_train.drop(index=drop_idx).reset_index(drop=True)

In [None]:
# PCA helper class
class PCA_transform:
  """Fit PCA on groups of columns and replay the same projections later.

  Each ``fit_transform`` call fits one PCA on a group of columns, replaces
  those columns with the projected components (named ``PCA_<group>_<k>``)
  and stores the fitted object. ``transform`` then applies every stored
  group, in fitting order, to another DataFrame.
  """

  def __init__(self):
    # Parallel lists with one entry per fitted column group.
    self.cols_list = []   # column names passed to each fit_transform call
    self.pca_list = []    # fitted PCA object of each group
    self.n_pca_list = []  # n_pca passed for each group
    self.size = 0         # number of groups fitted so far

  # Fit a PCA on one column group and transform the input
  def fit_transform(self, X_input, col, n_pca):
    """Fit a PCA on ``X_input[col]`` and replace those columns with its output.

    Args:
      X_input: DataFrame containing the columns listed in ``col``.
      col: List of column names to project.
      n_pca: Number of components to keep (also the number of new columns).

    Returns:
      A new DataFrame with ``col`` removed and the component columns
      appended; the row index of ``X_input`` is preserved.
    """
    X_pca = X_input[col]

    # Reduce to n_pca dimensions
    pca = PCA(n_components = n_pca)
    pca.fit(X_pca)

    # The group number used in the column names is the current size,
    # i.e. the index this group is about to get in the lists below.
    names = self.naming(n_pca)
    X_output = self._replace_columns(X_input, col, pca.transform(X_pca), names)

    self.cols_list.append(col)
    self.pca_list.append(pca)
    self.n_pca_list.append(n_pca)
    self.size += 1

    return X_output

  # Transform with the already-fitted PCA objects
  def transform(self, X_input):
    """Apply every fitted group, in fitting order, and return the result."""
    for idx in range(self.size):
      X_input = self._idx_transform(X_input, idx)

    return X_input

  # Apply the idx-th fitted PCA
  def _idx_transform(self, X_input, idx):
    """Replace the columns of group ``idx`` with that group's components."""
    cols = self.cols_list[idx]
    components = self.pca_list[idx].transform(X_input[cols])
    names = self.naming(self.n_pca_list[idx], idx)

    return self._replace_columns(X_input, cols, components, names)

  # Swap a column group for its projected components
  def _replace_columns(self, X_input, cols, components, names):
    """Return ``X_input`` without ``cols`` and with ``components`` appended.

    The component frame is built on ``X_input.index``: ``pd.concat`` with
    ``axis=1`` joins on index labels, so a fresh 0..n-1 index would
    misalign rows (and introduce NaN rows) whenever ``X_input`` does not
    carry a default index.
    """
    X_pca = pd.DataFrame(components, columns = names, index = X_input.index)
    return pd.concat([X_input.drop(cols, axis = 1), X_pca], axis = 1)

  # Naming rule for the PCA columns
  def naming(self, number, name = None):
    """Return ``number`` column names of the form ``PCA_<name>_<k>``.

    ``name`` defaults to the current number of fitted groups.
    """
    if name is None:
      name = self.size
    return [f'PCA_{name}_{idx}' for idx in range(number)]

In [None]:
# Apply the PCA settings that were optimised with Optuna.
# Each call fits one PCA on the listed column group, replaces that group in
# X_train with the given number of components, and records the fitted PCA in
# pca_5 so the same projections can be replayed via pca_5.transform.
pca_5 = PCA_transform()
X_train = pca_5.fit_transform(X_train, ['X_13', 'X_14', 'X_15', 'X_16', 'X_17', 'X_18'], 5)
X_train = pca_5.fit_transform(X_train, ['X_19', 'X_20', 'X_21', 'X_22'], 2)
X_train = pca_5.fit_transform(X_train, ['X_34', 'X_35', 'X_36', 'X_37'], 1)
X_train = pca_5.fit_transform(X_train, ['X_41', 'X_42', 'X_43', 'X_44', 'X_45'], 1)
X_train = pca_5.fit_transform(X_train, ['X_50', 'X_51', 'X_52', 'X_53', 'X_54', 'X_55', 'X_56'], 2)

# 3 평가산식 정의

In [None]:
def _as_2d(values):
  """Return ``values`` as a float array of shape (n_samples, n_outputs)."""
  arr = np.asarray(values, dtype=float)
  return arr.reshape(-1, 1) if arr.ndim == 1 else arr


def nrmse(y_val, y_pred):
  """Return the RMSE normalised by the mean absolute ground-truth value.

  The RMSE is computed with NumPy rather than through
  ``mean_squared_error(..., squared=False)``, so it does not depend on a
  keyword that newer scikit-learn releases no longer accept.

  Args:
    y_val: Ground-truth values (array-like or pandas object).
    y_pred: Predictions, in the same row order as ``y_val``.

  Returns:
    ``rmse / mean(|y_val|)``. For 2-D input the RMSE is the average of the
    per-column RMSEs.

  Raises:
    ValueError: If the two inputs do not have the same shape.
  """
  # Work on plain arrays so rows are compared by position, never aligned
  # by pandas index labels.
  truth = _as_2d(y_val)
  pred = _as_2d(y_pred)
  if truth.shape != pred.shape:
    raise ValueError(
        'y_val and y_pred must have the same shape, got {} and {}'.format(truth.shape, pred.shape))

  rmse = np.mean(np.sqrt(np.mean((truth - pred) ** 2, axis=0)))
  return rmse / np.mean(np.abs(y_val))


def lg_nrmse(y_val, y_pred):
    """Return the weighted sum of the per-target NRMSEs over the 14 targets.

    The first 8 targets (Y_01 ~ Y_08) carry a 20% extra weight (1.2); the
    remaining 6 (Y_09 ~ Y_14) are weighted 1.0.

    Args:
        y_val: Ground truth with 14 columns (anything ``pd.DataFrame`` accepts).
        y_pred: Predictions with 14 columns, in the same row and column order.

    Returns:
        The weighted score (lower is better).
    """
    y_val = pd.DataFrame(y_val)
    y_pred = pd.DataFrame(y_pred)

    all_nrmse = [nrmse(y_val.iloc[:, idx], y_pred.iloc[:, idx]) for idx in range(14)]

    # Slice at 8, not 7: positions 0..7 are Y_01..Y_08.
    return 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:14])

# 4 하이퍼파라미터 튜닝 준비

## 4.1 하이퍼파라미터 튜닝 함수

In [None]:
# Objective function explored by Optuna
def model_objective(trial: Trial, X, y, model, param):
  """Train ``model`` with parameters sampled from ``trial`` and return its NRMSE.

  Args:
    trial: Optuna trial used to sample the hyperparameters.
    X: Input features.
    y: Target values for a single output.
    model: Estimator class (or factory) called as ``model(**params)``.
    param: Function mapping a trial to the keyword arguments for ``model``.

  Returns:
    The NRMSE of the fitted model on a 20% hold-out split.
  """
  # Hold out 20% of the data; the split is fixed by SEED so every trial
  # is scored on the same validation rows.
  X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=SEED)

  # Build the model from the sampled parameters and train it
  tune = model(**param(trial))
  tune.fit(X_tr, y_tr)

  # Predict the hold-out rows
  y_pr = tune.predict(X_val)

  # The loss must not be bound to the name ``nrmse``: assigning to that name
  # here would make it local to this function and the call on the right-hand
  # side would raise UnboundLocalError.
  score = nrmse(y_val, y_pr)

  return score

In [None]:
# Tune a single Y target
def start_study(idx, X, y, model, param, n_iter):
  """Run (or resume) the Optuna study for one target until it has ``n_iter`` trials.

  The study is stored under
  ``DATA_PATH/tune_param/<ModelName>_tune/tune_<idx>.pkl`` and is saved again
  after every trial, so an interrupted run can be resumed.

  Args:
    idx: Index of the target column; used in the file name.
    X: Input features.
    y: Target values of that column.
    model: Estimator class (or factory); its class name selects the folder.
    param: Function mapping a trial to the keyword arguments for ``model``.
    n_iter: Total number of trials the study should contain.

  Returns:
    The study, holding at least ``n_iter`` trials.
  """
  name = model().__class__.__name__
  study_dir = os.path.join(DATA_PATH, "tune_param", name + "_tune")
  study_path = os.path.join(study_dir, "tune_" + str(idx) + ".pkl")

  try:
    # Resume from a previously saved study when one exists.
    # NOTE: joblib.load unpickles the file - only load files you trust.
    study = joblib.load(study_path)
    print('Study loaded')
  except Exception as err:
    # No usable saved study: start a new one that minimises the objective.
    # ``Exception`` (not a bare except) lets KeyboardInterrupt propagate.
    if not isinstance(err, FileNotFoundError):
      print('Could not load saved study ({}); starting a new one'.format(err))
    # NOTE(review): no sampler seed is passed here, so new studies may not
    # be reproducible between runs - confirm whether that is intended.
    study = optuna.create_study(direction='minimize')

  while n_iter > len(study.trials):
    # Run one trial at a time so the study can be checkpointed after each
    study.optimize(lambda trial: model_objective(trial, X, y, model, param), n_trials=1)

    # Create the output folder (and its parent) if missing, then save
    os.makedirs(study_dir, exist_ok=True)
    joblib.dump(study, study_path)

  return study

In [None]:
# Entry point for hyperparameter tuning
# X, Y : training data
# model : ML model to use
# param : function defining the parameters to tune
# n_iter : number of trials
def start_tuning(X, Y, model, param, n_iter):
  """Tune ``model`` separately for each of the 14 target columns of ``Y``.

  For every column a study is run through ``start_study``, its best trial is
  printed, and the best values are summed with weight 1.2 for the first
  eight columns and weight 1 for the rest. The total is printed; nothing is
  returned.
  """
  model_name = model().__class__.__name__
  studies = []
  total = 0

  # Tune targets 1 ~ 14
  for target in range(14):
    study = start_study(target, X, Y.iloc[:, target], model, param, n_iter)
    studies.append(study)

    # Report the best trial of this target
    print(model_name + ' ' + str(target) + ' Best trial: score {},\nparams {}'.format(study.best_trial.value, study.best_trial.params))

    # Accumulate the weighted best score
    weight = 1.2 if target < 8 else 1
    total += weight * study.best_trial.value

  print('Score : ', total)

## 4.2 하이퍼파라미터 튜닝 파라미터 정의


In [None]:
def HistGradientBoostingRegressor_param(trial):
  """Sample HistGradientBoostingRegressor keyword arguments from ``trial``.

  Stepped float ranges use ``suggest_float(..., step=...)``, the replacement
  for the deprecated ``suggest_discrete_uniform``. ``random_state`` is a
  single-choice categorical, so it is fixed to SEED but still recorded in the
  trial's params.
  """
  return {
    'learning_rate' : trial.suggest_float('learning_rate', 0.005, 0.05, step=0.001),
    'max_iter' : trial.suggest_int('max_iter',100, 1000),
    'max_leaf_nodes' : trial.suggest_int('max_leaf_nodes',7, 256),
    'max_depth' : trial.suggest_int('max_depth',4, 40),
    'min_samples_leaf' : trial.suggest_int('min_samples_leaf',4, 400),
    'l2_regularization' : trial.suggest_float('l2_regularization', 0, 1, step=0.05),
    'random_state' : trial.suggest_categorical('random_state', [SEED]),
    }

In [None]:
def XGBRegressor_param(trial):
  """Sample XGBRegressor keyword arguments from ``trial``.

  Stepped float ranges use ``suggest_float(..., step=...)``, the replacement
  for the deprecated ``suggest_discrete_uniform``. Single-choice categoricals
  fix a value while still recording it in the trial's params;
  ``random_state`` is passed directly and is therefore not recorded.
  """
  return {
    "objective": trial.suggest_categorical("objective", ['reg:squarederror']),
    'random_state': SEED,
    "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
    "learning_rate": trial.suggest_float('learning_rate', 0.005, 0.05, step=0.001),
    'nthread' : trial.suggest_categorical("nthread", [-1]),
    'max_depth':trial.suggest_int('max_depth', 8, 16),
    'min_child_weight':trial.suggest_int('min_child_weight', 2, 300),
    'gamma':trial.suggest_float('gamma', 0, 1, step=0.05),
    'colsample_bytree':trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
    'lambda': trial.suggest_float('lambda', 0, 1, step=0.05),
    'alpha': trial.suggest_float('alpha', 0, 1, step=0.05),
    'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
    'tree_method': trial.suggest_categorical("tree_method", ['gpu_hist']),
    'predictor': trial.suggest_categorical("predictor", ['gpu_predictor']),
    }

In [None]:
def LGBMRegressor_param(trial):
  """Sample LGBMRegressor keyword arguments from ``trial``.

  Stepped float ranges use ``suggest_float(..., step=...)``, the replacement
  for the deprecated ``suggest_discrete_uniform``. Single-choice categoricals
  (objective, device, metric, verbose, random_state) fix a value while still
  recording it in the trial's params.
  """
  return {
        "objective": trial.suggest_categorical("objective", ['regression']),
        "device": trial.suggest_categorical("device", ['gpu']),
        "metric": trial.suggest_categorical("metric", ['rmse']),
        "verbose": trial.suggest_categorical("verbose", [-1]),
        "random_state": trial.suggest_categorical("random_state", [SEED]),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "learning_rate": trial.suggest_float('learning_rate', 0.005, 0.05, step=0.001),
        'max_depth': trial.suggest_int('max_depth', 5, 20),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.5, 1, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1, step=0.1),
        'num_leaves' : trial.suggest_int('num_leaves', 2, 1000),
    }

In [None]:
def CatBoostRegressor_param(trial):
  """Sample CatBoostRegressor keyword arguments from ``trial``.

  The stepped learning-rate range uses ``suggest_float(..., step=...)``, the
  replacement for the deprecated ``suggest_discrete_uniform``. Single-choice
  categoricals (border_count, eval_metric, random_state, logging_level) fix a
  value while still recording it in the trial's params.
  """
  return {
    "iterations":  trial.suggest_int("iterations", 100, 1000),
    "learning_rate": trial.suggest_float('learning_rate', 0.005, 0.05, step=0.001),
    'depth': trial.suggest_int('depth', 4, 10),
    'l2_leaf_reg' : trial.suggest_int('l2_leaf_reg', 2, 10),
    'random_strength' : trial.suggest_float('random_strength', 0, 10),
    'border_count' : trial.suggest_categorical("border_count", [254]),
    'eval_metric' : trial.suggest_categorical("eval_metric", ['RMSE']),
    'random_state' : trial.suggest_categorical("random_state", [SEED]),
    'logging_level' : trial.suggest_categorical("logging_level", ['Silent'])
  }

# 5 하이퍼파라미터 튜닝 시작

In [None]:
# Tune HistGradientBoostingRegressor: 100 trials per target column, resuming any saved study
start_tuning(X_train, Y_train, HistGradientBoostingRegressor, HistGradientBoostingRegressor_param, 100)

Study loaded
HistGradientBoostingRegressor 0 Best trial: score 0.2534784650782734,
params {'learning_rate': 0.033, 'max_iter': 903, 'max_leaf_nodes': 63, 'max_depth': 25, 'min_samples_leaf': 67, 'l2_regularization': 0.05, 'random_state': 42}
Study loaded
HistGradientBoostingRegressor 1 Best trial: score 0.3557918478237482,
params {'learning_rate': 0.048999999999999995, 'max_iter': 834, 'max_leaf_nodes': 47, 'max_depth': 19, 'min_samples_leaf': 215, 'l2_regularization': 0.75, 'random_state': 42}
Study loaded
HistGradientBoostingRegressor 2 Best trial: score 0.3493231030591771,
params {'learning_rate': 0.043, 'max_iter': 677, 'max_leaf_nodes': 65, 'max_depth': 37, 'min_samples_leaf': 114, 'l2_regularization': 0.55, 'random_state': 42}
Study loaded
HistGradientBoostingRegressor 3 Best trial: score 0.18675513092972185,
params {'learning_rate': 0.019, 'max_iter': 787, 'max_leaf_nodes': 256, 'max_depth': 39, 'min_samples_leaf': 61, 'l2_regularization': 0.2, 'random_state': 42}
Study loaded
H

In [None]:
# Tune XGBRegressor: 100 trials per target column, resuming any saved study
start_tuning(X_train, Y_train, XGBRegressor, XGBRegressor_param, 100)

Study loaded
XGBRegressor 0 Best trial: score 0.2532410409384196,
params {'objective': 'reg:squarederror', 'n_estimators': 891, 'learning_rate': 0.009000000000000001, 'nthread': -1, 'max_depth': 9, 'min_child_weight': 9, 'gamma': 0.6000000000000001, 'colsample_bytree': 0.8, 'lambda': 0.35000000000000003, 'alpha': 0.2, 'subsample': 0.7, 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
Study loaded
XGBRegressor 1 Best trial: score 0.3555436581498101,
params {'objective': 'reg:squarederror', 'n_estimators': 390, 'learning_rate': 0.038, 'nthread': -1, 'max_depth': 14, 'min_child_weight': 85, 'gamma': 0.75, 'colsample_bytree': 1.0, 'lambda': 0.1, 'alpha': 0.7000000000000001, 'subsample': 0.9, 'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
Study loaded
XGBRegressor 2 Best trial: score 0.3486160218469771,
params {'objective': 'reg:squarederror', 'n_estimators': 615, 'learning_rate': 0.012, 'nthread': -1, 'max_depth': 12, 'min_child_weight': 30, 'gamma': 0.65, 'colsample_bytr

In [None]:
# Tune LGBMRegressor: 100 trials per target column, resuming any saved study
start_tuning(X_train, Y_train, LGBMRegressor, LGBMRegressor_param, 100)

Study loaded
LGBMRegressor 0 Best trial: score 0.2533720818225458,
params {'objective': 'regression', 'device': 'gpu', 'metric': 'rmse', 'verbose': -1, 'random_state': 42, 'n_estimators': 322, 'learning_rate': 0.012, 'max_depth': 11, 'min_child_samples': 81, 'subsample': 0.5, 'colsample_bytree': 0.6, 'num_leaves': 368}
Study loaded
LGBMRegressor 1 Best trial: score 0.35580219094647103,
params {'objective': 'regression', 'device': 'gpu', 'metric': 'rmse', 'verbose': -1, 'random_state': 42, 'n_estimators': 431, 'learning_rate': 0.009000000000000001, 'max_depth': 12, 'min_child_samples': 47, 'subsample': 0.8, 'colsample_bytree': 0.8, 'num_leaves': 376}
Study loaded
LGBMRegressor 2 Best trial: score 0.34892366064576685,
params {'objective': 'regression', 'device': 'gpu', 'metric': 'rmse', 'verbose': -1, 'random_state': 42, 'n_estimators': 258, 'learning_rate': 0.012, 'max_depth': 11, 'min_child_samples': 76, 'subsample': 0.8, 'colsample_bytree': 0.5, 'num_leaves': 244}
Study loaded
LGBMReg

In [None]:
# Tune CatBoostRegressor: 100 trials per target column, resuming any saved study
start_tuning(X_train, Y_train, CatBoostRegressor, CatBoostRegressor_param, 100)

Study loaded
CatBoostRegressor 0 Best trial: score 0.25340840706541967,
params {'iterations': 930, 'learning_rate': 0.032, 'depth': 9, 'l2_leaf_reg': 9, 'random_strength': 7.1762126213865765, 'border_count': 254, 'eval_metric': 'RMSE', 'random_state': 42, 'logging_level': 'Silent'}
Study loaded
CatBoostRegressor 1 Best trial: score 0.35566062106822194,
params {'iterations': 864, 'learning_rate': 0.032, 'depth': 7, 'l2_leaf_reg': 9, 'random_strength': 9.20684648068988, 'border_count': 254, 'eval_metric': 'RMSE', 'random_state': 42, 'logging_level': 'Silent'}
Study loaded
CatBoostRegressor 2 Best trial: score 0.34860868199562,
params {'iterations': 937, 'learning_rate': 0.021, 'depth': 9, 'l2_leaf_reg': 5, 'random_strength': 9.662960020444885, 'border_count': 254, 'eval_metric': 'RMSE', 'random_state': 42, 'logging_level': 'Silent'}
Study loaded
CatBoostRegressor 3 Best trial: score 0.18644317452864037,
params {'iterations': 811, 'learning_rate': 0.041999999999999996, 'depth': 9, 'l2_lea