# 앙상블 모델 실습

- XGBoost, LightGBM, Catboost의 간단한 구현을 해보고 각각의 모델을 비교해 봅시다

# 설정

먼저 몇 개의 모듈을 임포트합니다. 맷플롯립 그래프를 인라인으로 출력하도록 만들고 그림을 저장하는 함수를 준비합니다. 또한 파이썬 버전이 3.5 이상인지 확인합니다(파이썬 2.x에서도 동작하지만 곧 지원이 중단되므로 파이썬 3을 사용하는 것이 좋습니다). 사이킷런 버전이 0.20 이상인지도 확인합니다.

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
path = "/content/drive/MyDrive/DCC/ML/handson-ml2/datasets"

In [3]:
import os
os.chdir(path)

In [4]:
os.getcwd()

'/content/drive/MyDrive/DCC/ML/handson-ml2/datasets'

In [5]:
# 파이썬 ≥3.5 필수
import sys
assert sys.version_info >= (3, 5)

# 사이킷런 ≥0.20 필수
import sklearn
assert sklearn.__version__ >= "0.20"

# 공통 모듈 임포트
import numpy as np
import os

# 노트북 실행 결과를 동일하게 유지하기 위해
np.random.seed(42)

# 깔끔한 그래프 출력을 위해
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize = 14)
mpl.rc('xtick', labelsize = 12)
mpl.rc('ytick', labelsize = 12)

# 그림을 저장할 위치
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok = True)

def save_fig(fig_id, tight_layout = True, fig_extension = "png", resolution = 300) :
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("그림 저장:", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format = fig_extension, dpi = resolution)

## 패키지 설치

실습에 필요한 패키지 설치

- catboost
- lightGBM
- pycaret

In [6]:
# 한글 폰트 설치

!sudo apt-get install -y fonts-nanum
!sudo fc-cache -fv
!rm ~/.cache/matplotlib -rf

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following NEW packages will be installed:
  fonts-nanum
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 9,604 kB of archives.
After this operation, 29.5 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 fonts-nanum all 20170925-1 [9,604 kB]
Fetched 9,604 kB in 1s (6,510 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletyp

In [7]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.0.6-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.0.6


In [8]:
# !pip install pandas_profiling

In [9]:
!pip install pycaret

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[K     |████████████████████████████████| 320 kB 5.0 MB/s 
Collecting pyLDAvis
  Downloading pyLDAvis-3.3.1.tar.gz (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 33.5 MB/s 
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[K     |████████████████████████████████| 88 kB 6.3 MB/s 
[?25hCollecting mlxtend>=0.17.0
  Downloading mlxtend-0.20.0-py2.py3-none-any.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 40.6 MB/s 
Collecting scipy<=1.5.4
  Downloading scipy-1.5.4-cp37-cp37m-manylinux1_x86_64.whl (25.9 MB)
[K     |████████████████████████████████| 25

## GPU 사용

In [10]:
import torch
import torchvision
import torch.nn

In [11]:
!pip install --upgrade torch torchvision

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [12]:
torch.randn(3, 4, dtype = torch.float)

tensor([[ 0.5371, -0.4553,  0.0736,  0.6266],
        [ 0.1617, -0.3257,  0.0939,  0.6103],
        [ 0.0278, -1.6380, -0.2509,  0.8674]])

In [13]:
torch.cuda.is_available()

False

# 1 데이터 읽어오기 및 학습데이터 구축


## 1.1 데이터 읽어오기

In [14]:
import os
import pandas as pd
import numpy as np
import time # 학습하는데 걸리는 시간 계산하는 패키지

In [15]:
data = pd.read_csv("flights_sample.csv")

# 전체 data 중에 10%만 추출
# data = data.sample(frac = 0.1, random_state = 10)

data.head()

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
0,411984,2015,1,28,3,WN,103,N7728D,DCA,MKE,...,811.0,1.0,0,0,,,,,,
1,3591965,2015,8,11,2,B6,153,N592JB,JFK,PBI,...,345.0,337.0,0,0,,0.0,0.0,82.0,255.0,0.0
2,526451,2015,2,4,3,DL,1187,N921DN,MSP,DCA,...,2043.0,-19.0,0,0,,,,,,
3,1336011,2015,3,27,5,WN,171,N407WN,DEN,RDU,...,2313.0,-7.0,0,0,,,,,,
4,3424502,2015,8,1,6,WN,4330,N7751A,ATL,RIC,...,2318.0,13.0,0,0,,,,,,


## 1.2 매우 간단한 EDA

In [16]:
data.shape

(581908, 32)

In [17]:
data.describe()

Unnamed: 0.1,Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,FLIGHT_NUMBER,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,...,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
count,581908.0,581908.0,581908.0,581908.0,581908.0,581908.0,581908.0,573166.0,573166.0,572893.0,...,581908.0,572576.0,571339.0,581908.0,581908.0,105838.0,105838.0,105838.0,105838.0,105838.0
mean,2907859.0,2015.0,6.520888,15.697009,3.929209,2174.326538,1329.730994,1335.25985,9.384477,16.0764,...,1494.525428,1477.077317,4.429937,0.00255,0.015612,13.694561,0.069616,19.026966,23.600106,2.962953
std,1679056.0,0.0,3.403771,8.78047,1.987539,1756.913853,483.372298,496.043433,37.456924,8.891136,...,506.728727,525.954125,39.649669,0.050435,0.123971,28.747475,1.95667,49.127407,43.171513,20.718515
min,36.0,2015.0,1.0,1.0,1.0,1.0,1.0,1.0,-44.0,1.0,...,1.0,1.0,-73.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1451318.0,2015.0,4.0,8.0,2.0,731.0,917.0,921.0,-5.0,11.0,...,1110.0,1059.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,2909395.0,2015.0,7.0,16.0,4.0,1693.0,1325.0,1330.0,-2.0,14.0,...,1520.0,1513.0,-5.0,0.0,0.0,2.0,0.0,2.0,3.0,0.0
75%,4360604.0,2015.0,9.0,23.0,6.0,3233.0,1730.0,1740.0,7.0,19.0,...,1919.0,1917.0,8.0,0.0,0.0,18.0,0.0,19.0,30.0,0.0
max,5819075.0,2015.0,12.0,31.0,7.0,9794.0,2359.0,2400.0,1670.0,185.0,...,2400.0,2400.0,1665.0,1.0,1.0,916.0,227.0,1665.0,1313.0,1052.0


In [18]:
# import pandas_profiling

# profile = data.profile_report()

# profile

## 1.3 학습 데이터 구축해보기

In [19]:
# 필요한 컬럼만 추출
data = data[["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT",
                 "ORIGIN_AIRPORT", "AIR_TIME", "DEPARTURE_TIME", "DISTANCE", "ARRIVAL_DELAY"]]

# NA 값 제거
data.dropna(inplace = True)

# Classification 문제로 바꿈
# Arrival delay가 10분보다 큰지 작은지 (1 / 0)으로 구분
# Arrival delay가 10분보다 크면 1, 작으면 0
data["ARRIVAL_DELAY"] = (data["ARRIVAL_DELAY"] > 10) * 1
data["ARRIVAL_DELAY"].value_counts()

0    446235
1    125104
Name: ARRIVAL_DELAY, dtype: int64

범주형 변수 처리  
참고  - https://steadiness-193.tistory.com/233

In [20]:
data["AIRLINE"]

0         WN
1         B6
2         DL
3         WN
4         WN
          ..
581903    MQ
581904    US
581905    OO
581906    AA
581907    AA
Name: AIRLINE, Length: 571339, dtype: object

In [21]:
# 문자형인 변수를 카테고리화
data["AIRLINE"].astype("category")

0         WN
1         B6
2         DL
3         WN
4         WN
          ..
581903    MQ
581904    US
581905    OO
581906    AA
581907    AA
Name: AIRLINE, Length: 571339, dtype: category
Categories (14, object): ['AA', 'AS', 'B6', 'DL', ..., 'UA', 'US', 'VX', 'WN']

In [22]:
# 문자형인 변수를 카테고리화 하여 숫자를 할당함
# Memory 절약

data["AIRLINE"].astype("category").cat.codes+1

0         14
1          3
2          4
3         14
4         14
          ..
581903     8
581904    12
581905    10
581906     1
581907     1
Length: 571339, dtype: int8

In [23]:
# 다른 범주형 변수들도 동일한 방법으로 카테고리화하여 숫자를 할당
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for item in cols:
    data[item] = data[item].astype("category").cat.codes +1

Train Set, Test Set 구분

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis = 1), data["ARRIVAL_DELAY"],
                                                random_state = 10, test_size = 0.25)  # Train Set : 75%, Test Set : 25%

In [25]:
print(X_train.shape); print(X_test.shape)
print(y_train.shape); print(y_test.shape)

(428504, 10)
(142835, 10)
(428504,)
(142835,)


# 2 XGBoost 모델 구축

최적의 하이퍼파라미터 찾기 !
- 코드 실행이 매우 오래 걸림

In [None]:
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train, m.predict_proba(X_train)[:,1]),  # Train Set에 대한 AUC Score
            metrics.roc_auc_score(y_test, m.predict_proba(X_test)[:,1]))    # Test  Set에 대한 AUC Score

# Parameter Tuning
model = xgb.XGBClassifier()
param_dist = {"max_depth": [10, 30, 50],
              "learning_rate": [0.05, 0.1,0.16],
              "min_child_weight" : [1, 3, 6],
              "n_estimators": [200],
              }
              
grid_search = GridSearchCV(model, param_grid = param_dist, cv = 3, 
                           verbose = 10, n_jobs = -1)

# 모델 학습
grid_search.fit(X_train, y_train)

# 가장 좋은 하이퍼 파라미터 추출
grid_search.best_estimator_

- 가장 좋은 Best Estimator
```
xgb.XGBClassifier(max_depth = 50, learning_rate = 0.16, min_child_weight = 1, n_estimators = 200, n_jobs = -1, verbose = 1)
```



In [None]:
# 가장 좋은 Best Estimator
model = xgb.XGBClassifier(max_depth = 50, learning_rate = 0.16, min_child_weight = 1,
                          n_estimators = 200, n_jobs = -1, verbose = 1)

model.fit(X_train, y_train)
auc(model, X_train, X_test)

- AUC Score
```
auc(model, X_train, X_test)
#> (1.0, 0.7898891456656827)
```



# 3 LightGBM 모델 구축

최적의 하이퍼파라미터 찾기 !
- 코드 실행이 매우 오래 걸림

In [None]:
import lightgbm as lgb
from sklearn import metrics

def auc2(m, train, test): 
    return (metrics.roc_auc_score(y_train, m.predict(X_train)),
                            metrics.roc_auc_score(y_test, m.predict(X_test)))

lg = lgb.LGBMClassifier(silent = False)
param_dist = {"max_depth": [25,50, 75],
              "learning_rate" : [0.01, 0.05, 0.1],
              "num_leaves": [300, 900, 1200],
              "n_estimators": [300]
             }
grid_search = GridSearchCV(lg, n_jobs = -1, param_grid = param_dist, cv = 3, scoring = "roc_auc", verbose = 5)
grid_search.fit(X_train,y_train)
grid_search.best_estimator_

- 가장 좋은 Best Estimator
```
params = {"max_depth": 50, "learning_rate" : 0.1, "num_leaves": 900,  "n_estimators": 300}
```

찾은 하이퍼파라미터로 모델을 다시 학습 !

In [None]:
d_train = lgb.Dataset(X_train, label = y_train)
params = {"max_depth": 50, "learning_rate" : 0.1, "num_leaves": 900,  "n_estimators": 300}

# Without Categorical Features
model2 = lgb.train(params, d_train)
auc2(model2, X_train, X_test)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1522
[LightGBM] [Info] Number of data points in the train set: 428504, number of used features: 10
[LightGBM] [Info] Start training from score 0.219146


(0.96022681258952, 0.7835140779699798)

In [None]:
d_train = lgb.Dataset(X_train, label = y_train, free_raw_data = False)

#With Catgeorical Features
cate_features_name = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE",
                      "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]

model2 = lgb.train(params, d_train, categorical_feature = cate_features_name)

auc2(model2, X_train, X_test)

New categorical_feature is ['AIRLINE', 'DAY', 'DAY_OF_WEEK', 'DESTINATION_AIRPORT', 'MONTH', 'ORIGIN_AIRPORT']


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1823
[LightGBM] [Info] Number of data points in the train set: 428504, number of used features: 10
[LightGBM] [Info] Start training from score 0.219146


(0.9944446344291054, 0.773910349218679)

- AUC Score
1. 범주형 변수를 따로 지정하지 않았을 경우
```
model2 = lgb.train(params, d_train)
auc2(model2, X_train, X_test)
#> (0.96022681258952, 0.7835140779699798)
```

2. 범주형 변수를 따로 지정하였을 경우
```
model2 = lgb.train(params, d_train, categorical_feature = cate_features_name)
auc2(model2, X_train, X_test)
#> (0.9944446344291054, 0.773910349218679)
```

이 데이터의 경우, 범주형 변수를 따로 지정하면 Test Set의 AUC Score가 낮아지므로
Overfitting 되었음을 알 수 있다.

# 4 Catboost 모델 구축

In [None]:
X_train.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE
298422,12,22,2,9,813,531,540,113.0,554.0,678
489243,12,4,5,14,456,592,580,64.0,1136.0,372
298007,5,13,3,5,4926,562,341,79.0,911.0,474
579998,6,11,4,4,1270,468,322,43.0,2050.0,270
500204,10,27,2,4,2278,146,19,61.0,1609.0,432


최적의 하이퍼파라미터 찾기 !
- 코드 실행이 매우 오래 걸림

In [None]:
import catboost as cb
from sklearn import metrics

cat_features_index = [0, 1, 2, 3, 4, 5, 6]

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(X_train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(X_test)[:,1]))

params = {'depth': [4, 7, 10],
          'learning_rate' : [0.03, 0.1, 0.15],
          'l2_leaf_reg': [1, 4, 9],
          'iterations': [500]}

cb = cb.CatBoostClassifier()
cb_model = GridSearchCV(cb, params, scoring = "roc_auc", cv = 3)
cb_model.fit(X_train, y_train)

# Without Categorical features
clf = cb.CatBoostClassifier(eval_metric = "AUC", depth = 10, iterations = 500, l2_leaf_reg = 9, learning_rate = 0.15)
clf.fit(X_train,y_train)
auc(clf, X_train, X_test)

# With Categorical features
clf = cb.CatBoostClassifier(eval_metric = "AUC", one_hot_max_size = 31,
                            depth = 10, iterations = 500, l2_leaf_reg = 9, learning_rate = 0.15)
clf.fit(X_train,y_train, cat_features= cat_features_index)
auc(clf, X_train, X_test)

- 가장 좋은 Best Estimator
```
clf = cb.CatBoostClassifier(eval_metric = "AUC", depth = 10, learning_rate = 0.15, iterations = 500, l2_leaf_reg = 9, one_hot_max_size = 50)
```

In [None]:
import catboost as cb
cat_features_index = [0, 1, 2, 3, 4, 5, 6]

def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(X_train)[:,1]),
            metrics.roc_auc_score(y_test,m.predict_proba(X_test)[:,1]))
    
# With Categorical features
clf = cb.CatBoostClassifier(eval_metric = "AUC", depth = 10, learning_rate = 0.15,
                            iterations = 500, l2_leaf_reg = 9, one_hot_max_size = 31)

clf.fit(X_train, y_train, cat_features = cat_features_index)

0:	total: 790ms	remaining: 6m 34s
1:	total: 1.55s	remaining: 6m 26s
2:	total: 2.33s	remaining: 6m 25s
3:	total: 2.98s	remaining: 6m 10s
4:	total: 3.6s	remaining: 5m 56s
5:	total: 4.26s	remaining: 5m 50s
6:	total: 4.96s	remaining: 5m 49s
7:	total: 5.58s	remaining: 5m 43s
8:	total: 6.38s	remaining: 5m 47s
9:	total: 6.99s	remaining: 5m 42s
10:	total: 7.77s	remaining: 5m 45s
11:	total: 8.52s	remaining: 5m 46s
12:	total: 9.19s	remaining: 5m 44s
13:	total: 10s	remaining: 5m 48s
14:	total: 11.5s	remaining: 6m 11s
15:	total: 12.5s	remaining: 6m 19s
16:	total: 13.6s	remaining: 6m 27s
17:	total: 14.6s	remaining: 6m 30s
18:	total: 15.3s	remaining: 6m 27s
19:	total: 15.9s	remaining: 6m 21s
20:	total: 16.6s	remaining: 6m 19s
21:	total: 17.3s	remaining: 6m 16s
22:	total: 18s	remaining: 6m 14s
23:	total: 18.7s	remaining: 6m 11s
24:	total: 19.4s	remaining: 6m 8s
25:	total: 20.2s	remaining: 6m 7s
26:	total: 20.9s	remaining: 6m 5s
27:	total: 21.6s	remaining: 6m 4s
28:	total: 22.3s	remaining: 6m 2s
29:	t

<catboost.core.CatBoostClassifier at 0x7f1ba9f66850>

In [None]:
auc(clf, X_train, X_test)

(0.9018377992523106, 0.8323839009545351)

- AUC Score
```
auc(clf, X_train, X_test)
#> (0.9018377992523106, 0.8323839009545351)
```

# 5 결과 비교

# 6 `pycaret`을 이용한 가장 좋은 모델 찾기

- https://pycaret.org/create-model/
- 만능은 아님. 계속 학습시키면서 모델을 찾고 하이퍼 파라미터 튜닝하는 것이 더 정확할 수 있음
- 처음 시작하기 좋음

In [None]:
!pip install pycaret

In [26]:
import pycaret

In [34]:
!pip install markupsafe==2.0.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [36]:
from pycaret.classification import *

ImportError: ignored

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test = train_test_split(data, random_state = 10, test_size = 0.25)

In [None]:
clf = setup(X_train, 
            target = 'ARRIVAL_DELAY', # 목표 변수
            preprocess = False, # True로 설정되면, 자체적인 Feature Engineering을 추가로 진행해 Predict가 불가능해진다.
            use_gpu = False, # GPU가 있으면 사용
            categorical_features = ["MONTH", "DAY", "DAY_OF_WEEK", "AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"],
            numeric_features = ["AIR_TIME", "DEPARTURE_TIME", "DISTANCE"], 
            session_id = 2021,
            fold = 5,
            fold_shuffle = True
            )

Unnamed: 0,Description,Value
0,session_id,2021
1,Target,ARRIVAL_DELAY
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(428504, 11)"
5,Missing Values,False
6,Numeric Features,3
7,Categorical Features,7
8,Transformed Train Set,"(299952, 10)"
9,Transformed Test Set,"(128552, 10)"


In [None]:
compare_models()

[]

In [None]:
top5 = compare_models(n_select = 5, sort = 'AUC')

# 7 `optuna`를 이용한 Hyper-parameter 튜닝

In [None]:
!pip install optuna

In [None]:
import optuna
from sklearn.model_selection import train_test_split
import catboost as cb
from sklearn import metrics

In [None]:
# 위에서 Grid search로 찾았던 파라미터 

params = {'depth': [4, 7, 10], # 10
          'learning_rate' : [0.03, 0.1, 0.15], # 0.15
          'l2_leaf_reg': [1, 4, 9], # 9
         'iterations': [300]} # 300

In [None]:
y_train.shape

(428504,)

In [None]:
def objective(trial, train, y_train):
    cat_features_index = [0,1,2,3,4,5,6]

    # 하이퍼파라미터 튜닝을 위한 Validation data 생성
    X_train, X_val, y_train, y_val = train_test_split(train, y_train, test_size = 0.3)

    param = {
        "learning_rate": trial.suggest_loguniform("learning_rate", 1e-5, 1e0),
        "l2_leaf_reg": trial.suggest_loguniform("l2_leaf_reg", 1e-2, 1e0),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 10),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 2, 20),
        "one_hot_max_size": trial.suggest_int("one_hot_max_size", 2, 20),  
    }
    
    model = cb.CatBoostClassifier(eval_metric = "AUC", 
                                  cat_features = cat_features_index)
    model.set_params(**param)

    model.fit(X_train,
              y_train,
              eval_set = [(X_val, y_val)],
              verbose = 0, 
              early_stopping_rounds = 30)
    
    auc_score = metrics.roc_auc_score(y_val, model.predict_proba(X_val)[:,1])
    
    return auc_score

In [None]:
import numpy as np
import pandas as pd
from optuna.samplers import TPESampler

study = optuna.create_study(sampler = TPESampler(), direction = "maximize")
func = lambda trial: objective(trial, X_train, y_train)
study.optimize(func, n_trials = 10, timeout = 600) # Run for 10 minutes

print("Number of completed trials: {}".format(len(study.trials)))
print("Best trial:")
trial = study.best_trial

print("\tBest Score: {}".format(trial.value))
print("\tBest Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-07-18 09:53:41,444][0m A new study created in memory with name: no-name-297ca56e-2f01-418f-840f-a742dc23e316[0m
[32m[I 2022-07-18 09:53:49,679][0m Trial 0 finished with value: 1.0 and parameters: {'learning_rate': 0.00011216761115818948, 'l2_leaf_reg': 0.39244195696489176, 'colsample_bylevel': 0.09243002213738269, 'depth': 8, 'min_data_in_leaf': 20, 'one_hot_max_size': 5}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-07-18 09:53:59,448][0m Trial 1 finished with value: 1.0 and parameters: {'learning_rate': 0.34632032100437576, 'l2_leaf_reg': 0.2433991366337072, 'colsample_bylevel': 0.07471365059879193, 'depth': 9, 'min_data_in_leaf': 17, 'one_hot_max_size': 2}. Best is trial 0 with value: 1.0.[0m
[32m[I 2022-07-18 09:54:07,792][0m Trial 2 finished with value: 1.0 and parameters: {'learning_rate': 0.035718049395143794, 'l2_leaf_reg': 0.03047032313235976, 'colsample_bylevel': 0.035088431151771494, 'depth': 9, 'min_data_in_leaf': 6, 'one_hot_max_size': 4}. Best is

Number of completed trials: 10
Best trial:
	Best Score: 1.0
	Best Params: 
    learning_rate: 0.00011216761115818948
    l2_leaf_reg: 0.39244195696489176
    colsample_bylevel: 0.09243002213738269
    depth: 8
    min_data_in_leaf: 20
    one_hot_max_size: 5


In [None]:
study.best_params

In [None]:
def auc(m, train, test): 
    return (metrics.roc_auc_score(y_train,m.predict_proba(X_train)[:,1]),
                            metrics.roc_auc_score(y_test,m.predict_proba(X_test)[:,1]))


In [None]:
param = {
    "learning_rate" : 0.3686617844744718,
    "l2_leaf_reg" : 0.9350132458243919,
    "colsample_bylevel" : 0.08342721275062269,
    "depth" : 4,
    "min_data_in_leaf" : 2,
    "one_hot_max_size" : 20,  
}

In [None]:
import catboost as cb
from sklearn.model_selection import train_test_split

cat_features_index = [0, 1, 2, 3, 4, 5, 6]

X_t_train, X_t_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2)

model = cb.CatBoostClassifier(eval_metric = "AUC", 
                                  cat_features = cat_features_index)
model.set_params(**param)

model.fit(X_t_train,
          y_train,
          eval_set = [(X_val, y_val)],
          verbose = 0, 
          early_stopping_rounds = 100)

Custom logger is already specified. Specify more than one logger at same time is not thread safe.

<catboost.core.CatBoostClassifier at 0x7fe73861c5d0>

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["ARRIVAL_DELAY"], axis = 1),
                                                    data["ARRIVAL_DELAY"],
                                                    random_state = 10, test_size = 0.25)

In [None]:
auc(model, X_train, X_test)

(0.7685238520722791, 0.7379037996510001)