# CatBoost Tutorial

# 0. CatBoost Installation

In [3]:
#!pip install catboost
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension

## 0-1. Data Loading
간단한 예제로 titanic dataset에 대해 catboost를 적용해보겠습니다. 

In [1]:
from catboost.datasets import titanic
import numpy as np

# Load the Titanic train/test DataFrames bundled with catboost.datasets.
train_df, test_df = titanic()

# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


### Null 값 check

In [2]:
# Per-column count of missing values; display only the columns that have any.
null_value_stats = train_df.isnull().sum(axis=0)
null_value_stats[null_value_stats != 0]

Age         177
Cabin       687
Embarked      2
dtype: int64

missing value였던 instance를 구분하기 위해서 아주 작은 값을 할당해주도록 하겠습니다. 

In [3]:
# Replace every missing value with the sentinel -999 so those rows stay
# distinguishable; both frames are modified in place.
train_df.fillna(-999, inplace=True)
test_df.fillna(-999, inplace=True)

input variable과 target variable을 나눠줍니다. 

In [4]:
# X: every column except the label; y: the Survived label column.
X = train_df.drop('Survived', axis=1)
y = train_df.Survived

CatBoost는 categorical feature로 지정된 column을 내부에서 자동으로 처리하여 ordered boosting을 수행하므로, categorical feature의 index만 따로 저장해주면 됩니다. 아래 코드에서는 dtype이 float이 아닌 모든 column(문자열뿐 아니라 정수형 column 포함)을 categorical feature로 지정합니다.

In [5]:
print(X.dtypes)
# `np.float` was only an alias of the builtin `float` and no longer exists in
# NumPy >= 1.24, so compare against `float` directly; the selection is unchanged.
# Every non-float column (object and int64 alike) is treated as categorical.
categorical_features_indices = np.where(X.dtypes != float)[0]
print(X.columns)
categorical_features_indices

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object
Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')


array([ 0,  1,  2,  3,  5,  6,  7,  9, 10], dtype=int64)

### Data Splitting
train:validation = 3:1

In [6]:
from sklearn.model_selection import train_test_split

# 75% of the labelled rows go to training, the rest to validation;
# random_state pins the shuffle so the split is reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.75, random_state=923)
# The test frame is used whole as the prediction set.
X_test = test_df

## 1. Modeling CatBoost

이제 제공된 CatBoost library를 사용하여 Catboost Model을 만들어보도록 하겠습니다. 

In [7]:
from catboost import CatBoostClassifier, Pool, cv
from sklearn.metrics import accuracy_score

### Training 

Loss function을 최소화하는 방향으로 학습하되, Accuracy를 추가 metric(custom_loss)으로 함께 기록하도록 하겠습니다.

In [73]:
# Accuracy is only tracked as an extra metric via custom_loss; no
# loss_function is given here, so the classifier's default objective is used.
model = CatBoostClassifier(
    custom_loss=['Accuracy'],
    random_seed=923,
    logging_level='Silent'
)

model을 fit 시킬 때, cat_features에 앞서 저장해둔 categorical feature의 index를 지정해줍니다. 이렇게 지정해준 categorical feature에 대해 ordered TS를 계산하여 numerical value로 치환될 것 입니다. 

In [74]:
# cat_features lists the column indices to handle as categorical;
# eval_set supplies the validation data reported as "test" in the log.
# logging_level='Verbose' here overrides the 'Silent' level given to the
# constructor, so per-iteration metrics are printed.
model.fit(
    X_train, y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    logging_level='Verbose',  
    #plot=True
)

Learning rate set to 0.028683
0:	learn: 0.6739884	test: 0.6779887	best: 0.6779887 (0)	total: 24.9ms	remaining: 24.9s
1:	learn: 0.6575283	test: 0.6652014	best: 0.6652014 (1)	total: 52.7ms	remaining: 26.3s
2:	learn: 0.6414082	test: 0.6514773	best: 0.6514773 (2)	total: 68.4ms	remaining: 22.7s
3:	learn: 0.6272018	test: 0.6403541	best: 0.6403541 (3)	total: 70.6ms	remaining: 17.6s
4:	learn: 0.6112110	test: 0.6268848	best: 0.6268848 (4)	total: 84.5ms	remaining: 16.8s
5:	learn: 0.5967523	test: 0.6160295	best: 0.6160295 (5)	total: 99.1ms	remaining: 16.4s
6:	learn: 0.5836431	test: 0.6065507	best: 0.6065507 (6)	total: 112ms	remaining: 15.8s
7:	learn: 0.5718567	test: 0.5983032	best: 0.5983032 (7)	total: 130ms	remaining: 16.1s
8:	learn: 0.5635820	test: 0.5926517	best: 0.5926517 (8)	total: 139ms	remaining: 15.3s
9:	learn: 0.5530857	test: 0.5848741	best: 0.5848741 (9)	total: 151ms	remaining: 14.9s
10:	learn: 0.5436174	test: 0.5775056	best: 0.5775056 (10)	total: 163ms	remaining: 14.7s
11:	learn: 0.535

94:	learn: 0.3472590	test: 0.4730304	best: 0.4726647 (90)	total: 1.37s	remaining: 13s
95:	learn: 0.3463219	test: 0.4728980	best: 0.4726647 (90)	total: 1.39s	remaining: 13.1s
96:	learn: 0.3463057	test: 0.4729467	best: 0.4726647 (90)	total: 1.4s	remaining: 13s
97:	learn: 0.3455259	test: 0.4729330	best: 0.4726647 (90)	total: 1.42s	remaining: 13.1s
98:	learn: 0.3450137	test: 0.4737214	best: 0.4726647 (90)	total: 1.44s	remaining: 13.1s
99:	learn: 0.3442971	test: 0.4736851	best: 0.4726647 (90)	total: 1.46s	remaining: 13.1s
100:	learn: 0.3435989	test: 0.4735593	best: 0.4726647 (90)	total: 1.47s	remaining: 13.1s
101:	learn: 0.3429515	test: 0.4736792	best: 0.4726647 (90)	total: 1.49s	remaining: 13.1s
102:	learn: 0.3426272	test: 0.4736743	best: 0.4726647 (90)	total: 1.5s	remaining: 13.1s
103:	learn: 0.3423225	test: 0.4730495	best: 0.4726647 (90)	total: 1.51s	remaining: 13s
104:	learn: 0.3418398	test: 0.4729549	best: 0.4726647 (90)	total: 1.52s	remaining: 12.9s
105:	learn: 0.3410857	test: 0.47219

191:	learn: 0.3048107	test: 0.4662074	best: 0.4656128 (182)	total: 2.77s	remaining: 11.6s
192:	learn: 0.3048106	test: 0.4662069	best: 0.4656128 (182)	total: 2.77s	remaining: 11.6s
193:	learn: 0.3047790	test: 0.4662195	best: 0.4656128 (182)	total: 2.77s	remaining: 11.5s
194:	learn: 0.3044359	test: 0.4664736	best: 0.4656128 (182)	total: 2.79s	remaining: 11.5s
195:	learn: 0.3041269	test: 0.4664273	best: 0.4656128 (182)	total: 2.81s	remaining: 11.5s
196:	learn: 0.3036822	test: 0.4665485	best: 0.4656128 (182)	total: 2.83s	remaining: 11.5s
197:	learn: 0.3026967	test: 0.4669244	best: 0.4656128 (182)	total: 2.85s	remaining: 11.5s
198:	learn: 0.3024986	test: 0.4667496	best: 0.4656128 (182)	total: 2.87s	remaining: 11.5s
199:	learn: 0.3024558	test: 0.4667629	best: 0.4656128 (182)	total: 2.87s	remaining: 11.5s
200:	learn: 0.3019166	test: 0.4660328	best: 0.4656128 (182)	total: 2.89s	remaining: 11.5s
201:	learn: 0.3015846	test: 0.4657554	best: 0.4656128 (182)	total: 2.9s	remaining: 11.5s
202:	learn:

293:	learn: 0.2726067	test: 0.4648008	best: 0.4631718 (241)	total: 4.38s	remaining: 10.5s
294:	learn: 0.2725503	test: 0.4647915	best: 0.4631718 (241)	total: 4.4s	remaining: 10.5s
295:	learn: 0.2725034	test: 0.4648186	best: 0.4631718 (241)	total: 4.42s	remaining: 10.5s
296:	learn: 0.2724648	test: 0.4648225	best: 0.4631718 (241)	total: 4.44s	remaining: 10.5s
297:	learn: 0.2719739	test: 0.4646422	best: 0.4631718 (241)	total: 4.46s	remaining: 10.5s
298:	learn: 0.2718607	test: 0.4647162	best: 0.4631718 (241)	total: 4.48s	remaining: 10.5s
299:	learn: 0.2715648	test: 0.4649101	best: 0.4631718 (241)	total: 4.5s	remaining: 10.5s
300:	learn: 0.2714415	test: 0.4648546	best: 0.4631718 (241)	total: 4.51s	remaining: 10.5s
301:	learn: 0.2713164	test: 0.4648995	best: 0.4631718 (241)	total: 4.52s	remaining: 10.4s
302:	learn: 0.2706266	test: 0.4652265	best: 0.4631718 (241)	total: 4.54s	remaining: 10.4s
303:	learn: 0.2701916	test: 0.4652854	best: 0.4631718 (241)	total: 4.55s	remaining: 10.4s
304:	learn: 

386:	learn: 0.2450674	test: 0.4659053	best: 0.4631718 (241)	total: 6s	remaining: 9.51s
387:	learn: 0.2449117	test: 0.4662347	best: 0.4631718 (241)	total: 6.02s	remaining: 9.5s
388:	learn: 0.2444751	test: 0.4667114	best: 0.4631718 (241)	total: 6.04s	remaining: 9.49s
389:	learn: 0.2441628	test: 0.4663031	best: 0.4631718 (241)	total: 6.06s	remaining: 9.48s
390:	learn: 0.2436673	test: 0.4663573	best: 0.4631718 (241)	total: 6.08s	remaining: 9.47s
391:	learn: 0.2434781	test: 0.4664511	best: 0.4631718 (241)	total: 6.1s	remaining: 9.46s
392:	learn: 0.2428815	test: 0.4664287	best: 0.4631718 (241)	total: 6.12s	remaining: 9.45s
393:	learn: 0.2428345	test: 0.4664182	best: 0.4631718 (241)	total: 6.14s	remaining: 9.44s
394:	learn: 0.2424894	test: 0.4663075	best: 0.4631718 (241)	total: 6.16s	remaining: 9.43s
395:	learn: 0.2421481	test: 0.4662539	best: 0.4631718 (241)	total: 6.17s	remaining: 9.41s
396:	learn: 0.2420481	test: 0.4662986	best: 0.4631718 (241)	total: 6.19s	remaining: 9.41s
397:	learn: 0.2

484:	learn: 0.2217705	test: 0.4666818	best: 0.4631718 (241)	total: 7.76s	remaining: 8.24s
485:	learn: 0.2215942	test: 0.4666802	best: 0.4631718 (241)	total: 7.78s	remaining: 8.23s
486:	learn: 0.2213157	test: 0.4665956	best: 0.4631718 (241)	total: 7.8s	remaining: 8.22s
487:	learn: 0.2210594	test: 0.4664525	best: 0.4631718 (241)	total: 7.82s	remaining: 8.21s
488:	learn: 0.2206283	test: 0.4665328	best: 0.4631718 (241)	total: 7.84s	remaining: 8.19s
489:	learn: 0.2199419	test: 0.4671534	best: 0.4631718 (241)	total: 7.86s	remaining: 8.18s
490:	learn: 0.2198899	test: 0.4672020	best: 0.4631718 (241)	total: 7.87s	remaining: 8.16s
491:	learn: 0.2196549	test: 0.4672765	best: 0.4631718 (241)	total: 7.88s	remaining: 8.14s
492:	learn: 0.2191322	test: 0.4672219	best: 0.4631718 (241)	total: 7.9s	remaining: 8.12s
493:	learn: 0.2186208	test: 0.4670487	best: 0.4631718 (241)	total: 7.92s	remaining: 8.11s
494:	learn: 0.2181655	test: 0.4669192	best: 0.4631718 (241)	total: 7.93s	remaining: 8.1s
495:	learn: 0

585:	learn: 0.1990138	test: 0.4692822	best: 0.4631718 (241)	total: 9.58s	remaining: 6.77s
586:	learn: 0.1988735	test: 0.4691996	best: 0.4631718 (241)	total: 9.6s	remaining: 6.75s
587:	learn: 0.1985867	test: 0.4693015	best: 0.4631718 (241)	total: 9.62s	remaining: 6.74s
588:	learn: 0.1982422	test: 0.4689274	best: 0.4631718 (241)	total: 9.63s	remaining: 6.72s
589:	learn: 0.1981414	test: 0.4688989	best: 0.4631718 (241)	total: 9.65s	remaining: 6.71s
590:	learn: 0.1980615	test: 0.4688615	best: 0.4631718 (241)	total: 9.66s	remaining: 6.69s
591:	learn: 0.1978709	test: 0.4687524	best: 0.4631718 (241)	total: 9.68s	remaining: 6.67s
592:	learn: 0.1976247	test: 0.4690688	best: 0.4631718 (241)	total: 9.7s	remaining: 6.66s
593:	learn: 0.1973927	test: 0.4689486	best: 0.4631718 (241)	total: 9.71s	remaining: 6.64s
594:	learn: 0.1973432	test: 0.4689562	best: 0.4631718 (241)	total: 9.73s	remaining: 6.62s
595:	learn: 0.1972460	test: 0.4690807	best: 0.4631718 (241)	total: 9.75s	remaining: 6.61s
596:	learn: 

677:	learn: 0.1798992	test: 0.4724078	best: 0.4631718 (241)	total: 11.2s	remaining: 5.32s
678:	learn: 0.1794462	test: 0.4722642	best: 0.4631718 (241)	total: 11.2s	remaining: 5.31s
679:	learn: 0.1793873	test: 0.4723935	best: 0.4631718 (241)	total: 11.2s	remaining: 5.29s
680:	learn: 0.1793017	test: 0.4725678	best: 0.4631718 (241)	total: 11.3s	remaining: 5.28s
681:	learn: 0.1791738	test: 0.4727240	best: 0.4631718 (241)	total: 11.3s	remaining: 5.26s
682:	learn: 0.1790112	test: 0.4728274	best: 0.4631718 (241)	total: 11.3s	remaining: 5.25s
683:	learn: 0.1787506	test: 0.4735212	best: 0.4631718 (241)	total: 11.3s	remaining: 5.23s
684:	learn: 0.1786406	test: 0.4735697	best: 0.4631718 (241)	total: 11.3s	remaining: 5.22s
685:	learn: 0.1785785	test: 0.4732870	best: 0.4631718 (241)	total: 11.4s	remaining: 5.2s
686:	learn: 0.1782335	test: 0.4732979	best: 0.4631718 (241)	total: 11.4s	remaining: 5.18s
687:	learn: 0.1781185	test: 0.4732376	best: 0.4631718 (241)	total: 11.4s	remaining: 5.17s
688:	learn:

773:	learn: 0.1632679	test: 0.4782027	best: 0.4631718 (241)	total: 12.8s	remaining: 3.75s
774:	learn: 0.1630913	test: 0.4780279	best: 0.4631718 (241)	total: 12.9s	remaining: 3.73s
775:	learn: 0.1628013	test: 0.4780771	best: 0.4631718 (241)	total: 12.9s	remaining: 3.72s
776:	learn: 0.1626727	test: 0.4779103	best: 0.4631718 (241)	total: 12.9s	remaining: 3.7s
777:	learn: 0.1625431	test: 0.4780567	best: 0.4631718 (241)	total: 12.9s	remaining: 3.68s
778:	learn: 0.1622897	test: 0.4784278	best: 0.4631718 (241)	total: 12.9s	remaining: 3.67s
779:	learn: 0.1622850	test: 0.4784686	best: 0.4631718 (241)	total: 12.9s	remaining: 3.65s
780:	learn: 0.1622079	test: 0.4784956	best: 0.4631718 (241)	total: 13s	remaining: 3.63s
781:	learn: 0.1621747	test: 0.4783659	best: 0.4631718 (241)	total: 13s	remaining: 3.62s
782:	learn: 0.1619761	test: 0.4784768	best: 0.4631718 (241)	total: 13s	remaining: 3.6s
783:	learn: 0.1619002	test: 0.4784838	best: 0.4631718 (241)	total: 13s	remaining: 3.58s
784:	learn: 0.161883

873:	learn: 0.1490222	test: 0.4816002	best: 0.4631718 (241)	total: 14.6s	remaining: 2.11s
874:	learn: 0.1487703	test: 0.4814436	best: 0.4631718 (241)	total: 14.7s	remaining: 2.09s
875:	learn: 0.1487044	test: 0.4813801	best: 0.4631718 (241)	total: 14.7s	remaining: 2.08s
876:	learn: 0.1485780	test: 0.4813052	best: 0.4631718 (241)	total: 14.7s	remaining: 2.06s
877:	learn: 0.1485071	test: 0.4812548	best: 0.4631718 (241)	total: 14.7s	remaining: 2.04s
878:	learn: 0.1483976	test: 0.4809246	best: 0.4631718 (241)	total: 14.7s	remaining: 2.02s
879:	learn: 0.1482468	test: 0.4808857	best: 0.4631718 (241)	total: 14.7s	remaining: 2.01s
880:	learn: 0.1482425	test: 0.4808624	best: 0.4631718 (241)	total: 14.7s	remaining: 1.99s
881:	learn: 0.1482230	test: 0.4808401	best: 0.4631718 (241)	total: 14.8s	remaining: 1.98s
882:	learn: 0.1481297	test: 0.4806808	best: 0.4631718 (241)	total: 14.8s	remaining: 1.96s
883:	learn: 0.1479576	test: 0.4808694	best: 0.4631718 (241)	total: 14.8s	remaining: 1.94s
884:	learn

975:	learn: 0.1369538	test: 0.4868856	best: 0.4631718 (241)	total: 16.4s	remaining: 404ms
976:	learn: 0.1369001	test: 0.4868662	best: 0.4631718 (241)	total: 16.5s	remaining: 387ms
977:	learn: 0.1365612	test: 0.4867137	best: 0.4631718 (241)	total: 16.5s	remaining: 371ms
978:	learn: 0.1359596	test: 0.4863805	best: 0.4631718 (241)	total: 16.5s	remaining: 354ms
979:	learn: 0.1357767	test: 0.4862262	best: 0.4631718 (241)	total: 16.5s	remaining: 337ms
980:	learn: 0.1355782	test: 0.4863034	best: 0.4631718 (241)	total: 16.5s	remaining: 320ms
981:	learn: 0.1355044	test: 0.4865188	best: 0.4631718 (241)	total: 16.5s	remaining: 303ms
982:	learn: 0.1354895	test: 0.4864099	best: 0.4631718 (241)	total: 16.6s	remaining: 286ms
983:	learn: 0.1353886	test: 0.4863503	best: 0.4631718 (241)	total: 16.6s	remaining: 270ms
984:	learn: 0.1352802	test: 0.4863703	best: 0.4631718 (241)	total: 16.6s	remaining: 253ms
985:	learn: 0.1351837	test: 0.4865042	best: 0.4631718 (241)	total: 16.6s	remaining: 236ms
986:	learn

<catboost.core.CatBoostClassifier at 0x17b03361130>

학습 결과 validation set에 대한 최대 accuracy는 약 0.798로, boosting step 341번 후에 달성했습니다.

### Cross-validation

위에서 얻어진 parameter들에 대해 더욱 robust한 accuracy를 얻기 위해 cross-validation을 수행해보도록 하겠습니다. catboost에서 제공하는 cv로 쉽게 수행할 수 있습니다. k=3으로 수행되었고 loss function은 logloss로 update했습니다. 

In [76]:
# Reuse the classifier's parameters for cross-validation and set the
# objective explicitly to Logloss.
cv_params = model.get_params()
cv_params.update({
    'loss_function': 'Logloss'
})
# Cross-validate on all labelled rows (X, y); the fold count is left at
# cv()'s default.
cv_data = cv(
    Pool(X, y, cat_features=categorical_features_indices),
    cv_params,
    #plot=True
)

plot=True로 설정하면, cross-validation 결과 값의 폭을 확인할 수 있습니다.

In [77]:
# Report the highest fold-averaged test accuracy, the std recorded at that
# same step, and the step index where the maximum occurs.
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])
))

Best validation accuracy score: 0.84±0.01 on step 414


가장 높은 accuracy는 boosting step 414에서 나타났습니다.

In [78]:
# Same best fold-averaged accuracy, printed without rounding.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8383838383838383


cross-validation 결과, 처음 얻었던 accuracy=0.798보다 큰 약 0.8383의 accuracy를 얻었습니다. dataset을 어떻게 split하는가에 따라 성능지표의 값 차이가 나기 때문에 cross-validation을 통해 좀 더 robust한 성능을 얻을 필요가 있습니다. 

### Prediction
catboost의 predict 함수를 통해, classification의 prediction 값을 얻을 수 있고, predict_proba를 사용하면, 각 class에 대해 예측하고 있는 확률을 얻을 수 있습니다.

In [79]:
# Class labels and per-class probabilities for the test frame.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
# Show only the first ten rows of each.
print(predictions[:10])
print(predictions_probs[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.89890292 0.10109708]
 [0.7204943  0.2795057 ]
 [0.92087031 0.07912969]
 [0.88355545 0.11644455]
 [0.14982657 0.85017343]
 [0.88847395 0.11152605]
 [0.47331547 0.52668453]
 [0.61274044 0.38725956]
 [0.38739407 0.61260593]
 [0.87273403 0.12726597]]


## 2. CatBoost Features
위에서는 기본적인 feature들만 지정하여 사용했는데, 이제 다른 feature들은 무엇이 있는지 그 역할을 살펴보고 사용해보겠습니다. 

먼저, catboost model의 random seed를 지정해주지 않았을 때, model을 fit한 후에 사용된 random seed를 random_seed_를 통해 다시 알아낼 수 있습니다. 

In [14]:
# No random_seed is passed to the constructor; after fitting, the seed that
# was used is read back from the random_seed_ attribute.
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent')
model_without_seed.fit(X, y, cat_features=categorical_features_indices)

print('Random seed assigned for this model: {}'.format(model_without_seed.random_seed_))

Random seed assigned for this model: 0


이제 parameter들을 지정하고, 'Pool'을 만들어보도록 하겠습니다. Pool은 dataset의 feature, label, categorical features indice, weight 등에 대한 정보를 저장할 수 있습니다. 

In [41]:
# Shared constructor parameters; later cells copy this dict and override
# individual keys.
params = {
    'iterations': 500,
    'learning_rate': 0.1,
    'eval_metric': 'Accuracy',
    'random_seed': 42,
    'logging_level': 'Silent',
    'use_best_model': False
}

# Bundle features, labels and categorical column indices into Pool objects.
train_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
validate_pool = Pool(X_validation, y_validation, cat_features=categorical_features_indices)

### Best Model
use_best_model=True로 설정하면, eval_set에 대해 가장 좋은 metric 값을 보인 iteration까지의 tree만 최종 model에 남깁니다. 아래에서 use_best_model을 사용한 model과 그렇지 않은 model을 비교해보겠습니다.

In [16]:
# Reference model: params has use_best_model=False.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

# Same parameters, but with use_best_model switched on.
best_model_params = params.copy()
best_model_params.update({
    'use_best_model': True
})
best_model = CatBoostClassifier(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool);

# Compare both models on the validation split.
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

print('Best model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, best_model.predict(X_validation))
))

Simple model validation accuracy: 0.8027

Best model validation accuracy: 0.8117


use_best_model을 사용한 model이 그렇지 않은 model보다 accuracy가 약 0.01 높았습니다(0.8117 vs 0.8027).

### Early stopping
학습 시간을 단축하기 위해 od_type과 od_wait을 사용하여 early stopping을 사용할 수 있습니다. early stop을 한 경우와 그렇지 않은 경우의 성능과 수행시간을 비교해보도록 하겠습니다. 

In [17]:
%%time
# Reference run: the shared params, which set no overfitting-detector options.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

Wall time: 7.85 s


<catboost.core.CatBoostClassifier at 0x17b02976ee0>

In [18]:
%%time
earlystop_params = params.copy()

# Overfitting detector: with od_type='Iter', training stops once the eval
# metric has gone od_wait (40) iterations without improving. This is a
# patience window, not a cap of 40 iterations in total.
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 40
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

Wall time: 1.73 s


<catboost.core.CatBoostClassifier at 0x17b02d7c3a0>

In [19]:
# Tree count and validation accuracy of the model trained without early stopping.
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, model.predict(X_validation))
))
print('')

# The same two figures for the early-stopped model.
print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation accuracy: {:.4}'.format(
    accuracy_score(y_validation, earlystop_model.predict(X_validation))
))

Simple model tree count: 500
Simple model validation accuracy: 0.8027

Early-stopped model tree count: 118
Early-stopped model validation accuracy: 0.7982


simple model과 비교해 early-stopped model의 수행시간은 4배 이상 단축됐지만(7.85s → 1.73s), accuracy 감소는 0.01 미만(0.8027 → 0.7982)에 그쳤습니다. 따라서 성능에 큰 차이가 없을 정도로 학습하고, 어느 정도 후에 멈추는 것이 학습 시간 측면에서 더 효율적이라고 할 수 있습니다. 또한 overfitting을 방지하기 위해서도 적절한 iteration 후에 early stopping하도록 하는 것이 바람직합니다.

### Baseline
prediction_type = RawFormulaVal으로 설정해 training된 model을 사용해 다시 training시키는 것이 가능합니다.

In [45]:
# Train a short 10-iteration model on the training split; the third positional
# argument to fit() is the list of categorical column indices.
current_params = params.copy()
current_params.update({
    'iterations': 10
})
model = CatBoostClassifier(**current_params).fit(X_train, y_train, categorical_features_indices)


# Raw formula values of the model just trained, one per training row.
baseline = model.predict(X_train, prediction_type='RawFormulaVal')

# Refit on the same data, passing those raw values as the baseline.
model.fit(X_train, y_train, categorical_features_indices, baseline=baseline);

### Snapshot
catboost는 snapshot을 제공하는데, 이를 통해 interruption 후에 다시 학습할 수도 있고 이전 결과로부터 다시 training을 시작할 수도 있습니다. 학습 시 save_snapshot=True로 설정해주어야 합니다.

In [51]:
# First run: 10 iterations with snapshot saving enabled.
params_with_snapshot = params.copy()
params_with_snapshot.update({
    'iterations': 10,
    'learning_rate': 0.5,
    'logging_level': 'Verbose'
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)


# Second run: the iteration budget is raised to 15 and the learning rate is
# changed; the recorded log continues at iteration 10 instead of restarting at 0.
params_with_snapshot.update({
    'iterations': 15,
    'learning_rate': 0.1
})
model = CatBoostClassifier(**params_with_snapshot).fit(train_pool, eval_set=validate_pool, save_snapshot=True)


bestTest = 0.7668161435
bestIteration = 1

10:	learn: 0.8473054	test: 0.7533632	best: 0.7668161 (1)	total: 39.2ms	remaining: 24.3ms
11:	learn: 0.8458084	test: 0.7533632	best: 0.7668161 (1)	total: 44ms	remaining: 16.3ms
12:	learn: 0.8473054	test: 0.7578475	best: 0.7668161 (1)	total: 45.3ms	remaining: 8.11ms
13:	learn: 0.8473054	test: 0.7578475	best: 0.7668161 (1)	total: 46.4ms	remaining: 3.33ms
14:	learn: 0.8473054	test: 0.7578475	best: 0.7668161 (1)	total: 48.5ms	remaining: 0us

bestTest = 0.7668161435
bestIteration = 1



iteration 9번 후에 다시 10번부터 5번 반복하는 것을 확인할 수 있습니다.

### User Defined Objective Function

objective function을 사용자가 customizing하는 것이 가능합니다. logloss function을 만들어보겠습니다.

In [52]:
class LoglossObjective(object):
    """User-defined Logloss objective, passed to CatBoostClassifier as ``loss_function``."""

    def calc_ders_range(self, approxes, targets, weights):
        """Compute first and second derivatives for a range of objects.

        Args:
            approxes: indexable of raw scores (log-odds), one per object.
            targets: indexable of labels, same length as ``approxes``;
                a value > 0.0 is treated as the positive class.
            weights: indexable of per-object weights of the same length,
                or None for unweighted objects.

        Returns:
            A list of ``(der1, der2)`` tuples, one per object, where ``der1``
            is ``1 - p`` for positive targets and ``-p`` otherwise, ``der2``
            is ``-p * (1 - p)``, and ``p`` is the sigmoid of the raw score.
            Both are multiplied by the object's weight when weights are given.
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: exp() is only ever taken of a
            # non-positive number, so a large score can no longer overflow to
            # inf and turn the derivatives into NaN (inf / inf).
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [23]:
# Train with the user-defined objective; eval_metric is set to the built-in Logloss.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=LoglossObjective(), 
    eval_metric="Logloss"
)

# Fit model
model.fit(train_pool)

# With a custom `loss_function`, only prediction_type='RawFormulaVal' can be requested.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6816993	total: 12.9ms	remaining: 116ms
1:	learn: 0.6704449	total: 25.5ms	remaining: 102ms
2:	learn: 0.6611746	total: 34.5ms	remaining: 80.6ms
3:	learn: 0.6512890	total: 49.7ms	remaining: 74.5ms
4:	learn: 0.6425075	total: 60.5ms	remaining: 60.5ms
5:	learn: 0.6324326	total: 70.1ms	remaining: 46.7ms
6:	learn: 0.6230280	total: 79.8ms	remaining: 34.2ms
7:	learn: 0.6143051	total: 89.9ms	remaining: 22.5ms
8:	learn: 0.6071459	total: 101ms	remaining: 11.2ms
9:	learn: 0.5993086	total: 110ms	remaining: 0us


### User Defined Metric Function

metric 또한 사용자가 정의해줄 수 있습니다.

In [24]:
class LoglossMetric(object):
    """User-defined Logloss metric, passed to CatBoostClassifier as ``eval_metric``."""

    def get_final_error(self, error, weight):
        """Turn the accumulated (error, weight) sums into the reported value.

        The tiny constant in the denominator guards against division by zero
        when the total weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False: a smaller value of this metric is better."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate weighted log-loss over all objects.

        Args:
            approxes: sequence holding exactly one indexable of raw scores
                (log-odds), one score per object.
            target: indexable of labels, same length as ``approxes[0]``.
            weight: indexable of per-object weights, or None (weight 1.0 each).

        Returns:
            ``(error_sum, weight_sum)``: the weighted sum of per-object
            log-loss and the sum of weights.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # log(1 + exp(x)) is evaluated as logaddexp(0, x); the naive form
            # overflows to inf for large x and would make the metric inf/NaN.
            error_sum += -w * (target[i] * approx[i] - np.logaddexp(0.0, approx[i]))

        return error_sum, weight_sum

In [25]:
# Built-in Logloss objective, evaluated with the user-defined metric object.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function="Logloss",
    eval_metric=LoglossMetric()
)

model.fit(train_pool)
# Raw formula values for the test frame.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.5404506	total: 6.56ms	remaining: 59.1ms
1:	learn: 0.4718844	total: 14.1ms	remaining: 56.4ms
2:	learn: 0.4468731	total: 19.5ms	remaining: 45.4ms
3:	learn: 0.4299629	total: 23.4ms	remaining: 35.1ms
4:	learn: 0.4099561	total: 28.1ms	remaining: 28.1ms
5:	learn: 0.4007550	total: 35ms	remaining: 23.3ms
6:	learn: 0.3910768	total: 39.2ms	remaining: 16.8ms
7:	learn: 0.3796317	total: 42.7ms	remaining: 10.7ms
8:	learn: 0.3750199	total: 48.4ms	remaining: 5.38ms
9:	learn: 0.3715420	total: 53.9ms	remaining: 0us


### Staged Predict
catboost는 staged_predict method를 제공합니다. 이 method를 사용해 prediction에 사용될 tree의 범위를 지정해줄 수 있습니다. 
CatBoost model has `staged_predict` method. It allows you to iteratively get predictions for a given range of trees.

In [26]:
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
# Start, end and step values handed to staged_predict.
ntree_start, ntree_end, eval_period = 3, 9, 2
predictions_iterator = model.staged_predict(validate_pool, 'Probability', ntree_start, ntree_end, eval_period)
# Pair each yielded batch of predictions with the tree count 3, 5, 7 from the same range.
for preds, tree_count in zip(predictions_iterator, range(ntree_start, ntree_end, eval_period)):
    # Print the second probability column for the first five validation rows.
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.48257516 0.39585845 0.37611297 0.54042906 0.48257516]
First class probabilities using the first 5 trees: [0.46204618 0.3833052  0.35698827 0.61880357 0.46204618]
First class probabilities using the first 7 trees: [0.44704572 0.37983824 0.34322245 0.65681876 0.44704572]


### Feature Importances

적합된 model에서 각 feature가 target에 영향을 미치는 정도를 get_feature_importance method를 사용해 구할 수 있습니다.

In [27]:
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
# One importance score per feature, computed against the training pool.
feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
# Print features from the highest to the lowest score.
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print('{}: {}'.format(name, score))

Sex: 52.362643977926105
Pclass: 12.093354944343053
Ticket: 11.604122897539481
Age: 5.480820099799359
Parch: 4.882169523307244
SibSp: 4.179223814317337
Fare: 3.4140458603213086
Cabin: 3.024290925389097
Embarked: 2.959327957057029
PassengerId: 0.0
Name: 0.0


위의 결과를 통해 sex, pclass, ticket 순으로 target에 영향을 크게 미치고 있음을 확인할 수 있습니다.

### Eval Metrics

다음으로 eval_metrics이라는 method를 사용해 주어진 metric에 대한 값을 계산할 수 있습니다. 

In [80]:
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent').fit(train_pool)
# Compute AUC on the validation pool; the result is indexed by metric name.
eval_metrics = model.eval_metrics(validate_pool, ['AUC'], plot=False)

In [81]:
# Show the first six AUC values.
print(eval_metrics['AUC'][:6])

[0.8264604810996563, 0.830387825233186, 0.8179921453117329, 0.8131647848142693, 0.8157830142366225, 0.8129193258059237]


### Learning Processes Comparison

서로 다른 model에 대해 learning process를 하나의 plot에 비교할 수도 있습니다.

In [82]:
# Two small models with tree depth 1 and 5; each is given its own train_dir
# so the two runs can be compared afterwards.
model1 = CatBoostClassifier(iterations=10, depth=1, train_dir='model_depth_1/', logging_level='Silent')
model1.fit(train_pool, eval_set=validate_pool)
model2 = CatBoostClassifier(iterations=10, depth=5, train_dir='model_depth_5/', logging_level='Silent')
model2.fit(train_pool, eval_set=validate_pool);

In [84]:
from catboost import MetricVisualizer
#widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
#widget.start()

### Model Saving

학습을 마친 model은 save_model로 저장할 수 있습니다. load_model로 읽어와 다시 사용할 수 있습니다. 

In [32]:
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent').fit(train_pool)
# Persist the fitted model to disk, then load it back into a fresh classifier.
model.save_model('catboost_model.dump')
model = CatBoostClassifier()
model.load_model('catboost_model.dump');

# 3. Parameters Tuning

hyperopt package를 사용해서 l2_leaf_reg, learning_rate 등의 hyperparameter들을 optimization해보도록 하겠습니다. 

In [55]:
#!pip install hyperopt

다음과 같이, tuning해줄 parameter에 대해서만 params로 지정할 수 있게 설정할 수 있습니다. best parameter를 고르는 기준은 cross-validation을 통해 accuracy 기준으로 선택하도록 하겠습니다. 

In [34]:
import hyperopt

def hyperopt_objective(params):
    """Objective for hyperopt: cross-validated error of one parameter sample.

    Args:
        params: mapping with the sampled ``'l2_leaf_reg'`` (cast to int
            before use) and ``'learning_rate'`` values.

    Returns:
        ``1 - best_accuracy``, where ``best_accuracy`` is the maximum of the
        ``'test-Accuracy-mean'`` column returned by ``cv`` on the full
        labelled data ``(X, y)``.
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric='Accuracy',
        random_seed=42,
        verbose=False,
        loss_function='Logloss',
    )

    full_pool = Pool(X, y, cat_features=categorical_features_indices)
    cv_scores = cv(full_pool, candidate.get_params())

    # hyperopt minimises its objective, so hand back the error rate
    # rather than the accuracy itself.
    return 1 - np.max(cv_scores['test-Accuracy-mean'])

In [35]:
from numpy.random import RandomState

# Search space: l2_leaf_reg is exp(uniform(0, 2)) rounded to multiples of 1;
# learning_rate is uniform on [0.001, 0.5].
params_space = {
    'l2_leaf_reg': hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    'learning_rate': hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
}

# Collects the result of every evaluated trial.
trials = hyperopt.Trials()

# Run 50 evaluations of the objective with the TPE algorithm.
# NOTE(review): rstate is a legacy RandomState here; newer hyperopt releases
# appear to expect np.random.default_rng(...) instead - confirm against the
# installed version.
best = hyperopt.fmin(
    hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=RandomState(123)
)

print(best)

100%|████████████████████████████████████████████████| 50/50 [34:46<00:00, 41.73s/trial, best loss: 0.1661054994388328]
{'l2_leaf_reg': 1.0, 'learning_rate': 0.030276027601240763}


{'l2_leaf_reg': 1.0, 'learning_rate': 0.030276027601240763}가 optimal parameter로 얻어졌습니다. 이들로 다시 model을 학습해보겠습니다.

In [36]:
# Rebuild the classifier with the best values found by the search
# (l2_leaf_reg is cast to int, as in hyperopt_objective) and cross-validate it.
model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)
cv_data = cv(Pool(X, y, cat_features=categorical_features_indices), model.get_params())

In [37]:
# Best fold-averaged accuracy for the tuned parameters, printed without rounding.
print('Precise validation accuracy score: {}'.format(np.max(cv_data['test-Accuracy-mean'])))

Precise validation accuracy score: 0.8338945005611672


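hyperparameter를 tuning하기 전 accuracy는 약 0.82였는데, tuning 후 0.01 이상 증가한 것을 볼 수 있습니다. 다만 앞의 Cross-validation 절에서 learning rate를 자동 설정으로 두고 얻은 cross-validation accuracy는 약 0.838이었으므로, 어떤 설정을 비교 기준으로 삼는가에 따라 tuning 효과는 다르게 보일 수 있습니다.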

### Make submission

위에서 validation set에 의해 선택된 parameter들을 사용해 전체 train data에 대해 학습시키고, test에 대해 예측해보도록 하겠습니다.

In [60]:
# Same tuned parameters, now fitted on all labelled rows (X, y) without an eval set.
model = CatBoostClassifier(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    verbose=False,
    loss_function='Logloss',
)

model.fit(X, y, cat_features=categorical_features_indices)

<catboost.core.CatBoostClassifier at 0x17b026410a0>

In [66]:
import pandas as pd
# Build the submission frame: passenger ids from the test set plus the
# predicted Survived labels.
prediction = pd.DataFrame()
prediction['PassengerId'] = X_test['PassengerId']
prediction['Survived'] = model.predict(X_test)

In [68]:
# Display the submission frame.
prediction

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
