# Python Catboost Tutorial - Binary Classification

Adapted from the Catboost repository.

### CatBoost installation
If you have not already installed CatBoost: <br>
pip install --upgrade catboost


### Data Loading

In [1]:
from catboost import CatBoostClassifier, Pool, cv
from catboost.eval.catboost_evaluation import *

import numpy as np
import pandas as pd
from collections import Counter
from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE, SMOTENC

In [2]:
# Read the Titanic dataset directly from the course repository
titanic_csv_url = "https://raw.githubusercontent.com/iandreafc/sna-bigdata-course/master/Datasets/titanic.csv"
df = pd.read_csv(titanic_csv_url)

# Report the frame's dimensions, then preview its first rows
print("DF shape", df.shape)
df.head()


DF shape (891, 9)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### Feature Preparation
First of all, let's check how many missing values we have:

In [3]:
# Number of missing values in each column
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

As we can see, **`Age`**, **`Cabin`** and **`Embarked`** indeed have some missing values, so let's fill them with a number far outside their distributions, so that the model can easily distinguish the missing entries and take them into account:

In [4]:
# Overwrite every missing value, in place, with the -999 sentinel
df.fillna(value=-999, inplace=True)
# Confirm that no column reports missing values any more
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

Now let's separate features and label variable:

In [5]:
# Features: every column except the target
X = df.drop(columns=['Survived'])
# Label: the target column on its own
y = df['Survived']

Note that our features are of different types: some are numeric, some are categorical, and some are just strings, which normally should be handled in a specific way (for example, encoded with a bag-of-words representation).

In [6]:
# Show the dtype of every feature column
print(X.dtypes)

# Treat every non-float column as categorical and keep its positional index.
# The builtin `float` is used instead of `np.float`: that alias was deprecated
# in NumPy 1.20 and removed in NumPy 1.24 (it was only ever an alias of
# `float`, so the comparison result is unchanged).
categorical_features_indices = np.where(X.dtypes != float)[0]
categorical_features_indices

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object


array([0, 1, 3, 4, 6, 7], dtype=int64)

#### Encode Strings
Not strictly necessary in Catboost, but useful for example for SMOTE.

In [7]:
# Replace each string-valued column with its integer category codes
string_columns = ['Sex', 'Cabin', 'Embarked']
for column in string_columns:
    as_category = X[column].astype('category')
    X[column] = as_category.cat.codes
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,0,3
1,1,0,38.0,1,0,71.2833,82,1
2,3,0,26.0,0,0,7.925,0,3
3,1,0,35.0,1,0,53.1,56,3
4,3,1,35.0,0,0,8.05,0,3


### Data Splitting
Let's split the data into training and test sets.

In [8]:
# Keep 75% of the rows for training and hold out the rest as the test set;
# the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=14,
)

### SMOTE
Check if target var has balanced classes and use SMOTE if needed (**only on Train Set**).

In [9]:
# Tally how many training rows belong to each class
Counter(np.asarray(y_train).ravel())

Counter({0: 410, 1: 258})

In [10]:
# SMOTENC is used (rather than plain SMOTE) because some features are categorical
sm = SMOTENC(
    categorical_features=categorical_features_indices,
    random_state=14,
    n_jobs=-1,
)

# Remember the column labels so the resampled data can be turned back into pandas objects
xcol, ycol = list(X_train.columns), y_train.name

# Oversample the training set only, then rebuild DataFrames with the saved labels
X_resampled, y_resampled = sm.fit_resample(X_train, np.array(y_train).ravel())
X_train = pd.DataFrame(X_resampled, columns=xcol)
y_train = pd.DataFrame(y_resampled, columns=[ycol])

# Class counts after resampling
Counter(np.array(y_train).ravel())

Counter({0: 410, 1: 410})

### Parameters Tuning

In [11]:
# Candidate values for each hyper-parameter explored by the grid search
grid = {
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

# Size of the Cartesian product of all candidate lists
n_combinations = sum(1 for _ in product(*grid.values()))
print("# Combinations:", n_combinations)

# Combinations: 60


In [12]:
# Untuned classifier that drives the search
# (a custom loss could also be passed here, e.g. custom_loss=["Accuracy"])
model = CatBoostClassifier()

# Wrap the training data in a Pool so the categorical columns are declared with it
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=categorical_features_indices,
)

# Grid search with 3-fold cross-validation (3 folds is also the default)
grid_search_result = model.grid_search(grid, train_pool, cv=3)

# Keep the winning parameter combination and display it
bestparam = grid_search_result["params"]
bestparam

0:	loss: 0.3488224	best: 0.3488224 (0)	total: 14.2s	remaining: 14m
1:	loss: 0.3278566	best: 0.3278566 (1)	total: 32.2s	remaining: 15m 33s
2:	loss: 0.3252437	best: 0.3252437 (2)	total: 51.6s	remaining: 16m 20s
3:	loss: 0.3237958	best: 0.3237958 (3)	total: 1m 9s	remaining: 16m 11s
4:	loss: 0.3493395	best: 0.3237958 (3)	total: 1m 22s	remaining: 15m 10s
5:	loss: 0.3334053	best: 0.3237958 (3)	total: 1m 37s	remaining: 14m 37s
6:	loss: 0.3294918	best: 0.3237958 (3)	total: 1m 48s	remaining: 13m 44s
7:	loss: 0.3326646	best: 0.3237958 (3)	total: 1m 58s	remaining: 12m 48s
8:	loss: 0.3545886	best: 0.3237958 (3)	total: 2m 4s	remaining: 11m 47s
9:	loss: 0.3324901	best: 0.3237958 (3)	total: 2m 13s	remaining: 11m 6s
10:	loss: 0.3297630	best: 0.3237958 (3)	total: 2m 22s	remaining: 10m 34s
11:	loss: 0.3234005	best: 0.3234005 (11)	total: 2m 32s	remaining: 10m 8s
12:	loss: 0.3533436	best: 0.3234005 (11)	total: 2m 39s	remaining: 9m 35s
13:	loss: 0.3365020	best: 0.3234005 (11)	total: 2m 47s	remaining: 9m 9s

{'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.03}

In [13]:
# Fresh classifier that will receive the tuned parameters
model = CatBoostClassifier()

# Add Kappa both as a custom loss and as the evaluation metric
# (the evaluation metric can be changed depending on the objective)
bestparam.update(custom_loss="Kappa", eval_metric="Kappa")

# Apply the parameters and show what the model now holds
model.set_params(**bestparam)
print(model.get_params())

{'depth': 6, 'l2_leaf_reg': 1, 'learning_rate': 0.03, 'custom_loss': 'Kappa', 'eval_metric': 'Kappa'}


### Model Training
Retain the best model and use early stopping to avoid overfitting.
**In real cases, we need an external test set that is not used for training or validation (early stopping). That dataset is the one to use to evaluate the final model.**

In [14]:
# Further split the training data into a final training set and a validation set
# NOTE(review): when the cells run in order, X_train / y_train are the
# SMOTENC-resampled data, so the validation split may contain synthetic rows -
# confirm this is intended.
X_train_final, X_validation, y_train_final, y_validation = train_test_split(
    X_train,
    y_train,
    train_size=0.75,
    random_state=14,
)

# Shapes of the full training set, the final training set and the validation set
print(X_train.shape, X_train_final.shape, X_validation.shape)

(820, 8) (615, 8) (205, 8)


Use early stopping rounds and a validation set to stop after K iterations with no improvement of the evaluation metric.

In [15]:
# Fit on the final training split while monitoring the validation split;
# early stopping is set to 80 rounds and the best iteration is kept
model.fit(
    X_train_final,
    y_train_final,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    early_stopping_rounds=80,
    use_best_model=True,
    logging_level="Verbose",
    plot=False,
)


0:	learn: 0.5751363	test: 0.4924293	best: 0.4924293 (0)	total: 23.4ms	remaining: 23.4s
1:	learn: 0.5627479	test: 0.5219124	best: 0.5219124 (1)	total: 38.1ms	remaining: 19s
2:	learn: 0.5881125	test: 0.5427657	best: 0.5427657 (2)	total: 68.5ms	remaining: 22.8s
3:	learn: 0.5914914	test: 0.5427657	best: 0.5427657 (2)	total: 104ms	remaining: 25.8s
4:	learn: 0.5979756	test: 0.5342486	best: 0.5427657 (2)	total: 119ms	remaining: 23.7s
5:	learn: 0.5663225	test: 0.5228860	best: 0.5427657 (2)	total: 121ms	remaining: 20.1s
6:	learn: 0.5565643	test: 0.5124496	best: 0.5427657 (2)	total: 123ms	remaining: 17.4s
7:	learn: 0.5630378	test: 0.5040163	best: 0.5427657 (2)	total: 129ms	remaining: 16s
8:	learn: 0.5599470	test: 0.5219124	best: 0.5427657 (2)	total: 148ms	remaining: 16.3s
9:	learn: 0.5598497	test: 0.5219124	best: 0.5427657 (2)	total: 156ms	remaining: 15.5s
10:	learn: 0.5850071	test: 0.5228860	best: 0.5427657 (2)	total: 175ms	remaining: 15.7s
11:	learn: 0.5882037	test: 0.5228860	best: 0.5427657 (

95:	learn: 0.7335552	test: 0.5738870	best: 0.5928796 (42)	total: 1.62s	remaining: 15.3s
96:	learn: 0.7335552	test: 0.5738870	best: 0.5928796 (42)	total: 1.64s	remaining: 15.3s
97:	learn: 0.7400539	test: 0.5738870	best: 0.5928796 (42)	total: 1.67s	remaining: 15.4s
98:	learn: 0.7400539	test: 0.5842187	best: 0.5928796 (42)	total: 1.68s	remaining: 15.3s
99:	learn: 0.7400539	test: 0.5842187	best: 0.5928796 (42)	total: 1.71s	remaining: 15.4s
100:	learn: 0.7400539	test: 0.5842187	best: 0.5928796 (42)	total: 1.72s	remaining: 15.3s
101:	learn: 0.7400539	test: 0.5842187	best: 0.5928796 (42)	total: 1.73s	remaining: 15.2s
102:	learn: 0.7400539	test: 0.5842187	best: 0.5928796 (42)	total: 1.76s	remaining: 15.3s
103:	learn: 0.7433317	test: 0.5738870	best: 0.5928796 (42)	total: 1.76s	remaining: 15.2s
104:	learn: 0.7400539	test: 0.5738870	best: 0.5928796 (42)	total: 1.77s	remaining: 15.1s
105:	learn: 0.7400539	test: 0.5738870	best: 0.5928796 (42)	total: 1.78s	remaining: 15s
106:	learn: 0.7433317	test: 

<catboost.core.CatBoostClassifier at 0x24becd06808>

With this we can see that the best **Kappa** value of **0.5929** (on the **validation set**) was achieved at step **42**, with no further improvement in the following **80** iterations (so the training stopped). We now retain this model as the **best model**.

### Model Predictions and Fit

In [16]:
# Score the original (held-out) test set with the fitted model
predictions = model.predict(X_test)
truevalues = np.array(y_test)

# Confusion matrix
test_confusion = confusion_matrix(truevalues, predictions)
print(test_confusion)

# Classification report (per-class precision, recall and F1)
test_report = classification_report(truevalues, predictions)
print(test_report)

# Accuracy and Cohen's kappa, printed with four decimals
test_accuracy = accuracy_score(truevalues, predictions)
test_kappa = cohen_kappa_score(truevalues, predictions)
print("ACCURACY:", '%.4f' % test_accuracy)
print("COHEN'S KAPPA:", '%.4f' % test_kappa)

[[117  22]
 [ 19  65]]
              precision    recall  f1-score   support

           0       0.86      0.84      0.85       139
           1       0.75      0.77      0.76        84

    accuracy                           0.82       223
   macro avg       0.80      0.81      0.81       223
weighted avg       0.82      0.82      0.82       223

ACCURACY: 0.8161
COHEN'S KAPPA: 0.6112


### Monte Carlo Cross-Validation
Now repeat the process many times (e.g. 1,000) and report the average fit statistics together with their standard deviation. For demonstration purposes, the cell below uses only 10 repetitions.

In [17]:
# Per-repetition accuracy (a) and Cohen's kappa (k) measured on the test split
a, k = [], []

# Ten repetitions are used here for demonstration purposes
# NOTE(review): unlike the single run above, no SMOTENC resampling is applied
# inside this loop - confirm this is intended.
for repetition in range(10):
    # Unseeded splits: a different train / validation / test partition each time
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)
    X_train_final, X_validation, y_train_final, y_validation = train_test_split(
        X_train, y_train, train_size=0.75
    )

    # Refit the same model object with early stopping on the validation split
    model.fit(
        X_train_final,
        y_train_final,
        cat_features=categorical_features_indices,
        eval_set=(X_validation, y_validation),
        early_stopping_rounds=80,
        use_best_model=True,
        logging_level="Silent",
    )

    # Evaluate on this repetition's test split
    predictions = model.predict(X_test)
    truevalues = np.array(y_test)

    a.append(accuracy_score(truevalues, predictions))
    k.append(cohen_kappa_score(truevalues, predictions))

In [19]:
# Raw scores collected at every repetition
print("Accuracy at each cross-validation step\n", a, "\n")
print("Kappa at each cross-validation step\n", k, "\n")

# Mean (M) and standard deviation (SD) across repetitions
accuracy_mean, accuracy_sd = np.mean(a), np.std(a)
kappa_mean, kappa_sd = np.mean(k), np.std(k)
print("Accuracy M", '%.4f' % accuracy_mean, "SD", '%.4f' % accuracy_sd, "\n")
print("Kappa M", '%.4f' % kappa_mean, "SD", '%.4f' % kappa_sd)

Accuracy at each cross-validation step
 [0.7892376681614349, 0.7982062780269058, 0.7847533632286996, 0.820627802690583, 0.7757847533632287, 0.7802690582959642, 0.7802690582959642, 0.8385650224215246, 0.7982062780269058, 0.7937219730941704] 

Kappa at each cross-validation step
 [0.5537151373216946, 0.5527875573777797, 0.558197127290738, 0.6017857142857144, 0.47613230595752676, 0.5269492185808909, 0.5269492185808909, 0.6479256205596, 0.563676681594852, 0.5705793703951775] 

Accuracy M 0.7960 SD 0.0187 

Kappa M 0.5579 SD 0.0434
