# Python Catboost Tutorial - Binary Classification

Adapted from the Catboost repository.

### CatBoost installation
If you have not already installed CatBoost: <br>
pip install --upgrade catboost


### Data Loading

In [1]:
from catboost import CatBoostClassifier, Pool, cv
from catboost.eval.catboost_evaluation import *

import numpy as np
import pandas as pd
from collections import Counter
from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report

from imblearn.over_sampling import SMOTE, SMOTENC

In [2]:
# Read the Titanic CSV from the course repository into a DataFrame
DATA_URL = "https://raw.githubusercontent.com/iandreafc/sna-bigdata-course/master/Datasets/titanic.csv"
df = pd.read_csv(DATA_URL)

# Report the dimensions, then preview the first rows
print("DF shape", df.shape)
df.head()


DF shape (891, 9)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### Feature Preparation
First of all, let's check how many missing values we have:

In [3]:
# Number of missing entries in each column
df.isna().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

As we can see, **`Age`**, **`Cabin`** and **`Embarked`** indeed have some missing values, so let's fill them with a number far outside their distributions, so that the model can easily distinguish the missing entries and take them into account:

In [4]:
# Overwrite every missing entry (in place) with the sentinel value -999
df.fillna(value=-999, inplace=True)

# Re-count the missing entries to confirm none are left
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

Now let's separate features and label variable:

In [5]:
# Features: every column except the target
X = df.drop(columns='Survived')
# Label: the target column on its own
y = df['Survived']

Note that our features are of different types - some of them are numeric, some are categorical, and some are even just strings, which normally should be handled in some specific way (for example, encoded with a bag-of-words representation).

In [6]:
# Show the dtype of every feature column
print(X.dtypes)

# Treat every column whose dtype is not float as categorical and keep its
# positional index. The builtin `float` is used instead of `np.float`: that
# alias was deprecated in NumPy 1.20 and removed in NumPy 1.24, where accessing
# it raises AttributeError. `np.float` was only an alias of the builtin, so the
# comparison itself is unchanged.
categorical_features_indices = np.where(X.dtypes != float)[0]
categorical_features_indices

Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Fare        float64
Cabin        object
Embarked     object
dtype: object


array([0, 1, 3, 4, 6, 7], dtype=int64)

#### Encode Strings
Not strictly necessary in Catboost, but useful for example for SMOTE.

In [7]:
# Replace each string-valued column by the integer codes of its categories
string_columns = ['Sex', 'Cabin', 'Embarked']
for column in string_columns:
    as_category = X[column].astype('category')
    X[column] = as_category.cat.codes

# Preview the encoded features
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,1,22.0,1,0,7.25,0,3
1,1,0,38.0,1,0,71.2833,82,1
2,3,0,26.0,0,0,7.925,0,3
3,1,0,35.0,1,0,53.1,56,3
4,3,1,35.0,0,0,8.05,0,3


### Data Splitting
Let's split the data into training and test sets.

In [8]:
# 75% of the rows go to training, the rest to the test set; the fixed seed
# keeps the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=14,
)

### SMOTE
Check if target var has balanced classes and use SMOTE if needed (**only on Train Set**).

In [9]:
# Count how many training rows belong to each class
Counter(np.ravel(y_train))

Counter({0: 410, 1: 258})

In [10]:
# Apply SMOTENC rather than plain SMOTE, since some features are categorical.
# `n_jobs` is deliberately not passed: imbalanced-learn deprecated that argument
# in release 0.10 and scheduled it for removal, so passing it warns or fails on
# current releases. It only controlled parallelism, so the resampled data is
# unaffected.
sm = SMOTENC(categorical_features=categorical_features_indices, random_state=14)

# Save column names so the resampled data can be labelled again
xcol = list(X_train.columns)
ycol = y_train.name

# Oversample the training set only, then convert the result back to pandas
X_train, y_train = sm.fit_resample(X_train, np.array(y_train).ravel())
X_train = pd.DataFrame(X_train, columns=xcol)
y_train = pd.DataFrame(y_train, columns=[ycol])

# Check the new class balance
Counter(np.array(y_train).ravel())

Counter({0: 410, 1: 410})

### Parameters Tuning

In [11]:
# Candidate values for each hyper-parameter explored by the grid search
grid = {
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

# Size of the Cartesian product of all candidate lists
print("# Combinations:", sum(1 for _ in product(*grid.values())))

# Combinations: 60


In [12]:
# Start from a classifier with default settings
# (a custom loss could also be supplied here, e.g. custom_loss = ["Accuracy"])
model = CatBoostClassifier()

# Search the parameter grid on the training data; cv=3 is also the default
# number of cross-validation folds
grid_search_result = model.grid_search(
    grid,
    X=X_train,
    y=y_train,
    cv=3,
)

# Keep the winning parameter combination and display it
bestparam = grid_search_result["params"]
bestparam

0:	loss: 0.3304858	best: 0.3304858 (0)	total: 1.17s	remaining: 1m 8s
1:	loss: 0.3226414	best: 0.3226414 (1)	total: 2.13s	remaining: 1m 1s
2:	loss: 0.3296104	best: 0.3226414 (1)	total: 3.07s	remaining: 58.3s
3:	loss: 0.3331585	best: 0.3226414 (1)	total: 4.06s	remaining: 56.8s
4:	loss: 0.3356785	best: 0.3226414 (1)	total: 5.05s	remaining: 55.6s
5:	loss: 0.3302601	best: 0.3226414 (1)	total: 5.93s	remaining: 53.4s
6:	loss: 0.3311892	best: 0.3226414 (1)	total: 6.93s	remaining: 52.5s
7:	loss: 0.3325525	best: 0.3226414 (1)	total: 7.85s	remaining: 51s
8:	loss: 0.3403906	best: 0.3226414 (1)	total: 8.7s	remaining: 49.3s
9:	loss: 0.3278574	best: 0.3226414 (1)	total: 9.54s	remaining: 47.7s
10:	loss: 0.3280204	best: 0.3226414 (1)	total: 10.3s	remaining: 46.1s
11:	loss: 0.3287230	best: 0.3226414 (1)	total: 11.3s	remaining: 45.3s
12:	loss: 0.3419757	best: 0.3226414 (1)	total: 12.1s	remaining: 43.8s
13:	loss: 0.3321732	best: 0.3226414 (1)	total: 13s	remaining: 42.8s
14:	loss: 0.3231395	best: 0.3226414

{'depth': 10, 'l2_leaf_reg': 1, 'learning_rate': 0.1}

In [13]:
# Fresh classifier that will receive the tuned parameters
model = CatBoostClassifier()

# Add Kappa both as a custom loss and as the evaluation metric
# (choose whichever metric fits your objective)
bestparam.update({"custom_loss": "Kappa", "eval_metric": "Kappa"})

# Apply the parameters and show what the model now holds
model.set_params(**bestparam)
print(model.get_params())

{'depth': 10, 'l2_leaf_reg': 1, 'learning_rate': 0.1, 'custom_loss': 'Kappa', 'eval_metric': 'Kappa'}


### Model Training
We retain the best model and use early stopping to avoid overfitting.
**In real cases, we need an external test set that is not used for training or validation (early stopping). That dataset is the one to use to evaluate the final model.**

In [14]:
# Further split the training set into a final training set and a validation set
X_train_final, X_validation, y_train_final, y_validation = train_test_split(
    X_train,
    y_train,
    train_size=0.75,
    random_state=14,
)

# Shapes of the full training set and of its two parts
print(X_train.shape, X_train_final.shape, X_validation.shape)

(820, 8) (615, 8) (205, 8)


Use early stopping rounds and a validation set to stop training after K iterations with no improvement of the evaluation metric.

In [15]:
# Fit on the final training set, evaluating on the validation set after each
# iteration; stop once 80 iterations pass without improvement and keep the
# best iteration
model.fit(
    X_train_final,
    y_train_final,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    early_stopping_rounds=80,
    use_best_model=True,
    logging_level="Verbose",
    plot=False,
)


0:	learn: 0.6481358	test: 0.6015936	best: 0.6015936 (0)	total: 36.6ms	remaining: 36.5s
1:	learn: 0.7237766	test: 0.5114537	best: 0.6015936 (0)	total: 83ms	remaining: 41.4s
2:	learn: 0.6949018	test: 0.5427657	best: 0.6015936 (0)	total: 88.4ms	remaining: 29.4s
3:	learn: 0.6882722	test: 0.5712757	best: 0.6015936 (0)	total: 93.9ms	remaining: 23.4s
4:	learn: 0.7011946	test: 0.5617530	best: 0.6015936 (0)	total: 129ms	remaining: 25.6s
5:	learn: 0.7499960	test: 0.5825252	best: 0.6015936 (0)	total: 172ms	remaining: 28.5s
6:	learn: 0.7335552	test: 0.5721497	best: 0.6015936 (0)	total: 183ms	remaining: 26s
7:	learn: 0.7467212	test: 0.5721497	best: 0.6015936 (0)	total: 195ms	remaining: 24.2s
8:	learn: 0.7334961	test: 0.5816733	best: 0.6015936 (0)	total: 200ms	remaining: 22.1s
9:	learn: 0.7431037	test: 0.5635343	best: 0.6015936 (0)	total: 224ms	remaining: 22.2s
10:	learn: 0.7627729	test: 0.5928796	best: 0.6015936 (0)	total: 254ms	remaining: 22.8s
11:	learn: 0.7561653	test: 0.5928796	best: 0.6015936 

97:	learn: 0.9739707	test: 0.5617530	best: 0.6245783 (25)	total: 2.95s	remaining: 27.1s
98:	learn: 0.9739707	test: 0.5522497	best: 0.6245783 (25)	total: 2.99s	remaining: 27.2s
99:	learn: 0.9739707	test: 0.5418327	best: 0.6245783 (25)	total: 3.04s	remaining: 27.3s
100:	learn: 0.9739707	test: 0.5418327	best: 0.6245783 (25)	total: 3.08s	remaining: 27.5s
101:	learn: 0.9739707	test: 0.5418327	best: 0.6245783 (25)	total: 3.11s	remaining: 27.4s
102:	learn: 0.9739707	test: 0.5617530	best: 0.6245783 (25)	total: 3.15s	remaining: 27.5s
103:	learn: 0.9739707	test: 0.5513351	best: 0.6245783 (25)	total: 3.19s	remaining: 27.5s
104:	learn: 0.9739707	test: 0.5513351	best: 0.6245783 (25)	total: 3.23s	remaining: 27.5s
105:	learn: 0.9772269	test: 0.5617530	best: 0.6245783 (25)	total: 3.27s	remaining: 27.6s
Stopped by overfitting detector  (80 iterations wait)

bestTest = 0.6245783133
bestIteration = 25

Shrink model to first 26 iterations.


<catboost.core.CatBoostClassifier at 0x24538a1da08>

With this we can see that the best **Kappa** value of **0.6246** (on the validation set) was achieved at step **25**, with no further improvement after **80** iterations (so the training stopped). We now retain this model as the **best model**.

### Model Predictions and Fit

In [16]:
# Predict on the original test set and keep the true labels as an array
predictions = model.predict(X_test)
truevalues = np.array(y_test)

# Confusion matrix
cm = confusion_matrix(truevalues, predictions)
print(cm)

# Classification report
report = classification_report(truevalues, predictions)
print(report)

# Accuracy and Cohen's kappa, both shown with four decimals
accuracy = accuracy_score(truevalues, predictions)
kappa = cohen_kappa_score(truevalues, predictions)
print("ACCURACY:", f"{accuracy:.4f}")
print("COHEN'S KAPPA:", f"{kappa:.4f}")

[[116  23]
 [ 19  65]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       139
           1       0.74      0.77      0.76        84

    accuracy                           0.81       223
   macro avg       0.80      0.80      0.80       223
weighted avg       0.81      0.81      0.81       223

ACCURACY: 0.8117
COHEN'S KAPPA: 0.6027


### Monte Carlo Cross-Validation
Now repeat the process many times (ideally about 1,000; only 10 are run below for demonstration) and report the average fit statistics together with their standard deviation.

In [17]:
# Monte Carlo cross-validation: repeat the split -> fit -> evaluate cycle and
# keep the test-set scores of every repetition.

# Number of repetitions; for demonstration purposes only 10 are run here
N_REPEATS = 10

# A single seeded generator drives every split: each repetition still gets a
# different random partition, but re-running the notebook from a fresh kernel
# reproduces the same sequence of partitions (the original splits were unseeded
# and therefore not reproducible).
split_rng = np.random.RandomState(14)

# Accuracy and kappa scores, one entry per repetition
a, k = [], []

# NOTE(review): unlike the single run earlier in the notebook, no SMOTE
# oversampling is applied to the training data inside this loop - confirm
# that this is intended.
for i in range(N_REPEATS):
    # Split into train, validation and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=0.75, random_state=split_rng)
    X_train_final, X_validation, y_train_final, y_validation = train_test_split(
        X_train, y_train, train_size=0.75, random_state=split_rng)

    # Refit the tuned model with early stopping on the validation set
    model.fit(X_train_final, y_train_final, cat_features=categorical_features_indices,
              eval_set=(X_validation, y_validation), early_stopping_rounds=80,
              use_best_model=True, logging_level="Silent")

    # Score this repetition on its own test set
    predictions = model.predict(X_test)
    truevalues = np.array(y_test)

    a.append(accuracy_score(truevalues, predictions))
    k.append(cohen_kappa_score(truevalues, predictions))

In [18]:
# Raw scores of every repetition
print("Accuracy at each cross-validation step\n", a, "\n")
print("Kappa at each cross-validation step\n", k, "\n")

# Mean (M) and standard deviation (SD) across repetitions
print("Accuracy M", f"{np.mean(a):.4f}", "SD", f"{np.std(a):.4f}", "\n")
print("Kappa M", np.mean(k), "SD", f"{np.std(k):.4f}")

Accuracy at each cross-validation step
 [0.7713004484304933, 0.8340807174887892, 0.8251121076233184, 0.7757847533632287, 0.7892376681614349, 0.8161434977578476, 0.8385650224215246, 0.8251121076233184, 0.7847533632286996, 0.7713004484304933] 

Kappa at each cross-validation step
 [0.5172955307499683, 0.6471066250374236, 0.5923981815625439, 0.5071169657855186, 0.5254674695522252, 0.6046270270270271, 0.6458756065284517, 0.6124158830607425, 0.5395336832143165, 0.5133712720893414] 

Accuracy M 0.8031 SD 0.0258 

Kappa M 0.5705208244607559 SD 0.0530
