# Python Catboost Tutorial - Regression

Adapted from the Catboost repository.

### CatBoost installation
If you have not already installed CatBoost: <br>
pip install --upgrade catboost


### Data Loading

In [1]:
from catboost import CatBoostRegressor, Pool, cv
from catboost.eval.catboost_evaluation import *

import numpy as np
import pandas as pd
from collections import Counter
from itertools import product

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

from imblearn.over_sampling import SMOTE, SMOTENC

In [2]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), expressed in percent.

    Both arguments are converted to NumPy arrays. Every absolute error is
    divided by the matching true value, so a zero in ``y_true`` leads to an
    ``inf`` (or ``nan``) result under NumPy's default error settings.
    """
    actual = np.array(y_true)
    predicted = np.array(y_pred)
    relative_errors = np.abs((actual - predicted) / actual)
    return np.mean(relative_errors) * 100

In [4]:
#Read the Titanic dataset straight from the course repository (needs network access)
DATA_URL = "https://raw.githubusercontent.com/iandreafc/sna-bigdata-course/master/Datasets/titanic.csv"
df = pd.read_csv(DATA_URL)

#Report the dimensions, then preview the first rows as the cell output
print("DF shape", df.shape)
df.head()


DF shape (891, 9)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S
1,1,1,female,38.0,1,0,71.2833,C85,C
2,1,3,female,26.0,0,0,7.925,,S
3,1,1,female,35.0,1,0,53.1,C123,S
4,0,3,male,35.0,0,0,8.05,,S


### Feature Preparation
First of all, let's check how many missing values we have:

In [5]:
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Cabin       687
Embarked      2
dtype: int64

As we can see, **`Age`**, **`Cabin`** and **`Embarked`** indeed have some missing values, so let's fill them with a number far outside their distributions, so that the model can easily tell the missing entries apart and take them into account:

In [6]:
#Replace every missing value with the sentinel -999, which lies far outside the
#range of the numeric columns. The same numeric sentinel is also written into
#the string columns that have gaps (Cabin, Embarked).
df = df.fillna(-999)

#Confirm that no missing values remain
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Cabin       0
Embarked    0
dtype: int64

Now let's separate features and label variable, **to test regression we try to predict the ticket price**:

In [7]:
#Target: the ticket price. Features: every remaining column.
y = df['Fare']
X = df.drop(columns='Fare')

Pay attention that our features are of different types - some of them are numeric, some are categorical, and some are even just strings, which normally should be handled in some specific way (for example encoded with bag-of-words representation). But in our case we could treat these string features just as categorical ones - all the heavy lifting is done inside CatBoost. How cool is that? :)

In [8]:
print(X.dtypes)

#Every column that is not float64 (integers and strings) is treated as categorical.
#The comparison uses the builtin float: np.float was only an alias for it, was
#deprecated in NumPy 1.20 and removed in NumPy 1.24, so the old spelling raises
#AttributeError on current NumPy. The selected positions are unchanged.
categorical_features_indices = np.where(X.dtypes != float)[0]
categorical_features_indices

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
Cabin        object
Embarked     object
dtype: object


array([0, 1, 2, 4, 5, 6, 7], dtype=int64)

#### Encode Strings
Not strictly necessary in Catboost, but useful for example for SMOTE.

In [9]:
#Replace each string column with the integer codes of its categories
string_columns = ['Sex', 'Cabin', 'Embarked']
encoded = {name: X[name].astype('category').cat.codes for name in string_columns}
X = X.assign(**encoded)
X.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Cabin,Embarked
0,0,3,1,22.0,1,0,0,3
1,1,1,0,38.0,1,0,82,1
2,1,3,0,26.0,0,0,0,3
3,1,1,0,35.0,1,0,56,3
4,0,3,1,35.0,0,0,0,3


### Data Splitting
Let's split the data into training and test sets (a validation set is carved out of the training set later, before the final fit).

In [10]:
#Keep 75% of the rows for training and hold out the rest as the test set;
#the fixed random_state makes the partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=14,
)

### Parameters Tuning

In [11]:
#Hyper-parameter search space: every combination of these values is a candidate
grid = {
    'learning_rate': [0.01, 0.03, 0.1, 0.2],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}

#Size of the Cartesian product of all candidate values
n_combinations = sum(1 for _ in product(*grid.values()))
print("# Combinations:", n_combinations)

# Combinations: 60


In [12]:
#Untrained regressor with default settings; grid_search tunes it over `grid`
model = CatBoostRegressor()

#Cross-validated search on the training split
#(3 folds is also the default, spelled out here for clarity)
#NOTE(review): no cat_features are passed to the search, whereas the final
#model.fit call passes categorical_features_indices - confirm that tuning on
#the encoded columns as plain numbers is intended.
grid_search_result = model.grid_search(
    grid,
    X=X_train,
    y=y_train,
    cv=3,
)

#Keep the winning combination and show it as the cell output
bestparam = grid_search_result["params"]
bestparam

0:	loss: 56.0010108	best: 56.0010108 (0)	total: 659ms	remaining: 38.9s
1:	loss: 55.0748455	best: 55.0748455 (1)	total: 1.22s	remaining: 35.3s
2:	loss: 54.7943036	best: 54.7943036 (2)	total: 1.75s	remaining: 33.3s
3:	loss: 54.5304768	best: 54.5304768 (3)	total: 2.32s	remaining: 32.5s
4:	loss: 56.5582544	best: 54.5304768 (3)	total: 3.06s	remaining: 33.7s
5:	loss: 56.1316956	best: 54.5304768 (3)	total: 3.67s	remaining: 33s
6:	loss: 55.2493711	best: 54.5304768 (3)	total: 4.35s	remaining: 32.9s
7:	loss: 55.2764053	best: 54.5304768 (3)	total: 4.87s	remaining: 31.7s
8:	loss: 56.9629922	best: 54.5304768 (3)	total: 5.43s	remaining: 30.8s
9:	loss: 56.5150976	best: 54.5304768 (3)	total: 5.96s	remaining: 29.8s
10:	loss: 55.7352622	best: 54.5304768 (3)	total: 6.5s	remaining: 29s
11:	loss: 55.8818405	best: 54.5304768 (3)	total: 7.03s	remaining: 28.1s
12:	loss: 56.6578179	best: 54.5304768 (3)	total: 7.65s	remaining: 27.7s
13:	loss: 56.7010579	best: 54.5304768 (3)	total: 8.3s	remaining: 27.3s
14:	loss

{'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.2}

In [13]:
#Start again from an untrained regressor
model = CatBoostRegressor()

#The evaluation metric can be changed to suit the objective; RMSE is used here
bestparam.update(eval_metric="RMSE")

#Apply the tuned hyper-parameters and show the resulting configuration
model.set_params(**bestparam)
print(model.get_params())

{'loss_function': 'RMSE', 'depth': 4, 'l2_leaf_reg': 1, 'learning_rate': 0.2, 'eval_metric': 'RMSE'}


### Model Training
We retain the best model and use early stopping to avoid overfitting.
**In real cases, we need an external test set, not used for training or validation (early stopping). That dataset is the one to be used to evaluate the final model.**

In [14]:
#Further split the training set: 75% for the final fit, 25% as the validation
#set that drives early stopping
X_train_final, X_validation, y_train_final, y_validation = train_test_split(
    X_train,
    y_train,
    train_size=0.75,
    random_state=14,
)

print(X_train.shape, X_train_final.shape, X_validation.shape)

(668, 8) (501, 8) (167, 8)


Use early stopping rounds and a validation set, to stop after K iterations with no improvement of the evaluation metric.

In [15]:
#Fit on the final training split while monitoring the validation split:
#training stops once the evaluation metric has not improved for 80 rounds,
#and use_best_model=True asks CatBoost to retain the best iteration
model.fit(
    X_train_final,
    y_train_final,
    cat_features=categorical_features_indices,
    eval_set=(X_validation, y_validation),
    early_stopping_rounds=80,
    use_best_model=True,
    logging_level="Verbose",
)

0:	learn: 50.7355258	test: 32.9638675	best: 32.9638675 (0)	total: 11.6ms	remaining: 11.6s
1:	learn: 47.9541212	test: 29.9682461	best: 29.9682461 (1)	total: 21.4ms	remaining: 10.7s
2:	learn: 45.4620514	test: 28.0218200	best: 28.0218200 (2)	total: 27.3ms	remaining: 9.06s
3:	learn: 43.8494195	test: 27.1830108	best: 27.1830108 (3)	total: 31.2ms	remaining: 7.78s
4:	learn: 42.8640474	test: 26.5177987	best: 26.5177987 (4)	total: 38.5ms	remaining: 7.67s
5:	learn: 41.6523581	test: 26.0724848	best: 26.0724848 (5)	total: 40.1ms	remaining: 6.64s
6:	learn: 41.0727002	test: 25.7819060	best: 25.7819060 (6)	total: 43.8ms	remaining: 6.22s
7:	learn: 40.6188271	test: 25.5462840	best: 25.5462840 (7)	total: 50.5ms	remaining: 6.27s
8:	learn: 39.7695883	test: 25.7766185	best: 25.5462840 (7)	total: 56ms	remaining: 6.17s
9:	learn: 39.3930822	test: 26.0255521	best: 25.5462840 (7)	total: 62.5ms	remaining: 6.19s
10:	learn: 39.2830589	test: 25.9968761	best: 25.5462840 (7)	total: 65.8ms	remaining: 5.92s
11:	learn: 

<catboost.core.CatBoostRegressor at 0x2c489bb55c8>

With this we can see that the best **RMSE** value of **25.55** (on the validation set) was achieved at step **7**, with no further improvement over the following **80** iterations (so the training stopped). We now retain this model as the **best model**.

### Model Predictions and Fit

In [16]:
#Score the fitted model on the original test set
predictions = model.predict(X_test)
truevalues = np.array(y_test)

#MAE and RMSE are always defined; MAPE is only finite when no true value is 0
test_mae = mean_absolute_error(truevalues, predictions)
test_mape = mean_absolute_percentage_error(truevalues, predictions)
test_rmse = np.sqrt(mean_squared_error(truevalues, predictions))
print("MAE:", '%.4f' % test_mae)
print("MAPE:", '%.4f' % test_mape)
print("RMSE:", '%.4f' % test_rmse)

#Demonstration only: swap zero targets for 1 so that MAPE can be computed.
#truevalues is a copy made by np.array, so y_test itself is left untouched.
truevalues[truevalues == 0] = 1
mape_replaced_zeros = mean_absolute_percentage_error(truevalues, predictions)
print("\nMAPE(replaced zeros):", '%.4f' % mape_replaced_zeros, "%")

#Mean and standard deviation of the target, to put the errors in context
print("\ny_test M and SD (for comparison)")
print('%.4f' % y_test.mean(),'%.4f' % y_test.std())

MAE: 16.4852
MAPE: inf
RMSE: 29.1040

MAPE(replaced zeros): 162.8685 %

y_test M and SD (for comparison)
31.5728 44.1218


  after removing the cwd from sys.path.


### Monte Carlo Cross-Validation
Now repeat the process many times (ideally around 1,000; the demonstration below uses only 10 repetitions) and provide average fit statistics, with their standard deviation.

In [17]:
#Collect the MAE and RMSE obtained at every repetition
MAE, RMSE = [], []

#For demonstration purposes the process is repeated only 10 times
N_REPEATS = 10

for i in range(N_REPEATS):
    #Draw a new train/validation/test partition at every repetition. Seeding the
    #splits with the repetition index keeps the partitions different from one
    #another while making the whole experiment reproducible on re-run.
    #Loop-local names are used so that the single-split variables defined in the
    #earlier cells (X_train, X_test, y_test, predictions, ...) are not overwritten.
    X_mc_train, X_mc_test, y_mc_train, y_mc_test = train_test_split(
        X, y, train_size=0.75, random_state=i)
    X_mc_fit, X_mc_val, y_mc_fit, y_mc_val = train_test_split(
        X_mc_train, y_mc_train, train_size=0.75, random_state=i)

    #Fresh regressor with the tuned parameters, so the model fitted in the
    #previous section is kept instead of being refitted on every repetition
    mc_model = CatBoostRegressor(**bestparam)
    mc_model.fit(X_mc_fit, y_mc_fit, cat_features=categorical_features_indices,
                 eval_set=(X_mc_val, y_mc_val), early_stopping_rounds=80,
                 use_best_model=True, logging_level="Silent")

    #Evaluate on the test part of this repetition
    mc_predictions = mc_model.predict(X_mc_test)
    mc_truevalues = np.array(y_mc_test)

    MAE.append(mean_absolute_error(mc_truevalues, mc_predictions))
    RMSE.append(np.sqrt(mean_squared_error(mc_truevalues, mc_predictions)))

In [18]:
#Scores obtained at each repetition
print("MAE at each cross-validation step\n", MAE, "\n")
print("RMSE at each cross-validation step\n", RMSE, "\n")

#Mean (M) and standard deviation (SD) across the repetitions
mae_mean, mae_sd = np.mean(MAE), np.std(MAE)
rmse_mean, rmse_sd = np.mean(RMSE), np.std(RMSE)
print("MAE M", '%.4f' % mae_mean, "SD", '%.4f' % mae_sd, "\n")
print("RMSE M", '%.4f' % rmse_mean, "SD", '%.4f' % rmse_sd)

MAE at each cross-validation step
 [11.499690100141825, 13.437472463230856, 14.092179277479291, 15.632195515073683, 15.137832400236556, 13.985713216039198, 13.512252369686545, 13.429579187023693, 14.531671424713277, 13.961951194091617] 

RMSE at each cross-validation step
 [26.743695487654403, 36.3285772054398, 38.098133795600376, 30.70883385132934, 40.07287608204607, 38.04404761707792, 37.21866662393993, 26.85827714879597, 47.608030501326496, 24.097377393774458] 

MAE M 13.9221 SD 1.0641 

RMSE M 34.5779 SD 6.9297
