# CatBoost Algorithm
CatBoost is a state-of-the-art open-source gradient boosting on decision trees library. It's simple and easy to use. And is now regularly one of the top algorithms used in data science competitions as it produces very good results without extensive data clean-up or feature engineering.

In [1]:
# import libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# load the dataset
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   pclass       891 non-null    int64   
 2   sex          891 non-null    object  
 3   age          714 non-null    float64 
 4   sibsp        891 non-null    int64   
 5   parch        891 non-null    int64   
 6   fare         891 non-null    float64 
 7   embarked     889 non-null    object  
 8   class        891 non-null    category
 9   who          891 non-null    object  
 10  adult_male   891 non-null    bool    
 11  deck         203 non-null    category
 12  embark_town  889 non-null    object  
 13  alive        891 non-null    object  
 14  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.7+ KB


In [4]:
df.isnull().sum().sort_values(ascending=False)

deck           688
age            177
embarked         2
embark_town      2
survived         0
pclass           0
sex              0
sibsp            0
parch            0
fare             0
class            0
who              0
adult_male       0
alive            0
alone            0
dtype: int64

## Pre-Processing Data 

In [5]:
from sklearn.impute import KNNImputer
imputer = KNNImputer(n_neighbors=5)

df['age'] = imputer.fit_transform(df[['age']])
df.drop('deck', axis=1, inplace=True)
# imputing the empty values by using pandas fillna method
df['embarked'] = df['embarked'].fillna(df['embarked'].mode()[0])
df['embark_town'] = df['embark_town'].fillna(df['embark_town'].mode()[0])

In [6]:
# convert all the category and object columns to category columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
df[categorical_cols] = df[categorical_cols].astype('category')

In [7]:
# create feature and target or label columns 
X = df.drop('survived', axis=1)
y = df['survived']

# split the dataset into training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# call the model and fit the model 
model = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=3, loss_function='Logloss', eval_metric='Accuracy')

# fit the model
model.fit(X_train, y_train, cat_features=categorical_cols.to_list())

# predict the model
y_pred = model.predict(X_test)

# evaluate the model
print(f'Accuracy Score: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}')
print(f'Classification Report: \n {classification_report(y_test, y_pred)}')

0:	learn: 1.0000000	total: 229ms	remaining: 22.6s
1:	learn: 1.0000000	total: 240ms	remaining: 11.8s
2:	learn: 1.0000000	total: 248ms	remaining: 8.03s
3:	learn: 1.0000000	total: 259ms	remaining: 6.23s
4:	learn: 1.0000000	total: 271ms	remaining: 5.14s
5:	learn: 1.0000000	total: 282ms	remaining: 4.42s
6:	learn: 1.0000000	total: 290ms	remaining: 3.85s
7:	learn: 1.0000000	total: 301ms	remaining: 3.46s
8:	learn: 1.0000000	total: 313ms	remaining: 3.16s
9:	learn: 1.0000000	total: 324ms	remaining: 2.91s
10:	learn: 1.0000000	total: 335ms	remaining: 2.71s
11:	learn: 1.0000000	total: 347ms	remaining: 2.54s
12:	learn: 1.0000000	total: 358ms	remaining: 2.4s
13:	learn: 1.0000000	total: 369ms	remaining: 2.27s
14:	learn: 1.0000000	total: 380ms	remaining: 2.15s
15:	learn: 1.0000000	total: 388ms	remaining: 2.03s
16:	learn: 1.0000000	total: 395ms	remaining: 1.93s
17:	learn: 1.0000000	total: 407ms	remaining: 1.85s
18:	learn: 1.0000000	total: 419ms	remaining: 1.78s
19:	learn: 1.0000000	total: 430ms	remainin