[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kjmazidi/Machine_Learning_3rd_edition/blob/master/Volume_II_Python/Part-2-ML_sklearn/4-01-DT-classification.ipynb)

# Decision Tree
## Titanic data

This notebook provides a simple example of classification with sklearn. It runs the Decision Tree algorithm on the Titanic data.

### Code Accompanying ***The Machine Learning Handbooks***, Volume II, Chapter 4

#### The book PDF is available in the GitHub repo: <https://github.com/kjmazidi/Machine_Learning_3rd_edition>

###### (c) 2025 KJG Mazidi, all rights reserved

In [17]:
# imports used in this notebook

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [3]:
### read the Titanic CSV with pandas, keeping only the four columns used below

df = pd.read_csv(
    'data/titanic3.csv',
    usecols=['pclass', 'survived', 'sex', 'age'],
)
print(df.head())
print('\nDimensions of data frame:', df.shape)

   pclass  survived     sex      age
0       1         1  female  29.0000
1       1         1    male   0.9167
2       1         0  female   2.0000
3       1         0    male  30.0000
4       1         0  female  25.0000

Dimensions of data frame: (1309, 4)


In [4]:
# replace each categorical column with its integer category codes
# (e.g. sex becomes 0 for female and 1 for male)
for col in ['survived', 'pclass', 'sex']:
    df[col] = df[col].astype('category').cat.codes
df.head()

Unnamed: 0,pclass,survived,sex,age
0,0,1,0,29.0
1,0,1,1,0.9167
2,0,0,0,2.0
3,0,0,1,30.0
4,0,0,0,25.0


In [5]:
# number of missing entries in each column

df.isna().sum()

pclass        0
survived      0
sex           0
age         263
dtype: int64

In [7]:
# impute the missing ages with the mean age, computed via numpy
# NOTE(review): the mean is taken over every row, i.e. before the
# train/test split performed later in the notebook.

age_mean = np.mean(df['age'])
df['age'] = df['age'].fillna(age_mean)
print(df['age'])

0       29.000000
1        0.916700
2        2.000000
3       30.000000
4       25.000000
          ...    
1304    14.500000
1305    29.881135
1306    26.500000
1307    27.000000
1308    29.000000
Name: age, Length: 1309, dtype: float64


In [10]:
# hold out 20% of the rows as a test set; the seed is fixed so the split is repeatable

feature_cols = ['pclass', 'age', 'sex']
X = df[feature_cols]
y = df['survived']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

print('train size:', X_train.shape)
print('test size:', X_test.shape)

train size: (1047, 3)
test size: (262, 3)


In [11]:
# instantiate sklearn classifier and fit the data
# random_state is fixed because the tree builder permutes the features at
# every split; without a seed, ties between equally good splits can be
# broken differently from run to run, so the fitted tree (and the metrics
# reported below) would not be reproducible.

clf = DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)

In [12]:
# predict the survived label for every row of the held-out test set

pred = clf.predict(X=X_test)

In [14]:
# score the test-set predictions with four sklearn metrics

for label, metric in [
    ('accuracy score: ', accuracy_score),
    ('precision score: ', precision_score),
    ('recall score: ', recall_score),
    ('f1 score: ', f1_score),
]:
    print(label, metric(y_test, pred))

accuracy score:  0.7786259541984732
precision score:  0.7692307692307693
recall score:  0.6
f1 score:  0.6741573033707865


In [16]:
# confusion matrix: rows are the true classes, columns the predicted classes

confusion_matrix(y_true=y_test, y_pred=pred)

array([[144,  18],
       [ 40,  60]])

In [18]:
# per-class precision / recall / f1 summary for the test set

print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       162
           1       0.77      0.60      0.67       100

    accuracy                           0.78       262
   macro avg       0.78      0.74      0.75       262
weighted avg       0.78      0.78      0.77       262

