In [8]:
import pandas as pd

titanic_df = pd.read_csv("https://vincentarelbundock.github.io/Rdatasets/csv/COUNT/titanic.csv", index_col=0)
titanic_df

Unnamed: 0,class,age,sex,survived
1,1st class,adults,man,yes
2,1st class,adults,man,yes
3,1st class,adults,man,yes
4,1st class,adults,man,yes
5,1st class,adults,man,yes
...,...,...,...,...
1312,3rd class,child,women,no
1313,3rd class,child,women,no
1314,3rd class,child,women,no
1315,3rd class,child,women,no


## Categorical Naïve Bayes 

#### Encode data

In [9]:
from sklearn.preprocessing import OrdinalEncoder

titanic_x_enc_df = OrdinalEncoder().fit_transform(titanic_df[titanic_df.columns[:-1]])
titanic_x_enc_df

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       ...,
       [2., 1., 1.],
       [2., 1., 1.],
       [2., 1., 1.]])

#### Train/test split

In [10]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titanic_x_enc_df, titanic_df.survived, test_size=0.2)

#### Fit model and predict

In [11]:
from sklearn.naive_bayes import CategoricalNB

cnb = CategoricalNB(alpha=0.001)
cnb.fit(X_train, y_train)
y_pred = cnb.predict(X_test)

#### Baseline model (majority prediction) accuracy

In [12]:
survived_count = titanic_df.survived.value_counts()['no']
overall_count = titanic_df.survived.count()
majority_prediction_accuracy = survived_count/overall_count
majority_prediction_accuracy

0.6208206686930091

#### Accuracy of fitted NB model

In [48]:
correctly_predicted_count = (y_test == y_pred).value_counts()[True]
test_instance_count = y_test.count()
accuracy = correctly_predicted_count/test_instance_count
accuracy

0.8181818181818182

The NB model's accuracy is a lot better than the baseline model's accuracy.

#### Confusion matrix

In [49]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=["actual"], colnames=["predicted"])
confusion_matrix

predicted,no,yes
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
no,168,7
yes,41,48


## Classification tree

#### Encode data

The attribute `class` needs to be dummy encoded as it has three values. The other two attributes can be labelled with values 0 and 1, as this amounts to correct numeric conversion.

In [50]:
titanic_x_enc_df = pd.get_dummies(titanic_df, columns=['class'])
titanic_x_enc_df[['age', 'sex']] = OrdinalEncoder().fit_transform(titanic_df[['age','sex']])
titanic_x_enc_df.drop('survived', axis=1, inplace=True)
titanic_x_enc_df

Unnamed: 0,age,sex,class_1st class,class_2nd class,class_3rd class
1,0.0,0.0,1,0,0
2,0.0,0.0,1,0,0
3,0.0,0.0,1,0,0
4,0.0,0.0,1,0,0
5,0.0,0.0,1,0,0
...,...,...,...,...,...
1312,1.0,1.0,0,0,1
1313,1.0,1.0,0,0,1
1314,1.0,1.0,0,0,1
1315,1.0,1.0,0,0,1


#### Train/test split

In [51]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(titanic_x_enc_df, titanic_df.survived, test_size=0.2)

#### Fit model and predict

In [52]:
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

#### Baseline model (majority prediction) accuracy

In [53]:
survived_count = titanic_df.survived.value_counts()['no']
overall_count = titanic_df.survived.count()
majority_prediction_accuracy = survived_count/overall_count
majority_prediction_accuracy

0.6208206686930091

#### Accuracy of fitted classification tree model

In [54]:
correctly_predicted_count = (y_test == y_pred).value_counts()[True]
test_instance_count = y_test.count()
accuracy = correctly_predicted_count/test_instance_count
accuracy

0.7575757575757576

The classification tree's accuracy comparable to the NB model's accuracy.

#### Confusion matrix

In [55]:
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=["actual"], colnames=["predicted"])
confusion_matrix

predicted,no,yes
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
no,146,6
yes,58,54


However, the decision tree is a lot better at telling if someone will not survive than if they will survive.