# Naive Bayes on titanic dataset
dataset: https://www.kaggle.com/c/titanic/data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the dataset
training_data = pd.read_csv('titanic/train.csv')

training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [2]:
# Encode the 'sex' and 'embarked' columns
training_data['Sex'] = pd.Categorical(training_data['Sex']).codes
training_data['Embarked'] = pd.Categorical(training_data['Embarked']).codes

training_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",1,22.0,1,0,A/5 21171,7.25,,2
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.0,1,0,PC 17599,71.2833,C85,0
2,3,1,3,"Heikkinen, Miss. Laina",0,26.0,0,0,STON/O2. 3101282,7.925,,2
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.0,1,0,113803,53.1,C123,2
4,5,0,3,"Allen, Mr. William Henry",1,35.0,0,0,373450,8.05,,2


In [3]:
# Remove rows with missing values
training_data = training_data.dropna()

In [4]:
# Define feature vector X using the columns 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
X = np.asarray(training_data[features])
print(X.shape)
print(X)

# Define y using the column 'Survived'
y = np.asarray(training_data['Survived']).flatten()
print(y.shape)
print(y)

(185, 7)
[[ 1.      0.     38.     ...  0.     71.2833  0.    ]
 [ 1.      0.     35.     ...  0.     53.1     2.    ]
 [ 1.      1.     54.     ...  0.     51.8625  2.    ]
 ...
 [ 1.      0.     56.     ...  1.     83.1583  0.    ]
 [ 1.      0.     19.     ...  0.     30.      2.    ]
 [ 1.      1.     26.     ...  0.     30.      0.    ]]
(185,)
[1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1 0 0 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 1
 1 1 1 1 0 1 0 0 1 0 0 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 1 0 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 0 1 0 1 0 1 1 1 0 0 1 0 1 0 1 0 1 1 1 0 1
 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 0 1 1 1 1 0 0 1 1 1 1
 1 0 1 1 1 1 1 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0 1 1 1 1 1 1 1 1 0 1 0 1 1 1]


In [5]:
# Split the dataset into training data and testing data with an 8:2 ratio (assuming random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [6]:
# Standardize the training data
standardizer = preprocessing.StandardScaler()
X_train = standardizer.fit_transform(X_train)

In [7]:
# Create and train a machine learning model
model = GaussianNB()
classifier = model.fit(X_train, y_train)

In [8]:
# Standardize the testing data and make predictions with the trained model
X_test = standardizer.transform(X_test)
y_pred = model.predict(X_test)

In [9]:
# Validate the model results
accuracy = accuracy_score(y_test, y_pred)
num_correct_samples = accuracy_score(y_test, y_pred, normalize=False)
con_matrix = confusion_matrix(y_test, y_pred)

print('number of correct sample: {}'.format(num_correct_samples))
print('accuracy: {}'.format(accuracy))
print('confusion matrix: {}'.format(con_matrix))

number of correct sample: 30
accuracy: 0.8108108108108109
confusion matrix: [[13  3]
 [ 4 17]]
