In [1]:
import numpy as np
import pandas as pd

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn import metrics

from NaiveBayes import NaiveBayes

import warnings
warnings.filterwarnings('ignore')

## Helper functions:

In [2]:
def split_training_set(X, y, encoded=False, random_state=1):
    """Split features and target into train/test sets, optionally label-encoding them.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. A copy is taken, so the caller's frame is not modified.
    y : pandas.Series
        Target values aligned with ``X``. A copy is taken as well.
    encoded : bool, default False
        When True, every object, string or boolean column of ``X`` and the
        target ``y`` are replaced by integer codes from ``LabelEncoder``
        before the split is made.
    random_state : int, default 1
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` exactly as produced by
        ``train_test_split``. With ``encoded=True`` the two target parts are
        NumPy arrays of integer codes.
    """
    X_data = X.copy()
    y_data = y.copy()

    if encoded:
        for col in X_data.columns:
            col_type = X_data[col].dtype
            # The np.object / np.bool aliases were removed in NumPy 1.24, so the
            # dtype is inspected through pandas instead. The string check also
            # covers pandas' dedicated string dtype, which is not `object`.
            needs_encoding = (
                pd.api.types.is_object_dtype(col_type)
                or pd.api.types.is_string_dtype(col_type)
                or pd.api.types.is_bool_dtype(col_type)
            )
            if needs_encoding:
                # A fresh encoder per column keeps the fitted mappings independent.
                X_data[col] = preprocessing.LabelEncoder().fit_transform(X_data[col])
        y_data = preprocessing.LabelEncoder().fit_transform(y_data)

    # NOTE: the encoders above are fitted on the full data set, i.e. before the
    # split, so the train and test parts share one label -> code mapping.
    X_train, X_test, y_train, y_test = train_test_split(
        X_data, y_data, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

def X_titanic_preprocessing(X):
    """Select the Titanic model features and replace missing values.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw Titanic table. It must contain the columns ``Pclass``, ``Sex``,
        ``Age``, ``SibSp``, ``Parch``, ``Fare``, ``Cabin`` and ``Embarked``.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the eight feature columns, in the order
        listed above. Missing values in object, string or boolean columns are
        replaced by the string ``'NaN'``; missing values in every other column
        are replaced by ``0``.

    Raises
    ------
    KeyError
        If any of the required feature columns is absent from ``X``.
    """
    feature_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked']

    # Choose features first so only the columns that are kept get processed.
    X_data = X[feature_cols].copy()

    # Fill na
    for col in feature_cols:
        col_type = X_data[col].dtype
        # The np.object / np.bool aliases were removed in NumPy 1.24, so the
        # dtype is inspected through pandas instead.
        is_categorical = (
            pd.api.types.is_object_dtype(col_type)
            or pd.api.types.is_string_dtype(col_type)
            or pd.api.types.is_bool_dtype(col_type)
        )
        # Assign the result back: `X_data[col].fillna(..., inplace=True)` acts
        # on a temporary column object and does not update the frame when
        # pandas Copy-on-Write is active.
        X_data[col] = X_data[col].fillna('NaN' if is_categorical else 0)

    return X_data

## The dataset: [Titanic](https://www.kaggle.com/c/titanic/data)

In [3]:
# Load the Kaggle Titanic training table from the working directory.
titanic_data = pd.read_csv('train.csv')

# Target column and cleaned feature frame, both derived from the raw table.
y_titanic = titanic_data['Survived']
X_titanic_processed = X_titanic_preprocessing(titanic_data)

# Raw (non-encoded) split: consumed by the custom NaiveBayes cell.
(X_train, X_test,
 y_train, y_test) = split_training_set(X_titanic_processed, y_titanic)

# Label-encoded split: consumed by the scikit-learn baseline cell.
(X_train_encoded, X_test_encoded,
 y_train_encoded, y_test_encoded) = split_training_set(
    X_titanic_processed, y_titanic, encoded=True
)

## Baseline model using Gaussian Naive Bayes from scikit-learn:

In [4]:
# Baseline: scikit-learn's Gaussian Naive Bayes, trained on the label-encoded split.
gnb = GaussianNB().fit(X_train_encoded, y_train_encoded)
y_pred_encoded = gnb.predict(X_test_encoded)

# Fraction of test rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_true=y_test_encoded, y_pred=y_pred_encoded))

Accuracy: 0.7354260089686099


## My implementation of Naive Bayes:

In [5]:
# Train the project-local NaiveBayes classifier on the non-encoded split
# (these variables come from the split_training_set call made without encoded=True).
my_nb = NaiveBayes()
my_nb.fit(X_train, y_train)
# test() is unpacked into two values; presumably the accuracy on the test set
# and the predictions for X_test -- TODO confirm against NaiveBayes.py.
acc, y_pred = my_nb.test(X_test, y_test)
print('Accuracy: ', acc)

Accuracy:  0.7130044843049327
