# Naive Bayes Classifier from Scratch

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import scipy.stats as stats
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
# Load the raw passenger table from the CSV next to the notebook and
# list the dtype pandas inferred for each column.
titanic = pd.read_csv(filepath_or_buffer="Titanic.csv")
print(titanic.dtypes)

PassengerId     int64
Survived        int64
Pclass          int64
Sex            object
Age             int64
dtype: object


In [4]:
# Use PassengerId as the row index.
# The guard keeps this cell re-runnable: after the first execution the column
# has moved into the index, so calling set_index again would raise KeyError.
# If the index is not yet PassengerId and the column is missing, the KeyError
# from set_index still surfaces.
if titanic.index.name != "PassengerId":
    titanic = titanic.set_index("PassengerId")
print(titanic.head(5))

             Survived  Pclass     Sex  Age
PassengerId                               
1                   0       3    male   22
2                   1       1  female   38
3                   1       3  female   26
4                   1       1  female   35
5                   0       3    male   35


In [5]:
# Recode the 0/1 outcome flag as readable class labels in a single pass,
# then show the distinct labels that remain in the column.
titanic["Survived"] = titanic["Survived"].replace({0: "Died", 1: "Alive"})
print(set(titanic["Survived"]))

{'Died', 'Alive'}


### 1. Data Split

In [6]:
# Hold out 20% of the rows for evaluation (test_size=0.2); random_state is
# fixed so the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    titanic[["Pclass", "Sex", "Age"]],
    titanic["Survived"],
    test_size=0.2,
    random_state=96,
)
print("Train:", X_train.shape, y_train.shape)
print("Test:", X_test.shape, y_test.shape)

Train: (800, 3) (800,)
Test: (201, 3) (201,)


### 2.1. Prior

In [8]:
# Class prior P(y): count training rows per class label, then divide by the
# column total so the single "Prior" column sums to 1.
prior = pd.crosstab(index=y_train, columns="Prior").pipe(
    lambda counts: counts / counts.sum()
)
print(prior)

col_0      Prior
Survived        
Alive     0.4075
Died      0.5925


### 2.2. Likelihood for Categorical Features

In [9]:
# P(Pclass | y): one row per class label, one column per Pclass value.
# normalize="index" divides each row by its own total, so every row sums to 1.
likeli_pclass = pd.crosstab(index=y_train, columns=X_train["Pclass"], normalize="index")
print(likeli_pclass)

Pclass           1         2         3
Survived                              
Alive     0.420245  0.263804  0.315951
Died      0.191983  0.265823  0.542194


In [10]:
# P(Sex | y): one row per class label, one column per Sex value.
# normalize="index" divides each row by its own total, so every row sums to 1.
likeli_sex = pd.crosstab(index=y_train, columns=X_train["Sex"], normalize="index")
print(likeli_sex)

Sex         female      male
Survived                    
Alive     0.782209  0.217791
Died      0.101266  0.898734


### 2.3. Likelihood for Numerical Features

In [11]:
# Per-class Gaussian parameters for Age: the mean and standard deviation of
# the training ages within each class label, one row per label.
param_age = (
    X_train["Age"]
    .groupby(y_train)
    .agg(["mean", "std"])
    .rename(columns={"mean": "Mean", "std": "Std"})
)
print(param_age)

               Mean        Std
Survived                      
Alive     29.588957  14.586076
Died      30.651899  13.966553


In [12]:
def Likeli_Age(age):
    """Return the Gaussian likelihood of ``age`` under each class.

    For every label in ``prior.index`` the normal density is evaluated at
    ``age`` using that label's "Mean" and "Std" from ``param_age``.

    Parameters
    ----------
    age : value passed as ``x`` to ``scipy.stats.norm.pdf``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``prior.index`` with a single column "Age" holding the
        density for each class.
    """
    class_labels = prior.index
    densities = []
    for label in class_labels:
        mu = param_age.loc[label, "Mean"]
        sigma = param_age.loc[label, "Std"]
        densities.append(stats.norm.pdf(age, loc=mu, scale=sigma))
    return pd.DataFrame({"Age": densities}, index=class_labels)

### 2.4. Posterior

In [13]:
def Posterior(data):
    """Return the normalised class posterior for one observation.

    The per-class likelihoods of the observation's "Pclass", "Sex" and "Age"
    entries are multiplied together with the class prior, and the result is
    divided by its total so the posterior sums to 1 across classes.

    Parameters
    ----------
    data : mapping-like row with entries "Pclass", "Sex" and "Age"
        (for example one row of the feature DataFrame).

    Returns
    -------
    pandas.DataFrame
        One row per class label with a single column "Posterior".
    """
    p_pclass = likeli_pclass[data["Pclass"]]
    p_sex = likeli_sex[data["Sex"]]
    p_age = Likeli_Age(data["Age"])["Age"]
    # Unnormalised joint: likelihoods first, prior last (same order as before).
    joint = p_pclass * p_sex * p_age * prior["Prior"]
    # Builtin sum is kept so a NaN in any class propagates to the result.
    return (joint / sum(joint)).to_frame(name="Posterior")

### 3. Class Prediction

In [14]:
# Show the posterior for the first test row only. Evaluating Posterior for
# every row of X_test just to display element [0] is wasted work.
print(Posterior(data=X_test.iloc[0]))

          Posterior
Survived           
Alive       0.08603
Died        0.91397


In [15]:
## Maximum a Posteriori
# For each test row, predict the class label whose posterior is largest,
# then compare the predictions with the true labels.
y_pred = [Posterior(data=row)["Posterior"].idxmax() for _, row in X_test.iterrows()]
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

      Alive       0.82      0.79      0.80        80
       Died       0.86      0.88      0.87       121

avg / total       0.85      0.85      0.85       201

