In [18]:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('default')
from metrics import accuracy

from sklearn import datasets
from sklearn.model_selection import train_test_split

# Naive Bayes
Naive Bayes is a supervised learning algorithm based on applying Bayes’ theorem with the “naive” assumption of conditional independence between every pair of features given the value of the class variable.
The different naive Bayes classifiers differ mainly by the assumptions they make regarding the distribution of $P(x_i|y)$.

Naive Bayes classifiers work quite well, can be very fast, and require only a small amount of training data to estimate the necessary parameters. They have been famously used in document classification and spam filtering. However, although naive Bayes is a decent classifier, it is known to be a bad estimator, so its probability outputs are not to be taken too seriously.

Bayes Theorem:  

$P(A|B) = \frac{P(B|A) * P(A)} {P(B)} $   

The probability of A given B.    

In case of our target and data:    

$P(y|X)$   - Posterior Probability    
$P(y)$     - Prior probability of y    
$P(x_i|y)$ - Class conditional probability

Using the chain rule together with the naive conditional-independence assumption, $P(X|y) = \prod_i{P(x_i|y)}$, so we get:
$P(y|X) = \frac{P(y)  *  \prod_i{P(x_i|y)}} {P(X)} $

$\arg\max_y P(y|X)$ selects the class with the highest posterior probability.

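Finally, because $P(X)$ is the same for every class and the logarithm is monotonic (and turns the product into a numerically stable sum), the prediction is: $\arg\max_y \left( \log(P(x_1|y)) + \log(P(x_2|y)) + ... + \log(P(y)) \right)$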



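Using the Gaussian distribution, where $\mu_y$ and $\sigma_y^2$ are the mean and variance of feature $x_i$ among the training samples of class $y$: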

$P(x_i|y) = \frac{1}{\sqrt{2 \pi \sigma_y^2}} * exp(-\frac{(x_i-\mu_y)^2}{2 \sigma_y^2}) $

In [19]:
class NaiveBayes:
    """Gaussian Naive Bayes classifier.

    Each feature is modelled, per class, as an independent normal
    distribution. Prediction picks the class maximising
    ``log P(y) + sum_i log P(x_i | y)``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest feature variance of the training data that
        is added to every per-class variance in ``fit``. It keeps the
        variances strictly positive so a feature that is constant inside a
        class does not produce a division by zero.

    Attributes (set by ``fit``)
    ---------------------------
    _classes : ndarray of shape (n_classes,)
        Sorted unique labels seen in ``y``.
    n_classes : int
        Number of distinct labels.
    _mean, _var : ndarray of shape (n_classes, n_features)
        Per-class feature means and (smoothed) variances; row ``k`` belongs
        to ``_classes[k]``.
    _priors : ndarray of shape (n_classes,)
        Relative frequency of each class in the training labels.
    """

    def __init__(self, var_smoothing=1e-9):
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """Estimate per-class means, variances and priors.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.
        y : array-like of shape (n_samples,)
            Class labels; any values ``np.unique`` can sort (ints, strings, ...).

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            label per row of ``X``.
        """
        X = np.asarray(X, dtype=np.float64)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty data set")
        if y.ndim != 1 or y.shape[0] != n_samples:
            raise ValueError("y must be 1-D with one label per row of X")

        self._classes, class_counts = np.unique(y, return_counts=True)
        self.n_classes = len(self._classes)

        self._mean = np.zeros((self.n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((self.n_classes, n_features), dtype=np.float64)
        # Rows are addressed by the class *position*, not by the label value,
        # so labels such as {3, 7} or {"ham", "spam"} work as well as {0, 1}.
        for idx, clss in enumerate(self._classes):
            X_c = X[y == clss]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)

        # Variance floor relative to the data scale; fall back to an absolute
        # floor when every feature of X is constant.
        scale = X.var(axis=0).max()
        self._var += self.var_smoothing * (scale if scale > 0 else 1.0)

        # Prior P(y) = relative frequency of each class.
        self._priors = class_counts / n_samples

    def predict(self, X):
        """Return a list with the predicted class label for every row of ``X``."""
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Return the label with the highest log-posterior for one sample ``x``."""
        # P(X) is the same for every class, so it is dropped; summing
        # log-densities avoids the underflow of multiplying many small values.
        log_posteriors = [
            np.log(self._priors[idx]) + np.sum(self._gaussian_log_pd(idx, x))
            for idx in range(self.n_classes)
        ]
        return self._classes[np.argmax(log_posteriors)]

    def _gaussian_log_pd(self, cid, x):
        """Per-feature log of the normal density of ``x`` under class index ``cid``."""
        mean = self._mean[cid]
        var = self._var[cid]
        # Computed directly in log space: log(exp(...)) would turn into -inf
        # once the density underflows to 0 for points far from the mean.
        return -0.5 * (np.log(2.0 * np.pi * var) + np.square(x - mean) / var)

    def _gaussian_pd(self, cid, x):
        """Per-feature normal density of ``x`` under class index ``cid``."""
        mean = self._mean[cid]
        var = self._var[cid]
        numerator = np.exp(-np.square(x - mean) / (2.0 * var))
        denominator = np.sqrt(2.0 * np.pi * var)
        return numerator / denominator


In [20]:
# Synthetic two-class data set: 1000 samples with 10 features, seeded so the
# notebook produces the same data on every run.
X, y = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_classes=2,
    random_state=4,
)

# Hold out 20% of the samples for evaluation; the split is seeded as well.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1234,
)

In [21]:
# Fit the classifier on the training split, then label the held-out samples.
naive = NaiveBayes()
naive.fit(X=X_train, y=y_train)
predicted = naive.predict(X=X_test)

# Bare last expression, so the notebook renders the score as the cell output.
accuracy(y_test, predicted)

0.835