## BaseNB

$$ \operatorname{LogSumExp}(x_1, \ldots, x_n) = \log\left(\sum_{i=1}^{n} e^{x_i}\right) $$

In [5]:
import numpy as np

try:
    # logsumexp evaluates log(sum(exp(x))) stably, even when the terms are tiny.
    from scipy.special import logsumexp
except ImportError:  # older SciPy releases only expose it in scipy.misc
    from scipy.misc import logsumexp

In [27]:
class BaseNaiveBayes:
    """Shared prediction logic for naive Bayes classifiers.

    A subclass must override ``_joint_log_likehood`` and store the class
    labels in ``self._classes`` (a NumPy array whose order matches the
    columns returned by ``_joint_log_likehood``) before any ``predict*``
    method is called.
    """

    def __init__(self):
        # Class labels; stays None until a subclass assigns it.
        self._classes = None

    def _joint_log_likehood(self, X):
        """Compute the unnormalized posterior log probability of X.

        I.e. ``log P(c) + log P(x|c)`` for every row x of X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Passed through unchanged from predict, predict_proba and
            predict_log_proba.

        Returns
        -------
        jll : array-like, shape = [n_samples, n_classes]
            One row per sample and one column per class; the callers in
            this class reduce over ``axis=1`` (the class axis).

        Raises
        ------
        NotImplementedError
            Always, in this base class; subclasses must override it.
        """
        raise NotImplementedError(
            "subclasses must implement _joint_log_likehood"
        )

    def predict_log_proba(self, X):
        """
        Return log-probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the log-probability of the samples for each class in
            the model. The columns follow the column order of
            ``_joint_log_likehood``.
        """
        jll = np.asarray(self._joint_log_likehood(X))
        # Normalize by log P(x): the log-sum-exp of each row over the classes.
        # keepdims leaves a (n_samples, 1) column that broadcasts against jll.
        log_prob_x = logsumexp(jll, axis=1, keepdims=True)
        return jll - log_prob_x

    def predict_proba(self, X):
        """
        Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array-like, shape = [n_samples, n_classes]
            Returns the probability of the samples for each class in
            the model; each row sums to 1. The columns follow the column
            order of ``_joint_log_likehood``.
        """
        return np.exp(self.predict_log_proba(X))

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]

        Returns
        -------
        C : array, shape = [n_samples]
            Predicted target values for X, taken from ``self._classes``.
        """
        jll = np.asarray(self._joint_log_likehood(X))
        # Normalization does not change the arg-max, so the unnormalized
        # scores are enough to pick the most probable class.
        return self._classes[np.argmax(jll, axis=1)]

## 高斯分布
$$ p(x = v | C_k) = \frac{1}{\sqrt{2\pi\sigma_k^2 }}e^{- \frac{(v - \mu_k)^2}{2\sigma_k^2}} $$

In [28]:
import numpy as np

class GaussianNaiveBayes(BaseNaiveBayes):
    """Gaussian naive Bayes: each feature is modelled per class as a normal
    distribution whose mean and variance are estimated in ``fit``.

    Parameters
    ----------
    var_smoothing : float, default=1e-9
        Fraction of the largest per-feature variance of the training data
        that is added to every per-class variance. Without it, a feature
        that is constant within a class has zero variance, which makes the
        log-likelihood evaluate ``log(0)`` and divide by zero.
    """

    def __init__(self, var_smoothing=1e-9):
        super().__init__()
        self.var_smoothing = var_smoothing

    def fit(self, X, y):
        """
        Estimate class priors and per-class Gaussian parameters.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data.
        y : array-like, shape = (n_samples,)
            Target labels.

        Returns
        -------
        self : GaussianNaiveBayes
            The fitted estimator, so calls can be chained.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y)

        # Prior P(c): relative frequency of every distinct label in y.
        self._classes, class_counts = np.unique(y, return_counts=True)
        self._y_prior_proba = class_counts / y.size

        # Per-class Gaussian parameters, one row per class and one column
        # per feature. Note that ``_sigma`` stores the variance (sigma^2).
        n_classes = self._classes.size
        n_features = X.shape[1]
        self._mu = np.zeros((n_classes, n_features))
        self._sigma = np.zeros((n_classes, n_features))
        for i, label in enumerate(self._classes):
            x_c = X[y == label]  # samples that belong to this class
            self._mu[i, :] = np.mean(x_c, axis=0)
            self._sigma[i, :] = np.var(x_c, axis=0)

        # Keep every variance strictly positive (when the data has any
        # spread at all) so the likelihood below stays finite.
        self._sigma += self.var_smoothing * np.var(X, axis=0).max()
        return self

    def _joint_log_likehood(self, X):
        """
        Compute ``log P(c) + log P(x|c)`` for every sample and class.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        jll : ndarray, shape = (n_samples, n_classes)
            Unnormalized log posterior; columns follow ``self._classes``.
        """
        X = np.asarray(X, dtype=float)
        per_class = []
        for prior, mu, var in zip(self._y_prior_proba, self._mu, self._sigma):
            # Log of the Gaussian density, summed over the features:
            # the normalization constant, then the squared-distance term.
            log_norm = -0.5 * np.sum(np.log(2 * np.pi * var))
            log_exponent = -0.5 * np.sum((X - mu) ** 2 / var, axis=1)
            per_class.append(np.log(prior) + log_norm + log_exponent)
        # Stacking yields (n_classes, n_samples); transpose to sample-major.
        return np.array(per_class).T

    def __str__(self):
        return "<GaussianNaiveBayes>"

In [29]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Small categorical toy data set (rows of [value, size label]); it is not
# referenced again in this cell.
test = np.array([
    [1, 'S'], [1, 'M'], [1, 'M'], [1, 'S'], [1, 'S'],
    [2, 'S'], [2, 'M'], [2, 'M'], [2, 'L'], [2, 'L'],
    [3, 'L'], [3, 'M'], [3, 'M'], [3, 'L'], [3, 'L'],
])

# Load iris and hold out a test split with a fixed seed for repeatability.
iris = load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the hand-written Gaussian naive Bayes and exercise each predict method.
gnb = GaussianNaiveBayes()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
proba = gnb.predict_proba(X_test)
log_proba = gnb.predict_log_proba(X_test)

# Report accuracy on the held-out split.
print(accuracy_score(y_test, y_pred))

1.0


Importing `logsumexp` from scipy.misc is deprecated in scipy 1.0.0. Use `scipy.special.logsumexp` instead.


## 伯努利

## 多项式