In [None]:
import numpy as np
import scipy as sp
import sklearn

$$Pr[\mathbf{X}, Y] = Pr[Y]\prod^K_{j=1}Pr[X_j|Y]$$

対数尤度は
$$\mathcal{L}(\mathcal{D};\{Pr[y]\},\{Pr[x_j|y]\})=\sum_{(\mathbf{x}_i,y_i)\in\mathcal{D}}\ln{Pr[\mathbf{x}_i,y_i]}$$

予測するときは
$$\newcommand{\argmax}{\mathop{\rm arg~max}\limits}\\
\begin{eqnarray*}
\hat{y}&=&\argmax_y Pr[y|\mathbf{x}^{new}]\\
&=&\argmax_y \frac{Pr[y]Pr[\mathbf{x}^{new}|y]}{\sum_{y'}{Pr[y']Pr[\mathbf{x}^{new}|y']}}\\
&=&\argmax_y Pr[y]Pr[\mathbf{x}^{new}|y]\\
&=&\argmax_y \biggl(Pr[y]\prod_j{Pr[x^{new}_j|y]}\biggr)\\
&=&\argmax_y \biggl(\log{Pr[y]}+\sum_j{\log{Pr[x^{new}_j|y]}}\biggr)
\end{eqnarray*}$$

### 仕様設計
- データに依存しないアルゴリズムのパラメータは、クラスのコンストラクタの引数で指定する
- 学習はfit()メソッドで行う。訓練データとデータに依存したパラメータを、このメソッドの引数で指定する
- 予測はpredict()メソッドで行う。新規の入力データを、このメソッドの引数で指定する
- モデルのデータへの当てはめの良さの評価は、score()メソッドで行う。評価対象のデータを、このメソッドの引数で指定する
- 次元削減などのデータ変換は、transform()メソッドで行う。

In [None]:
class NaiveBayes1(object):
    """
    Naive Bayes classifier for binary classes and binary features.

    The class prior and the per-feature conditional distributions are
    estimated as plain relative frequencies (no smoothing is applied).

    Attributes
    ----------
    `pY_` : array_like, shape=(n_classes), dtype=float
        pmf of a class
    `pXgY_` : array_like, shape=(n_features, n_fvalues, n_classes), dtype=float
        pmf of feature values given a class
    """

    def __init__(self):
        # Both distributions stay None until fit() is called.
        self.pY_ = None
        self.pXgY_ = None

    def fit(self, X, y):
        """
        Fitting Model

        Parameters
        ----------
        X: array_like, shape=(n_samples, n_features), dtype=int
            feature values of training samples (each value must be 0 or 1)
        y: array_like, shape=(n_samples), dtype=int
            class labels of training samples (each label must be 0 or 1)

        Raises
        ------
        ValueError
            if X and y do not contain the same number of samples
        """

        # accept any array_like input (e.g. nested lists), as documented
        X = np.asarray(X)
        y = np.asarray(y)

        # constants
        n_samples = X.shape[0]
        n_features = X.shape[1]
        n_classes = 2
        n_fvalues = 2

        # check the size of y
        if n_samples != len(y):
            raise ValueError('Mismatched number of samples.')

        # Builtin int/float are used as dtypes: the np.int / np.float
        # aliases no longer exist in current NumPy releases.

        # count up n[yi=y]
        nY = np.zeros(n_classes, dtype=int)
        for i in range(n_samples):
            nY[y[i]] += 1

        # calc pY_
        self.pY_ = np.empty(n_classes, dtype=float)
        for i in range(n_classes):
            self.pY_[i] = nY[i] / float(n_samples)

        # count up n[x_ij=xj, yi=y]
        nXY = np.zeros((n_features, n_fvalues, n_classes), dtype=int)
        for i in range(n_samples):
            for j in range(n_features):
                nXY[j, X[i, j], y[i]] += 1

        # calc pXgY_
        self.pXgY_ = np.empty((n_features, n_fvalues, n_classes), dtype=float)
        for j in range(n_features):
            for xi in range(n_fvalues):
                for yi in range(n_classes):
                    self.pXgY_[j, xi, yi] = nXY[j, xi, yi] / float(nY[yi])

    def predict(self, X):
        """
        Predict class

        Parameters
        ----------
        X: array_like, shape=(n_samples, n_features), dtype=int
            feature values of unseen samples

        Returns
        -------
        y: array_like, shape=(n_samples), dtype=int
            predicted class labels
        """

        # accept any array_like input (e.g. nested lists), as documented
        X = np.asarray(X)

        # constants
        n_samples = X.shape[0]
        n_features = X.shape[1]

        # memory for return values
        y = np.empty(n_samples, dtype=int)

        # for each sample in X
        for i, xi in enumerate(X):

            # calc log joint probability of (xi, y) for every class y;
            # the fancy index picks pXgY_[j, xi[j], :] for each feature j
            logpXY = np.log(self.pY_) + \
                np.sum(np.log(self.pXgY_[np.arange(n_features), xi, :]), axis=0)

            # predict class
            y[i] = np.argmax(logpXY)

        return y
    
    

In [None]:
# Load the whitespace-delimited table as integers. The builtin int is used
# as dtype because the np.int alias no longer exists in current NumPy.
data = np.genfromtxt('vote_filled.tsv', dtype=int)

In [None]:
# Every column except the last holds the features; the last column is the label.
X, y = data[:, :-1], data[:, -1]

In [None]:
# Build the classifier and fit it on the whole data set; fit() stores the
# estimated distributions on the instance (pY_ and pXgY_).
clr = NaiveBayes1()
clr.fit(X, y)

In [None]:
# Predict the first 10 training samples and print
# "index true_label predicted_label" for each one.
predict_y = clr.predict(X[:10, :])
for i in range(10):
    # Single formatted string: prints the same text on Python 2 and 3.
    print('%s %s %s' % (i, y[i], predict_y[i]))

## クラスの再編成
予測メソッドなどの共通部分を含む抽象クラスを作成し、その抽象クラスを継承した下位クラスを実装していく

In [None]:
from abc import ABCMeta, abstractmethod

class BaseBinaryNaiveBayes(ABCMeta('_BaseBinaryNaiveBayesABC', (object,), {})):
    """
    Abstract Class for Naive Bayes whose classes and features are binary.

    Subclasses implement fit(), which must set `pY_` and `pXgY_`;
    predict() is shared by all subclasses.

    Attributes
    ----------
    `pY_` : array_like, shape=(n_classes), dtype=float
        pmf of a class
    `pXgY_` : array_like, shape=(n_features, n_fvalues, n_classes), dtype=float
        pmf of feature values given a class
    """

    # The base class in the header is created by calling ABCMeta directly.
    # A `__metaclass__` class attribute is only honoured by Python 2, so on
    # Python 3 it would leave fit() unenforced; inheriting from a class whose
    # metaclass is ABCMeta works on both.

    def __init__(self):
        # Both distributions stay None until fit() is called.
        self.pY_ = None
        self.pXgY_ = None

    @abstractmethod
    def fit(self, X, y):
        """
        Abstract method for fitting model

        Implementations must set the following attributes.

        Attributes
        ----------
        `pY_` : array_like, shape=(n_classes), dtype=float
            pmf of a class
        `pXgY_` : array_like, shape=(n_features, n_fvalues, n_classes), dtype=float
            pmf of feature values given a class
        """
        pass

    def predict(self, X):
        """
        Predict class

        Parameters
        ----------
        X: array_like, shape=(n_samples, n_features), dtype=int
            feature values of unseen samples

        Returns
        -------
        y: array_like, shape=(n_samples), dtype=int
            predicted class labels
        """

        # accept any array_like input (e.g. nested lists), as documented
        X = np.asarray(X)

        # constants
        n_samples = X.shape[0]
        n_features = X.shape[1]

        # memory for return values (builtin int: the np.int alias no longer
        # exists in current NumPy)
        y = np.empty(n_samples, dtype=int)

        # for each sample in X
        for i, xi in enumerate(X):

            # calc log joint probability of (xi, y) for every class y;
            # the fancy index picks pXgY_[j, xi[j], :] for each feature j
            logpXY = np.log(self.pY_) + \
                np.sum(np.log(self.pXgY_[np.arange(n_features), xi, :]), axis=0)

            # predict class
            y[i] = np.argmax(logpXY)

        return y

class NaiveBayes1(BaseBinaryNaiveBayes):
    """
    Naive Bayes class (1)

    Estimates the class prior and the per-feature conditional distributions
    as plain relative frequencies, counting with explicit Python loops.
    """

    def __init__(self):
        super(NaiveBayes1, self).__init__()

    def fit(self, X, y):
        """
        Fitting Model

        Parameters
        ----------
        X: array_like, shape=(n_samples, n_features), dtype=int
            feature values of training samples (each value must be 0 or 1)
        y: array_like, shape=(n_samples), dtype=int
            class labels of training samples (each label must be 0 or 1)

        Raises
        ------
        ValueError
            if X and y do not contain the same number of samples
        """

        # accept any array_like input (e.g. nested lists), as documented
        X = np.asarray(X)
        y = np.asarray(y)

        # constants
        n_samples = X.shape[0]
        n_features = X.shape[1]
        n_classes = 2
        n_fvalues = 2

        # check the size of y
        if n_samples != len(y):
            raise ValueError('Mismatched number of samples.')

        # Builtin int/float are used as dtypes: the np.int / np.float
        # aliases no longer exist in current NumPy releases.

        # count up n[yi=y]
        nY = np.zeros(n_classes, dtype=int)
        for i in range(n_samples):
            nY[y[i]] += 1

        # calc pY_
        self.pY_ = np.empty(n_classes, dtype=float)
        for i in range(n_classes):
            self.pY_[i] = nY[i] / float(n_samples)

        # count up n[x_ij=xj, yi=y]
        nXY = np.zeros((n_features, n_fvalues, n_classes), dtype=int)
        for i in range(n_samples):
            for j in range(n_features):
                nXY[j, X[i, j], y[i]] += 1

        # calc pXgY_
        self.pXgY_ = np.empty((n_features, n_fvalues, n_classes), dtype=float)
        for j in range(n_features):
            for xi in range(n_fvalues):
                for yi in range(n_classes):
                    self.pXgY_[j, xi, yi] = nXY[j, xi, yi] / float(nY[yi])
    

## NaiveBayes (2)

### 特徴分布の学習

次元 | ループ変数 | 大きさ | 意味
---- | ---- | ---- | ----
0 | i | n_samples | 事例
1 | j | n_features | 特徴
2 | xi | n_fvalues | 特徴値
3 | yi | n_classes | クラス

In [None]:
from nbayes1b import BaseBinaryNaiveBayes
import numpy as np

In [None]:
class NaiveBayes2(BaseBinaryNaiveBayes):
    """
    Naive Bayes (2)

    Estimates the class prior and the per-feature conditional distributions
    as relative frequencies, counting with NumPy broadcasting instead of
    explicit Python loops.
    """

    def __init__(self):
        super(NaiveBayes2, self).__init__()

    def fit(self, X, y):
        """
        Fitting Model

        Parameters
        ----------
        X: array_like, shape=(n_samples, n_features), dtype=int
            feature values of training samples (each value must be 0 or 1)
        y: array_like, shape=(n_samples), dtype=int
            class labels of training samples (each label must be 0 or 1)

        Raises
        ------
        ValueError
            if X and y do not contain the same number of samples
        """

        # accept any array_like input: the np.newaxis indexing below needs
        # real ndarrays, not nested lists
        X = np.asarray(X)
        y = np.asarray(y)

        # constants
        n_samples = X.shape[0]
        n_classes = 2
        n_fvalues = 2

        # check the size of y
        if n_samples != len(y):
            raise ValueError('Mismatched number of samples.')

        # count up n[yi=y]
        # (n_samples, 1) == (1, n_classes) -> (n_samples, n_classes) booleans
        nY = np.sum(y[:, np.newaxis] == np.arange(n_classes)[np.newaxis, :], axis=0)

        # calc pY_
        self.pY_ = np.true_divide(nY, n_samples)

        # count up n[x_ij=xj, yi=y]
        # every operand is laid out on the common 4-d index
        # (sample, feature, feature value, class) so that they broadcast
        ary_xi = np.arange(n_fvalues)[np.newaxis, np.newaxis, :, np.newaxis]
        ary_yi = np.arange(n_classes)[np.newaxis, np.newaxis, np.newaxis, :]
        ary_y = y[:, np.newaxis, np.newaxis, np.newaxis]
        ary_X = X[:, :, np.newaxis, np.newaxis]

        # summing over the sample axis leaves (n_features, n_fvalues, n_classes)
        nXY = np.sum(np.logical_and(ary_X == ary_xi, ary_y == ary_yi), axis=0)

        # calc pXgY_
        self.pXgY_ = np.true_divide(nXY, nY[np.newaxis, np.newaxis, :])
        
        

In [3]:
import numpy as np
from nbayes2 import *
# Load the table as integers (builtin int: the np.int alias no longer exists
# in current NumPy), split off the label column, and create one instance of
# each implementation.
data = np.genfromtxt("vote_filled.tsv", dtype=int)
X = data[:, :-1]
y = data[:, -1]
clr1 = NaiveBayes1()
clr2 = NaiveBayes2()

In [4]:
# Time fit() of the NaiveBayes1 instance (clr1) on the full data set.
%timeit clr1.fit(X, y)

The slowest run took 7.68 times longer than the fastest. This could mean that an intermediate result is being cached.
100 loops, best of 3: 6.76 ms per loop


In [5]:
# Time fit() of the NaiveBayes2 instance (clr2) on the same data.
%timeit clr2.fit(X, y)

1000 loops, best of 3: 529 µs per loop


In [6]:
# Fit NaiveBayes2 on the whole data set, then print
# "index true_label predicted_label" for the first 10 samples.
clr = NaiveBayes2()
clr.fit(X, y)

predict_y = clr.predict(X[:10, :])
for i in range(10):
    # Single formatted string: prints the same text on Python 2 and 3.
    print('%s %s %s' % (i, y[i], predict_y[i]))

0 1 1
1 1 1
2 0 0
3 0 0
4 0 0
5 0 0
6 0 1
7 1 1
8 1 1
9 0 0
