In [85]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits, load_breast_cancer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [123]:
np.set_printoptions(suppress=True)

In [36]:
def write(num, *args):
    with open(f'file{num}', 'w') as file:
        file.write(' '.join(map(str, args)))

In [11]:
data_digits = load_digits()
data_cancer = load_breast_cancer()

In [14]:
X1, y1 = data_digits.data, data_digits.target

In [15]:
X2, y2 = data_cancer.data, data_cancer.target

In [27]:
X1[:2], y1[:20]

(array([[ 0.,  0.,  5., 13.,  9.,  1.,  0.,  0.,  0.,  0., 13., 15., 10.,
         15.,  5.,  0.,  0.,  3., 15.,  2.,  0., 11.,  8.,  0.,  0.,  4.,
         12.,  0.,  0.,  8.,  8.,  0.,  0.,  5.,  8.,  0.,  0.,  9.,  8.,
          0.,  0.,  4., 11.,  0.,  1., 12.,  7.,  0.,  0.,  2., 14.,  5.,
         10., 12.,  0.,  0.,  0.,  0.,  6., 13., 10.,  0.,  0.,  0.],
        [ 0.,  0.,  0., 12., 13.,  5.,  0.,  0.,  0.,  0.,  0., 11., 16.,
          9.,  0.,  0.,  0.,  0.,  3., 15., 16.,  6.,  0.,  0.,  0.,  7.,
         15., 16., 16.,  2.,  0.,  0.,  0.,  0.,  1., 16., 16.,  3.,  0.,
          0.,  0.,  0.,  1., 16., 16.,  6.,  0.,  0.,  0.,  0.,  1., 16.,
         16.,  6.,  0.,  0.,  0.,  0.,  0., 11., 16., 10.,  0.,  0.]]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))

In [28]:
X2[:2], y2[:20]

(array([[  17.99    ,   10.38    ,  122.8     , 1001.      ,    0.1184  ,
            0.2776  ,    0.3001  ,    0.1471  ,    0.2419  ,    0.07871 ,
            1.095   ,    0.9053  ,    8.589   ,  153.4     ,    0.006399,
            0.04904 ,    0.05373 ,    0.01587 ,    0.03003 ,    0.006193,
           25.38    ,   17.33    ,  184.6     , 2019.      ,    0.1622  ,
            0.6656  ,    0.7119  ,    0.2654  ,    0.4601  ,    0.1189  ],
        [  20.57    ,   17.77    ,  132.9     , 1326.      ,    0.08474 ,
            0.07864 ,    0.0869  ,    0.07017 ,    0.1812  ,    0.05667 ,
            0.5435  ,    0.7339  ,    3.398   ,   74.08    ,    0.005225,
            0.01308 ,    0.0186  ,    0.0134  ,    0.01389 ,    0.003532,
           24.99    ,   23.41    ,  158.8     , 1956.      ,    0.1238  ,
            0.1866  ,    0.2416  ,    0.186   ,    0.275   ,    0.08902 ]]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]))

In [31]:
bnb = BernoulliNB()
mnb = MultinomialNB()
gnb = GaussianNB()

In [39]:
%%time
bnb_pred_1 = cross_val_score(bnb, X1, y1, n_jobs=2, scoring='accuracy')
mnb_pred_1 = cross_val_score(mnb, X1, y1, n_jobs=2, scoring='accuracy')
gnb_pred_1 = cross_val_score(gnb, X1, y1, n_jobs=2, scoring='accuracy')
print(f'Digits accuracy score on bayesian models:\n{bnb_pred_1.mean()}\n{mnb_pred_1.mean()}\n{gnb_pred_1.mean()}')



Digits accuracy score on bayesian models:
0.8258236507780582
0.8708771489735053
0.8186003803550138
CPU times: user 67.3 ms, sys: 35.4 ms, total: 103 ms
Wall time: 3.4 s




In [40]:
%%time
bnb_pred_2 = cross_val_score(bnb, X2, y2, n_jobs=2, scoring='accuracy')
mnb_pred_2 = cross_val_score(mnb, X2, y2, n_jobs=2, scoring='accuracy')
gnb_pred_2 = cross_val_score(gnb, X2, y2, n_jobs=2, scoring='accuracy')
print(f'Digits accuracy score on bayesian models:\n{bnb_pred_2.mean()}\n{mnb_pred_2.mean()}\n{gnb_pred_2.mean()}')

Digits accuracy score on bayesian models:
0.6274204028589994
0.8945790401930752
0.9367492806089297
CPU times: user 28 ms, sys: 7.04 ms, total: 35 ms
Wall time: 50.3 ms




In [41]:
write(1, gnb_pred_2.mean())

In [42]:
write(2, mnb_pred_1.mean())

In [43]:
write(3, 3, 4)

*This Naive Bayes tutorial is broken down into 5 parts:*

*Step 1*: __Separate By Class.__

*Step 2*: __Summarize Dataset.__

*Step 3*: __Summarize Data By Class.__

*Step 4*: __Gaussian Probability Density Function.__

*Step 5*: __Class Probabilities.__

In [191]:
class BernNB:
    def __init__(self):
        pass

    def fit(self, X, y):
        data = np.column_stack((X, y))
        n = np.unique(data[:, -1])
        self.n_size = []
        def stat(x):
            self.n_size.append(x.shape[0]) 
            return [x.mean(axis=0), x.std(axis=0)]
#         stat = lambda x: [x.mean(axis=0), x.std(axis=0)]
        self.params = np.array(list(map(lambda x: stat(data[data[:, -1] == x, :-1]), n)))
        self.data = data
        self.n_size = np.array(self.n_size)

    def gaussian(self, x):
        c = 0
        for r in x:
            p = (1 / (np.sqrt(2 * np.pi * self.params[:, 1]))) * np.exp(- np.power(r - self.params[:, 0], 2) / (2 * self.params[:, 1]))
            probs = np.prod(p, axis=1)
            probs *= self.n_size / self.data.shape[0]
            max_ = probs.argmax()
            print(f'{max_} {round(probs[max_], 8)}')
            
            if c == 20:
                break 
            c += 1
            
    def predict(self, data):
        preds = []
        print(self.gaussian(data).shape)


In [192]:
classifier = BernNB()
train_X1, test_X1, train_y1, test_y1 = train_test_split(X1, y1, shuffle=True, train_size=0.1, random_state=42)
classifier.fit(train_X1, train_y1)
classifier.predict(test_X1)

0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan
0 nan




AttributeError: 'NoneType' object has no attribute 'shape'

In [82]:
#prepare data 
X1.shape, y1.shape
print(y1[:15])

[0 1 2 3 4 5 6 7 8 9 0 1 2 3 4]


In [80]:
def bnb(X, y):
    data = np.column_stack((X, y))
    n = np.unique(data[:, -1])
    for i in list(map(lambda x: data[data[:, -1] == x, :-1], n)):
        mean, std = i.mean(axis=0), i.std(axis=0)
        const = (1 / (np.sqrt(2 * np.pi * std ** 2 + 1)))
#         p = const * np.exp(-((i - mean) ** 2) / (2 * std ** 2 + 1))
        print(p.shape[0] / data.shape[0])
        
        break

bnb(X1, y1)

(178, 64)
