In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("./data/iris.csv")
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
label_enc = LabelEncoder()
data["species"] = label_enc.fit_transform(data["species"])

In [4]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
from sklearn.preprocessing import KBinsDiscretizer
X = data.iloc[:,:-1]
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit(X)

Xt = est.transform(X)

Xt = Xt.astype(np.int)
data.iloc[:,:-1] = Xt
y = data.iloc[:,-1]
X = data.iloc[:,:-1]

In [6]:
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0,1,0,0,0
1,0,1,0,0,0
2,0,1,0,0,0
3,0,1,0,0,0
4,0,2,0,0,0


In [7]:
print(np.unique(data["species"],return_counts=True))

(array([0, 1, 2]), array([50, 50, 50]))


In [8]:
np.unique(data["species"])

array([0, 1, 2])

In [9]:
np.unique(data["species"])

array([0, 1, 2])

In [10]:
prob_dict = {}
classes = np.unique(data["species"])
num_classes = len(classes)
num_features = len(X.columns)
num_samples = len(X)

for specie in classes: 
    prob_dict[specie] = {}
    prob_dict[specie]["feature"] = np.ones((num_classes,num_features))
    prob_dict[specie]["class_prob"] = 0


In [11]:
for x,c in zip(X.values,y.values):

    for j,e in enumerate(x):

        prob_dict[c]["feature"][e,j]+=1
    prob_dict[c]["class_prob"] += 1


In [12]:
prob_dict

{0: {'feature': array([[46.,  2., 51., 51.],
         [ 6., 35.,  1.,  1.],
         [ 1., 16.,  1.,  1.]]),
  'class_prob': 50},
 1: {'feature': array([[ 7., 22.,  1.,  1.],
         [39., 30., 49., 49.],
         [ 7.,  1.,  3.,  3.]]),
  'class_prob': 50},
 2: {'feature': array([[ 2., 12.,  1.,  1.],
         [28., 37.,  7.,  5.],
         [23.,  4., 45., 47.]]),
  'class_prob': 50}}

In [13]:
for i in prob_dict.keys():
   
    prob_dict[i]["feature"]/= (prob_dict[i]["class_prob"]+1) 
    
    prob_dict[i]["class_prob"]/= num_samples
    prob_dict[i]["class_prob"] = prob_dict[i]["class_prob"]

In [14]:
prob_dict

{0: {'feature': array([[0.90196078, 0.03921569, 1.        , 1.        ],
         [0.11764706, 0.68627451, 0.01960784, 0.01960784],
         [0.01960784, 0.31372549, 0.01960784, 0.01960784]]),
  'class_prob': 0.3333333333333333},
 1: {'feature': array([[0.1372549 , 0.43137255, 0.01960784, 0.01960784],
         [0.76470588, 0.58823529, 0.96078431, 0.96078431],
         [0.1372549 , 0.01960784, 0.05882353, 0.05882353]]),
  'class_prob': 0.3333333333333333},
 2: {'feature': array([[0.03921569, 0.23529412, 0.01960784, 0.01960784],
         [0.54901961, 0.7254902 , 0.1372549 , 0.09803922],
         [0.45098039, 0.07843137, 0.88235294, 0.92156863]]),
  'class_prob': 0.3333333333333333}}

In [15]:

prob_matrix = np.zeros((num_samples,3))

for i in range(num_samples):
    spec = y[i]
    for j in classes:
        prob = prob_dict[j]["class_prob"] * prob_dict[j]["feature"][X.iloc[i,:].values,np.arange(4)].prod()
        prob_matrix[i,j] = prob
   

In [16]:
prob_matrix[0:10]

array([[2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [9.43226964e-02, 3.44902292e-07, 3.94174048e-07],
       [9.43226964e-02, 3.44902292e-07, 3.94174048e-07],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06],
       [2.06330898e-01, 1.03470688e-05, 3.64610995e-06]])

In [17]:
arg_max = np.argmax(prob_matrix,axis=1)

In [18]:
arg_max

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [19]:
from sklearn.naive_bayes import CategoricalNB
clf = CategoricalNB()

In [20]:
clf.fit(Xt, y.values)

CategoricalNB()

In [21]:
clf.predict_proba(Xt)[0:10]

array([[9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99992164e-01, 3.65659171e-06, 4.17896196e-06],
       [9.99992164e-01, 3.65659171e-06, 4.17896196e-06],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05],
       [9.99932185e-01, 5.01445357e-05, 1.76699792e-05]])

In [22]:
pred = clf.predict(Xt)
pred = pred.astype(np.int)
print(pred) 

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1
 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 1 2 2 2 2
 2 2 2 2 2 2 2 2 1 2 2 2 1 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [23]:
(pred == arg_max).sum()

150

In [24]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [25]:
arg_max

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])