In [1]:
import random

random.seed(42)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv("iris.csv", header=(0))
data = data.dropna(axis="rows")

classes = np.array(pd.unique(data[data.columns[-1]]), dtype=str)
attributes = list(data.columns)

data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
data = data.to_numpy()
nrow, ncol = data.shape
y = data[:, -1]
X = data[:, 0 : ncol - 1]

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [5]:
def likelyhood(y, Z):
    def gaussian(x, mu, sig):
        return np.exp(-np.power(x - mu, 2.0) / (2 * np.power(sig, 2.0)))

    prob = 1
    for j in np.arange(0, Z.shape[1]):
        m = np.mean(Z[:, j])
        s = np.std(Z[:, j])
        prob = prob * gaussian(y[j], m, s)
    return prob

In [6]:
P = pd.DataFrame(data=np.zeros((X_test.shape[0], len(classes))), columns=classes)
for i in np.arange(0, len(classes)):
    elements = tuple(np.where(y_train == classes[i]))
    Z = X_train[elements, :][0]
    for j in np.arange(0, X_test.shape[0]):
        x = X_test[j, :]
        pj = likelyhood(x, Z)
        P[classes[i]][j] = pj * len(elements) / X_train.shape[0]

In [7]:
P.head(10)

Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica
0,2.203966e-92,0.004345947,8.401969e-08
1,0.0001479191,7.322596e-20,5.675329e-35
2,3.574579e-294,6.684120999999999e-19,2.223171e-05
3,7.304744e-96,0.004012212,1.424931e-06
4,1.666488e-108,0.0008057105,1.41039e-06
5,0.001405723,6.169399000000001e-17,9.013438e-32
6,8.718053000000001e-55,0.002340174,4.316097e-11
7,2.564878e-183,6.862286e-14,0.001482946
8,9.841417e-98,0.0007266205,6.224668e-09
9,1.520404e-62,0.006288432,1.024099e-08


In [8]:
from sklearn.metrics import accuracy_score

y_pred = np.array(
    [P.columns[np.argmax(np.array(P.iloc[[i]]))] for i in np.arange(0, P.shape[0])],
    dtype=str,
)
score = accuracy_score(y_pred, y_test)
print("Accuracy:", score)

Accuracy: 1.0


In [9]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[19,  0,  0],
       [ 0, 13,  0],
       [ 0,  0, 13]])

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
score = accuracy_score(y_pred, y_test)
print("Accuracy:", score)

Accuracy: 1.0


In [11]:
df = pd.DataFrame({"Real Values": y_test, "Predicted Values": y_pred})
df

Unnamed: 0,Real Values,Predicted Values
0,Iris-versicolor,Iris-versicolor
1,Iris-setosa,Iris-setosa
2,Iris-virginica,Iris-virginica
3,Iris-versicolor,Iris-versicolor
4,Iris-versicolor,Iris-versicolor
5,Iris-setosa,Iris-setosa
6,Iris-versicolor,Iris-versicolor
7,Iris-virginica,Iris-virginica
8,Iris-versicolor,Iris-versicolor
9,Iris-versicolor,Iris-versicolor
