In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [2]:

class NaiveBayes:
    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        #calc mean, var and prior
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y==c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._mean[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0]/float(n_samples)

    def predict(self, X):
        Y_pred = [self._predict(x) for x in X]
        return np.array(Y_pred)

    def _predict(self, x ):
        posteriors = []

        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            posterior = np.sum(np.log(self._pdf(idx,x)))
            posterior = posterior + prior
            posteriors.append(posterior)
        return self._classes[np.argmax(posteriors)]

    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(-((x-mean) ** 2)/(2*var))
        denominator = np.sqrt(2 *np.pi * var)
        return numerator/denominator

In [9]:
# loading in the dataset and checking to see that it has loaded propperly

dataset = pd.read_csv('./Datasets/Iris_data.csv')
print(len(dataset))
print(dataset.head())

150
   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [12]:
# removing no inputs in the data with the mean number of the data to help the effectivness of the k-nn 

y = dataset.iloc[:,4]
Y=[]
for i in range(len(y)):
    if y[i] == 'Iris-setosa':
        Y.append(0)
    elif y[i] == 'Iris-versicolor':
        Y.append(1)
    elif y[i] == 'Iris-virginica':
        Y.append(2)
print(Y)


TypeError: 'int' object is not iterable

In [5]:
# splitting the input and output data
X = dataset.iloc[:, 1:3]
Y = dataset.iloc[:, 8]
# splitting the data into the training data and the testing data
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, random_state=0, test_size = 0.2)

In [6]:
# standardizing the data to be of numbers between 0-1
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)

In [7]:
nb = NaiveBayes()
nb.fit(X_train, Y_train)
Y_pred = nb.predict(X_test)

  numerator = np.exp(-((x-mean) ** 2)/(2*var))
  return numerator/denominator


In [8]:
cm = confusion_matrix(Y_test, Y_pred)
print(cm)
# producing f1 score and accuracy score
print(f1_score(Y_test, Y_pred))
print(accuracy_score(Y_test, Y_pred))

[[107   0]
 [ 47   0]]
0.0
0.6948051948051948
