In [287]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # for data visualization purposes
import seaborn as sns # for statistical data visualization
%matplotlib inline

In [288]:
# Load the CSV file
file_path = 'iris.csv'
df = pd.read_csv(file_path)

In [289]:
# View dimensions of dataset
df.shape

(150, 5)

View top 5 rows of dataset


In [290]:
# Display the first few rows of the dataset to understand its structure
df.head(), df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   class         150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


(   sepal length  sepal width  petal length  petal width        class
 0           5.1          3.5           1.4          0.2  Iris-setosa
 1           4.9          3.0           1.4          0.2  Iris-setosa
 2           4.7          3.2           1.3          0.2  Iris-setosa
 3           4.6          3.1           1.5          0.2  Iris-setosa
 4           5.0          3.6           1.4          0.2  Iris-setosa,
 None)

In [291]:
# Explore categorical variables
categorical = [var for var in df.columns if df[var].dtype=='O']

print('There are {} categorical variables\n'.format(len(categorical)))
print('The categorical variables are :\n\n', categorical)

There are 1 categorical variables

The categorical variables are :

 ['class']


In [292]:
# View the categorical variables
df[categorical].head()

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa


Explore problems within categorical variables
First, I will explore the categorical variables.

Missing values in categorical variables


In [293]:
# Check missing values in categorical variables
df[categorical].isnull().sum()

class    0
dtype: int64

In [294]:
# View frequency counts of values in categorical variables
for var in categorical: 
    print(df[var].value_counts())

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64


In [295]:
# View frequency distribution of categorical variables
for var in categorical: 
    print(df[var].value_counts()/float(len(df)))

class
Iris-setosa        0.333333
Iris-versicolor    0.333333
Iris-virginica     0.333333
Name: count, dtype: float64


In [296]:
# Check labels in class variable
df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [297]:
# Check frequency distribution of values in native_country variable
df['class'].value_counts()

class
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [298]:
df[categorical].isnull().sum()

class    0
dtype: int64

In [299]:
# Check for cardinality in categorical variables
for var in categorical:
    print(var, ' contains ', len(df[var].unique()), ' labels')

class  contains  3  labels


In [300]:
# find numerical variables
numerical = [var for var in df.columns if df[var].dtype!='O']

print('There are {} numerical variables\n'.format(len(numerical)))
print('The numerical variables are :', numerical)

There are 4 numerical variables

The numerical variables are : ['sepal length', 'sepal width', 'petal length', 'petal width']


In [301]:
# view the numerical variables
df[numerical].head()

Unnamed: 0,sepal length,sepal width,petal length,petal width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [302]:
# check missing values in numerical variables
df[numerical].isnull().sum()

sepal length    0
sepal width     0
petal length    0
petal width     0
dtype: int64

In [303]:
# Check for negative values in numerical variables
df[numerical][df[numerical] < 0].count()

sepal length    0
sepal width     0
petal length    0
petal width     0
dtype: int64

In [304]:
# Shuffle the dataset to ensure randomness
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

Declare feature vector and target variable


In [305]:
# Split features (X) and target (y)
X = df.iloc[:, :-1].values  # All columns except the last one as features
y = df.iloc[:, -1].values   # The last column as target

Split data into separate training and test set


In [306]:
# Define the train-test split ratio
train_ratio = 0.7  # 70% training, 30% testing

# Calculate the split index
split_index = int(len(X) * train_ratio)

# Split the data into training and testing sets
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [307]:
# check the shape of X_train and X_test

X_train.shape, X_test.shape

((105, 4), (45, 4))

In [308]:
# check the shape of y_train and y_test

y_train.shape, y_test.shape

((105,), (45,))

In [309]:
# Outputs ready for Naive Bayes implementation
print("X_train:\n", X_train)
print("y_train:\n", y_train)

X_train:
 [[6.1 2.8 4.7 1.2]
 [5.7 3.8 1.7 0.3]
 [7.7 2.6 6.9 2.3]
 [6.  2.9 4.5 1.5]
 [6.8 2.8 4.8 1.4]
 [5.4 3.4 1.5 0.4]
 [5.6 2.9 3.6 1.3]
 [6.9 3.1 5.1 2.3]
 [6.2 2.2 4.5 1.5]
 [5.8 2.7 3.9 1.2]
 [6.5 3.2 5.1 2. ]
 [4.8 3.  1.4 0.1]
 [5.5 3.5 1.3 0.2]
 [4.9 3.1 1.5 0.1]
 [5.1 3.8 1.5 0.3]
 [6.3 3.3 4.7 1.6]
 [6.5 3.  5.8 2.2]
 [5.6 2.5 3.9 1.1]
 [5.7 2.8 4.5 1.3]
 [6.4 2.8 5.6 2.2]
 [4.7 3.2 1.6 0.2]
 [6.1 3.  4.9 1.8]
 [5.  3.4 1.6 0.4]
 [6.4 2.8 5.6 2.1]
 [7.9 3.8 6.4 2. ]
 [6.7 3.  5.2 2.3]
 [6.7 2.5 5.8 1.8]
 [6.8 3.2 5.9 2.3]
 [4.8 3.  1.4 0.3]
 [4.8 3.1 1.6 0.2]
 [4.6 3.6 1.  0.2]
 [5.7 4.4 1.5 0.4]
 [6.7 3.1 4.4 1.4]
 [4.8 3.4 1.6 0.2]
 [4.4 3.2 1.3 0.2]
 [6.3 2.5 5.  1.9]
 [6.4 3.2 4.5 1.5]
 [5.2 3.5 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.2 4.1 1.5 0.1]
 [5.8 2.7 5.1 1.9]
 [6.  3.4 4.5 1.6]
 [6.7 3.1 4.7 1.5]
 [5.4 3.9 1.3 0.4]
 [5.4 3.7 1.5 0.2]
 [5.5 2.4 3.7 1. ]
 [6.3 2.8 5.1 1.5]
 [6.4 3.1 5.5 1.8]
 [6.6 3.  4.4 1.4]
 [7.2 3.6 6.1 2.5]
 [5.7 2.9 4.2 1.3]
 [7.6 3.  6.6 2.1]
 [

**_ How to build the Naive Bayes from Scratch _**

Bayes' Theorem

P(A|B) = P(B|A) \* P(A) / P(B)

The probability of event A, given another event B, can be calculated as above formula.

Transfer this to our case, into class Lables and Features, then we can say:

P(y|X) = P(X|y) \* P(y) / P(X)

y = class lables that we want to predict
X = feature vector => X = {x1, x2, ..., xn}

Assume that features are mutually independent.

P(y|X) = P(x1|y) _ P(x2|y) _ ... _ P(xn|y) _ P(y) / P(X)

We want to select class with highest posterior probability


In [310]:
class NaiveBayes:

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)

        # calculate mean, var, and prior for each class
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors =  np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

    def predict(self, X):
        y_pred = [self._predict(x) for x in X]
        return np.array(y_pred)
    
    def _predict(self, x):
        posteriors = []

        # calculate posterior probability for each class
        for idx, c in enumerate(self._classes):
            prior = np.log(self._priors[idx])
            class_conditional = np.sum(np.log(self._pdf(idx, x)))
            posterior = prior + class_conditional
            posteriors.append(posterior)
        
        # return class with highest posterior probability
        return self._classes[np.argmax(posteriors)]
    
    def _pdf(self, class_idx, x):
        mean = self._mean[class_idx]
        var = self._var[class_idx]
        numerator = np.exp(- (x-mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator

In [311]:
def accuracy(y_true, y_pred):
    accuracy = np.sum(y_true == y_pred) / len(y_true)
    return accuracy

In [312]:
nb = NaiveBayes()
nb.fit(X_train, y_train)
predictions = nb.predict(X_test)

print('Naive Bayes classification accuracy:', accuracy(y_test, predictions))

Naive Bayes classification accuracy: 0.9555555555555556
