## <font color="orange">Gaussian Naive Bayes for Iris Flower Species Classification</font>

### Imports

In [1]:
import numpy as np
import pandas as pd #input
from numpy.random import rand
from numpy import mean, std #mean and standard deviation for gaussian probabilities
from scipy.stats import norm #gaussian probabilities
from math import log # to calculate posterior probability

### Constants

In [2]:
#name given to the label column after loading (the last column of the dataframe is renamed to this)
class_colname = 'class'
#fraction of rows expected to land in the training split; the remaining rows are used for testing
train_ds_percent = 0.8

### Environment

#### Iris Flower Species

In [3]:
#path of the csv file to load
f_data = '../input/iris-species/Iris.csv'
#columns to keep from the csv file; every other column is dropped after loading
f_cols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
    'Species',
]

#### Machine Learning Mastery
Alternative dataset from the Machine Learning Mastery tutorial (not used in the run recorded below): https://machinelearningmastery.com/naive-bayes-classifier-scratch-python/
<img src="../assets/images/MLMastery-GaussianNB.png">

f_data = '../input/ml_mastery/MLMastery-GaussianNB.csv'
f_cols = ['X1', 'X2', 'Y']

### Data

#### read the csv file

In [4]:
#load the complete csv file into a dataframe
df = pd.read_csv(filepath_or_buffer=f_data)

#### drop unwanted columns

In [5]:
#every column that is not listed in f_cols is unwanted: collect those names and drop them
wanted_cols = set(f_cols)
drop_cols = [col for col in df.columns if col not in wanted_cols]
df = df.drop(columns=drop_cols)

#### rename last column that supposedly has a class/label

In [6]:
#give the last column the name stored in class_colname; all other column names are kept
cols = list(df.columns)
cols[-1] = class_colname
df.columns = cols

#### Sanity check for data getting loaded

In [7]:
#show the first two rows to confirm that only the wanted columns remain and the last one is renamed
print(df.head(2))

   SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm        class
0            5.1           3.5            1.4           0.2  Iris-setosa
1            4.9           3.0            1.4           0.2  Iris-setosa


### Model

#### Training Algorithm

In [8]:
def train_gaussian_nb(df, class_colname='class'):
    '''
    Estimate the parameters of a Gaussian Naive Bayes model from a labelled dataframe.

    return
            classes: (array) of unique class names in the dataset,
             taken from the column named class_colname (in order of first appearance).

            features: (pandas Index) of feature column names in the dataset.
             this is every column except class_colname, in dataframe order.

            prior: (1-d array) of dim num_classes
            (fraction of the rows of df that belong to each class)

            mean_std: (3-d array) of dim num_classes x num_features x 2 (2: mean and std)
            (mean and population standard deviation (ddof=0) of every feature, given the class)

    arguments:
    df: (dataframe) with numeric feature columns and one column holding the class names.
    class_colname: (string) name of the column holding the class names.
    '''
    #unique class names (computed once and reused for the class count)
    classes = df[class_colname].unique()
    num_classes = len(classes)
    #features are selected by name, so the class column does not have to be the last column
    features = df.columns.drop(class_colname)
    num_features = len(features)
    #number of data points
    N = len(df)

    #data structures for priors and
    # (mean, standard deviation) pairs for each feature and class
    # to later calculate likelihood (conditional probability of feature given class)
    prior = np.zeros(num_classes)
    mean_std = np.zeros((num_classes, num_features, 2), dtype=float)

    #for each class...
    for cls, cls_name in enumerate(classes):
        #filter the rows of this class only once and reuse them for every statistic
        cls_rows = df.loc[df[class_colname] == cls_name, features]

        #prior probability of a data point belonging to class cls
        prior[cls] = len(cls_rows) / N

        #mean of every feature, given cls
        mean_std[cls, :, 0] = cls_rows.mean().to_numpy(dtype=float)
        #population standard deviation (ddof=0) of every feature, given cls
        mean_std[cls, :, 1] = cls_rows.std(ddof=0).to_numpy(dtype=float)

    return classes, features, prior, mean_std

#### Prediction Algorithm

In [9]:
def apply_gaussian_naive_bayes(num_classes, num_features, prior, mean_std, x):
    '''
    Classify one data point with a trained Gaussian Naive Bayes model.

    The score of a class is its log-prior plus the sum of the Gaussian log-densities
    of the features. Log-densities are computed directly (norm.logpdf) instead of taking
    the log of the density, so a point far from a class mean (density underflows to 0)
    no longer raises a math domain error.

    return (integer) the (0-based) index of class to which the data point belongs

    arguments:
    num_classes: (int) number of classes
    num_features: (int) number of features
    prior: (1-d array) of dim num_classes
           (prior probability of a set of features belonging to a class)
    mean_std: (3-d array) of dim num_classes x num_features x 2 (2: mean and std)
              (mean and standard deviation for all features, given the class)
    x: (list) of features; only the first num_features values are used

    raises:
    IndexError: if x has fewer than num_features values
    '''
    #smallest standard deviation used in the density: a feature that is constant within
    #a class has std 0, for which the gaussian density is undefined
    min_sigma = 1e-9

    #only the first num_features values are read
    x_values = np.asarray(list(x)[:num_features], dtype=float)
    if len(x_values) < num_features:
        raise IndexError('x has fewer values than num_features')

    params = np.asarray(mean_std, dtype=float)[:num_classes, :num_features]
    mu = params[..., 0]
    sigma = np.maximum(params[..., 1], min_sigma)

    #log-prior of every class; a prior of 0 gives -inf, which can never win the argmax
    with np.errstate(divide='ignore'):
        log_prior = np.log(np.asarray(prior, dtype=float)[:num_classes])

    #log-likelihood of every feature given every class (num_classes x num_features);
    #the base of the logarithm does not change which class has the highest score
    log_likelihood = norm.logpdf(x_values, loc=mu, scale=sigma)

    score = log_prior + log_likelihood.sum(axis=1)

    #return the index of class with the maximum-a-posterior probability
    return score.argmax()

#### Training

##### split dataset into training and testing

In [10]:
#mask a % of data for training, and the remaining for testing
#a fixed seed makes the split, and therefore the reported accuracy, repeatable across runs
split_seed = 42
rng = np.random.default_rng(split_seed)
#each row goes to the training split with probability train_ds_percent
mask = rng.random(len(df)) < train_ds_percent
df_train = df[mask]
df_test = df[~mask]

##### train

In [11]:
#fit the model on the training split only:
# class names, feature names, class priors and per-class (mean, std) of every feature
classes, features, prior, mean_std = train_gaussian_nb(df=df_train, class_colname=class_colname)

#### Prediction

In [12]:
#iterate over test dataset and count the number of correct and incorrect predictions
count_correct, count_incorrect = 0, 0
for index, row in df_test.iterrows():
    #actual class
    actual_cls = row[class_colname]
    #predicted class
    # the feature values are selected by name (the feature names returned by training),
    # so the input does not depend on the position of the class column
    pred_cls = apply_gaussian_naive_bayes(len(classes), len(features), prior, mean_std, row[features].to_list())
    if classes[pred_cls] == actual_cls:
        count_correct += 1
    else:
        count_incorrect += 1
print('Correct: ', count_correct, 'Incorrect: ', count_incorrect)
#the split is random, so the test set can be empty; avoid dividing by zero in that case
count_total = count_correct + count_incorrect
if count_total > 0:
    print('Percentage of correct predictions: ', (count_correct * 100)/count_total)
else:
    print('Percentage of correct predictions: ', 'n/a (the test set is empty)')

Correct:  28 Incorrect:  1
Percentage of correct predictions:  96.55172413793103
