# Naive Bayes Classifier on Iris Dataset

### https://www.kaggle.com/uciml/iris

This dataset includes three iris species with 50 samples each as well as some properties about each flower. One flower species is linearly separable from the other two, but the other two are not linearly separable from each other.

The columns in this dataset are:

SepalLengthCm,  
SepalWidthCm,  
PetalLengthCm,  
PetalWidthCm,  
Species

In [56]:
from csv import reader
from math import sqrt
from math import exp
from math import pi

### Load the CSV file

In [57]:
def load_csv(filename):
    dataset = list()
    with open(filename, 'r') as file:
        csv_reader = reader(file)
        for row in csv_reader:
            if not row:
                continue
            dataset.append(row)
    return dataset
 

In [58]:
filename = 'Iris.csv'
dataset = load_csv(filename)
dataset[:5]

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
 ['4.9', '3', '1.4', '0.2', 'Iris-setosa'],
 ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
 ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
 ['5', '3.6', '1.4', '0.2', 'Iris-setosa']]

### Convert string column to float

In [59]:
def str_column_to_float(dataset, column):
    for row in dataset:
            row[column] = float(row[column].strip())
            

### Convert string column to integer

In [60]:
def str_column_to_int(dataset, column):
    class_values = [row[column] for row in dataset]
    unique = set(class_values)
    lookup = dict()
    for i, value in enumerate(unique):
        lookup[value] = i
        print('[%s] => %d' % (value, i))
    for row in dataset:
        row[column] = lookup[row[column]]
    return lookup

### Split the dataset by class values, returns a dictionary

In [61]:
def separate_by_class(dataset):
    separated = dict()
    for i in range(len(dataset)):
        vector = dataset[i]
        class_value = vector[-1]
        if (class_value not in separated):
            separated[class_value] = list()
        separated[class_value].append(vector)
    return separated

### Calculate the mean of a list of numbers

In [62]:
def mean(numbers):
    return sum(numbers)/float(len(numbers))

### Calculate the standard deviation of a list of numbers

In [63]:
def stdev(numbers):
    avg = mean(numbers)
    variance = sum([(x-avg)**2 for x in numbers]) / float(len(numbers)-1)
    return sqrt(variance)
 

### Calculate the mean, stdev and count for each column in a dataset

In [64]:
def summarize_dataset(dataset):
    summaries = [(mean(column), stdev(column), len(column)) for column in zip(*dataset)]
    del(summaries[-1])
    return summaries

### Split dataset by class then calculate statistics for each row

In [65]:
def summarize_by_class(dataset):
    separated = separate_by_class(dataset)
    summaries = dict()
    for class_value, rows in separated.items():
        summaries[class_value] = summarize_dataset(rows)
    return summaries

### Calculate the Gaussian probability distribution function for x

In [66]:
def calculate_probability(x, mean, stdev):
    exponent = exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (sqrt(2 * pi) * stdev)) * exponent

### Calculate the probabilities of predicting each class for a given row

In [67]:
def calculate_class_probabilities(summaries, row):
    total_rows = sum([summaries[label][0][2] for label in summaries])
    probabilities = dict()
    for class_value, class_summaries in summaries.items():
        probabilities[class_value] = summaries[class_value][0][2]/float(total_rows)
        for i in range(len(class_summaries)):
            mean, stdev, _ = class_summaries[i]
            probabilities[class_value] *= calculate_probability(row[i], mean, stdev)
    return probabilities

### Predict the class for a given row

In [68]:
def predict(summaries, row):
    probabilities = calculate_class_probabilities(summaries, row)
    best_label, best_prob = None, -1
    for class_value, probability in probabilities.items():
        if best_label is None or probability > best_prob:
            best_prob = probability
            best_label = class_value
    return best_label

### Make a prediction with Naive Bayes on Iris Dataset

In [69]:
for i in range(len(dataset[0])-1):
    str_column_to_float(dataset, i)

str_column_to_int(dataset, len(dataset[0])-1)

model = summarize_by_class(dataset)

row = [2.7,3.9,2.2,4.3]

label = predict(model, row)
print('Data=%s, Predicted: %s' % (row, label))

[Iris-versicolor] => 0
[Iris-setosa] => 1
[Iris-virginica] => 2
Data=[2.7, 3.9, 2.2, 4.3], Predicted: 2
