# 8. Naive Bayes

In [1]:
from random import seed
from math import sqrt, exp, pi

from functions import *

### Define functions
* Naive Bayes for classification using Gaussian distribution

In [2]:
def bayes_separate_by_class(dataset):
    """Group the rows of a dataset by their class value.

    Input: dataset - a list of rows; the last element of each row is taken as the class value.
    Output: dictionary with each distinct class value as key and the list of full rows
            (class value included) carrying that value, in their original order.
    """
    rows_by_class = {}
    for record in dataset:
        # setdefault creates the per-class list the first time a class value is seen,
        # then the full record is appended to that class's list
        rows_by_class.setdefault(record[-1], []).append(record)
    return rows_by_class

def mean(numbers):
    """Return the arithmetic mean of a sequence of numbers as a float.

    Raises ZeroDivisionError when the sequence is empty.
    """
    total = sum(numbers)
    count = float(len(numbers))
    return total / count

def stdev(numbers):
    """Return the sample standard deviation of a sequence of numbers.

    The squared deviations from the mean are summed and divided by (count - 1),
    not by count, before taking the square root.
    Raises ZeroDivisionError when the sequence holds fewer than two values.
    """
    center = mean(numbers)
    # Generator of (x - mean)^2 terms, summed in the same order as the input
    squared_deviations = ((value - center) ** 2 for value in numbers)
    degrees_of_freedom = float(len(numbers) - 1)
    return sqrt(sum(squared_deviations) / degrees_of_freedom)

def bayes_summarize_dataset(dataset):
    """Calculate the mean, standard deviation and count of every input column.

    Input: dataset - a list of rows; the last element of each row is the output class
           value and is excluded from the summary.
    Output: list of (mean, stdev, count) tuples, one per input column, in column order.
            An empty dataset, or one whose rows hold only the class value, gives [].

    The class column is dropped *before* any statistic is computed, so class values are
    never passed to mean()/stdev() and therefore do not have to be numeric.
    stdev() still raises ZeroDivisionError when the dataset has fewer than two rows.
    """
    # The * operator passes each row as a separate argument, so zip() yields one tuple
    # per column; slicing off the last tuple removes the output class column
    input_columns = list(zip(*dataset))[:-1]
    return [(mean(column), stdev(column), len(column)) for column in input_columns]

def bayes_summarize_by_class(dataset):
    """Summarize the input columns separately for every class in the dataset.

    Input: dataset - a list of rows whose last element is the class value.
    Output: dictionary; key = class value, value = list of (mean, stdev, count) tuples,
            one per input column, computed only from the rows of that class.
    """
    # Split the rows per class, then summarize each class subset on its own,
    # i.e. input column statistics "grouped by" class
    return {
        label: bayes_summarize_dataset(class_rows)
        for label, class_rows in bayes_separate_by_class(dataset).items()
    }

def gauss_calculate_probability(x, mean, stdev):
    """Evaluate the Gaussian probability density function at x.

    Input: x (value to evaluate), mean and standard deviation of the distribution.
    Output: the density 1 / (sqrt(2*pi) * stdev) * exp(-(x - mean)^2 / (2 * stdev^2)).
            This is a density value, so it can exceed 1 for small stdev.

    Raises ZeroDivisionError when stdev is 0.
    Note: the parameters `mean` and `stdev` hide the same-named functions inside this body.
    """
    squared_distance = (x - mean) ** 2
    twice_variance = 2 * stdev ** 2
    # Height of the bell curve at its centre
    peak_density = 1 / (sqrt(2 * pi) * stdev)
    return peak_density * exp(-(squared_distance / twice_variance))

def bayes_calculate_class_probabilities(summaries, row):
    """Score a row against every class using the per-class column summaries.

    Input: summaries - dictionary; key = class value, value = list of (mean, stdev, count)
           tuples, one per input column; row - a new row whose first elements are the
           input values in the same column order.
    Output: dictionary with one entry per class: P(class) multiplied by the Gaussian
            density of each input value under that class. The values are not normalized,
            so they do not sum to 1.
    """
    # Row count per class is read from the first column's tuple ([0]) at index [2];
    # every column of a class subset is expected to carry the same count
    class_counts = {label: column_stats[0][2] for label, column_stats in summaries.items()}
    total_rows = sum(class_counts.values())
    probabilities = {}
    for label, column_stats in summaries.items():
        # Start from the prior: P(Class = label) = count(label) / total_rows
        score = class_counts[label] / float(total_rows)
        # One (mean, stdev, count) tuple per input column; multiply in P(Xi | label)
        for position, (column_mean, column_stdev, _) in enumerate(column_stats):
            score *= gauss_calculate_probability(row[position], column_mean, column_stdev)
        probabilities[label] = score
    return probabilities

def bayes_predict(summaries, row):
    """Predict the class of a row as the class with the highest calculated probability.

    Input: summaries - per-class column summaries (dictionary of lists of tuples);
           row - the row of input values to classify.
    Output: the class value with the largest probability. When several classes tie, the
            one that comes first in the dictionary wins. None when there are no classes.
    """
    # Probability of the provided row belonging to each class; dictionary
    probabilities = bayes_calculate_class_probabilities(summaries, row)
    # max() replaces a manual search that used "best label is None" to detect the first
    # iteration, which misfired whenever a real class value was None.
    # default=None keeps the result for an empty dictionary.
    return max(probabilities, key=probabilities.get, default=None)

def naive_bayes(train, test):
    """Apply the Naive Bayes algorithm: fit on the training rows, predict the test rows.

    Input: train and test data sets (lists of rows).
    Output: list of predictions, one per test row, in the order of the test set.
    """
    # "Fitting" is just the summary statistics of each input column per class subset
    model = bayes_summarize_by_class(train)
    return [bayes_predict(model, sample) for sample in test]


### Testing NB on contrived dataset

In [3]:
# Contrived dataset: each row is [X1, X2, class value]
dataset = [
    [3.393533211, 2.331273381, 0],
    [3.110073483, 1.781539638, 0],
    [1.343808831, 3.368360954, 0],
    [3.582294042, 4.67917911, 0],
    [2.280362439, 2.866990263, 0],
    [7.423436942, 4.696522875, 1],
    [5.745051997, 3.533989803, 1],
    [9.172168622, 2.511101045, 1],
    [7.792783481, 3.424088941, 1],
    [7.939820817, 0.791637231, 1],
]

# Check the split of the dataset by distinct class values
separated = bayes_separate_by_class(dataset)
print('\nDataset split by classes')
row_number = 0
for label, class_rows in separated.items():
    print('Class:', label)
    # Row numbering continues from where the previous class stopped
    for row_number, row in enumerate(class_rows, start=row_number + 1):
        print('Row ' + str(row_number) + ':', row)

# Check the input variable summary statistics on the full dataset
summary = bayes_summarize_dataset(dataset)
print('\nSummary full dataset:', summary)

# ... and per class subset
summary = bayes_summarize_by_class(dataset)
print('\nSummary input variables per class subset of dataset')
for label, column_summaries in summary.items():
    print('Class:', label)
    for column_number, column_summary in enumerate(column_summaries, start=1):
        print('X' + str(column_number) + ':', column_summary)

# Check the Gaussian PDF
print('\nTest Gaussian probability when mean = 1, st. dev. = 1 (independent of class):')
# x = 1 is the top of the bell curve when mean = 1 and st. dev. = 1
for caption, x_value in (('P(x=0):', 0.0), ('P(x=1):', 1.0), ('P(x=2):', 2.0)):
    print(caption, gauss_calculate_probability(x_value, 1.0, 1.0))

# Check the class probabilities calculated for the first row of the dataset
summaries = bayes_summarize_by_class(dataset)
first_row = dataset[0]
probabilities = bayes_calculate_class_probabilities(summaries, first_row)
print('\nProbabilities for first row:', probabilities)


Dataset split by classes
Class: 0
Row 1: [3.393533211, 2.331273381, 0]
Row 2: [3.110073483, 1.781539638, 0]
Row 3: [1.343808831, 3.368360954, 0]
Row 4: [3.582294042, 4.67917911, 0]
Row 5: [2.280362439, 2.866990263, 0]
Class: 1
Row 6: [7.423436942, 4.696522875, 1]
Row 7: [5.745051997, 3.533989803, 1]
Row 8: [9.172168622, 2.511101045, 1]
Row 9: [7.792783481, 3.424088941, 1]
Row 10: [7.939820817, 0.791637231, 1]

Summary full dataset: [(5.178333386499999, 2.7665845055177263, 10), (2.9984683241, 1.218556343617447, 10)]

Summary input variables per class subset of dataset
Class: 0
X1: (2.7420144012, 0.9265683289298018, 5)
X2: (3.0054686692, 1.1073295894898725, 5)
Class: 1
X1: (7.6146523718, 1.2344321550313704, 5)
X2: (2.9914679790000003, 1.4541931384601618, 5)

Test Gaussian probability when mean = 1, st. dev. = 1 (independent of class):
P(x=0): 0.24197072451914337
P(x=1): 0.3989422804014327
P(x=2): 0.24197072451914337

Probabilities for first row: {0: 0.05032427673372075, 1: 0.00011557718

### Testing NB on Iris dataset

In [4]:
# Iris Flower: predict the flower species from measurements of iris flowers
# Multiclass classification problem
# NOTE(review): the baseline was previously stated here as 26%; with three equally
# represented species a majority-class baseline would be about 33% - confirm against data/iris.csv
print("\nEvaluate Naive Bayes on Iris Flower Species using k-fold cross-validation:")
# Load the Iris Flower Species dataset with the helpers imported from `functions`
dataset = load_csv('data/iris.csv')
# The last column is the output class; every column before it is an input variable
class_column = len(dataset[0]) - 1
for input_column in range(class_column):
    str_column_to_float(dataset, input_column)
str_column_to_int(dataset, class_column)

# Evaluate algorithm; the random seed is fixed first so the run is repeatable
# (presumably evaluate_algorithm draws its folds with `random` - verify in `functions`)
seed(1)
n_folds = 5
scores = evaluate_algorithm(dataset, naive_bayes, n_folds)
print('Scores: %s' % scores)
mean_accuracy = sum(scores)/float(len(scores))
print('Mean Accuracy: %.3f%%' % mean_accuracy)



Evaluate Naive Bayes on Iris Flower Species using k-fold cross-validation:
Loaded data file data/iris.csv with 150 rows and 5 columns.
Scores: [93.33333333333333, 96.66666666666667, 100.0, 93.33333333333333, 93.33333333333333]
Mean Accuracy: 95.333%
