#### Importing modules

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, StratifiedKFold, LeaveOneOut, ShuffleSplit
from sklearn.metrics import accuracy_score
from sklearn import datasets

#### Gaussian Naive Bayes class

In [2]:
class GaussianNaiveBayes:
    # Constructor
    def __init__(self):
        self.sigmas = dict()
        self.dispersions = dict()
        self.probs = dict()
        self.targets = list()
        self.columns = list()
    
    # Fit method
    def fit(self, x, y, column_names):
        self.__init__()
        
        # Preparing DataFrame
        dataset = pd.DataFrame(data=x, index=None, columns=column_names[:-1])
        target_column_name = column_names[-1]
        dataset[target_column_name] = y
        
        # Preparing sigma / dispersion dictionaries 
        for column in dataset.drop(target_column_name, axis=1):
            self.sigmas[column] = dict()
            self.dispersions[column] = dict()
            for target in dataset[target_column_name].unique():
                sample = np.array(dataset.query('{0} == @target'.format(target_column_name))[column])
                self.sigmas[column][target] = sample.std()
                self.dispersions[column][target] = sample.mean()
                self.probs[column] = len(sample) / len(x)
        
        self.targets = dataset[target_column_name].unique()
        self.columns = column_names
    
    # Predict method
    def predict(self, X):
        predicts = list()
        for row in X:
            target_prob_dict = dict()
            for target in self.targets:
                for x, column in zip(row, self.columns[:-1]):
                    sigma = self.sigmas[column][target]
                    dispersion = self.dispersions[column][target]
                    prob = self.probs[column]
                    target_prob = ((1 / np.sqrt(np.pi*sigma)) * np.exp(-( (x-dispersion)**2 / (2*sigma**2)))) * prob
                    target_prob_dict[target] = target_prob_dict.get(target, 1) * target_prob
            predicts.append(max(target_prob_dict, key=target_prob_dict.get))
        return predicts

#### Testing our classifier on default sklearn datasets (different cross-validation methods are being passed as an arguments)

In [3]:
def cross_val_gnb(dataset, cross_val_func, cross_val_args):
    data = dataset.data
    target = dataset.target
    column_names = list(name.replace(' ', '_') for name in dataset.feature_names)
    column_names.append('target')
    cv = cross_val_func(**cross_val_args)
    efficency = list()
    for train_index, test_index in cv.split(data, target):
        train_x, test_x = data[train_index], data[test_index]
        train_y, test_y = target[train_index], target[test_index]
        gnb = GaussianNaiveBayes()
        gnb.fit(train_x, train_y, column_names)
        pred_y = gnb.predict(test_x)
        efficency.append(accuracy_score(test_y, pred_y))
    return (np.average(efficency) * 100)

#### Loading datasets

In [4]:
iris_dataset = datasets.load_iris()
wine_dataset = datasets.load_wine()
cancer_dataset = datasets.load_breast_cancer()

In [5]:
datasets = (iris_dataset, wine_dataset, cancer_dataset)
names = ('Iris', 'Wine', 'Cancer')
cross_val_methods = {LeaveOneOut: {},
                     ShuffleSplit : {"n_splits" : 5, "test_size" : 0.25},
                     StratifiedKFold : {"n_splits": 5, "shuffle": True}}

#### Preparing a DataFrame with average classification accuracy for every dataset

In [6]:
df = pd.DataFrame({'Dataset': names, 'Accuracy, %': [None for name in names], 'Cross-validation method': [None for name in names]})
df = df.set_index('Dataset')

In [7]:
for dataset, name, cross_val_method in zip(datasets, names, cross_val_methods):
    df['Accuracy, %'][name] = cross_val_gnb(dataset, cross_val_method, cross_val_methods[cross_val_method])
    df['Cross-validation method'][name] = cross_val_method.__name__

In [8]:
df

Unnamed: 0_level_0,"Accuracy, %",Cross-validation method
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1
Iris,95.333333,LeaveOneOut
Wine,98.666667,ShuffleSplit
Cancer,92.971588,StratifiedKFold
