In [160]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
%matplotlib inline

In [54]:
# Load the Iris measurements as a DataFrame (four numeric feature columns plus
# the 'species' label column used throughout this notebook).
iris = sns.load_dataset('iris')

# Fix the seed so that Restart & Run All reproduces the same split, and
# stratify on the label so every species is represented in both subsets
# (the per-species statistics computed later need all classes present).
RANDOM_STATE = 42
iris_train, iris_test = train_test_split(
    iris, random_state=RANDOM_STATE, stratify=iris['species'])

In [16]:
def normal_density(x, mean, var):
    """Evaluate the univariate Gaussian probability density at ``x``.

    Parameters
    ----------
    x : scalar or array-like
        Point(s) at which the density is evaluated.
    mean : scalar or array-like
        Mean of the distribution.
    var : scalar or array-like
        Variance (not standard deviation) of the distribution.

    Returns
    -------
    The density value(s), following NumPy broadcasting of the inputs.
    """
    coefficient = 1 / np.sqrt(2 * np.pi * var)
    squared_deviation = (x - mean) ** 2
    exponent = -squared_deviation / (2 * var)
    return coefficient * np.exp(exponent)

In [146]:
# Per-species mean and variance of every feature; the result has one row per
# species and (feature, statistic) MultiIndex columns.
# The aggregations are given by name rather than as np.mean / np.var: pandas
# currently remaps those NumPy callables to its own methods, and newer versions
# warn that the callable will be used directly in the future, which would
# silently switch the variance from pandas' default (ddof=1) to NumPy's (ddof=0).
stats_train = iris_train.groupby(['species']).agg(['mean', 'var'])
stats_test = iris_test.groupby(['species']).agg(['mean', 'var'])

In [155]:
def get_posterior(sample, stats, species):
    """Return the unnormalised naive-Bayes posterior of ``species`` for ``sample``.

    Parameters
    ----------
    sample : sequence
        Feature values, positioned in the same order as the features appear
        in the columns of ``stats``. Extra trailing entries are ignored.
    stats : pandas.DataFrame
        Indexed by species, with two-level columns ``(feature, statistic)``
        containing at least the ``'mean'`` and ``'var'`` statistics.
    species : label
        Row label of ``stats`` identifying the class to score.

    Returns
    -------
    float
        ``prior * prod(per-feature Gaussian densities)``. The value is not
        normalised over classes; it is only meaningful when compared with the
        scores of the other species.
    """
    # Take the feature names in *column order*. ``columns.levels[0]`` is not
    # guaranteed to follow that order (levels may be stored sorted), which
    # would pair sample[i] with the statistics of a different feature.
    features = stats.columns.get_level_values(0).unique()

    # Statistics of the requested class, addressed by label instead of by
    # position so the result does not depend on the mean/var column order.
    class_stats = stats.loc[species]
    likelihoods = [normal_density(sample[i],
                                  class_stats[(feature, 'mean')],
                                  class_stats[(feature, 'var')])
                   for i, feature in enumerate(features)]

    # Share of rows in ``stats`` that belong to this species. With one row per
    # species (as a groupby produces) this is a uniform prior of 1 / n_classes.
    prior = np.sum(stats.index == species) / len(stats)

    return prior * np.prod(likelihoods)

def classify(sample, stats):
    """Pick the species with the highest posterior score for ``sample``.

    Every label in ``stats.index`` is scored with ``get_posterior``; the
    winner is returned as a ``(species, score)`` tuple. On ties the first
    label in index order wins.
    """
    labels = stats.index.values
    scores = [get_posterior(sample, stats, label) for label in labels]

    best = np.argmax(scores)
    return (labels[best], scores[best])

In [156]:
# Classify every held-out flower using the statistics estimated on the
# TRAINING split. Scoring the test rows against statistics computed from the
# test rows themselves would leak the labels being predicted and inflate the
# accuracy; it would also not be comparable with a model fitted on the
# training data. Re-run this cell to refresh any previously recorded output.
test_features = iris_test.drop('species', axis=1)  # feature columns only
result = pd.DataFrame({
    'result': test_features.apply(
        lambda r: classify(r.values, stats_train)[0], axis=1).values,
    'target': iris_test['species']})

# Fraction of test rows whose predicted species equals the true species.
success_rate = np.mean(result['result'].values == result['target'].values)

print(success_rate)

1.0


In [159]:
# Feature matrices and label vectors as plain NumPy arrays.
# DataFrame.as_matrix() is deprecated and no longer exists in pandas >= 1.0;
# .values returns the same array and is already used for the labels here.
features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
y_train = iris_train['species'].values
X_train = iris_train[features].values

y_test = iris_test['species'].values
X_test = iris_test[features].values

In [162]:
# Reference implementation: fit a Gaussian naive Bayes classifier on the
# training split (fit returns the estimator itself) and predict the test split.
model = GaussianNB().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [167]:
# Accuracy of the reference model: share of test labels predicted correctly.
np.equal(y_pred, y_test).mean()

1.0

In [168]:
# True when the hand-written classifier and the reference model agree on every test row.
np.all(np.equal(y_pred, result['result'].values))

True