In [1]:
import numpy as np

# Fixed seed so the train/test split -- and therefore every accuracy figure
# computed from it -- is identical on each Restart Kernel -> Run All.
# NOTE: outputs recorded before this seed was introduced came from an
# unseeded split and will differ from a fresh run.
SEED = 42
TRAIN_FRACTION = 0.7  # share of rows assigned to the training set

# Every column except the last is a feature; the last column is the label.
dataset = np.loadtxt("diabetes.csv", delimiter=",")
X = dataset[:, :-1]
y = dataset[:, -1]

np.random.seed(SEED)
n_train = int(len(dataset)*TRAIN_FRACTION)
# Shuffle the row indices once, then cut the shuffled order into two disjoint
# index sets so X and y stay aligned row-for-row.
perm_index = np.random.permutation(len(dataset))

X_train = X[perm_index[:n_train]]
X_test  = X[perm_index[n_train:]]
y_train = y[perm_index[:n_train]]
y_test  = y[perm_index[n_train:]]

In [2]:
def mean(array):
    """Return the arithmetic mean of ``array``.

    The sum of all elements (``np.sum``) is divided by ``len(array)``, i.e.
    by the length of the first axis.
    """
    total = np.sum(array)
    count = len(array)
    return total / count

In [3]:
def stdev(array):
    """Return the sample standard deviation of ``array``.

    Uses the ``len(array) - 1`` denominator (Bessel's correction) and the
    notebook's own ``mean`` helper for the centre.
    """
    # Array of the same shape as the input, filled with its mean.
    center = np.full(array.shape, mean(array))
    squared_dev = np.power(array - center, 2)
    variance = np.sum(squared_dev) / (len(array) - 1)
    return np.sqrt(variance)

In [4]:
def summarize(X_train=X_train, y=y, y_train=y_train):
    """Compute per-class, per-feature mean and sample standard deviation.

    Parameters
    ----------
    X_train : 2-D array
        One row per sample, one column per feature.
    y : array
        Only its distinct values are used; they decide which class labels
        get an entry in the result.
    y_train : array
        Labels aligned row-for-row with ``X_train``; used to select the rows
        belonging to each class. Previously this was read implicitly from the
        notebook namespace, which tied the function to one particular split.

    Returns
    -------
    dict
        Maps each label in ``np.unique(y)`` to a list with one
        ``[mean, stdev]`` pair per feature column.

    Notes
    -----
    The defaults are captured when this cell is executed; re-run the cell
    after re-creating the train/test split.
    """
    summaries = {}
    for label in np.unique(y):
        # Rows of the given class; transposing iterates feature by feature.
        class_rows = X_train[y_train == label, :]
        summaries[label] = [[mean(column), stdev(column)] for column in class_rows.T]
    return summaries

In [5]:
def probability(array, mean_stdev):
    """Return the product of per-feature Gaussian densities for one sample.

    Parameters
    ----------
    array : 1-D array
        Feature values of a single sample.
    mean_stdev : sequence of [mean, stdev] pairs
        One pair per feature, in the same order as ``array``.

    Returns
    -------
    The product over features of the normal density evaluated at the
    corresponding feature value.
    """
    # Transpose so row 0 holds every mean and row 1 every standard deviation.
    params = np.array(mean_stdev).T
    mu = params[0]
    sigma = params[1]

    exponent = -np.power(array - mu, 2) / (2 * np.power(sigma, 2))
    normalizer = np.sqrt(2 * np.pi) * sigma
    densities = np.exp(exponent) / normalizer
    return np.prod(densities)

In [6]:
def predict(summaries, array, y=y):
    """Return the class label with the highest likelihood for one sample.

    Parameters
    ----------
    summaries : dict
        Per-class list of ``[mean, stdev]`` pairs, as built by ``summarize``.
    array : 1-D array
        Feature values of the sample to classify.
    y : array
        Only its distinct values are used; each must be a key of
        ``summaries``.

    Returns
    -------
    The label whose ``probability`` is largest (the first one wins on ties),
    or ``None`` when no class yields a comparable likelihood -- e.g. every
    likelihood is NaN because a feature has zero variance within a class.
    """
    # Initialised up front so the all-NaN case returns None instead of
    # raising UnboundLocalError at the return statement.
    best_label = None
    best_prob = -1
    for label in np.unique(y):
        prob = probability(array, summaries[label])
        # Comparisons with NaN are False, so NaN likelihoods are skipped.
        if prob > best_prob:
            best_label = label
            best_prob = prob
    return best_label

In [7]:
def accuracy(pred, y_test=y_test):
    """Return the percentage of predictions that match ``y_test``.

    Each ``pred[i]`` is compared with ``y_test[i]``; the number of matches is
    divided by ``len(y_test)`` and scaled to 0-100.
    """
    hits = sum(1 for i in range(len(pred)) if pred[i] == y_test[i])
    return hits / len(y_test) * 100

In [8]:
# The class summaries depend only on the training data, so compute them once
# instead of once per test row.
summaries = summarize()
pred = [predict(summaries, array) for array in X_test]

print(accuracy(pred))

74.45887445887446


In [9]:
from sklearn.naive_bayes import GaussianNB

# Reference result: scikit-learn's Gaussian naive Bayes fitted and scored on
# the same split as the hand-written classifier, reported as a percentage.
clf = GaussianNB().fit(X_train, y_train)

sklearn_accuracy = clf.score(X_test, y_test) * 100
print(sklearn_accuracy)

77.48917748917748
