In [None]:
import pandas as pd
import random
import numpy as np

In [None]:
# Read the dataset
data = pd.read_csv("pima-indians-diabetes.csv")

In [None]:
data.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
0,1,85,66,29,0,26.6,0.351,31,0
1,8,183,64,0,0,23.3,0.672,32,1
2,1,89,66,23,94,28.1,0.167,21,0
3,0,137,40,35,168,43.1,2.288,33,1
4,5,116,74,0,0,25.6,0.201,30,0


In [None]:
data.describe()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.842243,120.859192,69.101695,20.517601,79.90352,31.990482,0.471674,33.219035,0.34811
std,3.370877,31.978468,19.368155,15.954059,115.283105,7.889091,0.331497,11.752296,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.371,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
data.columns

Index(['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'], dtype='object')

In [None]:
TRAIN_TEST_RATIO = 0.9        # 90% training data
picker = list(range(data.shape[0]))        # get all indices as a list
## sometimes the data is arranged classwise and not randomly
## therefore we shuffle the indices
random.shuffle(picker)
trainMax = int(data.shape[0] * TRAIN_TEST_RATIO)

train_features = []
test_features = []
train_labels = []
test_labels = []

for pick in picker[:trainMax]:
    train_features.append(data.values[pick][:-1])
    train_labels.append(int(data.values[pick][-1]))
for pick in picker[trainMax:]:
    test_features.append(data.values[pick][:-1])
    test_labels.append(int(data.values[pick][-1]))

train_features = np.array(train_features)
test_features = np.array(test_features)

In [None]:
data.values[pick]

array([  1.   , 121.   ,  78.   ,  39.   ,  74.   ,  39.   ,   0.261,
        28.   ,   0.   ])

In [None]:
print(train_features.shape, len(train_labels), test_features.shape, len(test_labels))

(690, 8) 690 (77, 8) 77


In [None]:

classes, counts = np.unique(train_labels, return_counts=True)
print(classes)
print(counts)

[0 1]
[450 240]


In [None]:

num_classes = len(classes)
num_feats = train_features.shape[1]  #total number of features
total_samples = len(train_labels) 

In [None]:
prior = np.array([ x*1.0/total_samples for x in counts ])

In [None]:
means = np.zeros((num_feats, num_classes)) # every feature, for each class
stddev = np.zeros((num_feats, num_classes)) # every feature, for each class

# For each class
for y in classes: # selecting a class 'y'
    pts = train_features[np.where( train_labels == y )[0], :]    # get all samples belonging to 'y'
    # For each feature
    for i in range(num_feats):
        means[i, y] = np.mean(pts[:, i])
        stddev[i, y] = np.std(pts[:, i])


In [None]:
def gaussian(x, m, v):
    g = np.sqrt(1.0/2*np.pi*v*v)*np.exp( -1.0*(((x - m)/v)**2) )
    return g

In [None]:
def get_likelihood(point, means, stddev):
    
    feat_prob = np.zeros((num_feats, num_classes))
    for y in classes:
        for i in range(num_feats):
            feat_prob[i, y] = gaussian(point[i], means[i, y], stddev[i, y]) # get the probability
    
    likelihood = np.zeros((num_classes, 1)) # likelihood for each class 'y'
    for y in classes:
        # Take the product of all the feature likelihoods of the class considered
        likelihood[y] = np.prod(feat_prob[np.nonzero(feat_prob), y]) # mutliply for each feature 'Xi'
    
    return likelihood

In [None]:
predictions = []
# For each test sample
for i in range(len(test_labels)):
    
    # Get its likelihood of belong to either class
    likelihood = get_likelihood(test_features[i, :], means, stddev)
    
    # Calculate the approximate posterior = likelihood * prior
    approx_posterior = [ np.asscalar(x*y) for x,y in zip(likelihood, prior) ]
    #approx because of missing P(X) (constant) in the denominator
   
    
    # Make the prediction as that class with the maximum approximate posterior
    prediction = np.argmax(approx_posterior)
    predictions.append(prediction)

  if __name__ == '__main__':


In [None]:
print("Accuracy")
print(np.mean([x == y for x, y in zip(predictions, test_labels)]))

Accuracy
0.7272727272727273


Use same dataset and Implement using a scikit-learn libraries

In [49]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
import sklearn.metrics as metrics

In [52]:
x=data.drop(["1"],axis=1)
y=data["1"]

In [55]:
x.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


In [56]:
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, train_size=0.9)

In [57]:
gaussian_nb = GaussianNB()
gaussian_nb.fit(x_train, y_train)

GaussianNB()

In [61]:
y_pred =gaussian_nb.predict(x_test)

In [62]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred)

0.7662337662337663