**Navies Bayes**

In [1]:
import pandas as pd
import random
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the dataset
data = pd.read_csv("pima-indians-diabetes.csv",header = None)

In [3]:
# Visualize some data samples from the dataset
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [46]:
# 8th column is the class label
print('\n\n\nStats for the 7 features over the dataset and the 2 classes {8th column}{diabetic/not-diabetic}\n')
print(data.describe())




Stats for the 7 features over the dataset and the 2 classes {8th column}{diabetic/not-diabetic}

                0           1           2           3           4           5  \
count  768.000000  768.000000  768.000000  768.000000  768.000000  768.000000   
mean     3.845052  120.894531   69.105469   20.536458   79.799479   31.992578   
std      3.369578   31.972618   19.355807   15.952218  115.244002    7.884160   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      1.000000   99.000000   62.000000    0.000000    0.000000   27.300000   
50%      3.000000  117.000000   72.000000   23.000000   30.500000   32.000000   
75%      6.000000  140.250000   80.000000   32.000000  127.250000   36.600000   
max     17.000000  199.000000  122.000000   99.000000  846.000000   67.100000   

                6           7           8  
count  768.000000  768.000000  768.000000  
mean     0.471876   33.240885    0.348958  
std      0.331329   11.760232    0.476

In [47]:
TRAIN_TEST_RATIO = 0.8        # 80% training data
picker = list(range(data.shape[0]))        # get all indices as a list
## sometimes the data is arranged classwise and not randomly
## therefore we shuffle the indices
random.shuffle(picker)
trainMax = int(data.shape[0] * TRAIN_TEST_RATIO)

train_features = []
test_features = []
train_labels = []
test_labels = []

for pick in picker[:trainMax]:
    train_features.append(data.values[pick][:-1])
    train_labels.append(int(data.values[pick][-1]))
for pick in picker[trainMax:]:
    test_features.append(data.values[pick][:-1])
    test_labels.append(int(data.values[pick][-1]))

train_features = np.array(train_features)
test_features = np.array(test_features)

In [48]:
data.values[pick]

array([ 5.   , 73.   , 60.   ,  0.   ,  0.   , 26.8  ,  0.268, 27.   ,
        0.   ])

In [49]:
print(train_features.shape, len(train_labels), test_features.shape, len(test_labels))

(614, 8) 614 (154, 8) 154


### Exercise 1: Calculate Prior $P(Y)$

The formula for prior has been taught in class. This is also called the class probability. $P(Y)$ or $P(Y = y)$ is the fraction of the elements present in a class

In [50]:
# Get the number of unique classes & corresponding number of elements belonging to each class
classes, counts = np.unique(train_labels, return_counts=True)
print(classes)
print(counts)

[0 1]
[412 202]


In [51]:
### I assume my classes are from 0 ... N for some N (Here, we have just 2 classes)
num_classes = len(classes)
num_feats = train_features.shape[1]  #total number of features
total_samples = len(train_labels)    #total number of samples

### **Exercise 1: Find the prior probability of each class as the list `prior`**

In [52]:
# Prior for any class = {number of samples belonging to that class/ total_samples}
prior = np.array([ x*1.0/total_samples for x in counts ])

In [53]:
print(prior)

[0.67100977 0.32899023]


In [54]:
## Calculate the mean and variance per feature dimension here 
### from the training set from samples belonging to each class label.

means = np.zeros((num_feats, num_classes)) # every feature, for each class
stddev = np.zeros((num_feats, num_classes)) # every feature, for each class

# For each class
for y in classes: # selecting a class 'y'
    pts = train_features[np.where( train_labels == y )[0], :]    # get all samples belonging to 'y'
    # For each feature
    for i in range(num_feats):
        means[i, y] = np.mean(pts[:, i])
        stddev[i, y] = np.std(pts[:, i])

### This completes the training phase
### We know have estimated both the prior probability and the posterior distributions from our training set.

In [55]:
means,stddev

(array([[  3.37621359,   4.79207921],
        [109.52912621, 142.17326733],
        [ 68.54126214,  70.44554455],
        [ 19.43932039,  20.95544554],
        [ 68.19417476,  94.51980198],
        [ 30.17087379,  34.59653465],
        [  0.43646602,   0.53081683],
        [ 31.50242718,  36.59405941]]),
 array([[  3.02284461,   3.60504825],
        [ 25.64766168,  29.08105229],
        [ 18.08087941,  21.12432564],
        [ 14.75738116,  17.90467042],
        [ 98.69835464, 139.60759045],
        [  7.74978084,   7.13303831],
        [  0.31119032,   0.35237215],
        [ 11.99544792,  10.83967613]]))



### Exercise 2: Complete the Gaussian function ###

In [56]:
def gaussian(x, m, v):
    g = np.sqrt(1.0/2*np.pi*v*v)*np.exp( -1.0*(((x - m)/v)**2) )
    return g

### Exercise 3: Find the likelihood for each class 'y', once you have $P(X_{i}|y)$ from Exercise 2 ###

In [57]:
def get_likelihood(point, means, stddev):
    
    feat_prob = np.zeros((num_feats, num_classes))
    for y in classes:
        for i in range(num_feats):
            feat_prob[i, y] = gaussian(point[i], means[i, y], stddev[i, y]) # get the probability
    
    likelihood = np.zeros((num_classes, 1)) # likelihood for each class 'y'
    for y in classes:
        # Take the product of all the feature likelihoods of the class considered
        likelihood[y] = np.prod(feat_prob[np.nonzero(feat_prob), y]) # mutliply for each feature 'Xi'
    return likelihood

## Predict using Naive Bayes classifier

In [58]:
predictions = []
# For each test sample
for i in range(len(test_labels)):
    
    # Get its likelihood of belong to either class
    likelihood = get_likelihood(test_features[i, :], means, stddev)
    
    # Calculate the approximate posterior = likelihood * prior
    approx_posterior = [ np.asscalar(x*y) for x,y in zip(likelihood, prior) ]
    #approx because of missing P(X) (constant) in the denominator
    
    # Make the prediction as that class with the maximum approximate posterior
    prediction = np.argmax(approx_posterior)
    predictions.append(prediction)

In [59]:
print("Accuracy")
print(np.mean([x == y for x, y in zip(predictions, test_labels)]))

Accuracy
0.7077922077922078


### Use same dataset and Implement using a scikit-learn libraries ###

In [60]:
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split


In [61]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [62]:
X,y=data.iloc[:,:-1],data.iloc[:,-1]

In [76]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42,stratify=y)

In [77]:
X_train.shape,X_test.shape,y_test.shape,y_train.shape

((514, 8), (254, 8), (254,), (514,))

In [78]:
ss=StandardScaler()

In [79]:
Xtrain_std=ss.fit_transform(X_train)

In [80]:
NB=GaussianNB()

In [81]:
NB.fit(Xtrain_std,y_train)

GaussianNB()

In [82]:
Xtest_std=ss.transform(X_test)

In [83]:
y_pred=NB.predict(Xtest_std)

In [84]:
from sklearn.metrics import classification_report

In [85]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.80      0.81      0.80       165
           1       0.64      0.63      0.63        89

    accuracy                           0.74       254
   macro avg       0.72      0.72      0.72       254
weighted avg       0.74      0.74      0.74       254



In [86]:
NB.class_prior_

array([0.65175097, 0.34824903])

In [87]:
NB.class_count_

array([335., 179.])