In [1]:
import pandas as pd

data = pd.read_csv('pima.csv')

data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [2]:
data.shape

(768, 9)

In [3]:
num_classes = len(data['Outcome'].unique())
num_classes

2

In [4]:
train = data.sample(frac=0.6,random_state=1)
test = data.drop(train.index)

In [5]:
train.shape

(461, 9)

In [6]:
test.shape

(307, 9)

In [10]:
grouped_data = train.groupby('Outcome')
grouped_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
285,7,136,74,26,135,26.0,0.647,51,0
101,1,151,60,0,0,26.1,0.179,22,0
581,6,109,60,27,0,25.0,0.206,27,0
352,3,61,82,28,0,34.4,0.243,46,0
726,1,116,78,29,180,36.1,0.496,25,0
445,0,180,78,63,14,59.4,2.42,25,1
614,11,138,74,26,144,36.1,0.557,50,1
355,9,165,88,0,0,30.4,0.302,49,1
19,1,115,70,30,96,34.6,0.529,32,1
493,4,125,70,18,122,28.9,1.144,45,1


In [8]:
summary = []
for outcome in data['Outcome'].unique(): # for each class
    class_data = grouped_data.get_group(outcome) # get subset of data for this class
    class_summary = []
    for feature in class_data.iloc[:,:-1]: 
        # for each feature in this class (ignore the class label)
        class_summary.append((class_data[feature].mean(),class_data[feature].std())) 
        # mean & std dev
    summary.append(class_summary)
summary

[[(4.771604938271605, 3.6666614383968414),
  (141.08024691358025, 33.146574239121406),
  (73.30246913580247, 17.359225984609907),
  (23.22222222222222, 18.208591242351943),
  (98.95679012345678, 137.96004231874247),
  (34.86358024691358, 7.165581607449847),
  (0.5558827160493827, 0.38798194851077417),
  (37.611111111111114, 11.263776933491421)],
 [(3.3812709030100336, 2.9812248716073264),
  (109.79264214046823, 24.95601122006083),
  (67.49498327759197, 19.778873421113857),
  (20.02006688963211, 14.967737070634456),
  (77.91638795986623, 111.55458695846238),
  (30.52976588628763, 8.370831644957885),
  (0.4449063545150501, 0.31923388460545665),
  (31.02675585284281, 11.095949129804032)]]

In [11]:
import math

def normpdf(x, mean, sd): #  gaussian probability distribution function
    var = float(sd)**2
    denom = (2*math.pi*var)**.5
    num = math.exp(-(float(x)-float(mean))**2/(2*var))
    return num/denom

In [12]:
import numpy as np

def class_probability(sample_features, summary):
    probabilities = []
    for c in range(num_classes): # for each class
        prob = 1
        for i, feature in enumerate(sample_features): # for each feature
            prob *= normpdf(feature, summary[0][c][0], summary[0][c][1]) 
            # multiply the probabilities
        probabilities.append(prob)
    return np.array(probabilities)

In [13]:
def predict(probabilities): # return class with max probability
    return np.argmax(probabilities)

In [14]:
def accuracy(test_classes, predicted_classes): # calculate accuracy
    correct = 0
    total = len(predicted_classes)
    for actual, predicted in zip(test_classes, predicted_classes):
        if actual == predicted:
            correct += 1
    return correct/total

In [15]:
predicted_classes = []

num_test_samples = len(test)

for i in range(num_test_samples):
    sample = test.iloc[i]
    sample = sample[:-1] # remove the class label
    probabilities = class_probability(sample, summary)
    prediction = predict(probabilities)
    predicted_classes.append(prediction)

acc = accuracy(test['Outcome'], predicted_classes)

In [16]:
acc

0.34527687296416937