In [1]:
import numpy as np
import pandas as pd

In [284]:
# The CSV is read with header=None, so the column labels are supplied
# explicitly through `names`; the last column is the diabetes label.
df = pd.read_csv(
    'pima-indians-diabetes.csv',
    names=[
        "NumTimesPrg", "PlGlcConc", "BloodP",
        "SkinThick", "TwoHourSerIns", "BMI",
        "DiPedFunc", "Age", "HasDiabetes",
    ],
    header=None,
)
# Show the first rows as the cell output.
df.head()

Unnamed: 0,NumTimesPrg,PlGlcConc,BloodP,SkinThick,TwoHourSerIns,BMI,DiPedFunc,Age,HasDiabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [285]:
from sklearn.model_selection import train_test_split

In [286]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: every column except the last one.
X = df.iloc[:, :-1].values

# Label vector: the last column, passed through a LabelEncoder.
class_le = LabelEncoder()
y = class_le.fit_transform(df.iloc[:, -1].values)

# Hold out one third of the rows for testing; random_state=0 pins the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3.0,
    random_state=0,
)

In [290]:
class NaiveBayesClassifier:
    """Gaussian naive Bayes classifier that scores classes by likelihood only.

    Every feature is modelled, per class, as an independent Gaussian whose
    mean and variance are estimated from the training rows of that class.
    Class priors are NOT applied: `predict` picks the class whose Gaussian
    likelihood is largest.
    """

    def fit(self, X, y):
        """Estimate per-class feature means and variances.

        X: 2-D array with one row per sample and one column per feature.
        y: 1-D array of class labels, one per row of X.

        Stores the sorted distinct labels in `self.classes_` and the
        per-class statistics in `self.summaries` (same order).
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.classes_ = np.unique(y)
        self.summaries = self.get_summaries(X, y)

    def get_summaries(self, X, y):
        """Return a list with one {'means', 'variances'} dict per distinct label.

        The list follows the sorted order of the distinct labels in y, so
        labels do not have to be the integers 0..k-1.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        summaries = []
        for label in np.unique(y):
            # Rows of X that belong to the current class.
            grouped = X[y == label]
            summaries.append({
                'means': np.mean(grouped, axis=0),
                'variances': np.var(grouped, axis=0)
            })
        return summaries

    def predict(self, X):
        """Return, for every row of X, the label with the largest likelihood."""
        probabilities = self.calculateClassProbabilities(self.summaries, X)
        # Stack is (n_classes, n_samples); argmax over axis 0 picks a class index per sample.
        best = np.argmax(probabilities, axis=0)
        # Map indices back to the labels seen in fit(); if summaries were set
        # by hand without fit(), fall back to returning the raw indices.
        classes = getattr(self, 'classes_', None)
        return best if classes is None else classes[best]

    def predict_proba(self, X):
        """Return the per-class likelihoods for X.

        The result is a list with one 1-D array per class (in the order of
        `self.summaries`), each holding one value per row of X. The values
        are unnormalized Gaussian likelihoods, not posterior probabilities.
        """
        return self.calculateClassProbabilities(self.summaries, X)

    def calculateClassProbabilities(self, summaries, X):
        """Evaluate the likelihood of X under each class summary, in order."""
        return [
            self.calculateProbability(X, classSummaries['means'], classSummaries['variances'])
            for classSummaries in summaries
        ]

    # Gaussian model prob of one feature(x) in whole sample:
    # P(x|w) = 1/sqrt(2*pi*var)*exp(-(x-mean)^2/(2*var))
    # prob of matching all features = product of all P(x|w)
    def calculateProbability(self, X, means, variances):
        """Return the product over features of the Gaussian density, per row of X.

        X: 2-D array (rows are samples); means, variances: 1-D per-feature arrays.
        A zero variance leads to a division by zero here.
        """
        exponent = np.exp(-((X - means) ** 2) / (2 * variances))
        probs = (1 / np.sqrt(2 * np.pi * variances)) * exponent
        return np.prod(probs, axis=1)

In [288]:
# Fit the hand-written classifier on the training split.
bayes = NaiveBayesClassifier()
bayes.fit(X_train, y_train)

# Accuracy = fraction of predictions equal to the true label.
predictions = bayes.predict(X_train)
print('Train accuracy:')
print((predictions == y_train).sum() / len(y_train))

print('Test accuracy:')
predictions = bayes.predict(X_test)
print((predictions == y_test).sum() / len(y_test))

Train accuracy:
0.765625
Test accuracy:
0.7265625


In [289]:
# using sklearn
from sklearn.naive_bayes import GaussianNB

# Same train/test evaluation as above, with scikit-learn's GaussianNB for comparison.
clf = GaussianNB()
clf.fit(X_train, y_train)

predictions = clf.predict(X_train)
print('Train accuracy:')
print((predictions == y_train).sum() / len(y_train))

print('Test accuracy:')
predictions = clf.predict(X_test)
print((predictions == y_test).sum() / len(y_test))

Train accuracy:
0.771484375
Test accuracy:
0.75
