In [21]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix

In [22]:
df = pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [23]:
temp = df.groupby("Outcome").size()
temp

Outcome
0    500
1    268
dtype: int64

In [24]:
y = df['Outcome']
# drop the col 'outcome'
x = df.drop(['Outcome'],axis=1)

In [25]:
# Split data into train & test
x_train, x_test, y_train, y_test = train_test_split(x, y, stratify=y, random_state=42)


In [26]:
x_train.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,576.0,576.0,576.0,576.0,576.0,576.0,576.0,576.0
mean,3.831597,120.767361,69.170139,20.723958,77.899306,32.064583,0.4802,33.536458
std,3.312864,31.77138,18.699887,15.877307,107.415003,7.861032,0.333188,11.878752
min,0.0,0.0,0.0,0.0,0.0,0.0,0.084,21.0
25%,1.0,99.0,62.0,0.0,0.0,27.6,0.24575,24.0
50%,3.0,116.5,72.0,23.0,40.0,32.4,0.384,30.0
75%,6.0,141.0,80.0,32.0,129.25,36.525,0.64625,41.0
max,17.0,199.0,122.0,99.0,744.0,67.1,2.329,81.0


In [27]:
x_test.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
count,192.0,192.0,192.0,192.0,192.0,192.0,192.0,192.0
mean,3.885417,121.276042,68.911458,19.973958,85.5,31.776562,0.446906,32.354167
std,3.542915,32.650006,21.253333,16.203689,136.216758,7.969892,0.325265,11.381513
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0
25%,1.0,99.0,64.0,0.0,0.0,26.075,0.237,23.0
50%,3.0,119.5,72.0,22.0,0.0,31.6,0.343,28.0
75%,6.0,140.0,80.0,32.0,120.0,36.6,0.5635,39.0
max,15.0,197.0,106.0,63.0,846.0,59.4,2.42,68.0


In [28]:
train_mean_pos = x_train[y_train==1].mean()
train_std_pos = x_train[y_train==1].std()
train_mean_neg = x_train[y_train==0].mean()
train_std_neg = x_train[y_train==0].std()

In [29]:
from math import sqrt
from math import pi
from math import exp
# formula of Gausian NB
def cond_probability(x, mean, std):
  exponent = exp(-((x - mean)**2/(2*std**2)))
  return (1 / (sqrt(2*pi)*std)) * exponent

In [30]:
def predict(row):
  prob_pos = len(x_train[y_train==1]) / len(x_train)
  for i in range(0,len(row)):
    prob_pos = prob_pos * cond_probability(row[i],train_mean_pos[i],train_std_pos[i])
  prob_neg = len(x_train[y_train==0]) / len(x_train)
  for i in range(0,len(row)):
    prob_neg = prob_neg * cond_probability(row[i],train_mean_neg[i],train_std_neg[i])
  return [prob_pos,prob_neg]

In [31]:
predictions_raw = []
for row in x_test.values.tolist():
  predictions_raw.append(predict(row))

In [32]:
predictions_raw[0]

[1.6299028206157718e-14, 1.0044068228290291e-14]

In [33]:
predictions_raw

[[1.6299028206157718e-14, 1.0044068228290291e-14],
 [4.1416091656376674e-13, 1.3042909855968897e-12],
 [1.1228918162921918e-13, 3.698652043170301e-12],
 [4.27008488096741e-13, 7.034723756517606e-13],
 [7.913590093643907e-14, 1.281328360925972e-15],
 [6.03509035365965e-14, 1.390772897035608e-12],
 [1.6545496022765486e-13, 8.00346760473016e-13],
 [3.1403211509634367e-17, 4.768433856559617e-16],
 [1.270299259926624e-13, 1.1016259499091278e-13],
 [2.1748571686748456e-14, 2.1739905042451717e-12],
 [4.971297069213275e-13, 4.668622177374758e-12],
 [4.5137523313178456e-14, 2.6278537816854723e-15],
 [5.544956140320912e-14, 4.4363696298950895e-13],
 [8.398499050734631e-14, 2.789658783732221e-12],
 [2.3296901261242477e-14, 1.430621412993164e-14],
 [1.5705424247395808e-16, 2.391268492201095e-18],
 [1.3372550935227474e-16, 8.091703856113028e-17],
 [3.7850981610406474e-14, 4.424312795466002e-13],
 [5.1879097868291456e-18, 8.258115874425753e-17],
 [1.2095736900610271e-13, 4.805126942734823e-13],
 [9.

In [34]:
predictions = []
for row in predictions_raw:
  if(row[0]>row[1]):
    predictions.append(1)
  else:
    predictions.append(0)

In [35]:
# comparing our predictions and actual output for accuracy
accuracy_score(y_test.tolist(),predictions)

0.7135416666666666

In [36]:
# plotting the output for comparison
confusion_matrix(y_test.tolist(),predictions)

array([[96, 29],
       [26, 41]])

In [37]:
model = GaussianNB() # Gaussian NB has been used becuase the data is continuous
model.fit(x_train,y_train)

GaussianNB()

In [38]:
confusion_matrix(y_test,model.predict(x_test))

array([[96, 29],
       [26, 41]])