### Importing libraries

In [106]:
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score ,precision_score,recall_score,f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

### Read dataset

In [107]:
#loading the Iris CSV (path is relative to the notebook's working directory) into a DataFrame
df = pd.read_csv(filepath_or_buffer="Iris.csv")

In [123]:
#displaying the DataFrame as the cell's output (bare last expression)
#NOTE(review): this cell's execution count (123) is higher than that of the cells below it and its
#output has no 'Id' column, so it appears to have been run after the drop cell - confirm with
#Restart & Run All
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [108]:
#previewing the first five rows to check the column layout
df.head(n=5)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [109]:
#dropping the 'Id' column so that only the measurements and the species label remain
#(in the preview above 'Id' looks like a plain row counter - confirm it carries no information)
#reassigning instead of inplace=True, with errors='ignore', keeps this cell safe to re-run:
#a second run no longer raises KeyError because 'Id' is already gone
df = df.drop(columns='Id', errors='ignore')

In [110]:
#building the feature matrix X and the integer-encoded target Y
from sklearn.preprocessing import LabelEncoder

#X and Y are not assigned in any earlier cell, so they are derived from df here;
#selecting the measurement columns by name works whether or not 'Id' has been dropped
FEATURE_COLUMNS = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = df[FEATURE_COLUMNS].values

#converting species to categorical variable: each species name becomes an integer class code
le = LabelEncoder()
Y = le.fit_transform(df['Species'])

### Split Dataset

In [111]:
#hold out 30% of the rows for testing; random_state=0 makes the shuffle repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

#peek at the first three rows of every split, separated by a dashed rule
print(X_train[:3], Y_train[:3], X_test[:3], Y_test[:3], sep='\n' + '-'*15 + '\n')

[[5.  2.  3.5 1. ]
 [6.5 3.  5.5 1.8]
 [6.7 3.3 5.7 2.5]]
---------------
[1 2 2]
---------------
[[5.8 2.8 5.1 2.4]
 [6.  2.2 4.  1. ]
 [5.5 4.2 1.4 0.2]]
---------------
[2 1 0]


### Data preprocessing

In [112]:
#feature scaling: the scaler is fitted on the training split only and the same
#fitted scaler is then applied to the test split
from sklearn.preprocessing import StandardScaler

sc_X = StandardScaler()
X_train, X_test = sc_X.fit_transform(X_train), sc_X.transform(X_test)

#first three scaled rows of each split, separated by a dashed rule
print(X_train[:3], X_test[:3], sep='\n' + '-'*15 + '\n')

[[-1.02366372 -2.37846268 -0.18295039 -0.29145882]
 [ 0.69517462 -0.10190314  0.93066067  0.73721938]
 [ 0.92435306  0.58106472  1.04202177  1.6373128 ]]
---------------
[[-0.10694994 -0.55721505  0.70793846  1.50872803]
 [ 0.1222285  -1.92315077  0.09545238 -0.29145882]
 [-0.45071761  2.6299683  -1.35224199 -1.32013702]]


### Gaussian Naive Bayes Classifier

In [113]:
#training a Gaussian Naive Bayes classifier on the scaled training split
gaussian = GaussianNB().fit(X_train, Y_train)

#predicting the class code of every test row
Y_pred = gaussian.predict(X_test)

#first ten test rows followed by the class predicted for each of them
print(X_test[:10], Y_pred[:10], sep='\n' + '-'*15 + '\n')

[[-0.10694994 -0.55721505  0.70793846  1.50872803]
 [ 0.1222285  -1.92315077  0.09545238 -0.29145882]
 [-0.45071761  2.6299683  -1.35224199 -1.32013702]
 [ 1.6118884  -0.32955909  1.37610509  0.73721938]
 [-1.02366372  0.80872067 -1.29656144 -1.32013702]
 [ 0.46599617  0.58106472  1.20906343  1.6373128 ]
 [-1.02366372  1.03637663 -1.40792255 -1.19155225]
 [ 0.92435306  0.12575281  0.48521625  0.35146505]
 [ 1.03894229 -0.55721505  0.5408968   0.22288028]
 [ 0.23681773 -0.55721505  0.09545238  0.0942955 ]]
---------------
[2 1 0 2 0 2 0 1 1 1]


In [114]:
#first 20 predictions (top line) against the first 20 true labels (bottom line)
print(Y_pred[:20], Y_test[:20], sep='\n')

[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0]
[2 1 0 2 0 2 0 1 1 1 2 1 1 1 1 0 1 1 0 0]


### Confusion Matrix

In [115]:
#confusion matrix of the true test labels against the predicted labels
cm = confusion_matrix(y_true=Y_test, y_pred=Y_pred)

print(cm)

[[16  0  0]
 [ 0 18  0]
 [ 0  0 11]]


In [116]:
#accuracy of the predictions on the test split
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('accuracy_Naive Bayes: %.3f' % (accuracy,))

accuracy_Naive Bayes: 1.000


In [117]:
#micro-averaged precision on the test split
#NOTE(review): for single-label multiclass targets average='micro' should give the same
#number as accuracy - confirm whether 'macro' (mean of per-class scores) was intended
precision = precision_score(y_true=Y_test, y_pred=Y_pred, average='micro')
print('precision_Naive Bayes: %.3f' % (precision,))

precision_Naive Bayes: 1.000


In [118]:
#micro-averaged recall on the test split
#NOTE(review): for single-label multiclass targets average='micro' should give the same
#number as accuracy - confirm whether 'macro' (mean of per-class scores) was intended
recall = recall_score(y_true=Y_test, y_pred=Y_pred, average='micro')
print('recall_Naive Bayes: %.3f' % (recall,))

recall_Naive Bayes: 1.000


In [119]:
#error rate is the complement of accuracy: the share of test rows predicted wrongly
error_rate = 1 - accuracy
print('error_rate_Naive Bayes: %.3f' % (error_rate,))

error_rate_Naive Bayes: 0.000


In [124]:
#calculating tp,fn,fp,tn for each class (that class vs. the rest) from the confusion matrix
#rows of cm are treated as actual classes and columns as predicted classes, so for class k:
#TP: the diagonal cell cm[k][k] (actual value and predicted value are the same)
#FN: the sum of row k except the TP cell
#FP: the sum of column k except the TP cell
#TN: the sum of every cell that is neither in row k nor in column k

def one_vs_rest_counts(matrix, k):
    """Return ``(tp, fn, fp, tn)`` for class index ``k`` of a square confusion matrix.

    Parameters
    ----------
    matrix : array-like of shape (n_classes, n_classes)
        Confusion matrix with actual classes on rows and predicted classes on columns.
    k : int
        Index of the class that is treated as the positive class.

    Returns
    -------
    tuple
        The counts ``(tp, fn, fp, tn)`` for class ``k``.
    """
    matrix = np.asarray(matrix)
    tp = matrix[k, k]
    fn = matrix[k, :].sum() - tp
    fp = matrix[:, k].sum() - tp
    #everything outside row k and column k is the total minus the three counts above
    tn = matrix.sum() - tp - fn - fp
    return tp, fn, fp, tn


#display names listed in the order of the row/column indices 0, 1, 2 of cm
CLASS_NAMES = ["Setosa", "Versicolor", "Virginica"]

#one loop replaces the three copy-pasted per-class sections and is not tied to a 3x3 matrix
for class_index, class_name in enumerate(CLASS_NAMES):
    tp, fn, fp, tn = one_vs_rest_counts(cm, class_index)
    print(class_name + ":")
    print("TP: ", tp)
    print("FN: ", fn)
    print("FP:", fp)
    print("TN:", tn)

Setosa:
TP:  16
FN:  0
FP: 0
TN: 29
Versicolor:
TP:  18
FN:  0
FP: 0
TN: 27
Verginica:
TP:  11
FN:  0
FP: 0
TN: 34
