-
Notifications
You must be signed in to change notification settings - Fork 0
/
Version_2_of_Naïve_Bayes_implmentation.py
181 lines (141 loc) · 5.05 KB
/
Version_2_of_Naïve_Bayes_implmentation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
# VERSION 2
# we need to calculate the mean and standard deviation of each feature in
# each class and then the class conditional probabilities of each feature in each class
# iris data :
"""
1. sepal length in cm
2. sepal width in cm
3. petal length in cm
4. petal width in cm
5. class:
-- Iris Setosa -> class label 0
-- Iris Versicolour -> class label 1
-- Iris Virginica -> class label 2
"""
import math
import random
import csv
def encode_class(mydata):
    """Replace the string class label in each row's last column with an
    integer code (0, 1, 2, ...) assigned in order of first appearance.

    Mutates `mydata` in place and returns it.
    """
    label_to_index = {}
    for row in mydata:
        label = row[-1]
        if label not in label_to_index:
            # Next unused code, in order of first appearance — same
            # assignment the original two-pass version produced.
            label_to_index[label] = len(label_to_index)
        row[-1] = label_to_index[label]
    return mydata
def handleDataset(filename, split, trainingSet=None, testSet=None):
    """Read `filename` as CSV and randomly partition rows into training
    and test sets.

    The first four fields of each row are converted to float. A row goes
    to `trainingSet` with probability `split`, otherwise to `testSet`.
    Lists passed by the caller are appended to in place (backward
    compatible); the pair (trainingSet, testSet) is also returned.
    """
    # Fix the mutable-default-argument bug: the original `=[]` defaults
    # were shared across calls, so repeated calls accumulated rows.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []
    with open(filename, 'r') as csvfile:
        dataset = list(csv.reader(csvfile))
    # NOTE(review): the original iterated range(len - 1), skipping the
    # final row — presumably to ignore a trailing blank line in
    # iris.data. Preserved for compatibility; confirm against the data.
    for x in range(len(dataset) - 1):
        for y in range(4):
            # Convert the string feature values to floats.
            dataset[x][y] = float(dataset[x][y])
        if random.random() < split:
            trainingSet.append(dataset[x])
        else:
            testSet.append(dataset[x])
    return trainingSet, testSet
def groupUnderClass(mydata):
    """Group rows by their class label (last column).

    Returns a dict mapping class label -> list of rows with that label,
    labels keyed in order of first appearance.
    """
    grouped = {}  # renamed: the original shadowed the builtin `dict`
    for row in mydata:
        grouped.setdefault(row[-1], []).append(row)
    return grouped
def mean(numbers):
    """Arithmetic mean of a non-empty sequence of numbers."""
    total = 0.0
    for value in numbers:
        total += value
    return total / len(numbers)
def std_dev(numbers):
    """Sample standard deviation of `numbers` (n-1 denominator,
    Bessel's correction). Requires at least two elements."""
    avg = sum(numbers) / float(len(numbers))
    variance = sum((v - avg) ** 2 for v in numbers) / float(len(numbers) - 1)
    return math.sqrt(variance)
def MeanAndStdDev(mydata):
    """Per-attribute (mean, std_dev) pairs for the rows in `mydata`,
    excluding the last column (the class label).

    zip(*mydata) transposes the rows so each `column` holds one
    attribute's values across all rows.
    """
    summaries = [(mean(column), std_dev(column)) for column in zip(*mydata)]
    # Drop the summary computed for the class-label column.
    summaries.pop()
    return summaries
def MeanAndStdDevForClass(mydata):
    """Map each class label to the per-attribute (mean, std_dev)
    summaries of the rows belonging to that class."""
    return {
        class_value: MeanAndStdDev(rows)
        for class_value, rows in groupUnderClass(mydata).items()
    }
def calculateGaussianProbability(x, mean, stdev):
    """Gaussian probability density at `x` for the given mean and
    standard deviation. Raises ZeroDivisionError when stdev == 0."""
    exponent = math.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return exponent / (math.sqrt(2 * math.pi) * stdev)
def calculateClassProbabilities(info, test):
    """Naive-Bayes likelihood of the example `test` under each class.

    `info` maps class label -> list of (mean, std_dev) per attribute.
    The per-attribute Gaussian densities are multiplied together
    (no class prior is applied).
    """
    probabilities = {}
    for class_value, summaries in info.items():
        likelihood = 1
        for index, (attr_mean, attr_std) in enumerate(summaries):
            likelihood *= calculateGaussianProbability(test[index], attr_mean, attr_std)
        probabilities[class_value] = likelihood
    return probabilities
def predict(info, test):
    """Return the class label whose likelihood for `test` is highest
    (None if `info` is empty). Ties keep the first label seen."""
    best_label, best_prob = None, -1
    for label, prob in calculateClassProbabilities(info, test).items():
        if best_label is None or prob > best_prob:
            best_label, best_prob = label, prob
    return best_label
def getPredictions(info, test_set):
    """Predicted class label for every example in `test_set`."""
    return [predict(info, example) for example in test_set]
def accuracy_rate(test, predictions):
    """Percentage of examples whose true label (last column) equals the
    corresponding prediction."""
    correct = sum(
        1 for row, predicted in zip(test, predictions) if row[-1] == predicted
    )
    return correct / float(len(test)) * 100.0
# main code
# Driver script: load the iris data, "train" (summarize), and evaluate.
trainingSet = []
testSet = []
# Split iris.data roughly 50/50 into training and test rows.
handleDataset('iris.data', 0.5, trainingSet, testSet)
# Convert string class labels to integer codes.
# NOTE(review): encoding the two splits separately can assign DIFFERENT
# integer codes to the same class name if first-appearance order differs
# between splits — verify; encoding before splitting would be safer.
trainingSet = encode_class(trainingSet)
testSet = encode_class(testSet)
print('..................')
# Per-class, per-attribute (mean, std_dev) summaries — the "model".
info = MeanAndStdDevForClass(trainingSet)
print('info is: ')
for i in info:
    print(i)
    print(info[i])
print('***********')
# Show the per-class likelihoods for a single test example.
probs=calculateClassProbabilities(info,testSet[1])
print('probs of one test pattern')
print(probs)
print('*********************************************')
print('')
print('')
# Predict every test example and report overall accuracy (percent).
predictions = getPredictions(info, testSet)
accuracy = accuracy_rate(testSet, predictions)
print("Accuracy of your model is: ", accuracy)
print('')
print('')
print('*********************************************')
print('')
print('')
# Spot check: Gaussian density at x=5.1 with mean=4.983, std=0.3.
nn=calculateGaussianProbability(5.1,4.983,0.3)
print('nn is : ')
print(nn)
print('')
print('')
print('')
print('')