multinomial.go

package bengal

import "math"

// TrainMultinomial fits one multinomial naive Bayes classifier per output
// feature, using add-one (Laplace) smoothing and log-space probabilities.
// Each row of input is a token sequence; the matching row of output holds
// one class label per feature.
func TrainMultinomial(input, output [][]string) NaiveBayesModel {
    features := len(output[0])
    vocabulary := unique(flatten2d(input))
    classes := make([][]string, features)
    prior := make([]map[string]float64, features)
    condprob := make([]map[string]map[string]float64, features)
    n := len(input)
    nVocabulary := len(vocabulary)
    for f := 0; f < features; f++ {
        // Collect the class labels observed for this feature.
        featureClasses := make([]string, len(output))
        for i, y := range output {
            featureClasses[i] = y[f]
        }
        classes[f] = unique(featureClasses)
        // Estimate the prior and conditional probabilities for this feature.
        prior[f] = make(map[string]float64)
        condprob[f] = make(map[string]map[string]float64)
        for _, class := range classes[f] {
            // Gather the training examples labeled with this class.
            var examplesInClass [][]string
            for i, x := range input {
                if output[i][f] == class {
                    examplesInClass = append(examplesInClass, x)
                }
            }
            // Log-prior from the raw class frequency. Every class seen here
            // occurs at least once, so the argument to Log is positive.
            nClass := len(examplesInClass)
            prior[f][class] = math.Log(float64(nClass) / float64(n))
            // Count token occurrences across this class's examples.
            classTokens := flatten2d(examplesInClass)
            nClassTokens := len(classTokens)
            tokenCounts := make(map[string]int)
            for _, token := range classTokens {
                tokenCounts[token]++
            }
            // Log-conditional probabilities with add-one smoothing:
            // P(token|class) = (count(token, class) + 1) / (nClassTokens + |V|).
            // Smoothing keeps the estimate positive even for unseen tokens,
            // so taking the plain Log is safe.
            for _, token := range vocabulary {
                if _, ok := condprob[f][token]; !ok {
                    condprob[f][token] = make(map[string]float64)
                }
                smoothed := float64(1+tokenCounts[token]) / float64(nClassTokens+nVocabulary)
                condprob[f][token][class] = math.Log(smoothed)
            }
        }
    }
    return NaiveBayesModel{
        vocabulary: vocabulary,
        Classes:    classes,
        Prior:      prior,
        CondProb:   condprob,
        Input:      input,
        Output:     output,
    }
}
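
TrainMultinomial relies on a NaiveBayesModel struct and on the helpers unique, flatten2d, and argmax, all defined elsewhere in the package. As a reading aid, here is a minimal sketch of what those definitions could look like, inferred purely from how they are used in this file; it assumes they sit in the same package, so the math import above is available. The actual package may implement them differently.

type NaiveBayesModel struct {
    vocabulary []string
    Classes    [][]string
    Prior      []map[string]float64
    CondProb   []map[string]map[string]float64
    Input      [][]string
    Output     [][]string
}

// unique returns the distinct strings in xs, preserving first-seen order.
func unique(xs []string) []string {
    seen := make(map[string]bool)
    var out []string
    for _, x := range xs {
        if !seen[x] {
            seen[x] = true
            out = append(out, x)
        }
    }
    return out
}

// flatten2d concatenates the rows of a 2-D string slice into one slice.
func flatten2d(xss [][]string) []string {
    var out []string
    for _, xs := range xss {
        out = append(out, xs...)
    }
    return out
}

// argmax returns the key with the highest score (ties broken arbitrarily,
// since Go map iteration order is unspecified).
func argmax(scores map[string]float64) string {
    best, bestScore := "", math.Inf(-1)
    for k, v := range scores {
        if v > bestScore {
            best, bestScore = k, v
        }
    }
    return best
}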

// PredictMultinomial returns the most likely class for each output feature,
// scoring every candidate class by its log-prior plus the log-conditional
// probabilities of the tokens in x. Tokens that were not in the training
// vocabulary are skipped.
func (model NaiveBayesModel) PredictMultinomial(x []string) []string {
    ret := make([]string, len(model.Classes))
    for f, feature := range model.Classes {
        scores := make(map[string]float64)
        for _, class := range feature {
            scores[class] = model.Prior[f][class]
            for _, token := range x {
                if condprob, ok := model.CondProb[f][token]; ok {
                    scores[class] += condprob[class]
                }
            }
        }
        ret[f] = argmax(scores)
    }
    return ret
}
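
To see both functions end to end, here is a hypothetical driver program. The import path is a placeholder for wherever the bengal package actually lives, and the training data is invented purely for illustration; only TrainMultinomial and PredictMultinomial come from the file above.

package main

import (
    "fmt"

    bengal "example.com/bengal" // placeholder import path
)

func main() {
    // Each example carries two output features: topic and sentiment.
    input := [][]string{
        {"great", "fast", "cpu"},
        {"slow", "cpu", "bad"},
        {"great", "plot", "film"},
        {"bad", "plot", "film"},
    }
    output := [][]string{
        {"tech", "pos"},
        {"tech", "neg"},
        {"movies", "pos"},
        {"movies", "neg"},
    }
    model := bengal.TrainMultinomial(input, output)
    fmt.Println(model.PredictMultinomial([]string{"great", "film"}))
    // With this toy data the model prints: [movies pos]
}

Because every score is a sum of logs rather than a product of raw probabilities, long documents cannot underflow to zero, which is the usual reason naive Bayes implementations work in log space.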