/
label_one.go
278 lines (264 loc) · 9.34 KB
/
label_one.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
package sticker
import (
"encoding/gob"
"fmt"
"io"
"log"
"sort"
)
// LabelOneParameters is the parameters for LabelOne.
type LabelOneParameters struct {
// ClassifierTrainerName is the used BinaryClassifierTrainer name.
ClassifierTrainerName string
// C is the penalty parameter for BinaryClassifierTrainer.
C float32
// Epsilon is the tolerance parameter for BinaryClassifierTrainer.
Epsilon float32
// T is the maximum number of the rounds, which is equal to the maximum number of the target labels.
T uint
}
// NewLabelOneParameters returns an LabelOneParameters initialized with the default values.
func NewLabelOneParameters() *LabelOneParameters {
return &LabelOneParameters{
ClassifierTrainerName: "L1Logistic_PrimalSGD",
C: float32(1.0),
Epsilon: float32(1.0e-05),
T: uint(100),
}
}
// LabelOne is the One-versus-Rest classifier for multi-label ranking.
// The t-th classifier (t = 1, ..., T) is the classifier for the top-t frequently occurring label.
type LabelOne struct {
// Params is the used LabelOneParameters.
Params *LabelOneParameters
// Biases is the bias slice used by splitters on each classifier.
Biases []float32
// Weights is the weight sparse matrix used by each classifier.
// This is the map from the feature key to the (roundID, the weight on the feature of #roundID splitter) slice.
// This data structure reduces the number of times that the classifier accesses the golang's map a lot.
WeightLists map[uint32]KeyValues32
// Labels is the label slice used in each classifier.
// The t-th label is the target label of the t-th classifier.
Labels LabelVector
// The following members are not required.
//
// Summaries is the summary object slice for each boosting round.
// The entries in this summary is considered to provide compact and useful information in best-effort, so this specification would be loose and rapidly changing.
Summaries []map[string]interface{}
}
// TrainLabelOne returns an trained LabelOne on the given dataset ds.
func TrainLabelOne(ds *Dataset, params *LabelOneParameters, debug *log.Logger) (*LabelOne, error) {
classifierTrainer, ok := BinaryClassifierTrainers[params.ClassifierTrainerName]
if !ok {
return nil, fmt.Errorf("unknown ClassifierTrainerName: %s", params.ClassifierTrainerName)
}
n := ds.Size()
biases, weightLists := []float32{}, make(map[uint32]KeyValues32)
labels := []uint32{}
summaries := []map[string]interface{}{}
// Collect the label frequencies.
labelFreqs := make(map[uint32]float32)
for _, yi := range ds.Y {
for _, label := range yi {
labelFreqs[label]++
}
}
T := params.T
if T > uint(len(labelFreqs)) {
T = uint(len(labelFreqs))
}
labelTopT := RankTopK(labelFreqs, T)
for t := uint(1); t <= T; t++ {
targetLabel := labelTopT[t-1]
// Assign each data point to the positive class i.f.f. it has the target label.
deltas := make([]bool, n)
deltaFreq := make(map[bool]int)
for i, yi := range ds.Y {
delta := false
for _, label := range yi {
if targetLabel == label {
delta = true
break
}
}
deltas[i] = delta
deltaFreq[delta]++
}
// Training the splitter.
if debug != nil {
debug.Printf("TrainLabelOne: t=%d: training the splitter on %d negative(s) and %d positive(s) ...", t, deltaFreq[false], deltaFreq[true])
}
splitter, err := classifierTrainer(ds.X, deltas, params.C, params.Epsilon, debug)
if err != nil {
return nil, fmt.Errorf("BinaryClassifierTrainer(%s): %s", params.ClassifierTrainerName, err)
}
tn, fn, fp, tp, _, _ := splitter.ReportPerformance(ds.X, deltas)
if debug != nil {
debug.Printf("TrainLabelOne: t=%d: trained the splitter (tn=%d, fn=%d, fp=%d, tp=%d) ...", t, tn, fn, fp, tp)
}
// Append the round information.
biases = append(biases, splitter.Bias)
for feature, value := range splitter.Weight {
weightLists[feature] = append(weightLists[feature], KeyValue32{uint32(t - 1), value})
}
labels = append(labels, targetLabel)
summary := make(map[string]interface{})
summary["splitPerf"] = map[string]int{"tn": int(tn), "fn": int(fn), "fp": int(fp), "tp": int(tp)}
summaries = append(summaries, summary)
}
return &LabelOne{
Params: params,
Biases: biases,
WeightLists: weightLists,
Labels: labels,
Summaries: summaries,
}, nil
}
// DecodeLabelOneWithGobDecoder decodes LabelOne using decoder.
//
// This function returns an error in decoding.
func DecodeLabelOneWithGobDecoder(model *LabelOne, decoder *gob.Decoder) error {
model.Params = &LabelOneParameters{}
if err := decoder.Decode(model.Params); err != nil {
return fmt.Errorf("DecodeLabelOne: Params: %s", err)
}
if err := decoder.Decode(&model.Biases); err != nil {
return fmt.Errorf("DecodeLabelOne: Biases: %s", err)
}
var lenWeightLists int
if err := decoder.Decode(&lenWeightLists); err != nil {
return fmt.Errorf("DecodeLabelOne: len(WeightLists): %s", err)
}
model.WeightLists = make(map[uint32]KeyValues32)
for i := 0; i < lenWeightLists; i++ {
var feature uint32
if err := decoder.Decode(&feature); err != nil {
return fmt.Errorf("DecodeLabelOne: #%d WeightList: %s", i, err)
}
var weightList KeyValues32
if err := decoder.Decode(&weightList); err != nil {
return fmt.Errorf("DecodeLabelOne: #%d WeightList: %s", i, err)
}
model.WeightLists[feature] = weightList
}
if err := decoder.Decode(&model.Labels); err != nil {
return fmt.Errorf("DecodeLabelOne: Labels: %s", err)
}
if err := decoder.Decode(&model.Summaries); err != nil {
return fmt.Errorf("DecodeLabelOne: Summaries: %s", err)
}
return nil
}
// DecodeLabelOne decodes LabelOne from r.
// Directly passing *os.File used by a gob.Decoder to this function causes mysterious errors.
// Thus, if users use gob.Decoder, then they should call DecodeLabelOneWithGobDecoder.
//
// This function returns an error in decoding.
func DecodeLabelOne(model *LabelOne, r io.Reader) error {
return DecodeLabelOneWithGobDecoder(model, gob.NewDecoder(r))
}
// EncodeLabelOneWithGobEncoder decodes LabelOne using encoder.
//
// This function returns an error in decoding.
func EncodeLabelOneWithGobEncoder(model *LabelOne, encoder *gob.Encoder) error {
if err := encoder.Encode(model.Params); err != nil {
return fmt.Errorf("EncodeLabelOne: Params: %s", err)
}
if err := encoder.Encode(model.Biases); err != nil {
return fmt.Errorf("EncodeLabelOne: Biases: %s", err)
}
features := make([]int, 0, len(model.WeightLists))
for feature := range model.WeightLists {
features = append(features, int(feature))
}
sort.Ints(features)
if err := encoder.Encode(len(model.WeightLists)); err != nil {
return fmt.Errorf("EncodeLabelOne: len(WeightLists): %s", err)
}
for i, feature := range features {
if err := encoder.Encode(uint32(feature)); err != nil {
return fmt.Errorf("EncodeLabelOne: #%d WeightList: %s", i, err)
}
if err := encoder.Encode(model.WeightLists[uint32(feature)]); err != nil {
return fmt.Errorf("EncodeLabelOne: #%d WeightList: %s", i, err)
}
}
if err := encoder.Encode(model.Labels); err != nil {
return fmt.Errorf("EncodeLabelOne: Labels: %s", err)
}
if err := encoder.Encode(model.Summaries); err != nil {
return fmt.Errorf("EncodeLabelOne: Summaries: %s", err)
}
return nil
}
// EncodeLabelOne encodes LabelOne to w.
// Directly passing *os.File used by a gob.Encoder to this function causes mysterious errors.
// Thus, if users use gob.Encoder, then they should call EncodeLabelOneWithGobEncoder.
//
// This function returns an error in encoding.
func EncodeLabelOne(model *LabelOne, w io.Writer) error {
return EncodeLabelOneWithGobEncoder(model, gob.NewEncoder(w))
}
// GobEncode returns the error always, because users should encode large LabelOne objects with EncodeLabelOne.
func (model *LabelOne) GobEncode() ([]byte, error) {
return nil, fmt.Errorf("LabelOne should be encoded with EncodeLabelOne")
}
// Nrounds return the number of the rounds.
func (model *LabelOne) Nrounds() uint {
return uint(len(model.Labels))
}
// Predict returns the top-K predicted labels for the given data entry x with the first T rounds.
func (model *LabelOne) Predict(x FeatureVector, K uint, T uint) LabelVector {
if T > model.Nrounds() {
T = model.Nrounds()
}
z := make([]float32, T)
for _, xpair := range x {
for _, weightpair := range model.WeightLists[xpair.Key] {
if uint(weightpair.Key) >= T {
break
}
z[weightpair.Key] += weightpair.Value * xpair.Value
}
}
y := make(map[uint32]float32)
for t, zt := range z {
zt += model.Biases[t]
y[model.Labels[t]] += zt
}
return RankTopK(y, K)
}
// PredictAll returns the slice of the top-K predicted labels for each data entry in X with the first T rounds.
func (model *LabelOne) PredictAll(X FeatureVectors, K uint, T uint) LabelVectors {
Y := make(LabelVectors, 0, len(X))
for _, xi := range X {
Y = append(Y, model.Predict(xi, K, T))
}
return Y
}
// Prune returns the pruned LabelOne which has at most T rounds.
func (model *LabelOne) Prune(T uint) *LabelOne {
if T > model.Nrounds() {
T = model.Nrounds()
}
newModel := &LabelOne{
Params: model.Params,
Biases: model.Biases[:T],
WeightLists: make(map[uint32]KeyValues32),
Labels: model.Labels[:T],
Summaries: model.Summaries[:T],
}
for feature, weightList := range model.WeightLists {
l := 0
for _, weightpair := range weightList {
if weightpair.Key > uint32(T) {
break
}
l++
}
if l > 0 {
newModel.WeightLists[feature] = weightList[:l]
}
}
return newModel
}