-
Notifications
You must be signed in to change notification settings - Fork 152
/
histogram_quantile.go
324 lines (289 loc) · 8.74 KB
/
histogram_quantile.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
package universe
import (
"fmt"
"math"
"sort"
"github.com/influxdata/flux"
"github.com/influxdata/flux/execute"
"github.com/influxdata/flux/plan"
"github.com/influxdata/flux/semantic"
"github.com/pkg/errors"
)
const HistogramQuantileKind = "histogramQuantile"
const DefaultUpperBoundColumnLabel = "le"
type HistogramQuantileOpSpec struct {
Quantile float64 `json:"quantile"`
CountColumn string `json:"countColumn"`
UpperBoundColumn string `json:"upperBoundColumn"`
ValueColumn string `json:"valueColumn"`
MinValue float64 `json:"minValue"`
}
func init() {
histogramQuantileSignature := flux.FunctionSignature(map[string]semantic.PolyType{
"quantile": semantic.Float,
"countColumn": semantic.String,
"upperBoundColumn": semantic.String,
"valueColumn": semantic.String,
"minValue": semantic.Float,
}, nil)
flux.RegisterPackageValue("universe", HistogramQuantileKind, flux.FunctionValue(HistogramQuantileKind, createHistogramQuantileOpSpec, histogramQuantileSignature))
flux.RegisterOpSpec(HistogramQuantileKind, newHistogramQuantileOp)
plan.RegisterProcedureSpec(HistogramQuantileKind, newHistogramQuantileProcedure, HistogramQuantileKind)
execute.RegisterTransformation(HistogramQuantileKind, createHistogramQuantileTransformation)
}
func createHistogramQuantileOpSpec(args flux.Arguments, a *flux.Administration) (flux.OperationSpec, error) {
if err := a.AddParentFromArgs(args); err != nil {
return nil, err
}
s := new(HistogramQuantileOpSpec)
q, err := args.GetRequiredFloat("quantile")
if err != nil {
return nil, err
}
s.Quantile = q
if col, ok, err := args.GetString("countColumn"); err != nil {
return nil, err
} else if ok {
s.CountColumn = col
} else {
s.CountColumn = execute.DefaultValueColLabel
}
if col, ok, err := args.GetString("upperBoundColumn"); err != nil {
return nil, err
} else if ok {
s.UpperBoundColumn = col
} else {
s.UpperBoundColumn = DefaultUpperBoundColumnLabel
}
if col, ok, err := args.GetString("valueColumn"); err != nil {
return nil, err
} else if ok {
s.ValueColumn = col
} else {
s.ValueColumn = execute.DefaultValueColLabel
}
if min, ok, err := args.GetFloat("minValue"); err != nil {
return nil, err
} else if ok {
s.MinValue = min
}
return s, nil
}
func newHistogramQuantileOp() flux.OperationSpec {
return new(HistogramQuantileOpSpec)
}
func (s *HistogramQuantileOpSpec) Kind() flux.OperationKind {
return HistogramQuantileKind
}
type HistogramQuantileProcedureSpec struct {
plan.DefaultCost
Quantile float64 `json:"quantile"`
CountColumn string `json:"countColumn"`
UpperBoundColumn string `json:"upperBoundColumn"`
ValueColumn string `json:"valueColumn"`
MinValue float64 `json:"minValue"`
}
func newHistogramQuantileProcedure(qs flux.OperationSpec, a plan.Administration) (plan.ProcedureSpec, error) {
spec, ok := qs.(*HistogramQuantileOpSpec)
if !ok {
return nil, fmt.Errorf("invalid spec type %T", qs)
}
return &HistogramQuantileProcedureSpec{
Quantile: spec.Quantile,
CountColumn: spec.CountColumn,
UpperBoundColumn: spec.UpperBoundColumn,
ValueColumn: spec.ValueColumn,
MinValue: spec.MinValue,
}, nil
}
func (s *HistogramQuantileProcedureSpec) Kind() plan.ProcedureKind {
return HistogramQuantileKind
}
func (s *HistogramQuantileProcedureSpec) Copy() plan.ProcedureSpec {
ns := new(HistogramQuantileProcedureSpec)
*ns = *s
return ns
}
type histogramQuantileTransformation struct {
d execute.Dataset
cache execute.TableBuilderCache
spec HistogramQuantileProcedureSpec
}
type bucket struct {
count float64
upperBound float64
}
func createHistogramQuantileTransformation(id execute.DatasetID, mode execute.AccumulationMode, spec plan.ProcedureSpec, a execute.Administration) (execute.Transformation, execute.Dataset, error) {
s, ok := spec.(*HistogramQuantileProcedureSpec)
if !ok {
return nil, nil, fmt.Errorf("invalid spec type %T", spec)
}
cache := execute.NewTableBuilderCache(a.Allocator())
d := execute.NewDataset(id, mode, cache)
t := NewHistorgramQuantileTransformation(d, cache, s)
return t, d, nil
}
func NewHistorgramQuantileTransformation(
d execute.Dataset,
cache execute.TableBuilderCache,
spec *HistogramQuantileProcedureSpec,
) execute.Transformation {
return &histogramQuantileTransformation{
d: d,
cache: cache,
spec: *spec,
}
}
func (t histogramQuantileTransformation) RetractTable(id execute.DatasetID, key flux.GroupKey) error {
// TODO
return nil
}
func (t histogramQuantileTransformation) Process(id execute.DatasetID, tbl flux.Table) error {
builder, created := t.cache.TableBuilder(tbl.Key())
if !created {
return fmt.Errorf("histogramQuantile found duplicate table with key: %v", tbl.Key())
}
if err := execute.AddTableKeyCols(tbl.Key(), builder); err != nil {
return err
}
valueIdx, err := builder.AddCol(flux.ColMeta{
Label: t.spec.ValueColumn,
Type: flux.TFloat,
})
if err != nil {
return err
}
countIdx := execute.ColIdx(t.spec.CountColumn, tbl.Cols())
if countIdx < 0 {
return fmt.Errorf("table is missing count column %q", t.spec.CountColumn)
}
if tbl.Cols()[countIdx].Type != flux.TFloat {
return fmt.Errorf("count column %q must be of type float", t.spec.CountColumn)
}
upperBoundIdx := execute.ColIdx(t.spec.UpperBoundColumn, tbl.Cols())
if upperBoundIdx < 0 {
return fmt.Errorf("table is missing upper bound column %q", t.spec.UpperBoundColumn)
}
if tbl.Cols()[upperBoundIdx].Type != flux.TFloat {
return fmt.Errorf("upper bound column %q must be of type float", t.spec.UpperBoundColumn)
}
// Read buckets
var cdf []bucket
sorted := true //track if the cdf was naturally sorted
if err := tbl.Do(func(cr flux.ColReader) error {
offset := len(cdf)
// Grow cdf by number of rows
l := offset + cr.Len()
if cap(cdf) < l {
cpy := make([]bucket, l, l*2)
// Copy existing buckets to new slice
copy(cpy, cdf)
cdf = cpy
} else {
cdf = cdf[:l]
}
for i := 0; i < cr.Len(); i++ {
curr := i + offset
prev := curr - 1
b := bucket{}
if vs := cr.Floats(countIdx); vs.IsValid(i) {
b.count = vs.Value(i)
} else {
return fmt.Errorf("unexpected null in the countColumn")
}
if vs := cr.Floats(upperBoundIdx); vs.IsValid(i) {
b.upperBound = vs.Value(i)
} else {
return fmt.Errorf("unexpected null in the upperBoundColumn")
}
cdf[curr] = b
if prev >= 0 {
sorted = sorted && cdf[prev].upperBound <= cdf[curr].upperBound
}
}
return nil
}); err != nil {
return err
}
if !sorted {
sort.Slice(cdf, func(i, j int) bool {
return cdf[i].upperBound < cdf[j].upperBound
})
}
q, err := t.computeQuantile(cdf)
if err != nil {
return err
}
if err := execute.AppendKeyValues(tbl.Key(), builder); err != nil {
return err
}
if err := builder.AppendFloat(valueIdx, q); err != nil {
return err
}
return nil
}
func (t *histogramQuantileTransformation) computeQuantile(cdf []bucket) (float64, error) {
if len(cdf) == 0 {
return 0, errors.New("histogram is empty")
}
// Find rank index and check counts are monotonic
prevCount := 0.0
totalCount := cdf[len(cdf)-1].count
rank := t.spec.Quantile * totalCount
rankIdx := -1
for i, b := range cdf {
if b.count < prevCount {
return 0, errors.New("histogram records counts are not monotonic")
}
prevCount = b.count
if rank >= b.count {
rankIdx = i
}
}
var (
lowerCount,
lowerBound,
upperCount,
upperBound float64
)
switch rankIdx {
case -1:
// Quantile is below the lowest upper bound, interpolate using the min value
lowerCount = 0
lowerBound = t.spec.MinValue
upperCount = cdf[0].count
upperBound = cdf[0].upperBound
case len(cdf) - 1:
// Quantile is above the highest upper bound, simply return it as it must be finite
return cdf[len(cdf)-1].upperBound, nil
default:
lowerCount = cdf[rankIdx].count
lowerBound = cdf[rankIdx].upperBound
upperCount = cdf[rankIdx+1].count
upperBound = cdf[rankIdx+1].upperBound
}
if rank == lowerCount {
// No need to interpolate
return lowerBound, nil
}
if math.IsInf(lowerBound, -1) {
// We cannot interpolate with infinity
return upperBound, nil
}
if math.IsInf(upperBound, 1) {
// We cannot interpolate with infinity
return lowerBound, nil
}
// Compute quantile using linear interpolation
scale := (rank - lowerCount) / (upperCount - lowerCount)
return lowerBound + (upperBound-lowerBound)*scale, nil
}
func (t histogramQuantileTransformation) UpdateWatermark(id execute.DatasetID, mark execute.Time) error {
return t.d.UpdateWatermark(mark)
}
func (t histogramQuantileTransformation) UpdateProcessingTime(id execute.DatasetID, pt execute.Time) error {
return t.d.UpdateProcessingTime(pt)
}
func (t histogramQuantileTransformation) Finish(id execute.DatasetID, err error) {
t.d.Finish(err)
}