-
Notifications
You must be signed in to change notification settings - Fork 10
/
aggregate.rb
286 lines (229 loc) · 7.09 KB
/
aggregate.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
# Implements aggregate statistics and maintains
# configurable histogram for a set of given samples. Convenient for tracking
# high throughput data.
class Aggregate
#The current number of samples
attr_reader :count
#The maximum sample value
attr_reader :max
#The minimum samples value
attr_reader :min
#The sum of all samples
attr_reader :sum
#The number of samples falling below the lowest valued histogram bucket
attr_reader :outliers_low
#The number of samples falling above the highest valued histogram bucket
attr_reader :outliers_high
# The number of buckets in the binary logarithmic histogram (low => 2**0, high => 2**@@LOG_BUCKETS)
@@LOG_BUCKETS = 128
# Create a new Aggregate that maintains a binary logarithmic histogram
# by default. Specifying values for low, high, and width configures
# the aggregate to maintain a linear histogram with (high - low)/width buckets
def initialize (low=nil, high=nil, width=nil)
@count = 0
@sum = 0.0
@sum2 = 0.0
@outliers_low = 0
@outliers_high = 0
# If the user asks we maintain a linear histogram where
# values in the range [low, high) are bucketed in multiples
# of width
if (nil != low && nil != high && nil != width)
#Validate linear specification
if high <= low
raise ArgumentError, "High bucket must be > Low bucket"
end
if high - low < width
raise ArgumentError, "Histogram width must be <= histogram range"
end
if 0 != (high - low).modulo(width)
raise ArgumentError, "Histogram range (high - low) must be a multiple of width"
end
@low = low
@high = high
@width = width
else
@low = 1
@width = nil
@high = to_bucket(@@LOG_BUCKETS - 1)
end
#Initialize all buckets to 0
@buckets = Array.new(bucket_count, 0)
end
# Include a sample in the aggregate
def << data
# Update min/max
if 0 == @count
@min = data
@max = data
else
@max = data if data > @max
@min = data if data < @min
end
# Update the running info
@count += 1
@sum += data
@sum2 += (data * data)
# Update the bucket
@buckets[to_index(data)] += 1 unless outlier?(data)
end
#The current average of all samples
def mean
@sum / @count
end
#Calculate the standard deviation
def std_dev
Math.sqrt((@sum2.to_f - ((@sum.to_f * @sum.to_f)/@count.to_f)) / (@count.to_f - 1))
end
# Combine two aggregates
#def +(b)
# a = self
# c = Aggregate.new
# c.count = a.count + b.count
#end
#Generate a pretty-printed ASCII representation of the histogram
def to_s(columns=nil)
#default to an 80 column terminal, don't support < 80 for now
if nil == columns
columns = 80
else
raise ArgumentError if columns < 80
end
#Find the largest bucket and create an array of the rows we intend to print
disp_buckets = Array.new
max_count = 0
total = 0
@buckets.each_with_index do |count, idx|
next if 0 == count
max_count = [max_count, count].max
disp_buckets << [idx, to_bucket(idx), count]
total += count
end
#XXX: Better to print just header --> footer
return "Empty histogram" if 0 == disp_buckets.length
#Figure out how wide the value and count columns need to be based on their
#largest respective numbers
value_str = "value"
count_str = "count"
total_str = "Total"
value_width = [disp_buckets.last[1].to_s.length, value_str.length].max
value_width = [value_width, total_str.length].max
count_width = [total.to_s.length, count_str.length].max
max_bar_width = columns - (value_width + " |".length + "| ".length + count_width)
#Determine the value of a '@'
weight = [max_count.to_f/max_bar_width.to_f, 1.0].max
#format the header
histogram = sprintf("%#{value_width}s |", value_str)
max_bar_width.times { histogram << "-"}
histogram << sprintf("| %#{count_width}s\n", count_str)
# We denote empty buckets with a '~'
def skip_row(value_width)
sprintf("%#{value_width}s ~\n", " ")
end
#Loop through each bucket to be displayed and output the correct number
prev_index = disp_buckets[0][0] - 1
disp_buckets.each do |x|
#Denote skipped empty buckets with a ~
histogram << skip_row(value_width) unless prev_index == x[0] - 1
prev_index = x[0]
#Add the value
row = sprintf("%#{value_width}d |", x[1])
#Add the bar
bar_size = (x[2]/weight).to_i
bar_size.times { row += "@"}
(max_bar_width - bar_size).times { row += " " }
#Add the count
row << sprintf("| %#{count_width}d\n", x[2])
#Append the finished row onto the histogram
histogram << row
end
#End the table
histogram << skip_row(value_width) if disp_buckets.last[0] != bucket_count-1
histogram << sprintf("%#{value_width}s", "Total")
histogram << " |"
max_bar_width.times {histogram << "-"}
histogram << "| "
histogram << sprintf("%#{count_width}d\n", total)
end
#Iterate through each bucket in the histogram regardless of
#its contents
def each
@buckets.each_with_index do |count, index|
yield(to_bucket(index), count)
end
end
#Iterate through only the buckets in the histogram that contain
#samples
def each_nonzero
@buckets.each_with_index do |count, index|
yield(to_bucket(index), count) if count != 0
end
end
private
def linear?
nil != @width
end
def outlier? (data)
if data < @low
@outliers_low += 1
elsif data >= @high
@outliers_high += 1
else
return false
end
end
def bucket_count
if linear?
return (@high-@low)/@width
else
return @@LOG_BUCKETS
end
end
def to_bucket(index)
if linear?
return @low + (index * @width)
else
return 2**(index)
end
end
def right_bucket? index, data
# check invariant
raise unless linear?
bucket = to_bucket(index)
#It's the right bucket if data falls between bucket and next bucket
bucket <= data && data < bucket + @width
end
=begin
def find_bucket(lower, upper, target)
#Classic binary search
return upper if right_bucket?(upper, target)
# Cut the search range in half
middle = (upper/2).to_i
# Determine which half contains our value and recurse
if (to_bucket(middle) >= target)
return find_bucket(lower, middle, target)
else
return find_bucket(middle, upper, target)
end
end
=end
# A data point is added to the bucket[n] where the data point
# is less than the value represented by bucket[n], but greater
# than the value represented by bucket[n+1]
def to_index (data)
# basic case is simple
return log2(data).to_i if !linear?
# Search for the right bucket in the linear case
@buckets.each_with_index do |count, idx|
return idx if right_bucket?(idx, data)
end
#find_bucket(0, bucket_count-1, data)
#Should not get here
raise "#{data}"
end
# log2(x) returns j, | i = j-1 and 2**i <= data < 2**j
@@LOG2_DIVEDEND = Math.log(2)
def log2( x )
Math.log(x) / @@LOG2_DIVEDEND
end
end