This repository has been archived by the owner on Mar 18, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
counter.go
199 lines (175 loc) · 4.78 KB
/
counter.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// Package hashcounter provides an efficient way to count the number of
// occurences of unique integers. The exposed functions are not thread-safe.
//
// Do not use this package if you cannot tolerate any collisions. By default
// the key will be determined by calling xxhash.Sum64. You can provide your
// own hash function as long as it returns a uint64 (like crc64, murmur3, etc).
// Since there is some hashing involved, there might be collisions. You've been
// warned. If your byte slices can safely fit in a uint64 without losing any
// bits then feel free to send a custom hash function using something like
// binary.Uvarint.
//
// You can initialize a new C using New or just using new(hashcounter.C).
package hashcounter
import (
"bytes"
"encoding/binary"
"errors"
"fmt"
"github.com/cespare/xxhash"
)
const (
part1Size = 16
part1Bits = (1 << part1Size) - 1
idSize = 64 - part1Size
idBits = (1 << idSize) - 1
)
// C holds an array of unique values and an associative count
type C struct {
arr [1 << part1Size][]uint64
hash func([]byte) uint64
}
// New returns a new instance of C
func New() *C {
return new(C)
}
// NewWithHash returns a new instance of C with the provided hash function
func NewWithHash(fn func([]byte) uint64) *C {
return &C{
hash: fn,
}
}
// Key returns the uint64 key for the given bytes
func (m *C) Key(k []byte) uint64 {
if m.hash != nil {
return m.hash(k)
}
return xxhash.Sum64(k)
}
func (m *C) loc(k uint64) (uint16, uint64) {
return uint16(k >> (64 - part1Size)), k & idBits
}
func (m *C) add(p1 uint16, id uint64, v uint16) {
for i := range m.arr[p1] {
if id == m.arr[p1][i]&idBits {
v64 := m.arr[p1][i]>>idSize + uint64(v)
m.arr[p1][i] = v64<<idSize | id
return
}
}
m.arr[p1] = append(m.arr[p1], id+uint64(v)<<idSize)
}
// Add adds the value to the given bytes
func (m *C) Add(b []byte, v uint16) {
p1, id := m.loc(m.Key(b))
m.add(p1, id, v)
}
// Get returns the value of the given bytes and a boolean if it was found
func (m *C) Get(b []byte) (uint16, bool) {
return m.GetKey(m.Key(b))
}
// GetKey takes a key rather than bytes but otherwise behaves like Get
func (m *C) GetKey(k uint64) (uint16, bool) {
p1, id := m.loc(k)
for i := range m.arr[p1] {
if id == m.arr[p1][i]&idBits {
return uint16(m.arr[p1][i] >> idSize), true
}
}
return 0, false
}
// Range calls the given function for every value in the map and continues
// looping until the given bool. The returned key is going to be the result
// of Key(bytes). If you want the key to be reversable, you must pass a hash
// function to NewWithHash that allows you to reverse the operation.
func (m *C) Range(f func(key uint64, value uint16) bool) {
var key uint64
for p1 := range m.arr {
for _, idv := range m.arr[p1] {
key = uint64(p1)<<(64-part1Size) | idv&idBits
if !f(key, uint16(idv>>idSize)) {
return
}
}
}
}
// Len returns a count of all of the keys
func (m *C) Len() int {
l := 0
for p1 := range m.arr {
l += len(m.arr[p1])
}
return l
}
// Reset removes all of the keys and returns C to it's empty state
func (m *C) Reset() {
for p1 := range m.arr {
m.arr[p1] = nil
}
}
// MarshalBinary implements the encoding.BinaryMarshaler interface.
func (m *C) MarshalBinary() ([]byte, error) {
buf := new(bytes.Buffer)
buf.Write([]byte{1}) // version
b := make([]byte, binary.MaxVarintLen64)
for p1 := range m.arr {
l := len(m.arr[p1])
if l < 1 {
continue
}
// if part1Size changes then we'll need to change this
binary.BigEndian.PutUint16(b, uint16(p1))
buf.Write(b[:2])
i := binary.PutUvarint(b, uint64(l))
buf.Write(b[:i])
for _, idv := range m.arr[p1] {
binary.BigEndian.PutUint64(b, idv)
buf.Write(b[:8])
}
}
return buf.Bytes(), nil
}
// UnmarshalBinary implements the encoding.BinaryUnmarshaler interface.
func (m *C) UnmarshalBinary(b []byte) error {
if len(b) < 1 {
return errors.New("empty byte slice")
}
if b[0] != 1 {
return fmt.Errorf("unexpected version: %d", b[0])
}
b = b[1:]
for len(b) > 0 {
p1 := binary.BigEndian.Uint16(b)
b = b[2:]
l, res := binary.Uvarint(b)
if res < 1 {
return fmt.Errorf("error reading length with Uvarint: %d", res)
}
b = b[res:]
m.arr[p1] = make([]uint64, l)
for i := range m.arr[p1] {
m.arr[p1][i] = binary.BigEndian.Uint64(b)
b = b[8:]
}
}
return nil
}
// Merge adds every key from the sent C to the called on C. This assumes the
// hash functions are the same.
func (m *C) Merge(n *C) {
for p1 := range n.arr {
if len(n.arr[p1]) < 1 {
continue
}
// if the array is empty on m then just copy n
if len(m.arr[p1]) == 0 {
m.arr[p1] = make([]uint64, len(n.arr[p1]))
copy(m.arr[p1], n.arr[p1])
continue
}
// otherwise loop over each n value and add it to m
for _, idv := range n.arr[p1] {
m.add(uint16(p1), idv&idBits, uint16(idv>>idSize))
}
}
}