store: use loser trees (thanos-io#7304)

Remove a long-standing TODO item in the code - let's use the great loser tree implementation by Bryan. It is faster than the heap because less comparisons are needed. Should be a nice improvement given that the heap is used in a lot of hot paths. Since Prometheus also uses this library, it's tricky to import the "any" version. I tried doing bboreham/go-loser#3 but it's still impossible to do that. Let's just copy/paste the code, it's not a lot. Bench: ``` goos: linux goarch: amd64 pkg: github.com/thanos-io/thanos/pkg/store cpu: Intel(R) Core(TM) i9-10885H CPU @ 2.40GHz │ oldkway │ newkway │ │ sec/op │ sec/op vs base │ KWayMerge-16 2.292m ± 3% 2.075m ± 15% -9.47% (p=0.023 n=10) │ oldkway │ newkway │ │ B/op │ B/op vs base │ KWayMerge-16 1.553Mi ± 0% 1.585Mi ± 0% +2.04% (p=0.000 n=10) │ oldkway │ newkway │ │ allocs/op │ allocs/op vs base │ KWayMerge-16 27.26k ± 0% 26.27k ± 0% -3.66% (p=0.000 n=10) ``` Signed-off-by: Giedrius Statkevičius <giedrius.statkevicius@vinted.com>
jnyi · Jun 1, 2024 · 1c57c98 · 1c57c98
1 parent 0f12888
commit 1c57c98
Show file tree

Hide file tree

Showing 9 changed files with 400 additions and 101 deletions.
diff --git a/pkg/losertree/tree.go b/pkg/losertree/tree.go
@@ -0,0 +1,161 @@
+// Copyright (c) The Thanos Authors.
+// Licensed under the Apache License 2.0.
+
+// Original version copyright Bryan Boreham, 2024.
+// https://github.com/bboreham/go-loser/tree/any.
+// Loser tree, from https://en.wikipedia.org/wiki/K-way_merge_algorithm#Tournament_Tree
+
+package losertree
+
+type Sequence interface {
+	Next() bool // Advances and returns true if there is a value at this new position.
+}
+
+func New[E any, S Sequence](sequences []S, maxVal E, at func(S) E, less func(E, E) bool, close func(S)) *Tree[E, S] {
+	nSequences := len(sequences)
+	t := Tree[E, S]{
+		maxVal: maxVal,
+		at:     at,
+		less:   less,
+		close:  close,
+		nodes:  make([]node[E, S], nSequences*2),
+	}
+	for i, s := range sequences {
+		t.nodes[i+nSequences].items = s
+		t.moveNext(i + nSequences) // Must call Next on each item so that At() has a value.
+	}
+	if nSequences > 0 {
+		t.nodes[0].index = -1 // flag to be initialized on first call to Next().
+	}
+	return &t
+}
+
+// Call the close function on all sequences that are still open.
+func (t *Tree[E, S]) Close() {
+	for _, e := range t.nodes[len(t.nodes)/2 : len(t.nodes)] {
+		if e.index == -1 {
+			continue
+		}
+		t.close(e.items)
+	}
+}
+
+// A loser tree is a binary tree laid out such that nodes N and N+1 have parent N/2.
+// We store M leaf nodes in positions M...2M-1, and M-1 internal nodes in positions 1..M-1.
+// Node 0 is a special node, containing the winner of the contest.
+type Tree[E any, S Sequence] struct {
+	maxVal E
+	at     func(S) E
+	less   func(E, E) bool
+	close  func(S) // Called when Next() returns false.
+	nodes  []node[E, S]
+}
+
+type node[E any, S Sequence] struct {
+	index int // This is the loser for all nodes except the 0th, where it is the winner.
+	value E   // Value copied from the loser node, or winner for node 0.
+	items S   // Only populated for leaf nodes.
+}
+
+func (t *Tree[E, S]) moveNext(index int) bool {
+	n := &t.nodes[index]
+	if n.items.Next() {
+		n.value = t.at(n.items)
+		return true
+	}
+	t.close(n.items) // Next() returned false; close it and mark as finished.
+	n.value = t.maxVal
+	n.index = -1
+	return false
+}
+
+func (t *Tree[E, S]) Winner() S {
+	return t.nodes[t.nodes[0].index].items
+}
+
+func (t *Tree[E, S]) At() E {
+	return t.nodes[0].value
+}
+
+func (t *Tree[E, S]) Next() bool {
+	nodes := t.nodes
+	if len(nodes) == 0 {
+		return false
+	}
+	if nodes[0].index == -1 { // If tree has not been initialized yet, do that.
+		t.initialize()
+		return nodes[nodes[0].index].index != -1
+	}
+	if nodes[nodes[0].index].index == -1 { // already exhausted.
+		return false
+	}
+	t.moveNext(nodes[0].index)
+	t.replayGames(nodes[0].index)
+	return nodes[nodes[0].index].index != -1
+}
+
+// Current winner has been advanced independently; fix up the loser tree.
+func (t *Tree[E, S]) Fix(closed bool) {
+	nodes := t.nodes
+	cur := &nodes[nodes[0].index]
+	if closed {
+		cur.value = t.maxVal
+		cur.index = -1
+	} else {
+		cur.value = t.at(cur.items)
+	}
+	t.replayGames(nodes[0].index)
+}
+
+func (t *Tree[E, S]) IsEmpty() bool {
+	nodes := t.nodes
+	if nodes[0].index == -1 { // If tree has not been initialized yet, do that.
+		t.initialize()
+	}
+	return nodes[nodes[0].index].index == -1
+}
+
+func (t *Tree[E, S]) initialize() {
+	winner := t.playGame(1)
+	t.nodes[0].index = winner
+	t.nodes[0].value = t.nodes[winner].value
+}
+
+// Find the winner at position pos; if it is a non-leaf node, store the loser.
+// pos must be >= 1 and < len(t.nodes).
+func (t *Tree[E, S]) playGame(pos int) int {
+	nodes := t.nodes
+	if pos >= len(nodes)/2 {
+		return pos
+	}
+	left := t.playGame(pos * 2)
+	right := t.playGame(pos*2 + 1)
+	var loser, winner int
+	if t.less(nodes[left].value, nodes[right].value) {
+		loser, winner = right, left
+	} else {
+		loser, winner = left, right
+	}
+	nodes[pos].index = loser
+	nodes[pos].value = nodes[loser].value
+	return winner
+}
+
+// Starting at pos, which is a winner, re-consider all values up to the root.
+func (t *Tree[E, S]) replayGames(pos int) {
+	nodes := t.nodes
+	winningValue := nodes[pos].value
+	for n := parent(pos); n != 0; n = parent(n) {
+		node := &nodes[n]
+		if t.less(node.value, winningValue) {
+			// Record pos as the loser here, and the old loser is the new winner.
+			node.index, pos = pos, node.index
+			node.value, winningValue = winningValue, node.value
+		}
+	}
+	// pos is now the winner; store it in node 0.
+	nodes[0].index = pos
+	nodes[0].value = winningValue
+}
+
+func parent(i int) int { return i >> 1 }
diff --git a/pkg/losertree/tree_test.go b/pkg/losertree/tree_test.go
@@ -0,0 +1,124 @@
+// Copyright (c) The Thanos Authors.
+// Licensed under the Apache License 2.0.
+
+// Original version copyright Bryan Boreham, 2024.
+// https://github.com/bboreham/go-loser/tree/any.
+package losertree
+
+import (
+	"math"
+	"testing"
+)
+
+type List struct {
+	list []uint64
+	cur  uint64
+}
+
+func NewList(list ...uint64) *List {
+	return &List{list: list}
+}
+
+func (it *List) At() uint64 {
+	return it.cur
+}
+
+func (it *List) Next() bool {
+	if len(it.list) > 0 {
+		it.cur = it.list[0]
+		it.list = it.list[1:]
+		return true
+	}
+	it.cur = 0
+	return false
+}
+
+func (it *List) Seek(val uint64) bool {
+	for it.cur < val && len(it.list) > 0 {
+		it.cur = it.list[0]
+		it.list = it.list[1:]
+	}
+	return len(it.list) > 0
+}
+
+func checkIterablesEqual[E any, S1 Sequence, S2 Sequence](t *testing.T, a S1, b S2, at1 func(S1) E, at2 func(S2) E, less func(E, E) bool) {
+	t.Helper()
+	count := 0
+	for a.Next() {
+		count++
+		if !b.Next() {
+			t.Fatalf("b ended before a after %d elements", count)
+		}
+		if less(at1(a), at2(b)) || less(at2(b), at1(a)) {
+			t.Fatalf("position %d: %v != %v", count, at1(a), at2(b))
+		}
+	}
+	if b.Next() {
+		t.Fatalf("a ended before b after %d elements", count)
+	}
+}
+
+var testCases = []struct {
+	name string
+	args []*List
+	want *List
+}{
+	{
+		name: "empty input",
+		want: NewList(),
+	},
+	{
+		name: "one list",
+		args: []*List{NewList(1, 2, 3, 4)},
+		want: NewList(1, 2, 3, 4),
+	},
+	{
+		name: "two lists",
+		args: []*List{NewList(3, 4, 5), NewList(1, 2)},
+		want: NewList(1, 2, 3, 4, 5),
+	},
+	{
+		name: "two lists, first empty",
+		args: []*List{NewList(), NewList(1, 2)},
+		want: NewList(1, 2),
+	},
+	{
+		name: "two lists, second empty",
+		args: []*List{NewList(1, 2), NewList()},
+		want: NewList(1, 2),
+	},
+	{
+		name: "two lists b",
+		args: []*List{NewList(1, 2), NewList(3, 4, 5)},
+		want: NewList(1, 2, 3, 4, 5),
+	},
+	{
+		name: "two lists c",
+		args: []*List{NewList(1, 3), NewList(2, 4, 5)},
+		want: NewList(1, 2, 3, 4, 5),
+	},
+	{
+		name: "three lists",
+		args: []*List{NewList(1, 3), NewList(2, 4), NewList(5)},
+		want: NewList(1, 2, 3, 4, 5),
+	},
+}
+
+func TestMerge(t *testing.T) {
+	at := func(s *List) uint64 { return s.At() }
+	less := func(a, b uint64) bool { return a < b }
+	at2 := func(s *Tree[uint64, *List]) uint64 { return s.Winner().At() }
+	for _, tt := range testCases {
+		t.Run(tt.name, func(t *testing.T) {
+			numCloses := 0
+			closeFn := func(_ *List) {
+				numCloses++
+			}
+			lt := New(tt.args, math.MaxUint64, at, less, closeFn)
+			checkIterablesEqual(t, tt.want, lt, at, at2, less)
+			if numCloses != len(tt.args) {
+				t.Errorf("Expected %d closes, got %d", len(tt.args), numCloses)
+			}
+		})
+	}
+}
diff --git a/pkg/store/bucket.go b/pkg/store/bucket.go
@@ -1650,13 +1650,8 @@ func (s *BucketStore) Series(req *storepb.SeriesRequest, seriesSrv storepb.Store
 
 	// Merge the sub-results from each selected block.
 	tracing.DoInSpan(ctx, "bucket_store_merge_all", func(ctx context.Context) {
-		defer func() {
-			for _, resp := range respSets {
-				resp.Close()
-			}
-		}()
 		begin := time.Now()
-		set := NewDedupResponseHeap(NewProxyResponseHeap(respSets...))
+		set := NewResponseDeduplicator(NewProxyResponseLoserTree(respSets...))
 		for set.Next() {
 			at := set.At()
 			warn := at.GetWarning()

diff --git a/pkg/store/proxy.go b/pkg/store/proxy.go
@@ -436,7 +436,7 @@ func (s *ProxyStore) Series(originalRequest *storepb.SeriesRequest, srv storepb.
 
 	level.Debug(reqLogger).Log("msg", "Series: started fanout streams", "status", strings.Join(storeDebugMsgs, ";"))
 
-	respHeap := NewDedupResponseHeap(NewProxyResponseHeap(storeResponses...))
+	respHeap := NewResponseDeduplicator(NewProxyResponseLoserTree(storeResponses...))
 	for respHeap.Next() {
 		resp := respHeap.At()