From 69c9584127e86842d99bb73ed5059ba0bd94249b Mon Sep 17 00:00:00 2001 From: Jackson Owens Date: Fri, 26 Jan 2024 15:08:45 -0500 Subject: [PATCH] db: refactor table cache newIters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit refactors the interface of the table cache `newIters` method to support creation of any of a sstable's iterators, including its point iterator, range deletion iterator and range key iterator. This is motivated by the refactor planned in #2863, which in turn requires refactoring some of the newIters call sites that determine whether ingested sstables overlap existing tables. There's also a minor benefit that a compaction setup no longer needs to open a point iterator and then immediately close it when initializing the range deletion iterators. ``` goos: linux goarch: amd64 pkg: github.com/cockroachdb/pebble cpu: Intel(R) Xeon(R) CPU @ 2.80GHz │ old-NewItersAlloc.txt │ new-NewItersAlloc.txt │ │ sec/op │ sec/op vs base │ NewItersAlloc 427.0n ± 1% 412.1n ± 1% -3.47% (p=0.000 n=10) │ old-NewItersAlloc.txt │ new-NewItersAlloc.txt │ │ B/op │ B/op vs base │ NewItersAlloc 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ ¹ all samples are equal │ old-NewItersAlloc.txt │ new-NewItersAlloc.txt │ │ allocs/op │ allocs/op vs base │ NewItersAlloc 0.000 ± 0% 0.000 ± 0% ~ (p=1.000 n=10) ¹ ¹ all samples are equal ``` --- compaction.go | 24 +-- compaction_test.go | 6 +- db.go | 69 +++---- external_iterator.go | 5 +- flushable.go | 2 +- get_iter_test.go | 9 +- ingest.go | 4 +- iterator_test.go | 11 +- level_checker.go | 2 +- level_checker_test.go | 11 +- level_iter.go | 34 +--- level_iter_test.go | 56 +++--- merging_iter_test.go | 25 +-- open.go | 2 +- scan_internal.go | 2 +- table_cache.go | 413 ++++++++++++++++++++++++++-------------- table_cache_test.go | 46 +++-- testdata/checkpoint | 12 +- testdata/cleaner | 8 +- testdata/event_listener | 12 +- testdata/metrics | 36 ++-- 21 files changed, 419 insertions(+), 370 deletions(-) diff --git a/compaction.go b/compaction.go index 7ccdb194b4..09482da4b2 100644 --- a/compaction.go +++ b/compaction.go @@ -1427,38 +1427,22 @@ func (c *compaction) newRangeDelIter( bytesIterated *uint64, ) (keyspan.FragmentIterator, io.Closer, error) { opts.level = l - iter, rangeDelIter, err := newIters(context.Background(), f.FileMetadata, + iterSet, err := newIters(context.Background(), f.FileMetadata, &opts, internalIterOpts{ bytesIterated: &c.bytesIterated, bufferPool: &c.bufferPool, - }) + }, iterRangeDeletions) if err != nil { return nil, nil, err - } - // TODO(peter): It is mildly wasteful to open the point iterator only to - // immediately close it. One way to solve this would be to add new - // methods to tableCache for creating point and range-deletion iterators - // independently. We'd only want to use those methods here, - // though. Doesn't seem worth the hassle in the near term. - if err = iter.Close(); err != nil { - if rangeDelIter != nil { - err = errors.CombineErrors(err, rangeDelIter.Close()) - } - return nil, nil, err - } - if rangeDelIter == nil { + } else if iterSet.rangeDeletion == nil { // The file doesn't contain any range deletions. return nil, nil, nil } - // Ensure that rangeDelIter is not closed until the compaction is // finished. This is necessary because range tombstone processing // requires the range tombstones to be held in memory for up to the // lifetime of the compaction. - closer := rangeDelIter - rangeDelIter = noCloseIter{rangeDelIter} - - return rangeDelIter, closer, nil + return noCloseIter{iterSet.rangeDeletion}, iterSet.rangeDeletion, nil } func (c *compaction) String() string { diff --git a/compaction_test.go b/compaction_test.go index 8a29f79823..876bae2eef 100644 --- a/compaction_test.go +++ b/compaction_test.go @@ -2891,9 +2891,9 @@ func TestCompactionCheckOrdering(t *testing.T) { } newIters := func( - _ context.Context, _ *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { - return &errorIter{}, nil, nil + _ context.Context, _ *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { + return iterSet{point: &errorIter{}}, nil } result := "OK" _, err := c.newInputIter(newIters, nil, nil) diff --git a/db.go b/db.go index 8a843b75aa..0f0219bfbe 100644 --- a/db.go +++ b/db.go @@ -3077,42 +3077,33 @@ func (d *DB) checkVirtualBounds(m *fileMetadata) { return } - if m.HasPointKeys { - pointIter, rangeDelIter, err := d.newIters(context.TODO(), m, nil, internalIterOpts{}) - if err != nil { - panic(errors.Wrap(err, "pebble: error creating point iterator")) - } + iters, err := d.newIters(context.TODO(), m, nil, internalIterOpts{}, iterPointKeys|iterRangeDeletions|iterRangeKeys) + if err != nil { + panic(errors.Wrap(err, "pebble: error creating iterators")) + } + defer iters.CloseAll() - defer pointIter.Close() - if rangeDelIter != nil { - defer rangeDelIter.Close() - } + if m.HasPointKeys { + pointIter := iters.Point() + rangeDelIter := iters.RangeDeletion() + // Check that the lower bound is tight. pointKey, _ := pointIter.First() - var rangeDel *keyspan.Span - if rangeDelIter != nil { - rangeDel, err = rangeDelIter.First() - if err != nil { - panic(err) - } + rangeDel, err := rangeDelIter.First() + if err != nil { + panic(err) } - - // Check that the lower bound is tight. if (rangeDel == nil || d.cmp(rangeDel.SmallestKey().UserKey, m.SmallestPointKey.UserKey) != 0) && (pointKey == nil || d.cmp(pointKey.UserKey, m.SmallestPointKey.UserKey) != 0) { panic(errors.Newf("pebble: virtual sstable %s lower point key bound is not tight", m.FileNum)) } + // Check that the upper bound is tight. pointKey, _ = pointIter.Last() - rangeDel = nil - if rangeDelIter != nil { - rangeDel, err = rangeDelIter.Last() - if err != nil { - panic(err) - } + rangeDel, err = rangeDelIter.Last() + if err != nil { + panic(err) } - - // Check that the upper bound is tight. if (rangeDel == nil || d.cmp(rangeDel.LargestKey().UserKey, m.LargestPointKey.UserKey) != 0) && (pointKey == nil || d.cmp(pointKey.UserKey, m.LargestPointKey.UserKey) != 0) { panic(errors.Newf("pebble: virtual sstable %s upper point key bound is not tight", m.FileNum)) @@ -3124,32 +3115,24 @@ func (d *DB) checkVirtualBounds(m *fileMetadata) { panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, key.UserKey)) } } - - if rangeDelIter != nil { - s, err := rangeDelIter.First() - for ; s != nil; s, err = rangeDelIter.Next() { - if d.cmp(s.SmallestKey().UserKey, m.SmallestPointKey.UserKey) < 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, s.SmallestKey().UserKey)) - } - if d.cmp(s.LargestKey().UserKey, m.LargestPointKey.UserKey) > 0 { - panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, s.LargestKey().UserKey)) - } + s, err := rangeDelIter.First() + for ; s != nil; s, err = rangeDelIter.Next() { + if d.cmp(s.SmallestKey().UserKey, m.SmallestPointKey.UserKey) < 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, s.SmallestKey().UserKey)) } - if err != nil { - panic(err) + if d.cmp(s.LargestKey().UserKey, m.LargestPointKey.UserKey) > 0 { + panic(errors.Newf("pebble: virtual sstable %s point key %s is not within bounds", m.FileNum, s.LargestKey().UserKey)) } } + if err != nil { + panic(err) + } } if !m.HasRangeKeys { return } - - rangeKeyIter, err := d.tableNewRangeKeyIter(m, keyspan.SpanIterOptions{}) - if err != nil { - panic(errors.Wrap(err, "pebble: error creating range key iterator")) - } - defer rangeKeyIter.Close() + rangeKeyIter := iters.RangeKey() // Check that the lower bound is tight. if s, err := rangeKeyIter.First(); err != nil { diff --git a/external_iterator.go b/external_iterator.go index 0b02e8f41d..60e08aa777 100644 --- a/external_iterator.go +++ b/external_iterator.go @@ -139,9 +139,8 @@ func NewExternalIterWithContext( // Add the readers to the Iterator so that Close closes them, and // SetOptions can re-construct iterators from them. externalReaders: readers, - newIters: func( - ctx context.Context, f *manifest.FileMetadata, opts *IterOptions, - internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { + newIters: func(context.Context, *manifest.FileMetadata, *IterOptions, + internalIterOpts, iterKinds) (iterSet, error) { // NB: External iterators are currently constructed without any // `levelIters`. newIters should never be called. When we support // organizing multiple non-overlapping files into a single level diff --git a/flushable.go b/flushable.go index bc380746b4..eb5b6f8207 100644 --- a/flushable.go +++ b/flushable.go @@ -219,7 +219,7 @@ func (s *ingestedFlushable) constructRangeDelIter( ) (keyspan.FragmentIterator, error) { // Note that the keyspan level iter expects a non-nil iterator to be // returned even if there is an error. So, we return the emptyKeyspanIter. - iter, rangeDelIter, err := s.newIters(context.Background(), file, nil, internalIterOpts{}) + iter, rangeDelIter, err := s.newIters.TODO(context.Background(), file, nil, internalIterOpts{}) if err != nil { return emptyKeyspanIter, err } diff --git a/get_iter_test.go b/get_iter_test.go index 244c337823..a0c93d0118 100644 --- a/get_iter_test.go +++ b/get_iter_test.go @@ -11,7 +11,6 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/testkeys" ) @@ -387,13 +386,13 @@ func TestGetIter(t *testing.T) { // m is a map from file numbers to DBs. m := map[FileNum]*memTable{} newIter := func( - _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { d, ok := m[file.FileNum] if !ok { - return nil, nil, errors.New("no such file") + return iterSet{}, errors.New("no such file") } - return d.newIter(nil), nil, nil + return iterSet{point: d.newIter(nil)}, nil } var files [numLevels][]*fileMetadata diff --git a/ingest.go b/ingest.go index 05248d5158..03e1c9c733 100644 --- a/ingest.go +++ b/ingest.go @@ -1663,7 +1663,7 @@ func (d *DB) excise( // This file will contain point keys smallestPointKey := m.SmallestPointKey var err error - iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ + iter, rangeDelIter, err = d.newIters.TODO(context.TODO(), m, &IterOptions{ CategoryAndQoS: sstable.CategoryAndQoS{ Category: "pebble-ingest", QoSLevel: sstable.LatencySensitiveQoSLevel, @@ -1781,7 +1781,7 @@ func (d *DB) excise( largestPointKey := m.LargestPointKey var err error if iter == nil && rangeDelIter == nil { - iter, rangeDelIter, err = d.newIters(context.TODO(), m, &IterOptions{ + iter, rangeDelIter, err = d.newIters.TODO(context.TODO(), m, &IterOptions{ CategoryAndQoS: sstable.CategoryAndQoS{ Category: "pebble-ingest", QoSLevel: sstable.LatencySensitiveQoSLevel, diff --git a/iterator_test.go b/iterator_test.go index 4f79ca8d9f..c4823a2cb4 100644 --- a/iterator_test.go +++ b/iterator_test.go @@ -23,7 +23,6 @@ import ( "github.com/cockroachdb/pebble/internal/base" "github.com/cockroachdb/pebble/internal/bytealloc" "github.com/cockroachdb/pebble/internal/invalidating" - "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/testkeys" "github.com/cockroachdb/pebble/objstorage/objstorageprovider" @@ -919,14 +918,14 @@ func TestIteratorSeekOpt(t *testing.T) { oldNewIters := d.newIters d.newIters = func( ctx context.Context, file *manifest.FileMetadata, opts *IterOptions, - internalOpts internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { - iter, rangeIter, err := oldNewIters(ctx, file, opts, internalOpts) - iterWrapped := &iterSeekOptWrapper{ - internalIterator: iter, + internalOpts internalIterOpts, kinds iterKinds) (iterSet, error) { + iters, err := oldNewIters(ctx, file, opts, internalOpts, kinds) + iters.point = &iterSeekOptWrapper{ + internalIterator: iters.point, seekGEUsingNext: &seekGEUsingNext, seekPrefixGEUsingNext: &seekPrefixGEUsingNext, } - return iterWrapped, rangeIter, err + return iters, err } return s diff --git a/level_checker.go b/level_checker.go index b378f1c3df..96488f3220 100644 --- a/level_checker.go +++ b/level_checker.go @@ -378,7 +378,7 @@ func checkRangeTombstones(c *checkConfig) error { for f := files.First(); f != nil; f = files.Next() { lf := files.Take() //lower, upper := manifest.KeyRange(c.cmp, lf.Iter()) - iterToClose, iter, err := c.newIters( + iterToClose, iter, err := c.newIters.TODO( context.Background(), lf.FileMetadata, &IterOptions{level: manifest.Level(lsmLevel)}, internalIterOpts{}) if err != nil { return err diff --git a/level_checker_test.go b/level_checker_test.go index dae50bfca4..e73ba5ecf3 100644 --- a/level_checker_test.go +++ b/level_checker_test.go @@ -95,17 +95,20 @@ func TestCheckLevelsCornerCases(t *testing.T) { var fileNum FileNum newIters := - func(_ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts) (internalIterator, keyspan.FragmentIterator, error) { + func(_ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds) (iterSet, error) { r := readers[file.FileNum] rangeDelIter, err := r.NewRawRangeDelIter() if err != nil { - return nil, nil, err + return iterSet{}, err } iter, err := r.NewIter(nil /* lower */, nil /* upper */) if err != nil { - return nil, nil, err + return iterSet{}, err } - return iter, rangeDelIter, nil + return iterSet{ + point: iter, + rangeDeletion: rangeDelIter, + }, nil } fm := &failMerger{} diff --git a/level_iter.go b/level_iter.go index e738b22f2d..3cefa66ae1 100644 --- a/level_iter.go +++ b/level_iter.go @@ -16,38 +16,6 @@ import ( "github.com/cockroachdb/pebble/sstable" ) -// tableNewIters creates a new point and range-del iterator for the given file -// number. -// -// On success, the internalIterator is not-nil and must be closed; the -// FragmentIterator can be nil. -// TODO(radu): always return a non-nil FragmentIterator. -// -// On error, the iterators are nil. -// -// The only (non-test) implementation of tableNewIters is tableCacheContainer.newIters(). -type tableNewIters func( - ctx context.Context, - file *manifest.FileMetadata, - opts *IterOptions, - internalOpts internalIterOpts, -) (internalIterator, keyspan.FragmentIterator, error) - -// tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter -// for the rangedel iterator returned by tableNewIters. -func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter { - return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { - iter, rangeDelIter, err := newIters(ctx, file, nil, internalIterOpts{}) - if iter != nil { - _ = iter.Close() - } - if rangeDelIter == nil { - rangeDelIter = emptyKeyspanIter - } - return rangeDelIter, err - } -} - type internalIterOpts struct { bytesIterated *uint64 bufferPool *sstable.BufferPool @@ -676,7 +644,7 @@ func (l *levelIter) loadFile(file *fileMetadata, dir int) loadFileReturnIndicato var rangeDelIter keyspan.FragmentIterator var iter internalIterator - iter, rangeDelIter, l.err = l.newIters(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts) + iter, rangeDelIter, l.err = l.newIters.TODO(l.ctx, l.iterFile, &l.tableOpts, l.internalOpts) l.iter = iter if l.err != nil { return noFileLoaded diff --git a/level_iter_test.go b/level_iter_test.go index 0ec3d6e96a..038f5c88ea 100644 --- a/level_iter_test.go +++ b/level_iter_test.go @@ -36,12 +36,12 @@ func TestLevelIter(t *testing.T) { var files manifest.LevelSlice newIters := func( - _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { f := *iters[file.FileNum] f.lower = opts.GetLowerBound() f.upper = opts.GetUpperBound() - return &f, nil, nil + return iterSet{point: &f}, nil } datadriven.RunTest(t, "testdata/level_iter", func(t *testing.T, d *datadriven.TestData) string { @@ -123,10 +123,10 @@ func TestLevelIter(t *testing.T) { var tableOpts *IterOptions newIters2 := func( ctx context.Context, file *manifest.FileMetadata, opts *IterOptions, - internalOpts internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + internalOpts internalIterOpts, kinds iterKinds, + ) (iterSet, error) { tableOpts = opts - return newIters(ctx, file, opts, internalOpts) + return newIters(ctx, file, opts, internalOpts, kinds) } iter := newLevelIter(context.Background(), opts, testkeys.Comparer, newIters2, files.Iter(), manifest.Level(level), internalIterOpts{}) @@ -158,20 +158,24 @@ func newLevelIterTest() *levelIterTest { } func (lt *levelIterTest) newIters( - ctx context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, -) (internalIterator, keyspan.FragmentIterator, error) { + ctx context.Context, + file *manifest.FileMetadata, + opts *IterOptions, + iio internalIterOpts, + kinds iterKinds, +) (iterSet, error) { lt.itersCreated++ iter, err := lt.readers[file.FileNum].NewIterWithBlockPropertyFiltersAndContextEtc( ctx, opts.LowerBound, opts.UpperBound, nil, false, true, iio.stats, sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: lt.readers[file.FileNum]}) if err != nil { - return nil, nil, err + return iterSet{}, err } rangeDelIter, err := lt.readers[file.FileNum].NewRawRangeDelIter() if err != nil { - return nil, nil, err + return iterSet{}, err } - return iter, rangeDelIter, nil + return iterSet{point: iter, rangeDeletion: rangeDelIter}, nil } func (lt *levelIterTest) runClear(d *datadriven.TestData) string { @@ -535,10 +539,10 @@ func BenchmarkLevelIterSeekGE(b *testing.B) { readers, metas, keys, cleanup := buildLevelIterTables(b, blockSize, restartInterval, count) defer cleanup() newIters := func( - _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) - return iter, nil, err + return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) rng := rand.New(rand.NewSource(uint64(time.Now().UnixNano()))) @@ -576,11 +580,11 @@ func BenchmarkLevelIterSeqSeekGEWithBounds(b *testing.B) { // This newIters is cheaper than in practice since it does not do // tableCacheShard.findNode. newIters := func( - _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter( opts.LowerBound, opts.UpperBound) - return iter, nil, err + return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) // Fake up the range deletion initialization, to resemble the usage @@ -618,11 +622,11 @@ func BenchmarkLevelIterSeqSeekPrefixGE(b *testing.B) { // This newIters is cheaper than in practice since it does not do // tableCacheShard.findNode. newIters := func( - _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter( opts.LowerBound, opts.UpperBound) - return iter, nil, err + return iterSet{point: iter}, err } for _, skip := range []int{1, 2, 4, 8, 16} { @@ -669,10 +673,10 @@ func BenchmarkLevelIterNext(b *testing.B) { readers, metas, _, cleanup := buildLevelIterTables(b, blockSize, restartInterval, count) defer cleanup() newIters := func( - _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) - return iter, nil, err + return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, testkeys.Comparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) @@ -703,10 +707,10 @@ func BenchmarkLevelIterPrev(b *testing.B) { readers, metas, _, cleanup := buildLevelIterTables(b, blockSize, restartInterval, count) defer cleanup() newIters := func( - _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, _ *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[file.FileNum].NewIter(nil /* lower */, nil /* upper */) - return iter, nil, err + return iterSet{point: iter}, err } l := newLevelIter(context.Background(), IterOptions{}, DefaultComparer, newIters, metas.Iter(), manifest.Level(level), internalIterOpts{}) diff --git a/merging_iter_test.go b/merging_iter_test.go index 30a5a592bc..dd32c05412 100644 --- a/merging_iter_test.go +++ b/merging_iter_test.go @@ -163,22 +163,23 @@ func TestMergingIterCornerCases(t *testing.T) { fileNum base.FileNum ) newIters := - func(_ context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + func(_ context.Context, file *manifest.FileMetadata, opts *IterOptions, iio internalIterOpts, kinds iterKinds, + ) (iterSet, error) { r := readers[file.FileNum] rangeDelIter, err := r.NewRawRangeDelIter() if err != nil { - return nil, nil, err + return iterSet{}, err } iter, err := r.NewIterWithBlockPropertyFilters( opts.GetLowerBound(), opts.GetUpperBound(), nil, true /* useFilterBlock */, iio.stats, sstable.CategoryAndQoS{}, nil, sstable.TrivialReaderProvider{Reader: r}) if err != nil { - return nil, nil, err + return iterSet{}, err } - return itertest.Attach(iter, itertest.ProbeState{Log: &buf}, pointProbes[file.FileNum]...), - attachKeyspanProbes(rangeDelIter, keyspanProbeContext{log: &buf}, rangeDelProbes[file.FileNum]...), - nil + return iterSet{ + point: itertest.Attach(iter, itertest.ProbeState{Log: &buf}, pointProbes[file.FileNum]...), + rangeDeletion: attachKeyspanProbes(rangeDelIter, keyspanProbeContext{log: &buf}, rangeDelProbes[file.FileNum]...), + }, nil } datadriven.RunTest(t, "testdata/merging_iter", func(t *testing.T, d *datadriven.TestData) string { @@ -655,19 +656,19 @@ func buildMergingIter(readers [][]*sstable.Reader, levelSlices []manifest.LevelS levelIndex := i level := len(readers) - 1 - i newIters := func( - _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, - ) (internalIterator, keyspan.FragmentIterator, error) { + _ context.Context, file *manifest.FileMetadata, opts *IterOptions, _ internalIterOpts, _ iterKinds, + ) (iterSet, error) { iter, err := readers[levelIndex][file.FileNum].NewIter( opts.LowerBound, opts.UpperBound) if err != nil { - return nil, nil, err + return iterSet{}, err } rdIter, err := readers[levelIndex][file.FileNum].NewRawRangeDelIter() if err != nil { iter.Close() - return nil, nil, err + return iterSet{}, err } - return iter, rdIter, err + return iterSet{point: iter, rangeDeletion: rdIter}, err } l := newLevelIter( context.Background(), IterOptions{}, testkeys.Comparer, newIters, levelSlices[i].Iter(), diff --git a/open.go b/open.go index 51d601e1ae..1647b2e3ab 100644 --- a/open.go +++ b/open.go @@ -370,7 +370,7 @@ func Open(dirname string, opts *Options) (db *DB, err error) { opts.TableCache, d.cacheID, d.objProvider, d.opts, tableCacheSize, &sstable.CategoryStatsCollector{}) d.newIters = d.tableCache.newIters - d.tableNewRangeKeyIter = d.tableCache.newRangeKeyIter + d.tableNewRangeKeyIter = tableNewRangeKeyIter(context.TODO(), d.newIters) // Replay any newer log files than the ones named in the manifest. type fileNumAndName struct { diff --git a/scan_internal.go b/scan_internal.go index 3fdb148e44..3073559ba8 100644 --- a/scan_internal.go +++ b/scan_internal.go @@ -464,7 +464,7 @@ func (d *DB) truncateSharedFile( // We will need to truncate file bounds in at least one direction. Open all // relevant iterators. - iter, rangeDelIter, err := d.newIters(ctx, file, &IterOptions{ + iter, rangeDelIter, err := d.newIters.TODO(ctx, file, &IterOptions{ LowerBound: lower, UpperBound: upper, level: manifest.Level(level), diff --git a/table_cache.go b/table_cache.go index b730ffec89..350e649f8c 100644 --- a/table_cache.go +++ b/table_cache.go @@ -48,6 +48,75 @@ func (s *filteredAllKeysIter) MaybeFilteredKeys() bool { return true } +// tableNewIters creates new iterators (point, range deletion and/or range key) +// for the given file metadata. Which of the various iterator kinds the user is +// requesting is specified with the iterKinds bitmap. +// +// On success, the requested subset of iters.{point,rangeDel,rangeKey} are +// populated with iterators. +// +// If a point iterator is requested and the operation was successful, +// iters.point is guaranteed to be non-nil and must be closed when the caller is +// finished. +// +// If a range deletion or range key iterator is requested, the corresponding +// iterator may be nil if the table does not contain any keys of the +// corresponding kind. The returned iterSet type provides RangeDeletion() and +// RangeKey() convenience methods that return non-nil empty iterators that may +// be used if the caller requires a non-nil iterator. +// +// On error, all iterators are nil. +// +// The only (non-test) implementation of tableNewIters is +// tableCacheContainer.newIters(). +type tableNewIters func( + ctx context.Context, + file *manifest.FileMetadata, + opts *IterOptions, + internalOpts internalIterOpts, + kinds iterKinds, +) (iterSet, error) + +// TODO implements the old tableNewIters interface that always attempted to open +// a point iterator and a range deletion iterator. All call sites should be +// updated to use `f` directly. +func (f tableNewIters) TODO( + ctx context.Context, + file *manifest.FileMetadata, + opts *IterOptions, + internalOpts internalIterOpts, +) (internalIterator, keyspan.FragmentIterator, error) { + iters, err := f(ctx, file, opts, internalOpts, iterPointKeys|iterRangeDeletions) + if err != nil { + return nil, nil, err + } + return iters.point, iters.rangeDeletion, nil +} + +// tableNewRangeDelIter takes a tableNewIters and returns a TableNewSpanIter +// for the rangedel iterator returned by tableNewIters. +func tableNewRangeDelIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter { + return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iters, err := newIters(ctx, file, nil, internalIterOpts{}, iterRangeDeletions) + if err != nil { + return nil, err + } + return iters.RangeDeletion(), nil + } +} + +// tableNewRangeKeyIter takes a tableNewIters and returns a TableNewSpanIter +// for the range key iterator returned by tableNewIters. +func tableNewRangeKeyIter(ctx context.Context, newIters tableNewIters) keyspan.TableNewSpanIter { + return func(file *manifest.FileMetadata, iterOptions keyspan.SpanIterOptions) (keyspan.FragmentIterator, error) { + iters, err := newIters(ctx, file, nil, internalIterOpts{}, iterRangeKeys) + if err != nil { + return nil, err + } + return iters.RangeKey(), nil + } +} + var tableCacheLabels = pprof.Labels("pebble", "table-cache") // tableCacheOpts contains the db specific fields @@ -140,14 +209,9 @@ func (c *tableCacheContainer) newIters( file *manifest.FileMetadata, opts *IterOptions, internalOpts internalIterOpts, -) (internalIterator, keyspan.FragmentIterator, error) { - return c.tableCache.getShard(file.FileBacking.DiskFileNum).newIters(ctx, file, opts, internalOpts, &c.dbOpts) -} - -func (c *tableCacheContainer) newRangeKeyIter( - file *manifest.FileMetadata, opts keyspan.SpanIterOptions, -) (keyspan.FragmentIterator, error) { - return c.tableCache.getShard(file.FileBacking.DiskFileNum).newRangeKeyIter(file, opts, &c.dbOpts) + kinds iterKinds, +) (iterSet, error) { + return c.tableCache.getShard(file.FileBacking.DiskFileNum).newIters(ctx, file, opts, internalOpts, &c.dbOpts, kinds) } // getTableProperties returns the properties associated with the backing physical @@ -201,17 +265,13 @@ func (c *tableCacheContainer) estimateSize( return size, nil } -// createCommonReader creates a Reader for this file. isShared, if true for -// virtual sstables, is passed into the vSSTable reader so its iterators can -// collapse obsolete points accordingly. -func createCommonReader( - v *tableCacheValue, file *fileMetadata, isShared bool, -) sstable.CommonReader { +// createCommonReader creates a Reader for this file. +func createCommonReader(v *tableCacheValue, file *fileMetadata) sstable.CommonReader { // TODO(bananabrick): We suffer an allocation if file is a virtual sstable. var cr sstable.CommonReader = v.reader if file.Virtual { virtualReader := sstable.MakeVirtualReader( - v.reader, file.VirtualMeta(), isShared, + v.reader, file.VirtualMeta(), v.isShared, ) cr = &virtualReader } @@ -227,12 +287,7 @@ func (c *tableCacheContainer) withCommonReader( if v.err != nil { return v.err } - provider := c.dbOpts.objProvider - objMeta, err := provider.Lookup(fileTypeTable, meta.FileBacking.DiskFileNum) - if err != nil { - return err - } - return fn(createCommonReader(v, meta, objMeta.IsShared())) + return fn(createCommonReader(v, meta)) } func (c *tableCacheContainer) withReader(meta physicalMeta, fn func(*sstable.Reader) error) error { @@ -430,7 +485,8 @@ func (c *tableCacheShard) newIters( opts *IterOptions, internalOpts internalIterOpts, dbOpts *tableCacheOpts, -) (internalIterator, keyspan.FragmentIterator, error) { + kinds iterKinds, +) (iterSet, error) { // TODO(sumeer): constructing the Reader should also use a plumbed context, // since parts of the sstable are read during the construction. The Reader // should not remember that context since the Reader can be long-lived. @@ -442,11 +498,64 @@ func (c *tableCacheShard) newIters( v := c.findNode(file, dbOpts) if v.err != nil { defer c.unrefValue(v) - return nil, nil, v.err + return iterSet{}, v.err + } + + // Note: This suffers an allocation for virtual sstables. + cr := createCommonReader(v, file) + var iters iterSet + var err error + if kinds.RangeKey() && file.HasRangeKeys { + iters.rangeKey, err = c.newRangeKeyIter(v, cr, opts.SpanIterOptions()) + } + if kinds.RangeDeletion() && file.HasPointKeys && err == nil { + iters.rangeDeletion, err = c.newRangeDelIter(ctx, file, cr, dbOpts) + } + if kinds.Point() && err == nil { + iters.point, err = c.newPointIter(ctx, v, file, cr, opts, internalOpts, dbOpts) + } + if err != nil { + // NB: There's a subtlety here: Because the point iterator is the last + // iterator we attempt to create, it's not possible for: + // err != nil && iters.point != nil + // If it were possible, we'd need to account for it to avoid double + // unref-ing here, once during CloseAll and once during `unrefValue`. + iters.CloseAll() + c.unrefValue(v) + return iterSet{}, err + } + // Only point iterators ever require the reader stay pinned in the cache. If + // we're not returning a point iterator to the caller, we need to unref v. + // There's an added subtlety that iters.point may be non-nil but not a true + // iterator that's ref'd the underlying reader if block filters excluded the + // entirety of the table. + // + // TODO(jackson): This `filteredAll` subtlety can be removed after the + // planned #2863 refactor, when there will be no need to return a special + // empty iterator type to signify that point keys were filtered. + if iters.point == nil || iters.point == filteredAll { + c.unrefValue(v) } + return iters, nil +} - hideObsoletePoints := false - var pointKeyFilters []BlockPropertyFilter +// newPointIter is an internal helper that constructs a point iterator over a +// sstable. This function is for internal use only, and callers should use +// newIters instead. +func (c *tableCacheShard) newPointIter( + ctx context.Context, + v *tableCacheValue, + file *manifest.FileMetadata, + cr sstable.CommonReader, + opts *IterOptions, + internalOpts internalIterOpts, + dbOpts *tableCacheOpts, +) (internalIterator, error) { + var ( + hideObsoletePoints bool = false + pointKeyFilters []BlockPropertyFilter + filterer *sstable.BlockPropertiesFilterer + ) if opts != nil { // This code is appending (at most one filter) in-place to // opts.PointKeyFilters even though the slice is shared for iterators in @@ -470,64 +579,24 @@ func (c *tableCacheShard) newIters( hideObsoletePoints, pointKeyFilters = v.reader.TryAddBlockPropertyFilterForHideObsoletePoints( opts.snapshotForHideObsoletePoints, file.LargestSeqNum, opts.PointKeyFilters) - } - ok := true - var filterer *sstable.BlockPropertiesFilterer - var err error - if opts != nil { + + var ok bool + var err error ok, filterer, err = c.checkAndIntersectFilters(v, opts.TableFilter, pointKeyFilters, internalOpts.boundLimitedFilter) - } - if err != nil { - c.unrefValue(v) - return nil, nil, err - } - - provider := dbOpts.objProvider - // Check if this file is a foreign file. - objMeta, err := provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err != nil { - return nil, nil, err - } - - // Note: This suffers an allocation for virtual sstables. - cr := createCommonReader(v, file, objMeta.IsShared()) - - // NB: range-del iterator does not maintain a reference to the table, nor - // does it need to read from it after creation. - rangeDelIter, err := cr.NewRawRangeDelIter() - if err != nil { - c.unrefValue(v) - return nil, nil, err - } - - // Assert expected bounds in tests. - if invariants.Enabled && rangeDelIter != nil { - cmp := base.DefaultComparer.Compare - if dbOpts.opts.Comparer != nil { - cmp = dbOpts.opts.Comparer.Compare + if err != nil { + return nil, err + } else if !ok { + // Return an empty iterator. This iterator has no mutable state, so + // using a singleton is fine. We must return `filteredAll` instead + // of nil so that the returned iterator returns MaybeFilteredKeys() + // = true. + // + // TODO(jackson): This `filteredAll` subtlety can be removed after the + // planned #2863 refactor, when there will be no need to return a special + // empty iterator type to signify that point keys were filtered. + return filteredAll, err } - // TODO(radu): we should be using AssertBounds, but it currently fails in - // some cases (#3167). - rangeDelIter = keyspan.AssertUserKeyBounds( - rangeDelIter, file.SmallestPointKey.UserKey, file.LargestPointKey.UserKey, cmp, - ) - } - - if !ok { - c.unrefValue(v) - // Return an empty iterator. This iterator has no mutable state, so - // using a singleton is fine. - // NB: We still return the potentially non-empty rangeDelIter. This - // ensures the iterator observes the file's range deletions even if the - // block property filters exclude all the file's point keys. The range - // deletions may still delete keys lower in the LSM in files that DO - // match the active filters. - // - // The point iterator returned must implement the filteredIter - // interface, so that the level iterator surfaces file boundaries when - // range deletions are present. - return filteredAll, rangeDelIter, err } var iter sstable.Iterator @@ -538,16 +607,16 @@ func (c *tableCacheShard) newIters( } tableFormat, err := v.reader.TableFormat() if err != nil { - return nil, nil, err + return nil, err } var rp sstable.ReaderProvider if tableFormat >= sstable.TableFormatPebblev3 && v.reader.Properties.NumValueBlocks > 0 { rp = &tableCacheShardReaderProvider{c: c, file: file, dbOpts: dbOpts} } - if objMeta.IsShared() && v.reader.Properties.GlobalSeqNum != 0 { + if v.isShared && v.reader.Properties.GlobalSeqNum != 0 { if tableFormat < sstable.TableFormatPebblev4 { - return nil, nil, errors.New("pebble: shared ingested sstable has a lower table format than expected") + return nil, errors.New("pebble: shared ingested sstable has a lower table format than expected") } // The table is shared and ingested. hideObsoletePoints = true @@ -566,16 +635,11 @@ func (c *tableCacheShard) newIters( internalOpts.stats, categoryAndQoS, dbOpts.sstStatsCollector, rp) } if err != nil { - if rangeDelIter != nil { - _ = rangeDelIter.Close() - } - c.unrefValue(v) - return nil, nil, err + return nil, err } // NB: v.closeHook takes responsibility for calling unrefValue(v) here. Take // care to avoid introducing an allocation here by adding a closure. iter.SetCloseHook(v.closeHook) - c.iterCount.Add(1) dbOpts.iterCount.Add(1) if invariants.RaceEnabled { @@ -583,73 +647,56 @@ func (c *tableCacheShard) newIters( c.mu.iters[iter] = debug.Stack() c.mu.Unlock() } - return iter, rangeDelIter, nil + return iter, nil } -func (c *tableCacheShard) newRangeKeyIter( - file *manifest.FileMetadata, opts keyspan.SpanIterOptions, dbOpts *tableCacheOpts, +// newRangeDelIter is an internal helper that constructs an iterator over a +// sstable's range deletions. This function is for table-cache internal use +// only, and callers should use newIters instead. +func (c *tableCacheShard) newRangeDelIter( + ctx context.Context, file *manifest.FileMetadata, cr sstable.CommonReader, dbOpts *tableCacheOpts, ) (keyspan.FragmentIterator, error) { - // Calling findNode gives us the responsibility of decrementing v's - // refCount. If opening the underlying table resulted in error, then we - // decrement this straight away. Otherwise, we pass that responsibility to - // the sstable iterator, which decrements when it is closed. - v := c.findNode(file, dbOpts) - if v.err != nil { - defer c.unrefValue(v) - return nil, v.err + // NB: range-del iterator does not maintain a reference to the table, nor + // does it need to read from it after creation. + rangeDelIter, err := cr.NewRawRangeDelIter() + if err != nil { + return nil, err + } + // Assert expected bounds in tests. + if invariants.Enabled && rangeDelIter != nil { + cmp := base.DefaultComparer.Compare + if dbOpts.opts.Comparer != nil { + cmp = dbOpts.opts.Comparer.Compare + } + // TODO(radu): we should be using AssertBounds, but it currently fails in + // some cases (#3167). + rangeDelIter = keyspan.AssertUserKeyBounds( + rangeDelIter, file.SmallestPointKey.UserKey, file.LargestPointKey.UserKey, cmp, + ) } + return rangeDelIter, nil +} - ok := true - var err error +// newRangeKeyIter is an internal helper that constructs an iterator over a +// sstable's range keys. This function is for table-cache internal use only, and +// callers should use newIters instead. +func (c *tableCacheShard) newRangeKeyIter( + v *tableCacheValue, cr sstable.CommonReader, opts keyspan.SpanIterOptions, +) (keyspan.FragmentIterator, error) { // Don't filter a table's range keys if the file contains RANGEKEYDELs. // The RANGEKEYDELs may delete range keys in other levels. Skipping the // file's range key blocks may surface deleted range keys below. This is // done here, rather than deferring to the block-property collector in order // to maintain parity with point keys and the treatment of RANGEDELs. - if v.reader.Properties.NumRangeKeyDels == 0 { - ok, _, err = c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil) - } - if err != nil { - c.unrefValue(v) - return nil, err - } - if !ok { - c.unrefValue(v) - // Return the empty iterator. This iterator has no mutable state, so - // using a singleton is fine. - return emptyKeyspanIter, err - } - - var iter keyspan.FragmentIterator - if file.Virtual { - provider := dbOpts.objProvider - var objMeta objstorage.ObjectMetadata - objMeta, err = provider.Lookup(fileTypeTable, file.FileBacking.DiskFileNum) - if err == nil { - virtualReader := sstable.MakeVirtualReader( - v.reader, file.VirtualMeta(), objMeta.IsShared(), - ) - iter, err = virtualReader.NewRawRangeKeyIter() + if v.reader.Properties.NumRangeKeyDels == 0 && len(opts.RangeKeyFilters) > 0 { + ok, _, err := c.checkAndIntersectFilters(v, nil, opts.RangeKeyFilters, nil) + if err != nil { + return nil, err + } else if !ok { + return nil, nil } - } else { - iter, err = v.reader.NewRawRangeKeyIter() } - - // iter is a block iter that holds the entire value of the block in memory. - // No need to hold onto a ref of the cache value. - c.unrefValue(v) - - if err != nil { - return nil, err - } - - if iter == nil { - // NewRawRangeKeyIter can return nil even if there's no error. However, - // the keyspan.LevelIter expects a non-nil iterator if err is nil. - return emptyKeyspanIter, nil - } - - return iter, nil + return cr.NewRawRangeKeyIter() } type tableCacheShardReaderProvider struct { @@ -1104,6 +1151,7 @@ type tableCacheValue struct { closeHook func(i sstable.Iterator) error reader *sstable.Reader err error + isShared bool loaded chan struct{} // Reference count for the value. The reader is closed when the reference // count drops to zero. @@ -1127,6 +1175,11 @@ func (v *tableCacheValue) load(loadInfo loadInfo, c *tableCacheShard, dbOpts *ta cacheOpts := private.SSTableCacheOpts(dbOpts.cacheID, loadInfo.backingFileNum).(sstable.ReaderOption) v.reader, err = sstable.NewReader(f, dbOpts.opts, cacheOpts, dbOpts.filterMetrics) } + if err == nil { + var objMeta objstorage.ObjectMetadata + objMeta, err = dbOpts.objProvider.Lookup(fileTypeTable, loadInfo.backingFileNum) + v.isShared = objMeta.IsShared() + } if err != nil { v.err = errors.Wrapf( err, "pebble: backing file %s error", loadInfo.backingFileNum) @@ -1225,3 +1278,75 @@ func (n *tableCacheNode) unlink() *tableCacheNode { n.links.next = n return next } + +// iterSet holds a set of iterators of various key kinds, all constructed over +// the same data structure (eg, an sstable). A subset of the fields may be +// populated depending on the `iterKinds` passed to newIters. +type iterSet struct { + point internalIterator + rangeDeletion keyspan.FragmentIterator + rangeKey keyspan.FragmentIterator +} + +// TODO(jackson): Consider adding methods for fast paths that check whether an +// iterator of a particular kind is nil, so that these call sites don't need to +// reach into the struct's fields directly. + +// Point returns the contained point iterator. If there is no point iterator, +// Point returns a non-nil empty point iterator. +func (s *iterSet) Point() internalIterator { + if s.point == nil { + return emptyIter + } + return s.point +} + +// RangeDeletion returns the contained range deletion iterator. If there is no +// range deletion iterator, RangeDeletion returns a non-nil empty keyspan +// iterator. +func (s *iterSet) RangeDeletion() keyspan.FragmentIterator { + if s.rangeDeletion == nil { + return emptyKeyspanIter + } + return s.rangeDeletion +} + +// RangeKey returns the contained range key iterator. If there is no range key +// iterator, RangeKey returns a non-nil empty keyspan iterator. +func (s *iterSet) RangeKey() keyspan.FragmentIterator { + if s.rangeKey == nil { + return emptyKeyspanIter + } + return s.rangeKey +} + +// CloseAll closes all of the held iterators. If CloseAll is called, then Close +// must be not be called on the consitutent iterators. +func (s *iterSet) CloseAll() error { + var err error + if s.point != nil { + err = s.point.Close() + } + if s.rangeDeletion != nil { + err = firstError(err, s.rangeDeletion.Close()) + } + if s.rangeKey != nil { + err = firstError(err, s.rangeKey.Close()) + } + return err +} + +// iterKinds is a bitmap indicating a set of kinds of iterators. Callers may +// bitwise-OR iterPointKeys, iterRangeDeletions and/or iterRangeKeys together to +// represent a set of desired iterator kinds. +type iterKinds uint8 + +func (t iterKinds) Point() bool { return (t & iterPointKeys) != 0 } +func (t iterKinds) RangeDeletion() bool { return (t & iterRangeDeletions) != 0 } +func (t iterKinds) RangeKey() bool { return (t & iterRangeKeys) != 0 } + +const ( + iterPointKeys iterKinds = 1 << iota + iterRangeDeletions + iterRangeKeys +) diff --git a/table_cache_test.go b/table_cache_test.go index 19b16c0e71..f8c1b812d6 100644 --- a/table_cache_test.go +++ b/table_cache_test.go @@ -9,7 +9,6 @@ import ( "bytes" "context" "fmt" - "io" "os" "path" "strconv" @@ -20,7 +19,6 @@ import ( "github.com/cockroachdb/errors" "github.com/cockroachdb/pebble/internal/base" - "github.com/cockroachdb/pebble/internal/keyspan" "github.com/cockroachdb/pebble/internal/manifest" "github.com/cockroachdb/pebble/internal/testkeys" "github.com/cockroachdb/pebble/objstorage" @@ -609,7 +607,7 @@ func testTableCacheRandomAccess(t *testing.T, concurrent bool) { m.InitPhysicalBacking() m.Ref() defer m.Unref() - iter, _, err := c.newIters(context.Background(), m, nil, internalIterOpts{}) + iter, _, err := tableNewIters(c.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) if err != nil { errc <- errors.Errorf("i=%d, fileNum=%d: find: %v", i, fileNum, err) return @@ -666,20 +664,20 @@ func testTableCacheFrequentlyUsedInternal(t *testing.T, rangeIter bool) { for i := 0; i < N; i++ { for _, j := range [...]int{pinned0, i % tableCacheTestNumTables, pinned1} { - var iter io.Closer + var iters iterSet var err error m := &fileMetadata{FileNum: FileNum(j)} m.InitPhysicalBacking() m.Ref() if rangeIter { - iter, err = c.newRangeKeyIter(m, keyspan.SpanIterOptions{}) + iters, err = c.newIters(context.Background(), m, nil, internalIterOpts{}, iterRangeKeys) } else { - iter, _, err = c.newIters(context.Background(), m, nil, internalIterOpts{}) + iters, err = c.newIters(context.Background(), m, nil, internalIterOpts{}, iterPointKeys) } if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } - if err := iter.Close(); err != nil { + if err := iters.CloseAll(); err != nil { t.Fatalf("i=%d, j=%d: close: %v", i, j, err) } } @@ -721,11 +719,11 @@ func TestSharedTableCacheFrequentlyUsed(t *testing.T) { m := &fileMetadata{FileNum: FileNum(j)} m.InitPhysicalBacking() m.Ref() - iter1, _, err := c1.newIters(context.Background(), m, nil, internalIterOpts{}) + iter1, _, err := tableNewIters(c1.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } - iter2, _, err := c2.newIters(context.Background(), m, nil, internalIterOpts{}) + iter2, _, err := tableNewIters(c2.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } @@ -769,20 +767,20 @@ func testTableCacheEvictionsInternal(t *testing.T, rangeIter bool) { rng := rand.New(rand.NewSource(2)) for i := 0; i < N; i++ { j := rng.Intn(tableCacheTestNumTables) - var iter io.Closer + var iters iterSet var err error m := &fileMetadata{FileNum: FileNum(j)} m.InitPhysicalBacking() m.Ref() if rangeIter { - iter, err = c.newRangeKeyIter(m, keyspan.SpanIterOptions{}) + iters, err = c.newIters(context.Background(), m, nil, internalIterOpts{}, iterRangeKeys) } else { - iter, _, err = c.newIters(context.Background(), m, nil, internalIterOpts{}) + iters, err = c.newIters(context.Background(), m, nil, internalIterOpts{}, iterPointKeys) } if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } - if err := iter.Close(); err != nil { + if err := iters.CloseAll(); err != nil { t.Fatalf("i=%d, j=%d: close: %v", i, j, err) } @@ -839,12 +837,12 @@ func TestSharedTableCacheEvictions(t *testing.T) { m := &fileMetadata{FileNum: FileNum(j)} m.InitPhysicalBacking() m.Ref() - iter1, _, err := c1.newIters(context.Background(), m, nil, internalIterOpts{}) + iter1, _, err := tableNewIters(c1.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } - iter2, _, err := c2.newIters(context.Background(), m, nil, internalIterOpts{}) + iter2, _, err := tableNewIters(c2.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) if err != nil { t.Fatalf("i=%d, j=%d: find: %v", i, j, err) } @@ -907,7 +905,7 @@ func TestTableCacheIterLeak(t *testing.T) { m.InitPhysicalBacking() m.Ref() defer m.Unref() - iter, _, err := c.newIters(context.Background(), m, nil, internalIterOpts{}) + iter, _, err := tableNewIters(c.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) require.NoError(t, err) if err := c.close(); err == nil { @@ -934,7 +932,7 @@ func TestSharedTableCacheIterLeak(t *testing.T) { m.InitPhysicalBacking() m.Ref() defer m.Unref() - iter, _, err := c1.newIters(context.Background(), m, nil, internalIterOpts{}) + iter, _, err := tableNewIters(c1.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) require.NoError(t, err) if err := c1.close(); err == nil { @@ -972,13 +970,13 @@ func TestTableCacheRetryAfterFailure(t *testing.T) { m.InitPhysicalBacking() m.Ref() defer m.Unref() - if _, _, err = c.newIters(context.Background(), m, nil, internalIterOpts{}); err == nil { + if _, _, err = tableNewIters(c.newIters).TODO(context.Background(), m, nil, internalIterOpts{}); err == nil { t.Fatalf("expected failure, but found success") } require.Equal(t, "pebble: backing file 000000 error: injected error", err.Error()) fs.setOpenError(false /* enabled */) var iter internalIterator - iter, _, err = c.newIters(context.Background(), m, nil, internalIterOpts{}) + iter, _, err = tableNewIters(c.newIters).TODO(context.Background(), m, nil, internalIterOpts{}) require.NoError(t, err) require.NoError(t, iter.Close()) fs.validate(t, c, nil) @@ -1036,7 +1034,7 @@ func TestTableCacheErrorBadMagicNumber(t *testing.T) { m.InitPhysicalBacking() m.Ref() defer m.Unref() - if _, _, err = c.newIters(context.Background(), m, nil, internalIterOpts{}); err == nil { + if _, _, err = tableNewIters(c.newIters).TODO(context.Background(), m, nil, internalIterOpts{}); err == nil { t.Fatalf("expected failure, but found success") } require.Equal(t, @@ -1161,16 +1159,16 @@ func BenchmarkNewItersAlloc(b *testing.B) { d.mu.Unlock() // Open once so that the Reader is cached. - iter, _, err := d.newIters(context.Background(), m, nil, internalIterOpts{}) - require.NoError(b, iter.Close()) + iters, err := d.newIters(context.Background(), m, nil, internalIterOpts{}, iterPointKeys|iterRangeDeletions) + require.NoError(b, iters.CloseAll()) require.NoError(b, err) for i := 0; i < b.N; i++ { b.StartTimer() - iter, _, err := d.newIters(context.Background(), m, nil, internalIterOpts{}) + iters, err := d.newIters(context.Background(), m, nil, internalIterOpts{}, iterPointKeys|iterRangeDeletions) b.StopTimer() require.NoError(b, err) - require.NoError(b, iter.Close()) + require.NoError(b, iters.CloseAll()) } } diff --git a/testdata/checkpoint b/testdata/checkpoint index f815c82486..edf0f63841 100644 --- a/testdata/checkpoint +++ b/testdata/checkpoint @@ -207,29 +207,23 @@ open: db/000005.sst read-at(630, 53): db/000005.sst read-at(593, 37): db/000005.sst read-at(74, 519): db/000005.sst -read-at(47, 27): db/000005.sst -open: db/000005.sst -close: db/000005.sst open: db/000009.sst read-at(625, 53): db/000009.sst read-at(588, 37): db/000009.sst read-at(69, 519): db/000009.sst -read-at(42, 27): db/000009.sst -open: db/000009.sst -close: db/000009.sst open: db/000007.sst read-at(630, 53): db/000007.sst read-at(593, 37): db/000007.sst read-at(74, 519): db/000007.sst -read-at(47, 27): db/000007.sst -open: db/000007.sst -close: db/000007.sst +read-at(47, 27): db/000005.sst open: db/000005.sst read-at(0, 47): db/000005.sst +read-at(47, 27): db/000007.sst open: db/000007.sst read-at(0, 47): db/000007.sst create: db/000010.sst close: db/000005.sst +read-at(42, 27): db/000009.sst open: db/000009.sst read-at(0, 42): db/000009.sst close: db/000007.sst diff --git a/testdata/cleaner b/testdata/cleaner index 8b5f1ca094..a659aa978d 100644 --- a/testdata/cleaner +++ b/testdata/cleaner @@ -69,20 +69,16 @@ open: db/000005.sst read-at(607, 53): db/000005.sst read-at(570, 37): db/000005.sst read-at(79, 491): db/000005.sst -read-at(52, 27): db/000005.sst -open: db/000005.sst -close: db/000005.sst open: db/000007.sst read-at(581, 53): db/000007.sst read-at(544, 37): db/000007.sst read-at(53, 491): db/000007.sst -read-at(26, 27): db/000007.sst -open: db/000007.sst -close: db/000007.sst +read-at(52, 27): db/000005.sst open: db/000005.sst read-at(0, 52): db/000005.sst create: db/000008.sst close: db/000005.sst +read-at(26, 27): db/000007.sst open: db/000007.sst read-at(0, 26): db/000007.sst close: db/000007.sst diff --git a/testdata/event_listener b/testdata/event_listener index ad32671e53..23a37bc539 100644 --- a/testdata/event_listener +++ b/testdata/event_listener @@ -97,18 +97,14 @@ open: db/000005.sst read-at(609, 53): db/000005.sst read-at(572, 37): db/000005.sst read-at(53, 519): db/000005.sst -read-at(26, 27): db/000005.sst -open: db/000005.sst -close: db/000005.sst open: db/000008.sst read-at(609, 53): db/000008.sst read-at(572, 37): db/000008.sst read-at(53, 519): db/000008.sst -read-at(26, 27): db/000008.sst -open: db/000008.sst -close: db/000008.sst +read-at(26, 27): db/000005.sst open: db/000005.sst read-at(0, 26): db/000005.sst +read-at(26, 27): db/000008.sst open: db/000008.sst read-at(0, 26): db/000008.sst close: db/000008.sst @@ -221,7 +217,7 @@ MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 6 entries (1.1KB) hit rate: 11.1% +Block cache: 6 entries (1.1KB) hit rate: 0.0% Table cache: 1 entries (808B) hit rate: 40.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -320,7 +316,7 @@ MemTables: 1 (512KB) zombie: 1 (512KB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 12 entries (2.3KB) hit rate: 14.3% +Block cache: 12 entries (2.3KB) hit rate: 7.7% Table cache: 1 entries (808B) hit rate: 50.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 diff --git a/testdata/metrics b/testdata/metrics index 55a1beacc1..e44288bde1 100644 --- a/testdata/metrics +++ b/testdata/metrics @@ -121,7 +121,7 @@ MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.3KB) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 5 entries (1.1KB) hit rate: 42.9% +Block cache: 5 entries (1.1KB) hit rate: 33.3% Table cache: 2 entries (1.6KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -129,7 +129,7 @@ Table iters: 2 Filter utility: 0.0% Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: - pebble-compaction, non-latency: {BlockBytes:132 BlockBytesInCache:88 BlockReadDuration:10ms} + pebble-compaction, non-latency: {BlockBytes:88 BlockBytesInCache:44 BlockReadDuration:10ms} disk-usage ---- @@ -162,7 +162,7 @@ MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 2 (1.3KB) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 5 entries (1.1KB) hit rate: 42.9% +Block cache: 5 entries (1.1KB) hit rate: 33.3% Table cache: 2 entries (1.6KB) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -170,7 +170,7 @@ Table iters: 2 Filter utility: 0.0% Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: - pebble-compaction, non-latency: {BlockBytes:132 BlockBytesInCache:88 BlockReadDuration:10ms} + pebble-compaction, non-latency: {BlockBytes:88 BlockBytesInCache:44 BlockReadDuration:10ms} # Closing iter c will release one of the zombie sstables. The other # zombie sstable is still referenced by iter b. @@ -200,7 +200,7 @@ MemTables: 1 (256KB) zombie: 2 (512KB) Zombie tables: 1 (661B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 3 entries (556B) hit rate: 42.9% +Block cache: 3 entries (556B) hit rate: 33.3% Table cache: 1 entries (808B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -209,7 +209,7 @@ Filter utility: 0.0% Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:132 BlockBytesInCache:88 BlockReadDuration:10ms} + pebble-compaction, non-latency: {BlockBytes:88 BlockBytesInCache:44 BlockReadDuration:10ms} disk-usage ---- @@ -242,7 +242,7 @@ MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 0 entries (0B) hit rate: 42.9% +Block cache: 0 entries (0B) hit rate: 33.3% Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -252,7 +252,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:132 BlockBytesInCache:88 BlockReadDuration:10ms} + pebble-compaction, non-latency: {BlockBytes:88 BlockBytesInCache:44 BlockReadDuration:10ms} disk-usage ---- @@ -311,7 +311,7 @@ MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 0 entries (0B) hit rate: 42.9% +Block cache: 0 entries (0B) hit rate: 33.3% Table cache: 0 entries (0B) hit rate: 66.7% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -321,7 +321,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:132 BlockBytesInCache:88 BlockReadDuration:10ms} + pebble-compaction, non-latency: {BlockBytes:88 BlockBytesInCache:44 BlockReadDuration:10ms} additional-metrics ---- @@ -364,7 +364,7 @@ MemTables: 1 (256KB) zombie: 1 (256KB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 0 entries (0B) hit rate: 27.3% +Block cache: 0 entries (0B) hit rate: 14.3% Table cache: 0 entries (0B) hit rate: 58.3% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -374,7 +374,7 @@ Ingestions: 0 as flushable: 0 (0B in 0 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:411 BlockBytesInCache:154 BlockReadDuration:60ms} + pebble-compaction, non-latency: {BlockBytes:301 BlockBytesInCache:44 BlockReadDuration:60ms} additional-metrics ---- @@ -466,7 +466,7 @@ MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 12 entries (2.4KB) hit rate: 24.5% +Block cache: 12 entries (2.4KB) hit rate: 16.7% Table cache: 1 entries (808B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -476,7 +476,7 @@ Ingestions: 0 as flushable: 2 (2.1KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:411 BlockBytesInCache:154 BlockReadDuration:60ms} + pebble-compaction, non-latency: {BlockBytes:301 BlockBytesInCache:44 BlockReadDuration:60ms} pebble-ingest, latency: {BlockBytes:192 BlockBytesInCache:128 BlockReadDuration:10ms} batch @@ -527,7 +527,7 @@ MemTables: 1 (1.0MB) zombie: 1 (1.0MB) Zombie tables: 0 (0B) Backing tables: 0 (0B) Virtual tables: 0 (0B) -Block cache: 12 entries (2.4KB) hit rate: 24.5% +Block cache: 12 entries (2.4KB) hit rate: 16.7% Table cache: 1 entries (808B) hit rate: 60.0% Secondary cache: 0 entries (0B) hit rate: 0.0% Snapshots: 0 earliest seq num: 0 @@ -537,7 +537,7 @@ Ingestions: 0 as flushable: 2 (2.1KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:411 BlockBytesInCache:154 BlockReadDuration:60ms} + pebble-compaction, non-latency: {BlockBytes:301 BlockBytesInCache:44 BlockReadDuration:60ms} pebble-ingest, latency: {BlockBytes:192 BlockBytesInCache:128 BlockReadDuration:10ms} build ext1 @@ -612,7 +612,7 @@ Ingestions: 1 as flushable: 2 (2.1KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:411 BlockBytesInCache:154 BlockReadDuration:60ms} + pebble-compaction, non-latency: {BlockBytes:301 BlockBytesInCache:44 BlockReadDuration:60ms} pebble-ingest, latency: {BlockBytes:328 BlockBytesInCache:128 BlockReadDuration:30ms} # Virtualize a virtual sstable. @@ -712,5 +712,5 @@ Ingestions: 2 as flushable: 2 (2.1KB in 3 tables) Iter category stats: b, latency: {BlockBytes:44 BlockBytesInCache:0 BlockReadDuration:10ms} c, non-latency: {BlockBytes:44 BlockBytesInCache:44 BlockReadDuration:0s} - pebble-compaction, non-latency: {BlockBytes:941 BlockBytesInCache:640 BlockReadDuration:70ms} + pebble-compaction, non-latency: {BlockBytes:677 BlockBytesInCache:376 BlockReadDuration:70ms} pebble-ingest, latency: {BlockBytes:400 BlockBytesInCache:200 BlockReadDuration:30ms}