package compactindex

import (
	"errors"
	"fmt"
	"io"
)

// DB is a compactindex handle.
type DB struct {
	Header
	Stream io.ReaderAt
}

// Open returns a handle to access a compactindex.
//
// The provided stream must start with the Magic byte sequence.
// Tip: Use io.NewSectionReader to create aligned substreams when
// dealing with a file that contains multiple indexes.
func Open(stream io.ReaderAt) (*DB, error) {
	// Read the static 32-byte header.
	// Ignore errors if the read fails after filling the buffer (e.g. EOF).
	var fileHeader [headerSize]byte
	n, readErr := stream.ReadAt(fileHeader[:], 0)
	if n < len(fileHeader) {
		// ReadAt must return non-nil error here.
		return nil, readErr
	}
	db := new(DB)
	if err := db.Header.Load(&fileHeader); err != nil {
		return nil, err
	}
	db.Stream = stream
	return db, nil
}
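
// A minimal usage sketch (the file name and error handling below are
// illustrative assumptions, not part of this package):
//
//	f, err := os.Open("index.bin")
//	if err != nil {
//		log.Fatal(err)
//	}
//	defer f.Close()
//	db, err := compactindex.Open(f)
//	if err != nil {
//		log.Fatal(err)
//	}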

// Lookup queries for a key in the index and returns the value (offset), if any.
//
// Returns ErrNotFound if the key is unknown.
func (db *DB) Lookup(key []byte) (uint64, error) {
	bucket, err := db.LookupBucket(key)
	if err != nil {
		return 0, err
	}
	return bucket.Lookup(key)
}
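
// Callers should distinguish a missing key from an I/O failure.
// A sketch (the key is a placeholder):
//
//	value, err := db.Lookup([]byte("my-key"))
//	if errors.Is(err, compactindex.ErrNotFound) {
//		// The key is not in the index.
//	} else if err != nil {
//		// I/O or format error.
//	}
//	_ = value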

// LookupBucket returns a handle to the bucket that might contain the given key.
func (db *DB) LookupBucket(key []byte) (*Bucket, error) {
	return db.GetBucket(db.Header.BucketHash(key))
}

// GetBucket returns a handle to the bucket at the given index.
func (db *DB) GetBucket(i uint) (*Bucket, error) {
	if i >= uint(db.Header.NumBuckets) {
		return nil, fmt.Errorf("out of bounds bucket index: %d >= %d", i, db.Header.NumBuckets)
	}
	// Fill bucket handle.
	bucket := &Bucket{
		BucketDescriptor: BucketDescriptor{
			Stride:      db.entryStride(),
			OffsetWidth: intWidth(db.FileSize),
		},
	}
	// Read bucket header.
	readErr := bucket.BucketHeader.readFrom(db.Stream, i)
	if readErr != nil {
		return nil, readErr
	}
	bucket.Entries = io.NewSectionReader(db.Stream, int64(bucket.FileOffset), int64(bucket.NumEntries)*int64(bucket.Stride))
	return bucket, nil
}

// entryStride returns the size in bytes of a single bucket entry:
// the truncated entry hash followed by the value offset.
func (db *DB) entryStride() uint8 {
	hashSize := 3 // TODO remove hardcoded constant
	offsetSize := intWidth(db.FileSize)
	return uint8(hashSize) + offsetSize
}
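
// For example, assuming intWidth returns the minimal number of bytes
// needed to represent FileSize (an assumption about its contract): an
// index file just under 4 GiB needs 4-byte offsets, so the stride is
// 3 (hash) + 4 (offset) = 7 bytes per entry.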

// bucketOffset returns the file offset of the header of bucket i,
// which sits in a fixed-size table directly after the file header.
func bucketOffset(i uint) int64 {
	return headerSize + int64(i)*bucketHdrLen
}

func (b *BucketHeader) readFrom(rd io.ReaderAt, i uint) error {
	var buf [bucketHdrLen]byte
	n, err := rd.ReadAt(buf[:], bucketOffset(i))
	if n < len(buf) {
		return err
	}
	b.Load(&buf)
	return nil
}

func (b *BucketHeader) writeTo(wr io.WriterAt, i uint) error {
	var buf [bucketHdrLen]byte
	b.Store(&buf)
	_, err := wr.WriteAt(buf[:], bucketOffset(i))
	return err
}

// Bucket is a database handle pointing to a subset of the index.
type Bucket struct {
	BucketDescriptor
	Entries *io.SectionReader
}

// maxEntriesPerBucket is the hardcoded maximum permitted number of entries per bucket.
const maxEntriesPerBucket = 1 << 24 // 16 Mi entries, i.e. (16 * stride) MiB of entry data

// targetEntriesPerBucket is the average number of records in each hashtable bucket we aim for.
const targetEntriesPerBucket = 10000

// Load retrieves all entries in the hashtable.
func (b *Bucket) Load(batchSize int) ([]Entry, error) {
	if batchSize <= 0 {
		batchSize = 512 // default to a reasonable batch size
	}
	// Refuse to load oversized buckets to bound memory usage.
	if b.NumEntries > maxEntriesPerBucket {
		return nil, fmt.Errorf("refusing to load bucket with %d entries", b.NumEntries)
	}
	entries := make([]Entry, 0, b.NumEntries)
	stride := int(b.Stride)
	buf := make([]byte, batchSize*stride)
	off := int64(0)
	for {
		// Read another chunk.
		n, err := b.Entries.ReadAt(buf, off)
		// Decode all complete entries in it.
		sub := buf[:n]
		for len(sub) >= stride {
			entries = append(entries, b.unmarshalEntry(sub))
			sub = sub[stride:]
			off += int64(stride)
		}
		// Handle error.
		if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
			break
		} else if err != nil {
			return nil, err
		}
	}
	return entries, nil
}
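
// A sketch of a full scan over the index (the batch size and error
// handling are illustrative):
//
//	for i := uint(0); i < uint(db.Header.NumBuckets); i++ {
//		bucket, err := db.GetBucket(i)
//		if err != nil {
//			log.Fatal(err)
//		}
//		entries, err := bucket.Load(512)
//		if err != nil {
//			log.Fatal(err)
//		}
//		_ = entries // process entries here
//	}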

// TODO: This binary search algo is not optimized for high-latency remotes yet.

// Lookup queries for a key using binary search.
func (b *Bucket) Lookup(key []byte) (uint64, error) {
	return b.binarySearch(b.Hash(key))
}
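
// binarySearch relies on the bucket's entries being sorted in ascending
// hash order; each probe loads a single entry from the stream.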
func (b *Bucket) binarySearch(target uint64) (uint64, error) {
	low := 0
	high := int(b.NumEntries) - 1 // highest valid entry index
	for low <= high {
		median := (low + high) / 2
		entry, err := b.loadEntry(median)
		if err != nil {
			return 0, err
		}
		if entry.Hash == target {
			return entry.Value, nil
		} else if entry.Hash < target {
			low = median + 1
		} else {
			high = median - 1
		}
	}
	return 0, ErrNotFound
}

// loadEntry reads and decodes the entry at index i within the bucket.
func (b *Bucket) loadEntry(i int) (Entry, error) {
	off := int64(i) * int64(b.Stride)
	buf := make([]byte, b.Stride)
	n, err := b.Entries.ReadAt(buf, off)
	if n != len(buf) {
		return Entry{}, err
	}
	return b.unmarshalEntry(buf), nil
}

// ErrNotFound marks a missing entry.
var ErrNotFound = errors.New("not found")