diff --git a/commitment/bin_patricia_hashed.go b/commitment/bin_patricia_hashed.go index 0c4aebdb6..73863144f 100644 --- a/commitment/bin_patricia_hashed.go +++ b/commitment/bin_patricia_hashed.go @@ -23,6 +23,7 @@ import ( "fmt" "io" "math/bits" + "sort" "github.com/holiman/uint256" "github.com/ledgerwatch/log/v3" @@ -746,7 +747,7 @@ func (bph *BinPatriciaHashed) computeBinaryCellHash(cell *BinaryCell, depth int, var valBuf [128]byte valLen := cell.accountForHashing(valBuf[:], storageRootHash) if bph.trace { - fmt.Printf("accountLeafHashWithKey for [%x]=>[%x]\n", bph.hashAuxBuffer[:halfKeySize+1-depth], valBuf[:valLen]) + fmt.Printf("accountLeafHashWithKey for [%x]=>[%x]\n", cell.downHashedKey[:halfKeySize+1-depth], rlp.RlpEncodedBytes(valBuf[:valLen])) } return bph.accountLeafHashWithKey(buf, cell.downHashedKey[:halfKeySize+1-depth], rlp.RlpEncodedBytes(valBuf[:valLen])) } @@ -1274,9 +1275,19 @@ func (bph *BinPatriciaHashed) RootHash() ([]byte, error) { return hash[1:], nil // first byte is 128+hash_len } -func (bph *BinPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { +func (bph *BinPatriciaHashed) ProcessKeys(plainKeys [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { branchNodeUpdates = make(map[string]BranchData) + pks := make(map[string]int, len(plainKeys)) + hashedKeys := make([][]byte, len(plainKeys)) + for i, pk := range plainKeys { + hashedKeys[i] = hexToBin(pk) + pks[string(hashedKeys[i])] = i + } + + sort.Slice(hashedKeys, func(i, j int) bool { + return bytes.Compare(hashedKeys[i], hashedKeys[j]) < 0 + }) stagedBinaryCell := new(BinaryCell) for i, hashedKey := range hashedKeys { plainKey := plainKeys[i] @@ -1310,7 +1321,7 @@ func (bph *BinPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHa cell.setAccountFields(stagedBinaryCell.CodeHash[:], &stagedBinaryCell.Balance, stagedBinaryCell.Nonce) if bph.trace { - 
fmt.Printf("accountFn reading key %x => balance=%v nonce=%v codeHash=%x\n", cell.apk, cell.Balance.Uint64(), cell.Nonce, cell.CodeHash) + fmt.Printf("accountFn reading key %x => balance=%d nonce=%v codeHash=%x\n", cell.apk, &cell.Balance, cell.Nonce, cell.CodeHash) } } } else { @@ -1503,12 +1514,10 @@ func (bph *BinPatriciaHashed) SetState(buf []byte) error { return err } - bph.currentKeyLen = int(s.CurrentKeyLen) bph.rootChecked = s.RootChecked bph.rootTouched = s.RootTouched bph.rootPresent = s.RootPresent - copy(bph.currentKey[:], s.CurrentKey[:]) copy(bph.depths[:], s.Depths[:]) copy(bph.branchBefore[:], s.BranchBefore[:]) copy(bph.touchMap[:], s.TouchMap[:]) @@ -1517,16 +1526,25 @@ func (bph *BinPatriciaHashed) SetState(buf []byte) error { return nil } -func (bph *BinPatriciaHashed) ProcessUpdates(plainKeys, hashedKeys [][]byte, updates []Update) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { +func (bph *BinPatriciaHashed) ProcessUpdates(plainKeys [][]byte, updates []Update) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { branchNodeUpdates = make(map[string]BranchData) + for i, pk := range plainKeys { + updates[i].hashedKey = hexToBin(pk) + updates[i].plainKey = pk + } + + sort.Slice(updates, func(i, j int) bool { + return bytes.Compare(updates[i].hashedKey, updates[j].hashedKey) < 0 + }) + for i, plainKey := range plainKeys { - hashedKey := hashedKeys[i] + update := updates[i] if bph.trace { - fmt.Printf("plainKey=[%x], hashedKey=[%x], currentKey=[%x]\n", plainKey, hashedKey, bph.currentKey[:bph.currentKeyLen]) + fmt.Printf("plainKey=[%x], hashedKey=[%x], currentKey=[%x]\n", update.plainKey, update.hashedKey, bph.currentKey[:bph.currentKeyLen]) } // Keep folding until the currentKey is the prefix of the key we modify - for bph.needFolding(hashedKey) { + for bph.needFolding(update.hashedKey) { if branchData, updateKey, err := bph.fold(); err != nil { return nil, nil, fmt.Errorf("fold: %w", err) } else if 
branchData != nil { @@ -1534,27 +1552,26 @@ func (bph *BinPatriciaHashed) ProcessUpdates(plainKeys, hashedKeys [][]byte, upd } } // Now unfold until we step on an empty cell - for unfolding := bph.needUnfolding(hashedKey); unfolding > 0; unfolding = bph.needUnfolding(hashedKey) { - if err := bph.unfold(hashedKey, unfolding); err != nil { + for unfolding := bph.needUnfolding(update.hashedKey); unfolding > 0; unfolding = bph.needUnfolding(update.hashedKey) { + if err := bph.unfold(update.hashedKey, unfolding); err != nil { return nil, nil, fmt.Errorf("unfold: %w", err) } } - update := updates[i] // Update the cell if update.Flags == DeleteUpdate { - bph.deleteBinaryCell(hashedKey) + bph.deleteBinaryCell(update.hashedKey) if bph.trace { - fmt.Printf("key %x deleted\n", plainKey) + fmt.Printf("key %x deleted\n", update.plainKey) } } else { - cell := bph.updateBinaryCell(plainKey, hashedKey) + cell := bph.updateBinaryCell(update.plainKey, update.hashedKey) if bph.trace { fmt.Printf("accountFn updated key %x =>", plainKey) } if update.Flags&BalanceUpdate != 0 { if bph.trace { - fmt.Printf(" balance=%d", update.Balance.Uint64()) + fmt.Printf(" balance=%d", &update.Balance) } cell.Balance.Set(&update.Balance) } @@ -1603,13 +1620,13 @@ func (bph *BinPatriciaHashed) hashAndNibblizeKey2(key []byte) []byte { //nolint bph.keccak.Reset() bph.keccak.Write(key[:length.Addr]) - copy(hashedKey[:length.Hash], bph.keccak.Sum(nil)) + bph.keccak.Read(hashedKey[:length.Hash]) if len(key[length.Addr:]) > 0 { hashedKey = append(hashedKey, make([]byte, length.Hash)...) 
bph.keccak.Reset() bph.keccak.Write(key[length.Addr:]) - copy(hashedKey[length.Hash:], bph.keccak.Sum(nil)) + bph.keccak.Read(hashedKey[length.Hash:]) } nibblized := make([]byte, len(hashedKey)*2) diff --git a/commitment/bin_patricia_hashed_test.go b/commitment/bin_patricia_hashed_test.go index 2c16bcbf5..1b406ce14 100644 --- a/commitment/bin_patricia_hashed_test.go +++ b/commitment/bin_patricia_hashed_test.go @@ -20,7 +20,7 @@ func Test_BinPatriciaTrie_UniqueRepresentation(t *testing.T) { trie := NewBinPatriciaHashed(length.Addr, ms.branchFn, ms.accountFn, ms.storageFn) trieBatch := NewBinPatriciaHashed(length.Addr, ms2.branchFn, ms2.accountFn, ms2.storageFn) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("e25652aaa6b9417973d325f9a1246b48ff9420bf", 12). Balance("cdd0a12034e978f7eccda72bd1bd89a8142b704e", 120000). Balance("5bb6abae12c87592b940458437526cb6cad60d50", 170). @@ -43,13 +43,13 @@ func Test_BinPatriciaTrie_UniqueRepresentation(t *testing.T) { fmt.Println("1. Running sequential updates over the bin trie") var seqHash []byte for i := 0; i < len(updates); i++ { - sh, branchNodeUpdates, err := trie.ReviewKeys(plainKeys[i:i+1], hashedKeys[i:i+1]) + sh, branchNodeUpdates, err := trie.ProcessKeys(plainKeys[i : i+1]) require.NoError(t, err) require.Len(t, sh, length.Hash) ms.applyBranchNodeUpdates(branchNodeUpdates) // WARN! provided sequential branch updates are incorrect - lead to deletion of prefixes (afterMap is zero) // while root hashes are equal - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) fmt.Printf("h=%x\n", sh) seqHash = sh @@ -57,11 +57,11 @@ func Test_BinPatriciaTrie_UniqueRepresentation(t *testing.T) { fmt.Println("2. 
Running batch updates over the bin trie") - batchHash, branchBatchUpdates, err := trieBatch.ReviewKeys(plainKeys, hashedKeys) + batchHash, branchBatchUpdates, err := trieBatch.ProcessKeys(plainKeys) require.NoError(t, err) ms2.applyBranchNodeUpdates(branchBatchUpdates) - renderUpdates(branchBatchUpdates) + //renderUpdates(branchBatchUpdates) require.EqualValues(t, seqHash, batchHash) // require.EqualValues(t, seqHash, batchHash) @@ -88,7 +88,7 @@ func Test_BinPatriciaHashed_UniqueRepresentation(t *testing.T) { ms := NewMockState(t) ms2 := NewMockState(t) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("f5", 4). Balance("ff", 900234). Balance("04", 1233). @@ -122,12 +122,12 @@ func Test_BinPatriciaHashed_UniqueRepresentation(t *testing.T) { t.Fatal(err) } - sequentialRoot, branchNodeUpdates, err := trieOne.ReviewKeys(plainKeys[i:i+1], hashedKeys[i:i+1]) + sequentialRoot, branchNodeUpdates, err := trieOne.ProcessKeys(plainKeys[i : i+1]) require.NoError(t, err) roots = append(roots, sequentialRoot) ms.applyBranchNodeUpdates(branchNodeUpdates) - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) } err := ms2.applyPlainUpdates(plainKeys, updates) @@ -135,9 +135,9 @@ func Test_BinPatriciaHashed_UniqueRepresentation(t *testing.T) { fmt.Printf("\n2. Trie batch update generated following branch updates\n") // batch update - batchRoot, branchNodeUpdatesTwo, err := trieTwo.ReviewKeys(plainKeys, hashedKeys) + batchRoot, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) require.NoError(t, err) - renderUpdates(branchNodeUpdatesTwo) + //renderUpdates(branchNodeUpdatesTwo) fmt.Printf("\n sequential roots:\n") for i, rh := range roots { @@ -154,7 +154,7 @@ func Test_BinPatriciaHashed_EmptyState(t *testing.T) { ms := NewMockState(t) hph := NewBinPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) hph.SetTrace(false) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). 
+ plainKeys, updates := NewUpdateBuilder(). Balance("00", 4). Balance("01", 5). Balance("02", 6). @@ -171,7 +171,7 @@ func Test_BinPatriciaHashed_EmptyState(t *testing.T) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - firstRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + firstRootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) t.Logf("root hash %x\n", firstRootHash) @@ -179,48 +179,48 @@ func Test_BinPatriciaHashed_EmptyState(t *testing.T) { ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Printf("1. Generated updates\n") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) // More updates hph.Reset() hph.SetTrace(false) - plainKeys, hashedKeys, updates = NewUpdateBuilder(). + plainKeys, updates = NewUpdateBuilder(). Storage("03", "58", "050505"). Build() err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - secondRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + secondRootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEqualValues(t, firstRootHash, secondRootHash) ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Printf("2. Generated single update\n") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) // More updates hph.Reset() hph.SetTrace(false) - plainKeys, hashedKeys, updates = NewUpdateBuilder(). + plainKeys, updates = NewUpdateBuilder(). Storage("03", "58", "070807"). Build() err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - thirdRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + thirdRootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEqualValues(t, secondRootHash, thirdRootHash) ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Printf("3. 
Generated single update\n") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) } func Test_BinPatriciaHashed_EmptyUpdateState(t *testing.T) { ms := NewMockState(t) hph := NewBinPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) hph.SetTrace(false) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("00", 4). Nonce("00", 246462653). Balance("01", 5). @@ -233,24 +233,24 @@ func Test_BinPatriciaHashed_EmptyUpdateState(t *testing.T) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - hashBeforeEmptyUpdate, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + hashBeforeEmptyUpdate, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEmpty(t, hashBeforeEmptyUpdate) ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Println("1. Updates applied") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) // generate empty updates and do NOT reset tree hph.SetTrace(true) - plainKeys, hashedKeys, updates = NewUpdateBuilder().Build() + plainKeys, updates = NewUpdateBuilder().Build() err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - hashAfterEmptyUpdate, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + hashAfterEmptyUpdate, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) ms.applyBranchNodeUpdates(branchNodeUpdates) diff --git a/commitment/commitment.go b/commitment/commitment.go index a51cfcb59..f6e74bb63 100644 --- a/commitment/commitment.go +++ b/commitment/commitment.go @@ -24,9 +24,10 @@ type Trie interface { // Reset Drops everything from the trie Reset() - ReviewKeys(pk, hk [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) + // Reads updates from storage + ProcessKeys(pk [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) - ProcessUpdates(pk, hk [][]byte, updates []Update) 
(rootHash []byte, branchNodeUpdates map[string]BranchData, err error) + ProcessUpdates(pk [][]byte, updates []Update) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) ResetFns( branchFn func(prefix []byte) ([]byte, error), @@ -70,6 +71,9 @@ const ( type BranchData []byte func (branchData BranchData) String() string { + if len(branchData) == 0 { + return "" + } touchMap := binary.BigEndian.Uint16(branchData[0:]) afterMap := binary.BigEndian.Uint16(branchData[2:]) pos := 4 @@ -468,10 +472,10 @@ func NewHexBranchMerger(capacity uint64) *BranchMerger { // MergeHexBranches combines two branchData, number 2 coming after (and potentially shadowing) number 1 func (m *BranchMerger) Merge(branch1 BranchData, branch2 BranchData) (BranchData, error) { - if branch2 == nil { + if len(branch2) == 0 { return branch1, nil } - if branch1 == nil { + if len(branch1) == 0 { return branch2, nil } diff --git a/commitment/hex_patricia_hashed.go b/commitment/hex_patricia_hashed.go index 3ba53a417..c185f75a3 100644 --- a/commitment/hex_patricia_hashed.go +++ b/commitment/hex_patricia_hashed.go @@ -21,9 +21,11 @@ import ( "encoding/binary" "encoding/hex" "fmt" + "github.com/ledgerwatch/erigon-lib/common/hexutility" "hash" "io" "math/bits" + "sort" "strings" "github.com/holiman/uint256" @@ -81,20 +83,6 @@ type HexPatriciaHashed struct { auxBuffer *bytes.Buffer // auxiliary buffer used during branch updates encoding } -// represents state of the tree -type state struct { - Root []byte // encoded root cell - Depths [128]int // For each row, the depth of cells in that row - TouchMap [128]uint16 // For each row, bitmap of cells that were either present before modification, or modified or deleted - AfterMap [128]uint16 // For each row, bitmap of cells that were present after modification - BranchBefore [128]bool // For each row, whether there was a branch node in the database loaded in unfold - CurrentKey [128]byte // For each row indicates which column is currently selected - 
CurrentKeyLen int8 - RootChecked bool // Set to false if it is not known whether the root is empty, set to true if it is checked - RootTouched bool - RootPresent bool -} - func NewHexPatriciaHashed(accountKeyLen int, branchFn func(prefix []byte) ([]byte, error), accountFn func(plainKey []byte, cell *Cell) error, @@ -131,11 +119,12 @@ type Cell struct { } var ( - EmptyRootHash, _ = hex.DecodeString("56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421") - EmptyCodeHash, _ = hex.DecodeString("c5d2460186f7233c927e7db2dcc703c0e500b653ca82273b7bfad8045d85a470") + EmptyRootHash = hexutility.MustDecodeHex("56e81f171bcc55a6ff8345e692c0f86e5b48e01b996cadc001622fb5e363b421") + EmptyCodeHash = hexutility.MustDecodeHex("c5d2460186f7233c927e7db2dcc703c0e500b653ca82273b7bfad8045d85a470") + EmptyCodeHashArray = *(*[length.Hash]byte)(EmptyCodeHash) ) -func (cell *Cell) fillEmpty() { +func (cell *Cell) reset() { cell.apl = 0 cell.spl = 0 cell.downHashedLen = 0 @@ -393,6 +382,9 @@ func (cell *Cell) setStorage(value []byte) { } func (cell *Cell) setAccountFields(codeHash []byte, balance *uint256.Int, nonce uint64) { + if len(codeHash) == 0 { + codeHash = common.Copy(EmptyCodeHash) + } copy(cell.CodeHash[:], codeHash) cell.Balance.SetBytes(balance.Bytes()) @@ -722,7 +714,7 @@ func (hph *HexPatriciaHashed) computeCellHash(cell *Cell, depth int, buf []byte) var valBuf [128]byte valLen := cell.accountForHashing(valBuf[:], storageRootHash) if hph.trace { - fmt.Printf("accountLeafHashWithKey for [%x]=>[%x]\n", hph.hashAuxBuffer[:65-depth], valBuf[:valLen]) + fmt.Printf("accountLeafHashWithKey for [%x]=>[%x]\n", cell.downHashedKey[:65-depth], rlp.RlpEncodedBytes(valBuf[:valLen])) } return hph.accountLeafHashWithKey(buf, cell.downHashedKey[:65-depth], rlp.RlpEncodedBytes(valBuf[:valLen])) } @@ -889,12 +881,13 @@ func (hph *HexPatriciaHashed) unfold(hashedKey []byte, unfolding int) error { } row := hph.activeRows for i := 0; i < 16; i++ { - hph.grid[row][i].fillEmpty() + 
hph.grid[row][i].reset() } hph.touchMap[row] = 0 hph.afterMap[row] = 0 hph.branchBefore[row] = false if upCell.downHashedLen == 0 { + // root unfolded depth = upDepth + 1 if unfolded, err := hph.unfoldBranchNode(row, touched && !present /* deleted */, depth); err != nil { return err @@ -1197,12 +1190,10 @@ func (hph *HexPatriciaHashed) deleteCell(hashedKey []byte) { } } } - cell.extLen = 0 - cell.Balance.Clear() - copy(cell.CodeHash[:], EmptyCodeHash) - cell.Nonce = 0 + cell.reset() } +// fetches cell by key and set touch/after maps func (hph *HexPatriciaHashed) updateCell(plainKey, hashedKey []byte) *Cell { var cell *Cell var col, depth int @@ -1234,6 +1225,7 @@ func (hph *HexPatriciaHashed) updateCell(plainKey, hashedKey []byte) *Cell { if len(hashedKey) == 2*length.Hash { // set account key cell.apl = len(plainKey) copy(cell.apk[:], plainKey) + copy(cell.CodeHash[:], EmptyCodeHash) } else { // set storage key cell.spl = len(plainKey) copy(cell.spk[:], plainKey) @@ -1242,19 +1234,35 @@ func (hph *HexPatriciaHashed) updateCell(plainKey, hashedKey []byte) *Cell { } func (hph *HexPatriciaHashed) RootHash() ([]byte, error) { - hash, err := hph.computeCellHash(&hph.root, 0, nil) + rh, err := hph.computeCellHash(&hph.root, 0, nil) if err != nil { return nil, err } - return hash[1:], nil // first byte is 128+hash_len + //// set root hash field if it's not a cell to correctly encode trie state + //if hph.root.apl == 0 && hph.root.spl == 0 && !bytes.Equal(hph.root.h[:], rh) { + // copy(hph.root.h[:], rh[1:]) + // hph.root.hl = len(rh) - 1 + //} + return rh[1:], nil // first byte is 128+hash_len } -func (hph *HexPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { +func (hph *HexPatriciaHashed) ProcessKeys(plainKeys [][]byte) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { branchNodeUpdates = make(map[string]BranchData) + pks := make(map[string]int, len(plainKeys)) + 
hashedKeys := make([][]byte, len(plainKeys)) + for i, pk := range plainKeys { + hashedKeys[i] = hph.hashAndNibblizeKey(pk) + pks[string(hashedKeys[i])] = i + } + + sort.Slice(hashedKeys, func(i, j int) bool { + return bytes.Compare(hashedKeys[i], hashedKeys[j]) < 0 + }) + stagedCell := new(Cell) - for i, hashedKey := range hashedKeys { - plainKey := plainKeys[i] + for _, hashedKey := range hashedKeys { + plainKey := plainKeys[pks[string(hashedKey)]] if hph.trace { fmt.Printf("plainKey=[%x], hashedKey=[%x], currentKey=[%x]\n", plainKey, hashedKey, hph.currentKey[:hph.currentKeyLen]) } @@ -1274,7 +1282,7 @@ func (hph *HexPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHa } // Update the cell - stagedCell.fillEmpty() + stagedCell.reset() if len(plainKey) == hph.accountKeyLen { if err := hph.accountFn(plainKey, stagedCell); err != nil { return nil, nil, fmt.Errorf("accountFn for key %x failed: %w", plainKey, err) @@ -1284,7 +1292,7 @@ func (hph *HexPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHa cell.setAccountFields(stagedCell.CodeHash[:], &stagedCell.Balance, stagedCell.Nonce) if hph.trace { - fmt.Printf("accountFn reading key %x => balance=%v nonce=%v codeHash=%x\n", cell.apk, cell.Balance.Uint64(), cell.Nonce, cell.CodeHash) + fmt.Printf("accountFn update key %x => balance=%d nonce=%v codeHash=%x\n", cell.apk, &cell.Balance, cell.Nonce, cell.CodeHash) } } } else { @@ -1322,6 +1330,94 @@ func (hph *HexPatriciaHashed) ReviewKeys(plainKeys, hashedKeys [][]byte) (rootHa return rootHash, branchNodeUpdates, nil } +func (hph *HexPatriciaHashed) ProcessUpdates(plainKeys [][]byte, updates []Update) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { + branchNodeUpdates = make(map[string]BranchData) + + for i, pk := range plainKeys { + updates[i].hashedKey = hph.hashAndNibblizeKey(pk) + updates[i].plainKey = pk + } + + sort.Slice(updates, func(i, j int) bool { + return bytes.Compare(updates[i].hashedKey, 
updates[j].hashedKey) < 0 + }) + + for i, update := range updates { + // if hph.trace { + fmt.Printf("(%d/%d) key=[%x] %s hashedKey=[%x] currentKey=[%x]\n", + i+1, len(updates), update.plainKey, update.String(), update.hashedKey, hph.currentKey[:hph.currentKeyLen]) + // } + // Keep folding until the currentKey is the prefix of the key we modify + for hph.needFolding(update.hashedKey) { + if branchData, updateKey, err := hph.fold(); err != nil { + return nil, nil, fmt.Errorf("fold: %w", err) + } else if branchData != nil { + branchNodeUpdates[string(updateKey)] = branchData + } + } + // Now unfold until we step on an empty cell + for unfolding := hph.needUnfolding(update.hashedKey); unfolding > 0; unfolding = hph.needUnfolding(update.hashedKey) { + if err := hph.unfold(update.hashedKey, unfolding); err != nil { + return nil, nil, fmt.Errorf("unfold: %w", err) + } + } + + // Update the cell + if update.Flags == DeleteUpdate { + hph.deleteCell(update.hashedKey) + if hph.trace { + fmt.Printf("delete cell %x hash %x\n", update.plainKey, update.hashedKey) + } + } else { + cell := hph.updateCell(update.plainKey, update.hashedKey) + if hph.trace && len(update.plainKey) == hph.accountKeyLen { + fmt.Printf("accountFn updated key %x =>", update.plainKey) + } + if update.Flags&BalanceUpdate != 0 { + if hph.trace { + fmt.Printf(" balance=%d", &update.Balance) + } + cell.Balance.Set(&update.Balance) + } + if update.Flags&NonceUpdate != 0 { + if hph.trace { + fmt.Printf(" nonce=%d", update.Nonce) + } + cell.Nonce = update.Nonce + } + if update.Flags&CodeUpdate != 0 { + if hph.trace { + fmt.Printf(" codeHash=%x", update.CodeHashOrStorage) + } + copy(cell.CodeHash[:], update.CodeHashOrStorage[:update.ValLength]) + } + if hph.trace { + fmt.Printf("\n") + } + if update.Flags&StorageUpdate != 0 { + cell.setStorage(update.CodeHashOrStorage[:update.ValLength]) + if hph.trace { + fmt.Printf("\rstorage set %x => %x\n", update.plainKey, update.CodeHashOrStorage[:update.ValLength]) + } + } 
+ } + } + // Folding everything up to the root + for hph.activeRows > 0 { + if branchData, updateKey, err := hph.fold(); err != nil { + return nil, nil, fmt.Errorf("final fold: %w", err) + } else if branchData != nil { + branchNodeUpdates[string(updateKey)] = branchData + } + } + + rootHash, err = hph.RootHash() + if err != nil { + return nil, branchNodeUpdates, fmt.Errorf("root hash evaluation failed: %w", err) + } + return rootHash, branchNodeUpdates, nil +} + func (hph *HexPatriciaHashed) SetTrace(trace bool) { hph.trace = trace } func (hph *HexPatriciaHashed) Variant() TrieVariant { return VariantHexPatriciaTrie } @@ -1360,6 +1456,18 @@ var ( stateRootTouched stateRootFlag = 4 ) +// represents state of the tree +type state struct { + Root []byte // encoded root cell + Depths [128]int // For each row, the depth of cells in that row + TouchMap [128]uint16 // For each row, bitmap of cells that were either present before modification, or modified or deleted + AfterMap [128]uint16 // For each row, bitmap of cells that were present after modification + BranchBefore [128]bool // For each row, whether there was a branch node in the database loaded in unfold + RootChecked bool // Set to false if it is not known whether the root is empty, set to true if it is checked + RootTouched bool + RootPresent bool +} + func (s *state) Encode(buf []byte) ([]byte, error) { var rootFlags stateRootFlag if s.RootPresent { @@ -1373,15 +1481,9 @@ func (s *state) Encode(buf []byte) ([]byte, error) { } ee := bytes.NewBuffer(buf) - if err := binary.Write(ee, binary.BigEndian, s.CurrentKeyLen); err != nil { - return nil, fmt.Errorf("encode currentKeyLen: %w", err) - } if err := binary.Write(ee, binary.BigEndian, int8(rootFlags)); err != nil { return nil, fmt.Errorf("encode rootFlags: %w", err) } - if n, err := ee.Write(s.CurrentKey[:]); err != nil || n != len(s.CurrentKey) { - return nil, fmt.Errorf("encode currentKey: %w", err) - } if err := binary.Write(ee, binary.BigEndian, 
uint16(len(s.Root))); err != nil { return nil, fmt.Errorf("encode root len: %w", err) } @@ -1424,9 +1526,6 @@ func (s *state) Encode(buf []byte) ([]byte, error) { func (s *state) Decode(buf []byte) error { aux := bytes.NewBuffer(buf) - if err := binary.Read(aux, binary.BigEndian, &s.CurrentKeyLen); err != nil { - return fmt.Errorf("currentKeyLen: %w", err) - } var rootFlags stateRootFlag if err := binary.Read(aux, binary.BigEndian, &rootFlags); err != nil { return fmt.Errorf("rootFlags: %w", err) @@ -1441,9 +1540,7 @@ func (s *state) Decode(buf []byte) error { if rootFlags&stateRootChecked != 0 { s.RootChecked = true } - if n, err := aux.Read(s.CurrentKey[:]); err != nil || n != 128 { - return fmt.Errorf("currentKey: %w", err) - } + var rootSize uint16 if err := binary.Read(aux, binary.BigEndian, &rootSize); err != nil { return fmt.Errorf("root size: %w", err) @@ -1486,90 +1583,105 @@ func (s *state) Decode(buf []byte) error { return nil } -func (c *Cell) bytes() []byte { +func (c *Cell) Encode() []byte { var pos = 1 - size := 1 + c.hl + 1 + c.apl + c.spl + 1 + c.downHashedLen + 1 + c.extLen + 1 // max size + size := pos + 5 + c.hl + c.apl + c.spl + c.downHashedLen + c.extLen // max size buf := make([]byte, size) var flags uint8 if c.hl != 0 { - flags |= 1 + flags |= cellFlagHash buf[pos] = byte(c.hl) pos++ copy(buf[pos:pos+c.hl], c.h[:]) pos += c.hl } if c.apl != 0 { - flags |= 2 - buf[pos] = byte(c.hl) + flags |= cellFlagAccount + buf[pos] = byte(c.apl) pos++ copy(buf[pos:pos+c.apl], c.apk[:]) pos += c.apl } if c.spl != 0 { - flags |= 4 + flags |= cellFlagStorage buf[pos] = byte(c.spl) pos++ copy(buf[pos:pos+c.spl], c.spk[:]) pos += c.spl } if c.downHashedLen != 0 { - flags |= 8 + flags |= cellFlagDownHash buf[pos] = byte(c.downHashedLen) pos++ - copy(buf[pos:pos+c.downHashedLen], c.downHashedKey[:]) + copy(buf[pos:pos+c.downHashedLen], c.downHashedKey[:c.downHashedLen]) pos += c.downHashedLen } if c.extLen != 0 { - flags |= 16 + flags |= cellFlagExtension 
buf[pos] = byte(c.extLen) pos++ - copy(buf[pos:pos+c.downHashedLen], c.downHashedKey[:]) - //pos += c.downHashedLen + copy(buf[pos:pos+c.extLen], c.extension[:]) + pos += c.extLen //nolint + } + if c.Delete { + flags |= cellFlagDelete } buf[0] = flags return buf } -func (c *Cell) decodeBytes(buf []byte) error { +const ( + cellFlagHash = uint8(1 << iota) + cellFlagAccount + cellFlagStorage + cellFlagDownHash + cellFlagExtension + cellFlagDelete +) + +func (c *Cell) Decode(buf []byte) error { if len(buf) < 1 { return fmt.Errorf("invalid buffer size to contain Cell (at least 1 byte expected)") } - c.fillEmpty() + c.reset() var pos int flags := buf[pos] pos++ - if flags&1 != 0 { + if flags&cellFlagHash != 0 { c.hl = int(buf[pos]) pos++ copy(c.h[:], buf[pos:pos+c.hl]) pos += c.hl } - if flags&2 != 0 { + if flags&cellFlagAccount != 0 { c.apl = int(buf[pos]) pos++ copy(c.apk[:], buf[pos:pos+c.apl]) pos += c.apl } - if flags&4 != 0 { + if flags&cellFlagStorage != 0 { c.spl = int(buf[pos]) pos++ copy(c.spk[:], buf[pos:pos+c.spl]) pos += c.spl } - if flags&8 != 0 { + if flags&cellFlagDownHash != 0 { c.downHashedLen = int(buf[pos]) pos++ copy(c.downHashedKey[:], buf[pos:pos+c.downHashedLen]) pos += c.downHashedLen } - if flags&16 != 0 { + if flags&cellFlagExtension != 0 { c.extLen = int(buf[pos]) pos++ copy(c.extension[:], buf[pos:pos+c.extLen]) - //pos += c.extLen + pos += c.extLen //nolint + } + if flags&cellFlagDelete != 0 { + c.Delete = true } return nil } @@ -1577,15 +1689,15 @@ func (c *Cell) decodeBytes(buf []byte) error { // Encode current state of hph into bytes func (hph *HexPatriciaHashed) EncodeCurrentState(buf []byte) ([]byte, error) { s := state{ - CurrentKeyLen: int8(hph.currentKeyLen), - RootChecked: hph.rootChecked, - RootTouched: hph.rootTouched, - RootPresent: hph.rootPresent, - Root: make([]byte, 0), + RootChecked: hph.rootChecked, + RootTouched: hph.rootTouched, + RootPresent: hph.rootPresent, + } + if hph.currentKeyLen > 0 { + panic("currentKeyLen > 0") 
} - s.Root = hph.root.bytes() - copy(s.CurrentKey[:], hph.currentKey[:]) + s.Root = hph.root.Encode() copy(s.Depths[:], hph.depths[:]) copy(s.BranchBefore[:], hph.branchBefore[:]) copy(s.TouchMap[:], hph.touchMap[:]) @@ -1596,6 +1708,23 @@ func (hph *HexPatriciaHashed) EncodeCurrentState(buf []byte) ([]byte, error) { // buf expected to be encoded hph state. Decode state and set up hph to that state. func (hph *HexPatriciaHashed) SetState(buf []byte) error { + if buf == nil { + // reset state to 'empty' + hph.currentKeyLen = 0 + hph.rootChecked = false + hph.rootTouched = false + hph.rootPresent = false + hph.activeRows = 0 + + for i := 0; i < len(hph.depths); i++ { + hph.depths[i] = 0 + hph.branchBefore[i] = false + hph.touchMap[i] = 0 + hph.afterMap[i] = 0 + } + hph.root = Cell{} + return nil + } if hph.activeRows != 0 { return fmt.Errorf("has active rows, could not reset state") } @@ -1607,21 +1736,28 @@ func (hph *HexPatriciaHashed) SetState(buf []byte) error { hph.Reset() - if err := hph.root.decodeBytes(s.Root); err != nil { + if err := hph.root.Decode(s.Root); err != nil { return err } - - hph.currentKeyLen = int(s.CurrentKeyLen) hph.rootChecked = s.RootChecked hph.rootTouched = s.RootTouched hph.rootPresent = s.RootPresent - copy(hph.currentKey[:], s.CurrentKey[:]) copy(hph.depths[:], s.Depths[:]) copy(hph.branchBefore[:], s.BranchBefore[:]) copy(hph.touchMap[:], s.TouchMap[:]) copy(hph.afterMap[:], s.AfterMap[:]) + if hph.root.apl > 0 { + if err := hph.accountFn(hph.root.apk[:hph.root.apl], &hph.root); err != nil { + return err + } + } else if hph.root.spl > 0 { + if err := hph.storageFn(hph.root.spk[:hph.root.spl], &hph.root); err != nil { + return err + } + } + return nil } @@ -1717,100 +1853,24 @@ func commonPrefixLen(b1, b2 []byte) int { return i } -func (hph *HexPatriciaHashed) ProcessUpdates(plainKeys, hashedKeys [][]byte, updates []Update) (rootHash []byte, branchNodeUpdates map[string]BranchData, err error) { - branchNodeUpdates = 
make(map[string]BranchData) - - for i, plainKey := range plainKeys { - hashedKey := hashedKeys[i] - if hph.trace { - fmt.Printf("plainKey=[%x], hashedKey=[%x], currentKey=[%x]\n", plainKey, hashedKey, hph.currentKey[:hph.currentKeyLen]) - } - // Keep folding until the currentKey is the prefix of the key we modify - for hph.needFolding(hashedKey) { - if branchData, updateKey, err := hph.fold(); err != nil { - return nil, nil, fmt.Errorf("fold: %w", err) - } else if branchData != nil { - branchNodeUpdates[string(updateKey)] = branchData - } - } - // Now unfold until we step on an empty cell - for unfolding := hph.needUnfolding(hashedKey); unfolding > 0; unfolding = hph.needUnfolding(hashedKey) { - if err := hph.unfold(hashedKey, unfolding); err != nil { - return nil, nil, fmt.Errorf("unfold: %w", err) - } - } - - update := updates[i] - // Update the cell - if update.Flags == DeleteUpdate { - hph.deleteCell(hashedKey) - if hph.trace { - fmt.Printf("key %x deleted\n", plainKey) - } - } else { - cell := hph.updateCell(plainKey, hashedKey) - if hph.trace { - fmt.Printf("accountFn updated key %x =>", plainKey) - } - if update.Flags&BalanceUpdate != 0 { - if hph.trace { - fmt.Printf(" balance=%d", update.Balance.Uint64()) - } - cell.Balance.Set(&update.Balance) - } - if update.Flags&NonceUpdate != 0 { - if hph.trace { - fmt.Printf(" nonce=%d", update.Nonce) - } - cell.Nonce = update.Nonce - } - if update.Flags&CodeUpdate != 0 { - if hph.trace { - fmt.Printf(" codeHash=%x", update.CodeHashOrStorage) - } - copy(cell.CodeHash[:], update.CodeHashOrStorage[:]) - } - if hph.trace { - fmt.Printf("\n") - } - if update.Flags&StorageUpdate != 0 { - cell.setStorage(update.CodeHashOrStorage[:update.ValLength]) - if hph.trace { - fmt.Printf("\rstorageFn filled key %x => %x\n", plainKey, update.CodeHashOrStorage[:update.ValLength]) - } - } - } - } - // Folding everything up to the root - for hph.activeRows > 0 { - if branchData, updateKey, err := hph.fold(); err != nil { - return nil, 
nil, fmt.Errorf("final fold: %w", err) - } else if branchData != nil { - branchNodeUpdates[string(updateKey)] = branchData - } - } - - rootHash, err = hph.RootHash() - if err != nil { - return nil, branchNodeUpdates, fmt.Errorf("root hash evaluation failed: %w", err) - } - return rootHash, branchNodeUpdates, nil -} - // nolint // Hashes provided key and expands resulting hash into nibbles (each byte split into two nibbles by 4 bits) func (hph *HexPatriciaHashed) hashAndNibblizeKey(key []byte) []byte { hashedKey := make([]byte, length.Hash) hph.keccak.Reset() - hph.keccak.Write(key[:length.Addr]) - copy(hashedKey[:length.Hash], hph.keccak.Sum(nil)) + fp := length.Addr + if len(key) < length.Addr { + fp = len(key) + } + hph.keccak.Write(key[:fp]) + hph.keccak.Read(hashedKey[:length.Hash]) - if len(key[length.Addr:]) > 0 { + if len(key[fp:]) > 0 { hashedKey = append(hashedKey, make([]byte, length.Hash)...) hph.keccak.Reset() - hph.keccak.Write(key[length.Addr:]) - copy(hashedKey[length.Hash:], hph.keccak.Sum(nil)) + hph.keccak.Write(key[fp:]) + hph.keccak.Read(hashedKey[length.Hash:]) } nibblized := make([]byte, len(hashedKey)*2) @@ -1853,35 +1913,109 @@ func (uf UpdateFlags) String() string { } type Update struct { + hashedKey []byte + plainKey []byte Flags UpdateFlags Balance uint256.Int Nonce uint64 - CodeHashOrStorage [length.Hash]byte ValLength int + CodeHashOrStorage [length.Hash]byte } -func (u *Update) DecodeForStorage(enc []byte) { - u.Nonce = 0 +func (u *Update) Reset() { + u.Flags = 0 u.Balance.Clear() + u.Nonce = 0 + u.ValLength = 0 copy(u.CodeHashOrStorage[:], EmptyCodeHash) +} + +func (u *Update) Merge(b *Update) { + if b.Flags == DeleteUpdate { + u.Flags = DeleteUpdate + return + } + if b.Flags&BalanceUpdate != 0 { + u.Flags |= BalanceUpdate + u.Balance.Set(&b.Balance) + } + if b.Flags&NonceUpdate != 0 { + u.Flags |= NonceUpdate + u.Nonce = b.Nonce + } + if b.Flags&CodeUpdate != 0 { + u.Flags |= CodeUpdate + copy(u.CodeHashOrStorage[:], 
b.CodeHashOrStorage[:]) + u.ValLength = b.ValLength + } + if b.Flags&StorageUpdate != 0 { + u.Flags |= StorageUpdate + copy(u.CodeHashOrStorage[:], b.CodeHashOrStorage[:]) + u.ValLength = b.ValLength + } +} + +func (u *Update) DecodeForStorage(enc []byte) { + //u.Reset() + + //balance := new(uint256.Int) + // + //if len(enc) > 0 { + // pos := 0 + // nonceBytes := int(enc[pos]) + // pos++ + // if nonceBytes > 0 { + // nonce := bytesToUint64(enc[pos : pos+nonceBytes]) + // if u.Nonce != nonce { + // u.Flags |= NonceUpdate + // } + // u.Nonce = nonce + // pos += nonceBytes + // } + // balanceBytes := int(enc[pos]) + // pos++ + // if balanceBytes > 0 { + // balance.SetBytes(enc[pos : pos+balanceBytes]) + // if u.Balance.Cmp(balance) != 0 { + // u.Flags |= BalanceUpdate + // } + // u.Balance.Set(balance) + // pos += balanceBytes + // } + // codeHashBytes := int(enc[pos]) + // pos++ + // + // if codeHashBytes > 0 { + // if !bytes.Equal(u.CodeHashOrStorage[:], enc[pos:pos+codeHashBytes]) { + // u.Flags |= CodeUpdate + // copy(u.CodeHashOrStorage[:], enc[pos:pos+codeHashBytes]) + // u.ValLength = length.Hash + // } + // } + //} + //return pos := 0 nonceBytes := int(enc[pos]) pos++ if nonceBytes > 0 { u.Nonce = bytesToUint64(enc[pos : pos+nonceBytes]) + u.Flags |= NonceUpdate pos += nonceBytes } balanceBytes := int(enc[pos]) pos++ if balanceBytes > 0 { u.Balance.SetBytes(enc[pos : pos+balanceBytes]) + u.Flags |= BalanceUpdate pos += balanceBytes } codeHashBytes := int(enc[pos]) pos++ if codeHashBytes > 0 { copy(u.CodeHashOrStorage[:], enc[pos:pos+codeHashBytes]) + u.ValLength = length.Hash + u.Flags |= CodeUpdate } } @@ -1938,11 +2072,12 @@ func (u *Update) Decode(buf []byte, pos int) (int, error) { pos += n } if u.Flags&CodeUpdate != 0 { - if len(buf) < pos+32 { + if len(buf) < pos+length.Hash { return 0, fmt.Errorf("decode Update: buffer too small for codeHash") } copy(u.CodeHashOrStorage[:], buf[pos:pos+32]) - pos += 32 + pos += length.Hash + u.ValLength = length.Hash } 
if u.Flags&StorageUpdate != 0 { l, n := binary.Uvarint(buf[pos:]) @@ -1950,7 +2085,7 @@ func (u *Update) Decode(buf []byte, pos int) (int, error) { return 0, fmt.Errorf("decode Update: buffer too small for storage len") } if n < 0 { - return 0, fmt.Errorf("decode Update: storage lee overflow") + return 0, fmt.Errorf("decode Update: storage pos overflow") } pos += n if len(buf) < pos+int(l) { diff --git a/commitment/hex_patricia_hashed_bench_test.go b/commitment/hex_patricia_hashed_bench_test.go index a44d4e7c8..687b756e7 100644 --- a/commitment/hex_patricia_hashed_bench_test.go +++ b/commitment/hex_patricia_hashed_bench_test.go @@ -28,7 +28,7 @@ func Benchmark_HexPatriciaHahsed_ReviewKeys(b *testing.B) { builder.Balance(hex.EncodeToString(key), rnd.Uint64()) } - pk, hk, _ := builder.Build() + pk, _ := builder.Build() b.Run("review_keys", func(b *testing.B) { for i, j := 0, 0; i < b.N; i, j = i+1, j+1 { @@ -36,7 +36,7 @@ func Benchmark_HexPatriciaHahsed_ReviewKeys(b *testing.B) { j = 0 } - hph.ReviewKeys(pk[j:j+1], hk[j:j+1]) + hph.ProcessKeys(pk[j : j+1]) } }) } diff --git a/commitment/hex_patricia_hashed_fuzz_test.go b/commitment/hex_patricia_hashed_fuzz_test.go index e1e772b83..816717974 100644 --- a/commitment/hex_patricia_hashed_fuzz_test.go +++ b/commitment/hex_patricia_hashed_fuzz_test.go @@ -40,7 +40,7 @@ func Fuzz_ProcessUpdate(f *testing.F) { hph.SetTrace(false) hphAnother.SetTrace(false) - plainKeys, hashedKeys, updates := builder.Build() + plainKeys, updates := builder.Build() if err := ms.applyPlainUpdates(plainKeys, updates); err != nil { t.Fatal(err) } @@ -48,7 +48,7 @@ func Fuzz_ProcessUpdate(f *testing.F) { t.Fatal(err) } - rootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + rootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) if err != nil { t.Fatal(err) } @@ -58,7 +58,7 @@ func Fuzz_ProcessUpdate(f *testing.F) { t.Fatalf("invalid root hash length: expected 32 bytes, got %v", len(rootHash)) } - rootHashAnother, 
branchNodeUpdates, err := hphAnother.ReviewKeys(plainKeys, hashedKeys) + rootHashAnother, branchNodeUpdates, err := hphAnother.ProcessKeys(plainKeys) if err != nil { t.Fatal(err) } @@ -143,7 +143,7 @@ func Fuzz_ProcessUpdates_ArbitraryUpdateCount(f *testing.F) { hph := NewHexPatriciaHashed(20, ms.branchFn, ms.accountFn, ms.storageFn) hphAnother := NewHexPatriciaHashed(20, ms2.branchFn, ms2.accountFn, ms2.storageFn) - plainKeys, hashedKeys, updates := builder.Build() + plainKeys, updates := builder.Build() hph.SetTrace(false) hphAnother.SetTrace(false) @@ -151,7 +151,7 @@ func Fuzz_ProcessUpdates_ArbitraryUpdateCount(f *testing.F) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - rootHashReview, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + rootHashReview, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) ms.applyBranchNodeUpdates(branchNodeUpdates) @@ -160,7 +160,7 @@ func Fuzz_ProcessUpdates_ArbitraryUpdateCount(f *testing.F) { err = ms2.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - rootHashAnother, branchUpdatesAnother, err := hphAnother.ReviewKeys(plainKeys, hashedKeys) + rootHashAnother, branchUpdatesAnother, err := hphAnother.ProcessKeys(plainKeys) require.NoError(t, err) ms2.applyBranchNodeUpdates(branchUpdatesAnother) @@ -200,12 +200,12 @@ func Fuzz_HexPatriciaHashed_ReviewKeys(f *testing.F) { hph.SetTrace(false) - plainKeys, hashedKeys, updates := builder.Build() + plainKeys, updates := builder.Build() if err := ms.applyPlainUpdates(plainKeys, updates); err != nil { t.Fatal(err) } - rootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + rootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) ms.applyBranchNodeUpdates(branchNodeUpdates) diff --git a/commitment/hex_patricia_hashed_test.go b/commitment/hex_patricia_hashed_test.go index 3798701c7..ae86ae78a 100644 --- a/commitment/hex_patricia_hashed_test.go +++ 
b/commitment/hex_patricia_hashed_test.go @@ -21,9 +21,12 @@ import ( "fmt" "math/rand" "testing" + "time" + "github.com/holiman/uint256" "github.com/stretchr/testify/require" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/length" ) @@ -31,7 +34,7 @@ func Test_HexPatriciaHashed_ResetThenSingularUpdates(t *testing.T) { ms := NewMockState(t) hph := NewHexPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) hph.SetTrace(false) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("00", 4). Balance("01", 5). Balance("02", 6). @@ -48,7 +51,7 @@ func Test_HexPatriciaHashed_ResetThenSingularUpdates(t *testing.T) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - firstRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + firstRootHash, branchNodeUpdates, err := hph.ProcessUpdates(plainKeys, updates) require.NoError(t, err) t.Logf("root hash %x\n", firstRootHash) @@ -56,18 +59,18 @@ func Test_HexPatriciaHashed_ResetThenSingularUpdates(t *testing.T) { ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Printf("1. Generated updates\n") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) // More updates hph.Reset() - hph.SetTrace(false) - plainKeys, hashedKeys, updates = NewUpdateBuilder(). - Storage("03", "58", "050505"). + hph.SetTrace(true) + plainKeys, updates = NewUpdateBuilder(). + Storage("03", "58", "050506"). Build() err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - secondRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + secondRootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEqualValues(t, firstRootHash, secondRootHash) @@ -77,27 +80,29 @@ func Test_HexPatriciaHashed_ResetThenSingularUpdates(t *testing.T) { // More updates hph.Reset() - hph.SetTrace(false) - plainKeys, hashedKeys, updates = NewUpdateBuilder(). 
- Storage("03", "58", "070807"). + hph.SetTrace(true) + plainKeys, updates = NewUpdateBuilder(). + Storage("03", "58", "020807"). Build() + fmt.Printf("3. Generated single update %s\n", updates[0].String()) err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - thirdRootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + thirdRootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEqualValues(t, secondRootHash, thirdRootHash) + renderUpdates(branchNodeUpdates) ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Printf("3. Generated single update\n") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) } func Test_HexPatriciaHashed_EmptyUpdate(t *testing.T) { ms := NewMockState(t) hph := NewHexPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) hph.SetTrace(false) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("00", 4). Nonce("00", 246462653). Balance("01", 5). @@ -110,24 +115,24 @@ func Test_HexPatriciaHashed_EmptyUpdate(t *testing.T) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - hashBeforeEmptyUpdate, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + hashBeforeEmptyUpdate, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) require.NotEmpty(t, hashBeforeEmptyUpdate) ms.applyBranchNodeUpdates(branchNodeUpdates) fmt.Println("1. 
Updates applied") - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) // generate empty updates and do NOT reset tree - hph.SetTrace(true) + //hph.SetTrace(true) - plainKeys, hashedKeys, updates = NewUpdateBuilder().Build() + plainKeys, updates = NewUpdateBuilder().Build() err = ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - hashAfterEmptyUpdate, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + hashAfterEmptyUpdate, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) require.NoError(t, err) ms.applyBranchNodeUpdates(branchNodeUpdates) @@ -136,11 +141,108 @@ func Test_HexPatriciaHashed_EmptyUpdate(t *testing.T) { require.EqualValues(t, hashBeforeEmptyUpdate, hashAfterEmptyUpdate) } +func Test_HexPatriciaHashed_UniqueRepresentation2(t *testing.T) { + ms := NewMockState(t) + ms2 := NewMockState(t) + + plainKeys, updates := NewUpdateBuilder(). + Balance("71562b71999873db5b286df957af199ec94617f7", 999860099). + Nonce("71562b71999873db5b286df957af199ec94617f7", 3). + Balance("3a220f351252089d385b29beca14e27f204c296a", 900234). + Balance("0000000000000000000000000000000000000000", 2000000000000138901). + //Balance("0000000000000000000000000000000000000000", 4000000000000138901). + Build() + + trieOne := NewHexPatriciaHashed(20, ms.branchFn, ms.accountFn, ms.storageFn) + trieTwo := NewHexPatriciaHashed(20, ms2.branchFn, ms2.accountFn, ms2.storageFn) + + //trieOne.SetTrace(true) + //trieTwo.SetTrace(true) + + // single sequential update + roots := make([][]byte, 0) + fmt.Printf("1. 
Trie sequential update generated following branch updates\n") + + var ra, rb []byte + { + if err := ms.applyPlainUpdates(plainKeys, updates); err != nil { + t.Fatal(err) + } + + rh, branchNodeUpdates, err := trieOne.ProcessKeys(plainKeys) + require.NoError(t, err) + ms.applyBranchNodeUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) + + ra = common.Copy(rh) + } + { + err := ms2.applyPlainUpdates(plainKeys, updates) + require.NoError(t, err) + + fmt.Printf("\n2. Trie batch update generated following branch updates\n") + // batch update + rh, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) + require.NoError(t, err) + ms2.applyBranchNodeUpdates(branchNodeUpdatesTwo) + //renderUpdates(branchNodeUpdatesTwo) + + rb = common.Copy(rh) + } + require.EqualValues(t, ra, rb) + + plainKeys, updates = NewUpdateBuilder(). + //Balance("71562b71999873db5b286df957af199ec94617f7", 999860099). + //Nonce("71562b71999873db5b286df957af199ec94617f7", 3). + //Balance("3a220f351252089d385b29beca14e27f204c296a", 900234). + //Balance("0000000000000000000000000000000000000000", 2000000000000138901). + Balance("0000000000000000000000000000000000000000", 4000000000000138901). + Build() + + if err := ms.applyPlainUpdates(plainKeys, updates); err != nil { + t.Fatal(err) + } + + sequentialRoot, branchNodeUpdates, err := trieOne.ProcessKeys(plainKeys) + require.NoError(t, err) + roots = append(roots, sequentialRoot) + ms.applyBranchNodeUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) + + plainKeys, updates = NewUpdateBuilder(). + Balance("71562b71999873db5b286df957af199ec94617f7", 999860099). + Nonce("71562b71999873db5b286df957af199ec94617f7", 3). + Balance("3a220f351252089d385b29beca14e27f204c296a", 900234). + //Balance("0000000000000000000000000000000000000000", 2000000000000138901). + Balance("0000000000000000000000000000000000000000", 4000000000000138901). 
+ Build() + + err = ms2.applyPlainUpdates(plainKeys, updates) + require.NoError(t, err) + + fmt.Printf("\n2. Trie batch update generated following branch updates\n") + // batch update + batchRoot, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) + require.NoError(t, err) + //renderUpdates(branchNodeUpdatesTwo) + + fmt.Printf("\n sequential roots:\n") + for i, rh := range roots { + fmt.Printf("%2d %+v\n", i, hex.EncodeToString(rh)) + } + + ms2.applyBranchNodeUpdates(branchNodeUpdatesTwo) + + require.EqualValues(t, batchRoot, roots[len(roots)-1], + "expected equal roots, got sequential [%v] != batch [%v]", hex.EncodeToString(roots[len(roots)-1]), hex.EncodeToString(batchRoot)) + require.Lenf(t, batchRoot, 32, "root hash length should be equal to 32 bytes") +} + func Test_HexPatriciaHashed_UniqueRepresentation(t *testing.T) { ms := NewMockState(t) ms2 := NewMockState(t) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("f5", 4). Balance("ff", 900234). Balance("04", 1233). 
@@ -162,8 +264,8 @@ func Test_HexPatriciaHashed_UniqueRepresentation(t *testing.T) { trieOne := NewHexPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) trieTwo := NewHexPatriciaHashed(1, ms2.branchFn, ms2.accountFn, ms2.storageFn) - trieOne.SetTrace(true) - trieTwo.SetTrace(true) + //trieOne.SetTrace(true) + //trieTwo.SetTrace(true) // single sequential update roots := make([][]byte, 0) @@ -174,12 +276,12 @@ func Test_HexPatriciaHashed_UniqueRepresentation(t *testing.T) { t.Fatal(err) } - sequentialRoot, branchNodeUpdates, err := trieOne.ReviewKeys(plainKeys[i:i+1], hashedKeys[i:i+1]) + sequentialRoot, branchNodeUpdates, err := trieOne.ProcessKeys(plainKeys[i : i+1]) require.NoError(t, err) roots = append(roots, sequentialRoot) ms.applyBranchNodeUpdates(branchNodeUpdates) - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) } err := ms2.applyPlainUpdates(plainKeys, updates) @@ -187,9 +289,9 @@ func Test_HexPatriciaHashed_UniqueRepresentation(t *testing.T) { fmt.Printf("\n2. 
Trie batch update generated following branch updates\n") // batch update - batchRoot, branchNodeUpdatesTwo, err := trieTwo.ReviewKeys(plainKeys, hashedKeys) + batchRoot, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) require.NoError(t, err) - renderUpdates(branchNodeUpdatesTwo) + //renderUpdates(branchNodeUpdatesTwo) fmt.Printf("\n sequential roots:\n") for i, rh := range roots { @@ -245,7 +347,7 @@ func Test_Sepolia(t *testing.T) { } hph := NewHexPatriciaHashed(length.Addr, ms.branchFn, ms.accountFn, ms.storageFn) - hph.SetTrace(true) + //hph.SetTrace(true) for _, testData := range tests { builder := NewUpdateBuilder() @@ -253,13 +355,13 @@ func Test_Sepolia(t *testing.T) { for address, balance := range testData.balances { builder.IncrementBalance(address, balance) } - plainKeys, hashedKeys, updates := builder.Build() + plainKeys, updates := builder.Build() if err := ms.applyPlainUpdates(plainKeys, updates); err != nil { t.Fatal(err) } - rootHash, branchNodeUpdates, err := hph.ReviewKeys(plainKeys, hashedKeys) + rootHash, branchNodeUpdates, err := hph.ProcessKeys(plainKeys) if err != nil { t.Fatal(err) } @@ -269,22 +371,67 @@ func Test_Sepolia(t *testing.T) { } } +func Test_Cell_EncodeDecode(t *testing.T) { + rnd := rand.New(rand.NewSource(time.Now().UnixMilli())) + first := &Cell{ + Nonce: rnd.Uint64(), + hl: length.Hash, + StorageLen: rnd.Intn(33), + apl: length.Addr, + spl: length.Addr + length.Hash, + downHashedLen: rnd.Intn(129), + extLen: rnd.Intn(65), + downHashedKey: [128]byte{}, + extension: [64]byte{}, + spk: [52]byte{}, + h: [32]byte{}, + CodeHash: [32]byte{}, + Storage: [32]byte{}, + apk: [20]byte{}, + } + b := uint256.NewInt(rnd.Uint64()) + first.Balance = *b + + rnd.Read(first.downHashedKey[:first.downHashedLen]) + rnd.Read(first.extension[:first.extLen]) + rnd.Read(first.spk[:]) + rnd.Read(first.apk[:]) + rnd.Read(first.h[:]) + rnd.Read(first.CodeHash[:]) + rnd.Read(first.Storage[:first.StorageLen]) + if rnd.Intn(100) > 50 { + 
first.Delete = true + } + + second := &Cell{} + second.Decode(first.Encode()) + + require.EqualValues(t, first.downHashedLen, second.downHashedLen) + require.EqualValues(t, first.downHashedKey[:], second.downHashedKey[:]) + require.EqualValues(t, first.apl, second.apl) + require.EqualValues(t, first.spl, second.spl) + require.EqualValues(t, first.hl, second.hl) + require.EqualValues(t, first.apk[:], second.apk[:]) + require.EqualValues(t, first.spk[:], second.spk[:]) + require.EqualValues(t, first.h[:], second.h[:]) + require.EqualValues(t, first.extension[:first.extLen], second.extension[:second.extLen]) + // encode doesnt code Nonce, Balance, CodeHash and Storage + require.EqualValues(t, first.Delete, second.Delete) +} + func Test_HexPatriciaHashed_StateEncode(t *testing.T) { //trie := NewHexPatriciaHashed(length.Hash, nil, nil, nil) var s state s.Root = make([]byte, 128) rnd := rand.New(rand.NewSource(42)) - n, err := rnd.Read(s.CurrentKey[:]) - require.NoError(t, err) - require.EqualValues(t, 128, n) - n, err = rnd.Read(s.Root[:]) + + n, err := rnd.Read(s.Root[:]) require.NoError(t, err) require.EqualValues(t, len(s.Root), n) s.RootPresent = true s.RootTouched = true s.RootChecked = true - s.CurrentKeyLen = int8(rnd.Intn(129)) for i := 0; i < len(s.Depths); i++ { s.Depths[i] = rnd.Intn(256) } @@ -310,8 +457,6 @@ func Test_HexPatriciaHashed_StateEncode(t *testing.T) { require.EqualValues(t, s.Root[:], s1.Root[:]) require.EqualValues(t, s.Depths[:], s1.Depths[:]) - require.EqualValues(t, s.CurrentKeyLen, s1.CurrentKeyLen) - require.EqualValues(t, s.CurrentKey[:], s1.CurrentKey[:]) require.EqualValues(t, s.AfterMap[:], s1.AfterMap[:]) require.EqualValues(t, s.TouchMap[:], s1.TouchMap[:]) require.EqualValues(t, s.BranchBefore[:], s1.BranchBefore[:]) @@ -323,7 +468,7 @@ func Test_HexPatriciaHashed_StateEncode(t *testing.T) { func Test_HexPatriciaHashed_StateEncodeDecodeSetup(t *testing.T) { ms := NewMockState(t) - plainKeys, hashedKeys, updates := 
NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("f5", 4). Balance("ff", 900234). Balance("03", 7). @@ -342,7 +487,7 @@ func Test_HexPatriciaHashed_StateEncodeDecodeSetup(t *testing.T) { err := ms.applyPlainUpdates(plainKeys, updates) require.NoError(t, err) - rhBefore, branchUpdates, err := before.ReviewKeys(plainKeys, hashedKeys) + rhBefore, branchUpdates, err := before.ProcessKeys(plainKeys) require.NoError(t, err) ms.applyBranchNodeUpdates(branchUpdates) @@ -357,7 +502,7 @@ func Test_HexPatriciaHashed_StateEncodeDecodeSetup(t *testing.T) { require.EqualValues(t, rhBefore, rhAfter) // create new update and apply it to both tries - nextPK, nextHashed, nextUpdates := NewUpdateBuilder(). + nextPK, nextUpdates := NewUpdateBuilder(). Nonce("ff", 4). Balance("b9", 6000000000). Balance("ad", 8000000000). @@ -366,11 +511,11 @@ func Test_HexPatriciaHashed_StateEncodeDecodeSetup(t *testing.T) { err = ms.applyPlainUpdates(nextPK, nextUpdates) require.NoError(t, err) - rh2Before, branchUpdates, err := before.ReviewKeys(nextPK, nextHashed) + rh2Before, branchUpdates, err := before.ProcessKeys(nextPK) require.NoError(t, err) ms.applyBranchNodeUpdates(branchUpdates) - rh2After, branchUpdates, err := after.ReviewKeys(nextPK, nextHashed) + rh2After, branchUpdates, err := after.ProcessKeys(nextPK) require.NoError(t, err) _ = branchUpdates @@ -378,11 +523,76 @@ func Test_HexPatriciaHashed_StateEncodeDecodeSetup(t *testing.T) { require.EqualValues(t, rh2Before, rh2After) } +func Test_HexPatriciaHashed_StateRestoreAndContinue(t *testing.T) { + ms := NewMockState(t) + + plainKeys, updates := NewUpdateBuilder(). + Balance("f5", 4). + Balance("ff", 900234). 
+ Build() + + trieOne := NewHexPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) + err := ms.applyPlainUpdates(plainKeys, updates) + require.NoError(t, err) + + beforeRestore, branchNodeUpdatesOne, err := trieOne.ProcessKeys(plainKeys) + require.NoError(t, err) + + //renderUpdates(branchNodeUpdatesOne) + ms.applyBranchNodeUpdates(branchNodeUpdatesOne) + + buf, err := trieOne.EncodeCurrentState(nil) + require.NoError(t, err) + require.NotEmpty(t, buf) + + trieTwo := NewHexPatriciaHashed(1, ms.branchFn, ms.accountFn, ms.storageFn) + err = trieTwo.SetState(buf) + require.NoError(t, err) + + hashAfterRestore, err := trieTwo.RootHash() + require.NoError(t, err) + require.EqualValues(t, beforeRestore, hashAfterRestore) + + plainKeys, updates = NewUpdateBuilder(). + Balance("ff", 900234). + Balance("04", 1233). + Storage("04", "01", "0401"). + Balance("ba", 065606). + Balance("00", 4). + Balance("01", 5). + Balance("02", 6). + Balance("03", 7). + Storage("03", "56", "050505"). + Balance("05", 9). + Storage("03", "87", "060606"). + Balance("b9", 6). + Nonce("ff", 169356). + Storage("05", "02", "8989"). + Storage("f5", "04", "9898"). + Build() + + err = ms.applyPlainUpdates(plainKeys, updates) + require.NoError(t, err) + + beforeRestore, branchNodeUpdatesOne, err = trieOne.ProcessKeys(plainKeys) + require.NoError(t, err) + + renderUpdates(branchNodeUpdatesOne) + + twoAfterRestore, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) + require.NoError(t, err) + + _ = branchNodeUpdatesTwo + + ms.applyBranchNodeUpdates(branchNodeUpdatesOne) + require.EqualValues(t, beforeRestore, twoAfterRestore) +} + func Test_HexPatriciaHashed_RestoreAndContinue(t *testing.T) { ms := NewMockState(t) ms2 := NewMockState(t) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("f5", 4). Balance("ff", 900234). Balance("04", 1233). 
@@ -409,9 +619,9 @@ func Test_HexPatriciaHashed_RestoreAndContinue(t *testing.T) { _ = updates - batchRoot, branchNodeUpdatesTwo, err := trieTwo.ReviewKeys(plainKeys, hashedKeys) + beforeRestore, branchNodeUpdatesTwo, err := trieTwo.ProcessKeys(plainKeys) require.NoError(t, err) - renderUpdates(branchNodeUpdatesTwo) + //renderUpdates(branchNodeUpdatesTwo) ms2.applyBranchNodeUpdates(branchNodeUpdatesTwo) buf, err := trieTwo.EncodeCurrentState(nil) @@ -420,39 +630,19 @@ func Test_HexPatriciaHashed_RestoreAndContinue(t *testing.T) { err = trieOne.SetState(buf) require.NoError(t, err) - require.EqualValues(t, batchRoot[:], trieOne.root.h[:]) - require.EqualValues(t, trieTwo.root.hl, trieOne.root.hl) - require.EqualValues(t, trieTwo.root.apl, trieOne.root.apl) - if trieTwo.root.apl > 0 { - require.EqualValues(t, trieTwo.root.apk, trieOne.root.apk) - } - require.EqualValues(t, trieTwo.root.spl, trieOne.root.spl) - if trieTwo.root.apl > 0 { - require.EqualValues(t, trieTwo.root.spk, trieOne.root.spk) - } - if trieTwo.root.downHashedLen > 0 { - require.EqualValues(t, trieTwo.root.downHashedKey, trieOne.root.downHashedKey) - } - require.EqualValues(t, trieTwo.root.Nonce, trieOne.root.Nonce) - //require.EqualValues(t, trieTwo.root.CodeHash, trieOne.root.CodeHash) - require.EqualValues(t, trieTwo.root.StorageLen, trieOne.root.StorageLen) - require.EqualValues(t, trieTwo.root.extension, trieOne.root.extension) - - require.EqualValues(t, trieTwo.currentKey, trieOne.currentKey) - require.EqualValues(t, trieTwo.afterMap, trieOne.afterMap) - require.EqualValues(t, trieTwo.touchMap[:], trieOne.touchMap[:]) - require.EqualValues(t, trieTwo.branchBefore[:], trieOne.branchBefore[:]) - require.EqualValues(t, trieTwo.rootTouched, trieOne.rootTouched) - require.EqualValues(t, trieTwo.rootPresent, trieOne.rootPresent) - require.EqualValues(t, trieTwo.rootChecked, trieOne.rootChecked) - require.EqualValues(t, trieTwo.currentKeyLen, trieOne.currentKeyLen) + fmt.Printf("rh %x\n", 
trieOne.root.h[:]) + require.EqualValues(t, beforeRestore[:], trieOne.root.h[:]) + + hashAfterRestore, err := trieOne.RootHash() + require.NoError(t, err) + require.EqualValues(t, beforeRestore, hashAfterRestore) } func Test_HexPatriciaHashed_ProcessUpdates_UniqueRepresentation_AfterStateRestore(t *testing.T) { ms := NewMockState(t) ms2 := NewMockState(t) - plainKeys, hashedKeys, updates := NewUpdateBuilder(). + plainKeys, updates := NewUpdateBuilder(). Balance("f5", 4). Balance("ff", 900234). Balance("04", 1233). @@ -476,8 +666,8 @@ func Test_HexPatriciaHashed_ProcessUpdates_UniqueRepresentation_AfterStateRestor batch.Reset() sequential.Reset() - sequential.SetTrace(true) - batch.SetTrace(true) + //sequential.SetTrace(true) + //batch.SetTrace(true) // single sequential update roots := make([][]byte, 0) @@ -494,11 +684,11 @@ func Test_HexPatriciaHashed_ProcessUpdates_UniqueRepresentation_AfterStateRestor require.NoError(t, err) } - sequentialRoot, branchNodeUpdates, err := sequential.ReviewKeys(plainKeys[i:i+1], hashedKeys[i:i+1]) + sequentialRoot, branchNodeUpdates, err := sequential.ProcessKeys(plainKeys[i : i+1]) require.NoError(t, err) roots = append(roots, sequentialRoot) - renderUpdates(branchNodeUpdates) + //renderUpdates(branchNodeUpdates) ms.applyBranchNodeUpdates(branchNodeUpdates) if i == (len(updates)/2 - 1) { @@ -512,7 +702,7 @@ func Test_HexPatriciaHashed_ProcessUpdates_UniqueRepresentation_AfterStateRestor fmt.Printf("\n2. 
Trie batch update generated following branch updates\n") // batch update - batchRoot, branchNodeUpdatesTwo, err := batch.ReviewKeys(plainKeys, hashedKeys) + batchRoot, branchNodeUpdatesTwo, err := batch.ProcessKeys(plainKeys) require.NoError(t, err) renderUpdates(branchNodeUpdatesTwo) ms2.applyBranchNodeUpdates(branchNodeUpdatesTwo) diff --git a/commitment/patricia_state_mock_test.go b/commitment/patricia_state_mock_test.go index 82dc932a2..666feb468 100644 --- a/commitment/patricia_state_mock_test.go +++ b/commitment/patricia_state_mock_test.go @@ -52,7 +52,7 @@ func (ms MockState) accountFn(plainKey []byte, cell *Cell) error { return nil } if pos != len(exBytes) { - ms.t.Fatalf("accountFn key [%x] leftover bytes in [%x], comsumed %x", plainKey, exBytes, pos) + ms.t.Fatalf("accountFn key [%x] leftover %d bytes in [%x], comsumed %x", plainKey, len(exBytes)-pos, exBytes, pos) return nil } if ex.Flags&StorageUpdate != 0 { @@ -154,6 +154,7 @@ func (ms *MockState) applyPlainUpdates(plainKeys [][]byte, updates []Update) err if update.Flags&StorageUpdate != 0 { ex.Flags |= StorageUpdate copy(ex.CodeHashOrStorage[:], update.CodeHashOrStorage[:]) + ex.ValLength = update.ValLength } ms.sm[string(key)] = ex.Encode(nil, ms.numBuf[:]) } else { @@ -328,7 +329,7 @@ func (ub *UpdateBuilder) DeleteStorage(addr string, loc string) *UpdateBuilder { // 1. Plain keys // 2. Corresponding hashed keys // 3. 
Corresponding updates -func (ub *UpdateBuilder) Build() (plainKeys, hashedKeys [][]byte, updates []Update) { +func (ub *UpdateBuilder) Build() (plainKeys [][]byte, updates []Update) { hashed := make([]string, 0, len(ub.keyset)+len(ub.keyset2)) preimages := make(map[string][]byte) preimages2 := make(map[string][]byte) @@ -371,10 +372,8 @@ func (ub *UpdateBuilder) Build() (plainKeys, hashedKeys [][]byte, updates []Upda } slices.Sort(hashed) plainKeys = make([][]byte, len(hashed)) - hashedKeys = make([][]byte, len(hashed)) updates = make([]Update, len(hashed)) for i, hashedKey := range hashed { - hashedKeys[i] = []byte(hashedKey) key := preimages[hashedKey] key2 := preimages2[hashedKey] plainKey := make([]byte, len(key)+len(key2)) diff --git a/common/bitutil/select.go b/common/bitutil/select.go index 386660903..f3266c269 100644 --- a/common/bitutil/select.go +++ b/common/bitutil/select.go @@ -20,7 +20,7 @@ import ( ) // Required by select64 -var kSelectInByte []byte = []byte{ +var kSelectInByte = [2048]byte{ 8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, @@ -58,6 +58,10 @@ const ( kOnesStep4 uint64 = 0x1111111111111111 kOnesStep8 uint64 = 0x0101010101010101 kLAMBDAsStep8 uint64 = 0x80 * kOnesStep8 + + kOnesStep4x3 = 0x3 * kOnesStep4 + kOnesStep4xA = 0xA * kOnesStep4 + kOnesStep8xF = 0xF * kOnesStep8 ) /** Returns the index of the k-th 1-bit in the 64-bit word x. 
@@ -77,16 +81,25 @@ const ( * [4] Facebook Folly library: https://github.com/facebook/folly * */ -func Select64(x uint64, k int) int { + +func Select64(x uint64, k int) (place int) { + /* Original implementation - a bit obfuscated to satisfy Golang's inlining costs s := x s = s - ((s & (0xA * kOnesStep4)) >> 1) s = (s & (0x3 * kOnesStep4)) + ((s >> 2) & (0x3 * kOnesStep4)) s = (s + (s >> 4)) & (0xF * kOnesStep8) byteSums := s * kOnesStep8 - + */ + s := x - ((x & kOnesStep4xA) >> 1) + s = (s & kOnesStep4x3) + ((s >> 2) & kOnesStep4x3) + byteSums := ((s + (s >> 4)) & kOnesStep8xF) * kOnesStep8 + /* Original implementaiton: kStep8 := uint64(k) * kOnesStep8 geqKStep8 := ((kStep8 | kLAMBDAsStep8) - byteSums) & kLAMBDAsStep8 - place := bits.OnesCount64(geqKStep8) * 8 + place = bits.OnesCount64(geqKStep8) * 8 + byteRank := uint64(k) - (((byteSums << 8) >> place) & uint64(0xFF)) + */ + place = bits.OnesCount64((((uint64(k)*kOnesStep8)|kLAMBDAsStep8)-byteSums)&kLAMBDAsStep8) * 8 byteRank := uint64(k) - (((byteSums << 8) >> place) & uint64(0xFF)) return place + int(kSelectInByte[((x>>place)&0xFF)|(byteRank<<8)]) } diff --git a/common/bitutil/select_test.go b/common/bitutil/select_test.go new file mode 100644 index 000000000..3f5085100 --- /dev/null +++ b/common/bitutil/select_test.go @@ -0,0 +1,36 @@ +/* +Copyright 2021 Erigon contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ +package bitutil + +import ( + "math" + "testing" +) + +func TestSelect64(t *testing.T) { + if res := Select64(5270498307387724361, 14); res != 41 { + panic(res) + } + if res := Select64(5270498307387724361, 6); res != 18 { + panic(res) + } + if res := Select64(uint64(math.MaxUint64), 62); res != 62 { + panic(res) + } + if res := Select64(210498307387724361, 14); res != 35 { + panic(res) + } +} diff --git a/common/bytes.go b/common/bytes.go index 59929736c..b74166585 100644 --- a/common/bytes.go +++ b/common/bytes.go @@ -17,6 +17,7 @@ package common import ( + "bytes" "fmt" ) @@ -53,6 +54,21 @@ func Copy(b []byte) []byte { return c } +func AppendInto(dst []byte, src ...[]byte) { + d := bytes.NewBuffer(dst) + for _, s := range src { + d.Write(s) + } +} + +func Append(data ...[]byte) []byte { + s := new(bytes.Buffer) + for _, d := range data { + s.Write(d) + } + return s.Bytes() +} + func EnsureEnoughSize(in []byte, size int) []byte { if cap(in) < size { newBuf := make([]byte, size) diff --git a/common/cryptozerocopy/crypto_zero_copy.go b/common/cryptozerocopy/crypto_zero_copy.go new file mode 100644 index 000000000..cd53fec0c --- /dev/null +++ b/common/cryptozerocopy/crypto_zero_copy.go @@ -0,0 +1,11 @@ +package cryptozerocopy + +import "hash" + +// KeccakState wraps sha3.state. In addition to the usual hash methods, it also supports +// Read to get a variable amount of data from the hash state. Read is faster than Sum +// because it doesn't copy the internal state, but also modifies the internal state. 
+type KeccakState interface { + hash.Hash + Read([]byte) (int, error) +} diff --git a/common/datadir/dirs.go b/common/datadir/dirs.go index d4cd59972..6b5cf9c7d 100644 --- a/common/datadir/dirs.go +++ b/common/datadir/dirs.go @@ -31,6 +31,7 @@ type Dirs struct { Tmp string Snap string SnapHistory string + SnapWarm string TxPool string Nodes string } @@ -53,6 +54,7 @@ func New(datadir string) Dirs { Tmp: filepath.Join(datadir, "temp"), Snap: filepath.Join(datadir, "snapshots"), SnapHistory: filepath.Join(datadir, "snapshots", "history"), + SnapWarm: filepath.Join(datadir, "snapshots", "warm"), TxPool: filepath.Join(datadir, "txpool"), Nodes: filepath.Join(datadir, "nodes"), } diff --git a/common/dbg/dbg_evn.go b/common/dbg/dbg_evn.go new file mode 100644 index 000000000..e5d4fe286 --- /dev/null +++ b/common/dbg/dbg_evn.go @@ -0,0 +1,26 @@ +package dbg + +import ( + "os" + + "github.com/c2h5oh/datasize" +) + +func EnvString(envVarName string, defaultVal string) string { + v, _ := os.LookupEnv(envVarName) + if v != "" { + return v + } + return defaultVal +} +func EnvDataSize(envVarName string, defaultVal datasize.ByteSize) datasize.ByteSize { + v, _ := os.LookupEnv(envVarName) + if v != "" { + val, err := datasize.ParseString(v) + if err != nil { + panic(err) + } + return val + } + return defaultVal +} diff --git a/common/dbg/experiments.go b/common/dbg/experiments.go index ff4f966d6..fb50a4df8 100644 --- a/common/dbg/experiments.go +++ b/common/dbg/experiments.go @@ -281,3 +281,67 @@ func StopAfterReconst() bool { }) return stopAfterReconst } + +var ( + discardCommitment bool + discardCommitmentOnce sync.Once +) + +func DiscardCommitment() bool { + discardCommitmentOnce.Do(func() { + v, _ := os.LookupEnv("DISCARD_COMMITMENT") + if v == "true" { + discardCommitment = true + log.Info("[Experiment]", "DISCARD_COMMITMENT", discardCommitment) + } + }) + return discardCommitment +} + +var ( + noPrune bool + noPruneOnce sync.Once +) + +func NoPrune() bool { + 
noPruneOnce.Do(func() { + v, _ := os.LookupEnv("NO_PRUNE") + if v == "true" { + noPrune = true + log.Info("[Experiment]", "NO_PRUNE", noPrune) + } + }) + return noPrune +} + +var ( + snMadvNormal bool + snMadvNormalOnce sync.Once +) + +func SnMadvNormal() bool { + snMadvNormalOnce.Do(func() { + v, _ := os.LookupEnv("SN_MADV_NORMAL") + if v == "true" { + snMadvNormal = true + log.Info("[Experiment]", "SN_MADV_NORMAL", snMadvNormal) + } + }) + return snMadvNormal +} + +var ( + mdbxLockInRam bool + mdbxLockInRamOnce sync.Once +) + +func MdbxLockInRam() bool { + mdbxLockInRamOnce.Do(func() { + v, _ := os.LookupEnv("MDBX_LOCK_IN_RAM") + if v == "true" { + mdbxLockInRam = true + log.Info("[Experiment]", "MDBX_LOCK_IN_RAM", mdbxLockInRam) + } + }) + return mdbxLockInRam +} diff --git a/common/dir/rw_dir.go b/common/dir/rw_dir.go index 008d0f569..4b56cdc68 100644 --- a/common/dir/rw_dir.go +++ b/common/dir/rw_dir.go @@ -21,10 +21,12 @@ import ( "path/filepath" ) -func MustExist(path string) { +func MustExist(path ...string) { const perm = 0764 // user rwx, group rw, other r - if err := os.MkdirAll(path, perm); err != nil { - panic(err) + for _, p := range path { + if err := os.MkdirAll(p, perm); err != nil { + panic(err) + } } } @@ -47,6 +49,23 @@ func FileExist(path string) bool { return true } +func WriteFileWithFsync(name string, data []byte, perm os.FileMode) error { + f, err := os.OpenFile(name, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, perm) + if err != nil { + return err + } + defer f.Close() + _, err = f.Write(data) + if err != nil { + return err + } + err = f.Sync() + if err != nil { + return err + } + return err +} + func Recreate(dir string) { if Exist(dir) { _ = os.RemoveAll(dir) diff --git a/compress/decompress.go b/compress/decompress.go index 3fbd9603e..3877b08b0 100644 --- a/compress/decompress.go +++ b/compress/decompress.go @@ -23,11 +23,13 @@ import ( "os" "path/filepath" "strconv" + "sync/atomic" "time" + "github.com/ledgerwatch/log/v3" + 
"github.com/ledgerwatch/erigon-lib/common/dbg" "github.com/ledgerwatch/erigon-lib/mmap" - "github.com/ledgerwatch/log/v3" ) type word []byte // plain text word associated with code from dictionary @@ -112,6 +114,8 @@ type Decompressor struct { emptyWordsCount uint64 filePath, fileName string + + readAheadRefcnt atomic.Int32 // ref-counter: allow enable/disable read-ahead from goroutines. only when refcnt=0 - disable read-ahead once } // Tables with bitlen greater than threshold will be condensed. @@ -150,7 +154,6 @@ func NewDecompressor(compressedFilePath string) (d *Decompressor, err error) { fileName: fName, } defer func() { - if rec := recover(); rec != nil { err = fmt.Errorf("decompressing file: %s, %+v, trace: %s", compressedFilePath, rec, dbg.Stack()) } @@ -372,12 +375,22 @@ func (d *Decompressor) DisableReadAhead() { if d == nil || d.mmapHandle1 == nil { return } - _ = mmap.MadviseRandom(d.mmapHandle1) + leftReaders := d.readAheadRefcnt.Add(-1) + if leftReaders == 0 { + if dbg.SnMadvNormal() { + _ = mmap.MadviseNormal(d.mmapHandle1) + } else { + _ = mmap.MadviseRandom(d.mmapHandle1) + } + } else if leftReaders < 0 { + log.Warn("read-ahead negative counter", "file", d.FileName()) + } } func (d *Decompressor) EnableReadAhead() *Decompressor { if d == nil || d.mmapHandle1 == nil { return d } + d.readAheadRefcnt.Add(1) _ = mmap.MadviseSequential(d.mmapHandle1) return d } @@ -385,6 +398,7 @@ func (d *Decompressor) EnableMadvNormal() *Decompressor { if d == nil || d.mmapHandle1 == nil { return d } + d.readAheadRefcnt.Add(1) _ = mmap.MadviseNormal(d.mmapHandle1) return d } @@ -392,6 +406,7 @@ func (d *Decompressor) EnableWillNeed() *Decompressor { if d == nil || d.mmapHandle1 == nil { return d } + d.readAheadRefcnt.Add(1) _ = mmap.MadviseWillNeed(d.mmapHandle1) return d } @@ -661,9 +676,14 @@ func (g *Getter) SkipUncompressed() (uint64, int) { return g.dataP, int(wordLen) } -// Match returns true and next offset if the word at current offset fully matches the buf 
-// returns false and current offset otherwise. -func (g *Getter) Match(buf []byte) (bool, uint64) { +// Match returns +// +// 1 if the word at current offset is greater than the buf +// +// -1 if it is less than the buf +// +// 0 if they are equal. +func (g *Getter) Match(buf []byte) int { savePos := g.dataP wordLen := g.nextPos(true) wordLen-- // because when create huffman tree we do ++ , because 0 is terminator @@ -673,10 +693,18 @@ func (g *Getter) Match(buf []byte) (bool, uint64) { g.dataP++ g.dataBit = 0 } - if lenBuf != 0 { + if lenBuf != 0 || lenBuf != int(wordLen) { g.dataP, g.dataBit = savePos, 0 } - return lenBuf == int(wordLen), g.dataP + if lenBuf == int(wordLen) { + return 0 + } + if lenBuf < int(wordLen) { + return -1 + } + if lenBuf > int(wordLen) { + return 1 + } } var bufPos int @@ -684,9 +712,14 @@ func (g *Getter) Match(buf []byte) (bool, uint64) { for pos := g.nextPos(false /* clean */); pos != 0; pos = g.nextPos(false) { bufPos += int(pos) - 1 pattern := g.nextPattern() - if lenBuf < bufPos+len(pattern) || !bytes.Equal(buf[bufPos:bufPos+len(pattern)], pattern) { + compared := bytes.Compare(buf[bufPos:bufPos+len(pattern)], pattern) + if compared != 0 { + g.dataP, g.dataBit = savePos, 0 + return compared + } + if lenBuf < bufPos+len(pattern) { g.dataP, g.dataBit = savePos, 0 - return false, savePos + return -1 } } if g.dataBit > 0 { @@ -703,9 +736,14 @@ func (g *Getter) Match(buf []byte) (bool, uint64) { bufPos += int(pos) - 1 if bufPos > lastUncovered { dif := uint64(bufPos - lastUncovered) - if lenBuf < bufPos || !bytes.Equal(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) { + compared := bytes.Compare(buf[lastUncovered:bufPos], g.data[postLoopPos:postLoopPos+dif]) + if compared != 0 { + g.dataP, g.dataBit = savePos, 0 + return compared + } + if lenBuf < bufPos { g.dataP, g.dataBit = savePos, 0 - return false, savePos + return -1 } postLoopPos += dif } @@ -713,18 +751,28 @@ func (g *Getter) Match(buf []byte) (bool, uint64) { } 
if int(wordLen) > lastUncovered { dif := wordLen - uint64(lastUncovered) - if lenBuf < int(wordLen) || !bytes.Equal(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) { + + compared := bytes.Compare(buf[lastUncovered:wordLen], g.data[postLoopPos:postLoopPos+dif]) + if compared != 0 { + g.dataP, g.dataBit = savePos, 0 + return compared + } + if lenBuf < int(wordLen) { g.dataP, g.dataBit = savePos, 0 - return false, savePos + return -1 } postLoopPos += dif } - if lenBuf != int(wordLen) { + if lenBuf < int(wordLen) { g.dataP, g.dataBit = savePos, 0 - return false, savePos + return -1 + } + if lenBuf > int(wordLen) { + g.dataP, g.dataBit = savePos, 0 + return 1 } g.dataP, g.dataBit = postLoopPos, 0 - return true, postLoopPos + return 0 } // MatchPrefix only checks if the word at the current offset has a buf prefix. Does not move offset to the next word. diff --git a/compress/decompress_bench_test.go b/compress/decompress_bench_test.go index 9f6cd4b5d..c43c7cdd5 100644 --- a/compress/decompress_bench_test.go +++ b/compress/decompress_bench_test.go @@ -71,7 +71,7 @@ func BenchmarkDecompressMatch(b *testing.B) { defer d.Close() g := d.MakeGetter() for i := 0; i < b.N; i++ { - _, _ = g.Match([]byte("longlongword")) + _ = g.Match([]byte("longlongword")) } } diff --git a/compress/decompress_fuzz_test.go b/compress/decompress_fuzz_test.go index e127a6240..9201d0372 100644 --- a/compress/decompress_fuzz_test.go +++ b/compress/decompress_fuzz_test.go @@ -68,9 +68,9 @@ func FuzzDecompressMatch(f *testing.F) { t.Fatalf("MatchCmp: expected match: %v\n", expected) } g.Reset(savePos) - ok, _ := g.Match(expected) + ok := g.Match(expected) pos2 := g.dataP - if !ok { + if ok != 0 { t.Fatalf("MatchBool: expected match: %v\n", expected) } g.Reset(savePos) diff --git a/compress/decompress_test.go b/compress/decompress_test.go index 0becd5bb5..8dd993d24 100644 --- a/compress/decompress_test.go +++ b/compress/decompress_test.go @@ -86,8 +86,8 @@ func TestDecompressMatchOK(t 
*testing.T) { w := loremStrings[i] if i%2 != 0 { expected := fmt.Sprintf("%s %d", w, i) - ok, _ := g.Match([]byte(expected)) - if !ok { + cmp := g.Match([]byte(expected)) + if cmp != 0 { t.Errorf("expexted match with %s", expected) } } else { @@ -162,8 +162,8 @@ func TestDecompressMatchOKCondensed(t *testing.T) { for g.HasNext() { if i%2 != 0 { expected := fmt.Sprintf("word-%d", i) - ok, _ := g.Match([]byte(expected)) - if !ok { + cmp := g.Match([]byte(expected)) + if cmp != 0 { t.Errorf("expexted match with %s", expected) } } else { @@ -186,8 +186,8 @@ func TestDecompressMatchNotOK(t *testing.T) { for g.HasNext() { w := loremStrings[i] expected := fmt.Sprintf("%s %d", w, i+1) - ok, _ := g.Match([]byte(expected)) - if ok { + cmp := g.Match([]byte(expected)) + if cmp == 0 { t.Errorf("not expexted match with %s", expected) } else { g.Skip() @@ -530,36 +530,32 @@ func TestDecompressRandomMatchBool(t *testing.T) { pos := g.dataP if INPUT_FLAGS[input_idx] == 0 { // []byte input notExpected := string(WORDS[word_idx]) + "z" - ok, _ := g.Match([]byte(notExpected)) - if ok { + if g.MatchCmp([]byte(notExpected)) == 0 { t.Fatalf("not expected match: %v\n got: %v\n", []byte(notExpected), WORDS[word_idx]) } expected := WORDS[word_idx] - ok, _ = g.Match(expected) - if !ok { + if g.MatchCmp(expected) != 0 { g.Reset(pos) word, _ := g.Next(nil) if bytes.Compare(expected, word) != 0 { - fmt.Printf("1 expected: %v, acutal %v, ok %v\n", expected, word, ok) + fmt.Printf("1 expected: %v, acutal %v\n", expected, word) } t.Fatalf("expected match: %v\n got: %v\n", expected, word) } word_idx++ } else { // nil input notExpected := []byte{0} - ok, _ := g.Match(notExpected) - if ok { + if g.MatchCmp(notExpected) == 0 { t.Fatal("not expected match []byte{0} with nil\n") } expected := []byte{} - ok, _ = g.Match(nil) - if !ok { + if g.MatchCmp(nil) != 0 { g.Reset(pos) word, _ := g.Next(nil) if bytes.Compare(expected, word) != 0 { - fmt.Printf("2 expected: %v, acutal %v, ok %v\n", expected, word, 
ok) + fmt.Printf("2 expected: %v, acutal %v\n", expected, word) } t.Fatalf("expected match: %v\n got: %v\n", expected, word) } diff --git a/downloader/util.go b/downloader/util.go index 57963021b..a5f11a4b4 100644 --- a/downloader/util.go +++ b/downloader/util.go @@ -170,7 +170,7 @@ func seedableSnapshotsBySubDir(dir, subDir string) ([]string, error) { continue } ext := filepath.Ext(f.Name()) - if ext != ".v" && ext != ".ef" { // filter out only compressed files + if ext != ".kv" && ext != ".v" && ext != ".ef" { // filter out only compressed files continue } diff --git a/go.mod b/go.mod index b10c4d26b..25d05b6b4 100644 --- a/go.mod +++ b/go.mod @@ -3,13 +3,14 @@ module github.com/ledgerwatch/erigon-lib go 1.19 require ( - github.com/erigontech/mdbx-go v0.27.14 + github.com/erigontech/mdbx-go v0.33.1 github.com/ledgerwatch/interfaces v0.0.0-20230909005156-bff86c603a43 github.com/ledgerwatch/log/v3 v3.9.0 github.com/ledgerwatch/secp256k1 v1.0.0 ) require ( + github.com/FastFilter/xorfilter v0.1.3 github.com/RoaringBitmap/roaring v1.2.3 github.com/VictoriaMetrics/metrics v1.23.1 github.com/anacrolix/dht/v2 v2.19.2-0.20221121215055-066ad8494444 @@ -23,10 +24,12 @@ require ( github.com/go-stack/stack v1.8.1 github.com/google/btree v1.1.2 github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 - github.com/hashicorp/golang-lru/v2 v2.0.6 + github.com/hashicorp/golang-lru/v2 v2.0.4 + github.com/holiman/bloomfilter/v2 v2.0.3 github.com/holiman/uint256 v1.2.3 github.com/matryer/moq v0.3.2 github.com/pbnjay/memory v0.0.0-20210728143218-7b4eea64cf58 + github.com/pkg/errors v0.9.1 github.com/pelletier/go-toml/v2 v2.1.0 github.com/quasilyte/go-ruleguard/dsl v0.3.22 github.com/spaolacci/murmur3 v1.1.0 @@ -95,7 +98,6 @@ require ( github.com/pion/turn/v2 v2.0.8 // indirect github.com/pion/udp v0.1.4 // indirect github.com/pion/webrtc/v3 v3.1.42 // indirect - github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect github.com/remyoudompheng/bigfft 
v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/rs/dnscache v0.0.0-20211102005908-e0241e321417 // indirect @@ -117,3 +119,8 @@ require ( rsc.io/tmplfunc v0.0.3 // indirect zombiezen.com/go/sqlite v0.13.1 // indirect ) + +replace ( + github.com/holiman/bloomfilter/v2 => github.com/AskAlexSharov/bloomfilter/v2 v2.0.8 + github.com/tidwall/btree => github.com/AskAlexSharov/btree v1.6.2 +) diff --git a/go.sum b/go.sum index 9a4f689d4..6e1e00593 100644 --- a/go.sum +++ b/go.sum @@ -4,7 +4,13 @@ crawshaw.io/iox v0.0.0-20181124134642-c51c3df30797 h1:yDf7ARQc637HoxDho7xjqdvO5Z crawshaw.io/iox v0.0.0-20181124134642-c51c3df30797/go.mod h1:sXBiorCo8c46JlQV3oXPKINnZ8mcqnye1EkVkqsectk= crawshaw.io/sqlite v0.3.2/go.mod h1:igAO5JulrQ1DbdZdtVq48mnZUBAPOeFzer7VhDWNtW4= filippo.io/edwards25519 v1.0.0-rc.1 h1:m0VOOB23frXZvAOK44usCgLWvtsxIoMCTBGJZlpmGfU= +github.com/AskAlexSharov/bloomfilter/v2 v2.0.8 h1:eRExAhnCcGHKC4/s8bpbYHJTQfOtn/urU/CYXNx2Q+8= +github.com/AskAlexSharov/bloomfilter/v2 v2.0.8/go.mod h1:zpoh+gs7qcpqrHr3dB55AMiJwo0iURXE7ZOP9L9hSkA= +github.com/AskAlexSharov/btree v1.6.2 h1:5+GQo+SmoAmBEsnW/ksj1csim/aQMRuLUywvwMphs2Y= +github.com/AskAlexSharov/btree v1.6.2/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= +github.com/FastFilter/xorfilter v0.1.3 h1:c0nMe68qEoce/2NIolD2nvwQnIgIFBOYI34HcnsjQSc= +github.com/FastFilter/xorfilter v0.1.3/go.mod h1:RB6+tbWbRN163V4y7z10tNfZec6n1oTsOElP0Tu5hzU= github.com/RoaringBitmap/roaring v0.4.7/go.mod h1:8khRDP4HmeXns4xIj9oGrKSz7XTQiJx2zgh7AcNke4w= github.com/RoaringBitmap/roaring v0.4.17/go.mod h1:D3qVegWTmfCaX4Bl5CrBE9hfrSrrXIr8KVNvRsDi1NI= github.com/RoaringBitmap/roaring v0.4.23/go.mod h1:D0gp8kJQgE1A4LQ5wFLggQEyvDi06Mq5mKs52e1TwOo= @@ -127,8 +133,8 @@ github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymF github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod 
h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/erigontech/mdbx-go v0.27.14 h1:IVVeQVCAjZRpAR8bThlP2ISxrOwdV35NZdGwAgotaRw= -github.com/erigontech/mdbx-go v0.27.14/go.mod h1:FAMxbOgqOnRDx51j8HjuJZIgznbDwjX7LItd+/UWyA4= +github.com/erigontech/mdbx-go v0.33.1 h1:j4UV+kHlSSPLD/e1vLI6PuaTcjsJAX0heBryewyk7fA= +github.com/erigontech/mdbx-go v0.33.1/go.mod h1:FAMxbOgqOnRDx51j8HjuJZIgznbDwjX7LItd+/UWyA4= github.com/frankban/quicktest v1.9.0/go.mod h1:ui7WezCLWMWxVWr1GETZY3smRy0G4KWq9vcPtJmFl7Y= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/fsnotify/fsnotify v1.4.7/go.mod h1:jwhsz4b93w/PPRr/qN1Yymfu8t87LnFCMoQvtojpjFo= @@ -206,8 +212,8 @@ github.com/gorilla/websocket v1.5.0/go.mod h1:YR8l580nyteQvAITg2hZ9XVh4b55+EU/ad github.com/grpc-ecosystem/go-grpc-middleware v1.4.0 h1:UH//fgunKIs4JdUbpDl1VZCDaL56wXCB/5+wF6uHfaI= github.com/grpc-ecosystem/go-grpc-middleware v1.4.0/go.mod h1:g5qyo/la0ALbONm6Vbp88Yd8NsDy6rZz+RcrMPxvld8= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/golang-lru/v2 v2.0.6 h1:3xi/Cafd1NaoEnS/yDssIiuVeDVywU0QdFGl3aQaQHM= -github.com/hashicorp/golang-lru/v2 v2.0.6/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= +github.com/hashicorp/golang-lru/v2 v2.0.4 h1:7GHuZcgid37q8o5i3QI9KMT4nCWQQ3Kx3Ov6bb9MfK0= +github.com/hashicorp/golang-lru/v2 v2.0.4/go.mod h1:QeFd9opnmA6QUJc5vARoKUSoFhyfM2/ZepoAG6RGpeM= github.com/hexops/gotextdiff v1.0.3 h1:gitA9+qJrrTCsiCl7+kh75nPqQt1cx4ZkudSTLoUqJM= github.com/holiman/uint256 v1.2.3 h1:K8UWO1HUJpRMXBxbmaY1Y8IAMZC/RsKB+ArEnnK4l5o= github.com/holiman/uint256 v1.2.3/go.mod h1:SC8Ryt4n+UBbPbIBKaG9zbbDlp4jOru9xFZmPzLUTxw= @@ -386,8 +392,6 @@ github.com/stretchr/testify v1.8.0/go.mod 
h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/tidwall/btree v1.6.0 h1:LDZfKfQIBHGHWSwckhXI0RPSXzlo+KYdjK7FWSqOzzg= -github.com/tidwall/btree v1.6.0/go.mod h1:twD9XRA5jj9VUQGELzDO4HPQTNJsoWWfYEL+EUQ2cKY= github.com/tinylib/msgp v1.0.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/tinylib/msgp v1.1.0/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= github.com/tinylib/msgp v1.1.2/go.mod h1:+d+yLhGm8mzTaHzB+wgMYrodPfmZrzkirds8fDWklFE= diff --git a/kv/bitmapdb/fixed_size.go b/kv/bitmapdb/fixed_size.go index 97bc501b7..72be51c53 100644 --- a/kv/bitmapdb/fixed_size.go +++ b/kv/bitmapdb/fixed_size.go @@ -35,10 +35,12 @@ type FixedSizeBitmaps struct { f *os.File filePath, fileName string - data []uint64 - metaData []byte - amount uint64 - version uint8 + data []uint64 + + metaData []byte + count uint64 //of keys + baseDataID uint64 // deducted from all stored values + version uint8 m mmap2.MMap bitsPerBitmap int @@ -46,18 +48,17 @@ type FixedSizeBitmaps struct { modTime time.Time } -func OpenFixedSizeBitmaps(filePath string, bitsPerBitmap int) (*FixedSizeBitmaps, error) { +func OpenFixedSizeBitmaps(filePath string) (*FixedSizeBitmaps, error) { _, fName := filepath.Split(filePath) idx := &FixedSizeBitmaps{ - filePath: filePath, - fileName: fName, - bitsPerBitmap: bitsPerBitmap, + filePath: filePath, + fileName: fName, } var err error idx.f, err = os.Open(filePath) if err != nil { - return nil, fmt.Errorf("OpenFile: %w", err) + return nil, fmt.Errorf("OpenFixedSizeBitmaps: %w", err) } var stat os.FileInfo if stat, err = idx.f.Stat(); err != nil { @@ -73,8 +74,16 @@ func OpenFixedSizeBitmaps(filePath string, bitsPerBitmap int) (*FixedSizeBitmaps idx.data = 
castToArrU64(idx.m[MetaHeaderSize:]) idx.version = idx.metaData[0] - idx.amount = binary.BigEndian.Uint64(idx.metaData[1 : 8+1]) - + pos := 1 + idx.count = binary.BigEndian.Uint64(idx.metaData[pos : pos+8]) + pos += 8 + idx.baseDataID = binary.BigEndian.Uint64(idx.metaData[pos : pos+8]) + pos += 8 + idx.bitsPerBitmap = int(binary.BigEndian.Uint16(idx.metaData[pos : pos+8])) + pos += 2 // nolint + if idx.bitsPerBitmap*int(idx.count)/8 > idx.size-MetaHeaderSize { + return nil, fmt.Errorf("file metadata doesn't match file length: bitsPerBitmap=%d, count=%d, len=%d, %s", idx.bitsPerBitmap, int(idx.count), idx.size, fName) + } return idx, nil } @@ -96,8 +105,8 @@ func (bm *FixedSizeBitmaps) Close() { } func (bm *FixedSizeBitmaps) At(item uint64) (res []uint64, err error) { - if item > bm.amount { - return nil, fmt.Errorf("too big item number: %d > %d", item, bm.amount) + if item > bm.count { + return nil, fmt.Errorf("too big item number: %d > %d", item, bm.count) } n := bm.bitsPerBitmap * int(item) @@ -112,7 +121,7 @@ func (bm *FixedSizeBitmaps) At(item uint64) (res []uint64, err error) { } for bit := bitFrom; bit < bitTo; bit++ { if bm.data[i]&(1< bm.count { + return 0, false, fmt.Errorf("too big item number: %d > %d", item, bm.count) + } + + n := bm.bitsPerBitmap * int(item) + blkFrom, bitFrom := n/64, n%64 + blkTo := (n+bm.bitsPerBitmap)/64 + 1 + bitTo := 64 + + var j uint64 + var found bool + for i := blkFrom; i < blkTo; i++ { // TODO: optimize me. 
it's copy-paste of method `At` + if i == blkTo-1 { + bitTo = (n + bm.bitsPerBitmap) % 64 + } + for bit := bitFrom; bit < bitTo; bit++ { + if bm.data[i]&(1< bm.amount { - return 0, 0, false, false, fmt.Errorf("too big item number: %d > %d", item, bm.amount) + if item > bm.count { + return 0, 0, false, false, fmt.Errorf("too big item number: %d > %d", item, bm.count) } n := bm.bitsPerBitmap * int(item) blkFrom, bitFrom := n/64, n%64 @@ -154,19 +191,22 @@ func (bm *FixedSizeBitmaps) First2At(item, after uint64) (fst uint64, snd uint64 bitFrom = 0 } - return + return fst + bm.baseDataID, snd + bm.baseDataID, ok, ok2, err } type FixedSizeBitmapsWriter struct { f *os.File indexFile, tmpIdxFilePath string - data []uint64 // slice of correct size for the index to work with - metaData []byte - m mmap2.MMap + fileName string + + data []uint64 // slice of correct size for the index to work with + metaData []byte + m mmap2.MMap version uint8 - amount uint64 + baseDataID uint64 // deducted from all stored + count uint64 // of keys size int bitsPerBitmap uint64 @@ -176,19 +216,22 @@ type FixedSizeBitmapsWriter struct { const MetaHeaderSize = 64 -func NewFixedSizeBitmapsWriter(indexFile string, bitsPerBitmap int, amount uint64, logger log.Logger) (*FixedSizeBitmapsWriter, error) { +func NewFixedSizeBitmapsWriter(indexFile string, bitsPerBitmap int, baseDataID, amount uint64, logger log.Logger) (*FixedSizeBitmapsWriter, error) { pageSize := os.Getpagesize() + _, fileName := filepath.Split(indexFile) //TODO: use math.SafeMul() - bytesAmount := MetaHeaderSize + (bitsPerBitmap*int(amount))/8 + bytesAmount := MetaHeaderSize + (bitsPerBitmap*int(amount))/8 + 1 size := (bytesAmount/pageSize + 1) * pageSize // must be page-size-aligned idx := &FixedSizeBitmapsWriter{ indexFile: indexFile, + fileName: fileName, tmpIdxFilePath: indexFile + ".tmp", bitsPerBitmap: uint64(bitsPerBitmap), size: size, - amount: amount, + count: amount, version: 1, logger: logger, + baseDataID: baseDataID, } _ = 
os.Remove(idx.tmpIdxFilePath) @@ -214,8 +257,10 @@ func NewFixedSizeBitmapsWriter(indexFile string, bitsPerBitmap int, amount uint6 // return nil, err //} idx.metaData[0] = idx.version - binary.BigEndian.PutUint64(idx.metaData[1:], idx.amount) - idx.amount = binary.BigEndian.Uint64(idx.metaData[1 : 8+1]) + //fmt.Printf("build: count=%d, %s\n", idx.count, indexFile) + binary.BigEndian.PutUint64(idx.metaData[1:], idx.count) + binary.BigEndian.PutUint64(idx.metaData[1+8:], idx.baseDataID) + binary.BigEndian.PutUint16(idx.metaData[1+8+8:], uint16(idx.bitsPerBitmap)) return idx, nil } @@ -260,13 +305,17 @@ func castToArrU64(in []byte) []uint64 { } func (w *FixedSizeBitmapsWriter) AddArray(item uint64, listOfValues []uint64) error { - if item > w.amount { - return fmt.Errorf("too big item number: %d > %d", item, w.amount) + if item > w.count { + return fmt.Errorf("too big item number: %d > %d", item, w.count) } offset := item * w.bitsPerBitmap for _, v := range listOfValues { + if v < w.baseDataID { //uint-underflow protection + return fmt.Errorf("too small value: %d < %d, %s", v, w.baseDataID, w.fileName) + } + v = v - w.baseDataID if v > w.bitsPerBitmap { - return fmt.Errorf("too big value: %d > %d", v, w.bitsPerBitmap) + return fmt.Errorf("too big value: %d > %d, %s", v, w.bitsPerBitmap, w.fileName) } n := offset + v blkAt, bitAt := int(n/64), int(n%64) diff --git a/kv/bitmapdb/fixed_size_test.go b/kv/bitmapdb/fixed_size_test.go index 9f513c583..8c80ecb39 100644 --- a/kv/bitmapdb/fixed_size_test.go +++ b/kv/bitmapdb/fixed_size_test.go @@ -30,7 +30,7 @@ func TestFixedSizeBitmaps(t *testing.T) { tmpDir, require := t.TempDir(), require.New(t) must := require.NoError idxPath := filepath.Join(tmpDir, "idx.tmp") - wr, err := NewFixedSizeBitmapsWriter(idxPath, 14, 7, log.New()) + wr, err := NewFixedSizeBitmapsWriter(idxPath, 14, 0, 7, log.New()) require.NoError(err) defer wr.Close() @@ -47,7 +47,7 @@ func TestFixedSizeBitmaps(t *testing.T) { err = wr.Build() 
require.NoError(err) - bm, err := OpenFixedSizeBitmaps(idxPath, 14) + bm, err := OpenFixedSizeBitmaps(idxPath) require.NoError(err) defer bm.Close() @@ -95,13 +95,13 @@ func TestPageAlined(t *testing.T) { tmpDir, require := t.TempDir(), require.New(t) idxPath := filepath.Join(tmpDir, "idx.tmp") - bm2, err := NewFixedSizeBitmapsWriter(idxPath, 128, 100, log.New()) + bm2, err := NewFixedSizeBitmapsWriter(idxPath, 128, 0, 100, log.New()) require.NoError(err) require.Equal((128/8*100/os.Getpagesize()+1)*os.Getpagesize(), bm2.size) defer bm2.Close() bm2.Close() - bm3, err := NewFixedSizeBitmapsWriter(idxPath, 128, 1000, log.New()) + bm3, err := NewFixedSizeBitmapsWriter(idxPath, 128, 0, 1000, log.New()) require.NoError(err) require.Equal((128/8*1000/os.Getpagesize()+1)*os.Getpagesize(), bm3.size) defer bm3.Close() diff --git a/kv/mdbx/kv_mdbx.go b/kv/mdbx/kv_mdbx.go index 847f772f2..9441e9bbd 100644 --- a/kv/mdbx/kv_mdbx.go +++ b/kv/mdbx/kv_mdbx.go @@ -580,6 +580,7 @@ func (db *MdbxKV) AllTables() kv.TableCfg { return db.buckets } +func (tx *MdbxTx) IsRo() bool { return tx.readOnly } func (tx *MdbxTx) ViewID() uint64 { return tx.tx.ID() } func (tx *MdbxTx) CollectMetrics() { @@ -629,9 +630,16 @@ func (tx *MdbxTx) CollectMetrics() { } // ListBuckets - all buckets stored as keys of un-named bucket -func (tx *MdbxTx) ListBuckets() ([]string, error) { - return tx.tx.ListDBI() +func (tx *MdbxTx) ListBuckets() ([]string, error) { return tx.tx.ListDBI() } + +func (tx *MdbxTx) WarmupDB(force bool) error { + if force { + return tx.tx.EnvWarmup(mdbx.WarmupForce|mdbx.WarmupOomSafe, time.Hour) + } + return tx.tx.EnvWarmup(mdbx.WarmupDefault, time.Hour) } +func (tx *MdbxTx) LockDBInRam() error { return tx.tx.EnvWarmup(mdbx.WarmupLock, time.Hour) } +func (tx *MdbxTx) UnlockDBFromRam() error { return tx.tx.EnvWarmup(mdbx.WarmupRelease, time.Hour) } func (db *MdbxKV) View(ctx context.Context, f func(tx kv.Tx) error) (err error) { // can't use db.env.View method - because it calls 
commit for read transactions - it conflicts with write transactions. diff --git a/kv/mdbx/kv_mdbx_temporary.go b/kv/mdbx/kv_mdbx_temporary.go index f6723b85c..c7a6d5040 100644 --- a/kv/mdbx/kv_mdbx_temporary.go +++ b/kv/mdbx/kv_mdbx_temporary.go @@ -34,7 +34,6 @@ func NewTemporaryMdbx(tempdir string) (kv.RwDB, error) { if err != nil { return &TemporaryMdbx{}, err } - db, err := Open(path, log.Root(), false) if err != nil { return &TemporaryMdbx{}, err diff --git a/kv/memdb/memory_mutation_test.go b/kv/memdb/memory_mutation_test.go index 9777b8a30..28c38891b 100644 --- a/kv/memdb/memory_mutation_test.go +++ b/kv/memdb/memory_mutation_test.go @@ -36,10 +36,13 @@ func TestPutAppendHas(t *testing.T) { batch := NewMemoryBatch(rwTx, "") require.NoError(t, batch.Append(kv.HashedAccounts, []byte("AAAA"), []byte("value1.5"))) - require.Error(t, batch.Append(kv.HashedAccounts, []byte("AAAA"), []byte("value1.3"))) + //MDBX's APPEND checking only keys, not values + require.NoError(t, batch.Append(kv.HashedAccounts, []byte("AAAA"), []byte("value1.3"))) + require.NoError(t, batch.Put(kv.HashedAccounts, []byte("AAAA"), []byte("value1.3"))) require.NoError(t, batch.Append(kv.HashedAccounts, []byte("CBAA"), []byte("value3.5"))) - require.Error(t, batch.Append(kv.HashedAccounts, []byte("CBAA"), []byte("value3.1"))) + //MDBX's APPEND checking only keys, not values + require.NoError(t, batch.Append(kv.HashedAccounts, []byte("CBAA"), []byte("value3.1"))) require.NoError(t, batch.AppendDup(kv.HashedAccounts, []byte("CBAA"), []byte("value3.1"))) require.Error(t, batch.Append(kv.HashedAccounts, []byte("AAAA"), []byte("value1.3"))) diff --git a/kv/remotedbserver/remotedbserver.go b/kv/remotedbserver/remotedbserver.go index 526190af3..ddf882bd4 100644 --- a/kv/remotedbserver/remotedbserver.go +++ b/kv/remotedbserver/remotedbserver.go @@ -27,6 +27,7 @@ import ( "sync/atomic" "time" + "github.com/ledgerwatch/erigon-lib/state" "github.com/ledgerwatch/log/v3" "google.golang.org/protobuf/proto" 
"google.golang.org/protobuf/types/known/emptypb" @@ -72,7 +73,7 @@ type KvServer struct { kv kv.RoDB stateChangeStreams *StateChangePubSub blockSnapshots Snapsthots - historySnapshots Snapsthots + historySnapshots *state.AggregatorV3 ctx context.Context //v3 fields @@ -94,7 +95,7 @@ type Snapsthots interface { Files() []string } -func NewKvServer(ctx context.Context, db kv.RoDB, snapshots Snapsthots, historySnapshots Snapsthots, logger log.Logger) *KvServer { +func NewKvServer(ctx context.Context, db kv.RoDB, snapshots Snapsthots, historySnapshots *state.AggregatorV3, logger log.Logger) *KvServer { return &KvServer{ trace: false, rangeStep: 1024, @@ -456,7 +457,9 @@ func (s *KvServer) Snapshots(ctx context.Context, _ *remote.SnapshotsRequest) (* return &remote.SnapshotsReply{BlocksFiles: []string{}, HistoryFiles: []string{}}, nil } - return &remote.SnapshotsReply{BlocksFiles: s.blockSnapshots.Files(), HistoryFiles: s.historySnapshots.Files()}, nil + ac := s.historySnapshots.MakeContext() + defer ac.Close() + return &remote.SnapshotsReply{BlocksFiles: s.blockSnapshots.Files(), HistoryFiles: ac.Files()}, nil } type StateChangePubSub struct { diff --git a/kv/tables.go b/kv/tables.go index d700ce173..f62c3eef2 100644 --- a/kv/tables.go +++ b/kv/tables.go @@ -676,7 +676,8 @@ var ChaindataTablesCfg = TableCfg{ }, CallTraceSet: {Flags: DupSort}, - TblAccountKeys: {Flags: DupSort}, + TblAccountKeys: {Flags: DupSort}, + //TblAccountVals: {Flags: DupSort}, TblAccountHistoryKeys: {Flags: DupSort}, TblAccountHistoryVals: {Flags: DupSort}, TblAccountIdx: {Flags: DupSort}, @@ -689,21 +690,22 @@ var ChaindataTablesCfg = TableCfg{ TblCodeIdx: {Flags: DupSort}, TblCommitmentKeys: {Flags: DupSort}, TblCommitmentHistoryKeys: {Flags: DupSort}, - TblCommitmentIdx: {Flags: DupSort}, - TblLogAddressKeys: {Flags: DupSort}, - TblLogAddressIdx: {Flags: DupSort}, - TblLogTopicsKeys: {Flags: DupSort}, - TblLogTopicsIdx: {Flags: DupSort}, - TblTracesFromKeys: {Flags: DupSort}, - 
TblTracesFromIdx: {Flags: DupSort}, - TblTracesToKeys: {Flags: DupSort}, - TblTracesToIdx: {Flags: DupSort}, - RAccountKeys: {Flags: DupSort}, - RAccountIdx: {Flags: DupSort}, - RStorageKeys: {Flags: DupSort}, - RStorageIdx: {Flags: DupSort}, - RCodeKeys: {Flags: DupSort}, - RCodeIdx: {Flags: DupSort}, + //TblCommitmentHistoryVals: {Flags: DupSort}, + TblCommitmentIdx: {Flags: DupSort}, + TblLogAddressKeys: {Flags: DupSort}, + TblLogAddressIdx: {Flags: DupSort}, + TblLogTopicsKeys: {Flags: DupSort}, + TblLogTopicsIdx: {Flags: DupSort}, + TblTracesFromKeys: {Flags: DupSort}, + TblTracesFromIdx: {Flags: DupSort}, + TblTracesToKeys: {Flags: DupSort}, + TblTracesToIdx: {Flags: DupSort}, + RAccountKeys: {Flags: DupSort}, + RAccountIdx: {Flags: DupSort}, + RStorageKeys: {Flags: DupSort}, + RStorageIdx: {Flags: DupSort}, + RCodeKeys: {Flags: DupSort}, + RCodeIdx: {Flags: DupSort}, } var TxpoolTablesCfg = TableCfg{} @@ -791,15 +793,17 @@ func reinit() { // Temporal const ( - AccountsDomain Domain = "AccountsDomain" - StorageDomain Domain = "StorageDomain" - CodeDomain Domain = "CodeDomain" + AccountsDomain Domain = "AccountsDomain" + StorageDomain Domain = "StorageDomain" + CodeDomain Domain = "CodeDomain" + CommitmentDomain Domain = "CommitmentDomain" ) const ( - AccountsHistory History = "AccountsHistory" - StorageHistory History = "StorageHistory" - CodeHistory History = "CodeHistory" + AccountsHistory History = "AccountsHistory" + StorageHistory History = "StorageHistory" + CodeHistory History = "CodeHistory" + CommitmentHistory History = "CommitmentHistory" ) const ( diff --git a/recsplit/eliasfano16/elias_fano.go b/recsplit/eliasfano16/elias_fano.go index b67a2ab24..e32046bcd 100644 --- a/recsplit/eliasfano16/elias_fano.go +++ b/recsplit/eliasfano16/elias_fano.go @@ -442,8 +442,7 @@ func (ef *DoubleEliasFano) Data() []uint64 { func (ef *DoubleEliasFano) get2(i uint64) (cumKeys, position uint64, windowCumKeys uint64, selectCumKeys int, currWordCumKeys, lower, cumDelta 
uint64) { posLower := i * (ef.lCumKeys + ef.lPosition) - idx64 := posLower / 64 - shift := posLower % 64 + idx64, shift := posLower/64, posLower%64 lower = ef.lowerBits[idx64] >> shift if shift > 0 { lower |= ef.lowerBits[idx64+1] << (64 - shift) @@ -504,11 +503,10 @@ func (ef *DoubleEliasFano) Get2(i uint64) (cumKeys, position uint64) { } func (ef *DoubleEliasFano) Get3(i uint64) (cumKeys, cumKeysNext, position uint64) { - var windowCumKeys uint64 - var selectCumKeys int - var currWordCumKeys uint64 - var lower uint64 - var cumDelta uint64 + var ( + windowCumKeys, currWordCumKeys, lower, cumDelta uint64 + selectCumKeys int + ) cumKeys, position, windowCumKeys, selectCumKeys, currWordCumKeys, lower, cumDelta = ef.get2(i) windowCumKeys &= (uint64(0xffffffffffffffff) << selectCumKeys) << 1 for windowCumKeys == 0 { diff --git a/recsplit/eliasfano32/elias_fano.go b/recsplit/eliasfano32/elias_fano.go index a966aa9c3..b8e92fc94 100644 --- a/recsplit/eliasfano32/elias_fano.go +++ b/recsplit/eliasfano32/elias_fano.go @@ -221,6 +221,13 @@ func (ef *EliasFano) upper(i uint64) uint64 { return currWord*64 + uint64(sel) - i } +// TODO: optimize me - to avoid object allocation +func Seek(data []byte, n uint64) (uint64, bool) { + ef, _ := ReadEliasFano(data) + //TODO: if startTxNum==0, can do ef.Get(0) + return ef.Search(n) +} + // Search returns the value in the sequence, equal or greater than given value func (ef *EliasFano) search(v uint64) (nextV uint64, nextI uint64, ok bool) { if v == 0 { diff --git a/recsplit/eliasfano32/elias_fano_test.go b/recsplit/eliasfano32/elias_fano_test.go index 5d9cd74f1..580be3360 100644 --- a/recsplit/eliasfano32/elias_fano_test.go +++ b/recsplit/eliasfano32/elias_fano_test.go @@ -59,7 +59,7 @@ func TestEliasFanoSeek(t *testing.T) { { v2, ok2 := ef.Search(ef.Max()) require.True(t, ok2, v2) - require.Equal(t, ef.Max(), v2) + require.Equal(t, int(ef.Max()), int(v2)) it := ef.Iterator() //it.SeekDeprecated(ef.Max()) for i := 0; i < 
int(ef.Count()-1); i++ { diff --git a/recsplit/golomb_rice.go b/recsplit/golomb_rice.go index 98221e1bf..e0bdc70d7 100644 --- a/recsplit/golomb_rice.go +++ b/recsplit/golomb_rice.go @@ -116,9 +116,7 @@ func (g *GolombRiceReader) SkipSubtree(nodes, fixedLen int) { g.currFixedOffset += fixedLen } -func (g *GolombRiceReader) ReadNext(log2golomb int) uint64 { - var result uint64 - +func (g *GolombRiceReader) ReadNext(log2golomb int) (result uint64) { if g.currWindowUnary == 0 { result += uint64(g.validLowerBitsUnary) g.currWindowUnary = g.data[g.currPtrUnary] @@ -141,9 +139,8 @@ func (g *GolombRiceReader) ReadNext(log2golomb int) uint64 { result <<= log2golomb idx64 := g.currFixedOffset >> 6 - var fixed uint64 shift := g.currFixedOffset & 63 - fixed = g.data[idx64] >> shift + fixed := g.data[idx64] >> shift if shift+log2golomb > 64 { fixed |= g.data[idx64+1] << (64 - shift) } diff --git a/recsplit/index.go b/recsplit/index.go index d1765a7d3..4fa95025e 100644 --- a/recsplit/index.go +++ b/recsplit/index.go @@ -25,6 +25,7 @@ import ( "os" "path/filepath" "sync" + "sync/atomic" "time" "unsafe" @@ -64,7 +65,8 @@ type Index struct { primaryAggrBound uint16 // The lower bound for primary key aggregation (computed from leafSize) enums bool - readers *sync.Pool + readers *sync.Pool + readAheadRefcnt atomic.Int32 // ref-counter: allow enable/disable read-ahead from goroutines. 
only when refcnt=0 - disable read-ahead once } func MustOpen(indexFile string) *Index { @@ -344,17 +346,29 @@ func (idx *Index) DisableReadAhead() { if idx == nil || idx.mmapHandle1 == nil { return } - _ = mmap.MadviseRandom(idx.mmapHandle1) + leftReaders := idx.readAheadRefcnt.Add(-1) + if leftReaders == 0 { + if dbg.SnMadvNormal() { + _ = mmap.MadviseNormal(idx.mmapHandle1) + } else { + _ = mmap.MadviseRandom(idx.mmapHandle1) + } + } else if leftReaders < 0 { + log.Warn("read-ahead negative counter", "file", idx.FileName()) + } } func (idx *Index) EnableReadAhead() *Index { + idx.readAheadRefcnt.Add(1) _ = mmap.MadviseSequential(idx.mmapHandle1) return idx } func (idx *Index) EnableMadvNormal() *Index { + idx.readAheadRefcnt.Add(1) _ = mmap.MadviseNormal(idx.mmapHandle1) return idx } func (idx *Index) EnableWillNeed() *Index { + idx.readAheadRefcnt.Add(1) _ = mmap.MadviseWillNeed(idx.mmapHandle1) return idx } diff --git a/recsplit/index_reader.go b/recsplit/index_reader.go index 0ad10ea09..0ccfff745 100644 --- a/recsplit/index_reader.go +++ b/recsplit/index_reader.go @@ -37,38 +37,34 @@ func NewIndexReader(index *Index) *IndexReader { } } -func (r *IndexReader) sum(key []byte) (uint64, uint64) { +func (r *IndexReader) sum(key []byte) (hi uint64, lo uint64) { r.mu.Lock() - defer r.mu.Unlock() r.hasher.Reset() r.hasher.Write(key) //nolint:errcheck - return r.hasher.Sum128() + hi, lo = r.hasher.Sum128() + r.mu.Unlock() + return hi, lo } -func (r *IndexReader) sum2(key1, key2 []byte) (uint64, uint64) { +func (r *IndexReader) sum2(key1, key2 []byte) (hi uint64, lo uint64) { r.mu.Lock() - defer r.mu.Unlock() r.hasher.Reset() r.hasher.Write(key1) //nolint:errcheck r.hasher.Write(key2) //nolint:errcheck - return r.hasher.Sum128() + hi, lo = r.hasher.Sum128() + r.mu.Unlock() + return hi, lo } // Lookup wraps index Lookup func (r *IndexReader) Lookup(key []byte) uint64 { bucketHash, fingerprint := r.sum(key) - if r.index != nil { - return r.index.Lookup(bucketHash, 
fingerprint) - } - return 0 + return r.index.Lookup(bucketHash, fingerprint) } func (r *IndexReader) Lookup2(key1, key2 []byte) uint64 { bucketHash, fingerprint := r.sum2(key1, key2) - if r.index != nil { - return r.index.Lookup(bucketHash, fingerprint) - } - return 0 + return r.index.Lookup(bucketHash, fingerprint) } func (r *IndexReader) Empty() bool { @@ -81,3 +77,11 @@ func (r *IndexReader) Close() { } r.index.readers.Put(r) } + +func (r *IndexReader) Sum(key []byte) (uint64, uint64) { return r.sum(key) } +func (r *IndexReader) LookupHash(hi, lo uint64) uint64 { + if r.index != nil { + return r.index.Lookup(hi, lo) + } + return 0 +} diff --git a/recsplit/index_test.go b/recsplit/index_test.go index 849cdb710..db66d3803 100644 --- a/recsplit/index_test.go +++ b/recsplit/index_test.go @@ -32,10 +32,11 @@ func TestReWriteIndex(t *testing.T) { logger := log.New() tmpDir := t.TempDir() indexFile := filepath.Join(tmpDir, "index") + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: 100, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: indexFile, LeafSize: 8, diff --git a/recsplit/recsplit.go b/recsplit/recsplit.go index a019ca9b3..4cd881b44 100644 --- a/recsplit/recsplit.go +++ b/recsplit/recsplit.go @@ -128,7 +128,7 @@ type RecSplitArgs struct { BucketSize int BaseDataID uint64 EtlBufLimit datasize.ByteSize - Salt uint32 // Hash seed (salt) for the hash function used for allocating the initial buckets - need to be generated randomly + Salt *uint32 // Hash seed (salt) for the hash function used for allocating the initial buckets - need to be generated randomly LeafSize uint16 } @@ -144,21 +144,22 @@ func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) { 0x082f20e10092a9a3, 0x2ada2ce68d21defc, 0xe33cb4f3e7c6466b, 0x3980be458c509c59, 0xc466fd9584828e8c, 0x45f0aabe1a61ede6, 0xf6e7b8b33ad9b98d, 0x4ef95e25f4b4983d, 0x81175195173b92d3, 0x4e50927d8dd15978, 0x1ea2099d1fafae7f, 0x425c8a06fbaaa815, 0xcd4216006c74052a} } - 
rs.salt = args.Salt - if rs.salt == 0 { + rs.tmpDir = args.TmpDir + rs.indexFile = args.IndexFile + rs.tmpFilePath = args.IndexFile + ".tmp" + _, fname := filepath.Split(rs.indexFile) + rs.indexFileName = fname + rs.baseDataID = args.BaseDataID + if args.Salt == nil { seedBytes := make([]byte, 4) if _, err := rand.Read(seedBytes); err != nil { return nil, err } rs.salt = binary.BigEndian.Uint32(seedBytes) + } else { + rs.salt = *args.Salt } rs.hasher = murmur3.New128WithSeed(rs.salt) - rs.tmpDir = args.TmpDir - rs.indexFile = args.IndexFile - rs.tmpFilePath = args.IndexFile + ".tmp" - _, fname := filepath.Split(rs.indexFile) - rs.indexFileName = fname - rs.baseDataID = args.BaseDataID rs.etlBufLimit = args.EtlBufLimit if rs.etlBufLimit == 0 { rs.etlBufLimit = etl.BufferOptimalSize @@ -190,6 +191,7 @@ func NewRecSplit(args RecSplitArgs, logger log.Logger) (*RecSplit, error) { return rs, nil } +func (rs *RecSplit) Salt() uint32 { return rs.salt } func (rs *RecSplit) Close() { if rs.indexF != nil { rs.indexF.Close() @@ -210,8 +212,8 @@ func (rs *RecSplit) SetTrace(trace bool) { // remap converts the number x which is assumed to be uniformly distributed over the range [0..2^64) to the number that is uniformly // distributed over the range [0..n) -func remap(x uint64, n uint64) uint64 { - hi, _ := bits.Mul64(x, n) +func remap(x uint64, n uint64) (hi uint64) { + hi, _ = bits.Mul64(x, n) return hi } @@ -260,6 +262,8 @@ func splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound uint16) (fano return } +var golombBaseLog2 = -math.Log((math.Sqrt(5) + 1.0) / 2.0) + func computeGolombRice(m uint16, table []uint32, leafSize, primaryAggrBound, secondaryAggrBound uint16) { fanout, unit := splitParams(m, leafSize, primaryAggrBound, secondaryAggrBound) k := make([]uint16, fanout) @@ -273,7 +277,7 @@ func computeGolombRice(m uint16, table []uint32, leafSize, primaryAggrBound, sec sqrtProd *= math.Sqrt(float64(k[i])) } p := math.Sqrt(float64(m)) / (math.Pow(2*math.Pi, 
(float64(fanout)-1.)/2.0) * sqrtProd) - golombRiceLength := uint32(math.Ceil(math.Log2(-math.Log((math.Sqrt(5)+1.0)/2.0) / math.Log1p(-p)))) // log2 Golomb modulus + golombRiceLength := uint32(math.Ceil(math.Log2(golombBaseLog2 / math.Log1p(-p)))) // log2 Golomb modulus if golombRiceLength > 0x1F { panic("golombRiceLength > 0x1F") } @@ -299,8 +303,7 @@ func computeGolombRice(m uint16, table []uint32, leafSize, primaryAggrBound, sec // salt for the part of the hash function separating m elements. It is based on // calculations with assumptions that we draw hash functions at random func (rs *RecSplit) golombParam(m uint16) int { - s := uint16(len(rs.golombRice)) - for m >= s { + for s := uint16(len(rs.golombRice)); m >= s; s++ { rs.golombRice = append(rs.golombRice, 0) // For the case where bucket is larger than planned if s == 0 { @@ -310,7 +313,6 @@ func (rs *RecSplit) golombParam(m uint16) int { } else { computeGolombRice(s, rs.golombRice, rs.leafSize, rs.primaryAggrBound, rs.secondaryAggrBound) } - s++ } return int(rs.golombRice[m] >> 27) } diff --git a/recsplit/recsplit_fuzz_test.go b/recsplit/recsplit_fuzz_test.go index ef2f58b9d..8786749a6 100644 --- a/recsplit/recsplit_fuzz_test.go +++ b/recsplit/recsplit_fuzz_test.go @@ -52,11 +52,12 @@ func FuzzRecSplit(f *testing.F) { } tmpDir := t.TempDir() indexFile := filepath.Join(tmpDir, "index") + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: count, Enums: true, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: indexFile, LeafSize: 8, diff --git a/recsplit/recsplit_test.go b/recsplit/recsplit_test.go index ab4f818eb..4725d620d 100644 --- a/recsplit/recsplit_test.go +++ b/recsplit/recsplit_test.go @@ -28,10 +28,11 @@ import ( func TestRecSplit2(t *testing.T) { logger := log.New() tmpDir := t.TempDir() + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: 2, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: filepath.Join(tmpDir, "index"), LeafSize: 
8, @@ -62,10 +63,11 @@ func TestRecSplit2(t *testing.T) { func TestRecSplitDuplicate(t *testing.T) { logger := log.New() tmpDir := t.TempDir() + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: 2, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: filepath.Join(tmpDir, "index"), LeafSize: 8, @@ -87,10 +89,11 @@ func TestRecSplitDuplicate(t *testing.T) { func TestRecSplitLeafSizeTooLarge(t *testing.T) { logger := log.New() tmpDir := t.TempDir() + salt := uint32(1) _, err := NewRecSplit(RecSplitArgs{ KeyCount: 2, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: filepath.Join(tmpDir, "index"), LeafSize: 64, @@ -104,10 +107,11 @@ func TestIndexLookup(t *testing.T) { logger := log.New() tmpDir := t.TempDir() indexFile := filepath.Join(tmpDir, "index") + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: 100, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: indexFile, LeafSize: 8, @@ -138,10 +142,11 @@ func TestTwoLayerIndex(t *testing.T) { logger := log.New() tmpDir := t.TempDir() indexFile := filepath.Join(tmpDir, "index") + salt := uint32(1) rs, err := NewRecSplit(RecSplitArgs{ KeyCount: 100, BucketSize: 10, - Salt: 0, + Salt: &salt, TmpDir: tmpDir, IndexFile: indexFile, LeafSize: 8, diff --git a/state/aggregator.go b/state/aggregator.go deleted file mode 100644 index 0f7199246..000000000 --- a/state/aggregator.go +++ /dev/null @@ -1,1368 +0,0 @@ -/* - Copyright 2022 The Erigon contributors - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. -*/ - -package state - -import ( - "bytes" - "context" - "fmt" - "math" - "math/bits" - "os" - "sync" - "sync/atomic" - "time" - - "github.com/VictoriaMetrics/metrics" - "github.com/holiman/uint256" - "github.com/ledgerwatch/log/v3" - "golang.org/x/sync/errgroup" - - "github.com/ledgerwatch/erigon-lib/commitment" - "github.com/ledgerwatch/erigon-lib/common" - "github.com/ledgerwatch/erigon-lib/common/background" - "github.com/ledgerwatch/erigon-lib/common/length" - "github.com/ledgerwatch/erigon-lib/kv" - "github.com/ledgerwatch/erigon-lib/kv/iter" - "github.com/ledgerwatch/erigon-lib/kv/order" -) - -// StepsInBiggestFile - files of this size are completely frozen/immutable. -// files of smaller size are also immutable, but can be removed after merge to bigger files. -const StepsInBiggestFile = 32 - -var ( - mxCurrentTx = metrics.GetOrCreateCounter("domain_tx_processed") - mxCurrentBlock = metrics.GetOrCreateCounter("domain_block_current") - mxRunningMerges = metrics.GetOrCreateCounter("domain_running_merges") - mxRunningCollations = metrics.GetOrCreateCounter("domain_running_collations") - mxCollateTook = metrics.GetOrCreateHistogram("domain_collate_took") - mxPruneTook = metrics.GetOrCreateHistogram("domain_prune_took") - mxPruneHistTook = metrics.GetOrCreateHistogram("domain_prune_hist_took") - mxPruningProgress = metrics.GetOrCreateCounter("domain_pruning_progress") - mxCollationSize = metrics.GetOrCreateCounter("domain_collation_size") - mxCollationSizeHist = metrics.GetOrCreateCounter("domain_collation_hist_size") - mxPruneSize = metrics.GetOrCreateCounter("domain_prune_size") - mxBuildTook = metrics.GetOrCreateSummary("domain_build_files_took") - mxStepCurrent = metrics.GetOrCreateCounter("domain_step_current") - mxStepTook = metrics.GetOrCreateHistogram("domain_step_took") - mxCommitmentKeys = metrics.GetOrCreateCounter("domain_commitment_keys") - 
mxCommitmentRunning = metrics.GetOrCreateCounter("domain_running_commitment") - mxCommitmentTook = metrics.GetOrCreateSummary("domain_commitment_took") - mxCommitmentWriteTook = metrics.GetOrCreateHistogram("domain_commitment_write_took") - mxCommitmentUpdates = metrics.GetOrCreateCounter("domain_commitment_updates") - mxCommitmentUpdatesApplied = metrics.GetOrCreateCounter("domain_commitment_updates_applied") -) - -type Aggregator struct { - db kv.RwDB - aggregationStep uint64 - accounts *Domain - storage *Domain - code *Domain - commitment *DomainCommitted - logAddrs *InvertedIndex - logTopics *InvertedIndex - tracesFrom *InvertedIndex - tracesTo *InvertedIndex - txNum uint64 - seekTxNum uint64 - blockNum uint64 - stepDoneNotice chan [length.Hash]byte - rwTx kv.RwTx - stats FilesStats - tmpdir string - defaultCtx *AggregatorContext - - ps *background.ProgressSet - logger log.Logger -} - -//type exposedMetrics struct { -// CollationSize *metrics.Gauge -// CollationSizeHist *metrics.Gauge -// PruneSize *metrics.Gauge -// -// lastCollSize int -// lastColHistSize int -// lastPruneSize int -//} -// -//func (e exposedMetrics) init() { -// e.CollationSize = metrics.GetOrCreateGauge("domain_collation_size", func() float64 { return 0 }) -// e.CollationSizeHist = metrics.GetOrCreateGauge("domain_collation_hist_size", func() float64 { return 0 }) -// e.PruneSize = metrics.GetOrCreateGauge("domain_prune_size", func() float64 { return e.lastPruneSize }) -//} - -func NewAggregator(dir, tmpdir string, aggregationStep uint64, commitmentMode CommitmentMode, commitTrieVariant commitment.TrieVariant, logger log.Logger) (*Aggregator, error) { - a := &Aggregator{aggregationStep: aggregationStep, ps: background.NewProgressSet(), tmpdir: tmpdir, stepDoneNotice: make(chan [length.Hash]byte, 1), logger: logger} - - closeAgg := true - defer func() { - if closeAgg { - a.Close() - } - }() - err := os.MkdirAll(dir, 0764) - if err != nil { - return nil, err - } - if a.accounts, err = 
NewDomain(dir, tmpdir, aggregationStep, "accounts", kv.TblAccountKeys, kv.TblAccountVals, kv.TblAccountHistoryKeys, kv.TblAccountHistoryVals, kv.TblAccountIdx, false, false, logger); err != nil { - return nil, err - } - if a.storage, err = NewDomain(dir, tmpdir, aggregationStep, "storage", kv.TblStorageKeys, kv.TblStorageVals, kv.TblStorageHistoryKeys, kv.TblStorageHistoryVals, kv.TblStorageIdx, false, false, logger); err != nil { - return nil, err - } - if a.code, err = NewDomain(dir, tmpdir, aggregationStep, "code", kv.TblCodeKeys, kv.TblCodeVals, kv.TblCodeHistoryKeys, kv.TblCodeHistoryVals, kv.TblCodeIdx, true, true, logger); err != nil { - return nil, err - } - - commitd, err := NewDomain(dir, tmpdir, aggregationStep, "commitment", kv.TblCommitmentKeys, kv.TblCommitmentVals, kv.TblCommitmentHistoryKeys, kv.TblCommitmentHistoryVals, kv.TblCommitmentIdx, false, true, logger) - if err != nil { - return nil, err - } - a.commitment = NewCommittedDomain(commitd, commitmentMode, commitTrieVariant, logger) - - if a.logAddrs, err = NewInvertedIndex(dir, tmpdir, aggregationStep, "logaddrs", kv.TblLogAddressKeys, kv.TblLogAddressIdx, false, nil, logger); err != nil { - return nil, err - } - if a.logTopics, err = NewInvertedIndex(dir, tmpdir, aggregationStep, "logtopics", kv.TblLogTopicsKeys, kv.TblLogTopicsIdx, false, nil, logger); err != nil { - return nil, err - } - if a.tracesFrom, err = NewInvertedIndex(dir, tmpdir, aggregationStep, "tracesfrom", kv.TblTracesFromKeys, kv.TblTracesFromIdx, false, nil, logger); err != nil { - return nil, err - } - if a.tracesTo, err = NewInvertedIndex(dir, tmpdir, aggregationStep, "tracesto", kv.TblTracesToKeys, kv.TblTracesToIdx, false, nil, logger); err != nil { - return nil, err - } - closeAgg = false - - a.seekTxNum = a.EndTxNumMinimax() - return a, nil -} - -func (a *Aggregator) SetDB(db kv.RwDB) { a.db = db } - -func (a *Aggregator) buildMissedIdxBlocking(d *Domain) error { - eg, ctx := errgroup.WithContext(context.Background()) 
- eg.SetLimit(32) - if err := d.BuildMissedIndices(ctx, eg, a.ps); err != nil { - return err - } - return eg.Wait() -} -func (a *Aggregator) ReopenFolder() (err error) { - { - if err = a.buildMissedIdxBlocking(a.accounts); err != nil { - return err - } - if err = a.buildMissedIdxBlocking(a.storage); err != nil { - return err - } - if err = a.buildMissedIdxBlocking(a.code); err != nil { - return err - } - if err = a.buildMissedIdxBlocking(a.commitment.Domain); err != nil { - return err - } - } - - if err = a.accounts.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.storage.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.code.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.commitment.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.logAddrs.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.logTopics.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.tracesFrom.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - if err = a.tracesTo.OpenFolder(); err != nil { - return fmt.Errorf("OpenFolder: %w", err) - } - return nil -} - -func (a *Aggregator) ReopenList(fNames []string) error { - var err error - if err = a.accounts.OpenList(fNames); err != nil { - return err - } - if err = a.storage.OpenList(fNames); err != nil { - return err - } - if err = a.code.OpenList(fNames); err != nil { - return err - } - if err = a.commitment.OpenList(fNames); err != nil { - return err - } - if err = a.logAddrs.OpenList(fNames); err != nil { - return err - } - if err = a.logTopics.OpenList(fNames); err != nil { - return err - } - if err = a.tracesFrom.OpenList(fNames); err != nil { - return err - } - if err = a.tracesTo.OpenList(fNames); err != nil { - return err - } - return nil -} - -func (a *Aggregator) GetAndResetStats() DomainStats { - 
stats := DomainStats{HistoryQueries: &atomic.Uint64{}, TotalQueries: &atomic.Uint64{}} - stats.Accumulate(a.accounts.GetAndResetStats()) - stats.Accumulate(a.storage.GetAndResetStats()) - stats.Accumulate(a.code.GetAndResetStats()) - stats.Accumulate(a.commitment.GetAndResetStats()) - - var tto, tfrom, ltopics, laddr DomainStats - tto.FilesCount, tto.DataSize, tto.IndexSize = a.tracesTo.collectFilesStat() - tfrom.FilesCount, tfrom.DataSize, tfrom.DataSize = a.tracesFrom.collectFilesStat() - ltopics.FilesCount, ltopics.DataSize, ltopics.IndexSize = a.logTopics.collectFilesStat() - laddr.FilesCount, laddr.DataSize, laddr.IndexSize = a.logAddrs.collectFilesStat() - - stats.Accumulate(tto) - stats.Accumulate(tfrom) - stats.Accumulate(ltopics) - stats.Accumulate(laddr) - return stats -} - -func (a *Aggregator) Close() { - if a.defaultCtx != nil { - a.defaultCtx.Close() - } - if a.stepDoneNotice != nil { - close(a.stepDoneNotice) - } - if a.accounts != nil { - a.accounts.Close() - } - if a.storage != nil { - a.storage.Close() - } - if a.code != nil { - a.code.Close() - } - if a.commitment != nil { - a.commitment.Close() - } - - if a.logAddrs != nil { - a.logAddrs.Close() - } - if a.logTopics != nil { - a.logTopics.Close() - } - if a.tracesFrom != nil { - a.tracesFrom.Close() - } - if a.tracesTo != nil { - a.tracesTo.Close() - } -} - -func (a *Aggregator) SetTx(tx kv.RwTx) { - a.rwTx = tx - a.accounts.SetTx(tx) - a.storage.SetTx(tx) - a.code.SetTx(tx) - a.commitment.SetTx(tx) - a.logAddrs.SetTx(tx) - a.logTopics.SetTx(tx) - a.tracesFrom.SetTx(tx) - a.tracesTo.SetTx(tx) -} - -func (a *Aggregator) SetTxNum(txNum uint64) { - mxCurrentTx.Set(txNum) - - a.txNum = txNum - a.accounts.SetTxNum(txNum) - a.storage.SetTxNum(txNum) - a.code.SetTxNum(txNum) - a.commitment.SetTxNum(txNum) - a.logAddrs.SetTxNum(txNum) - a.logTopics.SetTxNum(txNum) - a.tracesFrom.SetTxNum(txNum) - a.tracesTo.SetTxNum(txNum) -} - -func (a *Aggregator) SetBlockNum(blockNum uint64) { - a.blockNum = blockNum 
- mxCurrentBlock.Set(blockNum) -} - -func (a *Aggregator) SetWorkers(i int) { - a.accounts.compressWorkers = i - a.storage.compressWorkers = i - a.code.compressWorkers = i - a.commitment.compressWorkers = i - a.logAddrs.compressWorkers = i - a.logTopics.compressWorkers = i - a.tracesFrom.compressWorkers = i - a.tracesTo.compressWorkers = i -} - -func (a *Aggregator) SetCommitmentMode(mode CommitmentMode) { - a.commitment.mode = mode -} - -func (a *Aggregator) EndTxNumMinimax() uint64 { - min := a.accounts.endTxNumMinimax() - if txNum := a.storage.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.code.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.commitment.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.logAddrs.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.logTopics.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.tracesFrom.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.tracesTo.endTxNumMinimax(); txNum < min { - min = txNum - } - return min -} - -func (a *Aggregator) DomainEndTxNumMinimax() uint64 { - min := a.accounts.endTxNumMinimax() - if txNum := a.storage.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.code.endTxNumMinimax(); txNum < min { - min = txNum - } - if txNum := a.commitment.endTxNumMinimax(); txNum < min { - min = txNum - } - return min -} - -func (a *Aggregator) SeekCommitment() (blockNum, txNum uint64, err error) { - filesTxNum := a.EndTxNumMinimax() - blockNum, txNum, err = a.commitment.SeekCommitment(a.aggregationStep, filesTxNum) - if err != nil { - return 0, 0, err - } - if txNum == 0 { - return - } - a.seekTxNum = txNum + 1 - return blockNum, txNum + 1, nil -} - -func (a *Aggregator) mergeDomainSteps(ctx context.Context) error { - mergeStartedAt := time.Now() - maxEndTxNum := a.DomainEndTxNumMinimax() - - var upmerges int - for { - a.defaultCtx.Close() - a.defaultCtx = a.MakeContext() - - somethingMerged, 
err := a.mergeLoopStep(ctx, maxEndTxNum, 1) - if err != nil { - return err - } - - if !somethingMerged { - break - } - upmerges++ - } - - if upmerges > 1 { - a.logger.Info("[stat] aggregation merged", - "upto_tx", maxEndTxNum, - "merge_took", time.Since(mergeStartedAt), - "merges_count", upmerges) - } - - return nil -} - -func (a *Aggregator) aggregate(ctx context.Context, step uint64) error { - var ( - logEvery = time.NewTicker(time.Second * 30) - wg sync.WaitGroup - errCh = make(chan error, 8) - maxSpan = StepsInBiggestFile * a.aggregationStep - txFrom = step * a.aggregationStep - txTo = (step + 1) * a.aggregationStep - workers = 1 - - stepStartedAt = time.Now() - ) - - defer logEvery.Stop() - - for _, d := range []*Domain{a.accounts, a.storage, a.code, a.commitment.Domain} { - wg.Add(1) - - mxRunningCollations.Inc() - start := time.Now() - collation, err := d.collateStream(ctx, step, txFrom, txTo, d.tx) - mxRunningCollations.Dec() - mxCollateTook.UpdateDuration(start) - - //mxCollationSize.Set(uint64(collation.valuesComp.Count())) - mxCollationSizeHist.Set(uint64(collation.historyComp.Count())) - - if err != nil { - collation.Close() - return fmt.Errorf("domain collation %q has failed: %w", d.filenameBase, err) - } - - go func(wg *sync.WaitGroup, d *Domain, collation Collation) { - defer wg.Done() - mxRunningMerges.Inc() - - start := time.Now() - sf, err := d.buildFiles(ctx, step, collation, a.ps) - collation.Close() - - if err != nil { - errCh <- err - - sf.Close() - mxRunningMerges.Dec() - return - } - - mxRunningMerges.Dec() - - d.integrateFiles(sf, step*a.aggregationStep, (step+1)*a.aggregationStep) - d.stats.LastFileBuildingTook = time.Since(start) - }(&wg, d, collation) - - mxPruningProgress.Add(2) // domain and history - if err := d.prune(ctx, step, txFrom, txTo, math.MaxUint64, logEvery); err != nil { - return err - } - mxPruningProgress.Dec() - mxPruningProgress.Dec() - - mxPruneTook.Update(d.stats.LastPruneTook.Seconds()) - 
mxPruneHistTook.Update(d.stats.LastPruneHistTook.Seconds()) - } - - // when domain files are build and db is pruned, we can merge them - wg.Add(1) - go func(wg *sync.WaitGroup) { - defer wg.Done() - - if err := a.mergeDomainSteps(ctx); err != nil { - errCh <- err - } - }(&wg) - - // indices are built concurrently - for _, d := range []*InvertedIndex{a.logTopics, a.logAddrs, a.tracesFrom, a.tracesTo} { - wg.Add(1) - - mxRunningCollations.Inc() - start := time.Now() - collation, err := d.collate(ctx, step*a.aggregationStep, (step+1)*a.aggregationStep, d.tx) - mxRunningCollations.Dec() - mxCollateTook.UpdateDuration(start) - - if err != nil { - return fmt.Errorf("index collation %q has failed: %w", d.filenameBase, err) - } - - go func(wg *sync.WaitGroup, d *InvertedIndex, tx kv.Tx) { - defer wg.Done() - - mxRunningMerges.Inc() - start := time.Now() - - sf, err := d.buildFiles(ctx, step, collation, a.ps) - if err != nil { - errCh <- err - sf.Close() - return - } - - mxRunningMerges.Dec() - mxBuildTook.UpdateDuration(start) - - d.integrateFiles(sf, step*a.aggregationStep, (step+1)*a.aggregationStep) - - icx := d.MakeContext() - mxRunningMerges.Inc() - - if err := d.mergeRangesUpTo(ctx, d.endTxNumMinimax(), maxSpan, workers, icx, a.ps); err != nil { - errCh <- err - - mxRunningMerges.Dec() - icx.Close() - return - } - - mxRunningMerges.Dec() - icx.Close() - }(&wg, d, d.tx) - - mxPruningProgress.Inc() - startPrune := time.Now() - if err := d.prune(ctx, txFrom, txTo, math.MaxUint64, logEvery); err != nil { - return err - } - mxPruneTook.UpdateDuration(startPrune) - mxPruningProgress.Dec() - } - - go func() { - wg.Wait() - close(errCh) - }() - - for err := range errCh { - a.logger.Warn("domain collate-buildFiles failed", "err", err) - return fmt.Errorf("domain collate-build failed: %w", err) - } - - a.logger.Info("[stat] aggregation is finished", - "range", fmt.Sprintf("%.2fM-%.2fM", float64(txFrom)/10e5, float64(txTo)/10e5), - "took", time.Since(stepStartedAt)) - - 
mxStepTook.UpdateDuration(stepStartedAt) - - return nil -} - -func (a *Aggregator) mergeLoopStep(ctx context.Context, maxEndTxNum uint64, workers int) (somethingDone bool, err error) { - closeAll := true - mergeStartedAt := time.Now() - - maxSpan := a.aggregationStep * StepsInBiggestFile - r := a.findMergeRange(maxEndTxNum, maxSpan) - if !r.any() { - return false, nil - } - - outs := a.staticFilesInRange(r, a.defaultCtx) - defer func() { - if closeAll { - outs.Close() - } - }() - - in, err := a.mergeFiles(ctx, outs, r, workers) - if err != nil { - return true, err - } - defer func() { - if closeAll { - in.Close() - } - }() - a.integrateMergedFiles(outs, in) - a.cleanAfterNewFreeze(in) - closeAll = false - - for _, s := range []DomainStats{a.accounts.stats, a.code.stats, a.storage.stats} { - mxBuildTook.Update(s.LastFileBuildingTook.Seconds()) - } - - a.logger.Info("[stat] finished merge step", - "upto_tx", maxEndTxNum, "merge_step_took", time.Since(mergeStartedAt)) - - return true, nil -} - -type Ranges struct { - accounts DomainRanges - storage DomainRanges - code DomainRanges - commitment DomainRanges -} - -func (r Ranges) String() string { - return fmt.Sprintf("accounts=%s, storage=%s, code=%s, commitment=%s", r.accounts.String(), r.storage.String(), r.code.String(), r.commitment.String()) -} - -func (r Ranges) any() bool { - return r.accounts.any() || r.storage.any() || r.code.any() || r.commitment.any() -} - -func (a *Aggregator) findMergeRange(maxEndTxNum, maxSpan uint64) Ranges { - var r Ranges - r.accounts = a.accounts.findMergeRange(maxEndTxNum, maxSpan) - r.storage = a.storage.findMergeRange(maxEndTxNum, maxSpan) - r.code = a.code.findMergeRange(maxEndTxNum, maxSpan) - r.commitment = a.commitment.findMergeRange(maxEndTxNum, maxSpan) - //if r.any() { - //log.Info(fmt.Sprintf("findMergeRange(%d, %d)=%+v\n", maxEndTxNum, maxSpan, r)) - //} - return r -} - -type SelectedStaticFiles struct { - accounts []*filesItem - accountsIdx []*filesItem - accountsHist 
[]*filesItem - storage []*filesItem - storageIdx []*filesItem - storageHist []*filesItem - code []*filesItem - codeIdx []*filesItem - codeHist []*filesItem - commitment []*filesItem - commitmentIdx []*filesItem - commitmentHist []*filesItem - codeI int - storageI int - accountsI int - commitmentI int -} - -func (sf SelectedStaticFiles) Close() { - for _, group := range [][]*filesItem{ - sf.accounts, sf.accountsIdx, sf.accountsHist, - sf.storage, sf.storageIdx, sf.storageHist, - sf.code, sf.codeIdx, sf.codeHist, - sf.commitment, sf.commitmentIdx, sf.commitmentHist, - } { - for _, item := range group { - if item != nil { - if item.decompressor != nil { - item.decompressor.Close() - } - if item.index != nil { - item.index.Close() - } - if item.bindex != nil { - item.bindex.Close() - } - } - } - } -} - -func (a *Aggregator) staticFilesInRange(r Ranges, ac *AggregatorContext) SelectedStaticFiles { - var sf SelectedStaticFiles - if r.accounts.any() { - sf.accounts, sf.accountsIdx, sf.accountsHist, sf.accountsI = ac.accounts.staticFilesInRange(r.accounts) - } - if r.storage.any() { - sf.storage, sf.storageIdx, sf.storageHist, sf.storageI = ac.storage.staticFilesInRange(r.storage) - } - if r.code.any() { - sf.code, sf.codeIdx, sf.codeHist, sf.codeI = ac.code.staticFilesInRange(r.code) - } - if r.commitment.any() { - sf.commitment, sf.commitmentIdx, sf.commitmentHist, sf.commitmentI = ac.commitment.staticFilesInRange(r.commitment) - } - return sf -} - -type MergedFiles struct { - accounts *filesItem - accountsIdx, accountsHist *filesItem - storage *filesItem - storageIdx, storageHist *filesItem - code *filesItem - codeIdx, codeHist *filesItem - commitment *filesItem - commitmentIdx, commitmentHist *filesItem -} - -func (mf MergedFiles) Close() { - for _, item := range []*filesItem{ - mf.accounts, mf.accountsIdx, mf.accountsHist, - mf.storage, mf.storageIdx, mf.storageHist, - mf.code, mf.codeIdx, mf.codeHist, - mf.commitment, mf.commitmentIdx, mf.commitmentHist, - 
//mf.logAddrs, mf.logTopics, mf.tracesFrom, mf.tracesTo, - } { - if item != nil { - if item.decompressor != nil { - item.decompressor.Close() - } - if item.decompressor != nil { - item.index.Close() - } - if item.bindex != nil { - item.bindex.Close() - } - } - } -} - -func (a *Aggregator) mergeFiles(ctx context.Context, files SelectedStaticFiles, r Ranges, workers int) (MergedFiles, error) { - started := time.Now() - defer func(t time.Time) { - a.logger.Info("[snapshots] domain files has been merged", - "range", fmt.Sprintf("%d-%d", r.accounts.valuesStartTxNum/a.aggregationStep, r.accounts.valuesEndTxNum/a.aggregationStep), - "took", time.Since(t)) - }(started) - - var mf MergedFiles - closeFiles := true - defer func() { - if closeFiles { - mf.Close() - } - }() - - var ( - errCh = make(chan error, 4) - wg sync.WaitGroup - predicates sync.WaitGroup - ) - - wg.Add(4) - predicates.Add(2) - - go func() { - mxRunningMerges.Inc() - defer mxRunningMerges.Dec() - defer wg.Done() - - var err error - if r.code.any() { - if mf.code, mf.codeIdx, mf.codeHist, err = a.code.mergeFiles(ctx, files.code, files.codeIdx, files.codeHist, r.code, workers, a.ps); err != nil { - errCh <- err - } - } - }() - - go func(predicates *sync.WaitGroup) { - mxRunningMerges.Inc() - defer mxRunningMerges.Dec() - - defer wg.Done() - defer predicates.Done() - var err error - if r.accounts.any() { - if mf.accounts, mf.accountsIdx, mf.accountsHist, err = a.accounts.mergeFiles(ctx, files.accounts, files.accountsIdx, files.accountsHist, r.accounts, workers, a.ps); err != nil { - errCh <- err - } - } - }(&predicates) - go func(predicates *sync.WaitGroup) { - mxRunningMerges.Inc() - defer mxRunningMerges.Dec() - - defer wg.Done() - defer predicates.Done() - var err error - if r.storage.any() { - if mf.storage, mf.storageIdx, mf.storageHist, err = a.storage.mergeFiles(ctx, files.storage, files.storageIdx, files.storageHist, r.storage, workers, a.ps); err != nil { - errCh <- err - } - } - }(&predicates) - - 
go func(predicates *sync.WaitGroup) { - defer wg.Done() - predicates.Wait() - - mxRunningMerges.Inc() - defer mxRunningMerges.Dec() - - var err error - // requires storage|accounts to be merged at this point - if r.commitment.any() { - if mf.commitment, mf.commitmentIdx, mf.commitmentHist, err = a.commitment.mergeFiles(ctx, files, mf, r.commitment, workers, a.ps); err != nil { - errCh <- err - } - } - }(&predicates) - - go func() { - wg.Wait() - close(errCh) - }() - - var lastError error - for err := range errCh { - lastError = err - } - if lastError == nil { - closeFiles = false - } - return mf, lastError -} - -func (a *Aggregator) integrateMergedFiles(outs SelectedStaticFiles, in MergedFiles) { - a.accounts.integrateMergedFiles(outs.accounts, outs.accountsIdx, outs.accountsHist, in.accounts, in.accountsIdx, in.accountsHist) - a.storage.integrateMergedFiles(outs.storage, outs.storageIdx, outs.storageHist, in.storage, in.storageIdx, in.storageHist) - a.code.integrateMergedFiles(outs.code, outs.codeIdx, outs.codeHist, in.code, in.codeIdx, in.codeHist) - a.commitment.integrateMergedFiles(outs.commitment, outs.commitmentIdx, outs.commitmentHist, in.commitment, in.commitmentIdx, in.commitmentHist) -} - -func (a *Aggregator) cleanAfterNewFreeze(in MergedFiles) { - a.accounts.cleanAfterFreeze(in.accountsHist.endTxNum) - a.storage.cleanAfterFreeze(in.storageHist.endTxNum) - a.code.cleanAfterFreeze(in.codeHist.endTxNum) - a.commitment.cleanAfterFreeze(in.commitment.endTxNum) -} - -// ComputeCommitment evaluates commitment for processed state. -// If `saveStateAfter`=true, then trie state will be saved to DB after commitment evaluation. -func (a *Aggregator) ComputeCommitment(saveStateAfter, trace bool) (rootHash []byte, err error) { - // if commitment mode is Disabled, there will be nothing to compute on. 
- mxCommitmentRunning.Inc() - rootHash, branchNodeUpdates, err := a.commitment.ComputeCommitment(trace) - mxCommitmentRunning.Dec() - - if err != nil { - return nil, err - } - if a.seekTxNum > a.txNum { - saveStateAfter = false - } - - mxCommitmentKeys.Add(int(a.commitment.comKeys)) - mxCommitmentTook.Update(a.commitment.comTook.Seconds()) - - defer func(t time.Time) { mxCommitmentWriteTook.UpdateDuration(t) }(time.Now()) - - for pref, update := range branchNodeUpdates { - prefix := []byte(pref) - - stateValue, err := a.defaultCtx.ReadCommitment(prefix, a.rwTx) - if err != nil { - return nil, err - } - mxCommitmentUpdates.Inc() - stated := commitment.BranchData(stateValue) - merged, err := a.commitment.branchMerger.Merge(stated, update) - if err != nil { - return nil, err - } - if bytes.Equal(stated, merged) { - continue - } - if trace { - fmt.Printf("computeCommitment merge [%x] [%x]+[%x]=>[%x]\n", prefix, stated, update, merged) - } - if err = a.UpdateCommitmentData(prefix, merged); err != nil { - return nil, err - } - mxCommitmentUpdatesApplied.Inc() - } - - if saveStateAfter { - if err := a.commitment.storeCommitmentState(a.blockNum, a.txNum); err != nil { - return nil, err - } - } - - return rootHash, nil -} - -// Provides channel which receives commitment hash each time aggregation is occured -func (a *Aggregator) AggregatedRoots() chan [length.Hash]byte { - return a.stepDoneNotice -} - -func (a *Aggregator) notifyAggregated(rootHash []byte) { - rh := (*[length.Hash]byte)(rootHash) - select { - case a.stepDoneNotice <- *rh: - default: - } -} - -func (a *Aggregator) ReadyToFinishTx() bool { - return (a.txNum+1)%a.aggregationStep == 0 && a.seekTxNum < a.txNum -} - -func (a *Aggregator) FinishTx() (err error) { - atomic.AddUint64(&a.stats.TxCount, 1) - - if !a.ReadyToFinishTx() { - return nil - } - - mxRunningMerges.Inc() - defer mxRunningMerges.Dec() - - a.commitment.patriciaTrie.ResetFns(a.defaultCtx.branchFn, a.defaultCtx.accountFn, a.defaultCtx.storageFn) - 
rootHash, err := a.ComputeCommitment(true, false) - if err != nil { - return err - } - step := a.txNum / a.aggregationStep - mxStepCurrent.Set(step) - - if step == 0 { - a.notifyAggregated(rootHash) - return nil - } - step-- // Leave one step worth in the DB - - ctx := context.Background() - if err := a.Flush(ctx); err != nil { - return err - } - - if err := a.aggregate(ctx, step); err != nil { - return err - } - - a.notifyAggregated(rootHash) - return nil -} - -func (a *Aggregator) UpdateAccountData(addr []byte, account []byte) error { - a.commitment.TouchPlainKey(addr, account, a.commitment.TouchPlainKeyAccount) - return a.accounts.Put(addr, nil, account) -} - -func (a *Aggregator) UpdateAccountCode(addr []byte, code []byte) error { - a.commitment.TouchPlainKey(addr, code, a.commitment.TouchPlainKeyCode) - if len(code) == 0 { - return a.code.Delete(addr, nil) - } - return a.code.Put(addr, nil, code) -} - -func (a *Aggregator) UpdateCommitmentData(prefix []byte, code []byte) error { - return a.commitment.Put(prefix, nil, code) -} - -func (a *Aggregator) DeleteAccount(addr []byte) error { - a.commitment.TouchPlainKey(addr, nil, a.commitment.TouchPlainKeyAccount) - - if err := a.accounts.Delete(addr, nil); err != nil { - return err - } - if err := a.code.Delete(addr, nil); err != nil { - return err - } - var e error - if err := a.storage.defaultDc.IteratePrefix(addr, func(k, _ []byte) { - a.commitment.TouchPlainKey(k, nil, a.commitment.TouchPlainKeyStorage) - if e == nil { - e = a.storage.Delete(k, nil) - } - }); err != nil { - return err - } - return e -} - -func (a *Aggregator) WriteAccountStorage(addr, loc []byte, value []byte) error { - composite := make([]byte, len(addr)+len(loc)) - copy(composite, addr) - copy(composite[length.Addr:], loc) - - a.commitment.TouchPlainKey(composite, value, a.commitment.TouchPlainKeyStorage) - if len(value) == 0 { - return a.storage.Delete(addr, loc) - } - return a.storage.Put(addr, loc, value) -} - -func (a *Aggregator) 
AddTraceFrom(addr []byte) error { - return a.tracesFrom.Add(addr) -} - -func (a *Aggregator) AddTraceTo(addr []byte) error { - return a.tracesTo.Add(addr) -} - -func (a *Aggregator) AddLogAddr(addr []byte) error { - return a.logAddrs.Add(addr) -} - -func (a *Aggregator) AddLogTopic(topic []byte) error { - return a.logTopics.Add(topic) -} - -// StartWrites - pattern: `defer agg.StartWrites().FinishWrites()` -func (a *Aggregator) StartWrites() *Aggregator { - a.accounts.StartWrites() - a.storage.StartWrites() - a.code.StartWrites() - a.commitment.StartWrites() - a.logAddrs.StartWrites() - a.logTopics.StartWrites() - a.tracesFrom.StartWrites() - a.tracesTo.StartWrites() - - if a.defaultCtx != nil { - a.defaultCtx.Close() - } - a.defaultCtx = &AggregatorContext{ - a: a, - accounts: a.accounts.defaultDc, - storage: a.storage.defaultDc, - code: a.code.defaultDc, - commitment: a.commitment.defaultDc, - logAddrs: a.logAddrs.MakeContext(), - logTopics: a.logTopics.MakeContext(), - tracesFrom: a.tracesFrom.MakeContext(), - tracesTo: a.tracesTo.MakeContext(), - } - a.commitment.patriciaTrie.ResetFns(a.defaultCtx.branchFn, a.defaultCtx.accountFn, a.defaultCtx.storageFn) - return a -} - -func (a *Aggregator) FinishWrites() { - a.accounts.FinishWrites() - a.storage.FinishWrites() - a.code.FinishWrites() - a.commitment.FinishWrites() - a.logAddrs.FinishWrites() - a.logTopics.FinishWrites() - a.tracesFrom.FinishWrites() - a.tracesTo.FinishWrites() -} - -// Flush - must be called before Collate, if you did some writes -func (a *Aggregator) Flush(ctx context.Context) error { - flushers := []flusher{ - a.accounts.Rotate(), - a.storage.Rotate(), - a.code.Rotate(), - a.commitment.Domain.Rotate(), - a.logAddrs.Rotate(), - a.logTopics.Rotate(), - a.tracesFrom.Rotate(), - a.tracesTo.Rotate(), - } - defer func(t time.Time) { a.logger.Debug("[snapshots] history flush", "took", time.Since(t)) }(time.Now()) - for _, f := range flushers { - if err := f.Flush(ctx, a.rwTx); err != nil { - return 
err - } - } - return nil -} - -type FilesStats struct { - HistoryReads uint64 - TotalReads uint64 - IdxAccess time.Duration - TxCount uint64 - FilesCount uint64 - IdxSize uint64 - DataSize uint64 -} - -func (a *Aggregator) Stats() FilesStats { - res := a.stats - stat := a.GetAndResetStats() - res.IdxSize = stat.IndexSize - res.DataSize = stat.DataSize - res.FilesCount = stat.FilesCount - res.HistoryReads = stat.HistoryQueries.Load() - res.TotalReads = stat.TotalQueries.Load() - res.IdxAccess = stat.EfSearchTime - return res -} - -type AggregatorContext struct { - a *Aggregator - accounts *DomainContext - storage *DomainContext - code *DomainContext - commitment *DomainContext - logAddrs *InvertedIndexContext - logTopics *InvertedIndexContext - tracesFrom *InvertedIndexContext - tracesTo *InvertedIndexContext - keyBuf []byte -} - -func (a *Aggregator) MakeContext() *AggregatorContext { - return &AggregatorContext{ - a: a, - accounts: a.accounts.MakeContext(), - storage: a.storage.MakeContext(), - code: a.code.MakeContext(), - commitment: a.commitment.MakeContext(), - logAddrs: a.logAddrs.MakeContext(), - logTopics: a.logTopics.MakeContext(), - tracesFrom: a.tracesFrom.MakeContext(), - tracesTo: a.tracesTo.MakeContext(), - } -} - -func (ac *AggregatorContext) ReadAccountData(addr []byte, roTx kv.Tx) ([]byte, error) { - return ac.accounts.Get(addr, nil, roTx) -} - -func (ac *AggregatorContext) ReadAccountDataBeforeTxNum(addr []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { - v, err := ac.accounts.GetBeforeTxNum(addr, txNum, roTx) - return v, err -} - -func (ac *AggregatorContext) ReadAccountStorage(addr []byte, loc []byte, roTx kv.Tx) ([]byte, error) { - return ac.storage.Get(addr, loc, roTx) -} - -func (ac *AggregatorContext) ReadAccountStorageBeforeTxNum(addr []byte, loc []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { - if cap(ac.keyBuf) < len(addr)+len(loc) { - ac.keyBuf = make([]byte, len(addr)+len(loc)) - } else if len(ac.keyBuf) != len(addr)+len(loc) { - 
ac.keyBuf = ac.keyBuf[:len(addr)+len(loc)] - } - copy(ac.keyBuf, addr) - copy(ac.keyBuf[len(addr):], loc) - v, err := ac.storage.GetBeforeTxNum(ac.keyBuf, txNum, roTx) - return v, err -} - -func (ac *AggregatorContext) ReadAccountCode(addr []byte, roTx kv.Tx) ([]byte, error) { - return ac.code.Get(addr, nil, roTx) -} - -func (ac *AggregatorContext) ReadCommitment(addr []byte, roTx kv.Tx) ([]byte, error) { - return ac.commitment.Get(addr, nil, roTx) -} - -func (ac *AggregatorContext) ReadCommitmentBeforeTxNum(addr []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { - v, err := ac.commitment.GetBeforeTxNum(addr, txNum, roTx) - return v, err -} - -func (ac *AggregatorContext) ReadAccountCodeBeforeTxNum(addr []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { - v, err := ac.code.GetBeforeTxNum(addr, txNum, roTx) - return v, err -} - -func (ac *AggregatorContext) ReadAccountCodeSize(addr []byte, roTx kv.Tx) (int, error) { - code, err := ac.code.Get(addr, nil, roTx) - if err != nil { - return 0, err - } - return len(code), nil -} - -func (ac *AggregatorContext) ReadAccountCodeSizeBeforeTxNum(addr []byte, txNum uint64, roTx kv.Tx) (int, error) { - code, err := ac.code.GetBeforeTxNum(addr, txNum, roTx) - if err != nil { - return 0, err - } - return len(code), nil -} - -func (ac *AggregatorContext) branchFn(prefix []byte) ([]byte, error) { - // Look in the summary table first - stateValue, err := ac.ReadCommitment(prefix, ac.a.rwTx) - if err != nil { - return nil, fmt.Errorf("failed read branch %x: %w", commitment.CompactedKeyToHex(prefix), err) - } - if stateValue == nil { - return nil, nil - } - // fmt.Printf("Returning branch data prefix [%x], mergeVal=[%x]\n", commitment.CompactedKeyToHex(prefix), stateValue) - return stateValue[2:], nil // Skip touchMap but keep afterMap -} - -func (ac *AggregatorContext) accountFn(plainKey []byte, cell *commitment.Cell) error { - encAccount, err := ac.ReadAccountData(plainKey, ac.a.rwTx) - if err != nil { - return err - } - cell.Nonce 
= 0 - cell.Balance.Clear() - copy(cell.CodeHash[:], commitment.EmptyCodeHash) - if len(encAccount) > 0 { - nonce, balance, chash := DecodeAccountBytes(encAccount) - cell.Nonce = nonce - cell.Balance.Set(balance) - if chash != nil { - copy(cell.CodeHash[:], chash) - } - } - - code, err := ac.ReadAccountCode(plainKey, ac.a.rwTx) - if err != nil { - return err - } - if code != nil { - ac.a.commitment.keccak.Reset() - ac.a.commitment.keccak.Write(code) - copy(cell.CodeHash[:], ac.a.commitment.keccak.Sum(nil)) - } - cell.Delete = len(encAccount) == 0 && len(code) == 0 - return nil -} - -func (ac *AggregatorContext) storageFn(plainKey []byte, cell *commitment.Cell) error { - // Look in the summary table first - enc, err := ac.ReadAccountStorage(plainKey[:length.Addr], plainKey[length.Addr:], ac.a.rwTx) - if err != nil { - return err - } - cell.StorageLen = len(enc) - copy(cell.Storage[:], enc) - cell.Delete = cell.StorageLen == 0 - return nil -} - -func (ac *AggregatorContext) LogAddrIterator(addr []byte, startTxNum, endTxNum int, roTx kv.Tx) (iter.U64, error) { - return ac.logAddrs.IdxRange(addr, startTxNum, endTxNum, order.Asc, -1, roTx) -} - -func (ac *AggregatorContext) LogTopicIterator(topic []byte, startTxNum, endTxNum int, roTx kv.Tx) (iter.U64, error) { - return ac.logTopics.IdxRange(topic, startTxNum, endTxNum, order.Asc, -1, roTx) -} - -func (ac *AggregatorContext) TraceFromIterator(addr []byte, startTxNum, endTxNum int, roTx kv.Tx) (iter.U64, error) { - return ac.tracesFrom.IdxRange(addr, startTxNum, endTxNum, order.Asc, -1, roTx) -} - -func (ac *AggregatorContext) TraceToIterator(addr []byte, startTxNum, endTxNum int, roTx kv.Tx) (iter.U64, error) { - return ac.tracesTo.IdxRange(addr, startTxNum, endTxNum, order.Asc, -1, roTx) -} - -func (ac *AggregatorContext) Close() { - ac.accounts.Close() - ac.storage.Close() - ac.code.Close() - ac.commitment.Close() - ac.logAddrs.Close() - ac.logTopics.Close() - ac.tracesFrom.Close() - ac.tracesTo.Close() -} - -func 
DecodeAccountBytes(enc []byte) (nonce uint64, balance *uint256.Int, hash []byte) { - balance = new(uint256.Int) - - if len(enc) > 0 { - pos := 0 - nonceBytes := int(enc[pos]) - pos++ - if nonceBytes > 0 { - nonce = bytesToUint64(enc[pos : pos+nonceBytes]) - pos += nonceBytes - } - balanceBytes := int(enc[pos]) - pos++ - if balanceBytes > 0 { - balance.SetBytes(enc[pos : pos+balanceBytes]) - pos += balanceBytes - } - codeHashBytes := int(enc[pos]) - pos++ - if codeHashBytes > 0 { - codeHash := make([]byte, length.Hash) - copy(codeHash, enc[pos:pos+codeHashBytes]) - } - } - return -} - -func EncodeAccountBytes(nonce uint64, balance *uint256.Int, hash []byte, incarnation uint64) []byte { - l := int(1) - if nonce > 0 { - l += common.BitLenToByteLen(bits.Len64(nonce)) - } - l++ - if !balance.IsZero() { - l += balance.ByteLen() - } - l++ - if len(hash) == length.Hash { - l += 32 - } - l++ - if incarnation > 0 { - l += common.BitLenToByteLen(bits.Len64(incarnation)) - } - value := make([]byte, l) - pos := 0 - - if nonce == 0 { - value[pos] = 0 - pos++ - } else { - nonceBytes := common.BitLenToByteLen(bits.Len64(nonce)) - value[pos] = byte(nonceBytes) - var nonce = nonce - for i := nonceBytes; i > 0; i-- { - value[pos+i] = byte(nonce) - nonce >>= 8 - } - pos += nonceBytes + 1 - } - if balance.IsZero() { - value[pos] = 0 - pos++ - } else { - balanceBytes := balance.ByteLen() - value[pos] = byte(balanceBytes) - pos++ - balance.WriteToSlice(value[pos : pos+balanceBytes]) - pos += balanceBytes - } - if len(hash) == 0 { - value[pos] = 0 - pos++ - } else { - value[pos] = 32 - pos++ - copy(value[pos:pos+32], hash) - pos += 32 - } - if incarnation == 0 { - value[pos] = 0 - } else { - incBytes := common.BitLenToByteLen(bits.Len64(incarnation)) - value[pos] = byte(incBytes) - var inc = incarnation - for i := incBytes; i > 0; i-- { - value[pos+i] = byte(inc) - inc >>= 8 - } - } - return value -} - -func bytesToUint64(buf []byte) (x uint64) { - for i, b := range buf { - x = x<<8 + 
uint64(b) - if i == 7 { - return - } - } - return -} diff --git a/state/aggregator_bench_test.go b/state/aggregator_bench_test.go index 16b748fd9..5daecfb5f 100644 --- a/state/aggregator_bench_test.go +++ b/state/aggregator_bench_test.go @@ -14,7 +14,6 @@ import ( "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/require" - "github.com/ledgerwatch/erigon-lib/commitment" "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/compress" @@ -23,7 +22,7 @@ import ( "github.com/ledgerwatch/erigon-lib/recsplit" ) -func testDbAndAggregatorBench(b *testing.B, aggStep uint64) (string, kv.RwDB, *Aggregator) { +func testDbAndAggregatorBench(b *testing.B, aggStep uint64) (string, kv.RwDB, *AggregatorV3) { b.Helper() logger := log.New() path := b.TempDir() @@ -32,7 +31,7 @@ func testDbAndAggregatorBench(b *testing.B, aggStep uint64) (string, kv.RwDB, *A return kv.ChaindataTablesCfg }).MustOpen() b.Cleanup(db.Close) - agg, err := NewAggregator(path, path, aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie, logger) + agg, err := NewAggregatorV3(context.Background(), path, path+"_tmp", aggStep, db, logger) require.NoError(b, err) b.Cleanup(agg.Close) return path, db, agg @@ -56,22 +55,33 @@ func BenchmarkAggregator_Processing(b *testing.B) { } }() - agg.SetTx(tx) - defer agg.StartWrites().FinishWrites() require.NoError(b, err) + ac := agg.MakeContext() + defer ac.Close() + + domains := agg.SharedDomains(ac) + defer agg.CloseSharedDomains() + defer agg.StartWrites().FinishWrites() + + domains.SetTx(tx) b.ReportAllocs() b.ResetTimer() + var prev []byte for i := 0; i < b.N; i++ { key := <-longKeys val := <-vals txNum := uint64(i) - agg.SetTxNum(txNum) - err := agg.WriteAccountStorage(key[:length.Addr], key[length.Addr:], val) - require.NoError(b, err) - err = agg.FinishTx() + domains.SetTxNum(txNum) + err := domains.WriteAccountStorage(key[:length.Addr], key[length.Addr:], val, prev) + 
prev = val require.NoError(b, err) + + if i%100000 == 0 { + _, err := domains.Commit(true, false) + require.NoError(b, err) + } } } @@ -98,7 +108,7 @@ func Benchmark_BtreeIndex_Allocation(b *testing.B) { for i := 0; i < b.N; i++ { now := time.Now() count := rnd.Intn(1000000000) - bt := newBtAlloc(uint64(count), uint64(1<<12), true) + bt := newBtAlloc(uint64(count), uint64(1<<12), true, nil, nil) bt.traverseDfs() fmt.Printf("alloc %v\n", time.Since(now)) } @@ -112,24 +122,23 @@ func Benchmark_BtreeIndex_Search(b *testing.B) { dataPath := "../../data/storage.256-288.kv" indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") - err := BuildBtreeIndex(dataPath, indexPath, logger) + comp := CompressKeys | CompressVals + err := BuildBtreeIndex(dataPath, indexPath, comp, 1, logger) require.NoError(b, err) M := 1024 - bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M)) + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M), comp, false) require.NoError(b, err) - idx := NewBtIndexReader(bt) - keys, err := pivotKeysFromKV(dataPath) require.NoError(b, err) for i := 0; i < b.N; i++ { p := rnd.Intn(len(keys)) - cur, err := idx.Seek(keys[p]) + cur, err := bt.SeekDeprecated(keys[p]) require.NoErrorf(b, err, "i=%d", i) - require.EqualValues(b, keys[p], cur.key) + require.EqualValues(b, keys[p], cur.Key()) require.NotEmptyf(b, cur.Value(), "i=%d", i) } @@ -143,9 +152,9 @@ func benchInitBtreeIndex(b *testing.B, M uint64) (*BtIndex, [][]byte, string) { tmp := b.TempDir() b.Cleanup(func() { os.RemoveAll(tmp) }) - dataPath := generateCompressedKV(b, tmp, 52, 10, 1000000, logger) + dataPath := generateKV(b, tmp, 52, 10, 1000000, logger, 0) indexPath := path.Join(tmp, filepath.Base(dataPath)+".bt") - bt, err := CreateBtreeIndex(indexPath, dataPath, M, logger) + bt, err := CreateBtreeIndex(indexPath, dataPath, M, CompressNone, 1, logger) require.NoError(b, err) keys, err := pivotKeysFromKV(dataPath) @@ -164,7 +173,7 @@ func Benchmark_BTree_Seek(b *testing.B) { for i := 0; i 
< b.N; i++ { p := rnd.Intn(len(keys)) - cur, err := bt.Seek(keys[p]) + cur, err := bt.SeekDeprecated(keys[p]) require.NoError(b, err) require.EqualValues(b, keys[p], cur.key) @@ -175,7 +184,7 @@ func Benchmark_BTree_Seek(b *testing.B) { for i := 0; i < b.N; i++ { p := rnd.Intn(len(keys)) - cur, err := bt.Seek(keys[p]) + cur, err := bt.SeekDeprecated(keys[p]) require.NoError(b, err) require.EqualValues(b, keys[p], cur.key) diff --git a/state/aggregator_fuzz_test.go b/state/aggregator_fuzz_test.go index 0b471a923..2ad12e870 100644 --- a/state/aggregator_fuzz_test.go +++ b/state/aggregator_fuzz_test.go @@ -14,7 +14,7 @@ func Fuzz_BtreeIndex_Allocation(f *testing.F) { if keyCount < M*4 || M < 4 { t.Skip() } - bt := newBtAlloc(keyCount, M, false) + bt := newBtAlloc(keyCount, M, false, nil, nil) bt.traverseDfs() require.GreaterOrEqual(t, bt.N, keyCount) diff --git a/state/aggregator_test.go b/state/aggregator_test.go index 9fc43fb3a..70a97cb4c 100644 --- a/state/aggregator_test.go +++ b/state/aggregator_test.go @@ -3,6 +3,7 @@ package state import ( "context" "encoding/binary" + "encoding/hex" "fmt" "math/rand" "os" @@ -12,12 +13,13 @@ import ( "testing" "time" + "github.com/c2h5oh/datasize" "github.com/holiman/uint256" - "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/require" - "github.com/ledgerwatch/erigon-lib/commitment" + "github.com/ledgerwatch/erigon-lib/etl" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/compress" @@ -25,83 +27,37 @@ import ( "github.com/ledgerwatch/erigon-lib/kv/mdbx" ) -func testDbAndAggregator(t *testing.T, aggStep uint64) (string, kv.RwDB, *Aggregator) { - t.Helper() - path := t.TempDir() - logger := log.New() - db := mdbx.NewMDBX(logger).InMem(filepath.Join(path, "db4")).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { - return kv.ChaindataTablesCfg - }).MustOpen() - 
t.Cleanup(db.Close) - agg, err := NewAggregator(filepath.Join(path, "e4"), filepath.Join(path, "e4tmp"), aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie, logger) - require.NoError(t, err) - return path, db, agg -} - -func TestAggregator_WinAccess(t *testing.T) { - _, db, agg := testDbAndAggregator(t, 100) - defer agg.Close() +func TestAggregatorV3_Merge(t *testing.T) { + db, agg := testDbAndAggregatorv3(t, 1000) - tx, err := db.BeginRwNosync(context.Background()) + rwTx, err := db.BeginRwNosync(context.Background()) require.NoError(t, err) defer func() { - if tx != nil { - tx.Rollback() + if rwTx != nil { + rwTx.Rollback() } }() - agg.SetTx(tx) - agg.StartWrites() + domCtx := agg.MakeContext() + defer domCtx.Close() + domains := agg.SharedDomains(domCtx) + defer domains.Close() - rnd := rand.New(rand.NewSource(time.Now().UnixNano())) - for txNum := uint64(1); txNum <= 100; txNum++ { - agg.SetTxNum(txNum) - - addr := make([]byte, length.Addr) - n, err := rnd.Read(addr) - require.NoError(t, err) - require.EqualValues(t, length.Addr, n) + domains.SetTx(rwTx) - buf := EncodeAccountBytes(1, uint256.NewInt(uint64(rand.Intn(10e9))), nil, 0) - err = agg.UpdateAccountData(addr, buf) - require.NoError(t, err) - - var v [8]byte - binary.BigEndian.PutUint64(v[:], txNum) - require.NoError(t, err) - require.NoError(t, agg.FinishTx()) - } - agg.FinishWrites() - - require.NoError(t, err) - err = tx.Commit() - require.NoError(t, err) - tx = nil -} - -func TestAggregator_Merge(t *testing.T) { - _, db, agg := testDbAndAggregator(t, 1000) - defer agg.Close() - - tx, err := db.BeginRwNosync(context.Background()) - require.NoError(t, err) - defer func() { - if tx != nil { - tx.Rollback() - } - }() - agg.SetTx(tx) - - agg.StartWrites() - - txs := uint64(10000) + txs := uint64(100000) rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + var ( + commKey1 = []byte("someCommKey") + commKey2 = []byte("otherCommKey") + ) + // keys are encodings of numbers 1..31 // each key 
changes value on every txNum which is multiple of the key var maxWrite, otherMaxWrite uint64 for txNum := uint64(1); txNum <= txs; txNum++ { - agg.SetTxNum(txNum) + domains.SetTxNum(txNum) addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) @@ -112,32 +68,42 @@ func TestAggregator_Merge(t *testing.T) { n, err = rnd.Read(loc) require.NoError(t, err) require.EqualValues(t, length.Hash, n) - //keys[txNum-1] = append(addr, loc...) buf := EncodeAccountBytes(1, uint256.NewInt(0), nil, 0) - err = agg.UpdateAccountData(addr, buf) + err = domains.UpdateAccountData(addr, buf, nil) require.NoError(t, err) - err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + err = domains.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}, nil) require.NoError(t, err) var v [8]byte binary.BigEndian.PutUint64(v[:], txNum) if txNum%135 == 0 { - err = agg.UpdateCommitmentData([]byte("otherroothash"), v[:]) + pv, _, err := domCtx.GetLatest(kv.CommitmentDomain, commKey2, nil, rwTx) + require.NoError(t, err) + + err = domains.UpdateCommitmentData(commKey2, v[:], pv) + require.NoError(t, err) otherMaxWrite = txNum } else { - err = agg.UpdateCommitmentData([]byte("roothash"), v[:]) + pv, _, err := domCtx.GetLatest(kv.CommitmentDomain, commKey1, nil, rwTx) + require.NoError(t, err) + + err = domains.UpdateCommitmentData(commKey1, v[:], pv) + require.NoError(t, err) maxWrite = txNum } require.NoError(t, err) - require.NoError(t, agg.FinishTx()) + } + err = agg.Flush(context.Background(), rwTx) + require.NoError(t, err) agg.FinishWrites() + require.NoError(t, err) - err = tx.Commit() + err = rwTx.Commit() require.NoError(t, err) - tx = nil + rwTx = nil // Check the history roTx, err := db.BeginRo(context.Background()) @@ -146,27 +112,58 @@ func TestAggregator_Merge(t *testing.T) { dc := agg.MakeContext() - v, err := dc.ReadCommitment([]byte("roothash"), roTx) + v, ex, err := dc.GetLatest(kv.CommitmentDomain, commKey1, nil, roTx) require.NoError(t, err) + 
require.Truef(t, ex, "key %x not found", commKey1) require.EqualValues(t, maxWrite, binary.BigEndian.Uint64(v[:])) - v, err = dc.ReadCommitment([]byte("otherroothash"), roTx) + v, ex, err = dc.GetLatest(kv.CommitmentDomain, commKey2, nil, roTx) require.NoError(t, err) + require.Truef(t, ex, "key %x not found", commKey2) dc.Close() require.EqualValues(t, otherMaxWrite, binary.BigEndian.Uint64(v[:])) } +func TestAggregatorV3_RestartOnDatadir(t *testing.T) { + t.Run("BPlus", func(t *testing.T) { + rc := runCfg{ + aggStep: 50, + useBplus: true, + } + aggregatorV3_RestartOnDatadir(t, rc) + }) + t.Run("B", func(t *testing.T) { + rc := runCfg{ + aggStep: 50, + } + aggregatorV3_RestartOnDatadir(t, rc) + }) + +} + +type runCfg struct { + aggStep uint64 + useBplus bool + compressVals bool + largeVals bool +} + // here we create a bunch of updates for further aggregation. // FinishTx should merge underlying files several times // Expected that: // - we could close first aggregator and open another with previous data still available // - new aggregator SeekCommitment must return txNum equal to amount of total txns -func TestAggregator_RestartOnDatadir(t *testing.T) { +func aggregatorV3_RestartOnDatadir(t *testing.T, rc runCfg) { + t.Helper() logger := log.New() - aggStep := uint64(50) - path, db, agg := testDbAndAggregator(t, aggStep) + aggStep := rc.aggStep + db, agg := testDbAndAggregatorv3(t, aggStep) + if rc.useBplus { + UseBpsTree = true + defer func() { UseBpsTree = false }() + } tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -175,24 +172,30 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { tx.Rollback() } }() - agg.SetTx(tx) agg.StartWrites() + domCtx := agg.MakeContext() + defer domCtx.Close() - var latestCommitTxNum uint64 + domains := agg.SharedDomains(domCtx) + defer domains.Close() + domains.SetTx(tx) + + var latestCommitTxNum uint64 rnd := rand.New(rand.NewSource(time.Now().Unix())) + someKey := []byte("somekey") txs := (aggStep / 2) 
* 19 t.Logf("step=%d tx_count=%d", aggStep, txs) var aux [8]byte // keys are encodings of numbers 1..31 // each key changes value on every txNum which is multiple of the key var maxWrite uint64 + addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) for txNum := uint64(1); txNum <= txs; txNum++ { - agg.SetTxNum(txNum) + domains.SetTxNum(txNum) binary.BigEndian.PutUint64(aux[:], txNum) - addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) n, err := rnd.Read(addr) require.NoError(t, err) require.EqualValues(t, length.Addr, n) @@ -202,33 +205,49 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { require.EqualValues(t, length.Hash, n) //keys[txNum-1] = append(addr, loc...) - buf := EncodeAccountBytes(1, uint256.NewInt(0), nil, 0) - err = agg.UpdateAccountData(addr, buf) + buf := EncodeAccountBytes(1, uint256.NewInt(rnd.Uint64()), nil, 0) + err = domains.UpdateAccountData(addr, buf, nil) require.NoError(t, err) - err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + err = domains.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}, nil) require.NoError(t, err) - err = agg.UpdateCommitmentData([]byte("key"), aux[:]) + err = domains.UpdateCommitmentData(someKey, aux[:], nil) require.NoError(t, err) maxWrite = txNum - - require.NoError(t, agg.FinishTx()) } - agg.FinishWrites() - agg.Close() + _, err = domains.Commit(true, false) + require.NoError(t, err) + err = agg.Flush(context.Background(), tx) + require.NoError(t, err) err = tx.Commit() require.NoError(t, err) tx = nil - // Start another aggregator on same datadir - anotherAgg, err := NewAggregator(filepath.Join(path, "e4"), filepath.Join(path, "e4tmp"), aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie, logger) + //tx, err = db.BeginRw(context.Background()) + //require.NoError(t, err) + // + //ac := agg.MakeContext() + //ac.IterateAccounts(tx, []byte{}, func(addr, val []byte) { + // fmt.Printf("addr=%x val=%x\n", addr, val) + //}) + //ac.Close() + 
//tx.Rollback() + + err = agg.BuildFiles(txs) require.NoError(t, err) - require.NoError(t, anotherAgg.ReopenFolder()) + agg.FinishWrites() + agg.Close() + + // Start another aggregator on same datadir + anotherAgg, err := NewAggregatorV3(context.Background(), agg.dir, agg.dir, aggStep, db, logger) + require.NoError(t, err) defer anotherAgg.Close() + require.NoError(t, anotherAgg.OpenFolder()) + rwTx, err := db.BeginRw(context.Background()) require.NoError(t, err) defer func() { @@ -237,9 +256,15 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { } }() - anotherAgg.SetTx(rwTx) + //anotherAgg.SetTx(rwTx) startTx := anotherAgg.EndTxNumMinimax() - _, sstartTx, err := anotherAgg.SeekCommitment() + ac2 := anotherAgg.MakeContext() + defer ac2.Close() + dom2 := anotherAgg.SharedDomains(ac2) + dom2.SetTx(rwTx) + + _, sstartTx, err := dom2.SeekCommitment(0, 1<<63-1) + require.NoError(t, err) require.GreaterOrEqual(t, sstartTx, startTx) require.GreaterOrEqual(t, sstartTx, latestCommitTxNum) @@ -253,18 +278,20 @@ func TestAggregator_RestartOnDatadir(t *testing.T) { defer roTx.Rollback() dc := anotherAgg.MakeContext() - v, err := dc.ReadCommitment([]byte("key"), roTx) + v, ex, err := dc.GetLatest(kv.CommitmentDomain, someKey, nil, roTx) require.NoError(t, err) + require.True(t, ex) dc.Close() require.EqualValues(t, maxWrite, binary.BigEndian.Uint64(v[:])) } -func TestAggregator_RestartOnFiles(t *testing.T) { +func TestAggregatorV3_RestartOnFiles(t *testing.T) { logger := log.New() aggStep := uint64(100) - path, db, agg := testDbAndAggregator(t, aggStep) + db, agg := testDbAndAggregatorv3(t, aggStep) + path := filepath.Dir(agg.dir) tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -273,8 +300,13 @@ func TestAggregator_RestartOnFiles(t *testing.T) { tx.Rollback() } }() - agg.SetTx(tx) + //agg.SetTx(tx) agg.StartWrites() + domCtx := agg.MakeContext() + defer domCtx.Close() + domains := agg.SharedDomains(domCtx) + defer domains.Close() + domains.SetTx(tx) 
txs := aggStep * 5 t.Logf("step=%d tx_count=%d\n", aggStep, txs) @@ -283,7 +315,7 @@ func TestAggregator_RestartOnFiles(t *testing.T) { keys := make([][]byte, txs) for txNum := uint64(1); txNum <= txs; txNum++ { - agg.SetTxNum(txNum) + domains.SetTxNum(txNum) addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) n, err := rnd.Read(addr) @@ -295,72 +327,87 @@ func TestAggregator_RestartOnFiles(t *testing.T) { require.EqualValues(t, length.Hash, n) buf := EncodeAccountBytes(txNum, uint256.NewInt(1000000000000), nil, 0) - err = agg.UpdateAccountData(addr, buf[:]) + err = domains.UpdateAccountData(addr, buf[:], nil) require.NoError(t, err) - err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + err = domains.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}, nil) require.NoError(t, err) keys[txNum-1] = append(addr, loc...) - - err = agg.FinishTx() - require.NoError(t, err) } - agg.FinishWrites() + + // flush and build files + err = agg.Flush(context.Background(), tx) + require.NoError(t, err) + + latestStepInDB := agg.accounts.LastStepInDB(tx) + require.Equal(t, 5, int(latestStepInDB)) err = tx.Commit() require.NoError(t, err) + agg.FinishWrites() + + err = agg.BuildFiles(txs) + require.NoError(t, err) + tx = nil - db.Close() agg.Close() + db.Close() + // remove database files require.NoError(t, os.RemoveAll(filepath.Join(path, "db4"))) - newDb, err := mdbx.NewMDBX(logger).InMem(filepath.Join(path, "db4")).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { + // open new db and aggregator instances + newDb := mdbx.NewMDBX(logger).InMem(filepath.Join(path, "db4")).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { return kv.ChaindataTablesCfg - }).Open() - require.NoError(t, err) + }).MustOpen() t.Cleanup(newDb.Close) + newAgg, err := NewAggregatorV3(context.Background(), agg.dir, agg.dir, aggStep, newDb, logger) + require.NoError(t, err) + require.NoError(t, newAgg.OpenFolder()) + newTx, err := 
newDb.BeginRw(context.Background()) require.NoError(t, err) defer newTx.Rollback() - newAgg, err := NewAggregator(path, path, aggStep, CommitmentModeDirect, commitment.VariantHexPatriciaTrie, logger) - require.NoError(t, err) - require.NoError(t, newAgg.ReopenFolder()) + //newAgg.SetTx(newTx) + defer newAgg.StartWrites().FinishWrites() - newAgg.SetTx(newTx) - newAgg.StartWrites() + ac := newAgg.MakeContext() + defer ac.Close() + newDoms := newAgg.SharedDomains(ac) + defer newDoms.Close() + newDoms.SetTx(newTx) - _, latestTx, err := newAgg.SeekCommitment() + _, latestTx, err := newDoms.SeekCommitment(0, 1<<63-1) require.NoError(t, err) t.Logf("seek to latest_tx=%d", latestTx) - ctx := newAgg.defaultCtx miss := uint64(0) for i, key := range keys { if uint64(i+1) >= txs-aggStep { continue // finishtx always stores last agg step in db which we deleted, so missing values which were not aggregated is expected } - stored, err := ctx.ReadAccountData(key[:length.Addr], newTx) + stored, _, err := ac.GetLatest(kv.AccountsDomain, key[:length.Addr], nil, newTx) require.NoError(t, err) if len(stored) == 0 { miss++ - fmt.Printf("%x [%d/%d]", key, miss, i+1) // txnum starts from 1 + //fmt.Printf("%x [%d/%d]", key, miss, i+1) // txnum starts from 1 continue } - nonce, _, _ := DecodeAccountBytes(stored) - require.EqualValues(t, i+1, nonce) - storedV, err := ctx.ReadAccountStorage(key[:length.Addr], key[length.Addr:], newTx) + require.EqualValues(t, i+1, int(nonce)) + + storedV, found, err := ac.GetLatest(kv.StorageDomain, key[:length.Addr], key[length.Addr:], newTx) require.NoError(t, err) + require.True(t, found) + _ = key[0] + _ = storedV[0] require.EqualValues(t, key[0], storedV[0]) require.EqualValues(t, key[length.Addr], storedV[1]) } - newAgg.FinishWrites() - ctx.Close() newAgg.Close() require.NoError(t, err) @@ -369,8 +416,7 @@ func TestAggregator_RestartOnFiles(t *testing.T) { func TestAggregator_ReplaceCommittedKeys(t *testing.T) { aggStep := uint64(500) - _, db, agg := 
testDbAndAggregator(t, aggStep) - t.Cleanup(agg.Close) + db, agg := testDbAndAggregatorv3(t, aggStep) tx, err := db.BeginRw(context.Background()) require.NoError(t, err) @@ -379,31 +425,38 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { tx.Rollback() } }() - agg.SetTx(tx) - defer agg.StartWrites().FinishWrites() + defer agg.StartUnbufferedWrites().FinishWrites() + + ct := agg.MakeContext() + defer ct.Close() + domains := agg.SharedDomains(ct) + defer agg.CloseSharedDomains() + domains.SetTx(tx) var latestCommitTxNum uint64 commit := func(txn uint64) error { + ct.Close() err = tx.Commit() require.NoError(t, err) + tx, err = db.BeginRw(context.Background()) require.NoError(t, err) - t.Logf("commit to db txn=%d", txn) - + ct = agg.MakeContext() + domains = agg.SharedDomains(ct) atomic.StoreUint64(&latestCommitTxNum, txn) - agg.SetTx(tx) + domains.SetTx(tx) return nil } - roots := agg.AggregatedRoots() - txs := (aggStep) * StepsInBiggestFile + txs := (aggStep) * StepsInColdFile t.Logf("step=%d tx_count=%d", aggStep, txs) rnd := rand.New(rand.NewSource(0)) keys := make([][]byte, txs/2) - for txNum := uint64(1); txNum <= txs/2; txNum++ { - agg.SetTxNum(txNum) + var txNum uint64 + for txNum = uint64(1); txNum <= txs/2; txNum++ { + domains.SetTxNum(txNum) addr, loc := make([]byte, length.Addr), make([]byte, length.Hash) n, err := rnd.Read(addr) @@ -416,44 +469,46 @@ func TestAggregator_ReplaceCommittedKeys(t *testing.T) { keys[txNum-1] = append(addr, loc...) 
buf := EncodeAccountBytes(1, uint256.NewInt(0), nil, 0) - err = agg.UpdateAccountData(addr, buf) + + prev, _, err := ct.accounts.GetLatest(addr, nil, tx) require.NoError(t, err) - err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + err = domains.UpdateAccountData(addr, buf, prev) require.NoError(t, err) - err = agg.FinishTx() + prev, _, err = ct.storage.GetLatest(addr, loc, tx) require.NoError(t, err) - select { - case <-roots: - require.NoError(t, commit(txNum)) - default: - continue - } + err = domains.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}, prev) + require.NoError(t, err) + } + require.NoError(t, commit(txNum)) half := txs / 2 - for txNum := txs/2 + 1; txNum <= txs; txNum++ { - agg.SetTxNum(txNum) + for txNum = txNum + 1; txNum <= txs; txNum++ { + domains.SetTxNum(txNum) addr, loc := keys[txNum-1-half][:length.Addr], keys[txNum-1-half][length.Addr:] - err = agg.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}) + prev, _, err := ct.storage.GetLatest(addr, loc, tx) require.NoError(t, err) - - err = agg.FinishTx() + err = domains.WriteAccountStorage(addr, loc, []byte{addr[0], loc[0]}, prev) require.NoError(t, err) } + ct.Close() err = tx.Commit() tx = nil tx, err = db.BeginRw(context.Background()) require.NoError(t, err) - ctx := agg.defaultCtx - for _, key := range keys { - storedV, err := ctx.ReadAccountStorage(key[:length.Addr], key[length.Addr:], tx) + ctx := agg.MakeContext() + defer ctx.Close() + + for i, key := range keys { + storedV, found, err := ctx.storage.GetLatest(key[:length.Addr], key[length.Addr:], tx) + require.Truef(t, found, "key %x not found %d", key, i) require.NoError(t, err) require.EqualValues(t, key[0], storedV[0]) require.EqualValues(t, key[length.Addr], storedV[1]) @@ -481,48 +536,6 @@ func Test_EncodeCommitmentState(t *testing.T) { require.EqualValues(t, cs.trieState, dec.trieState) } -func Test_BtreeIndex_Seek(t *testing.T) { - tmp := t.TempDir() - logger := log.New() - - keyCount, M := 120000, 1024 
- dataPath := generateCompressedKV(t, tmp, 52, 180 /*val size*/, keyCount, logger) - defer os.RemoveAll(tmp) - - indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") - err := BuildBtreeIndex(dataPath, indexPath, logger) - require.NoError(t, err) - - bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M)) - require.NoError(t, err) - require.EqualValues(t, bt.KeyCount(), keyCount) - - keys, err := pivotKeysFromKV(dataPath) - require.NoError(t, err) - - for i := 0; i < len(keys); i++ { - cur, err := bt.Seek(keys[i]) - require.NoErrorf(t, err, "i=%d", i) - require.EqualValues(t, keys[i], cur.key) - require.NotEmptyf(t, cur.Value(), "i=%d", i) - // require.EqualValues(t, uint64(i), cur.Value()) - } - for i := 1; i < len(keys); i++ { - alt := common.Copy(keys[i]) - for j := len(alt) - 1; j >= 0; j-- { - if alt[j] > 0 { - alt[j] -= 1 - break - } - } - cur, err := bt.Seek(keys[i]) - require.NoError(t, err) - require.EqualValues(t, keys[i], cur.Key()) - } - - bt.Close() -} - func pivotKeysFromKV(dataPath string) ([][]byte, error) { decomp, err := compress.NewDecompressor(dataPath) if err != nil { @@ -549,7 +562,7 @@ func pivotKeysFromKV(dataPath string) ([][]byte, error) { return listing, nil } -func generateCompressedKV(tb testing.TB, tmp string, keySize, valueSize, keyCount int, logger log.Logger) string { +func generateKV(tb testing.TB, tmp string, keySize, valueSize, keyCount int, logger log.Logger, compressFlags FileCompression) string { tb.Helper() args := BtIndexWriterArgs{ @@ -569,22 +582,37 @@ func generateCompressedKV(tb testing.TB, tmp string, keySize, valueSize, keyCoun comp, err := compress.NewCompressor(context.Background(), "cmp", dataPath, tmp, compress.MinPatternScore, 1, log.LvlDebug, logger) require.NoError(tb, err) + collector := etl.NewCollector(BtreeLogPrefix+" genCompress", tb.TempDir(), etl.NewSortableBuffer(datasize.KB*8), logger) + for i := 0; i < keyCount; i++ { key := make([]byte, keySize) n, err := rnd.Read(key[:]) 
require.EqualValues(tb, keySize, n) binary.BigEndian.PutUint64(key[keySize-8:], uint64(i)) require.NoError(tb, err) - err = comp.AddWord(key[:]) - require.NoError(tb, err) n, err = rnd.Read(values[:rnd.Intn(valueSize)+1]) require.NoError(tb, err) - err = comp.AddWord(values[:n]) + err = collector.Collect(key, values[:n]) require.NoError(tb, err) } + writer := NewArchiveWriter(comp, compressFlags) + + loader := func(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error { + err = writer.AddWord(k) + require.NoError(tb, err) + err = writer.AddWord(v) + require.NoError(tb, err) + return nil + } + + err = collector.Load(nil, "", loader, etl.TransformArgs{}) + require.NoError(tb, err) + + collector.Close() + err = comp.Compress() require.NoError(tb, err) comp.Close() @@ -592,7 +620,7 @@ func generateCompressedKV(tb testing.TB, tmp string, keySize, valueSize, keyCoun decomp, err := compress.NewDecompressor(dataPath) require.NoError(tb, err) - getter := decomp.MakeGetter() + getter := NewArchiveGetter(decomp.MakeGetter(), compressFlags) getter.Reset(0) var pos uint64 @@ -617,21 +645,162 @@ func generateCompressedKV(tb testing.TB, tmp string, keySize, valueSize, keyCoun return decomp.FilePath() } -func Test_InitBtreeIndex(t *testing.T) { +func testDbAndAggregatorv3(t *testing.T, aggStep uint64) (kv.RwDB, *AggregatorV3) { + t.Helper() + path := t.TempDir() logger := log.New() - tmp := t.TempDir() + dir := filepath.Join(path, "snapshots", "history") + require.NoError(t, os.MkdirAll(filepath.Join(path, "db4"), 0740)) + require.NoError(t, os.MkdirAll(filepath.Join(path, "snapshots", "warm"), 0740)) + require.NoError(t, os.MkdirAll(dir, 0740)) + db := mdbx.NewMDBX(logger).InMem(path).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { + return kv.ChaindataTablesCfg + }).MustOpen() + t.Cleanup(db.Close) + + agg, err := NewAggregatorV3(context.Background(), dir, filepath.Join(path, "e4", "tmp"), aggStep, db, logger) + require.NoError(t, err) + 
t.Cleanup(agg.Close) + err = agg.OpenFolder() + agg.DisableFsync() + require.NoError(t, err) + return db, agg +} + +// generate test data for table tests, containing n; n < 20 keys of length 20 bytes and values of length <= 16 bytes +func generateInputData(tb testing.TB, keySize, valueSize, keyCount int) ([][]byte, [][]byte) { + tb.Helper() + + rnd := rand.New(rand.NewSource(0)) + values := make([][]byte, keyCount) + keys := make([][]byte, keyCount) + + bk, bv := make([]byte, keySize), make([]byte, valueSize) + for i := 0; i < keyCount; i++ { + n, err := rnd.Read(bk[:]) + require.EqualValues(tb, keySize, n) + require.NoError(tb, err) + keys[i] = common.Copy(bk[:n]) + + n, err = rnd.Read(bv[:rnd.Intn(valueSize)+1]) + require.NoError(tb, err) + + values[i] = common.Copy(bv[:n]) + } + return keys, values +} + +func TestAggregatorV3_SharedDomains(t *testing.T) { + db, agg := testDbAndAggregatorv3(t, 20) - keyCount, M := 100, uint64(4) - compPath := generateCompressedKV(t, tmp, 52, 300, keyCount, logger) - decomp, err := compress.NewDecompressor(compPath) + mc2 := agg.MakeContext() + defer mc2.Close() + domains := agg.SharedDomains(mc2) + + rwTx, err := db.BeginRw(context.Background()) require.NoError(t, err) - defer decomp.Close() + defer rwTx.Rollback() + + domains.SetTx(rwTx) + agg.StartWrites() + + //agg.StartUnbufferedWrites() + defer agg.FinishWrites() + defer domains.Close() + + keys, vals := generateInputData(t, 20, 16, 10) + keys = keys[:2] + + var i int + roots := make([][]byte, 0, 10) + var pruneFrom uint64 = 5 - err = BuildBtreeIndexWithDecompressor(tmp+".bt", decomp, &background.Progress{}, tmp, logger) + mc := agg.MakeContext() + defer mc.Close() + + for i = 0; i < len(vals); i++ { + domains.SetTxNum(uint64(i)) + + for j := 0; j < len(keys); j++ { + buf := EncodeAccountBytes(uint64(i), uint256.NewInt(uint64(i*100_000)), nil, 0) + prev, err := domains.LatestAccount(keys[j]) + require.NoError(t, err) + + err = domains.UpdateAccountData(keys[j], buf, prev) + 
//err = domains.UpdateAccountCode(keys[j], vals[i], nil) + require.NoError(t, err) + } + rh, err := domains.Commit(true, false) + require.NoError(t, err) + require.NotEmpty(t, rh) + roots = append(roots, rh) + } + + err = agg.Flush(context.Background(), rwTx) require.NoError(t, err) - bt, err := OpenBtreeIndexWithDecompressor(tmp+".bt", M, decomp) + ac := agg.MakeContext() + err = ac.Unwind(context.Background(), pruneFrom, rwTx) require.NoError(t, err) - require.EqualValues(t, bt.KeyCount(), keyCount) - bt.Close() + ac.Close() + + for i = int(pruneFrom); i < len(vals); i++ { + domains.SetTxNum(uint64(i)) + + for j := 0; j < len(keys); j++ { + buf := EncodeAccountBytes(uint64(i), uint256.NewInt(uint64(i*100_000)), nil, 0) + prev, _, err := mc.GetLatest(kv.AccountsDomain, keys[j], nil, rwTx) + require.NoError(t, err) + + err = domains.UpdateAccountData(keys[j], buf, prev) + require.NoError(t, err) + //err = domains.UpdateAccountCode(keys[j], vals[i], nil) + //require.NoError(t, err) + } + + rh, err := domains.Commit(true, false) + require.NoError(t, err) + require.NotEmpty(t, rh) + require.EqualValues(t, roots[i], rh) + } + + err = agg.Flush(context.Background(), rwTx) + require.NoError(t, err) + + pruneFrom = 3 + + ac.Close() + + ac = agg.MakeContext() + err = ac.Unwind(context.Background(), pruneFrom, rwTx) + ac.Close() + require.NoError(t, err) + + for i = int(pruneFrom); i < len(vals); i++ { + domains.SetTxNum(uint64(i)) + + for j := 0; j < len(keys); j++ { + buf := EncodeAccountBytes(uint64(i), uint256.NewInt(uint64(i*100_000)), nil, 0) + prev, _, err := mc.GetLatest(kv.AccountsDomain, keys[j], nil, rwTx) + require.NoError(t, err) + + err = domains.UpdateAccountData(keys[j], buf, prev) + require.NoError(t, err) + //err = domains.UpdateAccountCode(keys[j], vals[i], nil) + //require.NoError(t, err) + } + + rh, err := domains.Commit(true, false) + require.NoError(t, err) + require.NotEmpty(t, rh) + require.EqualValues(t, roots[i], rh) + } +} + +func 
Test_helper_decodeAccountv3Bytes(t *testing.T) { + input, err := hex.DecodeString("000114000101") + require.NoError(t, err) + + n, b, ch := DecodeAccountBytes(input) + fmt.Printf("input %x nonce %d balance %d codeHash %d\n", input, n, b.Uint64(), ch) } diff --git a/state/aggregator_v3.go b/state/aggregator_v3.go index 371f5a607..feca02260 100644 --- a/state/aggregator_v3.go +++ b/state/aggregator_v3.go @@ -22,6 +22,8 @@ import ( "errors" "fmt" math2 "math" + "os" + "path/filepath" "runtime" "strings" "sync" @@ -29,36 +31,48 @@ import ( "time" "github.com/RoaringBitmap/roaring/roaring64" + "github.com/ledgerwatch/log/v3" + rand2 "golang.org/x/exp/rand" + "golang.org/x/sync/errgroup" + + "github.com/ledgerwatch/erigon-lib/commitment" common2 "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/cmp" "github.com/ledgerwatch/erigon-lib/common/dbg" + "github.com/ledgerwatch/erigon-lib/common/dir" "github.com/ledgerwatch/erigon-lib/kv" "github.com/ledgerwatch/erigon-lib/kv/bitmapdb" "github.com/ledgerwatch/erigon-lib/kv/iter" "github.com/ledgerwatch/erigon-lib/kv/order" - "github.com/ledgerwatch/log/v3" - "golang.org/x/sync/errgroup" +) + +const ( + AccDomainLargeValues = true + StorageDomainLargeValues = true + CodeDomainLargeValues = true + CommitmentDomainLargeValues = true ) type AggregatorV3 struct { - rwTx kv.RwTx db kv.RoDB - storage *History + domains *SharedDomains + accounts *Domain + storage *Domain + code *Domain + commitment *DomainCommitted tracesTo *InvertedIndex - backgroundResult *BackgroundResult - code *History logAddrs *InvertedIndex logTopics *InvertedIndex tracesFrom *InvertedIndex - accounts *History - logPrefix string + backgroundResult *BackgroundResult dir string tmpdir string aggregationStep uint64 keepInDB uint64 minimaxTxNumInFiles atomic.Uint64 + aggregatedStep atomic.Uint64 filesMutationLock sync.Mutex @@ -73,10 +87,10 @@ type AggregatorV3 struct { 
ctxCancel context.CancelFunc needSaveFilesListInDB atomic.Bool - wg sync.WaitGroup + + wg sync.WaitGroup // goroutines spawned by Aggregator, to ensure all of them are finish at agg.Close onFreeze OnFreezeFunc - walLock sync.RWMutex ps *background.ProgressSet @@ -88,6 +102,11 @@ type AggregatorV3 struct { type OnFreezeFunc func(frozenFileNames []string) func NewAggregatorV3(ctx context.Context, dir, tmpdir string, aggregationStep uint64, db kv.RoDB, logger log.Logger) (*AggregatorV3, error) { + salt, err := getIndicesSalt(dir) + if err != nil { + return nil, err + } + ctx, ctxCancel := context.WithCancel(ctx) a := &AggregatorV3{ ctx: ctx, @@ -103,33 +122,105 @@ func NewAggregatorV3(ctx context.Context, dir, tmpdir string, aggregationStep ui backgroundResult: &BackgroundResult{}, logger: logger, } - var err error - if a.accounts, err = NewHistory(dir, a.tmpdir, aggregationStep, "accounts", kv.TblAccountHistoryKeys, kv.TblAccountIdx, kv.TblAccountHistoryVals, false, nil, false, logger); err != nil { + cfg := domainCfg{ + hist: histCfg{ + iiCfg: iiCfg{salt: salt, dir: dir, tmpdir: tmpdir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressNone, historyLargeValues: false, + }, + domainLargeValues: AccDomainLargeValues, + } + if a.accounts, err = NewDomain(cfg, aggregationStep, "accounts", kv.TblAccountKeys, kv.TblAccountVals, kv.TblAccountHistoryKeys, kv.TblAccountHistoryVals, kv.TblAccountIdx, logger); err != nil { return nil, err } - if a.storage, err = NewHistory(dir, a.tmpdir, aggregationStep, "storage", kv.TblStorageHistoryKeys, kv.TblStorageIdx, kv.TblStorageHistoryVals, false, nil, false, logger); err != nil { + cfg = domainCfg{ + hist: histCfg{ + iiCfg: iiCfg{salt: salt, dir: dir, tmpdir: tmpdir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressNone, historyLargeValues: false, + }, + domainLargeValues: StorageDomainLargeValues, + } + if a.storage, err = NewDomain(cfg, aggregationStep, "storage", 
kv.TblStorageKeys, kv.TblStorageVals, kv.TblStorageHistoryKeys, kv.TblStorageHistoryVals, kv.TblStorageIdx, logger); err != nil { return nil, err } - if a.code, err = NewHistory(dir, a.tmpdir, aggregationStep, "code", kv.TblCodeHistoryKeys, kv.TblCodeIdx, kv.TblCodeHistoryVals, true, nil, true, logger); err != nil { + cfg = domainCfg{ + hist: histCfg{ + iiCfg: iiCfg{salt: salt, dir: dir, tmpdir: tmpdir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressKeys | CompressVals, historyLargeValues: true, + }, + domainLargeValues: CodeDomainLargeValues, + } + if a.code, err = NewDomain(cfg, aggregationStep, "code", kv.TblCodeKeys, kv.TblCodeVals, kv.TblCodeHistoryKeys, kv.TblCodeHistoryVals, kv.TblCodeIdx, logger); err != nil { return nil, err } - if a.logAddrs, err = NewInvertedIndex(dir, a.tmpdir, aggregationStep, "logaddrs", kv.TblLogAddressKeys, kv.TblLogAddressIdx, false, nil, logger); err != nil { + cfg = domainCfg{ + hist: histCfg{ + iiCfg: iiCfg{salt: salt, dir: dir, tmpdir: tmpdir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressNone, historyLargeValues: true, + }, + domainLargeValues: CommitmentDomainLargeValues, + compress: CompressNone, + } + commitd, err := NewDomain(cfg, aggregationStep, "commitment", kv.TblCommitmentKeys, kv.TblCommitmentVals, kv.TblCommitmentHistoryKeys, kv.TblCommitmentHistoryVals, kv.TblCommitmentIdx, logger) + if err != nil { return nil, err } - if a.logTopics, err = NewInvertedIndex(dir, a.tmpdir, aggregationStep, "logtopics", kv.TblLogTopicsKeys, kv.TblLogTopicsIdx, false, nil, logger); err != nil { + a.commitment = NewCommittedDomain(commitd, CommitmentModeDirect, commitment.VariantHexPatriciaTrie) + idxCfg := iiCfg{salt: salt, dir: dir, tmpdir: a.tmpdir} + if a.logAddrs, err = NewInvertedIndex(idxCfg, aggregationStep, "logaddrs", kv.TblLogAddressKeys, kv.TblLogAddressIdx, false, true, nil, logger); err != nil { return nil, err } - if a.tracesFrom, err = NewInvertedIndex(dir, 
a.tmpdir, aggregationStep, "tracesfrom", kv.TblTracesFromKeys, kv.TblTracesFromIdx, false, nil, logger); err != nil { + idxCfg = iiCfg{salt: salt, dir: dir, tmpdir: a.tmpdir} + if a.logTopics, err = NewInvertedIndex(idxCfg, aggregationStep, "logtopics", kv.TblLogTopicsKeys, kv.TblLogTopicsIdx, false, true, nil, logger); err != nil { return nil, err } - if a.tracesTo, err = NewInvertedIndex(dir, a.tmpdir, aggregationStep, "tracesto", kv.TblTracesToKeys, kv.TblTracesToIdx, false, nil, logger); err != nil { + idxCfg = iiCfg{salt: salt, dir: dir, tmpdir: a.tmpdir} + if a.tracesFrom, err = NewInvertedIndex(idxCfg, aggregationStep, "tracesfrom", kv.TblTracesFromKeys, kv.TblTracesFromIdx, false, true, nil, logger); err != nil { + return nil, err + } + idxCfg = iiCfg{salt: salt, dir: dir, tmpdir: a.tmpdir} + if a.tracesTo, err = NewInvertedIndex(idxCfg, aggregationStep, "tracesto", kv.TblTracesToKeys, kv.TblTracesToIdx, false, true, nil, logger); err != nil { return nil, err } a.recalcMaxTxNum() return a, nil } + +// getIndicesSalt - try read salt for all indices from DB. Or fall-back to new salt creation. +// if db is Read-Only (for example remote RPCDaemon or utilities) - we will not create new indices - and existing indices have salt in metadata. 
+func getIndicesSalt(baseDir string) (salt *uint32, err error) { + fpath := filepath.Join(baseDir, "salt.txt") + if !dir.FileExist(fpath) { + if salt == nil { + saltV := rand2.Uint32() + salt = &saltV + } + saltBytes := make([]byte, 4) + binary.BigEndian.PutUint32(saltBytes, *salt) + if err := dir.WriteFileWithFsync(fpath, saltBytes, os.ModePerm); err != nil { + return nil, err + } + } + saltBytes, err := os.ReadFile(fpath) + if err != nil { + return nil, err + } + saltV := binary.BigEndian.Uint32(saltBytes) + salt = &saltV + return salt, nil +} + func (a *AggregatorV3) OnFreeze(f OnFreezeFunc) { a.onFreeze = f } +func (a *AggregatorV3) DisableFsync() { + a.accounts.DisableFsync() + a.storage.DisableFsync() + a.code.DisableFsync() + a.commitment.DisableFsync() + a.logAddrs.DisableFsync() + a.logTopics.DisableFsync() + a.tracesFrom.DisableFsync() + a.tracesTo.DisableFsync() +} func (a *AggregatorV3) OpenFolder() error { a.filesMutationLock.Lock() @@ -144,6 +235,9 @@ func (a *AggregatorV3) OpenFolder() error { if err = a.code.OpenFolder(); err != nil { return fmt.Errorf("OpenFolder: %w", err) } + if err = a.commitment.OpenFolder(); err != nil { + return fmt.Errorf("OpenFolder: %w", err) + } if err = a.logAddrs.OpenFolder(); err != nil { return fmt.Errorf("OpenFolder: %w", err) } @@ -157,32 +251,41 @@ func (a *AggregatorV3) OpenFolder() error { return fmt.Errorf("OpenFolder: %w", err) } a.recalcMaxTxNum() + mx := a.minimaxTxNumInFiles.Load() + if mx > 0 { + mx-- + } + a.aggregatedStep.Store(mx / a.aggregationStep) + return nil } -func (a *AggregatorV3) OpenList(fNames []string) error { +func (a *AggregatorV3) OpenList(fNames, warmNames []string) error { a.filesMutationLock.Lock() defer a.filesMutationLock.Unlock() var err error - if err = a.accounts.OpenList(fNames); err != nil { + if err = a.accounts.OpenList(fNames, warmNames); err != nil { + return err + } + if err = a.storage.OpenList(fNames, warmNames); err != nil { return err } - if err = 
a.storage.OpenList(fNames); err != nil { + if err = a.code.OpenList(fNames, warmNames); err != nil { return err } - if err = a.code.OpenList(fNames); err != nil { + if err = a.commitment.OpenList(fNames, warmNames); err != nil { return err } - if err = a.logAddrs.OpenList(fNames); err != nil { + if err = a.logAddrs.OpenList(fNames, warmNames); err != nil { return err } - if err = a.logTopics.OpenList(fNames); err != nil { + if err = a.logTopics.OpenList(fNames, warmNames); err != nil { return err } - if err = a.tracesFrom.OpenList(fNames); err != nil { + if err = a.tracesFrom.OpenList(fNames, warmNames); err != nil { return err } - if err = a.tracesTo.OpenList(fNames); err != nil { + if err = a.tracesTo.OpenList(fNames, warmNames); err != nil { return err } a.recalcMaxTxNum() @@ -190,7 +293,11 @@ func (a *AggregatorV3) OpenList(fNames []string) error { } func (a *AggregatorV3) Close() { + if a.ctxCancel == nil { // invariant: it's safe to call Close multiple times + return + } a.ctxCancel() + a.ctxCancel = nil a.wg.Wait() a.filesMutationLock.Lock() @@ -199,39 +306,35 @@ func (a *AggregatorV3) Close() { a.accounts.Close() a.storage.Close() a.code.Close() + a.commitment.Close() a.logAddrs.Close() a.logTopics.Close() a.tracesFrom.Close() a.tracesTo.Close() } -// CleanDir - call it manually on startup of Main application (don't call it from utilities or nother processes) -// - remove files ignored during opening of aggregator -// - remove files which marked as deleted but have no readers (usually last reader removing files marked as deleted) -func (a *AggregatorV3) CleanDir() { - a.accounts.deleteGarbageFiles() - a.storage.deleteGarbageFiles() - a.code.deleteGarbageFiles() - a.logAddrs.deleteGarbageFiles() - a.logTopics.deleteGarbageFiles() - a.tracesFrom.deleteGarbageFiles() - a.tracesTo.deleteGarbageFiles() - - ac := a.MakeContext() - defer ac.Close() - ac.a.accounts.cleanAfterFreeze(ac.accounts.frozenTo()) - ac.a.storage.cleanAfterFreeze(ac.storage.frozenTo()) - 
ac.a.code.cleanAfterFreeze(ac.code.frozenTo()) - ac.a.logAddrs.cleanAfterFreeze(ac.logAddrs.frozenTo()) - ac.a.logTopics.cleanAfterFreeze(ac.logTopics.frozenTo()) - ac.a.tracesFrom.cleanAfterFreeze(ac.tracesFrom.frozenTo()) - ac.a.tracesTo.cleanAfterFreeze(ac.tracesTo.frozenTo()) +func (a *AggregatorV3) CloseSharedDomains() { + if a.domains != nil { + a.domains.FinishWrites() + a.domains.SetTx(nil) + a.domains.Close() + a.domains = nil + } +} +func (a *AggregatorV3) SharedDomains(ac *AggregatorV3Context) *SharedDomains { + if a.domains == nil { + a.domains = NewSharedDomains(a.accounts, a.code, a.storage, a.commitment) + a.domains.SetInvertedIndices(a.tracesTo, a.tracesFrom, a.logAddrs, a.logTopics) + } + a.domains.SetContext(ac) + return a.domains } -func (a *AggregatorV3) SetWorkers(i int) { +func (a *AggregatorV3) SetCompressWorkers(i int) { a.accounts.compressWorkers = i a.storage.compressWorkers = i a.code.compressWorkers = i + a.commitment.compressWorkers = i a.logAddrs.compressWorkers = i a.logTopics.compressWorkers = i a.tracesFrom.compressWorkers = i @@ -241,17 +344,15 @@ func (a *AggregatorV3) SetWorkers(i int) { func (a *AggregatorV3) HasBackgroundFilesBuild() bool { return a.ps.Has() } func (a *AggregatorV3) BackgroundProgress() string { return a.ps.String() } -func (a *AggregatorV3) Files() (res []string) { - a.filesMutationLock.Lock() - defer a.filesMutationLock.Unlock() - - res = append(res, a.accounts.Files()...) - res = append(res, a.storage.Files()...) - res = append(res, a.code.Files()...) - res = append(res, a.logAddrs.Files()...) - res = append(res, a.logTopics.Files()...) - res = append(res, a.tracesFrom.Files()...) - res = append(res, a.tracesTo.Files()...) +func (ac *AggregatorV3Context) Files() (res []string) { + res = append(res, ac.accounts.Files()...) + res = append(res, ac.storage.Files()...) + res = append(res, ac.code.Files()...) + res = append(res, ac.commitment.Files()...) + res = append(res, ac.logAddrs.Files()...) 
+ res = append(res, ac.logTopics.Files()...) + res = append(res, ac.tracesFrom.Files()...) + res = append(res, ac.tracesTo.Files()...) return res } func (a *AggregatorV3) BuildOptionalMissedIndicesInBackground(ctx context.Context, workers int) { @@ -264,26 +365,47 @@ func (a *AggregatorV3) BuildOptionalMissedIndicesInBackground(ctx context.Contex defer a.buildingOptionalIndices.Store(false) aggCtx := a.MakeContext() defer aggCtx.Close() - if err := aggCtx.BuildOptionalMissedIndices(ctx, workers); err != nil { - if errors.Is(err, context.Canceled) { + if err := aggCtx.buildOptionalMissedIndices(ctx, workers); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, common2.ErrStopped) { return } - log.Warn("[snapshots] merge", "err", err) + log.Warn("[snapshots] BuildOptionalMissedIndicesInBackground", "err", err) } }() } -func (ac *AggregatorV3Context) BuildOptionalMissedIndices(ctx context.Context, workers int) error { +func (a *AggregatorV3) BuildOptionalMissedIndices(ctx context.Context, workers int) error { + if ok := a.buildingOptionalIndices.CompareAndSwap(false, true); !ok { + return nil + } + defer a.buildingOptionalIndices.Store(false) + aggCtx := a.MakeContext() + defer aggCtx.Close() + if err := aggCtx.buildOptionalMissedIndices(ctx, workers); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, common2.ErrStopped) { + return nil + } + return err + } + return nil +} + +// Useless +func (ac *AggregatorV3Context) buildOptionalMissedIndices(ctx context.Context, workers int) error { g, ctx := errgroup.WithContext(ctx) g.SetLimit(workers) + ps := background.NewProgressSet() if ac.accounts != nil { - g.Go(func() error { return ac.accounts.BuildOptionalMissedIndices(ctx) }) + g.Go(func() error { return ac.accounts.BuildOptionalMissedIndices(ctx, ps) }) } if ac.storage != nil { - g.Go(func() error { return ac.storage.BuildOptionalMissedIndices(ctx) }) + g.Go(func() error { return ac.storage.BuildOptionalMissedIndices(ctx, ps) }) } if 
ac.code != nil { - g.Go(func() error { return ac.code.BuildOptionalMissedIndices(ctx) }) + g.Go(func() error { return ac.code.BuildOptionalMissedIndices(ctx, ps) }) + } + if ac.commitment != nil { + g.Go(func() error { return ac.commitment.BuildOptionalMissedIndices(ctx, ps) }) } return g.Wait() } @@ -309,10 +431,10 @@ func (a *AggregatorV3) BuildMissedIndices(ctx context.Context, workers int) erro } } }() - a.accounts.BuildMissedIndices(ctx, g, ps) a.storage.BuildMissedIndices(ctx, g, ps) a.code.BuildMissedIndices(ctx, g, ps) + a.commitment.BuildMissedIndices(ctx, g, ps) a.logAddrs.BuildMissedIndices(ctx, g, ps) a.logTopics.BuildMissedIndices(ctx, g, ps) a.tracesFrom.BuildMissedIndices(ctx, g, ps) @@ -325,49 +447,41 @@ func (a *AggregatorV3) BuildMissedIndices(ctx context.Context, workers int) erro return err } } - - ac := a.MakeContext() - defer ac.Close() - return ac.BuildOptionalMissedIndices(ctx, workers) + return nil } -func (a *AggregatorV3) SetLogPrefix(v string) { a.logPrefix = v } - +// Deprecated func (a *AggregatorV3) SetTx(tx kv.RwTx) { - a.rwTx = tx + if a.domains != nil { + a.domains.SetTx(tx) + } + a.accounts.SetTx(tx) a.storage.SetTx(tx) a.code.SetTx(tx) + a.commitment.SetTx(tx) a.logAddrs.SetTx(tx) a.logTopics.SetTx(tx) a.tracesFrom.SetTx(tx) a.tracesTo.SetTx(tx) } -func (a *AggregatorV3) SetTxNum(txNum uint64) { - a.accounts.SetTxNum(txNum) - a.storage.SetTxNum(txNum) - a.code.SetTxNum(txNum) - a.logAddrs.SetTxNum(txNum) - a.logTopics.SetTxNum(txNum) - a.tracesFrom.SetTxNum(txNum) - a.tracesTo.SetTxNum(txNum) -} - type AggV3Collation struct { logAddrs map[string]*roaring64.Bitmap logTopics map[string]*roaring64.Bitmap tracesFrom map[string]*roaring64.Bitmap tracesTo map[string]*roaring64.Bitmap - accounts HistoryCollation - storage HistoryCollation - code HistoryCollation + accounts Collation + storage Collation + code Collation + commitment Collation } func (c AggV3Collation) Close() { c.accounts.Close() c.storage.Close() c.code.Close() + 
c.commitment.Close() for _, b := range c.logAddrs { bitmapdb.ReturnToPool64(b) @@ -383,174 +497,160 @@ func (c AggV3Collation) Close() { } } -func (a *AggregatorV3) buildFiles(ctx context.Context, step, txFrom, txTo uint64) (AggV3StaticFiles, error) { - //logEvery := time.NewTicker(60 * time.Second) - //defer logEvery.Stop() - //defer func(t time.Time) { - // log.Info(fmt.Sprintf("[snapshot] build %d-%d", step, step+1), "took", time.Since(t)) - //}(time.Now()) - var sf AggV3StaticFiles - var ac AggV3Collation - closeColl := true +type AggV3StaticFiles struct { + accounts StaticFiles + storage StaticFiles + code StaticFiles + commitment StaticFiles + logAddrs InvertedFiles + logTopics InvertedFiles + tracesFrom InvertedFiles + tracesTo InvertedFiles +} + +// CleanupOnError - call it on collation fail. It closing all files +func (sf AggV3StaticFiles) CleanupOnError() { + sf.accounts.CleanupOnError() + sf.storage.CleanupOnError() + sf.code.CleanupOnError() + sf.logAddrs.CleanupOnError() + sf.logTopics.CleanupOnError() + sf.tracesFrom.CleanupOnError() + sf.tracesTo.CleanupOnError() +} + +func (a *AggregatorV3) buildFiles(ctx context.Context, step uint64) error { + var ( + logEvery = time.NewTicker(time.Second * 30) + txFrom = step * a.aggregationStep + txTo = (step + 1) * a.aggregationStep + stepStartedAt = time.Now() + ) + + defer logEvery.Stop() + + defer a.needSaveFilesListInDB.Store(true) + defer a.recalcMaxTxNum() + var static AggV3StaticFiles + + //log.Warn("[dbg] collate", "step", step) + + closeCollations := true + collListMu := sync.Mutex{} + collations := make([]Collation, 0) defer func() { - if closeColl { - ac.Close() + if !closeCollations { + return + } + for _, c := range collations { + c.Close() } }() - //var wg sync.WaitGroup - //wg.Add(7) - //errCh := make(chan error, 7) - //go func() { - // defer wg.Done() - var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.accounts, err = a.accounts.collate(step, txFrom, txTo, tx) - return err - }); 
err != nil { - return sf, err - //errCh <- err - } - if sf.accounts, err = a.accounts.buildFiles(ctx, step, ac.accounts, a.ps); err != nil { - return sf, err - //errCh <- err - } - //}() - // - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.storage, err = a.storage.collate(step, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err - } + g, ctx := errgroup.WithContext(ctx) + for _, d := range []*Domain{a.accounts, a.storage, a.code, a.commitment.Domain} { + d := d - if sf.storage, err = a.storage.buildFiles(ctx, step, ac.storage, a.ps); err != nil { - return sf, err - //errCh <- err - } - //}() - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.code, err = a.code.collate(step, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err - } + a.wg.Add(1) + g.Go(func() error { + defer a.wg.Done() - if sf.code, err = a.code.buildFiles(ctx, step, ac.code, a.ps); err != nil { - return sf, err - //errCh <- err - } - //}() - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.logAddrs, err = a.logAddrs.collate(ctx, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err - } + var collation Collation + err := a.db.View(ctx, func(tx kv.Tx) (err error) { + collation, err = d.collate(ctx, step, txFrom, txTo, tx) + return err + }) + if err != nil { + return err + } + if err != nil { + return fmt.Errorf("domain collation %q has failed: %w", d.filenameBase, err) + } + collListMu.Lock() + collations = append(collations, collation) + collListMu.Unlock() + + mxCollationSize.Set(uint64(collation.valuesComp.Count())) + mxCollationSizeHist.Set(uint64(collation.historyComp.Count())) + + mxRunningMerges.Inc() + sf, err := d.buildFiles(ctx, step, collation, a.ps) + mxRunningMerges.Dec() + collation.Close() + if err != nil { + 
sf.CleanupOnError() + return err + } - if sf.logAddrs, err = a.logAddrs.buildFiles(ctx, step, ac.logAddrs, a.ps); err != nil { - return sf, err - //errCh <- err - } - //}() - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.logTopics, err = a.logTopics.collate(ctx, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err - } + switch kv.Domain(d.valsTable) { + case kv.TblAccountVals: + static.accounts = sf + case kv.TblStorageVals: + static.storage = sf + case kv.TblCodeVals: + static.code = sf + case kv.TblCommitmentVals: + static.commitment = sf + default: + panic("unknown domain " + d.valsTable) + } - if sf.logTopics, err = a.logTopics.buildFiles(ctx, step, ac.logTopics, a.ps); err != nil { - return sf, err - //errCh <- err + return nil + }) } - //}() - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.tracesFrom, err = a.tracesFrom.collate(ctx, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err + closeCollations = false + + // indices are built concurrently + for _, d := range []*InvertedIndex{a.logTopics, a.logAddrs, a.tracesFrom, a.tracesTo} { + d := d + a.wg.Add(1) + g.Go(func() error { + defer a.wg.Done() + var collation map[string]*roaring64.Bitmap + err := a.db.View(ctx, func(tx kv.Tx) (err error) { + collation, err = d.collate(ctx, step, step+1, tx) + return err + }) + if err != nil { + return fmt.Errorf("index collation %q has failed: %w", d.filenameBase, err) + } + sf, err := d.buildFiles(ctx, step, collation, a.ps) + if err != nil { + sf.CleanupOnError() + return err + } + + switch kv.Domain(d.indexKeysTable) { + case kv.TblLogTopicsKeys: + static.logTopics = sf + case kv.TblLogAddressKeys: + static.logAddrs = sf + case kv.TblTracesFromKeys: + static.tracesFrom = sf + case kv.TblTracesToKeys: + static.tracesTo = sf + default: + panic("unknown index " + d.indexKeysTable) + } + 
return nil + }) } - if sf.tracesFrom, err = a.tracesFrom.buildFiles(ctx, step, ac.tracesFrom, a.ps); err != nil { - return sf, err - //errCh <- err + if err := g.Wait(); err != nil { + static.CleanupOnError() + return fmt.Errorf("domain collate-build: %w", err) } - //}() - //go func() { - // defer wg.Done() - // var err error - if err = a.db.View(ctx, func(tx kv.Tx) error { - ac.tracesTo, err = a.tracesTo.collate(ctx, txFrom, txTo, tx) - return err - }); err != nil { - return sf, err - //errCh <- err - } - - if sf.tracesTo, err = a.tracesTo.buildFiles(ctx, step, ac.tracesTo, a.ps); err != nil { - return sf, err - // errCh <- err - } - //}() - //go func() { - // wg.Wait() - //close(errCh) - //}() - //var lastError error - //for err := range errCh { - // if err != nil { - // lastError = err - // } - //} - //if lastError == nil { - closeColl = false - //} - return sf, nil -} + mxStepTook.UpdateDuration(stepStartedAt) + a.integrateFiles(static, txFrom, txTo) + a.aggregatedStep.Store(step) -type AggV3StaticFiles struct { - accounts HistoryFiles - storage HistoryFiles - code HistoryFiles - logAddrs InvertedFiles - logTopics InvertedFiles - tracesFrom InvertedFiles - tracesTo InvertedFiles -} + a.logger.Info("[snapshots] aggregation", "step", step, "took", time.Since(stepStartedAt)) -func (sf AggV3StaticFiles) Close() { - sf.accounts.Close() - sf.storage.Close() - sf.code.Close() - sf.logAddrs.Close() - sf.logTopics.Close() - sf.tracesFrom.Close() - sf.tracesTo.Close() + return nil } func (a *AggregatorV3) BuildFiles(toTxNum uint64) (err error) { - a.BuildFilesInBackground(toTxNum) + finished := a.BuildFilesInBackground(toTxNum) if !(a.buildingFiles.Load() || a.mergeingFiles.Load() || a.buildingOptionalIndices.Load()) { return nil } @@ -562,6 +662,8 @@ Loop: select { case <-a.ctx.Done(): return a.ctx.Err() + case <-finished: + break Loop case <-logEvery.C: if !(a.buildingFiles.Load() || a.mergeingFiles.Load() || a.buildingOptionalIndices.Load()) { break Loop @@ -575,31 
+677,12 @@ Loop: return nil } -func (a *AggregatorV3) buildFilesInBackground(ctx context.Context, step uint64) (err error) { - closeAll := true - //log.Info("[snapshots] history build", "step", fmt.Sprintf("%d-%d", step, step+1)) - sf, err := a.buildFiles(ctx, step, step*a.aggregationStep, (step+1)*a.aggregationStep) - if err != nil { - return err - } - defer func() { - if closeAll { - sf.Close() - } - }() - a.integrateFiles(sf, step*a.aggregationStep, (step+1)*a.aggregationStep) - //a.notifyAboutNewSnapshots() - - closeAll = false - return nil -} - func (a *AggregatorV3) mergeLoopStep(ctx context.Context, workers int) (somethingDone bool, err error) { - ac := a.MakeContext() // this need, to ensure we do all operations on files in "transaction-style", maybe we will ensure it on type-level in future + ac := a.MakeContext() defer ac.Close() closeAll := true - maxSpan := a.aggregationStep * StepsInBiggestFile + maxSpan := a.aggregationStep * StepsInColdFile r := ac.findMergeRange(a.minimaxTxNumInFiles.Load(), maxSpan) if !r.any() { return false, nil @@ -629,7 +712,10 @@ func (a *AggregatorV3) mergeLoopStep(ctx context.Context, workers int) (somethin closeAll = false return true, nil } + func (a *AggregatorV3) MergeLoop(ctx context.Context, workers int) error { + a.logger.Warn("[dbg] MergeLoop start") + defer a.logger.Warn("[dbg] MergeLoop done") for { somethingMerged, err := a.mergeLoopStep(ctx, workers) if err != nil { @@ -649,6 +735,7 @@ func (a *AggregatorV3) integrateFiles(sf AggV3StaticFiles, txNumFrom, txNumTo ui a.accounts.integrateFiles(sf.accounts, txNumFrom, txNumTo) a.storage.integrateFiles(sf.storage, txNumFrom, txNumTo) a.code.integrateFiles(sf.code, txNumFrom, txNumTo) + a.commitment.integrateFiles(sf.commitment, txNumFrom, txNumTo) a.logAddrs.integrateFiles(sf.logAddrs, txNumFrom, txNumTo) a.logTopics.integrateFiles(sf.logTopics, txNumFrom, txNumTo) a.tracesFrom.integrateFiles(sf.tracesFrom, txNumFrom, txNumTo) @@ -659,33 +746,6 @@ func (a 
*AggregatorV3) HasNewFrozenFiles() bool { return a.needSaveFilesListInDB.CompareAndSwap(true, false) } -func (a *AggregatorV3) Unwind(ctx context.Context, txUnwindTo uint64) error { - logEvery := time.NewTicker(30 * time.Second) - defer logEvery.Stop() - if err := a.accounts.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.storage.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.code.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.logAddrs.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.logTopics.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.tracesFrom.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - if err := a.tracesTo.prune(ctx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { - return err - } - return nil -} - func (a *AggregatorV3) Warmup(ctx context.Context, txFrom, limit uint64) error { if a.db == nil { return nil @@ -700,6 +760,9 @@ func (a *AggregatorV3) Warmup(ctx context.Context, txFrom, limit uint64) error { e.Go(func() error { return a.db.View(ctx, func(tx kv.Tx) error { return a.code.warmup(ctx, txFrom, limit, tx) }) }) + e.Go(func() error { + return a.db.View(ctx, func(tx kv.Tx) error { return a.commitment.warmup(ctx, txFrom, limit, tx) }) + }) e.Go(func() error { return a.db.View(ctx, func(tx kv.Tx) error { return a.logAddrs.warmup(ctx, txFrom, limit, tx) }) }) @@ -720,6 +783,7 @@ func (a *AggregatorV3) DiscardHistory() *AggregatorV3 { a.accounts.DiscardHistory() a.storage.DiscardHistory() a.code.DiscardHistory() + a.commitment.DiscardHistory() a.logAddrs.DiscardHistory(a.tmpdir) a.logTopics.DiscardHistory(a.tmpdir) a.tracesFrom.DiscardHistory(a.tmpdir) @@ -729,90 
+793,119 @@ func (a *AggregatorV3) DiscardHistory() *AggregatorV3 { // StartWrites - pattern: `defer agg.StartWrites().FinishWrites()` func (a *AggregatorV3) StartWrites() *AggregatorV3 { - a.walLock.Lock() - defer a.walLock.Unlock() - a.accounts.StartWrites() - a.storage.StartWrites() - a.code.StartWrites() - a.logAddrs.StartWrites() - a.logTopics.StartWrites() - a.tracesFrom.StartWrites() - a.tracesTo.StartWrites() + if a.domains == nil { + a.SharedDomains(a.MakeContext()) + } + //a.walLock.Lock() + //defer a.walLock.Unlock() + //a.accounts.StartWrites() + //a.storage.StartWrites() + //a.code.StartWrites() + //a.commitment.StartWrites() + //a.logAddrs.StartWrites() + //a.logTopics.StartWrites() + //a.tracesFrom.StartWrites() + //a.tracesTo.StartWrites() + //return a + a.domains.StartWrites() return a } + func (a *AggregatorV3) StartUnbufferedWrites() *AggregatorV3 { - a.walLock.Lock() - defer a.walLock.Unlock() - a.accounts.StartWrites() - a.storage.StartWrites() - a.code.StartWrites() - a.logAddrs.StartWrites() - a.logTopics.StartWrites() - a.tracesFrom.StartWrites() - a.tracesTo.StartWrites() + if a.domains == nil { + a.SharedDomains(a.MakeContext()) + } + //a.walLock.Lock() + //defer a.walLock.Unlock() + //a.accounts.StartUnbufferedWrites() + //a.storage.StartUnbufferedWrites() + //a.code.StartUnbufferedWrites() + //a.commitment.StartUnbufferedWrites() + //a.logAddrs.StartUnbufferedWrites() + //a.logTopics.StartUnbufferedWrites() + //a.tracesFrom.StartUnbufferedWrites() + //a.tracesTo.StartUnbufferedWrites() + //return a + a.domains.StartUnbufferedWrites() return a } func (a *AggregatorV3) FinishWrites() { - a.walLock.Lock() - defer a.walLock.Unlock() - a.accounts.FinishWrites() - a.storage.FinishWrites() - a.code.FinishWrites() - a.logAddrs.FinishWrites() - a.logTopics.FinishWrites() - a.tracesFrom.FinishWrites() - a.tracesTo.FinishWrites() + //a.walLock.Lock() + //defer a.walLock.Unlock() + //a.accounts.FinishWrites() + //a.storage.FinishWrites() + 
//a.code.FinishWrites() + //a.commitment.FinishWrites() + //a.logAddrs.FinishWrites() + //a.logTopics.FinishWrites() + //a.tracesFrom.FinishWrites() + //a.tracesTo.FinishWrites() + if a.domains != nil { + a.domains.FinishWrites() + } } type flusher interface { Flush(ctx context.Context, tx kv.RwTx) error } -func (a *AggregatorV3) rotate() []flusher { - a.walLock.Lock() - defer a.walLock.Unlock() - return []flusher{ - a.accounts.Rotate(), - a.storage.Rotate(), - a.code.Rotate(), - a.logAddrs.Rotate(), - a.logTopics.Rotate(), - a.tracesFrom.Rotate(), - a.tracesTo.Rotate(), - } -} func (a *AggregatorV3) Flush(ctx context.Context, tx kv.RwTx) error { - flushers := a.rotate() - defer func(t time.Time) { log.Debug("[snapshots] history flush", "took", time.Since(t)) }(time.Now()) - for _, f := range flushers { - if err := f.Flush(ctx, tx); err != nil { - return err - } - } - return nil + return a.domains.Flush(ctx, tx) } -func (a *AggregatorV3) CanPrune(tx kv.Tx) bool { - return a.CanPruneFrom(tx) < a.minimaxTxNumInFiles.Load() +func (ac *AggregatorV3Context) maxTxNumInFiles(cold bool) uint64 { + return cmp.Min( + cmp.Min( + cmp.Min( + ac.accounts.maxTxNumInFiles(cold), + ac.code.maxTxNumInFiles(cold)), + cmp.Min( + ac.storage.maxTxNumInFiles(cold), + ac.commitment.maxTxNumInFiles(cold)), + ), + cmp.Min( + cmp.Min( + ac.logAddrs.maxTxNumInFiles(cold), + ac.logTopics.maxTxNumInFiles(cold)), + cmp.Min( + ac.tracesFrom.maxTxNumInFiles(cold), + ac.tracesTo.maxTxNumInFiles(cold)), + ), + ) } -func (a *AggregatorV3) CanPruneFrom(tx kv.Tx) uint64 { - fst, _ := kv.FirstKey(tx, kv.TblTracesToKeys) - fst2, _ := kv.FirstKey(tx, kv.TblStorageHistoryKeys) - if len(fst) > 0 && len(fst2) > 0 { + +func (ac *AggregatorV3Context) CanPrune(tx kv.Tx) bool { + //fmt.Printf("can prune: from=%d < current=%d, keep=%d\n", ac.CanPruneFrom(tx)/ac.a.aggregationStep, ac.maxTxNumInFiles(false)/ac.a.aggregationStep, ac.a.keepInDB) + return ac.CanPruneFrom(tx) < ac.maxTxNumInFiles(false) +} +func (ac 
*AggregatorV3Context) CanPruneFrom(tx kv.Tx) uint64 { + fst, _ := kv.FirstKey(tx, ac.a.tracesTo.indexKeysTable) + fst2, _ := kv.FirstKey(tx, ac.a.storage.History.indexKeysTable) + fst3, _ := kv.FirstKey(tx, ac.a.commitment.History.indexKeysTable) + if len(fst) > 0 && len(fst2) > 0 && len(fst3) > 0 { fstInDb := binary.BigEndian.Uint64(fst) fstInDb2 := binary.BigEndian.Uint64(fst2) - return cmp.Min(fstInDb, fstInDb2) + fstInDb3 := binary.BigEndian.Uint64(fst3) + return cmp.Min(cmp.Min(fstInDb, fstInDb2), fstInDb3) } return math2.MaxUint64 } -func (a *AggregatorV3) PruneWithTiemout(ctx context.Context, timeout time.Duration) error { - t := time.Now() - for a.CanPrune(a.rwTx) && time.Since(t) < timeout { - if err := a.Prune(ctx, 1_000); err != nil { // prune part of retired data, before commit - return err +func (ac *AggregatorV3Context) PruneWithTimeout(ctx context.Context, timeout time.Duration, tx kv.RwTx) error { + cc, cancel := context.WithTimeout(ctx, timeout) + defer cancel() + + //for s := ac.a.stepToPrune.Load(); s < ac.a.aggregatedStep.Load(); s++ { + if err := ac.Prune(cc, ac.a.aggregatedStep.Load(), math2.MaxUint64, tx); err != nil { // prune part of retired data, before commit + if errors.Is(err, context.DeadlineExceeded) { + return nil } + return err } + if cc.Err() != nil { //nolint + return nil //nolint + } + //} return nil } @@ -821,6 +914,7 @@ func (a *AggregatorV3) StepsRangeInDBAsStr(tx kv.Tx) string { a.accounts.stepsRangeInDBAsStr(tx), a.storage.stepsRangeInDBAsStr(tx), a.code.stepsRangeInDBAsStr(tx), + a.commitment.stepsRangeInDBAsStr(tx), a.logAddrs.stepsRangeInDBAsStr(tx), a.logTopics.stepsRangeInDBAsStr(tx), a.tracesFrom.stepsRangeInDBAsStr(tx), @@ -828,84 +922,134 @@ func (a *AggregatorV3) StepsRangeInDBAsStr(tx kv.Tx) string { }, ", ") } -func (a *AggregatorV3) Prune(ctx context.Context, limit uint64) error { - //if limit/a.aggregationStep > StepsInBiggestFile { - // ctx, cancel := context.WithCancel(ctx) - // defer cancel() - // - // 
a.wg.Add(1) - // go func() { - // defer a.wg.Done() - // _ = a.Warmup(ctx, 0, cmp.Max(a.aggregationStep, limit)) // warmup is asyn and moving faster than data deletion - // }() - //} - return a.prune(ctx, 0, a.minimaxTxNumInFiles.Load(), limit) +func (ac *AggregatorV3Context) Prune(ctx context.Context, step, limit uint64, tx kv.RwTx) error { + if dbg.NoPrune() { + return nil + } + + txTo := step * ac.a.aggregationStep + var txFrom uint64 + + logEvery := time.NewTicker(30 * time.Second) + defer logEvery.Stop() + ac.a.logger.Info("aggregator prune", "step", step, + "range", fmt.Sprintf("[%d,%d)", txFrom, txTo), /*"limit", limit, + "stepsLimit", limit/ac.a.aggregationStep,*/"stepsRangeInDB", ac.a.StepsRangeInDBAsStr(tx)) + + if err := ac.accounts.Prune(ctx, tx, step, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.storage.Prune(ctx, tx, step, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.code.Prune(ctx, tx, step, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.commitment.Prune(ctx, tx, step, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.logAddrs.Prune(ctx, tx, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.logTopics.Prune(ctx, tx, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.tracesFrom.Prune(ctx, tx, txFrom, txTo, limit, logEvery); err != nil { + return err + } + if err := ac.tracesTo.Prune(ctx, tx, txFrom, txTo, limit, logEvery); err != nil { + return err + } + return nil } -func (a *AggregatorV3) prune(ctx context.Context, txFrom, txTo, limit uint64) error { +func (ac *AggregatorV3Context) Unwind(ctx context.Context, txUnwindTo uint64, rwTx kv.RwTx) error { + step := txUnwindTo / ac.a.aggregationStep + logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - if err := a.accounts.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + ac.a.logger.Info("aggregator unwind", 
"step", step, + "txUnwindTo", txUnwindTo, "stepsRangeInDB", ac.a.StepsRangeInDBAsStr(rwTx)) + + if err := ac.accounts.Unwind(ctx, rwTx, step, txUnwindTo, math2.MaxUint64, math2.MaxUint64, nil); err != nil { + return err + } + if err := ac.storage.Unwind(ctx, rwTx, step, txUnwindTo, math2.MaxUint64, math2.MaxUint64, nil); err != nil { + return err + } + if err := ac.code.Unwind(ctx, rwTx, step, txUnwindTo, math2.MaxUint64, math2.MaxUint64, nil); err != nil { return err } - if err := a.storage.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.commitment.Unwind(ctx, rwTx, step, txUnwindTo, math2.MaxUint64, math2.MaxUint64, nil); err != nil { return err } - if err := a.code.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.logAddrs.Prune(ctx, rwTx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { return err } - if err := a.logAddrs.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.logTopics.Prune(ctx, rwTx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { return err } - if err := a.logTopics.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.tracesFrom.Prune(ctx, rwTx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { return err } - if err := a.tracesFrom.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.tracesTo.Prune(ctx, rwTx, txUnwindTo, math2.MaxUint64, math2.MaxUint64, logEvery); err != nil { return err } - if err := a.tracesTo.prune(ctx, txFrom, txTo, limit, logEvery); err != nil { + if err := ac.a.domains.Unwind(ctx, rwTx, txUnwindTo); err != nil { return err } return nil } -func (a *AggregatorV3) LogStats(tx kv.Tx, tx2block func(endTxNumMinimax uint64) uint64) { - if a.minimaxTxNumInFiles.Load() == 0 { +func (ac *AggregatorV3Context) LogStats(tx kv.Tx, tx2block func(endTxNumMinimax uint64) uint64) { + if ac.a.minimaxTxNumInFiles.Load() == 0 { return } - histBlockNumProgress := 
tx2block(a.minimaxTxNumInFiles.Load()) - str := make([]string, 0, a.accounts.InvertedIndex.files.Len()) - a.accounts.InvertedIndex.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - bn := tx2block(item.endTxNum) - str = append(str, fmt.Sprintf("%d=%dK", item.endTxNum/a.aggregationStep, bn/1_000)) - } - return true - }) - c, err := tx.CursorDupSort(a.accounts.InvertedIndex.indexTable) - if err != nil { - // TODO pass error properly around - panic(err) + histBlockNumProgress := tx2block(ac.maxTxNumInFiles(false)) + str := make([]string, 0, len(ac.accounts.files)) + for _, item := range ac.accounts.files { + bn := tx2block(item.endTxNum) + str = append(str, fmt.Sprintf("%d=%dK", item.endTxNum/ac.a.aggregationStep, bn/1_000)) } - _, v, err := c.First() - if err != nil { - // TODO pass error properly around - panic(err) - } - var firstHistoryIndexBlockInDB uint64 - if len(v) != 0 { - firstHistoryIndexBlockInDB = tx2block(binary.BigEndian.Uint64(v)) + //str2 := make([]string, 0, len(ac.storage.files)) + //for _, item := range ac.storage.files { + // str2 = append(str2, fmt.Sprintf("%s:%dm", item.src.decompressor.FileName(), item.src.decompressor.Count()/1_000_000)) + //} + //for _, item := range ac.commitment.files { + // bn := tx2block(item.endTxNum) / 1_000 + // str2 = append(str2, fmt.Sprintf("%s:%dK", item.src.decompressor.FileName(), bn)) + //} + var lastCommitmentBlockNum, lastCommitmentTxNum uint64 + if len(ac.commitment.files) > 0 { + lastCommitmentTxNum = ac.commitment.files[len(ac.commitment.files)-1].endTxNum + lastCommitmentBlockNum = tx2block(lastCommitmentTxNum) } - + firstHistoryIndexBlockInDB := tx2block(ac.a.accounts.FirstStepInDB(tx) * ac.a.aggregationStep) var m runtime.MemStats dbg.ReadMemStats(&m) log.Info("[snapshots] History Stat", "blocks", fmt.Sprintf("%dk", (histBlockNumProgress+1)/1000), - "txs", fmt.Sprintf("%dm", a.minimaxTxNumInFiles.Load()/1_000_000), + "txs", fmt.Sprintf("%dm", 
ac.a.minimaxTxNumInFiles.Load()/1_000_000), "txNum2blockNum", strings.Join(str, ","), "first_history_idx_in_db", firstHistoryIndexBlockInDB, + "last_comitment_block", lastCommitmentBlockNum, + "last_comitment_tx_num", lastCommitmentTxNum, + //"cnt_in_files", strings.Join(str2, ","), + //"used_files", strings.Join(ac.Files(), ","), "alloc", common2.ByteCount(m.Alloc), "sys", common2.ByteCount(m.Sys)) + +} + +func (a *AggregatorV3) EndTxNumNoCommitment() uint64 { + min := a.accounts.endTxNumMinimax() + if txNum := a.storage.endTxNumMinimax(); txNum < min { + min = txNum + } + if txNum := a.code.endTxNumMinimax(); txNum < min { + min = txNum + } + return min } func (a *AggregatorV3) EndTxNumMinimax() uint64 { return a.minimaxTxNumInFiles.Load() } @@ -915,7 +1059,10 @@ func (a *AggregatorV3) EndTxNumFrozenAndIndexed() uint64 { a.accounts.endIndexedTxNumMinimax(true), a.storage.endIndexedTxNumMinimax(true), ), - a.code.endIndexedTxNumMinimax(true), + cmp.Min( + a.code.endIndexedTxNumMinimax(true), + a.commitment.endIndexedTxNumMinimax(true), + ), ) } func (a *AggregatorV3) recalcMaxTxNum() { @@ -926,6 +1073,9 @@ func (a *AggregatorV3) recalcMaxTxNum() { if txNum := a.code.endTxNumMinimax(); txNum < min { min = txNum } + if txNum := a.commitment.endTxNumMinimax(); txNum < min { + min = txNum + } if txNum := a.logAddrs.endTxNumMinimax(); txNum < min { min = txNum } @@ -942,9 +1092,10 @@ func (a *AggregatorV3) recalcMaxTxNum() { } type RangesV3 struct { - accounts HistoryRanges - storage HistoryRanges - code HistoryRanges + accounts DomainRanges + storage DomainRanges + code DomainRanges + commitment DomainRanges logTopicsStartTxNum uint64 logAddrsEndTxNum uint64 logAddrsStartTxNum uint64 @@ -959,46 +1110,88 @@ type RangesV3 struct { tracesTo bool } +func (r RangesV3) String() string { + ss := []string{} + if r.accounts.any() { + ss = append(ss, fmt.Sprintf("accounts(%s)", r.accounts.String())) + } + if r.storage.any() { + ss = append(ss, fmt.Sprintf("storage(%s)", 
r.storage.String())) + } + if r.code.any() { + ss = append(ss, fmt.Sprintf("code(%s)", r.code.String())) + } + if r.commitment.any() { + ss = append(ss, fmt.Sprintf("commitment(%s)", r.commitment.String())) + } + if r.logAddrs { + ss = append(ss, fmt.Sprintf("logAddr=%d-%d", r.logAddrsStartTxNum/r.accounts.aggStep, r.logAddrsEndTxNum/r.accounts.aggStep)) + } + if r.logTopics { + ss = append(ss, fmt.Sprintf("logTopic=%d-%d", r.logTopicsStartTxNum/r.accounts.aggStep, r.logTopicsEndTxNum/r.accounts.aggStep)) + } + if r.tracesFrom { + ss = append(ss, fmt.Sprintf("traceFrom=%d-%d", r.tracesFromStartTxNum/r.accounts.aggStep, r.tracesFromEndTxNum/r.accounts.aggStep)) + } + if r.tracesTo { + ss = append(ss, fmt.Sprintf("traceTo=%d-%d", r.tracesToStartTxNum/r.accounts.aggStep, r.tracesToEndTxNum/r.accounts.aggStep)) + } + return strings.Join(ss, ", ") +} func (r RangesV3) any() bool { - return r.accounts.any() || r.storage.any() || r.code.any() || r.logAddrs || r.logTopics || r.tracesFrom || r.tracesTo + return r.accounts.any() || r.storage.any() || r.code.any() || r.commitment.any() || r.logAddrs || r.logTopics || r.tracesFrom || r.tracesTo } func (ac *AggregatorV3Context) findMergeRange(maxEndTxNum, maxSpan uint64) RangesV3 { var r RangesV3 - r.accounts = ac.a.accounts.findMergeRange(maxEndTxNum, maxSpan) - r.storage = ac.a.storage.findMergeRange(maxEndTxNum, maxSpan) - r.code = ac.a.code.findMergeRange(maxEndTxNum, maxSpan) - r.logAddrs, r.logAddrsStartTxNum, r.logAddrsEndTxNum = ac.a.logAddrs.findMergeRange(maxEndTxNum, maxSpan) - r.logTopics, r.logTopicsStartTxNum, r.logTopicsEndTxNum = ac.a.logTopics.findMergeRange(maxEndTxNum, maxSpan) - r.tracesFrom, r.tracesFromStartTxNum, r.tracesFromEndTxNum = ac.a.tracesFrom.findMergeRange(maxEndTxNum, maxSpan) - r.tracesTo, r.tracesToStartTxNum, r.tracesToEndTxNum = ac.a.tracesTo.findMergeRange(maxEndTxNum, maxSpan) - //log.Info(fmt.Sprintf("findMergeRange(%d, %d)=%+v\n", maxEndTxNum, maxSpan, r)) + r.accounts = 
ac.accounts.findMergeRange(maxEndTxNum, maxSpan) + r.storage = ac.storage.findMergeRange(maxEndTxNum, maxSpan) + r.code = ac.code.findMergeRange(maxEndTxNum, maxSpan) + r.commitment = ac.commitment.findMergeRange(maxEndTxNum, maxSpan) + r.logAddrs, r.logAddrsStartTxNum, r.logAddrsEndTxNum = ac.logAddrs.findMergeRange(maxEndTxNum, maxSpan) + r.logTopics, r.logTopicsStartTxNum, r.logTopicsEndTxNum = ac.logTopics.findMergeRange(maxEndTxNum, maxSpan) + r.tracesFrom, r.tracesFromStartTxNum, r.tracesFromEndTxNum = ac.tracesFrom.findMergeRange(maxEndTxNum, maxSpan) + r.tracesTo, r.tracesToStartTxNum, r.tracesToEndTxNum = ac.tracesTo.findMergeRange(maxEndTxNum, maxSpan) + //log.Info(fmt.Sprintf("findMergeRange(%d, %d)=%s\n", maxEndTxNum/ac.a.aggregationStep, maxSpan/ac.a.aggregationStep, r)) return r } type SelectedStaticFilesV3 struct { - logTopics []*filesItem - accountsHist []*filesItem - tracesTo []*filesItem - storageIdx []*filesItem - storageHist []*filesItem - tracesFrom []*filesItem - codeIdx []*filesItem - codeHist []*filesItem - accountsIdx []*filesItem - logAddrs []*filesItem - codeI int - logAddrsI int - logTopicsI int - storageI int - tracesFromI int - accountsI int - tracesToI int + accounts []*filesItem + accountsIdx []*filesItem + accountsHist []*filesItem + storage []*filesItem + storageIdx []*filesItem + storageHist []*filesItem + code []*filesItem + codeIdx []*filesItem + codeHist []*filesItem + commitment []*filesItem + commitmentIdx []*filesItem + commitmentHist []*filesItem + logTopics []*filesItem + tracesTo []*filesItem + tracesFrom []*filesItem + logAddrs []*filesItem + accountsI int + storageI int + codeI int + commitmentI int + logAddrsI int + logTopicsI int + tracesFromI int + tracesToI int } func (sf SelectedStaticFilesV3) Close() { - for _, group := range [][]*filesItem{sf.accountsIdx, sf.accountsHist, sf.storageIdx, sf.accountsHist, sf.codeIdx, sf.codeHist, - sf.logAddrs, sf.logTopics, sf.tracesFrom, sf.tracesTo} { + clist := 
[...][]*filesItem{ + sf.accounts, sf.accountsIdx, sf.accountsHist, + sf.storage, sf.storageIdx, sf.storageHist, + sf.code, sf.codeIdx, sf.codeHist, + sf.commitment, sf.commitmentIdx, sf.commitmentHist, + sf.logAddrs, sf.logTopics, sf.tracesFrom, sf.tracesTo, + } + for _, group := range clist { for _, item := range group { if item != nil { if item.decompressor != nil { @@ -1014,22 +1207,16 @@ func (sf SelectedStaticFilesV3) Close() { func (ac *AggregatorV3Context) staticFilesInRange(r RangesV3) (sf SelectedStaticFilesV3, err error) { if r.accounts.any() { - sf.accountsIdx, sf.accountsHist, sf.accountsI, err = ac.accounts.staticFilesInRange(r.accounts) - if err != nil { - return sf, err - } + sf.accounts, sf.accountsIdx, sf.accountsHist, sf.accountsI = ac.accounts.staticFilesInRange(r.accounts) } if r.storage.any() { - sf.storageIdx, sf.storageHist, sf.storageI, err = ac.storage.staticFilesInRange(r.storage) - if err != nil { - return sf, err - } + sf.storage, sf.storageIdx, sf.storageHist, sf.storageI = ac.storage.staticFilesInRange(r.storage) } if r.code.any() { - sf.codeIdx, sf.codeHist, sf.codeI, err = ac.code.staticFilesInRange(r.code) - if err != nil { - return sf, err - } + sf.code, sf.codeIdx, sf.codeHist, sf.codeI = ac.code.staticFilesInRange(r.code) + } + if r.commitment.any() { + sf.commitment, sf.commitmentIdx, sf.commitmentHist, sf.commitmentI = ac.commitment.staticFilesInRange(r.commitment) } if r.logAddrs { sf.logAddrs, sf.logAddrsI = ac.logAddrs.staticFilesInRange(r.logAddrsStartTxNum, r.logAddrsEndTxNum) @@ -1047,13 +1234,18 @@ func (ac *AggregatorV3Context) staticFilesInRange(r RangesV3) (sf SelectedStatic } type MergedFilesV3 struct { - accountsIdx, accountsHist *filesItem - storageIdx, storageHist *filesItem - codeIdx, codeHist *filesItem - logAddrs *filesItem - logTopics *filesItem - tracesFrom *filesItem - tracesTo *filesItem + accounts *filesItem + accountsIdx, accountsHist *filesItem + storage *filesItem + storageIdx, storageHist *filesItem + 
code *filesItem + codeIdx, codeHist *filesItem + commitment *filesItem + commitmentIdx, commitmentHist *filesItem + logAddrs *filesItem + logTopics *filesItem + tracesFrom *filesItem + tracesTo *filesItem } func (mf MergedFilesV3) FrozenList() (frozen []string) { @@ -1093,8 +1285,15 @@ func (mf MergedFilesV3) FrozenList() (frozen []string) { return frozen } func (mf MergedFilesV3) Close() { - for _, item := range []*filesItem{mf.accountsIdx, mf.accountsHist, mf.storageIdx, mf.storageHist, mf.codeIdx, mf.codeHist, - mf.logAddrs, mf.logTopics, mf.tracesFrom, mf.tracesTo} { + clist := [...]*filesItem{ + mf.accounts, mf.accountsIdx, mf.accountsHist, + mf.storage, mf.storageIdx, mf.storageHist, + mf.code, mf.codeIdx, mf.codeHist, + mf.commitment, mf.commitmentIdx, mf.commitmentHist, + mf.logAddrs, mf.logTopics, mf.tracesFrom, mf.tracesTo, + } + + for _, item := range clist { if item != nil { if item.decompressor != nil { item.decompressor.Close() @@ -1116,28 +1315,44 @@ func (ac *AggregatorV3Context) mergeFiles(ctx context.Context, files SelectedSta mf.Close() } }() + + var predicates sync.WaitGroup if r.accounts.any() { - g.Go(func() error { - var err error - mf.accountsIdx, mf.accountsHist, err = ac.a.accounts.mergeFiles(ctx, files.accountsIdx, files.accountsHist, r.accounts, workers, ac.a.ps) + log.Info(fmt.Sprintf("[snapshots] merge: %s", r.String())) + predicates.Add(1) + g.Go(func() (err error) { + defer predicates.Done() + mf.accounts, mf.accountsIdx, mf.accountsHist, err = ac.a.accounts.mergeFiles(ctx, files.accounts, files.accountsIdx, files.accountsHist, r.accounts, workers, ac.a.ps) return err }) } if r.storage.any() { - g.Go(func() error { - var err error - mf.storageIdx, mf.storageHist, err = ac.a.storage.mergeFiles(ctx, files.storageIdx, files.storageHist, r.storage, workers, ac.a.ps) + predicates.Add(1) + g.Go(func() (err error) { + defer predicates.Done() + mf.storage, mf.storageIdx, mf.storageHist, err = ac.a.storage.mergeFiles(ctx, files.storage, 
files.storageIdx, files.storageHist, r.storage, workers, ac.a.ps) return err }) } if r.code.any() { - g.Go(func() error { - var err error - mf.codeIdx, mf.codeHist, err = ac.a.code.mergeFiles(ctx, files.codeIdx, files.codeHist, r.code, workers, ac.a.ps) + g.Go(func() (err error) { + mf.code, mf.codeIdx, mf.codeHist, err = ac.a.code.mergeFiles(ctx, files.code, files.codeIdx, files.codeHist, r.code, workers, ac.a.ps) return err }) } + if r.commitment.any() { + predicates.Wait() + //log.Info(fmt.Sprintf("[snapshots] merge commitment: %d-%d", r.accounts.historyStartTxNum/ac.a.aggregationStep, r.accounts.historyEndTxNum/ac.a.aggregationStep)) + g.Go(func() (err error) { + var v4Files SelectedStaticFiles + var v4MergedF MergedFiles + + mf.commitment, mf.commitmentIdx, mf.commitmentHist, err = ac.a.commitment.mergeFiles(ctx, v4Files.FillV3(&files), v4MergedF.FillV3(&mf), r.commitment, workers, ac.a.ps) + return err + }) + } + if r.logAddrs { g.Go(func() error { var err error @@ -1178,9 +1393,11 @@ func (a *AggregatorV3) integrateMergedFiles(outs SelectedStaticFilesV3, in Merge defer a.filesMutationLock.Unlock() defer a.needSaveFilesListInDB.Store(true) defer a.recalcMaxTxNum() - a.accounts.integrateMergedFiles(outs.accountsIdx, outs.accountsHist, in.accountsIdx, in.accountsHist) - a.storage.integrateMergedFiles(outs.storageIdx, outs.storageHist, in.storageIdx, in.storageHist) - a.code.integrateMergedFiles(outs.codeIdx, outs.codeHist, in.codeIdx, in.codeHist) + + a.accounts.integrateMergedFiles(outs.accounts, outs.accountsIdx, outs.accountsHist, in.accounts, in.accountsIdx, in.accountsHist) + a.storage.integrateMergedFiles(outs.storage, outs.storageIdx, outs.storageHist, in.storage, in.storageIdx, in.storageHist) + a.code.integrateMergedFiles(outs.code, outs.codeIdx, outs.codeHist, in.code, in.codeIdx, in.codeHist) + a.commitment.integrateMergedFiles(outs.commitment, outs.commitmentIdx, outs.commitmentHist, in.commitment, in.commitmentIdx, in.commitmentHist) 
a.logAddrs.integrateMergedFiles(outs.logAddrs, in.logAddrs) a.logTopics.integrateMergedFiles(outs.logTopics, in.logTopics) a.tracesFrom.integrateMergedFiles(outs.tracesFrom, in.tracesFrom) @@ -1189,15 +1406,10 @@ func (a *AggregatorV3) integrateMergedFiles(outs SelectedStaticFilesV3, in Merge return frozen } func (a *AggregatorV3) cleanAfterNewFreeze(in MergedFilesV3) { - if in.accountsHist != nil && in.accountsHist.frozen { - a.accounts.cleanAfterFreeze(in.accountsHist.endTxNum) - } - if in.storageHist != nil && in.storageHist.frozen { - a.storage.cleanAfterFreeze(in.storageHist.endTxNum) - } - if in.codeHist != nil && in.codeHist.frozen { - a.code.cleanAfterFreeze(in.codeHist.endTxNum) - } + a.accounts.cleanAfterFreeze(in.accounts, in.accountsHist, in.accountsIdx) + a.storage.cleanAfterFreeze(in.storage, in.storageHist, in.storageIdx) + a.code.cleanAfterFreeze(in.code, in.codeHist, in.codeIdx) + a.commitment.cleanAfterFreeze(in.commitment, in.commitmentHist, in.commitmentIdx) if in.logAddrs != nil && in.logAddrs.frozen { a.logAddrs.cleanAfterFreeze(in.logAddrs.endTxNum) } @@ -1212,31 +1424,36 @@ func (a *AggregatorV3) cleanAfterNewFreeze(in MergedFilesV3) { } } -// KeepInDB - usually equal to one a.aggregationStep, but when we exec blocks from snapshots +// KeepStepsInDB - usually equal to one a.aggregationStep, but when we exec blocks from snapshots // we can set it to 0, because no re-org on this blocks are possible -func (a *AggregatorV3) KeepInDB(v uint64) { a.keepInDB = v } +func (a *AggregatorV3) KeepStepsInDB(steps uint64) *AggregatorV3 { + a.keepInDB = steps * a.aggregationStep + return a +} + +// Returns channel which is closed when aggregation is done +func (a *AggregatorV3) BuildFilesInBackground(txNum uint64) chan struct{} { + fin := make(chan struct{}) -func (a *AggregatorV3) BuildFilesInBackground(txNum uint64) { if (txNum + 1) <= a.minimaxTxNumInFiles.Load()+a.aggregationStep+a.keepInDB { // Leave one step worth in the DB - return + return fin } if 
ok := a.buildingFiles.CompareAndSwap(false, true); !ok { - return + return fin } step := a.minimaxTxNumInFiles.Load() / a.aggregationStep - toTxNum := (step + 1) * a.aggregationStep - hasData := false a.wg.Add(1) go func() { defer a.wg.Done() defer a.buildingFiles.Store(false) - // check if db has enough data (maybe we didn't commit them yet) - lastInDB := lastIdInDB(a.db, a.accounts.indexKeysTable) - hasData = lastInDB >= toTxNum + // check if db has enough data (maybe we didn't commit them yet or all keys are unique so history is empty) + lastInDB := lastIdInDB(a.db, a.accounts) + hasData := lastInDB > step // `step` must be fully-written - means `step+1` records must be visible if !hasData { + close(fin) return } @@ -1244,26 +1461,29 @@ func (a *AggregatorV3) BuildFilesInBackground(txNum uint64) { // - to reduce amount of small merges // - to remove old data from db as early as possible // - during files build, may happen commit of new data. on each loop step getting latest id in db - for step < lastIdInDB(a.db, a.accounts.indexKeysTable)/a.aggregationStep { - if err := a.buildFilesInBackground(a.ctx, step); err != nil { - if errors.Is(err, context.Canceled) { + for ; step < lastIdInDB(a.db, a.accounts); step++ { //`step` must be fully-written - means `step+1` records must be visible + if err := a.buildFiles(a.ctx, step); err != nil { + if errors.Is(err, context.Canceled) || errors.Is(err, common2.ErrStopped) { + close(fin) return } log.Warn("[snapshots] buildFilesInBackground", "err", err) break } - step++ } + a.BuildOptionalMissedIndicesInBackground(a.ctx, 1) if ok := a.mergeingFiles.CompareAndSwap(false, true); !ok { + close(fin) return } a.wg.Add(1) go func() { defer a.wg.Done() defer a.mergeingFiles.Store(false) + defer func() { close(fin) }() if err := a.MergeLoop(a.ctx, 1); err != nil { - if errors.Is(err, context.Canceled) { + if errors.Is(err, context.Canceled) || errors.Is(err, common2.ErrStopped) { return } log.Warn("[snapshots] merge", "err", err) @@ 
-1272,93 +1492,39 @@ func (a *AggregatorV3) BuildFilesInBackground(txNum uint64) { a.BuildOptionalMissedIndicesInBackground(a.ctx, 1) }() }() + return fin } func (a *AggregatorV3) BatchHistoryWriteStart() *AggregatorV3 { - a.walLock.RLock() + //a.walLock.RLock() + a.domains.BatchHistoryWriteStart() return a } -func (a *AggregatorV3) BatchHistoryWriteEnd() { - a.walLock.RUnlock() -} - -func (a *AggregatorV3) AddAccountPrev(addr []byte, prev []byte) error { - return a.accounts.AddPrevValue(addr, nil, prev) -} - -func (a *AggregatorV3) AddStoragePrev(addr []byte, loc []byte, prev []byte) error { - return a.storage.AddPrevValue(addr, loc, prev) -} -// AddCodePrev - addr+inc => code -func (a *AggregatorV3) AddCodePrev(addr []byte, prev []byte) error { - return a.code.AddPrevValue(addr, nil, prev) +func (a *AggregatorV3) BatchHistoryWriteEnd() { + //a.walLock.RUnlock() + a.domains.BatchHistoryWriteEnd() } -func (a *AggregatorV3) PutIdx(idx kv.InvertedIdx, key []byte) error { - switch idx { - case kv.TblTracesFromIdx: - return a.tracesFrom.Add(key) - case kv.TblTracesToIdx: - return a.tracesTo.Add(key) - case kv.TblLogAddressIdx: - return a.logAddrs.Add(key) - case kv.LogTopicIndex: - return a.logTopics.Add(key) - default: - panic(idx) - } -} - -// DisableReadAhead - usage: `defer d.EnableReadAhead().DisableReadAhead()`. Please don't use this funcs without `defer` to avoid leak. 
-func (a *AggregatorV3) DisableReadAhead() { - a.accounts.DisableReadAhead() - a.storage.DisableReadAhead() - a.code.DisableReadAhead() - a.logAddrs.DisableReadAhead() - a.logTopics.DisableReadAhead() - a.tracesFrom.DisableReadAhead() - a.tracesTo.DisableReadAhead() -} -func (a *AggregatorV3) EnableReadAhead() *AggregatorV3 { - a.accounts.EnableReadAhead() - a.storage.EnableReadAhead() - a.code.EnableReadAhead() - a.logAddrs.EnableReadAhead() - a.logTopics.EnableReadAhead() - a.tracesFrom.EnableReadAhead() - a.tracesTo.EnableReadAhead() - return a -} -func (a *AggregatorV3) EnableMadvWillNeed() *AggregatorV3 { - a.accounts.EnableMadvWillNeed() - a.storage.EnableMadvWillNeed() - a.code.EnableMadvWillNeed() - a.logAddrs.EnableMadvWillNeed() - a.logTopics.EnableMadvWillNeed() - a.tracesFrom.EnableMadvWillNeed() - a.tracesTo.EnableMadvWillNeed() - return a -} -func (a *AggregatorV3) EnableMadvNormal() *AggregatorV3 { - a.accounts.EnableMadvNormalReadAhead() - a.storage.EnableMadvNormalReadAhead() - a.code.EnableMadvNormalReadAhead() - a.logAddrs.EnableMadvNormalReadAhead() - a.logTopics.EnableMadvNormalReadAhead() - a.tracesFrom.EnableMadvNormalReadAhead() - a.tracesTo.EnableMadvNormalReadAhead() - return a +// ComputeCommitment evaluates commitment for processed state. +// If `saveStateAfter`=true, then trie state will be saved to DB after commitment evaluation. +func (a *AggregatorV3) ComputeCommitment(saveStateAfter, trace bool) (rootHash []byte, err error) { + // if commitment mode is Disabled, there will be nothing to compute on. + // TODO: create new SharedDomain with new aggregator Context to compute commitment on most recent committed state. + // for now we use only one sharedDomain -> no major difference among contexts. 
+ //aggCtx := a.MakeContext() + //defer aggCtx.Close() + return a.domains.Commit(saveStateAfter, trace) } func (ac *AggregatorV3Context) IndexRange(name kv.InvertedIdx, k []byte, fromTs, toTs int, asc order.By, limit int, tx kv.Tx) (timestamps iter.U64, err error) { switch name { case kv.AccountsHistoryIdx: - return ac.accounts.IdxRange(k, fromTs, toTs, asc, limit, tx) + return ac.accounts.hc.IdxRange(k, fromTs, toTs, asc, limit, tx) case kv.StorageHistoryIdx: - return ac.storage.IdxRange(k, fromTs, toTs, asc, limit, tx) + return ac.storage.hc.IdxRange(k, fromTs, toTs, asc, limit, tx) case kv.CodeHistoryIdx: - return ac.code.IdxRange(k, fromTs, toTs, asc, limit, tx) + return ac.code.hc.IdxRange(k, fromTs, toTs, asc, limit, tx) case kv.LogTopicIdx: return ac.logTopics.IdxRange(k, fromTs, toTs, asc, limit, tx) case kv.LogAddrIdx: @@ -1375,11 +1541,7 @@ func (ac *AggregatorV3Context) IndexRange(name kv.InvertedIdx, k []byte, fromTs, // -- range end func (ac *AggregatorV3Context) ReadAccountDataNoStateWithRecent(addr []byte, txNum uint64, tx kv.Tx) ([]byte, bool, error) { - return ac.accounts.GetNoStateWithRecent(addr, txNum, tx) -} - -func (ac *AggregatorV3Context) ReadAccountDataNoState(addr []byte, txNum uint64) ([]byte, bool, error) { - return ac.accounts.GetNoState(addr, txNum) + return ac.accounts.hc.GetNoStateWithRecent(addr, txNum, tx) } func (ac *AggregatorV3Context) ReadAccountStorageNoStateWithRecent(addr []byte, loc []byte, txNum uint64, tx kv.Tx) ([]byte, bool, error) { @@ -1390,82 +1552,45 @@ func (ac *AggregatorV3Context) ReadAccountStorageNoStateWithRecent(addr []byte, } copy(ac.keyBuf, addr) copy(ac.keyBuf[len(addr):], loc) - return ac.storage.GetNoStateWithRecent(ac.keyBuf, txNum, tx) + return ac.storage.hc.GetNoStateWithRecent(ac.keyBuf, txNum, tx) } func (ac *AggregatorV3Context) ReadAccountStorageNoStateWithRecent2(key []byte, txNum uint64, tx kv.Tx) ([]byte, bool, error) { - return ac.storage.GetNoStateWithRecent(key, txNum, tx) -} - -func (ac 
*AggregatorV3Context) ReadAccountStorageNoState(addr []byte, loc []byte, txNum uint64) ([]byte, bool, error) { - if cap(ac.keyBuf) < len(addr)+len(loc) { - ac.keyBuf = make([]byte, len(addr)+len(loc)) - } else if len(ac.keyBuf) != len(addr)+len(loc) { - ac.keyBuf = ac.keyBuf[:len(addr)+len(loc)] - } - copy(ac.keyBuf, addr) - copy(ac.keyBuf[len(addr):], loc) - return ac.storage.GetNoState(ac.keyBuf, txNum) + return ac.storage.hc.GetNoStateWithRecent(key, txNum, tx) } func (ac *AggregatorV3Context) ReadAccountCodeNoStateWithRecent(addr []byte, txNum uint64, tx kv.Tx) ([]byte, bool, error) { - return ac.code.GetNoStateWithRecent(addr, txNum, tx) -} -func (ac *AggregatorV3Context) ReadAccountCodeNoState(addr []byte, txNum uint64) ([]byte, bool, error) { - return ac.code.GetNoState(addr, txNum) + return ac.code.hc.GetNoStateWithRecent(addr, txNum, tx) } - -func (ac *AggregatorV3Context) ReadAccountCodeSizeNoStateWithRecent(addr []byte, txNum uint64, tx kv.Tx) (int, bool, error) { - code, noState, err := ac.code.GetNoStateWithRecent(addr, txNum, tx) - if err != nil { - return 0, false, err - } - return len(code), noState, nil -} -func (ac *AggregatorV3Context) ReadAccountCodeSizeNoState(addr []byte, txNum uint64) (int, bool, error) { - code, noState, err := ac.code.GetNoState(addr, txNum) - if err != nil { - return 0, false, err - } - return len(code), noState, nil -} - func (ac *AggregatorV3Context) AccountHistoryRange(startTxNum, endTxNum int, asc order.By, limit int, tx kv.Tx) (iter.KV, error) { - return ac.accounts.HistoryRange(startTxNum, endTxNum, asc, limit, tx) + return ac.accounts.hc.HistoryRange(startTxNum, endTxNum, asc, limit, tx) } func (ac *AggregatorV3Context) StorageHistoryRange(startTxNum, endTxNum int, asc order.By, limit int, tx kv.Tx) (iter.KV, error) { - return ac.storage.HistoryRange(startTxNum, endTxNum, asc, limit, tx) + return ac.storage.hc.HistoryRange(startTxNum, endTxNum, asc, limit, tx) } func (ac *AggregatorV3Context) 
CodeHistoryRange(startTxNum, endTxNum int, asc order.By, limit int, tx kv.Tx) (iter.KV, error) { - return ac.code.HistoryRange(startTxNum, endTxNum, asc, limit, tx) + return ac.code.hc.HistoryRange(startTxNum, endTxNum, asc, limit, tx) } -func (ac *AggregatorV3Context) AccountHistoricalStateRange(startTxNum uint64, from, to []byte, limit int, tx kv.Tx) iter.KV { - return ac.accounts.WalkAsOf(startTxNum, from, to, tx, limit) -} - -func (ac *AggregatorV3Context) StorageHistoricalStateRange(startTxNum uint64, from, to []byte, limit int, tx kv.Tx) iter.KV { - return ac.storage.WalkAsOf(startTxNum, from, to, tx, limit) -} - -func (ac *AggregatorV3Context) CodeHistoricalStateRange(startTxNum uint64, from, to []byte, limit int, tx kv.Tx) iter.KV { - return ac.code.WalkAsOf(startTxNum, from, to, tx, limit) -} - -type FilesStats22 struct { -} +type FilesStats22 struct{} func (a *AggregatorV3) Stats() FilesStats22 { var fs FilesStats22 return fs } +// AggregatorV3Context guarantee consistent View of files: +// - long-living consistent view of all files (no limitations) +// - hiding garbage and files overlaps +// - protecting useful files from removal +// - other will not see "partial writes" or "new files appearance" type AggregatorV3Context struct { a *AggregatorV3 - accounts *HistoryContext - storage *HistoryContext - code *HistoryContext + accounts *DomainContext + storage *DomainContext + code *DomainContext + commitment *DomainContext logAddrs *InvertedIndexContext logTopics *InvertedIndexContext tracesFrom *InvertedIndexContext @@ -1481,6 +1606,7 @@ func (a *AggregatorV3) MakeContext() *AggregatorV3Context { accounts: a.accounts.MakeContext(), storage: a.storage.MakeContext(), code: a.code.MakeContext(), + commitment: a.commitment.MakeContext(), logAddrs: a.logAddrs.MakeContext(), logTopics: a.logTopics.MakeContext(), tracesFrom: a.tracesFrom.MakeContext(), @@ -1491,11 +1617,87 @@ func (a *AggregatorV3) MakeContext() *AggregatorV3Context { return ac } + +// --- Domain 
part START --- + +func (ac *AggregatorV3Context) DomainRange(tx kv.Tx, domain kv.Domain, fromKey, toKey []byte, ts uint64, asc order.By, limit int) (it iter.KV, err error) { + switch domain { + case kv.AccountsDomain: + return ac.accounts.DomainRange(tx, fromKey, toKey, ts, asc, limit) + case kv.StorageDomain: + return ac.storage.DomainRange(tx, fromKey, toKey, ts, asc, limit) + case kv.CodeDomain: + return ac.code.DomainRange(tx, fromKey, toKey, ts, asc, limit) + case kv.CommitmentDomain: + return ac.commitment.DomainRange(tx, fromKey, toKey, ts, asc, limit) + default: + panic(domain) + } +} +func (ac *AggregatorV3Context) DomainRangeLatest(tx kv.Tx, domain kv.Domain, from, to []byte, limit int) (iter.KV, error) { + switch domain { + case kv.AccountsDomain: + return ac.accounts.DomainRangeLatest(tx, from, to, limit) + case kv.StorageDomain: + return ac.storage.DomainRangeLatest(tx, from, to, limit) + case kv.CodeDomain: + return ac.code.DomainRangeLatest(tx, from, to, limit) + case kv.CommitmentDomain: + return ac.commitment.DomainRangeLatest(tx, from, to, limit) + default: + panic(domain) + } +} + +func (ac *AggregatorV3Context) IterateAccounts(tx kv.Tx, pref []byte, fn func(key, value []byte)) error { + return ac.accounts.IteratePrefix(tx, pref, fn) +} +func (ac *AggregatorV3Context) DomainGetAsOf(tx kv.Tx, name kv.Domain, key []byte, ts uint64) (v []byte, ok bool, err error) { + switch name { + case kv.AccountsDomain: + v, err := ac.accounts.GetAsOf(key, ts, tx) + return v, v != nil, err + case kv.StorageDomain: + v, err := ac.storage.GetAsOf(key, ts, tx) + return v, v != nil, err + case kv.CodeDomain: + v, err := ac.code.GetAsOf(key, ts, tx) + return v, v != nil, err + case kv.CommitmentDomain: + v, err := ac.commitment.GetAsOf(key, ts, tx) + return v, v != nil, err + default: + panic(fmt.Sprintf("unexpected: %s", name)) + } +} +func (ac *AggregatorV3Context) GetLatest(domain kv.Domain, k, k2 []byte, tx kv.Tx) (v []byte, ok bool, err error) { + switch domain { 
+ case kv.AccountsDomain: + return ac.accounts.GetLatest(k, k2, tx) + case kv.StorageDomain: + return ac.storage.GetLatest(k, k2, tx) + case kv.CodeDomain: + return ac.code.GetLatest(k, k2, tx) + case kv.CommitmentDomain: + return ac.commitment.GetLatest(k, k2, tx) + default: + panic(fmt.Sprintf("unexpected: %s", domain)) + } +} + +// --- Domain part END --- + func (ac *AggregatorV3Context) Close() { + if ac.a == nil { // invariant: it's safe to call Close multiple times + return + } ac.a.leakDetector.Del(ac.id) + ac.a = nil + ac.accounts.Close() ac.storage.Close() ac.code.Close() + ac.commitment.Close() ac.logAddrs.Close() ac.logTopics.Close() ac.tracesFrom.Close() @@ -1517,12 +1719,10 @@ func (br *BackgroundResult) GetAndReset() (bool, error) { return has, err } -func lastIdInDB(db kv.RoDB, table string) (lstInDb uint64) { +// Inverted index tables only +func lastIdInDB(db kv.RoDB, domain *Domain) (lstInDb uint64) { if err := db.View(context.Background(), func(tx kv.Tx) error { - lst, _ := kv.LastKey(tx, table) - if len(lst) > 0 { - lstInDb = binary.BigEndian.Uint64(lst) - } + lstInDb = domain.LastStepInDB(tx) return nil }); err != nil { log.Warn("[snapshots] lastIdInDB", "err", err) @@ -1533,11 +1733,12 @@ func lastIdInDB(db kv.RoDB, table string) (lstInDb uint64) { // AggregatorStep is used for incremental reconstitution, it allows // accessing history in isolated way for each step type AggregatorStep struct { - a *AggregatorV3 - accounts *HistoryStep - storage *HistoryStep - code *HistoryStep - keyBuf []byte + a *AggregatorV3 + accounts *HistoryStep + storage *HistoryStep + code *HistoryStep + commitment *HistoryStep + keyBuf []byte } func (a *AggregatorV3) MakeSteps() ([]*AggregatorStep, error) { @@ -1545,16 +1746,18 @@ func (a *AggregatorV3) MakeSteps() ([]*AggregatorStep, error) { accountSteps := a.accounts.MakeSteps(frozenAndIndexed) codeSteps := a.code.MakeSteps(frozenAndIndexed) storageSteps := a.storage.MakeSteps(frozenAndIndexed) + commitmentSteps := 
a.commitment.MakeSteps(frozenAndIndexed) if len(accountSteps) != len(storageSteps) || len(storageSteps) != len(codeSteps) { return nil, fmt.Errorf("different limit of steps (try merge snapshots): accountSteps=%d, storageSteps=%d, codeSteps=%d", len(accountSteps), len(storageSteps), len(codeSteps)) } steps := make([]*AggregatorStep, len(accountSteps)) for i, accountStep := range accountSteps { steps[i] = &AggregatorStep{ - a: a, - accounts: accountStep, - storage: storageSteps[i], - code: codeSteps[i], + a: a, + accounts: accountStep, + storage: storageSteps[i], + code: codeSteps[i], + commitment: commitmentSteps[i], } } return steps, nil diff --git a/state/archive.go b/state/archive.go new file mode 100644 index 000000000..4b37a1f4d --- /dev/null +++ b/state/archive.go @@ -0,0 +1,113 @@ +package state + +import "github.com/ledgerwatch/erigon-lib/compress" + +type FileCompression uint8 + +const ( + CompressNone FileCompression = 0b0 // no compression + CompressKeys FileCompression = 0b1 // compress keys only + CompressVals FileCompression = 0b10 // compress values only +) + +type getter struct { + *compress.Getter + nextValue bool // if nextValue true then getter.Next() expected to return value + c FileCompression // compressed +} + +func NewArchiveGetter(g *compress.Getter, c FileCompression) ArchiveGetter { + return &getter{Getter: g, c: c} +} + +func (g *getter) MatchPrefix(prefix []byte) bool { + if g.c&CompressKeys != 0 { + return g.Getter.MatchPrefix(prefix) + } + return g.Getter.MatchPrefixUncompressed(prefix) == 0 +} + +func (g *getter) Next(buf []byte) ([]byte, uint64) { + fl := CompressKeys + if g.nextValue { + fl = CompressVals + g.nextValue = false + } else { + g.nextValue = true + } + + if g.c&fl != 0 { + return g.Getter.Next(buf) + } + return g.Getter.NextUncompressed() +} + +func (g *getter) Reset(offset uint64) { + g.nextValue = false + g.Getter.Reset(offset) +} +func (g *getter) Skip() (uint64, int) { + fl := CompressKeys + if g.nextValue { + fl = 
CompressVals + g.nextValue = false + } else { + g.nextValue = true + } + + if g.c&fl != 0 { + return g.Getter.Skip() + } + return g.Getter.SkipUncompressed() + +} + +// ArchiveGetter hides if the underlying compress.Getter is compressed or not +type ArchiveGetter interface { + HasNext() bool + FileName() string + MatchPrefix(prefix []byte) bool + Skip() (uint64, int) + Size() int + Next(buf []byte) ([]byte, uint64) + Reset(offset uint64) +} + +type ArchiveWriter interface { + AddWord(word []byte) error + Count() int + Compress() error + DisableFsync() + Close() +} + +type compWriter struct { + *compress.Compressor + keyWritten bool + c FileCompression +} + +func NewArchiveWriter(kv *compress.Compressor, compress FileCompression) ArchiveWriter { + return &compWriter{kv, false, compress} +} + +func (c *compWriter) AddWord(word []byte) error { + fl := CompressKeys + if c.keyWritten { + fl = CompressVals + c.keyWritten = false + } else { + c.keyWritten = true + } + + if c.c&fl != 0 { + return c.Compressor.AddWord(word) + } + return c.Compressor.AddUncompressedWord(word) +} + +func (c *compWriter) Close() { + if c.Compressor != nil { + c.Compressor.Close() + } +} diff --git a/state/archive_test.go b/state/archive_test.go new file mode 100644 index 000000000..c64b0d858 --- /dev/null +++ b/state/archive_test.go @@ -0,0 +1,125 @@ +package state + +import ( + "bytes" + "context" + "path" + "path/filepath" + "sort" + "testing" + + "github.com/c2h5oh/datasize" + "github.com/ledgerwatch/log/v3" + "github.com/stretchr/testify/require" + + "github.com/ledgerwatch/erigon-lib/compress" +) + +func TestArchiveWriter(t *testing.T) { + tmp := t.TempDir() + logger := log.New() + + td := generateTestData(t, 20, 52, 1, 1, 100000) + + openWriter := func(tb testing.TB, tmp, name string, compFlags FileCompression) ArchiveWriter { + tb.Helper() + file := filepath.Join(tmp, name) + comp, err := compress.NewCompressor(context.Background(), "", file, tmp, 8, 1, log.LvlDebug, logger) + 
require.NoError(tb, err) + return NewArchiveWriter(comp, compFlags) + } + keys := make([][]byte, 0, len(td)) + for k := range td { + keys = append(keys, []byte(k)) + } + sort.Slice(keys, func(i, j int) bool { return bytes.Compare(keys[i], keys[j]) < 0 }) + + writeLatest := func(tb testing.TB, w ArchiveWriter, td map[string][]upd) { + tb.Helper() + + for _, k := range keys { + upd := td[string(k)] + + err := w.AddWord(k) + require.NoError(tb, err) + err = w.AddWord(upd[0].value) + require.NoError(tb, err) + } + err := w.Compress() + require.NoError(tb, err) + } + + checkLatest := func(tb testing.TB, g ArchiveGetter, td map[string][]upd) { + tb.Helper() + + for _, k := range keys { + upd := td[string(k)] + + fk, _ := g.Next(nil) + fv, _ := g.Next(nil) + require.EqualValues(tb, k, fk) + require.EqualValues(tb, upd[0].value, fv) + } + } + + t.Run("Uncompressed", func(t *testing.T) { + w := openWriter(t, tmp, "uncompressed", CompressNone) + writeLatest(t, w, td) + w.Close() + + decomp, err := compress.NewDecompressor(path.Join(tmp, "uncompressed")) + require.NoError(t, err) + defer decomp.Close() + + ds := (datasize.B * datasize.ByteSize(decomp.Size())).HR() + t.Logf("keys %d, fsize %v compressed fully", len(keys), ds) + + r := NewArchiveGetter(decomp.MakeGetter(), CompressNone) + checkLatest(t, r, td) + }) + t.Run("Compressed", func(t *testing.T) { + w := openWriter(t, tmp, "compressed", CompressKeys|CompressVals) + writeLatest(t, w, td) + w.Close() + + decomp, err := compress.NewDecompressor(path.Join(tmp, "compressed")) + require.NoError(t, err) + defer decomp.Close() + ds := (datasize.B * datasize.ByteSize(decomp.Size())).HR() + t.Logf("keys %d, fsize %v compressed fully", len(keys), ds) + + r := NewArchiveGetter(decomp.MakeGetter(), CompressKeys|CompressVals) + checkLatest(t, r, td) + }) + + t.Run("Compressed Keys", func(t *testing.T) { + w := openWriter(t, tmp, "compressed-keys", CompressKeys) + writeLatest(t, w, td) + w.Close() + + decomp, err := 
compress.NewDecompressor(path.Join(tmp, "compressed-keys")) + require.NoError(t, err) + defer decomp.Close() + ds := (datasize.B * datasize.ByteSize(decomp.Size())).HR() + t.Logf("keys %d, fsize %v compressed keys", len(keys), ds) + + r := NewArchiveGetter(decomp.MakeGetter(), CompressKeys) + checkLatest(t, r, td) + }) + + t.Run("Compressed Vals", func(t *testing.T) { + w := openWriter(t, tmp, "compressed-vals", CompressVals) + writeLatest(t, w, td) + w.Close() + + decomp, err := compress.NewDecompressor(path.Join(tmp, "compressed-vals")) + require.NoError(t, err) + defer decomp.Close() + ds := (datasize.B * datasize.ByteSize(decomp.Size())).HR() + t.Logf("keys %d, fsize %v compressed vals", len(keys), ds) + + r := NewArchiveGetter(decomp.MakeGetter(), CompressVals) + checkLatest(t, r, td) + }) + +} diff --git a/state/bps_tree.go b/state/bps_tree.go new file mode 100644 index 000000000..902cf1e7b --- /dev/null +++ b/state/bps_tree.go @@ -0,0 +1,313 @@ +package state + +import ( + "bytes" + "errors" + "fmt" + + "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" +) + +type indexSeeker interface { + WarmUp(g ArchiveGetter) error + Get(g ArchiveGetter, key []byte) (k []byte, found bool, di uint64, err error) + //Seek(g ArchiveGetter, key []byte) (indexSeekerIterator, error) + Seek(g ArchiveGetter, seek []byte) (k []byte, di uint64, found bool, err error) +} + +type indexSeekerIterator interface { + Next() bool + Di() uint64 + KVFromGetter(g ArchiveGetter) ([]byte, []byte, error) +} + +type dataLookupFunc func(di uint64, g ArchiveGetter) ([]byte, []byte, error) +type keyCmpFunc func(k []byte, di uint64, g ArchiveGetter) (int, []byte, error) + +func NewBpsTree(kv ArchiveGetter, offt *eliasfano32.EliasFano, M uint64, dataLookup dataLookupFunc, keyCmp keyCmpFunc) *BpsTree { + bt := &BpsTree{M: M, offt: offt, dataLookupFunc: dataLookup, keyCmpFunc: keyCmp} + if err := bt.WarmUp(kv); err != nil { + panic(err) + } + return bt 
+} + +type BpsTree struct { + offt *eliasfano32.EliasFano + mx [][]Node + M uint64 + trace bool + naccess uint64 + + dataLookupFunc dataLookupFunc + keyCmpFunc keyCmpFunc +} + +type BpsTreeIterator struct { + t *BpsTree + i uint64 +} + +func (it *BpsTreeIterator) Di() uint64 { + return it.i +} + +func (it *BpsTreeIterator) KVFromGetter(g ArchiveGetter) ([]byte, []byte, error) { + if it == nil { + return nil, nil, fmt.Errorf("iterator is nil") + } + //fmt.Printf("kv from %p getter %p tree %p offt %d\n", it, g, it.t, it.i) + k, v, err := it.t.dataLookupFunc(it.i, g) + if err != nil { + if errors.Is(err, ErrBtIndexLookupBounds) { + return nil, nil, nil + } + return nil, nil, err + } + return k, v, nil +} + +func (it *BpsTreeIterator) Next() bool { + if it.i+1 == it.t.offt.Count() { + return false + } + it.i++ + return true +} + +//// If data[i] == key, returns 0 (equal) and value, nil err +//// if data[i] <> key, returns comparation result and nil value and error -- to be able to compare later +//func (b *BpsTree) matchKeyValue(g ArchiveGetter, i uint64, key []byte) (int, []byte, error) { +// if i >= b.offt.Count() { +// return 0, nil, ErrBtIndexLookupBounds +// } +// if b.trace { +// fmt.Printf("match %d-%x count %d\n", i, key, b.offt.Count()) +// } +// g.Reset(b.offt.Get(i)) +// buf, _ := g.Next(nil) +// if !bytes.Equal(buf, key) { +// return bytes.Compare(buf, key), nil, nil +// } +// val, _ := g.Next(nil) +// return 0, val, nil +//} +// +//func (b *BpsTree) lookupKeyWGetter(g ArchiveGetter, i uint64) ([]byte, uint64) { +// if i >= b.offt.Count() { +// return nil, 0 +// } +// o := b.offt.Get(i) +// g.Reset(o) +// buf, _ := g.Next(nil) +// return buf, o +//} + +type Node struct { + off uint64 + di uint64 + prefix []byte +} + +func (b *BpsTree) traverse(g ArchiveGetter, mx [][]Node, n, di, i uint64) { + if i >= n { + return + } + + for j := uint64(1); j <= b.M; j += b.M / 2 { + ik := i*b.M + j + if ik >= n { + break + } + _, k, err := b.keyCmpFunc(nil, ik, g) + if 
err != nil { + panic(err) + } + if k != nil { + mx[di] = append(mx[di], Node{off: b.offt.Get(ik), prefix: common.Copy(k), di: ik}) + //fmt.Printf("d=%d k %x %d\n", di+1, k, offt) + } + b.traverse(g, mx, n, di, ik) + } +} + +func (b *BpsTree) WarmUp(kv ArchiveGetter) error { + k := b.offt.Count() + d := logBase(k, b.M) + + mx := make([][]Node, d+1) + _, key, err := b.keyCmpFunc(nil, 0, kv) + if err != nil { + return err + } + if key != nil { + mx[0] = append(mx[0], Node{off: b.offt.Get(0), prefix: common.Copy(key)}) + //fmt.Printf("d=%d k %x %d\n", di, k, offt) + } + b.traverse(kv, mx, k, 0, 0) + + if b.trace { + for i := 0; i < len(mx); i++ { + for j := 0; j < len(mx[i]); j++ { + fmt.Printf("mx[%d][%d] %x %d %d\n", i, j, mx[i][j].prefix, mx[i][j].off, mx[i][j].di) + } + } + } + b.mx = mx + return nil +} + +func (b *BpsTree) bs(x []byte) (n Node, dl, dr uint64) { + dr = b.offt.Count() + for d, row := range b.mx { + m, l, r := 0, 0, len(row) //nolint + for l < r { + m = (l + r) >> 1 + n = row[m] + b.naccess++ + + if b.trace { + fmt.Printf("bs[%d][%d] i=%d %x\n", d, m, n.di, n.prefix) + } + switch bytes.Compare(n.prefix, x) { + case 0: + return n, n.di, n.di + case 1: + r = m + dr = n.di + case -1: + l = m + 1 + dl = n.di + } + } + + } + return n, dl, dr +} + +// Seek returns first key which is >= key. +// Found is true iff exact key match is found. 
+// If key is nil, returns first key and found=true +// If found item.key has a prefix of key, returns found=false and item.key +// if key is greater than all keys, returns nil, found=false +func (b *BpsTree) Seek(g ArchiveGetter, key []byte) (skey []byte, di uint64, found bool, err error) { + if key == nil && b.offt.Count() > 0 { + //return &BpsTreeIterator{t: b, i: 0}, nil + var cmp int + cmp, skey, err = b.keyCmpFunc(key, 0, g) + if err != nil { + return nil, 0, false, err + } + return skey, 0, cmp == 0, nil + } + + l, r := uint64(0), b.offt.Count() + if b.trace { + fmt.Printf("seek %x [%d %d]\n", key, l, r) + } + defer func() { + if b.trace { + fmt.Printf("found %x [%d %d] naccsess %d\n", key, l, r, b.naccess) + } + b.naccess = 0 + }() + + n, dl, dr := b.bs(key) + if b.trace { + fmt.Printf("pivot %d n %x [%d %d]\n", n.di, n.prefix, dl, dr) + } + l, r = dl, dr + + var m uint64 + var cmp int + for l < r { + m = (l + r) >> 1 + cmp, skey, err = b.keyCmpFunc(key, m, g) + if err != nil { + return nil, 0, false, err + } + b.naccess++ + if b.trace { + fmt.Printf("lr %x [%d %d]\n", skey, l, r) + } + + switch cmp { + case 0: + return skey, m, true, nil + //return &BpsTreeIterator{t: b, i: m}, nil + case 1: + r = m + case -1: + l = m + 1 + } + } + if l == r { + m = l + //return &BpsTreeIterator{t: b, i: l}, nil + } + + cmp, skey, err = b.keyCmpFunc(key, m, g) + if err != nil { + return nil, 0, false, err + } + return skey, m, cmp == 0, nil +} + +// returns first key which is >= key. 
+// If key is nil, returns first key +// if key is greater than all keys, returns nil +func (b *BpsTree) Get(g ArchiveGetter, key []byte) ([]byte, bool, uint64, error) { + if key == nil && b.offt.Count() > 0 { + k0, v0, err := b.dataLookupFunc(0, g) + if err != nil || k0 != nil { + return nil, false, 0, err + } + return v0, true, 0, nil + } + + l, r := uint64(0), b.offt.Count() + if b.trace { + fmt.Printf("seek %x [%d %d]\n", key, l, r) + } + defer func() { + if b.trace { + fmt.Printf("found %x [%d %d] naccsess %d\n", key, l, r, b.naccess) + } + b.naccess = 0 + }() + + n, dl, dr := b.bs(key) + if b.trace { + fmt.Printf("pivot %d n %x [%d %d]\n", n.di, n.prefix, dl, dr) + } + l, r = dl, dr + + var m uint64 + for l < r { + m = (l + r) >> 1 + cmp, k, err := b.keyCmpFunc(key, m, g) + if err != nil { + return nil, false, 0, err + } + b.naccess++ + if b.trace { + fmt.Printf("lr [%d %d]\n", l, r) + } + + switch cmp { + case 0: + return k, true, m, nil + case 1: + r = m + case -1: + l = m + 1 + } + } + + cmp, k, err := b.keyCmpFunc(key, l, g) + if err != nil || cmp != 0 { + return nil, false, 0, err + } + return k, true, l, nil +} diff --git a/state/btree_index.go b/state/btree_index.go index 00d2b9e13..3130a3e91 100644 --- a/state/btree_index.go +++ b/state/btree_index.go @@ -8,25 +8,35 @@ import ( "errors" "fmt" "math" - "math/bits" "os" "path" "path/filepath" + "sort" + "strings" "time" "github.com/c2h5oh/datasize" "github.com/edsrzf/mmap-go" - "github.com/ledgerwatch/erigon-lib/common/dbg" "github.com/ledgerwatch/log/v3" - - "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/spaolacci/murmur3" "github.com/ledgerwatch/erigon-lib/common" - "github.com/ledgerwatch/erigon-lib/common/length" + "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/ledgerwatch/erigon-lib/common/dbg" "github.com/ledgerwatch/erigon-lib/compress" "github.com/ledgerwatch/erigon-lib/etl" + "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" ) +var UseBpsTree 
bool = true + +const BtreeLogPrefix = "btree" + +// DefaultBtreeM - amount of keys on leaf of BTree +// It will do log2(M) co-located-reads from data file - for binary-search inside leaf +var DefaultBtreeM = uint64(256) +var ErrBtIndexLookupBounds = errors.New("BtIndex: lookup di bounds error") + func logBase(n, base uint64) uint64 { return uint64(math.Ceil(math.Log(float64(n)) / math.Log(float64(base)))) } @@ -55,29 +65,31 @@ type node struct { } type Cursor struct { - ctx context.Context - ix *btAlloc - - key []byte - value []byte - d uint64 + btt *BtIndex + ctx context.Context + getter ArchiveGetter + key []byte + value []byte + d uint64 } -func (a *btAlloc) newCursor(ctx context.Context, k, v []byte, d uint64) *Cursor { - return &Cursor{ - ctx: ctx, - key: common.Copy(k), - value: common.Copy(v), - d: d, - ix: a, - } -} +//getter should be alive all the time of cursor usage +//Key and value is valid until cursor.Next is called +//func NewCursor(ctx context.Context, k, v []byte, d uint64, g ArchiveGetter) *Cursor { +// return &Cursor{ +// ctx: ctx, +// getter: g, +// key: common.Copy(k), +// value: common.Copy(v), +// d: d, +// } +//} func (c *Cursor) Key() []byte { return c.key } -func (c *Cursor) Ordinal() uint64 { +func (c *Cursor) Di() uint64 { return c.d } @@ -86,15 +98,24 @@ func (c *Cursor) Value() []byte { } func (c *Cursor) Next() bool { - if c.d > c.ix.K-1 { + if !c.next() { return false } - k, v, err := c.ix.dataLookup(c.d + 1) + + key, value, err := c.btt.dataLookup(c.d, c.getter) if err != nil { return false } - c.key = common.Copy(k) - c.value = common.Copy(v) + c.key, c.value = key, value + return true +} + +// next returns if another key/value pair is available int that index. 
+// moves pointer d to next element if successful +func (c *Cursor) next() bool { + if c.d+1 == c.btt.ef.Count() { + return false + } c.d++ return true } @@ -111,25 +132,29 @@ type btAlloc struct { naccess uint64 trace bool - dataLookup func(di uint64) ([]byte, []byte, error) + dataLookup dataLookupFunc + keyCmp keyCmpFunc } -func newBtAlloc(k, M uint64, trace bool) *btAlloc { +func newBtAlloc(k, M uint64, trace bool, dataLookup dataLookupFunc, keyCmp keyCmpFunc) *btAlloc { if k == 0 { return nil } d := logBase(k, M) a := &btAlloc{ - vx: make([]uint64, d+1), - sons: make([][]uint64, d+1), - cursors: make([]markupCursor, d), - nodes: make([][]node, d), - M: M, - K: k, - d: d, - trace: trace, + vx: make([]uint64, d+1), + sons: make([][]uint64, d+1), + cursors: make([]markupCursor, d), + nodes: make([][]node, d), + M: M, + K: k, + d: d, + trace: trace, + dataLookup: dataLookup, + keyCmp: keyCmp, } + if trace { fmt.Printf("k=%d d=%d, M=%d\n", k, d, M) } @@ -189,86 +214,6 @@ func newBtAlloc(k, M uint64, trace bool) *btAlloc { return a } -// nolint -// another implementation of traverseDfs supposed to be a bit cleaner but buggy yet -func (a *btAlloc) traverseTrick() { - for l := 0; l < len(a.sons)-1; l++ { - if len(a.sons[l]) < 2 { - panic("invalid btree allocation markup") - } - a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} - a.nodes[l] = make([]node, 0) - } - - lf := a.cursors[len(a.cursors)-1] - c := a.cursors[(len(a.cursors) - 2)] - - var d uint64 - var fin bool - - lf.di = d - lf.si++ - d++ - a.cursors[len(a.cursors)-1] = lf - - moved := true - for int(c.p) <= len(a.sons[c.l]) { - if fin || d > a.K { - break - } - c, lf = a.cursors[c.l], a.cursors[lf.l] - - c.di = d - c.si++ - - sons := a.sons[lf.l][lf.p] - for i := uint64(1); i < sons; i++ { - lf.si++ - d++ - } - lf.di = d - d++ - - a.nodes[lf.l] = append(a.nodes[lf.l], node{p: lf.p, s: lf.si, d: lf.di}) - a.nodes[c.l] = append(a.nodes[c.l], node{p: c.p, s: c.si, d: c.di}) - a.cursors[lf.l] = lf - a.cursors[c.l] 
= c - - for l := lf.l; l >= 0; l-- { - sc := a.cursors[l] - sons, gsons := a.sons[sc.l][sc.p-1], a.sons[sc.l][sc.p] - if l < c.l && moved { - sc.di = d - a.nodes[sc.l] = append(a.nodes[sc.l], node{d: sc.di}) - sc.si++ - d++ - } - moved = (sc.si-1)/gsons != sc.si/gsons - if sc.si/gsons >= sons { - sz := uint64(len(a.sons[sc.l]) - 1) - if sc.p+2 > sz { - fin = l == lf.l - break - } else { - sc.p += 2 - sc.si, sc.di = 0, 0 - } - //moved = true - } - if l == lf.l { - sc.si++ - sc.di = d - d++ - } - a.cursors[l] = sc - if l == 0 { - break - } - } - moved = false - } -} - func (a *btAlloc) traverseDfs() { for l := 0; l < len(a.sons)-1; l++ { a.cursors[l] = markupCursor{uint64(l), 1, 0, 0} @@ -411,22 +356,23 @@ func (a *btAlloc) traverseDfs() { } } -func (a *btAlloc) bsKey(x []byte, l, r uint64) (*Cursor, error) { +func (a *btAlloc) bsKey(x []byte, l, r uint64, g ArchiveGetter) (k []byte, di uint64, found bool, err error) { + //i := 0 + var cmp int for l <= r { - di := (l + r) >> 1 + di = (l + r) >> 1 - mk, value, err := a.dataLookup(di) + cmp, k, err = a.keyCmp(x, di, g) a.naccess++ - cmp := bytes.Compare(mk, x) switch { case err != nil: if errors.Is(err, ErrBtIndexLookupBounds) { - return nil, nil + return k, 0, false, nil } - return nil, err + return k, 0, false, err case cmp == 0: - return a.newCursor(context.TODO(), mk, value, di), nil + return k, di, true, err case cmp == -1: l = di + 1 default: @@ -436,14 +382,7 @@ func (a *btAlloc) bsKey(x []byte, l, r uint64) (*Cursor, error) { break } } - k, v, err := a.dataLookup(l) - if err != nil { - if errors.Is(err, ErrBtIndexLookupBounds) { - return nil, nil - } - return nil, fmt.Errorf("key >= %x was not found. 
%w", x, err) - } - return a.newCursor(context.TODO(), k, v, l), nil + return k, l, true, nil } func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) { @@ -452,9 +391,8 @@ func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) for l < r { m = (l + r) >> 1 - - a.naccess++ cmp := bytes.Compare(a.nodes[i][m].key, x) + a.naccess++ switch { case cmp == 0: return a.nodes[i][m], int64(m), int64(m) @@ -473,17 +411,28 @@ func (a *btAlloc) bsNode(i, l, r uint64, x []byte) (n node, lm int64, rm int64) // find position of key with node.di <= d at level lvl func (a *btAlloc) seekLeast(lvl, d uint64) uint64 { - for i := range a.nodes[lvl] { - if a.nodes[lvl][i].d >= d { - return uint64(i) - } + //TODO: this seems calculatable from M and tree depth + return uint64(sort.Search(len(a.nodes[lvl]), func(i int) bool { + return a.nodes[lvl][i].d >= d + })) +} + +// Get returns value if found exact match of key +// TODO k as return is useless(almost) +func (a *btAlloc) Get(g ArchiveGetter, key []byte) (k []byte, found bool, di uint64, err error) { + k, di, found, err = a.Seek(g, key) + if err != nil { + return nil, false, 0, err + } + if !found || !bytes.Equal(k, key) { + return nil, false, 0, nil } - return uint64(len(a.nodes[lvl])) + return k, found, di, nil } -func (a *btAlloc) Seek(ik []byte) (*Cursor, error) { +func (a *btAlloc) Seek(g ArchiveGetter, seek []byte) (k []byte, di uint64, found bool, err error) { if a.trace { - fmt.Printf("seek key %x\n", ik) + fmt.Printf("seek key %x\n", seek) } var ( @@ -499,29 +448,27 @@ func (a *btAlloc) Seek(ik []byte) (*Cursor, error) { maxD = ln.d break } - ln, lm, rm = a.bsNode(uint64(l), L, R, ik) + ln, lm, rm = a.bsNode(uint64(l), L, R, seek) if ln.key == nil { // should return node which is nearest to key from the left so never nil if a.trace { fmt.Printf("found nil key %x pos_range[%d-%d] naccess_ram=%d\n", l, lm, rm, a.naccess) } - return nil, fmt.Errorf("bt index nil node at level %d", l) 
+ return nil, 0, false, fmt.Errorf("bt index nil node at level %d", l) } - - switch bytes.Compare(ln.key, ik) { + //fmt.Printf("b: %x, %x\n", ik, ln.key) + cmp := bytes.Compare(ln.key, seek) + switch cmp { case 1: // key > ik maxD = ln.d case -1: // key < ik minD = ln.d case 0: if a.trace { - fmt.Printf("found key %x v=%x naccess_ram=%d\n", ik, ln.val /*level[m].d,*/, a.naccess) + fmt.Printf("found key %x v=%x naccess_ram=%d\n", seek, ln.val /*level[m].d,*/, a.naccess) } - return a.newCursor(context.TODO(), common.Copy(ln.key), common.Copy(ln.val), ln.d), nil + return ln.key, ln.d, true, nil } - if rm-lm >= 1 { - break - } if lm >= 0 { minD = a.nodes[l][lm].d L = level[lm].fc @@ -541,27 +488,33 @@ func (a *btAlloc) Seek(ik []byte) (*Cursor, error) { } } + if maxD-minD <= a.M+2 { + break + } + if a.trace { fmt.Printf("range={%x d=%d p=%d} (%d, %d) L=%d naccess_ram=%d\n", ln.key, ln.d, ln.p, minD, maxD, l, a.naccess) } } a.naccess = 0 // reset count before actually go to disk - cursor, err := a.bsKey(ik, minD, maxD) + if maxD-minD > a.M+2 { + log.Warn("too big binary search", "minD", minD, "maxD", maxD, "keysCount", a.K, "key", fmt.Sprintf("%x", seek)) + //return nil, nil, 0, fmt.Errorf("too big binary search: minD=%d, maxD=%d, keysCount=%d, key=%x", minD, maxD, a.K, ik) + } + k, di, found, err = a.bsKey(seek, minD, maxD, g) if err != nil { if a.trace { - fmt.Printf("key %x not found\n", ik) + fmt.Printf("key %x not found\n", seek) } - return nil, err - } - - if a.trace { - fmt.Printf("finally found key %x v=%x naccess_disk=%d\n", cursor.key, cursor.value, a.naccess) + return nil, 0, false, err } - return cursor, nil + return k, di, found, nil } -func (a *btAlloc) fillSearchMx() { +func (a *btAlloc) WarmUp(gr ArchiveGetter) error { + a.traverseDfs() + for i, n := range a.nodes { if a.trace { fmt.Printf("D%d |%d| ", i, len(n)) @@ -574,84 +527,41 @@ func (a *btAlloc) fillSearchMx() { break } - kb, v, err := a.dataLookup(s.d) + kb, v, err := a.dataLookup(s.d, gr) if err 
!= nil { fmt.Printf("d %d not found %v\n", s.d, err) } - a.nodes[i][j].key = common.Copy(kb) - a.nodes[i][j].val = common.Copy(v) + a.nodes[i][j].key = kb + a.nodes[i][j].val = v } if a.trace { fmt.Printf("\n") } } + return nil } -// deprecated -type BtIndexReader struct { - index *BtIndex -} - -func NewBtIndexReader(index *BtIndex) *BtIndexReader { - return &BtIndexReader{ - index: index, - } -} - -// Lookup wraps index Lookup -func (r *BtIndexReader) Lookup(key []byte) uint64 { - if r.index != nil { - return r.index.Lookup(key) - } - return 0 -} - -func (r *BtIndexReader) Lookup2(key1, key2 []byte) uint64 { - fk := make([]byte, 52) - copy(fk[:length.Addr], key1) - copy(fk[length.Addr:], key2) - - if r.index != nil { - return r.index.Lookup(fk) - } - return 0 -} +type BtIndexWriter struct { + maxOffset uint64 + prevOffset uint64 + minDelta uint64 + indexW *bufio.Writer + indexF *os.File + ef *eliasfano32.EliasFano + collector *etl.Collector -func (r *BtIndexReader) Seek(x []byte) (*Cursor, error) { - if r.index != nil { - cursor, err := r.index.alloc.Seek(x) - if err != nil { - return nil, fmt.Errorf("seek key %x: %w", x, err) - } - return cursor, nil - } - return nil, fmt.Errorf("seek has been failed") -} + args BtIndexWriterArgs -func (r *BtIndexReader) Empty() bool { - return r.index.Empty() -} + indexFileName string + tmpFilePath string -type BtIndexWriter struct { - built bool - lvl log.Lvl - maxOffset uint64 - prevOffset uint64 - minDelta uint64 - indexW *bufio.Writer - indexF *os.File - bucketCollector *etl.Collector // Collector that sorts by buckets - - indexFileName string - indexFile, tmpFilePath string - - tmpDir string numBuf [8]byte - keyCount uint64 - etlBufLimit datasize.ByteSize - bytesPerRec int - logger log.Logger - noFsync bool // fsync is enabled by default, but tests can manually disable + keysWritten uint64 + + built bool + lvl log.Lvl + logger log.Logger + noFsync bool // fsync is enabled by default, but tests can manually disable } type 
BtIndexWriterArgs struct { @@ -659,34 +569,57 @@ type BtIndexWriterArgs struct { TmpDir string KeyCount int EtlBufLimit datasize.ByteSize + Lvl log.Lvl } -const BtreeLogPrefix = "btree" - // NewBtIndexWriter creates a new BtIndexWriter instance with given number of keys // Typical bucket size is 100 - 2048, larger bucket sizes result in smaller representations of hash functions, at a cost of slower access // salt parameters is used to randomise the hash function construction, to ensure that different Erigon instances (nodes) // are likely to use different hash function, to collision attacks are unlikely to slow down any meaningful number of nodes at the same time func NewBtIndexWriter(args BtIndexWriterArgs, logger log.Logger) (*BtIndexWriter, error) { - btw := &BtIndexWriter{lvl: log.LvlDebug, logger: logger} - btw.tmpDir = args.TmpDir - btw.indexFile = args.IndexFile - btw.tmpFilePath = args.IndexFile + ".tmp" + if args.EtlBufLimit == 0 { + args.EtlBufLimit = etl.BufferOptimalSize + } + if args.Lvl == 0 { + args.Lvl = log.LvlTrace + } - _, fname := filepath.Split(btw.indexFile) + btw := &BtIndexWriter{lvl: args.Lvl, logger: logger, args: args, + tmpFilePath: args.IndexFile + ".tmp"} + + _, fname := filepath.Split(btw.args.IndexFile) btw.indexFileName = fname - btw.etlBufLimit = args.EtlBufLimit - if btw.etlBufLimit == 0 { - btw.etlBufLimit = etl.BufferOptimalSize - } - btw.bucketCollector = etl.NewCollector(BtreeLogPrefix+" "+fname, btw.tmpDir, etl.NewSortableBuffer(btw.etlBufLimit), logger) - btw.bucketCollector.LogLvl(log.LvlDebug) + btw.collector = etl.NewCollector(BtreeLogPrefix+" "+fname, btw.args.TmpDir, etl.NewSortableBuffer(btw.args.EtlBufLimit), logger) + btw.collector.LogLvl(btw.args.Lvl) - btw.maxOffset = 0 return btw, nil } +func (btw *BtIndexWriter) AddKey(key []byte, offset uint64) error { + if btw.built { + return fmt.Errorf("cannot add keys after perfect hash function had been built") + } + + binary.BigEndian.PutUint64(btw.numBuf[:], offset) + if 
offset > btw.maxOffset { + btw.maxOffset = offset + } + if btw.keysWritten > 0 { + delta := offset - btw.prevOffset + if btw.keysWritten == 1 || delta < btw.minDelta { + btw.minDelta = delta + } + } + + if err := btw.collector.Collect(key, btw.numBuf[:]); err != nil { + return err + } + btw.keysWritten++ + btw.prevOffset = offset + return nil +} + // loadFuncBucket is required to satisfy the type etl.LoadFunc type, to use with collector.Load func (btw *BtIndexWriter) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, _ etl.LoadNextFunc) error { // k is the BigEndian encoding of the bucket number, and the v is the key that is assigned into that bucket @@ -699,9 +632,13 @@ func (btw *BtIndexWriter) loadFuncBucket(k, v []byte, _ etl.CurrentTableReader, // if _, err := btw.indexW.Write(k); err != nil { // return err // } - if _, err := btw.indexW.Write(v[8-btw.bytesPerRec:]); err != nil { - return err - } + //if _, err := btw.indexW.Write(v); err != nil { + // return err + //} + //copy(btw.numBuf[8-btw.bytesPerRec:], v) + //btw.ef.AddOffset(binary.BigEndian.Uint64(btw.numBuf[:])) + + btw.ef.AddOffset(binary.BigEndian.Uint64(v)) //btw.keys = append(btw.keys, binary.BigEndian.Uint64(k), binary.BigEndian.Uint64(k[8:])) //btw.vals = append(btw.vals, binary.BigEndian.Uint64(v)) @@ -714,34 +651,28 @@ func (btw *BtIndexWriter) Build() error { if btw.built { return fmt.Errorf("already built") } - //if btw.keysAdded != btw.keyCount { - // return fmt.Errorf("expected keys %d, got %d", btw.keyCount, btw.keysAdded) - //} var err error if btw.indexF, err = os.Create(btw.tmpFilePath); err != nil { - return fmt.Errorf("create index file %s: %w", btw.indexFile, err) + return fmt.Errorf("create index file %s: %w", btw.args.IndexFile, err) } defer btw.indexF.Close() btw.indexW = bufio.NewWriterSize(btw.indexF, etl.BufIOSize) - // Write number of keys - binary.BigEndian.PutUint64(btw.numBuf[:], btw.keyCount) - if _, err = btw.indexW.Write(btw.numBuf[:]); err != nil { - return 
fmt.Errorf("write number of keys: %w", err) - } - // Write number of bytes per index record - btw.bytesPerRec = common.BitLenToByteLen(bits.Len64(btw.maxOffset)) - if err = btw.indexW.WriteByte(byte(btw.bytesPerRec)); err != nil { - return fmt.Errorf("write bytes per record: %w", err) - } + defer btw.collector.Close() + log.Log(btw.args.Lvl, "[index] calculating", "file", btw.indexFileName) - defer btw.bucketCollector.Close() - log.Log(btw.lvl, "[index] calculating", "file", btw.indexFileName) - if err := btw.bucketCollector.Load(nil, "", btw.loadFuncBucket, etl.TransformArgs{}); err != nil { - return err + if btw.keysWritten > 0 { + btw.ef = eliasfano32.NewEliasFano(btw.keysWritten, btw.maxOffset) + if err := btw.collector.Load(nil, "", btw.loadFuncBucket, etl.TransformArgs{}); err != nil { + return err + } + btw.ef.Build() + if err := btw.ef.Write(btw.indexW); err != nil { + return fmt.Errorf("[index] write ef: %w", err) + } } - btw.logger.Log(btw.lvl, "[index] write", "file", btw.indexFileName) + btw.logger.Log(btw.args.Lvl, "[index] write", "file", btw.indexFileName) btw.built = true if err = btw.indexW.Flush(); err != nil { @@ -753,7 +684,7 @@ func (btw *BtIndexWriter) Build() error { if err = btw.indexF.Close(); err != nil { return err } - if err = os.Rename(btw.tmpFilePath, btw.indexFile); err != nil { + if err = os.Rename(btw.tmpFilePath, btw.args.IndexFile); err != nil { return err } return nil @@ -779,129 +710,84 @@ func (btw *BtIndexWriter) Close() { if btw.indexF != nil { btw.indexF.Close() } - if btw.bucketCollector != nil { - btw.bucketCollector.Close() + if btw.collector != nil { + btw.collector.Close() } //if btw.offsetCollector != nil { // btw.offsetCollector.Close() //} } -func (btw *BtIndexWriter) AddKey(key []byte, offset uint64) error { - if btw.built { - return fmt.Errorf("cannot add keys after perfect hash function had been built") - } - - binary.BigEndian.PutUint64(btw.numBuf[:], offset) - if offset > btw.maxOffset { - btw.maxOffset = offset 
- } - if btw.keyCount > 0 { - delta := offset - btw.prevOffset - if btw.keyCount == 1 || delta < btw.minDelta { - btw.minDelta = delta - } - } - - if err := btw.bucketCollector.Collect(key, btw.numBuf[:]); err != nil { - return err - } - btw.keyCount++ - btw.prevOffset = offset - return nil -} - type BtIndex struct { - alloc *btAlloc - m mmap.MMap - data []byte - file *os.File - size int64 - modTime time.Time - filePath string - keyCount uint64 - bytesPerRec int - dataoffset uint64 - auxBuf []byte + alloc *btAlloc // pointless? + bplus *BpsTree + m mmap.MMap + data []byte + ef *eliasfano32.EliasFano + file *os.File + size int64 + modTime time.Time + filePath string + + // TODO do not sotre decompressor ptr in index, pass ArchiveGetter always instead of decomp directly + compressed FileCompression decompressor *compress.Decompressor - getter *compress.Getter } -func CreateBtreeIndex(indexPath, dataPath string, M uint64, logger log.Logger) (*BtIndex, error) { - err := BuildBtreeIndex(dataPath, indexPath, logger) +func CreateBtreeIndex(indexPath, dataPath string, M uint64, compressed FileCompression, seed uint32, logger log.Logger) (*BtIndex, error) { + err := BuildBtreeIndex(dataPath, indexPath, compressed, seed, logger) if err != nil { return nil, err } - return OpenBtreeIndex(indexPath, dataPath, M) + return OpenBtreeIndex(indexPath, dataPath, M, compressed, false) } -var DefaultBtreeM = uint64(2048) - -func CreateBtreeIndexWithDecompressor(indexPath string, M uint64, decompressor *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) (*BtIndex, error) { - err := BuildBtreeIndexWithDecompressor(indexPath, decompressor, p, tmpdir, logger) +func CreateBtreeIndexWithDecompressor(indexPath string, M uint64, decompressor *compress.Decompressor, compressed FileCompression, seed uint32, ps *background.ProgressSet, tmpdir string, logger log.Logger) (*BtIndex, error) { + err := BuildBtreeIndexWithDecompressor(indexPath, decompressor, compressed, 
ps, tmpdir, seed, logger) if err != nil { return nil, err } - return OpenBtreeIndexWithDecompressor(indexPath, M, decompressor) + return OpenBtreeIndexWithDecompressor(indexPath, M, decompressor, compressed) } -func BuildBtreeIndexWithDecompressor(indexPath string, kv *compress.Decompressor, p *background.Progress, tmpdir string, logger log.Logger) error { - defer kv.EnableReadAhead().DisableReadAhead() - - args := BtIndexWriterArgs{ - IndexFile: indexPath, - TmpDir: tmpdir, - } - - iw, err := NewBtIndexWriter(args, logger) +// Opens .kv at dataPath and generates index over it to file 'indexPath' +func BuildBtreeIndex(dataPath, indexPath string, compressed FileCompression, seed uint32, logger log.Logger) error { + decomp, err := compress.NewDecompressor(dataPath) if err != nil { return err } + defer decomp.Close() + return BuildBtreeIndexWithDecompressor(indexPath, decomp, compressed, background.NewProgressSet(), filepath.Dir(indexPath), seed, logger) +} - getter := kv.MakeGetter() - getter.Reset(0) +func OpenBtreeIndex(indexPath, dataPath string, M uint64, compressed FileCompression, trace bool) (*BtIndex, error) { + kv, err := compress.NewDecompressor(dataPath) + if err != nil { + return nil, err + } + return OpenBtreeIndexWithDecompressor(indexPath, M, kv, compressed) +} - key := make([]byte, 0, 64) - ks := make(map[int]int) +func BuildBtreeIndexWithDecompressor(indexPath string, kv *compress.Decompressor, compression FileCompression, ps *background.ProgressSet, tmpdir string, salt uint32, logger log.Logger) error { + _, indexFileName := filepath.Split(indexPath) + p := ps.AddNew(indexFileName, uint64(kv.Count()/2)) + defer ps.Delete(p) - var pos, kp uint64 - emptys := 0 - for getter.HasNext() { - p.Processed.Add(1) - key, kp = getter.Next(key[:0]) - err = iw.AddKey(key, pos) + defer kv.EnableReadAhead().DisableReadAhead() + bloomPath := strings.TrimSuffix(indexPath, ".bt") + ".kvei" + var bloom *bloomFilter + var err error + if kv.Count() >= 2 { + bloom, err = 
NewBloom(uint64(kv.Count()/2), bloomPath) if err != nil { return err } - - pos, _ = getter.Skip() - if pos-kp == 1 { - ks[len(key)]++ - emptys++ - } - } - //fmt.Printf("emptys %d %#+v\n", emptys, ks) - - if err := iw.Build(); err != nil { - return err - } - iw.Close() - return nil -} - -// Opens .kv at dataPath and generates index over it to file 'indexPath' -func BuildBtreeIndex(dataPath, indexPath string, logger log.Logger) error { - decomp, err := compress.NewDecompressor(dataPath) - if err != nil { - return err } - defer decomp.Close() - - defer decomp.EnableReadAhead().DisableReadAhead() + hasher := murmur3.New128WithSeed(salt) args := BtIndexWriterArgs{ IndexFile: indexPath, - TmpDir: filepath.Dir(indexPath), + TmpDir: tmpdir, } iw, err := NewBtIndexWriter(args, logger) @@ -910,31 +796,47 @@ func BuildBtreeIndex(dataPath, indexPath string, logger log.Logger) error { } defer iw.Close() - getter := decomp.MakeGetter() + getter := NewArchiveGetter(kv.MakeGetter(), compression) getter.Reset(0) key := make([]byte, 0, 64) - var pos uint64 + + //var kp, emptys uint64 + //ks := make(map[int]int) for getter.HasNext() { key, _ = getter.Next(key[:0]) err = iw.AddKey(key, pos) if err != nil { return err } - + hasher.Reset() + hasher.Write(key) //nolint:errcheck + hi, _ := hasher.Sum128() + bloom.AddHash(hi) pos, _ = getter.Skip() - } - decomp.Close() + //if pos-kp == 1 { + // ks[len(key)]++ + // emptys++ + //} + p.Processed.Add(1) + } + //logger.Warn("empty keys", "key lengths", ks, "total emptys", emptys, "total", kv.Count()/2) if err := iw.Build(); err != nil { return err } - iw.Close() + + if bloom != nil { + if err := bloom.Build(); err != nil { + return err + } + } return nil } -func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Decompressor) (*BtIndex, error) { +// For now, M is not stored inside index file. 
+func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Decompressor, compress FileCompression) (*BtIndex, error) { s, err := os.Stat(indexPath) if err != nil { return nil, err @@ -944,13 +846,18 @@ func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Dec filePath: indexPath, size: s.Size(), modTime: s.ModTime(), - auxBuf: make([]byte, 64), + + decompressor: kv, + compressed: compress, } idx.file, err = os.Open(indexPath) if err != nil { return nil, err } + if idx.size == 0 { + return idx, nil + } idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0) if err != nil { @@ -958,118 +865,82 @@ func OpenBtreeIndexWithDecompressor(indexPath string, M uint64, kv *compress.Dec } idx.data = idx.m[:idx.size] - // Read number of keys and bytes per record - pos := 8 - idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) - if idx.keyCount == 0 { + var pos int + if len(idx.data[pos:]) == 0 { return idx, nil } - idx.bytesPerRec = int(idx.data[pos]) - pos += 1 + defer idx.decompressor.EnableReadAhead().DisableReadAhead() - //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) - //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) + idx.ef, pos = eliasfano32.ReadEliasFano(idx.data[pos:]) - idx.getter = kv.MakeGetter() + getter := NewArchiveGetter(idx.decompressor.MakeGetter(), idx.compressed) - idx.dataoffset = uint64(pos) - idx.alloc = newBtAlloc(idx.keyCount, M, false) - if idx.alloc != nil { - idx.alloc.dataLookup = idx.dataLookup - idx.alloc.traverseDfs() - defer idx.decompressor.EnableReadAhead().DisableReadAhead() - idx.alloc.fillSearchMx() + //fmt.Printf("open btree index %s with %d keys b+=%t data compressed %t\n", indexPath, idx.ef.Count(), UseBpsTree, idx.compressed) + switch UseBpsTree { + case true: + idx.bplus = NewBpsTree(getter, idx.ef, M, idx.dataLookup, idx.keyCmp) + default: + idx.alloc = newBtAlloc(idx.ef.Count(), M, false, idx.dataLookup, idx.keyCmp) + if idx.alloc != nil { + 
idx.alloc.WarmUp(getter) + } } + return idx, nil } -func OpenBtreeIndex(indexPath, dataPath string, M uint64) (*BtIndex, error) { - s, err := os.Stat(indexPath) - if err != nil { - return nil, err - } - - idx := &BtIndex{ - filePath: indexPath, - size: s.Size(), - modTime: s.ModTime(), - auxBuf: make([]byte, 64), - } - - idx.file, err = os.Open(indexPath) - if err != nil { - return nil, err - } - - idx.m, err = mmap.MapRegion(idx.file, int(idx.size), mmap.RDONLY, 0, 0) - if err != nil { - return nil, err +// dataLookup fetches key and value from data file by di (data index) +// di starts from 0 so di is never >= keyCount +func (b *BtIndex) dataLookup(di uint64, g ArchiveGetter) ([]byte, []byte, error) { + if di >= b.ef.Count() { + return nil, nil, fmt.Errorf("%w: keyCount=%d, but key %d requested. file: %s", ErrBtIndexLookupBounds, b.ef.Count(), di, b.FileName()) } - idx.data = idx.m[:idx.size] - - // Read number of keys and bytes per record - pos := 8 - idx.keyCount = binary.BigEndian.Uint64(idx.data[:pos]) - idx.bytesPerRec = int(idx.data[pos]) - pos += 1 - // offset := int(idx.keyCount) * idx.bytesPerRec //+ (idx.keySize * int(idx.keyCount)) - // if offset < 0 { - // return nil, fmt.Errorf("offset is: %d which is below zero, the file: %s is broken", offset, indexPath) - // } - - //p := (*[]byte)(unsafe.Pointer(&idx.data[pos])) - //l := int(idx.keyCount)*idx.bytesPerRec + (16 * int(idx.keyCount)) - - idx.decompressor, err = compress.NewDecompressor(dataPath) - if err != nil { - idx.Close() - return nil, err + offset := b.ef.Get(di) + g.Reset(offset) + if !g.HasNext() { + return nil, nil, fmt.Errorf("pair %d/%d key not found, file: %s/%s", di, b.ef.Count(), b.FileName(), g.FileName()) } - idx.getter = idx.decompressor.MakeGetter() - idx.dataoffset = uint64(pos) - idx.alloc = newBtAlloc(idx.keyCount, M, false) - if idx.alloc != nil { - idx.alloc.dataLookup = idx.dataLookup - idx.alloc.traverseDfs() - defer idx.decompressor.EnableReadAhead().DisableReadAhead() - 
idx.alloc.fillSearchMx() + k, _ := g.Next(nil) + if !g.HasNext() { + return nil, nil, fmt.Errorf("pair %d/%d value not found, file: %s/%s", di, b.ef.Count(), b.FileName(), g.FileName()) } - return idx, nil + v, _ := g.Next(nil) + return k, v, nil } -var ErrBtIndexLookupBounds = errors.New("BtIndex: lookup di bounds error") - -// dataLookup fetches key and value from data file by di (data index) -// di starts from 0 so di is never >= keyCount -func (b *BtIndex) dataLookup(di uint64) ([]byte, []byte, error) { - if di >= b.keyCount { - return nil, nil, fmt.Errorf("%w: keyCount=%d, item %d requested. file: %s", ErrBtIndexLookupBounds, b.keyCount, di+1, b.FileName()) - } - p := int(b.dataoffset) + int(di)*b.bytesPerRec - if len(b.data) < p+b.bytesPerRec { - return nil, nil, fmt.Errorf("data lookup gone too far (%d after %d). keyCount=%d, requesed item %d. file: %s", p+b.bytesPerRec-len(b.data), len(b.data), b.keyCount, di, b.FileName()) +// comparing `k` with item of index `di`. using buffer `kBuf` to avoid allocations +func (b *BtIndex) keyCmp(k []byte, di uint64, g ArchiveGetter) (int, []byte, error) { + if di >= b.ef.Count() { + return 0, nil, fmt.Errorf("%w: keyCount=%d, but key %d requested. file: %s", ErrBtIndexLookupBounds, b.ef.Count(), di+1, b.FileName()) } - var aux [8]byte - dst := aux[8-b.bytesPerRec:] - copy(dst, b.data[p:p+b.bytesPerRec]) - - offset := binary.BigEndian.Uint64(aux[:]) - b.getter.Reset(offset) - if !b.getter.HasNext() { - return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. 
file: %s", di, b.keyCount, b.FileName()) + offset := b.ef.Get(di) + g.Reset(offset) + if !g.HasNext() { + return 0, nil, fmt.Errorf("key at %d/%d not found, file: %s", di, b.ef.Count(), b.FileName()) } - key, kp := b.getter.Next(nil) + var res []byte + res, _ = g.Next(res[:0]) + + //TODO: use `b.getter.Match` after https://github.com/ledgerwatch/erigon/issues/7855 + return bytes.Compare(res, k), res, nil + //return b.getter.Match(k), result, nil +} - if !b.getter.HasNext() { - return nil, nil, fmt.Errorf("pair %d not found. keyCount=%d. file: %s", di, b.keyCount, b.FileName()) +// getter should be alive all the time of cursor usage +// Key and value is valid until cursor.Next is called +func (b *BtIndex) newCursor(ctx context.Context, k, v []byte, d uint64, g ArchiveGetter) *Cursor { + return &Cursor{ + btt: b, + ctx: ctx, + getter: g, + key: common.Copy(k), + value: common.Copy(v), + d: d, } - val, vp := b.getter.Next(nil) - _, _ = kp, vp - return key, val, nil } func (b *BtIndex) Size() int64 { return b.size } @@ -1080,17 +951,24 @@ func (b *BtIndex) FilePath() string { return b.filePath } func (b *BtIndex) FileName() string { return path.Base(b.filePath) } -func (b *BtIndex) Empty() bool { return b == nil || b.keyCount == 0 } +func (b *BtIndex) Empty() bool { return b == nil || b.ef == nil || b.ef.Count() == 0 } -func (b *BtIndex) KeyCount() uint64 { return b.keyCount } +func (b *BtIndex) KeyCount() uint64 { + if b.Empty() { + return 0 + } + return b.ef.Count() +} func (b *BtIndex) Close() { if b == nil { return } if b.file != nil { - if err := b.m.Unmap(); err != nil { - log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", b.FileName(), "stack", dbg.Stack()) + if b.m != nil { + if err := b.m.Unmap(); err != nil { + log.Log(dbg.FileCloseLogLevel, "unmap", "err", err, "file", b.FileName(), "stack", dbg.Stack()) + } } b.m = nil if err := b.file.Close(); err != nil { @@ -1098,49 +976,135 @@ func (b *BtIndex) Close() { } b.file = nil } + if b.decompressor != 
nil { b.decompressor.Close() b.decompressor = nil } } -func (b *BtIndex) Seek(x []byte) (*Cursor, error) { - if b.alloc == nil { - return nil, nil +// Get - exact match of key. `k == nil` - means not found +func (b *BtIndex) Get(lookup []byte, gr ArchiveGetter) (k, v []byte, found bool, err error) { + // TODO: optimize by "push-down" - instead of using seek+compare, alloc can have method Get which will return nil if key doesn't exists + // alternativaly: can allocate cursor on-stack + // it := Iter{} // allocation on stack + // it.Initialize(file) + + if b.Empty() { + return k, v, false, nil } - cursor, err := b.alloc.Seek(x) + + var index uint64 + // defer func() { + // fmt.Printf("[Bindex][%s] Get (%t) '%x' -> '%x' di=%d err %v\n", b.FileName(), found, lookup, v, index, err) + // }() + if UseBpsTree { + if b.bplus == nil { + panic(fmt.Errorf("Get: `b.bplus` is nil: %s", gr.FileName())) + } + //it, err := b.bplus.Seek(gr, lookup) + //if err != nil { + // return k, v, false, err + //} + //k, v, err := it.KVFromGetter(gr) + //if err != nil { + // return nil, nil, false, fmt.Errorf("kv from getter: %w", err) + //} + //if !bytes.Equal(k, lookup) { + // return nil, nil, false, nil + //} + //index = it.i + // v is actual value, not offset. + + // weak assumption that k will be ignored and used lookup instead. + // since fetching k and v from data file is required to use Getter. + // Why to do Getter.Reset twice when we can get kv right there. 
+ + k, found, index, err = b.bplus.Get(gr, lookup) + } else { + if b.alloc == nil { + return k, v, false, err + } + k, found, index, err = b.alloc.Get(gr, lookup) + } + if err != nil || !found { + if errors.Is(err, ErrBtIndexLookupBounds) { + return k, v, false, nil + } + return nil, nil, false, err + } + + // this comparation should be done by index get method, and in case of mismatch, key is not found + //if !bytes.Equal(k, lookup) { + // return k, v, false, nil + //} + k, v, err = b.dataLookup(index, gr) if err != nil { - return nil, fmt.Errorf("seek key %x: %w", x, err) + if errors.Is(err, ErrBtIndexLookupBounds) { + return k, v, false, nil + } + return k, v, false, err } - // cursor could be nil along with err if nothing found - return cursor, nil + return k, v, true, nil } -// deprecated -func (b *BtIndex) Lookup(key []byte) uint64 { - if b.alloc == nil { - return 0 +// Seek moves cursor to position where key >= x. +// Then if x == nil - first key returned +// +// if x is larger than any other key in index, nil cursor is returned. +func (b *BtIndex) SeekDeprecated(x []byte) (*Cursor, error) { + g := NewArchiveGetter(b.decompressor.MakeGetter(), b.compressed) + return b.Seek(g, x) +} + +// Seek moves cursor to position where key >= x. +// Then if x == nil - first key returned +// +// if x is larger than any other key in index, nil cursor is returned. 
+func (b *BtIndex) Seek(g ArchiveGetter, x []byte) (*Cursor, error) { + if b.Empty() { + return nil, nil + } + + // defer func() { + // fmt.Printf("[Bindex][%s] Seek '%x' -> '%x' di=%d\n", b.FileName(), x, cursor.Value(), cursor.d) + // }() + var ( + k []byte + dt uint64 + found bool + err error + ) + + if UseBpsTree { + _, dt, found, err = b.bplus.Seek(g, x) + } else { + _, dt, found, err = b.alloc.Seek(g, x) } - cursor, err := b.alloc.Seek(key) + _ = found + if err != nil /*|| !found*/ { + if errors.Is(err, ErrBtIndexLookupBounds) { + return nil, nil + } + return nil, err + } + + k, v, err := b.dataLookup(dt, g) if err != nil { - panic(err) + if errors.Is(err, ErrBtIndexLookupBounds) { + return nil, nil + } + return nil, err } - return binary.BigEndian.Uint64(cursor.value) + return b.newCursor(context.Background(), k, v, dt, g), nil } func (b *BtIndex) OrdinalLookup(i uint64) *Cursor { - if b.alloc == nil { - return nil - } - if i > b.alloc.K { - return nil - } - k, v, err := b.dataLookup(i) + getter := NewArchiveGetter(b.decompressor.MakeGetter(), b.compressed) + k, v, err := b.dataLookup(i, getter) if err != nil { return nil } - return &Cursor{ - key: k, value: v, d: i, ix: b.alloc, - } + return b.newCursor(context.Background(), k, v, i, getter) } diff --git a/state/btree_index_test.go b/state/btree_index_test.go new file mode 100644 index 000000000..10f8b8879 --- /dev/null +++ b/state/btree_index_test.go @@ -0,0 +1,348 @@ +package state + +import ( + "bytes" + "fmt" + "path" + "path/filepath" + "testing" + + bloomfilter "github.com/holiman/bloomfilter/v2" + "github.com/ledgerwatch/log/v3" + "github.com/stretchr/testify/require" + + "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" +) + +func Test_BtreeIndex_Init2(t *testing.T) { + //mainnnet: storage.128-160.kv 110mil keys, 100mb bloomfilter of 0.01 (1%) 
miss-probability + //no much reason to merge bloomfilter - can merge them on starup + //1B keys: 1Gb + + sizes := []int{54, 74, 135, 139, 109, 105, 144} + sum := 0 + sumB := 0 + for _, sz := range sizes { + sum += sz + sumB += int(bloomfilter.OptimalM(uint64(sz*1_000_000), 0.001)) + } + large := bloomfilter.OptimalM(uint64(sum*1_000_000), 0.001) + fmt.Printf("see: %d\n", bloomfilter.OptimalM(uint64(1_000_000_000), 0.001)/8/1024/1024) + fmt.Printf("see: %d vs %d\n", sumB/8/1024/1024, large/8/1024/1024) + +} +func Test_BtreeIndex_Init(t *testing.T) { + logger := log.New() + tmp := t.TempDir() + + keyCount, M := 100, uint64(4) + compPath := generateKV(t, tmp, 52, 300, keyCount, logger, 0) + decomp, err := compress.NewDecompressor(compPath) + require.NoError(t, err) + defer decomp.Close() + + err = BuildBtreeIndexWithDecompressor(filepath.Join(tmp, "a.bt"), decomp, CompressNone, background.NewProgressSet(), tmp, 1, logger) + require.NoError(t, err) + + bt, err := OpenBtreeIndexWithDecompressor(filepath.Join(tmp, "a.bt"), M, decomp, CompressKeys|CompressVals) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + bt.Close() +} + +func Test_BtreeIndex_Seek(t *testing.T) { + tmp := t.TempDir() + logger := log.New() + keyCount, M := 120, 30 + compressFlags := CompressKeys | CompressVals + //UseBpsTree = true + + t.Run("empty index", func(t *testing.T) { + dataPath := generateKV(t, tmp, 52, 180, 0, logger, 0) + indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") + err := BuildBtreeIndex(dataPath, indexPath, compressFlags, 1, logger) + require.NoError(t, err) + + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M), compressFlags, false) + require.NoError(t, err) + require.EqualValues(t, 0, bt.KeyCount()) + }) + dataPath := generateKV(t, tmp, 52, 180, keyCount, logger, 0) + + indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") + err := BuildBtreeIndex(dataPath, indexPath, compressFlags, 1, logger) + require.NoError(t, err) + + bt, 
err := OpenBtreeIndex(indexPath, dataPath, uint64(M), compressFlags, false) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + + keys, err := pivotKeysFromKV(dataPath) + require.NoError(t, err) + + getter := NewArchiveGetter(bt.decompressor.MakeGetter(), compressFlags) + + t.Run("seek beyond the last key", func(t *testing.T) { + _, _, err := bt.dataLookup(bt.ef.Count()+1, getter) + require.ErrorIs(t, err, ErrBtIndexLookupBounds) + + _, _, err = bt.dataLookup(bt.ef.Count(), getter) + require.ErrorIs(t, err, ErrBtIndexLookupBounds) + require.Error(t, err) + + _, _, err = bt.dataLookup(bt.ef.Count()-1, getter) + require.NoError(t, err) + + cur, err := bt.SeekDeprecated(common.FromHex("0xffffffffffffff")) //seek beyeon the last key + require.NoError(t, err) + require.Nil(t, cur) + }) + + c, err := bt.SeekDeprecated(nil) + require.NoError(t, err) + for i := 0; i < len(keys); i++ { + k := c.Key() + //if !bytes.Equal(keys[i], k) { + // fmt.Printf("\tinvalid, want %x, got %x\n", keys[i], k) + //} + require.EqualValues(t, keys[i], k) + c.Next() + } + + for i := 0; i < len(keys); i++ { + cur, err := bt.SeekDeprecated(keys[i]) + require.NoErrorf(t, err, "i=%d", i) + require.EqualValues(t, keys[i], cur.key) + require.NotEmptyf(t, cur.Value(), "i=%d", i) + // require.EqualValues(t, uint64(i), cur.Value()) + } + for i := 1; i < len(keys); i++ { + alt := common.Copy(keys[i]) + for j := len(alt) - 1; j >= 0; j-- { + if alt[j] > 0 { + alt[j] -= 1 + break + } + } + cur, err := bt.SeekDeprecated(keys[i]) + require.NoError(t, err) + require.EqualValues(t, keys[i], cur.Key()) + } + + bt.Close() +} + +func Test_BtreeIndex_Build(t *testing.T) { + tmp := t.TempDir() + logger := log.New() + keyCount, M := 20000, 510 + + compressFlags := CompressKeys | CompressVals + dataPath := generateKV(t, tmp, 52, 48, keyCount, logger, compressFlags) + keys, err := pivotKeysFromKV(dataPath) + require.NoError(t, err) + + indexPath := path.Join(tmp, 
filepath.Base(dataPath)+".bti") + err = BuildBtreeIndex(dataPath, indexPath, compressFlags, 1, logger) + require.NoError(t, err) + + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M), compressFlags, false) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + + c, err := bt.SeekDeprecated(nil) + require.NoError(t, err) + for i := 0; i < len(keys); i++ { + k := c.Key() + if !bytes.Equal(keys[i], k) { + fmt.Printf("\tinvalid, want %x\n", keys[i]) + } + c.Next() + } + for i := 0; i < 10000; i++ { + c, err := bt.SeekDeprecated(keys[i]) + require.NoError(t, err) + require.EqualValues(t, keys[i], c.Key()) + } + defer bt.Close() +} + +func Test_BtreeIndex_Seek2(t *testing.T) { + tmp := t.TempDir() + logger := log.New() + keyCount, M := 1_200_000, 1024 + UseBpsTree = false + + compressFlags := CompressKeys | CompressVals + dataPath := generateKV(t, tmp, 52, 48, keyCount, logger, compressFlags) + + indexPath := path.Join(tmp, filepath.Base(dataPath)+".bti") + err := BuildBtreeIndex(dataPath, indexPath, compressFlags, 1, logger) + require.NoError(t, err) + + bt, err := OpenBtreeIndex(indexPath, dataPath, uint64(M), compressFlags, false) + require.NoError(t, err) + require.EqualValues(t, bt.KeyCount(), keyCount) + + keys, err := pivotKeysFromKV(dataPath) + require.NoError(t, err) + + getter := NewArchiveGetter(bt.decompressor.MakeGetter(), compressFlags) + + t.Run("seek beyond the last key", func(t *testing.T) { + _, _, err := bt.dataLookup(bt.ef.Count()+1, getter) + require.ErrorIs(t, err, ErrBtIndexLookupBounds) + + _, _, err = bt.dataLookup(bt.ef.Count(), getter) + require.ErrorIs(t, err, ErrBtIndexLookupBounds) + require.Error(t, err) + + _, _, err = bt.dataLookup(bt.ef.Count()-1, getter) + require.NoError(t, err) + + cur, err := bt.SeekDeprecated(common.FromHex("0xffffffffffffff")) //seek beyeon the last key + require.NoError(t, err) + require.Nil(t, cur) + }) + + c, err := bt.SeekDeprecated(nil) + require.NoError(t, err) + for i := 0; i < 
len(keys); i++ { + k := c.Key() + if !bytes.Equal(keys[i], k) { + fmt.Printf("\tinvalid, want %x\n", keys[i]) + } + c.Next() + } + + for i := 0; i < len(keys); i++ { + cur, err := bt.SeekDeprecated(keys[i]) + require.NoErrorf(t, err, "i=%d", i) + require.EqualValues(t, keys[i], cur.key) + require.NotEmptyf(t, cur.Value(), "i=%d", i) + // require.EqualValues(t, uint64(i), cur.Value()) + } + for i := 1; i < len(keys); i++ { + alt := common.Copy(keys[i]) + for j := len(alt) - 1; j >= 0; j-- { + if alt[j] > 0 { + alt[j] -= 1 + break + } + } + cur, err := bt.SeekDeprecated(keys[i]) + require.NoError(t, err) + require.EqualValues(t, keys[i], cur.Key()) + } + + bt.Close() +} + +func TestBpsTree_Seek(t *testing.T) { + keyCount, M := 48, 4 + tmp := t.TempDir() + + logger := log.New() + + compressFlag := CompressNone + dataPath := generateKV(t, tmp, 10, 48, keyCount, logger, compressFlag) + + kv, err := compress.NewDecompressor(dataPath) + require.NoError(t, err) + defer kv.Close() + + g := NewArchiveGetter(kv.MakeGetter(), compressFlag) + + g.Reset(0) + ps := make([]uint64, 0, keyCount) + keys := make([][]byte, 0, keyCount) + + p := uint64(0) + i := 0 + for g.HasNext() { + ps = append(ps, p) + k, _ := g.Next(nil) + _, p = g.Next(nil) + keys = append(keys, k) + //fmt.Printf("%2d k=%x, p=%v\n", i, k, p) + i++ + } + + //tr := newTrie() + ef := eliasfano32.NewEliasFano(uint64(keyCount), ps[len(ps)-1]) + for i := 0; i < len(ps); i++ { + //tr.insert(Node{i: uint64(i), prefix: common.Copy(keys[i]), off: ps[i]}) + ef.AddOffset(ps[i]) + } + ef.Build() + + efi, _ := eliasfano32.ReadEliasFano(ef.AppendBytes(nil)) + + ir := NewMockIndexReader(efi) + bp := NewBpsTree(g, efi, uint64(M), ir.dataLookup, ir.keyCmp) + bp.trace = true + + for i := 0; i < len(keys); i++ { + sk := keys[i] + k, di, found, err := bp.Seek(g, sk[:len(sk)/2]) + _ = di + _ = found + require.NoError(t, err) + require.NotNil(t, k) + require.False(t, found) // we are looking up by half of key, while FOUND=true when 
exact match found. + + //k, _, err := it.KVFromGetter(g) + //require.NoError(t, err) + require.EqualValues(t, keys[i], k) + } +} + +func NewMockIndexReader(ef *eliasfano32.EliasFano) *mockIndexReader { + return &mockIndexReader{ef: ef} +} + +type mockIndexReader struct { + ef *eliasfano32.EliasFano +} + +func (b *mockIndexReader) dataLookup(di uint64, g ArchiveGetter) ([]byte, []byte, error) { + if di >= b.ef.Count() { + return nil, nil, fmt.Errorf("%w: keyCount=%d, but key %d requested. file: %s", ErrBtIndexLookupBounds, b.ef.Count(), di, g.FileName()) + } + + offset := b.ef.Get(di) + g.Reset(offset) + if !g.HasNext() { + return nil, nil, fmt.Errorf("pair %d/%d key not found, file: %s", di, b.ef.Count(), g.FileName()) + } + + k, _ := g.Next(nil) + if !g.HasNext() { + return nil, nil, fmt.Errorf("pair %d/%d value not found, file: %s", di, b.ef.Count(), g.FileName()) + } + v, _ := g.Next(nil) + return k, v, nil +} + +// comparing `k` with item of index `di`. using buffer `kBuf` to avoid allocations +func (b *mockIndexReader) keyCmp(k []byte, di uint64, g ArchiveGetter) (int, []byte, error) { + if di >= b.ef.Count() { + return 0, nil, fmt.Errorf("%w: keyCount=%d, but key %d requested. 
file: %s", ErrBtIndexLookupBounds, b.ef.Count(), di+1, g.FileName()) + } + + offset := b.ef.Get(di) + g.Reset(offset) + if !g.HasNext() { + return 0, nil, fmt.Errorf("key at %d/%d not found, file: %s", di, b.ef.Count(), g.FileName()) + } + + var res []byte + res, _ = g.Next(res[:0]) + + //TODO: use `b.getter.Match` after https://github.com/ledgerwatch/erigon/issues/7855 + return bytes.Compare(res, k), res, nil + //return b.getter.Match(k), result, nil +} diff --git a/state/domain.go b/state/domain.go index 34f856372..f935bf12f 100644 --- a/state/domain.go +++ b/state/domain.go @@ -23,6 +23,7 @@ import ( "encoding/binary" "fmt" "math" + "math/bits" "os" "path/filepath" "regexp" @@ -31,31 +32,77 @@ import ( "sync/atomic" "time" - "github.com/RoaringBitmap/roaring/roaring64" - "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/VictoriaMetrics/metrics" + bloomfilter "github.com/holiman/bloomfilter/v2" + "github.com/holiman/uint256" + "github.com/pkg/errors" btree2 "github.com/tidwall/btree" "golang.org/x/sync/errgroup" "github.com/ledgerwatch/log/v3" + "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/ledgerwatch/erigon-lib/common/cmp" + "github.com/ledgerwatch/erigon-lib/common/length" + "github.com/ledgerwatch/erigon-lib/kv/iter" + "github.com/ledgerwatch/erigon-lib/kv/order" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/dir" "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/etl" "github.com/ledgerwatch/erigon-lib/kv" "github.com/ledgerwatch/erigon-lib/kv/bitmapdb" "github.com/ledgerwatch/erigon-lib/recsplit" ) +var ( + LatestStateReadWarm = metrics.GetOrCreateSummary(`latest_state_read{type="warm",found="yes"}`) //nolint + LatestStateReadWarmNotFound = metrics.GetOrCreateSummary(`latest_state_read{type="warm",found="no"}`) //nolint + LatestStateReadGrind = metrics.GetOrCreateSummary(`latest_state_read{type="grind",found="yes"}`) //nolint + 
LatestStateReadGrindNotFound = metrics.GetOrCreateSummary(`latest_state_read{type="grind",found="no"}`) //nolint + LatestStateReadCold = metrics.GetOrCreateSummary(`latest_state_read{type="cold",found="yes"}`) //nolint + LatestStateReadColdNotFound = metrics.GetOrCreateSummary(`latest_state_read{type="cold",found="no"}`) //nolint + LatestStateReadDB = metrics.GetOrCreateSummary(`latest_state_read{type="db",found="yes"}`) //nolint + LatestStateReadDBNotFound = metrics.GetOrCreateSummary(`latest_state_read{type="db",found="no"}`) //nolint + + mxRunningMerges = metrics.GetOrCreateCounter("domain_running_merges") + mxRunningCollations = metrics.GetOrCreateCounter("domain_running_collations") + mxCollateTook = metrics.GetOrCreateHistogram("domain_collate_took") + mxPruneTookDomain = metrics.GetOrCreateHistogram(`domain_prune_took{type="domain"}`) + mxPruneTookHistory = metrics.GetOrCreateHistogram(`domain_prune_took{type="history"}`) + mxPruneTookIndex = metrics.GetOrCreateHistogram(`domain_prune_took{type="index"}`) + mxPruneInProgress = metrics.GetOrCreateCounter("domain_pruning_progress") + mxCollationSize = metrics.GetOrCreateCounter("domain_collation_size") + mxCollationSizeHist = metrics.GetOrCreateCounter("domain_collation_hist_size") + mxPruneSizeDomain = metrics.GetOrCreateCounter(`domain_prune_size{type="domain"}`) + mxPruneSizeHistory = metrics.GetOrCreateCounter(`domain_prune_size{type="history"}`) + mxPruneSizeIndex = metrics.GetOrCreateCounter(`domain_prune_size{type="index"}`) + mxBuildTook = metrics.GetOrCreateSummary("domain_build_files_took") + mxStepTook = metrics.GetOrCreateHistogram("domain_step_took") + mxCommitmentKeys = metrics.GetOrCreateCounter("domain_commitment_keys") + mxCommitmentRunning = metrics.GetOrCreateCounter("domain_running_commitment") + mxCommitmentTook = metrics.GetOrCreateSummary("domain_commitment_took") + mxCommitmentWriteTook = metrics.GetOrCreateHistogram("domain_commitment_write_took") + mxCommitmentBranchUpdates = 
metrics.GetOrCreateCounter("domain_commitment_updates_applied") +) + +// StepsInColdFile - files of this size are completely frozen/immutable. +// files of smaller size are also immutable, but can be removed after merge to bigger files. +const StepsInColdFile = 32 + // filesItem corresponding to a pair of files (.dat and .idx) type filesItem struct { decompressor *compress.Decompressor index *recsplit.Index bindex *BtIndex + bm *bitmapdb.FixedSizeBitmaps + bloom *bloomFilter startTxNum uint64 endTxNum uint64 - // Frozen: file of size StepsInBiggestFile. Completely immutable. - // Cold: file of size < StepsInBiggestFile. Immutable, but can be closed/removed after merge to bigger file. + // Frozen: file of size StepsInColdFile. Completely immutable. + // Cold: file of size < StepsInColdFile. Immutable, but can be closed/removed after merge to bigger file. // Hot: Stored in DB. Providing Snapshot-Isolation by CopyOnWrite. frozen bool // immutable, don't need atomic refcount atomic.Int32 // only for `frozen=false` @@ -64,11 +111,52 @@ type filesItem struct { // other processes (which also reading files, may have same logic) canDelete atomic.Bool } +type bloomFilter struct { + *bloomfilter.Filter + FileName, FilePath string + f *os.File +} + +func NewBloom(keysCount uint64, filePath string) (*bloomFilter, error) { + m := bloomfilter.OptimalM(keysCount, 0.01) + //TODO: make filters compatible by usinig same seed/keys + _, fileName := filepath.Split(filePath) + bloom, err := bloomfilter.New(m) + if err != nil { + return nil, fmt.Errorf("%w, %s", err, fileName) + } + return &bloomFilter{FilePath: filePath, FileName: fileName, Filter: bloom}, nil +} +func (b *bloomFilter) Build() error { + log.Trace("[agg] write file", "file", b.FileName) + //TODO: fsync and tmp-file rename + if _, err := b.Filter.WriteFile(b.FilePath); err != nil { + return err + } + return nil +} + +func OpenBloom(filePath string) (*bloomFilter, error) { + _, fileName := filepath.Split(filePath) + f := 
&bloomFilter{FilePath: filePath, FileName: fileName} + var err error + f.Filter, _, err = bloomfilter.ReadFile(filePath) + if err != nil { + return nil, fmt.Errorf("OpenBloom: %w, %s", err, fileName) + } + return f, nil +} +func (b *bloomFilter) Close() { + if b.f != nil { + b.f.Close() + b.f = nil + } +} -func newFilesItem(startTxNum, endTxNum uint64, stepSize uint64) *filesItem { +func newFilesItem(startTxNum, endTxNum, stepSize uint64) *filesItem { startStep := startTxNum / stepSize endStep := endTxNum / stepSize - frozen := endStep-startStep == StepsInBiggestFile + frozen := endStep-startStep == StepsInColdFile return &filesItem{startTxNum: startTxNum, endTxNum: endTxNum, frozen: frozen} } @@ -88,7 +176,7 @@ func (i *filesItem) closeFilesAndRemove() { // paranoic-mode on: don't delete frozen files if !i.frozen { if err := os.Remove(i.decompressor.FilePath()); err != nil { - log.Trace("close", "err", err, "file", i.decompressor.FileName()) + log.Trace("remove after close", "err", err, "file", i.decompressor.FileName()) } } i.decompressor = nil @@ -98,7 +186,7 @@ func (i *filesItem) closeFilesAndRemove() { // paranoic-mode on: don't delete frozen files if !i.frozen { if err := os.Remove(i.index.FilePath()); err != nil { - log.Trace("close", "err", err, "file", i.index.FileName()) + log.Trace("remove after close", "err", err, "file", i.index.FileName()) } } i.index = nil @@ -106,10 +194,24 @@ func (i *filesItem) closeFilesAndRemove() { if i.bindex != nil { i.bindex.Close() if err := os.Remove(i.bindex.FilePath()); err != nil { - log.Trace("close", "err", err, "file", i.bindex.FileName()) + log.Trace("remove after close", "err", err, "file", i.bindex.FileName()) } i.bindex = nil } + if i.bm != nil { + i.bm.Close() + if err := os.Remove(i.bm.FilePath()); err != nil { + log.Trace("remove after close", "err", err, "file", i.bm.FileName()) + } + i.bm = nil + } + if i.bloom != nil { + i.bloom.Close() + if err := os.Remove(i.bloom.FilePath); err != nil { + 
log.Trace("remove after close", "err", err, "file", i.bloom.FileName) + } + i.bloom = nil + } } type DomainStats struct { @@ -121,17 +223,21 @@ type DomainStats struct { LastCollationSize uint64 LastPruneSize uint64 - HistoryQueries *atomic.Uint64 - TotalQueries *atomic.Uint64 - EfSearchTime time.Duration - DataSize uint64 - IndexSize uint64 - FilesCount uint64 + FilesQueries *atomic.Uint64 + TotalQueries *atomic.Uint64 + EfSearchTime time.Duration + DataSize uint64 + IndexSize uint64 + FilesCount uint64 } func (ds *DomainStats) Accumulate(other DomainStats) { - ds.HistoryQueries.Add(other.HistoryQueries.Load()) - ds.TotalQueries.Add(other.TotalQueries.Load()) + if other.FilesQueries != nil { + ds.FilesQueries.Add(other.FilesQueries.Load()) + } + if other.TotalQueries != nil { + ds.TotalQueries.Add(other.TotalQueries.Load()) + } ds.EfSearchTime += other.EfSearchTime ds.IndexSize += other.IndexSize ds.DataSize += other.DataSize @@ -140,45 +246,65 @@ func (ds *DomainStats) Accumulate(other DomainStats) { // Domain is a part of the state (examples are Accounts, Storage, Code) // Domain should not have any go routines or locks +// +// Data-Existence in .kv vs .v files: +// 1. key doesn’t exists, then create: .kv - yes, .v - yes +// 2. acc exists, then update/delete: .kv - yes, .v - yes +// 3. 
acc doesn’t exists, then delete: .kv - no, .v - no type Domain struct { + *History + files *btree2.BTreeG[*filesItem] // thread-safe, but maybe need 1 RWLock for all trees in AggregatorV3 + + // roFiles derivative from field `file`, but without garbage: + // - no files with `canDelete=true` + // - no overlaps + // - no un-indexed files (`power-off` may happen between .ef and .efi creation) + // + // MakeContext() using this field in zero-copy way + roFiles atomic.Pointer[[]ctxItem] + keysTable string // key -> invertedStep , invertedStep = ^(txNum / aggregationStep), Needs to be table with DupSort + valsTable string // key + invertedStep -> values + stats DomainStats + wal *domainWAL + + garbageFiles []*filesItem // files that exist on disk, but ignored on opening folder - because they are garbage + /* not large: keys: key -> ^step vals: key -> ^step+value (DupSort) large: keys: key -> ^step - vals: key + ^step -> value + vals: key + ^step -> value */ - *History - files *btree2.BTreeG[*filesItem] // thread-safe, but maybe need 1 RWLock for all trees in AggregatorV3 - // roFiles derivative from field `file`, but without garbage (canDelete=true, overlaps, etc...) 
- // MakeContext() using this field in zero-copy way - roFiles atomic.Pointer[[]ctxItem] - defaultDc *DomainContext - keysTable string // key -> invertedStep , invertedStep = ^(txNum / aggregationStep), Needs to be table with DupSort - valsTable string // key + invertedStep -> values - stats DomainStats - mergesCount uint64 + domainLargeValues bool + compression FileCompression - garbageFiles []*filesItem // files that exist on disk, but ignored on opening folder - because they are garbage - logger log.Logger + dir string +} + +type domainCfg struct { + hist histCfg + compress FileCompression + domainLargeValues bool } -func NewDomain(dir, tmpdir string, aggregationStep uint64, - filenameBase, keysTable, valsTable, indexKeysTable, historyValsTable, indexTable string, - compressVals, largeValues bool, logger log.Logger) (*Domain, error) { +func NewDomain(cfg domainCfg, aggregationStep uint64, filenameBase, keysTable, valsTable, indexKeysTable, historyValsTable, indexTable string, logger log.Logger) (*Domain, error) { d := &Domain{ - keysTable: keysTable, - valsTable: valsTable, - files: btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}), - stats: DomainStats{HistoryQueries: &atomic.Uint64{}, TotalQueries: &atomic.Uint64{}}, - logger: logger, + dir: filepath.Join(filepath.Dir(cfg.hist.iiCfg.dir), "warm"), + keysTable: keysTable, + valsTable: valsTable, + compression: cfg.compress, + files: btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}), + stats: DomainStats{FilesQueries: &atomic.Uint64{}, TotalQueries: &atomic.Uint64{}}, + + domainLargeValues: cfg.domainLargeValues, } d.roFiles.Store(&[]ctxItem{}) var err error - if d.History, err = NewHistory(dir, tmpdir, aggregationStep, filenameBase, indexKeysTable, indexTable, historyValsTable, compressVals, []string{"kv"}, largeValues, logger); err != nil { + if d.History, err = NewHistory(cfg.hist, aggregationStep, filenameBase, 
indexKeysTable, indexTable, historyValsTable, []string{}, logger); err != nil { return nil, err } @@ -187,20 +313,41 @@ func NewDomain(dir, tmpdir string, aggregationStep uint64, // LastStepInDB - return the latest available step in db (at-least 1 value in such step) func (d *Domain) LastStepInDB(tx kv.Tx) (lstInDb uint64) { - lst, _ := kv.FirstKey(tx, d.valsTable) - if len(lst) > 0 { - lstInDb = ^binary.BigEndian.Uint64(lst[len(lst)-8:]) + lstIdx, _ := kv.LastKey(tx, d.History.indexKeysTable) + if len(lstIdx) == 0 { + return 0 + } + return binary.BigEndian.Uint64(lstIdx) / d.aggregationStep +} +func (d *Domain) FirstStepInDB(tx kv.Tx) (lstInDb uint64) { + lstIdx, _ := kv.FirstKey(tx, d.History.indexKeysTable) + if len(lstIdx) == 0 { + return 0 } - return lstInDb + return binary.BigEndian.Uint64(lstIdx) / d.aggregationStep +} + +func (d *Domain) DiscardHistory() { + d.History.DiscardHistory() + // can't discard domain wal - it required, but can discard history + d.wal = d.newWriter(d.tmpdir, true, false) +} + +func (d *Domain) StartUnbufferedWrites() { + d.wal = d.newWriter(d.tmpdir, false, false) + d.History.StartUnbufferedWrites() } func (d *Domain) StartWrites() { - d.defaultDc = d.MakeContext() + d.wal = d.newWriter(d.tmpdir, true, false) d.History.StartWrites() } func (d *Domain) FinishWrites() { - d.defaultDc.Close() + if d.wal != nil { + d.wal.close() + d.wal = nil + } d.History.FinishWrites() } @@ -208,42 +355,42 @@ func (d *Domain) FinishWrites() { // It's ok if some files was open earlier. // If some file already open: noop. // If some file already open but not in provided list: close and remove from `files` field. 
-func (d *Domain) OpenList(fNames []string) error { - if err := d.History.OpenList(fNames); err != nil { +func (d *Domain) OpenList(coldNames, warmNames []string) error { + if err := d.History.OpenList(coldNames, warmNames); err != nil { return err } - return d.openList(fNames) + return d.openList(warmNames) } -func (d *Domain) openList(fNames []string) error { - d.closeWhatNotInList(fNames) - d.garbageFiles = d.scanStateFiles(fNames) +func (d *Domain) openList(names []string) error { + d.closeWhatNotInList(names) + d.garbageFiles = d.scanStateFiles(names) if err := d.openFiles(); err != nil { - return fmt.Errorf("History.OpenList: %s, %w", d.filenameBase, err) + return fmt.Errorf("Domain.OpenList: %s, %w", d.filenameBase, err) } return nil } func (d *Domain) OpenFolder() error { - files, err := d.fileNamesOnDisk() + files, warmNames, err := d.fileNamesOnDisk() if err != nil { return err } - return d.OpenList(files) + return d.OpenList(files, warmNames) } func (d *Domain) GetAndResetStats() DomainStats { r := d.stats r.DataSize, r.IndexSize, r.FilesCount = d.collectFilesStats() - d.stats = DomainStats{} + d.stats = DomainStats{FilesQueries: &atomic.Uint64{}, TotalQueries: &atomic.Uint64{}} return r } func (d *Domain) scanStateFiles(fileNames []string) (garbageFiles []*filesItem) { re := regexp.MustCompile("^" + d.filenameBase + ".([0-9]+)-([0-9]+).kv$") var err error -Loop: + for _, name := range fileNames { subs := re.FindStringSubmatch(name) if len(subs) != 3 { @@ -268,15 +415,16 @@ Loop: startTxNum, endTxNum := startStep*d.aggregationStep, endStep*d.aggregationStep var newFile = newFilesItem(startTxNum, endTxNum, d.aggregationStep) + newFile.frozen = false - for _, ext := range d.integrityFileExtensions { - requiredFile := fmt.Sprintf("%s.%d-%d.%s", d.filenameBase, startStep, endStep, ext) - if !dir.FileExist(filepath.Join(d.dir, requiredFile)) { - d.logger.Debug(fmt.Sprintf("[snapshots] skip %s because %s doesn't exists", name, requiredFile)) - garbageFiles = 
append(garbageFiles, newFile) - continue Loop - } - } + //for _, ext := range d.integrityFileExtensions { + // requiredFile := fmt.Sprintf("%s.%d-%d.%s", d.filenameBase, startStep, endStep, ext) + // if !dir.FileExist(filepath.Join(d.dir, requiredFile)) { + // d.logger.Debug(fmt.Sprintf("[snapshots] skip %s because %s doesn't exists", name, requiredFile)) + // garbageFiles = append(garbageFiles, newFile) + // continue Loop + // } + //} if _, has := d.files.Get(newFile); has { continue @@ -309,7 +457,7 @@ Loop: } func (d *Domain) openFiles() (err error) { - var totalKeys uint64 + //var totalKeys uint64 invalidFileItems := make([]*filesItem, 0) d.files.Walk(func(items []*filesItem) bool { @@ -324,28 +472,41 @@ func (d *Domain) openFiles() (err error) { continue } if item.decompressor, err = compress.NewDecompressor(datPath); err != nil { + err = errors.Wrap(err, "decompressor") + d.logger.Debug("Domain.openFiles: %w, %s", err, datPath) return false } - if item.index != nil { - continue - } - idxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, fromStep, toStep)) - if dir.FileExist(idxPath) { - if item.index, err = recsplit.OpenIndex(idxPath); err != nil { - d.logger.Debug("InvertedIndex.openFiles: %w, %s", err, idxPath) - return false + if item.index == nil && !UseBpsTree { + idxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + if item.index, err = recsplit.OpenIndex(idxPath); err != nil { + err = errors.Wrap(err, "recsplit index") + d.logger.Debug("Domain.openFiles: %w, %s", err, idxPath) + return false + } + //totalKeys += item.index.KeyCount() } - totalKeys += item.index.KeyCount() } if item.bindex == nil { bidxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep)) - if item.bindex, err = OpenBtreeIndexWithDecompressor(bidxPath, 2048, item.decompressor); err != nil { - d.logger.Debug("InvertedIndex.openFiles: %w, %s", err, bidxPath) 
- return false + if dir.FileExist(bidxPath) { + if item.bindex, err = OpenBtreeIndexWithDecompressor(bidxPath, DefaultBtreeM, item.decompressor, d.compression); err != nil { + err = errors.Wrap(err, "btree index") + d.logger.Debug("Domain.openFiles: %w, %s", err, bidxPath) + return false + } } //totalKeys += item.bindex.KeyCount() } + if item.bloom == nil { + idxPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvei", d.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + if item.bloom, err = OpenBloom(idxPath); err != nil { + return false + } + } + } } return true }) @@ -392,7 +553,7 @@ func (d *Domain) closeWhatNotInList(fNames []string) { } func (d *Domain) reCalcRoFiles() { - roFiles := ctxFiles(d.files) + roFiles := ctxFiles(d.files, true, true) d.roFiles.Store(&roFiles) } @@ -402,61 +563,53 @@ func (d *Domain) Close() { d.reCalcRoFiles() } -func (dc *DomainContext) get(key []byte, fromTxNum uint64, roTx kv.Tx) ([]byte, bool, error) { - //var invertedStep [8]byte - dc.d.stats.TotalQueries.Add(1) - - invertedStep := dc.numBuf - binary.BigEndian.PutUint64(invertedStep[:], ^(fromTxNum / dc.d.aggregationStep)) - keyCursor, err := roTx.CursorDupSort(dc.d.keysTable) - if err != nil { - return nil, false, err - } - defer keyCursor.Close() - foundInvStep, err := keyCursor.SeekBothRange(key, invertedStep[:]) - if err != nil { - return nil, false, err - } - if len(foundInvStep) == 0 { - dc.d.stats.HistoryQueries.Add(1) - return dc.readFromFiles(key, fromTxNum) - } - //keySuffix := make([]byte, len(key)+8) - copy(dc.keyBuf[:], key) - copy(dc.keyBuf[len(key):], foundInvStep) - v, err := roTx.GetOne(dc.d.valsTable, dc.keyBuf[:len(key)+8]) - if err != nil { - return nil, false, err +func (d *Domain) PutWithPrev(key1, key2, val, preval []byte) error { + // This call to update needs to happen before d.tx.Put() later, because otherwise the content of `preval`` slice is invalidated + if err := d.History.AddPrevValue(key1, key2, preval); err != nil { + return err } 
- return v, true, nil + return d.wal.addValue(key1, key2, val) } -func (dc *DomainContext) Get(key1, key2 []byte, roTx kv.Tx) ([]byte, error) { - //key := make([]byte, len(key1)+len(key2)) - copy(dc.keyBuf[:], key1) - copy(dc.keyBuf[len(key1):], key2) - // keys larger than 52 bytes will panic - v, _, err := dc.get(dc.keyBuf[:len(key1)+len(key2)], dc.d.txNum, roTx) - return v, err +func (d *Domain) DeleteWithPrev(key1, key2, prev []byte) (err error) { + // This call to update needs to happen before d.tx.Delete() later, because otherwise the content of `original`` slice is invalidated + if err := d.History.AddPrevValue(key1, key2, prev); err != nil { + return err + } + return d.wal.addValue(key1, key2, nil) } -func (d *Domain) update(key, original []byte) error { +func (d *Domain) update(key []byte) error { var invertedStep [8]byte binary.BigEndian.PutUint64(invertedStep[:], ^(d.txNum / d.aggregationStep)) + //fmt.Printf("put: %s, %x, %x\n", d.filenameBase, key, invertedStep[:]) if err := d.tx.Put(d.keysTable, key, invertedStep[:]); err != nil { return err } return nil } +func (d *Domain) put(key, val []byte) error { + if err := d.update(key); err != nil { + return err + } + invertedStep := ^(d.txNum / d.aggregationStep) + keySuffix := make([]byte, len(key)+8) + copy(keySuffix, key) + binary.BigEndian.PutUint64(keySuffix[len(key):], invertedStep) + //fmt.Printf("put2: %s, %x, %x\n", d.filenameBase, keySuffix, val) + return d.tx.Put(d.valsTable, keySuffix, val) +} + +// Deprecated func (d *Domain) Put(key1, key2, val []byte) error { - key := make([]byte, len(key1)+len(key2)) - copy(key, key1) - copy(key[len(key1):], key2) - original, _, err := d.defaultDc.get(key, d.txNum, d.tx) + key := common.Append(key1, key2) + dc := d.MakeContext() + original, _, err := dc.GetLatest(key, nil, d.tx) if err != nil { return err } + dc.Close() if bytes.Equal(original, val) { return nil } @@ -464,42 +617,143 @@ func (d *Domain) Put(key1, key2, val []byte) error { if err = 
d.History.AddPrevValue(key1, key2, original); err != nil { return err } - if err = d.update(key, original); err != nil { - return err - } - invertedStep := ^(d.txNum / d.aggregationStep) - keySuffix := make([]byte, len(key)+8) - copy(keySuffix, key) - binary.BigEndian.PutUint64(keySuffix[len(key):], invertedStep) - if err = d.tx.Put(d.valsTable, keySuffix, val); err != nil { - return err - } - return nil + return d.put(key, val) } +// Deprecated func (d *Domain) Delete(key1, key2 []byte) error { - key := make([]byte, len(key1)+len(key2)) - copy(key, key1) - copy(key[len(key1):], key2) - original, found, err := d.defaultDc.get(key, d.txNum, d.tx) + key := common.Append(key1, key2) + dc := d.MakeContext() + original, found, err := dc.GetLatest(key, nil, d.tx) + dc.Close() if err != nil { return err } if !found { return nil } - // This call to update needs to happen before d.tx.Delete() later, because otherwise the content of `original`` slice is invalidated - if err = d.History.AddPrevValue(key1, key2, original); err != nil { + return d.DeleteWithPrev(key1, key2, original) +} + +func (d *Domain) newWriter(tmpdir string, buffered, discard bool) *domainWAL { + w := &domainWAL{d: d, + tmpdir: tmpdir, + buffered: buffered, + discard: discard, + aux: make([]byte, 0, 128), + largeValues: d.domainLargeValues, + } + + if buffered { + w.values = etl.NewCollector(d.valsTable, tmpdir, etl.NewSortableBuffer(WALCollectorRAM), d.logger) + w.values.LogLvl(log.LvlTrace) + w.keys = etl.NewCollector(d.keysTable, tmpdir, etl.NewSortableBuffer(WALCollectorRAM), d.logger) + w.keys.LogLvl(log.LvlTrace) + } + return w +} + +type domainWAL struct { + d *Domain + keys *etl.Collector + values *etl.Collector + aux []byte + tmpdir string + buffered bool + discard bool + largeValues bool +} + +func (d *domainWAL) close() { + if d == nil { // allow dobule-close + return + } + if d.keys != nil { + d.keys.Close() + } + if d.values != nil { + d.values.Close() + } +} + +// nolint +func loadSkipFunc() 
etl.LoadFunc { + var preKey, preVal []byte + return func(k, v []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error { + if bytes.Equal(k, preKey) { + preVal = v + return nil + } + if err := next(nil, preKey, preVal); err != nil { + return err + } + if err := next(k, k, v); err != nil { + return err + } + preKey, preVal = k, v + return nil + } +} + +func (d *domainWAL) flush(ctx context.Context, tx kv.RwTx) error { + if d.discard || !d.buffered { + return nil + } + if err := d.keys.Load(tx, d.d.keysTable, loadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil { return err } - if err = d.update(key, original); err != nil { + if err := d.values.Load(tx, d.d.valsTable, loadFunc, etl.TransformArgs{Quit: ctx.Done()}); err != nil { return err } - invertedStep := ^(d.txNum / d.aggregationStep) - keySuffix := make([]byte, len(key)+8) - copy(keySuffix, key) - binary.BigEndian.PutUint64(keySuffix[len(key):], invertedStep) - if err = d.tx.Delete(d.valsTable, keySuffix); err != nil { + return nil +} + +func (d *domainWAL) addValue(key1, key2, value []byte) error { + if d.discard { + return nil + } + + kl := len(key1) + len(key2) + d.aux = append(append(d.aux[:0], key1...), key2...) + fullkey := d.aux[:kl+8] + //TODO: we have ii.txNumBytes, need also have d.stepBytes. 
update it at d.SetTxNum() + binary.BigEndian.PutUint64(fullkey[kl:], ^(d.d.txNum / d.d.aggregationStep)) + // defer func() { + // fmt.Printf("addValue %x->%x buffered %t largeVals %t file %s\n", fullkey, value, d.buffered, d.largeValues, d.d.filenameBase) + // }() + + if d.largeValues { + if d.buffered { + if err := d.keys.Collect(fullkey[:kl], fullkey[kl:]); err != nil { + return err + } + if err := d.values.Collect(fullkey, value); err != nil { + return err + } + return nil + } + if err := d.d.tx.Put(d.d.keysTable, fullkey[:kl], fullkey[kl:]); err != nil { + return err + } + if err := d.d.tx.Put(d.d.valsTable, fullkey, value); err != nil { + return err + } + return nil + } + + if d.buffered { + if err := d.keys.Collect(fullkey[:kl], fullkey[kl:]); err != nil { + return err + } + if err := d.values.Collect(fullkey[:kl], common.Append(fullkey[kl:], value)); err != nil { + return err + } + return nil + } + if err := d.d.tx.Put(d.d.keysTable, fullkey[:kl], fullkey[kl:]); err != nil { + return err + } + if err := d.d.tx.Put(d.d.valsTable, fullkey[:kl], common.Append(fullkey[kl:], value)); err != nil { return err } return nil @@ -510,19 +764,23 @@ type CursorType uint8 const ( FILE_CURSOR CursorType = iota DB_CURSOR + RAM_CURSOR ) // CursorItem is the item in the priority queue used to do merge interation // over storage of a given account type CursorItem struct { - c kv.CursorDupSort - dg *compress.Getter - dg2 *compress.Getter - key []byte - val []byte - endTxNum uint64 - t CursorType // Whether this item represents state file or DB record, or tree - reverse bool + c kv.CursorDupSort + iter btree2.MapIter[string, []byte] + dg ArchiveGetter + dg2 ArchiveGetter + btCursor *Cursor + key []byte + val []byte + endTxNum uint64 + latestOffset uint64 // offset of the latest value in the file + t CursorType // Whether this item represents state file or DB record, or tree + reverse bool } type CursorHeap []*CursorItem @@ -571,52 +829,88 @@ type ctxItem struct { src *filesItem 
} -type ctxLocalityIdx struct { - reader *recsplit.IndexReader - bm *bitmapdb.FixedSizeBitmaps - file *ctxItem -} +func (i *ctxItem) isSubSetOf(j *ctxItem) bool { return i.src.isSubsetOf(j.src) } //nolint +func (i *ctxItem) isSubsetOf(j *ctxItem) bool { return i.src.isSubsetOf(j.src) } //nolint -func ctxItemLess(i, j ctxItem) bool { //nolint - if i.endTxNum == j.endTxNum { - return i.startTxNum > j.startTxNum - } - return i.endTxNum < j.endTxNum +type ctxLocalityIdx struct { + reader *recsplit.IndexReader + file *ctxItem + aggregationStep uint64 } // DomainContext allows accesing the same domain from multiple go-routines type DomainContext struct { - d *Domain - files []ctxItem - getters []*compress.Getter - readers []*BtIndex - hc *HistoryContext - keyBuf [60]byte // 52b key and 8b for inverted step - numBuf [8]byte + d *Domain + files []ctxItem + getters []ArchiveGetter + readers []*BtIndex + idxReaders []*recsplit.IndexReader + hc *HistoryContext + keyBuf [60]byte // 52b key and 8b for inverted step + valKeyBuf [60]byte // 52b key and 8b for inverted step + + keysC kv.CursorDupSort + valsC kv.Cursor } -func (dc *DomainContext) statelessGetter(i int) *compress.Getter { - if dc.getters == nil { - dc.getters = make([]*compress.Getter, len(dc.files)) +// getFromFile returns exact match for the given key from the given file +func (dc *DomainContext) getFromFile(i int, filekey []byte) ([]byte, bool, error) { + g := dc.statelessGetter(i) + if UseBtree || UseBpsTree { + if dc.d.withExistenceIndex && dc.files[i].src.bloom != nil { + hi, _ := dc.hc.ic.hashKey(filekey) + if !dc.files[i].src.bloom.ContainsHash(hi) { + return nil, false, nil + } + } + + _, v, ok, err := dc.statelessBtree(i).Get(filekey, g) + if err != nil || !ok { + return nil, false, err + } + //fmt.Printf("getLatestFromBtreeColdFiles key %x shard %d %x\n", filekey, exactColdShard, v) + return v, true, nil } - r := dc.getters[i] - if r == nil { - r = dc.files[i].src.decompressor.MakeGetter() - dc.getters[i] 
= r + + reader := dc.statelessIdxReader(i) + if reader.Empty() { + return nil, false, nil } - return r + offset := reader.Lookup(filekey) + g.Reset(offset) + + k, _ := g.Next(nil) + if !bytes.Equal(filekey, k) { + return nil, false, nil + } + v, _ := g.Next(nil) + return v, true, nil } -func (dc *DomainContext) statelessBtree(i int) *BtIndex { - if dc.readers == nil { - dc.readers = make([]*BtIndex, len(dc.files)) +func (dc *DomainContext) getFromFile2(i int, filekey []byte) ([]byte, bool, error) { + g := dc.statelessGetter(i) + if UseBtree || UseBpsTree { + _, v, ok, err := dc.statelessBtree(i).Get(filekey, g) + if err != nil || !ok { + return nil, false, err + } + //fmt.Printf("getLatestFromBtreeColdFiles key %x shard %d %x\n", filekey, exactColdShard, v) + return v, true, nil } - r := dc.readers[i] - if r == nil { - r = dc.files[i].src.bindex - dc.readers[i] = r + + reader := dc.statelessIdxReader(i) + if reader.Empty() { + return nil, false, nil } - return r + offset := reader.Lookup(filekey) + g.Reset(offset) + + k, _ := g.Next(nil) + if !bytes.Equal(filekey, k) { + return nil, false, nil + } + v, _ := g.Next(nil) + return v, true, nil } func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) { @@ -627,7 +921,8 @@ func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) { } datsz += uint64(item.decompressor.Size()) idxsz += uint64(item.index.Size()) - files += 2 + idxsz += uint64(item.bindex.Size()) + files += 3 } return true }) @@ -653,29 +948,25 @@ func (d *Domain) collectFilesStats() (datsz, idxsz, files uint64) { } func (d *Domain) MakeContext() *DomainContext { - dc := &DomainContext{ + files := *d.roFiles.Load() + for i := 0; i < len(files); i++ { + if !files[i].src.frozen { + files[i].src.refcount.Add(1) + } + } + return &DomainContext{ d: d, hc: d.History.MakeContext(), - files: *d.roFiles.Load(), - } - for _, item := range dc.files { - if !item.src.frozen { - item.src.refcount.Add(1) - } + files: files, } - - return dc } // 
Collation is the set of compressors created after aggregation type Collation struct { - valuesComp *compress.Compressor - historyComp *compress.Compressor - indexBitmaps map[string]*roaring64.Bitmap - valuesPath string - historyPath string - valuesCount int - historyCount int + HistoryCollation + valuesComp *compress.Compressor + valuesPath string + valuesCount int } func (c Collation) Close() { @@ -683,93 +974,39 @@ func (c Collation) Close() { c.valuesComp.Close() } if c.historyComp != nil { - c.historyComp.Close() - } -} - -type kvpair struct { - k, v []byte -} - -func (d *Domain) writeCollationPair(valuesComp *compress.Compressor, pairs chan kvpair) (count int, err error) { - for kv := range pairs { - if err = valuesComp.AddUncompressedWord(kv.k); err != nil { - return count, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, kv.k, err) - } - mxCollationSize.Inc() - count++ // Only counting keys, not values - if err = valuesComp.AddUncompressedWord(kv.v); err != nil { - return count, fmt.Errorf("add %s values val [%x]=>[%x]: %w", d.filenameBase, kv.k, kv.v, err) - } - } - return count, nil -} - -// nolint -func (d *Domain) aggregate(ctx context.Context, step uint64, txFrom, txTo uint64, tx kv.Tx, ps *background.ProgressSet) (err error) { - mxRunningCollations.Inc() - start := time.Now() - collation, err := d.collateStream(ctx, step, txFrom, txTo, tx) - mxRunningCollations.Dec() - mxCollateTook.UpdateDuration(start) - - mxCollationSize.Set(uint64(collation.valuesComp.Count())) - mxCollationSizeHist.Set(uint64(collation.historyComp.Count())) - - if err != nil { - collation.Close() - //return fmt.Errorf("domain collation %q has failed: %w", d.filenameBase, err) - return err - } - - mxRunningMerges.Inc() - - start = time.Now() - sf, err := d.buildFiles(ctx, step, collation, ps) - collation.Close() - defer sf.Close() - - if err != nil { - sf.Close() - mxRunningMerges.Dec() - return + c.HistoryCollation.Close() } - - mxRunningMerges.Dec() - - 
d.integrateFiles(sf, step*d.aggregationStep, (step+1)*d.aggregationStep) - d.stats.LastFileBuildingTook = time.Since(start) - return nil } // collate gathers domain changes over the specified step, using read-only transaction, // and returns compressors, elias fano, and bitmaps // [txFrom; txTo) -func (d *Domain) collateStream(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx) (Collation, error) { +func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx) (coll Collation, err error) { + mxRunningCollations.Inc() started := time.Now() defer func() { d.stats.LastCollationTook = time.Since(started) + mxRunningCollations.Dec() + mxCollateTook.UpdateDuration(started) }() - hCollation, err := d.History.collate(step, txFrom, txTo, roTx) + coll.HistoryCollation, err = d.History.collate(step, txFrom, txTo, roTx) if err != nil { return Collation{}, err } - var valuesComp *compress.Compressor - closeComp := true + closeCollation := true defer func() { - if closeComp { - if valuesComp != nil { - valuesComp.Close() - } + if closeCollation { + coll.Close() } }() - valuesPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1)) - if valuesComp, err = compress.NewCompressor(context.Background(), "collate values", valuesPath, d.tmpdir, compress.MinPatternScore, 1, log.LvlTrace, d.logger); err != nil { + coll.valuesPath = filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1)) + if coll.valuesComp, err = compress.NewCompressor(context.Background(), "collate values", coll.valuesPath, d.tmpdir, compress.MinPatternScore, d.compressWorkers, log.LvlTrace, d.logger); err != nil { return Collation{}, fmt.Errorf("create %s values compressor: %w", d.filenameBase, err) } + comp := NewArchiveWriter(coll.valuesComp, d.compression) keysCursor, err := roTx.CursorDupSort(d.keysTable) if err != nil { @@ -778,178 +1015,92 @@ func (d *Domain) collateStream(ctx context.Context, step, txFrom, txTo uint64, r defer 
keysCursor.Close() var ( - k, v []byte - pos uint64 - valCount int - pairs = make(chan kvpair, 1024) - ) - - //totalKeys, err := keysCursor.Count() - //if err != nil { - // return Collation{}, fmt.Errorf("failed to obtain keys count for domain %q", d.filenameBase) - //} - - eg, _ := errgroup.WithContext(ctx) - eg.Go(func() error { - valCount, err = d.writeCollationPair(valuesComp, pairs) - return err - }) - - var ( + pos uint64 stepBytes = make([]byte, 8) keySuffix = make([]byte, 256+8) + v []byte + + valsDup kv.CursorDupSort ) binary.BigEndian.PutUint64(stepBytes, ^step) - - for k, _, err = keysCursor.First(); err == nil && k != nil; k, _, err = keysCursor.NextNoDup() { - pos++ - - if v, err = keysCursor.LastDup(); err != nil { - return Collation{}, fmt.Errorf("find last %s key for aggregation step k=[%x]: %w", d.filenameBase, k, err) + if !d.domainLargeValues { + valsDup, err = roTx.CursorDupSort(d.valsTable) + if err != nil { + return Collation{}, fmt.Errorf("create %s values cursorDupsort: %w", d.filenameBase, err) } - if bytes.Equal(v, stepBytes) { - copy(keySuffix, k) - copy(keySuffix[len(k):], v) - ks := len(k) + len(v) + defer valsDup.Close() + } - v, err := roTx.GetOne(d.valsTable, keySuffix[:ks]) + if err := func() error { + for k, stepInDB, err := keysCursor.First(); k != nil; k, stepInDB, err = keysCursor.Next() { if err != nil { - return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err) + return err } - - select { - case <-ctx.Done(): - return Collation{}, ctx.Err() - default: + pos++ + if !bytes.Equal(stepBytes, stepInDB) { + continue } - pairs <- kvpair{k: k, v: v} - } - } - close(pairs) - if err != nil { - return Collation{}, fmt.Errorf("iterate over %s keys cursor: %w", d.filenameBase, err) - } + copy(keySuffix, k) + copy(keySuffix[len(k):], stepInDB) - if err := eg.Wait(); err != nil { - return Collation{}, fmt.Errorf("collate over %s keys cursor: %w", d.filenameBase, err) - } - - closeComp = 
false - return Collation{ - valuesPath: valuesPath, - valuesComp: valuesComp, - valuesCount: valCount, - historyPath: hCollation.historyPath, - historyComp: hCollation.historyComp, - historyCount: hCollation.historyCount, - indexBitmaps: hCollation.indexBitmaps, - }, nil -} - -// collate gathers domain changes over the specified step, using read-only transaction, -// and returns compressors, elias fano, and bitmaps -// [txFrom; txTo) -func (d *Domain) collate(ctx context.Context, step, txFrom, txTo uint64, roTx kv.Tx, logEvery *time.Ticker) (Collation, error) { - started := time.Now() - defer func() { - d.stats.LastCollationTook = time.Since(started) - }() + switch d.domainLargeValues { + case true: + v, err = roTx.GetOne(d.valsTable, keySuffix[:len(k)+8]) + default: + v, err = valsDup.SeekBothRange(keySuffix[:len(k)], keySuffix[len(k):len(k)+8]) + //fmt.Printf("seek: %x -> %x\n", keySuffix[:len(k)], v) + for { + k, _, _ := valsDup.Next() + if len(k) == 0 { + break + } - hCollation, err := d.History.collate(step, txFrom, txTo, roTx) - if err != nil { - return Collation{}, err - } - var valuesComp *compress.Compressor - closeComp := true - defer func() { - if closeComp { - hCollation.Close() - if valuesComp != nil { - valuesComp.Close() + if bytes.HasPrefix(k, keySuffix[:len(k)]) { + //fmt.Printf("next: %x -> %x\n", k, v) + } else { + break + } + } } - } - }() - valuesPath := filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, step, step+1)) - if valuesComp, err = compress.NewCompressor(context.Background(), "collate values", valuesPath, d.tmpdir, compress.MinPatternScore, 1, log.LvlTrace, d.logger); err != nil { - return Collation{}, fmt.Errorf("create %s values compressor: %w", d.filenameBase, err) - } - keysCursor, err := roTx.CursorDupSort(d.keysTable) - if err != nil { - return Collation{}, fmt.Errorf("create %s keys cursor: %w", d.filenameBase, err) - } - defer keysCursor.Close() - - var ( - k, v []byte - pos uint64 - valuesCount uint - ) - - 
//TODO: use prorgesSet - //totalKeys, err := keysCursor.Count() - //if err != nil { - // return Collation{}, fmt.Errorf("failed to obtain keys count for domain %q", d.filenameBase) - //} - for k, _, err = keysCursor.First(); err == nil && k != nil; k, _, err = keysCursor.NextNoDup() { - if err != nil { - return Collation{}, err - } - pos++ - select { - case <-ctx.Done(): - d.logger.Warn("[snapshots] collate domain cancelled", "name", d.filenameBase, "err", ctx.Err()) - return Collation{}, ctx.Err() - default: - } - - if v, err = keysCursor.LastDup(); err != nil { - return Collation{}, fmt.Errorf("find last %s key for aggregation step k=[%x]: %w", d.filenameBase, k, err) - } - s := ^binary.BigEndian.Uint64(v) - if s == step { - keySuffix := make([]byte, len(k)+8) - copy(keySuffix, k) - copy(keySuffix[len(k):], v) - v, err := roTx.GetOne(d.valsTable, keySuffix) if err != nil { - return Collation{}, fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err) + return fmt.Errorf("find last %s value for aggregation step k=[%x]: %w", d.filenameBase, k, err) + } + + if err = comp.AddWord(k); err != nil { + return fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, k, err) } - if err = valuesComp.AddUncompressedWord(k); err != nil { - return Collation{}, fmt.Errorf("add %s values key [%x]: %w", d.filenameBase, k, err) + if err = comp.AddWord(v); err != nil { + return fmt.Errorf("add %s values [%x]=>[%x]: %w", d.filenameBase, k, v, err) } - valuesCount++ // Only counting keys, not values - if err = valuesComp.AddUncompressedWord(v); err != nil { - return Collation{}, fmt.Errorf("add %s values val [%x]=>[%x]: %w", d.filenameBase, k, v, err) + mxCollationSize.Inc() + + select { + case <-ctx.Done(): + return ctx.Err() + default: } } - } - if err != nil { + return nil + }(); err != nil { return Collation{}, fmt.Errorf("iterate over %s keys cursor: %w", d.filenameBase, err) } - closeComp = false - return Collation{ - valuesPath: valuesPath, - 
valuesComp: valuesComp, - valuesCount: int(valuesCount), - historyPath: hCollation.historyPath, - historyComp: hCollation.historyComp, - historyCount: hCollation.historyCount, - indexBitmaps: hCollation.indexBitmaps, - }, nil + + closeCollation = false + coll.valuesCount = coll.valuesComp.Count() / 2 + return coll, nil } type StaticFiles struct { - valuesDecomp *compress.Decompressor - valuesIdx *recsplit.Index - valuesBt *BtIndex - historyDecomp *compress.Decompressor - historyIdx *recsplit.Index - efHistoryDecomp *compress.Decompressor - efHistoryIdx *recsplit.Index + HistoryFiles + valuesDecomp *compress.Decompressor + valuesIdx *recsplit.Index + valuesBt *BtIndex + bloom *bloomFilter } -func (sf StaticFiles) Close() { +// CleanupOnError - call it on collation fail. It closing all files +func (sf StaticFiles) CleanupOnError() { if sf.valuesDecomp != nil { sf.valuesDecomp.Close() } @@ -976,12 +1127,16 @@ func (sf StaticFiles) Close() { // buildFiles performs potentially resource intensive operations of creating // static files and their indices func (d *Domain) buildFiles(ctx context.Context, step uint64, collation Collation, ps *background.ProgressSet) (StaticFiles, error) { - hStaticFiles, err := d.History.buildFiles(ctx, step, HistoryCollation{ - historyPath: collation.historyPath, - historyComp: collation.historyComp, - historyCount: collation.historyCount, - indexBitmaps: collation.indexBitmaps, - }, ps) + if d.filenameBase == AggTraceFileLife { + d.logger.Warn("[snapshots] buildFiles", "step", step, "domain", d.filenameBase) + } + + start := time.Now() + defer func() { + d.stats.LastFileBuildingTook = time.Since(start) + }() + + hStaticFiles, err := d.History.buildFiles(ctx, step, collation.HistoryCollation, ps) if err != nil { return StaticFiles{}, err } @@ -1017,43 +1172,59 @@ func (d *Domain) buildFiles(ctx context.Context, step uint64, collation Collatio valuesIdxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, step, step+1) valuesIdxPath := 
filepath.Join(d.dir, valuesIdxFileName) - { - p := ps.AddNew(valuesIdxFileName, uint64(valuesDecomp.Count()*2)) - defer ps.Delete(p) - if valuesIdx, err = buildIndexThenOpen(ctx, valuesDecomp, valuesIdxPath, d.tmpdir, collation.valuesCount, false, p, d.logger, d.noFsync); err != nil { + if !UseBpsTree { + if valuesIdx, err = buildIndexThenOpen(ctx, valuesDecomp, d.compression, valuesIdxPath, d.tmpdir, false, d.salt, ps, d.logger, d.noFsync); err != nil { return StaticFiles{}, fmt.Errorf("build %s values idx: %w", d.filenameBase, err) } } var bt *BtIndex { - btFileName := strings.TrimSuffix(valuesIdxFileName, "kvi") + "bt" + btFileName := fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, step, step+1) btPath := filepath.Join(d.dir, btFileName) - p := ps.AddNew(btFileName, uint64(valuesDecomp.Count()*2)) - defer ps.Delete(p) - bt, err = CreateBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesDecomp, p, d.tmpdir, d.logger) + bt, err = CreateBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesDecomp, d.compression, *d.salt, ps, d.tmpdir, d.logger) if err != nil { - return StaticFiles{}, fmt.Errorf("build %s values bt idx: %w", d.filenameBase, err) + return StaticFiles{}, fmt.Errorf("build %s .bt idx: %w", d.filenameBase, err) + } + } + var bloom *bloomFilter + { + fileName := fmt.Sprintf("%s.%d-%d.kvei", d.filenameBase, step, step+1) + if dir.FileExist(filepath.Join(d.dir, fileName)) { + bloom, err = OpenBloom(filepath.Join(d.dir, fileName)) + if err != nil { + return StaticFiles{}, fmt.Errorf("build %s .kvei: %w", d.filenameBase, err) + } } } - closeComp = false return StaticFiles{ - valuesDecomp: valuesDecomp, - valuesIdx: valuesIdx, - valuesBt: bt, - historyDecomp: hStaticFiles.historyDecomp, - historyIdx: hStaticFiles.historyIdx, - efHistoryDecomp: hStaticFiles.efHistoryDecomp, - efHistoryIdx: hStaticFiles.efHistoryIdx, + HistoryFiles: hStaticFiles, + valuesDecomp: valuesDecomp, + valuesIdx: valuesIdx, + valuesBt: bt, + bloom: bloom, }, nil } -func (d *Domain) 
missedIdxFiles() (l []*filesItem) { +func (d *Domain) missedBtreeIdxFiles() (l []*filesItem) { + d.files.Walk(func(items []*filesItem) bool { // don't run slow logic while iterating on btree + for _, item := range items { + fromStep, toStep := item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep + fname := fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep) + if !dir.FileExist(filepath.Join(d.dir, fname)) { + l = append(l, item) + } + } + return true + }) + return l +} +func (d *Domain) missedKviIdxFiles() (l []*filesItem) { d.files.Walk(func(items []*filesItem) bool { // don't run slow logic while iterating on btree for _, item := range items { fromStep, toStep := item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep - if !dir.FileExist(filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, fromStep, toStep))) { + if !dir.FileExist(filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, fromStep, toStep))) { l = append(l, item) } } @@ -1062,45 +1233,107 @@ func (d *Domain) missedIdxFiles() (l []*filesItem) { return l } +//func (d *Domain) missedIdxFilesBloom() (l []*filesItem) { +// d.files.Walk(func(items []*filesItem) bool { // don't run slow logic while iterating on btree +// for _, item := range items { +// fromStep, toStep := item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep +// if !dir.FileExist(filepath.Join(d.dir, fmt.Sprintf("%s.%d-%d.kvei", d.filenameBase, fromStep, toStep))) { +// l = append(l, item) +// } +// } +// return true +// }) +// return l +//} + // BuildMissedIndices - produce .efi/.vi/.kvi from .ef/.v/.kv -func (d *Domain) BuildMissedIndices(ctx context.Context, g *errgroup.Group, ps *background.ProgressSet) (err error) { +func (d *Domain) BuildMissedIndices(ctx context.Context, g *errgroup.Group, ps *background.ProgressSet) { d.History.BuildMissedIndices(ctx, g, ps) - d.InvertedIndex.BuildMissedIndices(ctx, g, ps) - for _, item := range d.missedIdxFiles() { - //TODO: build 
.kvi + for _, item := range d.missedBtreeIdxFiles() { fitem := item g.Go(func() error { - idxPath := filepath.Join(fitem.decompressor.FilePath(), fitem.decompressor.FileName()) + idxPath := fitem.decompressor.FilePath() idxPath = strings.TrimSuffix(idxPath, "kv") + "bt" - - p := ps.AddNew("fixme", uint64(fitem.decompressor.Count())) - defer ps.Delete(p) - if err := BuildBtreeIndexWithDecompressor(idxPath, fitem.decompressor, p, d.tmpdir, d.logger); err != nil { + if err := BuildBtreeIndexWithDecompressor(idxPath, fitem.decompressor, CompressNone, ps, d.tmpdir, *d.salt, d.logger); err != nil { return fmt.Errorf("failed to build btree index for %s: %w", fitem.decompressor.FileName(), err) } return nil }) } - return nil + for _, item := range d.missedKviIdxFiles() { + fitem := item + g.Go(func() error { + if UseBpsTree { + return nil + } + + idxPath := fitem.decompressor.FilePath() + idxPath = strings.TrimSuffix(idxPath, "kv") + "kvi" + ix, err := buildIndexThenOpen(ctx, fitem.decompressor, d.compression, idxPath, d.tmpdir, false, d.salt, ps, d.logger, d.noFsync) + if err != nil { + return fmt.Errorf("build %s values recsplit index: %w", d.filenameBase, err) + } + ix.Close() + return nil + }) + } + //for _, item := range d.missedIdxFilesBloom() { + // fitem := item + // g.Go(func() error { + // if UseBpsTree { + // return nil + // } + // + // idxPath := fitem.decompressor.FilePath() + // idxPath = strings.TrimSuffix(idxPath, "kv") + "ibl" + // ix, err := buildIndexThenOpen(ctx, fitem.decompressor, d.compression, idxPath, d.tmpdir, false, ps, d.logger, d.noFsync) + // if err != nil { + // return fmt.Errorf("build %s values recsplit index: %w", d.filenameBase, err) + // } + // ix.Close() + // return nil + // }) + //} } -func buildIndexThenOpen(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir string, count int, values bool, p *background.Progress, logger log.Logger, noFsync bool) (*recsplit.Index, error) { - if err := buildIndex(ctx, d, idxPath, tmpdir, 
count, values, p, logger, noFsync); err != nil { +func buildIndexThenOpen(ctx context.Context, d *compress.Decompressor, compressed FileCompression, idxPath, tmpdir string, values bool, salt *uint32, ps *background.ProgressSet, logger log.Logger, noFsync bool) (*recsplit.Index, error) { + if err := buildIndex(ctx, d, compressed, idxPath, tmpdir, values, salt, ps, logger, noFsync); err != nil { return nil, err } return recsplit.OpenIndex(idxPath) } +func buildIndexFilterThenOpen(ctx context.Context, d *compress.Decompressor, compressed FileCompression, idxPath, tmpdir string, salt *uint32, ps *background.ProgressSet, logger log.Logger, noFsync bool) (*bloomFilter, error) { + if err := buildIdxFilter(ctx, d, compressed, idxPath, tmpdir, salt, ps, logger, noFsync); err != nil { + return nil, err + } + if !dir.FileExist(idxPath) { + return nil, nil + } + return OpenBloom(idxPath) +} +func buildIndex(ctx context.Context, d *compress.Decompressor, compressed FileCompression, idxPath, tmpdir string, values bool, salt *uint32, ps *background.ProgressSet, logger log.Logger, noFsync bool) error { + _, fileName := filepath.Split(idxPath) + count := d.Count() + if !values { + count = d.Count() / 2 + } + p := ps.AddNew(fileName, uint64(count)) + defer ps.Delete(p) -func buildIndex(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir string, count int, values bool, p *background.Progress, logger log.Logger, noFsync bool) error { + defer d.EnableReadAhead().DisableReadAhead() + + g := NewArchiveGetter(d.MakeGetter(), compressed) var rs *recsplit.RecSplit var err error if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: count, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: tmpdir, - IndexFile: idxPath, + KeyCount: count, + Enums: false, + BucketSize: 2000, + LeafSize: 8, + TmpDir: tmpdir, + IndexFile: idxPath, + Salt: salt, + EtlBufLimit: etl.BufferOptimalSize / 2, }, logger); err != nil { return fmt.Errorf("create recsplit: %w", err) } @@ 
-1109,14 +1342,11 @@ func buildIndex(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir s if noFsync { rs.DisableFsync() } - defer d.EnableMadvNormal().DisableReadAhead() word := make([]byte, 0, 256) var keyPos, valPos uint64 - g := d.MakeGetter() for { if err := ctx.Err(); err != nil { - logger.Warn("recsplit index building cancelled", "err", err) return err } g.Reset(0) @@ -1131,6 +1361,7 @@ func buildIndex(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir s return fmt.Errorf("add idx key [%x]: %w", word, err) } } + // Skip value keyPos, _ = g.Skip() @@ -1151,138 +1382,148 @@ func buildIndex(ctx context.Context, d *compress.Decompressor, idxPath, tmpdir s } func (d *Domain) integrateFiles(sf StaticFiles, txNumFrom, txNumTo uint64) { - d.History.integrateFiles(HistoryFiles{ - historyDecomp: sf.historyDecomp, - historyIdx: sf.historyIdx, - efHistoryDecomp: sf.efHistoryDecomp, - efHistoryIdx: sf.efHistoryIdx, - }, txNumFrom, txNumTo) + d.History.integrateFiles(sf.HistoryFiles, txNumFrom, txNumTo) fi := newFilesItem(txNumFrom, txNumTo, d.aggregationStep) + fi.frozen = false fi.decompressor = sf.valuesDecomp fi.index = sf.valuesIdx fi.bindex = sf.valuesBt + fi.bloom = sf.bloom d.files.Set(fi) d.reCalcRoFiles() } -// [txFrom; txTo) -func (d *Domain) prune(ctx context.Context, step uint64, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { - defer func(t time.Time) { d.stats.LastPruneTook = time.Since(t) }(time.Now()) - mxPruningProgress.Inc() - defer mxPruningProgress.Dec() - - var ( - _state = "scan steps" - pos atomic.Uint64 - totalKeys uint64 - ) - - keysCursor, err := d.tx.RwCursorDupSort(d.keysTable) +// unwind is similar to prune but the difference is that it restores domain values from the history as of txFrom +func (dc *DomainContext) Unwind(ctx context.Context, rwTx kv.RwTx, step, txFrom, txTo, limit uint64, f func(step uint64, k, v []byte) error) error { + d := dc.d + keysCursorForDeletes, err := 
rwTx.RwCursorDupSort(d.keysTable) + if err != nil { + return fmt.Errorf("create %s domain cursor: %w", d.filenameBase, err) + } + defer keysCursorForDeletes.Close() + keysCursor, err := rwTx.RwCursorDupSort(d.keysTable) if err != nil { - return fmt.Errorf("%s keys cursor: %w", d.filenameBase, err) + return fmt.Errorf("create %s domain cursor: %w", d.filenameBase, err) } defer keysCursor.Close() - totalKeys, err = keysCursor.Count() + var k, v []byte + var valsC kv.RwCursor + var valsCDup kv.RwCursorDupSort + + if d.domainLargeValues { + valsC, err = rwTx.RwCursor(d.valsTable) + if err != nil { + return err + } + defer valsC.Close() + } else { + valsCDup, err = rwTx.RwCursorDupSort(d.valsTable) + if err != nil { + return err + } + defer valsCDup.Close() + } if err != nil { - return fmt.Errorf("get count of %s keys: %w", d.filenameBase, err) + return err } - var ( - k, v, stepBytes []byte - keyMaxSteps = make(map[string]uint64) - c = 0 - ) - stepBytes = make([]byte, 8) + //fmt.Printf("unwind %s txs [%d; %d) step %d\n", d.filenameBase, txFrom, txTo, step) + + stepBytes := make([]byte, 8) binary.BigEndian.PutUint64(stepBytes, ^step) + restore := d.newWriter(filepath.Join(d.tmpdir, "unwind"+d.filenameBase), true, false) + for k, v, err = keysCursor.First(); err == nil && k != nil; k, v, err = keysCursor.Next() { - if bytes.Equal(v, stepBytes) { - c++ - kl, vl, err := keysCursor.PrevDup() - if err != nil { - break + if !bytes.Equal(v, stepBytes) { + continue + } + + edgeRecords, err := d.History.unwindKey(k, txFrom, rwTx) + //fmt.Printf("unwind %x to tx %d edges %+v\n", k, txFrom, edgeRecords) + if err != nil { + return err + } + switch len(edgeRecords) { + case 1: // its value should be nil, actual value is in domain, BUT if txNum exactly match, need to restore + //fmt.Printf("recent %x txn %d '%x'\n", k, edgeRecords[0].TxNum, edgeRecords[0].Value) + if edgeRecords[0].TxNum == txFrom && edgeRecords[0].Value != nil { + d.SetTxNum(edgeRecords[0].TxNum) + if err := 
restore.addValue(k, nil, edgeRecords[0].Value); err != nil { + return err + } + } else if edgeRecords[0].TxNum < txFrom { + continue } - if kl == nil && vl == nil { + case 2: // here one first value is before txFrom (holds txNum when value was set) and second is after (actual value at that txNum) + l, r := edgeRecords[0], edgeRecords[1] + if r.TxNum >= txFrom /*&& l.TxNum < txFrom*/ && r.Value != nil { + d.SetTxNum(l.TxNum) + if err := restore.addValue(k, nil, r.Value); err != nil { + return err + } + } else { continue } - s := ^binary.BigEndian.Uint64(vl) - if s > step { - _, vn, err := keysCursor.NextDup() - if err != nil { - break + //fmt.Printf("restore %x txn [%d, %d] '%x' '%x'\n", k, l.TxNum, r.TxNum, l.Value, r.Value) + } + + seek := common.Append(k, stepBytes) + if d.domainLargeValues { + kk, vv, err := valsC.SeekExact(seek) + if err != nil { + return err + } + if f != nil { + if err := f(step, kk, vv); err != nil { + return err } - if bytes.Equal(vn, stepBytes) { - if err := keysCursor.DeleteCurrent(); err != nil { - return fmt.Errorf("prune key %x: %w", k, err) - } - mxPruneSize.Inc() - keyMaxSteps[string(k)] = s + } + if kk != nil { + //fmt.Printf("rm large value %x v %x\n", kk, vv) + if err = valsC.DeleteCurrent(); err != nil { + return err + } + } + } else { + vv, err := valsCDup.SeekBothRange(seek, stepBytes) + if err != nil { + return err + } + if f != nil { + if err := f(step, k, vv); err != nil { + return err } } + //fmt.Printf("rm %d dupes %x v %x\n", dups, seek, vv) + if err = valsCDup.DeleteCurrentDuplicates(); err != nil { + return err + } } - pos.Add(1) - if ctx.Err() != nil { - d.logger.Warn("[snapshots] prune domain cancelled", "name", d.filenameBase, "err", ctx.Err()) - return ctx.Err() + // This DeleteCurrent needs to the last in the loop iteration, because it invalidates k and v + if _, _, err = keysCursorForDeletes.SeekBothExact(k, v); err != nil { + return err } - - select { - case <-ctx.Done(): - return ctx.Err() - case <-logEvery.C: - 
d.logger.Info("[snapshots] prune domain", "name", d.filenameBase, - "stage", _state, - "range", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep)), - "progress", fmt.Sprintf("%.2f%%", (float64(pos.Load())/float64(totalKeys))*100)) - default: + if err = keysCursorForDeletes.DeleteCurrent(); err != nil { + return err } } if err != nil { - return fmt.Errorf("iterate of %s keys: %w", d.filenameBase, err) - } - - pos.Store(0) - // It is important to clean up tables in a specific order - // First keysTable, because it is the first one access in the `get` function, i.e. if the record is deleted from there, other tables will not be accessed - var valsCursor kv.RwCursor - if valsCursor, err = d.tx.RwCursor(d.valsTable); err != nil { - return fmt.Errorf("%s vals cursor: %w", d.filenameBase, err) + return fmt.Errorf("iterate over %s domain keys: %w", d.filenameBase, err) } - defer valsCursor.Close() - - for k, _, err := valsCursor.First(); err == nil && k != nil; k, _, err = valsCursor.Next() { - if bytes.HasSuffix(k, stepBytes) { - if _, ok := keyMaxSteps[string(k)]; !ok { - continue - } - if err := valsCursor.DeleteCurrent(); err != nil { - return fmt.Errorf("prune val %x: %w", k, err) - } - mxPruneSize.Inc() - } - pos.Add(1) - //_prog = 100 * (float64(pos) / float64(totalKeys)) - select { - case <-ctx.Done(): - return ctx.Err() - case <-logEvery.C: - d.logger.Info("[snapshots] prune domain", "name", d.filenameBase, "step", step) - //"steps", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep))) - default: - } - } - if err != nil { - return fmt.Errorf("iterate over %s vals: %w", d.filenameBase, err) + if err = restore.flush(ctx, rwTx); err != nil { + return err } - defer func(t time.Time) { d.stats.LastPruneHistTook = time.Since(t) }(time.Now()) + logEvery := time.NewTicker(time.Second * 30) + defer logEvery.Stop() - if err = d.History.prune(ctx, txFrom, 
txTo, limit, logEvery); err != nil { + if err := dc.hc.Prune(ctx, rwTx, txFrom, txTo, limit, logEvery); err != nil { return fmt.Errorf("prune history at step %d [%d, %d): %w", step, txFrom, txTo, err) } return nil @@ -1335,7 +1576,10 @@ func (d *Domain) warmup(ctx context.Context, txFrom, limit uint64, tx kv.Tx) err if limit != math.MaxUint64 && limit != 0 { txTo = txFrom + limit } - for ; err == nil && k != nil; k, v, err = domainKeysCursor.Next() { + for ; k != nil; k, v, err = domainKeysCursor.Next() { + if err != nil { + return fmt.Errorf("iterate over %s domain keys: %w", d.filenameBase, err) + } txNum := binary.BigEndian.Uint64(k) if txNum >= txTo { break @@ -1349,105 +1593,200 @@ func (d *Domain) warmup(ctx context.Context, txFrom, limit uint64, tx kv.Tx) err default: } } - if err != nil { - return fmt.Errorf("iterate over %s domain keys: %w", d.filenameBase, err) - } return d.History.warmup(ctx, txFrom, limit, tx) } -var COMPARE_INDEXES = false // if true, will compare values from Btree and INvertedIndex +func (d *Domain) Rotate() flusher { + hf := d.History.Rotate() + if d.wal != nil { + w := d.wal + if w.buffered { + if err := w.keys.Flush(); err != nil { + panic(err) + } + if err := w.values.Flush(); err != nil { + panic(err) + } + } + hf.d = w + d.wal = d.newWriter(d.wal.tmpdir, d.wal.buffered, d.wal.discard) + } + return hf +} + +var ( + UseBtree = true // if true, will use btree for all files +) -func (dc *DomainContext) readFromFiles(filekey []byte, fromTxNum uint64) ([]byte, bool, error) { - var val []byte - var found bool +func (dc *DomainContext) getLatestFromFilesWithExistenceIndex(filekey []byte) (v []byte, found bool, err error) { + hi, _ := dc.hc.ic.hashKey(filekey) for i := len(dc.files) - 1; i >= 0; i-- { - if dc.files[i].endTxNum < fromTxNum { - break - } - reader := dc.statelessBtree(i) - if reader.Empty() { - continue + if dc.d.withExistenceIndex && dc.files[i].src.bloom != nil { + if !dc.files[i].src.bloom.ContainsHash(hi) { + continue + 
} } - cur, err := reader.Seek(filekey) + + t := time.Now() + v, found, err = dc.getFromFile2(i, filekey) if err != nil { - //return nil, false, nil //TODO: uncomment me return nil, false, err } - if cur == nil { + if !found { + LatestStateReadGrindNotFound.UpdateDuration(t) continue } - - if bytes.Equal(cur.Key(), filekey) { - val = cur.Value() - found = true - break - } + LatestStateReadGrind.UpdateDuration(t) + return v, true, nil } - return val, found, nil + return nil, false, nil } +func (dc *DomainContext) getLatestFromFiles(filekey []byte) (v []byte, found bool, err error) { + if dc.d.withExistenceIndex { + return dc.getLatestFromFilesWithExistenceIndex(filekey) + } + + if v, found, err = dc.getLatestFromWarmFiles(filekey); err != nil { + return nil, false, err + } else if found { + return v, true, nil + } + + if v, found, err = dc.getLatestFromColdFilesGrind(filekey); err != nil { + return nil, false, err + } else if found { + return v, true, nil + } -// historyBeforeTxNum searches history for a value of specified key before txNum -// second return value is true if the value is found in the history (even if it is nil) -func (dc *DomainContext) historyBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([]byte, bool, error) { - dc.d.stats.HistoryQueries.Add(1) + // still not found, search in indexed cold shards + return dc.getLatestFromColdFiles(filekey) +} - v, found, err := dc.hc.GetNoState(key, txNum) +func (dc *DomainContext) getLatestFromWarmFiles(filekey []byte) ([]byte, bool, error) { + exactWarmStep, ok, err := dc.hc.ic.warmLocality.lookupLatest(filekey) if err != nil { return nil, false, err } - if found { - return v, true, nil + // _ = ok + if !ok { + return nil, false, nil } - var anyItem bool - var topState ctxItem - for _, item := range dc.hc.ic.files { - if item.endTxNum < txNum { + t := time.Now() + exactTxNum := exactWarmStep * dc.d.aggregationStep + for i := len(dc.files) - 1; i >= 0; i-- { + isUseful := dc.files[i].startTxNum <= exactTxNum && 
dc.files[i].endTxNum > exactTxNum + if !isUseful { + continue + } + + v, found, err := dc.getFromFile(i, filekey) + if err != nil { + return nil, false, err + } + if !found { + LatestStateReadWarmNotFound.UpdateDuration(t) + t = time.Now() continue } - anyItem = true - topState = item - break + // fmt.Printf("warm [%d] want %x keys i idx %v %v\n", i, filekey, bt.ef.Count(), bt.decompressor.FileName()) + + LatestStateReadWarm.UpdateDuration(t) + return v, found, nil } - if anyItem { - // If there were no changes but there were history files, the value can be obtained from value files - var val []byte - for i := len(dc.files) - 1; i >= 0; i-- { - if dc.files[i].startTxNum > topState.startTxNum { - continue - } - reader := dc.statelessBtree(i) - if reader.Empty() { - continue - } - cur, err := reader.Seek(key) - if err != nil { - dc.d.logger.Warn("failed to read history before from file", "key", key, "err", err) - return nil, false, err - } - if cur == nil { - continue - } - if bytes.Equal(cur.Key(), key) { - val = cur.Value() - break - } + return nil, false, nil +} + +func (dc *DomainContext) getLatestFromColdFilesGrind(filekey []byte) (v []byte, found bool, err error) { + // sometimes there is a gap between indexed cold files and indexed warm files. just grind them. 
+ // possible reasons: + // - no locality indices at all + // - cold locality index is "lazy"-built + // corner cases: + // - cold and warm segments can overlap + lastColdIndexedTxNum := dc.hc.ic.coldLocality.indexedTo() + firstWarmIndexedTxNum, haveWarmIdx := dc.hc.ic.warmLocality.indexedFrom() + if !haveWarmIdx && len(dc.files) > 0 { + firstWarmIndexedTxNum = dc.files[len(dc.files)-1].endTxNum + } + + if firstWarmIndexedTxNum <= lastColdIndexedTxNum { + return nil, false, nil + } + + t := time.Now() + //if firstWarmIndexedTxNum/dc.d.aggregationStep-lastColdIndexedTxNum/dc.d.aggregationStep > 0 && dc.d.withLocalityIndex { + // if dc.d.filenameBase != "commitment" { + // log.Warn("[dbg] gap between warm and cold locality", "cold", lastColdIndexedTxNum/dc.d.aggregationStep, "warm", firstWarmIndexedTxNum/dc.d.aggregationStep, "nil", dc.hc.ic.coldLocality == nil, "name", dc.d.filenameBase) + // if dc.hc.ic.coldLocality != nil && dc.hc.ic.coldLocality.file != nil { + // log.Warn("[dbg] gap", "cold_f", dc.hc.ic.coldLocality.file.src.bm.FileName()) + // } + // if dc.hc.ic.warmLocality != nil && dc.hc.ic.warmLocality.file != nil { + // log.Warn("[dbg] gap", "warm_f", dc.hc.ic.warmLocality.file.src.bm.FileName()) + // } + // } + //} + + for i := len(dc.files) - 1; i >= 0; i-- { + isUseful := dc.files[i].startTxNum >= lastColdIndexedTxNum && dc.files[i].endTxNum <= firstWarmIndexedTxNum + if !isUseful { + continue + } + v, ok, err := dc.getFromFile(i, filekey) + if err != nil { + return nil, false, err } - return val, true, nil + if !ok { + LatestStateReadGrindNotFound.UpdateDuration(t) + t = time.Now() + continue + } + LatestStateReadGrind.UpdateDuration(t) + return v, true, nil } - // Value not found in history files, look in the recent history - if roTx == nil { - return nil, false, fmt.Errorf("roTx is nil") + return nil, false, nil +} + +func (dc *DomainContext) getLatestFromColdFiles(filekey []byte) (v []byte, found bool, err error) { + // exactColdShard, ok, err := 
dc.hc.ic.coldLocality.lookupLatest(filekey) + // if err != nil { + // return nil, false, err + // } + // _ = ok + // if !ok { + // return nil, false, nil + // } + //dc.d.stats.FilesQuerie.Add(1) + t := time.Now() + // exactTxNum := exactColdShard * StepsInColdFile * dc.d.aggregationStep + // fmt.Printf("exactColdShard: %d, exactTxNum=%d\n", exactColdShard, exactTxNum) + for i := len(dc.files) - 1; i >= 0; i-- { + // isUseful := dc.files[i].startTxNum <= exactTxNum && dc.files[i].endTxNum > exactTxNum + //fmt.Printf("read3: %s, %t, %d-%d\n", dc.files[i].src.decompressor.FileName(), isUseful, dc.files[i].startTxNum, dc.files[i].endTxNum) + // if !isUseful { + // continue + // } + v, found, err = dc.getFromFile(i, filekey) + if err != nil { + return nil, false, err + } + if !found { + LatestStateReadColdNotFound.UpdateDuration(t) + t = time.Now() + continue + } + LatestStateReadCold.UpdateDuration(t) + return v, true, nil } - return dc.hc.getNoStateFromDB(key, txNum, roTx) + return nil, false, nil } -// GetBeforeTxNum does not always require usage of roTx. If it is possible to determine +// GetAsOf does not always require usage of roTx. If it is possible to determine // historical value based only on static files, roTx will not be used. 
-func (dc *DomainContext) GetBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { - v, hOk, err := dc.historyBeforeTxNum(key, txNum, roTx) +func (dc *DomainContext) GetAsOf(key []byte, txNum uint64, roTx kv.Tx) ([]byte, error) { + v, hOk, err := dc.hc.GetNoStateWithRecent(key, txNum, roTx) if err != nil { return nil, err } @@ -1459,38 +1798,168 @@ func (dc *DomainContext) GetBeforeTxNum(key []byte, txNum uint64, roTx kv.Tx) ([ } return v, nil } - if v, _, err = dc.get(key, txNum-1, roTx); err != nil { + v, _, err = dc.GetLatest(key, nil, roTx) + if err != nil { return nil, err } return v, nil } func (dc *DomainContext) Close() { - for _, item := range dc.files { - if item.src.frozen { + if dc.files == nil { // invariant: it's safe to call Close multiple times + return + } + files := dc.files + dc.files = nil + for i := 0; i < len(files); i++ { + if files[i].src.frozen { continue } - refCnt := item.src.refcount.Add(-1) + refCnt := files[i].src.refcount.Add(-1) //GC: last reader responsible to remove useles files: close it and delete - if refCnt == 0 && item.src.canDelete.Load() { - item.src.closeFilesAndRemove() + if refCnt == 0 && files[i].src.canDelete.Load() { + files[i].src.closeFilesAndRemove() } } + //for _, r := range dc.readers { + // r.Close() + //} dc.hc.Close() } -// IteratePrefix iterates over key-value pairs of the domain that start with given prefix -// Such iteration is not intended to be used in public API, therefore it uses read-write transaction -// inside the domain. Another version of this for public API use needs to be created, that uses -// roTx instead and supports ending the iterations before it reaches the end. 
-func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) error { - dc.d.stats.HistoryQueries.Add(1) +func (dc *DomainContext) statelessGetter(i int) ArchiveGetter { + if dc.getters == nil { + dc.getters = make([]ArchiveGetter, len(dc.files)) + } + r := dc.getters[i] + if r == nil { + r = NewArchiveGetter(dc.files[i].src.decompressor.MakeGetter(), dc.d.compression) + dc.getters[i] = r + } + return r +} + +func (dc *DomainContext) statelessIdxReader(i int) *recsplit.IndexReader { + if dc.idxReaders == nil { + dc.idxReaders = make([]*recsplit.IndexReader, len(dc.files)) + } + r := dc.idxReaders[i] + if r == nil { + r = dc.files[i].src.index.GetReaderFromPool() + dc.idxReaders[i] = r + } + return r +} + +func (dc *DomainContext) statelessBtree(i int) *BtIndex { + if dc.readers == nil { + dc.readers = make([]*BtIndex, len(dc.files)) + } + r := dc.readers[i] + if r == nil { + r = dc.files[i].src.bindex + dc.readers[i] = r + } + return r +} + +func (dc *DomainContext) valsCursor(tx kv.Tx) (c kv.Cursor, err error) { + if dc.valsC != nil { + return dc.valsC, nil + } + dc.valsC, err = tx.Cursor(dc.d.valsTable) + if err != nil { + return nil, err + } + return dc.valsC, nil +} +func (dc *DomainContext) keysCursor(tx kv.Tx) (c kv.CursorDupSort, err error) { + if dc.keysC != nil { + return dc.keysC, nil + } + dc.keysC, err = tx.CursorDupSort(dc.d.keysTable) + if err != nil { + return nil, err + } + return dc.keysC, nil +} + +func (dc *DomainContext) GetLatest(key1, key2 []byte, roTx kv.Tx) ([]byte, bool, error) { + //t := time.Now() + key := key1 + if len(key2) > 0 { + key = append(append(dc.keyBuf[:0], key1...), key2...) 
+ } + + var ( + v []byte + err error + ) + + keysC, err := dc.keysCursor(roTx) + if err != nil { + return nil, false, err + } + _, foundInvStep, err := keysC.SeekExact(key) // reads first DupSort value + if err != nil { + return nil, false, err + } + if foundInvStep != nil { + copy(dc.valKeyBuf[:], key) + copy(dc.valKeyBuf[len(key):], foundInvStep) + + switch dc.d.domainLargeValues { + case true: + valsC, err := dc.valsCursor(roTx) + if err != nil { + return nil, false, err + } + _, v, err = valsC.SeekExact(dc.valKeyBuf[:len(key)+8]) + if err != nil { + return nil, false, fmt.Errorf("GetLatest value: %w", err) + } + default: + valsDup, err := roTx.CursorDupSort(dc.d.valsTable) + if err != nil { + return nil, false, err + } + v, err = valsDup.SeekBothRange(dc.valKeyBuf[:len(key)], dc.valKeyBuf[len(key):len(key)+8]) + if err != nil { + return nil, false, fmt.Errorf("GetLatest value: %w", err) + } + } + + //LatestStateReadDB.UpdateDuration(t) + return v, true, nil + } + //LatestStateReadDBNotFound.UpdateDuration(t) + + v, found, err := dc.getLatestFromFiles(key) + if err != nil { + return nil, false, err + } + return v, found, nil +} + +func (dc *DomainContext) IteratePrefix(roTx kv.Tx, prefix []byte, it func(k, v []byte)) error { var cp CursorHeap heap.Init(&cp) var k, v []byte var err error - keysCursor, err := dc.d.tx.CursorDupSort(dc.d.keysTable) + + //iter := sd.storage.Iter() + //if iter.Seek(string(prefix)) { + // kx := iter.Key() + // v = iter.Value() + // k = []byte(kx) + // + // if len(kx) > 0 && bytes.HasPrefix(k, prefix) { + // heap.Push(&cp, &CursorItem{t: RAM_CURSOR, key: common.Copy(k), val: common.Copy(v), iter: iter, endTxNum: sd.txNum.Load(), reverse: true}) + // } + //} + + keysCursor, err := roTx.CursorDupSort(dc.d.keysTable) if err != nil { return err } @@ -1498,54 +1967,89 @@ func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) erro if k, v, err = keysCursor.Seek(prefix); err != nil { return err } - if bytes.HasPrefix(k, 
prefix) { + if k != nil && bytes.HasPrefix(k, prefix) { keySuffix := make([]byte, len(k)+8) copy(keySuffix, k) copy(keySuffix[len(k):], v) step := ^binary.BigEndian.Uint64(v) txNum := step * dc.d.aggregationStep - if v, err = dc.d.tx.GetOne(dc.d.valsTable, keySuffix); err != nil { + if v, err = roTx.GetOne(dc.d.valsTable, keySuffix); err != nil { return err } - heap.Push(&cp, &CursorItem{t: DB_CURSOR, key: common.Copy(k), val: common.Copy(v), c: keysCursor, endTxNum: txNum, reverse: true}) + heap.Push(&cp, &CursorItem{t: DB_CURSOR, key: k, val: v, c: keysCursor, endTxNum: txNum + dc.d.aggregationStep, reverse: true}) } for i, item := range dc.files { - bg := dc.statelessBtree(i) - if bg.Empty() { - continue - } - - cursor, err := bg.Seek(prefix) - if err != nil { - continue - } - - g := dc.statelessGetter(i) - key := cursor.Key() - if bytes.HasPrefix(key, prefix) { - val := cursor.Value() - heap.Push(&cp, &CursorItem{t: FILE_CURSOR, key: key, val: val, dg: g, endTxNum: item.endTxNum, reverse: true}) + if UseBtree || UseBpsTree { + cursor, err := dc.statelessBtree(i).Seek(dc.statelessGetter(i), prefix) + if err != nil { + return err + } + if cursor == nil { + continue + } + dc.d.stats.FilesQueries.Add(1) + key := cursor.Key() + if key != nil && bytes.HasPrefix(key, prefix) { + val := cursor.Value() + heap.Push(&cp, &CursorItem{t: FILE_CURSOR, dg: dc.statelessGetter(i), key: key, val: val, btCursor: cursor, endTxNum: item.endTxNum, reverse: true}) + } + } else { + ir := dc.statelessIdxReader(i) + offset := ir.Lookup(prefix) + g := dc.statelessGetter(i) + g.Reset(offset) + if !g.HasNext() { + continue + } + key, _ := g.Next(nil) + dc.d.stats.FilesQueries.Add(1) + if key != nil && bytes.HasPrefix(key, prefix) { + val, lofft := g.Next(nil) + heap.Push(&cp, &CursorItem{t: FILE_CURSOR, dg: g, latestOffset: lofft, key: key, val: val, endTxNum: item.endTxNum, reverse: true}) + } } } + for cp.Len() > 0 { lastKey := common.Copy(cp[0].key) lastVal := common.Copy(cp[0].val) // 
Advance all the items that have this key (including the top) for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] + ci1 := heap.Pop(&cp).(*CursorItem) + //if string(ci1.key) == string(hexutility.MustDecodeString("301f9a245a0adeb61835403f6fd256dd96d103942d747c6d41e95a5d655bc20ab0fac941c854894cc0ed84cdaf557374b49ed723")) { + // fmt.Printf("found %x\n", ci1.key) + //} switch ci1.t { + //case RAM_CURSOR: + // if ci1.iter.Next() { + // k = []byte(ci1.iter.Key()) + // if k != nil && bytes.HasPrefix(k, prefix) { + // ci1.key = common.Copy(k) + // ci1.val = common.Copy(ci1.iter.Value()) + // } + // } + // heap.Push(&cp, ci1) case FILE_CURSOR: - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.Next(ci1.key[:0]) - if bytes.HasPrefix(ci1.key, prefix) { - ci1.val, _ = ci1.dg.Next(ci1.val[:0]) - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) + if UseBtree || UseBpsTree { + if ci1.btCursor.Next() { + ci1.key = ci1.btCursor.Key() + if ci1.key != nil && bytes.HasPrefix(ci1.key, prefix) { + ci1.val = ci1.btCursor.Value() + heap.Push(&cp, ci1) + } } } else { - heap.Pop(&cp) + ci1.dg.Reset(ci1.latestOffset) + if !ci1.dg.HasNext() { + break + } + key, _ := ci1.dg.Next(nil) + if key != nil && bytes.HasPrefix(key, prefix) { + ci1.key = key + ci1.val, ci1.latestOffset = ci1.dg.Next(nil) + heap.Push(&cp, ci1) + } } case DB_CURSOR: k, v, err = ci1.c.NextNoDup() @@ -1553,23 +2057,586 @@ func (dc *DomainContext) IteratePrefix(prefix []byte, it func(k, v []byte)) erro return err } if k != nil && bytes.HasPrefix(k, prefix) { + ci1.key = k + keySuffix := make([]byte, len(k)+8) + copy(keySuffix, k) + copy(keySuffix[len(k):], v) + if v, err = roTx.GetOne(dc.d.valsTable, keySuffix); err != nil { + return err + } + ci1.val = v + heap.Push(&cp, ci1) + } + } + } + if len(lastVal) > 0 { + it(lastKey, lastVal) + } + } + return nil +} + +func (dc *DomainContext) DomainRange(tx kv.Tx, fromKey, toKey []byte, ts uint64, asc order.By, limit int) (it iter.KV, err error) { + if !asc { + 
panic("implement me") + } + //histStateIt, err := tx.aggCtx.AccountHistoricalStateRange(asOfTs, fromKey, toKey, limit, tx.MdbxTx) + //if err != nil { + // return nil, err + //} + //lastestStateIt, err := tx.aggCtx.DomainRangeLatest(tx.MdbxTx, kv.AccountDomain, fromKey, toKey, limit) + //if err != nil { + // return nil, err + //} + histStateIt, err := dc.hc.WalkAsOf(ts, fromKey, toKey, tx, limit) + if err != nil { + return nil, err + } + lastestStateIt, err := dc.DomainRangeLatest(tx, fromKey, toKey, limit) + if err != nil { + return nil, err + } + return iter.UnionKV(histStateIt, lastestStateIt, limit), nil +} + +func (dc *DomainContext) IteratePrefix2(roTx kv.Tx, fromKey, toKey []byte, limit int) (iter.KV, error) { + return dc.DomainRangeLatest(roTx, fromKey, toKey, limit) +} + +func (dc *DomainContext) DomainRangeLatest(roTx kv.Tx, fromKey, toKey []byte, limit int) (iter.KV, error) { + fit := &DomainLatestIterFile{from: fromKey, to: toKey, limit: limit, dc: dc, + roTx: roTx, + idxKeysTable: dc.d.keysTable, + h: &CursorHeap{}, + } + if err := fit.init(dc); err != nil { + return nil, err + } + return fit, nil +} + +func (dc *DomainContext) CanPruneFrom(tx kv.Tx) uint64 { + fst, _ := kv.FirstKey(tx, dc.d.indexKeysTable) + //fst2, _ := kv.FirstKey(tx, dc.d.keysTable) + //if len(fst) > 0 && len(fst2) > 0 { + // fstInDb := binary.BigEndian.Uint64(fst) + // fstInDb2 := binary.BigEndian.Uint64(fst2) + // return cmp.Min(fstInDb, fstInDb2) + //} + if len(fst) > 0 { + fstInDb := binary.BigEndian.Uint64(fst) + return cmp.Min(fstInDb, math.MaxUint64) + } + return math.MaxUint64 +} + +func (dc *DomainContext) CanPrune(tx kv.Tx) bool { + return dc.CanPruneFrom(tx) < dc.maxTxNumInFiles(false) +} + +// history prunes keys in range [txFrom; txTo), domain prunes any records with rStep <= step. +// In case of context cancellation pruning stops and returns error, but simply could be started again straight away. 
+func (dc *DomainContext) Prune(ctx context.Context, rwTx kv.RwTx, step, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { + if !dc.CanPrune(rwTx) { + return nil + } + + st := time.Now() + mxPruneInProgress.Inc() + defer mxPruneInProgress.Dec() + + keysCursorForDeletes, err := rwTx.RwCursorDupSort(dc.d.keysTable) + if err != nil { + return fmt.Errorf("create %s domain cursor: %w", dc.d.filenameBase, err) + } + defer keysCursorForDeletes.Close() + keysCursor, err := rwTx.RwCursorDupSort(dc.d.keysTable) + if err != nil { + return fmt.Errorf("create %s domain cursor: %w", dc.d.filenameBase, err) + } + defer keysCursor.Close() + + var ( + k, v []byte + prunedKeys uint64 + prunedMaxStep uint64 + prunedMinStep = uint64(math.MaxUint64) + seek = make([]byte, 0, 256) + valsDup kv.RwCursorDupSort + ) + + if !dc.d.domainLargeValues { + valsDup, err = rwTx.RwCursorDupSort(dc.d.valsTable) + if err != nil { + return err + } + defer valsDup.Close() + } + + for k, v, err = keysCursor.Last(); k != nil; k, v, err = keysCursor.Prev() { + if err != nil { + return fmt.Errorf("iterate over %s domain keys: %w", dc.d.filenameBase, err) + } + is := ^binary.BigEndian.Uint64(v) + if is > step { + continue + } + if limit == 0 { + return nil + } + limit-- + + k, v, err = keysCursorForDeletes.SeekBothExact(k, v) + if err != nil { + return err + } + seek = append(append(seek[:0], k...), v...) 
+ //if bytes.HasPrefix(seek, hexutility.MustDecodeString("1a4a4de8fe37b308fea3eb786195af8c813e18f8196bcb830a40cd57f169692572197d70495a7c6d0184c5093dcc960e1384239e")) { + // fmt.Printf("prune key: %x->%x [%x] step %d dom %s\n", k, v, seek, ^binary.BigEndian.Uint64(v), dc.d.filenameBase) + //} + //fmt.Printf("prune key: %x->%x [%x] step %d dom %s\n", k, v, seek, ^binary.BigEndian.Uint64(v), dc.d.filenameBase) + + mxPruneSizeDomain.Inc() + prunedKeys++ + + if dc.d.domainLargeValues { + //if bytes.HasPrefix(seek, hexutility.MustDecodeString("1a4a4de8fe37b308fea3eb786195af8c813e18f8196bcb830a40cd57f169692572197d70495a7c6d0184c5093dcc960e1384239e")) { + // fmt.Printf("prune value: %x step %d dom %s\n", seek, ^binary.BigEndian.Uint64(v), dc.d.filenameBase) + //} + //fmt.Printf("prune value: %x step %d dom %s\n", seek, ^binary.BigEndian.Uint64(v), dc.d.filenameBase) + err = rwTx.Delete(dc.d.valsTable, seek) + } else { + sv, err := valsDup.SeekBothRange(seek[:len(k)], seek[len(k):len(k)+len(v)]) + if err != nil { + return err + } + if bytes.HasPrefix(sv, v) { + //fmt.Printf("prune value: %x->%x, step %d dom %s\n", k, sv, ^binary.BigEndian.Uint64(v), dc.d.filenameBase) + err = valsDup.DeleteCurrent() + if err != nil { + return err + } + } + } + if err != nil { + return fmt.Errorf("prune domain value: %w", err) + } + + if err = keysCursorForDeletes.DeleteCurrent(); err != nil { // invalidates kk, vv + return err + } + + if is < prunedMinStep { + prunedMinStep = is + } + if is > prunedMaxStep { + prunedMaxStep = is + } + + select { + case <-ctx.Done(): + return ctx.Err() + case <-logEvery.C: + dc.d.logger.Info("[snapshots] prune domain", "name", dc.d.filenameBase, "step", step, + "steps", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(dc.d.aggregationStep), float64(txTo)/float64(dc.d.aggregationStep))) + default: + } + } + if prunedMinStep == math.MaxUint64 { + prunedMinStep = 0 + } // minMax pruned step doesn't mean that we pruned all kv pairs for those step - we just 
pruned some keys of those steps. + + dc.d.logger.Info("[snapshots] prune domain", "name", dc.d.filenameBase, "step range", fmt.Sprintf("[%d, %d] requested %d", prunedMinStep, prunedMaxStep, step), "pruned keys", prunedKeys) + mxPruneTookDomain.UpdateDuration(st) + + if err := dc.hc.Prune(ctx, rwTx, txFrom, txTo, limit, logEvery); err != nil { + return fmt.Errorf("prune history at step %d [%d, %d): %w", step, txFrom, txTo, err) + } + return nil +} + +type DomainLatestIterFile struct { + dc *DomainContext + + roTx kv.Tx + idxKeysTable string + + limit int + + from, to []byte + nextVal []byte + nextKey []byte + + h *CursorHeap + + k, v, kBackup, vBackup []byte +} + +func (hi *DomainLatestIterFile) Close() { +} +func (hi *DomainLatestIterFile) init(dc *DomainContext) error { + heap.Init(hi.h) + var k, v []byte + var err error + + keysCursor, err := hi.roTx.CursorDupSort(dc.d.keysTable) + if err != nil { + return err + } + if k, v, err = keysCursor.Seek(hi.from); err != nil { + return err + } + if k != nil && (hi.to == nil || bytes.Compare(k, hi.to) < 0) { + keySuffix := make([]byte, len(k)+8) + copy(keySuffix, k) + copy(keySuffix[len(k):], v) + step := ^binary.BigEndian.Uint64(v) + txNum := step * dc.d.aggregationStep + if v, err = hi.roTx.GetOne(dc.d.valsTable, keySuffix); err != nil { + return err + } + heap.Push(hi.h, &CursorItem{t: DB_CURSOR, key: common.Copy(k), val: common.Copy(v), c: keysCursor, endTxNum: txNum, reverse: true}) + } + + for i, item := range dc.files { + btCursor, err := dc.statelessBtree(i).Seek(dc.statelessGetter(i), hi.from) + if err != nil { + return err + } + if btCursor == nil { + continue + } + + key := btCursor.Key() + if key != nil && (hi.to == nil || bytes.Compare(key, hi.to) < 0) { + val := btCursor.Value() + heap.Push(hi.h, &CursorItem{t: FILE_CURSOR, key: key, val: val, btCursor: btCursor, endTxNum: item.endTxNum, reverse: true}) + } + } + return hi.advanceInFiles() +} + +func (hi *DomainLatestIterFile) advanceInFiles() error { + for 
hi.h.Len() > 0 { + lastKey := (*hi.h)[0].key + lastVal := (*hi.h)[0].val + + // Advance all the items that have this key (including the top) + for hi.h.Len() > 0 && bytes.Equal((*hi.h)[0].key, lastKey) { + ci1 := heap.Pop(hi.h).(*CursorItem) + switch ci1.t { + case FILE_CURSOR: + if ci1.btCursor.Next() { + ci1.key = ci1.btCursor.Key() + ci1.val = ci1.btCursor.Value() + if ci1.key != nil && (hi.to == nil || bytes.Compare(ci1.key, hi.to) < 0) { + heap.Push(hi.h, ci1) + } + } + case DB_CURSOR: + k, v, err := ci1.c.NextNoDup() + if err != nil { + return err + } + if k != nil && (hi.to == nil || bytes.Compare(k, hi.to) < 0) { ci1.key = common.Copy(k) keySuffix := make([]byte, len(k)+8) copy(keySuffix, k) copy(keySuffix[len(k):], v) - if v, err = dc.d.tx.GetOne(dc.d.valsTable, keySuffix); err != nil { + if v, err = hi.roTx.GetOne(hi.dc.d.valsTable, keySuffix); err != nil { return err } ci1.val = common.Copy(v) - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) + heap.Push(hi.h, ci1) } } } if len(lastVal) > 0 { - it(lastKey, lastVal) + hi.nextKey, hi.nextVal = lastKey, lastVal + return nil // founc } } + hi.nextKey = nil return nil } + +func (hi *DomainLatestIterFile) HasNext() bool { + return hi.limit != 0 && hi.nextKey != nil +} + +func (hi *DomainLatestIterFile) Next() ([]byte, []byte, error) { + hi.limit-- + hi.k, hi.v = append(hi.k[:0], hi.nextKey...), append(hi.v[:0], hi.nextVal...) 
+ + // Satisfy iter.Dual Invariant 2 + hi.k, hi.kBackup, hi.v, hi.vBackup = hi.kBackup, hi.k, hi.vBackup, hi.v + if err := hi.advanceInFiles(); err != nil { + return nil, nil, err + } + return hi.kBackup, hi.vBackup, nil +} + +func (d *Domain) stepsRangeInDBAsStr(tx kv.Tx) string { + a1, a2 := d.History.InvertedIndex.stepsRangeInDB(tx) + //ad1, ad2 := d.stepsRangeInDB(tx) + //if ad2-ad1 < 0 { + // fmt.Printf("aaa: %f, %f\n", ad1, ad2) + //} + return fmt.Sprintf("%s:%.1f", d.filenameBase, a2-a1) +} +func (d *Domain) stepsRangeInDB(tx kv.Tx) (from, to float64) { + if d.domainLargeValues { + fst, _ := kv.FirstKey(tx, d.valsTable) + if len(fst) > 0 { + to = float64(^binary.BigEndian.Uint64(fst[len(fst)-8:])) + } + lst, _ := kv.LastKey(tx, d.valsTable) + if len(lst) > 0 { + from = float64(^binary.BigEndian.Uint64(lst[len(lst)-8:])) + } + if to == 0 { + to = from + } + } else { + c, err := tx.Cursor(d.valsTable) + if err != nil { + return 0, 0 + } + _, fst, _ := c.First() + if len(fst) > 0 { + to = float64(^binary.BigEndian.Uint64(fst[:8])) + } + _, lst, _ := c.Last() + if len(lst) > 0 { + from = float64(^binary.BigEndian.Uint64(lst[:8])) + } + c.Close() + if to == 0 { + to = from + } + } + return from, to +} + +func (dc *DomainContext) Files() (res []string) { + for _, item := range dc.files { + if item.src.decompressor != nil { + res = append(res, item.src.decompressor.FileName()) + } + } + return append(res, dc.hc.Files()...) 
+} + +type Ranges struct { + accounts DomainRanges + storage DomainRanges + code DomainRanges + commitment DomainRanges +} + +func (r Ranges) String() string { + return fmt.Sprintf("accounts=%s, storage=%s, code=%s, commitment=%s", r.accounts.String(), r.storage.String(), r.code.String(), r.commitment.String()) +} + +func (r Ranges) any() bool { + return r.accounts.any() || r.storage.any() || r.code.any() || r.commitment.any() +} + +type SelectedStaticFiles struct { + accounts []*filesItem + accountsIdx []*filesItem + accountsHist []*filesItem + storage []*filesItem + storageIdx []*filesItem + storageHist []*filesItem + code []*filesItem + codeIdx []*filesItem + codeHist []*filesItem + commitment []*filesItem + commitmentIdx []*filesItem + commitmentHist []*filesItem + codeI int + storageI int + accountsI int + commitmentI int +} + +func (sf SelectedStaticFiles) FillV3(s *SelectedStaticFilesV3) SelectedStaticFiles { + sf.accounts, sf.accountsIdx, sf.accountsHist = s.accounts, s.accountsIdx, s.accountsHist + sf.storage, sf.storageIdx, sf.storageHist = s.storage, s.storageIdx, s.storageHist + sf.code, sf.codeIdx, sf.codeHist = s.code, s.codeIdx, s.codeHist + sf.commitment, sf.commitmentIdx, sf.commitmentHist = s.commitment, s.commitmentIdx, s.commitmentHist + sf.codeI, sf.accountsI, sf.storageI, sf.commitmentI = s.codeI, s.accountsI, s.storageI, s.commitmentI + return sf +} + +func (sf SelectedStaticFiles) Close() { + for _, group := range [][]*filesItem{ + sf.accounts, sf.accountsIdx, sf.accountsHist, + sf.storage, sf.storageIdx, sf.storageHist, + sf.code, sf.codeIdx, sf.codeHist, + sf.commitment, sf.commitmentIdx, sf.commitmentHist, + } { + for _, item := range group { + if item != nil { + if item.decompressor != nil { + item.decompressor.Close() + } + if item.index != nil { + item.index.Close() + } + if item.bindex != nil { + item.bindex.Close() + } + } + } + } +} + +type MergedFiles struct { + accounts *filesItem + accountsIdx, accountsHist *filesItem + storage 
*filesItem + storageIdx, storageHist *filesItem + code *filesItem + codeIdx, codeHist *filesItem + commitment *filesItem + commitmentIdx, commitmentHist *filesItem +} + +func (mf MergedFiles) FillV3(m *MergedFilesV3) MergedFiles { + mf.accounts, mf.accountsIdx, mf.accountsHist = m.accounts, m.accountsIdx, m.accountsHist + mf.storage, mf.storageIdx, mf.storageHist = m.storage, m.storageIdx, m.storageHist + mf.code, mf.codeIdx, mf.codeHist = m.code, m.codeIdx, m.codeHist + mf.commitment, mf.commitmentIdx, mf.commitmentHist = m.commitment, m.commitmentIdx, m.commitmentHist + return mf +} + +func (mf MergedFiles) Close() { + for _, item := range []*filesItem{ + mf.accounts, mf.accountsIdx, mf.accountsHist, + mf.storage, mf.storageIdx, mf.storageHist, + mf.code, mf.codeIdx, mf.codeHist, + mf.commitment, mf.commitmentIdx, mf.commitmentHist, + //mf.logAddrs, mf.logTopics, mf.tracesFrom, mf.tracesTo, + } { + if item != nil { + if item.decompressor != nil { + item.decompressor.Close() + } + if item.decompressor != nil { + item.index.Close() + } + if item.bindex != nil { + item.bindex.Close() + } + } + } +} + +func DecodeAccountBytes(enc []byte) (nonce uint64, balance *uint256.Int, hash []byte) { + if len(enc) == 0 { + return + } + pos := 0 + nonceBytes := int(enc[pos]) + balance = uint256.NewInt(0) + pos++ + if nonceBytes > 0 { + nonce = bytesToUint64(enc[pos : pos+nonceBytes]) + pos += nonceBytes + } + balanceBytes := int(enc[pos]) + pos++ + if balanceBytes > 0 { + balance.SetBytes(enc[pos : pos+balanceBytes]) + pos += balanceBytes + } + codeHashBytes := int(enc[pos]) + pos++ + if codeHashBytes == length.Hash { + hash = make([]byte, codeHashBytes) + copy(hash, enc[pos:pos+codeHashBytes]) + pos += codeHashBytes + } + if pos >= len(enc) { + panic(fmt.Errorf("deserialse2: %d >= %d ", pos, len(enc))) + } + return +} + +func EncodeAccountBytes(nonce uint64, balance *uint256.Int, hash []byte, incarnation uint64) []byte { + l := int(1) + if nonce > 0 { + l += 
common.BitLenToByteLen(bits.Len64(nonce)) + } + l++ + if !balance.IsZero() { + l += balance.ByteLen() + } + l++ + if len(hash) == length.Hash { + l += 32 + } + l++ + if incarnation > 0 { + l += common.BitLenToByteLen(bits.Len64(incarnation)) + } + value := make([]byte, l) + pos := 0 + + if nonce == 0 { + value[pos] = 0 + pos++ + } else { + nonceBytes := common.BitLenToByteLen(bits.Len64(nonce)) + value[pos] = byte(nonceBytes) + var nonce = nonce + for i := nonceBytes; i > 0; i-- { + value[pos+i] = byte(nonce) + nonce >>= 8 + } + pos += nonceBytes + 1 + } + if balance.IsZero() { + value[pos] = 0 + pos++ + } else { + balanceBytes := balance.ByteLen() + value[pos] = byte(balanceBytes) + pos++ + balance.WriteToSlice(value[pos : pos+balanceBytes]) + pos += balanceBytes + } + if len(hash) == 0 { + value[pos] = 0 + pos++ + } else { + value[pos] = 32 + pos++ + copy(value[pos:pos+32], hash) + pos += 32 + } + if incarnation == 0 { + value[pos] = 0 + } else { + incBytes := common.BitLenToByteLen(bits.Len64(incarnation)) + value[pos] = byte(incBytes) + var inc = incarnation + for i := incBytes; i > 0; i-- { + value[pos+i] = byte(inc) + inc >>= 8 + } + } + return value +} + +func bytesToUint64(buf []byte) (x uint64) { + for i, b := range buf { + x = x<<8 + uint64(b) + if i == 7 { + return + } + } + return +} diff --git a/state/domain_committed.go b/state/domain_committed.go index 9c046975c..5c8841729 100644 --- a/state/domain_committed.go +++ b/state/domain_committed.go @@ -18,24 +18,21 @@ package state import ( "bytes" - "container/heap" - "context" "encoding/binary" "fmt" "hash" - "path/filepath" - "strings" "time" + "github.com/c2h5oh/datasize" "github.com/google/btree" - "github.com/ledgerwatch/erigon-lib/common/background" - "github.com/ledgerwatch/log/v3" "golang.org/x/crypto/sha3" "github.com/ledgerwatch/erigon-lib/commitment" "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/common/cryptozerocopy" + 
"github.com/ledgerwatch/erigon-lib/common/dbg" "github.com/ledgerwatch/erigon-lib/common/length" - "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/etl" ) // Defines how to evaluate commitments @@ -75,153 +72,172 @@ func ParseCommitmentMode(s string) CommitmentMode { type ValueMerger func(prev, current []byte) (merged []byte, err error) -type DomainCommitted struct { - *Domain - mode CommitmentMode - trace bool - commTree *btree.BTreeG[*CommitmentItem] - keccak hash.Hash - patriciaTrie commitment.Trie - branchMerger *commitment.BranchMerger - - comKeys uint64 - comTook time.Duration - logger log.Logger +type UpdateTree struct { + tree *btree.BTreeG[*commitmentItem] + keccak cryptozerocopy.KeccakState + keys etl.Buffer + mode CommitmentMode } -func NewCommittedDomain(d *Domain, mode CommitmentMode, trieVariant commitment.TrieVariant, logger log.Logger) *DomainCommitted { - return &DomainCommitted{ - Domain: d, - patriciaTrie: commitment.InitializeTrie(trieVariant), - commTree: btree.NewG[*CommitmentItem](32, commitmentItemLess), - keccak: sha3.NewLegacyKeccak256(), - mode: mode, - branchMerger: commitment.NewHexBranchMerger(8192), - logger: logger, +func NewUpdateTree(m CommitmentMode) *UpdateTree { + return &UpdateTree{ + tree: btree.NewG[*commitmentItem](64, commitmentItemLessPlain), + keccak: sha3.NewLegacyKeccak256().(cryptozerocopy.KeccakState), + keys: etl.NewOldestEntryBuffer(datasize.MB * 32), + mode: m, } } -func (d *DomainCommitted) SetCommitmentMode(m CommitmentMode) { d.mode = m } +func (t *UpdateTree) get(key []byte) (*commitmentItem, bool) { + c := &commitmentItem{plainKey: key, update: commitment.Update{CodeHashOrStorage: commitment.EmptyCodeHashArray}} + el, ok := t.tree.Get(c) + if ok { + return el, true + } + c.plainKey = common.Copy(c.plainKey) + return c, false +} // TouchPlainKey marks plainKey as updated and applies different fn for different key types // (different behaviour for Code, Account and Storage key 
modifications). -func (d *DomainCommitted) TouchPlainKey(key, val []byte, fn func(c *CommitmentItem, val []byte)) { - if d.mode == CommitmentModeDisabled { - return - } - c := &CommitmentItem{plainKey: common.Copy(key), hashedKey: d.hashAndNibblizeKey(key)} - if d.mode > CommitmentModeDirect { - fn(c, val) +func (t *UpdateTree) TouchPlainKey(key, val []byte, fn func(c *commitmentItem, val []byte)) { + switch t.mode { + case CommitmentModeUpdate: + item, _ := t.get(key) + fn(item, val) + t.tree.ReplaceOrInsert(item) + case CommitmentModeDirect: + t.keys.Put(key, nil) + default: } - d.commTree.ReplaceOrInsert(c) } -func (d *DomainCommitted) TouchPlainKeyAccount(c *CommitmentItem, val []byte) { +func (t *UpdateTree) Size() uint64 { + return uint64(t.keys.Len()) +} + +func (t *UpdateTree) TouchAccount(c *commitmentItem, val []byte) { if len(val) == 0 { c.update.Flags = commitment.DeleteUpdate return } - c.update.DecodeForStorage(val) - c.update.Flags = commitment.BalanceUpdate | commitment.NonceUpdate - item, found := d.commTree.Get(&CommitmentItem{hashedKey: c.hashedKey}) - if !found { - return + if c.update.Flags&commitment.DeleteUpdate != 0 { + c.update.Flags ^= commitment.DeleteUpdate } - if item.update.Flags&commitment.CodeUpdate != 0 { - c.update.Flags |= commitment.CodeUpdate - copy(c.update.CodeHashOrStorage[:], item.update.CodeHashOrStorage[:]) + nonce, balance, chash := DecodeAccountBytes(val) + if c.update.Nonce != nonce { + c.update.Nonce = nonce + c.update.Flags |= commitment.NonceUpdate } + if !c.update.Balance.Eq(balance) { + c.update.Balance.Set(balance) + c.update.Flags |= commitment.BalanceUpdate + } + if !bytes.Equal(chash, c.update.CodeHashOrStorage[:]) { + if len(chash) == 0 { + c.update.ValLength = length.Hash + copy(c.update.CodeHashOrStorage[:], commitment.EmptyCodeHash) + } else { + copy(c.update.CodeHashOrStorage[:], chash) + c.update.ValLength = length.Hash + c.update.Flags |= commitment.CodeUpdate + } + } +} + +func (t *UpdateTree) 
UpdatePrefix(prefix, val []byte, fn func(c *commitmentItem, val []byte)) { + t.tree.AscendGreaterOrEqual(&commitmentItem{}, func(item *commitmentItem) bool { + if !bytes.HasPrefix(item.plainKey, prefix) { + return false + } + fn(item, val) + return true + }) } -func (d *DomainCommitted) TouchPlainKeyStorage(c *CommitmentItem, val []byte) { +func (t *UpdateTree) TouchStorage(c *commitmentItem, val []byte) { c.update.ValLength = len(val) if len(val) == 0 { c.update.Flags = commitment.DeleteUpdate } else { - c.update.Flags = commitment.StorageUpdate + c.update.Flags |= commitment.StorageUpdate copy(c.update.CodeHashOrStorage[:], val) } } -func (d *DomainCommitted) TouchPlainKeyCode(c *CommitmentItem, val []byte) { - c.update.Flags = commitment.CodeUpdate - item, found := d.commTree.Get(c) - if !found { - d.keccak.Reset() - d.keccak.Write(val) - copy(c.update.CodeHashOrStorage[:], d.keccak.Sum(nil)) +func (t *UpdateTree) TouchCode(c *commitmentItem, val []byte) { + t.keccak.Reset() + t.keccak.Write(val) + t.keccak.Read(c.update.CodeHashOrStorage[:]) + if c.update.Flags == commitment.DeleteUpdate && len(val) == 0 { + c.update.Flags = commitment.DeleteUpdate + c.update.ValLength = 0 return } - if item.update.Flags&commitment.BalanceUpdate != 0 { - c.update.Flags |= commitment.BalanceUpdate - c.update.Balance.Set(&item.update.Balance) - } - if item.update.Flags&commitment.NonceUpdate != 0 { - c.update.Flags |= commitment.NonceUpdate - c.update.Nonce = item.update.Nonce - } - if item.update.Flags == commitment.DeleteUpdate && len(val) == 0 { - c.update.Flags = commitment.DeleteUpdate - } else { - d.keccak.Reset() - d.keccak.Write(val) - copy(c.update.CodeHashOrStorage[:], d.keccak.Sum(nil)) + c.update.ValLength = length.Hash + if len(val) != 0 { + c.update.Flags |= commitment.CodeUpdate } } -type CommitmentItem struct { - plainKey []byte - hashedKey []byte - update commitment.Update -} - -func commitmentItemLess(i, j *CommitmentItem) bool { - return 
bytes.Compare(i.hashedKey, j.hashedKey) < 0 -} - // Returns list of both plain and hashed keys. If .mode is CommitmentModeUpdate, updates also returned. -func (d *DomainCommitted) TouchedKeyList() ([][]byte, [][]byte, []commitment.Update) { - plainKeys := make([][]byte, d.commTree.Len()) - hashedKeys := make([][]byte, d.commTree.Len()) - updates := make([]commitment.Update, d.commTree.Len()) - - j := 0 - d.commTree.Ascend(func(item *CommitmentItem) bool { - plainKeys[j] = item.plainKey - hashedKeys[j] = item.hashedKey - updates[j] = item.update - j++ - return true - }) +func (t *UpdateTree) List(clear bool) ([][]byte, []commitment.Update) { + switch t.mode { + case CommitmentModeDirect: + plainKeys := make([][]byte, t.keys.Len()) + t.keys.Sort() - d.commTree.Clear(true) - return plainKeys, hashedKeys, updates + keyBuf := make([]byte, 0) + for i := 0; i < len(plainKeys); i++ { + key, _ := t.keys.Get(i, keyBuf, nil) + plainKeys[i] = common.Copy(key) + } + if clear { + t.keys.Reset() + } + return plainKeys, nil + case CommitmentModeUpdate: + plainKeys := make([][]byte, t.tree.Len()) + updates := make([]commitment.Update, t.tree.Len()) + i := 0 + t.tree.Ascend(func(item *commitmentItem) bool { + plainKeys[i], updates[i] = item.plainKey, item.update + i++ + return true + }) + if clear { + t.tree.Clear(true) + } + return plainKeys, updates + default: + return nil, nil + } } -// TODO(awskii): let trie define hashing function -func (d *DomainCommitted) hashAndNibblizeKey(key []byte) []byte { - hashedKey := make([]byte, length.Hash) - - d.keccak.Reset() - d.keccak.Write(key[:length.Addr]) - copy(hashedKey[:length.Hash], d.keccak.Sum(nil)) - - if len(key[length.Addr:]) > 0 { - hashedKey = append(hashedKey, make([]byte, length.Hash)...) 
- d.keccak.Reset() - d.keccak.Write(key[length.Addr:]) - copy(hashedKey[length.Hash:], d.keccak.Sum(nil)) - } +type DomainCommitted struct { + *Domain + trace bool + updates *UpdateTree + mode CommitmentMode + patriciaTrie commitment.Trie + branchMerger *commitment.BranchMerger + prevState []byte + discard bool +} - nibblized := make([]byte, len(hashedKey)*2) - for i, b := range hashedKey { - nibblized[i*2] = (b >> 4) & 0xf - nibblized[i*2+1] = b & 0xf +func NewCommittedDomain(d *Domain, mode CommitmentMode, trieVariant commitment.TrieVariant) *DomainCommitted { + return &DomainCommitted{ + Domain: d, + mode: mode, + trace: false, + updates: NewUpdateTree(mode), + discard: dbg.DiscardCommitment(), + patriciaTrie: commitment.InitializeTrie(trieVariant), + branchMerger: commitment.NewHexBranchMerger(8192), } - return nibblized } -func (d *DomainCommitted) storeCommitmentState(blockNum, txNum uint64) error { +func (d *DomainCommitted) PatriciaState() ([]byte, error) { var state []byte var err error @@ -229,45 +245,134 @@ func (d *DomainCommitted) storeCommitmentState(blockNum, txNum uint64) error { case *commitment.HexPatriciaHashed: state, err = trie.EncodeCurrentState(nil) if err != nil { - return err + return nil, err } default: - return fmt.Errorf("unsupported state storing for patricia trie type: %T", d.patriciaTrie) + return nil, fmt.Errorf("unsupported state storing for patricia trie type: %T", d.patriciaTrie) + } + return state, nil +} + +func (d *DomainCommitted) Reset() { + d.patriciaTrie.Reset() +} + +func (d *DomainCommitted) ResetFns( + branchFn func(prefix []byte) ([]byte, error), + accountFn func(plainKey []byte, cell *commitment.Cell) error, + storageFn func(plainKey []byte, cell *commitment.Cell) error, +) { + d.patriciaTrie.ResetFns(branchFn, accountFn, storageFn) +} + +func (d *DomainCommitted) Hasher() hash.Hash { + return d.updates.keccak +} + +func (d *DomainCommitted) SetCommitmentMode(m CommitmentMode) { d.mode = m } + +// TouchPlainKey marks 
plainKey as updated and applies different fn for different key types +// (different behaviour for Code, Account and Storage key modifications). +func (d *DomainCommitted) TouchPlainKey(key, val []byte, fn func(c *commitmentItem, val []byte)) { + if d.discard { + return } - cs := &commitmentState{txNum: txNum, trieState: state, blockNum: blockNum} + d.updates.TouchPlainKey(key, val, fn) +} + +func (d *DomainCommitted) Size() uint64 { + return d.updates.Size() +} + +func (d *DomainCommitted) TouchAccount(c *commitmentItem, val []byte) { + d.updates.TouchAccount(c, val) +} + +func (d *DomainCommitted) TouchStorage(c *commitmentItem, val []byte) { + d.updates.TouchStorage(c, val) +} + +func (d *DomainCommitted) TouchCode(c *commitmentItem, val []byte) { + d.updates.TouchCode(c, val) +} + +type commitmentItem struct { + plainKey []byte + update commitment.Update +} + +func commitmentItemLessPlain(i, j *commitmentItem) bool { + return bytes.Compare(i.plainKey, j.plainKey) < 0 +} + +func (d *DomainCommitted) storeCommitmentState(blockNum uint64, rh []byte) error { + state, err := d.PatriciaState() + if err != nil { + return err + } + cs := &commitmentState{txNum: d.txNum, trieState: state, blockNum: blockNum} encoded, err := cs.Encode() if err != nil { return err } - var stepbuf [2]byte - step := uint16(txNum / d.aggregationStep) - binary.BigEndian.PutUint16(stepbuf[:], step) - if err = d.Domain.Put(keyCommitmentState, stepbuf[:], encoded); err != nil { + if d.trace { + fmt.Printf("[commitment] put tx %d rh %x\n", d.txNum, rh) + } + if err := d.Domain.PutWithPrev(keyCommitmentState, nil, encoded, d.prevState); err != nil { return err } + d.prevState = common.Copy(encoded) return nil } +func (d *DomainCommitted) Restore(value []byte) (uint64, uint64, error) { + cs := new(commitmentState) + if err := cs.Decode(value); err != nil { + if len(value) > 0 { + return 0, 0, fmt.Errorf("failed to decode previous stored commitment state: %w", err) + } + // nil value is acceptable 
for SetState and will reset trie + } + if hext, ok := d.patriciaTrie.(*commitment.HexPatriciaHashed); ok { + if err := hext.SetState(cs.trieState); err != nil { + return 0, 0, fmt.Errorf("failed restore state : %w", err) + } + if d.trace { + rh, err := hext.RootHash() + if err != nil { + return 0, 0, fmt.Errorf("failed to get root hash after state restore: %w", err) + } + fmt.Printf("[commitment] restored state: block=%d txn=%d rh=%x\n", cs.blockNum, cs.txNum, rh) + } + } else { + return 0, 0, fmt.Errorf("state storing is only supported hex patricia trie") + } + return cs.blockNum, cs.txNum, nil +} + // nolint func (d *DomainCommitted) replaceKeyWithReference(fullKey, shortKey []byte, typeAS string, list ...*filesItem) bool { numBuf := [2]byte{} var found bool for _, item := range list { - //g := item.decompressor.MakeGetter() + g := NewArchiveGetter(item.decompressor.MakeGetter(), d.compression) //index := recsplit.NewIndexReader(item.index) - cur, err := item.bindex.Seek(fullKey) + cur, err := item.bindex.Seek(g, fullKey) if err != nil { continue } + if cur == nil { + continue + } step := uint16(item.endTxNum / d.aggregationStep) binary.BigEndian.PutUint16(numBuf[:], step) - shortKey = encodeU64(cur.Ordinal(), numBuf[:]) + shortKey = encodeU64(cur.Di(), numBuf[:]) if d.trace { - fmt.Printf("replacing %s [%x] => {%x} [step=%d, offset=%d, file=%s.%d-%d]\n", typeAS, fullKey, shortKey, step, cur.Ordinal(), typeAS, item.startTxNum, item.endTxNum) + fmt.Printf("replacing %s [%x] => {%x} [step=%d, offset=%d, file=%s.%d-%d]\n", typeAS, fullKey, shortKey, step, cur.Di(), typeAS, item.startTxNum, item.endTxNum) } found = true break @@ -289,9 +394,15 @@ func (d *DomainCommitted) lookupShortenedKey(shortKey, fullKey []byte, typAS str continue } - cur := item.bindex.OrdinalLookup(offset) - //nolint - fullKey = cur.Key() + g := NewArchiveGetter(item.decompressor.MakeGetter(), d.compression) + fullKey, _, err := item.bindex.dataLookup(offset, g) + if err != nil { + return false 
+ } + + // cur := item.bindex.OrdinalLookup(offset) + // //nolint + // fullKey = cur.Key() if d.trace { fmt.Printf("offsetToKey %s [%x]=>{%x} step=%d offset=%d, file=%s.%d-%d.kv\n", typAS, fullKey, shortKey, fileStep, offset, typAS, item.startTxNum, item.endTxNum) } @@ -353,227 +464,42 @@ func (d *DomainCommitted) commitmentValTransform(files *SelectedStaticFiles, mer return transValBuf, nil } -func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStaticFiles, mergedFiles MergedFiles, r DomainRanges, workers int, ps *background.ProgressSet) (valuesIn, indexIn, historyIn *filesItem, err error) { - if !r.any() { - return - } - - domainFiles := oldFiles.commitment - indexFiles := oldFiles.commitmentIdx - historyFiles := oldFiles.commitmentHist - - var comp *compress.Compressor - var closeItem bool = true - defer func() { - if closeItem { - if comp != nil { - comp.Close() - } - if indexIn != nil { - if indexIn.decompressor != nil { - indexIn.decompressor.Close() - } - if indexIn.index != nil { - indexIn.index.Close() - } - if indexIn.bindex != nil { - indexIn.bindex.Close() - } - } - if historyIn != nil { - if historyIn.decompressor != nil { - historyIn.decompressor.Close() - } - if historyIn.index != nil { - historyIn.index.Close() - } - if historyIn.bindex != nil { - historyIn.bindex.Close() - } - } - if valuesIn != nil { - if valuesIn.decompressor != nil { - valuesIn.decompressor.Close() - } - if valuesIn.index != nil { - valuesIn.index.Close() - } - if valuesIn.bindex != nil { - valuesIn.bindex.Close() - } - } - } - }() - if indexIn, historyIn, err = d.History.mergeFiles(ctx, indexFiles, historyFiles, - HistoryRanges{ - historyStartTxNum: r.historyStartTxNum, - historyEndTxNum: r.historyEndTxNum, - history: r.history, - indexStartTxNum: r.indexStartTxNum, - indexEndTxNum: r.indexEndTxNum, - index: r.index}, workers, ps); err != nil { - return nil, nil, nil, err - } - - if r.values { - datFileName := fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, 
r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) - datPath := filepath.Join(d.dir, datFileName) - p := ps.AddNew(datFileName, 1) - defer ps.Delete(p) - - if comp, err = compress.NewCompressor(ctx, "merge", datPath, d.dir, compress.MinPatternScore, workers, log.LvlTrace, d.logger); err != nil { - return nil, nil, nil, fmt.Errorf("merge %s compressor: %w", d.filenameBase, err) - } - var cp CursorHeap - heap.Init(&cp) - for _, item := range domainFiles { - g := item.decompressor.MakeGetter() - g.Reset(0) - if g.HasNext() { - key, _ := g.NextUncompressed() - var val []byte - if d.compressVals { - val, _ = g.Next(nil) - } else { - val, _ = g.NextUncompressed() - } - if d.trace { - fmt.Printf("merge: read value '%x'\n", key) - } - heap.Push(&cp, &CursorItem{ - t: FILE_CURSOR, - dg: g, - key: key, - val: val, - endTxNum: item.endTxNum, - reverse: true, - }) - } - } - keyCount := 0 - // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. - // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away - // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned - // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. 
Therefore, after the multi-way merge loop - // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind - var keyBuf, valBuf []byte - for cp.Len() > 0 { - lastKey := common.Copy(cp[0].key) - lastVal := common.Copy(cp[0].val) - // Advance all the items that have this key (including the top) - for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.NextUncompressed() - if d.compressVals { - ci1.val, _ = ci1.dg.Next(ci1.val[:0]) - } else { - ci1.val, _ = ci1.dg.NextUncompressed() - } - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) - } - } - // For the rest of types, empty value means deletion - skip := r.valuesStartTxNum == 0 && len(lastVal) == 0 - if !skip { - if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { - return nil, nil, nil, err - } - keyCount++ // Only counting keys, not values - switch d.compressVals { - case true: - if err = comp.AddWord(valBuf); err != nil { - return nil, nil, nil, err - } - default: - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, nil, nil, err - } - } - } - keyBuf = append(keyBuf[:0], lastKey...) - valBuf = append(valBuf[:0], lastVal...) 
- } - } - if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { - return nil, nil, nil, err - } - keyCount++ // Only counting keys, not values - //fmt.Printf("last heap key %x\n", keyBuf) - valBuf, err = d.commitmentValTransform(&oldFiles, &mergedFiles, valBuf) - if err != nil { - return nil, nil, nil, fmt.Errorf("merge: 2valTransform [%x] %w", valBuf, err) - } - if d.compressVals { - if err = comp.AddWord(valBuf); err != nil { - return nil, nil, nil, err - } - } else { - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, nil, nil, err - } - } - } - if err = comp.Compress(); err != nil { - return nil, nil, nil, err - } - comp.Close() - comp = nil - valuesIn = newFilesItem(r.valuesStartTxNum, r.valuesEndTxNum, d.aggregationStep) - if valuesIn.decompressor, err = compress.NewDecompressor(datPath); err != nil { - return nil, nil, nil, fmt.Errorf("merge %s decompressor [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) - } - ps.Delete(p) - - idxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) - idxPath := filepath.Join(d.dir, idxFileName) - - p = ps.AddNew(datFileName, uint64(keyCount)) - defer ps.Delete(p) - if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, idxPath, d.dir, keyCount, false /* values */, p, d.logger, d.noFsync); err != nil { - return nil, nil, nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) - } - - btPath := strings.TrimSuffix(idxPath, "kvi") + "bt" - valuesIn.bindex, err = CreateBtreeIndexWithDecompressor(btPath, 2048, valuesIn.decompressor, p, d.tmpdir, d.logger) - if err != nil { - return nil, nil, nil, fmt.Errorf("create btindex %s [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) - } - } - closeItem = false - d.stats.MergesCount++ - d.mergesCount++ - return +func (d *DomainCommitted) Close() { + 
d.Domain.Close() + d.updates.keys.Reset() + d.updates.tree.Clear(true) } -// Evaluates commitment for processed state. Commit=true - store trie state after evaluation +// Evaluates commitment for processed state. func (d *DomainCommitted) ComputeCommitment(trace bool) (rootHash []byte, branchNodeUpdates map[string]commitment.BranchData, err error) { - defer func(s time.Time) { d.comTook = time.Since(s) }(time.Now()) + if dbg.DiscardCommitment() { + d.updates.List(true) + return nil, nil, nil + } + defer func(s time.Time) { mxCommitmentTook.UpdateDuration(s) }(time.Now()) - touchedKeys, hashedKeys, updates := d.TouchedKeyList() - d.comKeys = uint64(len(touchedKeys)) + touchedKeys, updates := d.updates.List(true) + mxCommitmentKeys.Add(len(touchedKeys)) if len(touchedKeys) == 0 { rootHash, err = d.patriciaTrie.RootHash() return rootHash, nil, err } + if len(touchedKeys) > 1 { + d.patriciaTrie.Reset() + } // data accessing functions should be set once before - d.patriciaTrie.Reset() d.patriciaTrie.SetTrace(trace) switch d.mode { case CommitmentModeDirect: - rootHash, branchNodeUpdates, err = d.patriciaTrie.ReviewKeys(touchedKeys, hashedKeys) + rootHash, branchNodeUpdates, err = d.patriciaTrie.ProcessKeys(touchedKeys) if err != nil { return nil, nil, err } case CommitmentModeUpdate: - rootHash, branchNodeUpdates, err = d.patriciaTrie.ProcessUpdates(touchedKeys, hashedKeys, updates) + rootHash, branchNodeUpdates, err = d.patriciaTrie.ProcessUpdates(touchedKeys, updates) if err != nil { return nil, nil, err } @@ -589,57 +515,31 @@ var keyCommitmentState = []byte("state") // SeekCommitment searches for last encoded state from DomainCommitted // and if state found, sets it up to current domain -func (d *DomainCommitted) SeekCommitment(aggStep, sinceTx uint64) (blockNum, txNum uint64, err error) { +func (d *DomainCommitted) SeekCommitment(sinceTx, untilTx uint64, cd *DomainContext) (blockNum, txNum uint64, err error) { + if dbg.DiscardCommitment() { + return 0, 0, nil + } 
if d.patriciaTrie.Variant() != commitment.VariantHexPatriciaTrie { return 0, 0, fmt.Errorf("state storing is only supported hex patricia trie") } - // todo add support of bin state dumping - - var ( - latestState []byte - stepbuf [2]byte - step = uint16(sinceTx/aggStep) - 1 - latestTxNum uint64 = sinceTx - 1 - ) - - d.SetTxNum(latestTxNum) - ctx := d.MakeContext() - defer ctx.Close() - for { - binary.BigEndian.PutUint16(stepbuf[:], step) - - s, err := ctx.Get(keyCommitmentState, stepbuf[:], d.tx) - if err != nil { - return 0, 0, err - } - if len(s) < 8 { - break + fmt.Printf("[commitment] SeekCommitment [%d, %d]\n", sinceTx, untilTx) + var latestState []byte + err = cd.IteratePrefix(d.tx, keyCommitmentState, func(key, value []byte) { + if len(value) < 8 { + fmt.Printf("[commitment] SeekCommitment invalid value size %d [%x]\n", len(value), value) + return } - v := binary.BigEndian.Uint64(s) - if v == latestTxNum && len(latestState) != 0 { - break + txn := binary.BigEndian.Uint64(value) + fmt.Printf("[commitment] Seek txn=%d %x\n", txn, value[:16]) + if txn >= sinceTx && txn <= untilTx { + latestState = value } - latestTxNum, latestState = v, s - lookupTxN := latestTxNum + aggStep - step = uint16(latestTxNum/aggStep) + 1 - d.SetTxNum(lookupTxN) - } - - var latest commitmentState - if err := latest.Decode(latestState); err != nil { - return 0, 0, nil - } - - if hext, ok := d.patriciaTrie.(*commitment.HexPatriciaHashed); ok { - if err := hext.SetState(latest.trieState); err != nil { - return 0, 0, err - } - } else { - return 0, 0, fmt.Errorf("state storing is only supported hex patricia trie") + }) + if err != nil { + return 0, 0, err } - - return latest.blockNum, latest.txNum, nil + return d.Restore(latestState) } type commitmentState struct { @@ -650,7 +550,7 @@ type commitmentState struct { func (cs *commitmentState) Decode(buf []byte) error { if len(buf) < 10 { - return fmt.Errorf("ivalid commitment state buffer size") + return fmt.Errorf("ivalid commitment state 
buffer size %d, expected at least 10b", len(buf)) } pos := 0 cs.txNum = binary.BigEndian.Uint64(buf[pos : pos+8]) diff --git a/state/domain_shared.go b/state/domain_shared.go index 39e37884a..ca57debb7 100644 --- a/state/domain_shared.go +++ b/state/domain_shared.go @@ -1,5 +1,24 @@ package state +import ( + "bytes" + "container/heap" + "context" + "encoding/binary" + "fmt" + "sync" + "sync/atomic" + "time" + "unsafe" + + "github.com/ledgerwatch/log/v3" + btree2 "github.com/tidwall/btree" + + "github.com/ledgerwatch/erigon-lib/commitment" + "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/kv" +) + // KvList sort.Interface to sort write list by keys type KvList struct { Keys []string @@ -23,3 +42,763 @@ func (l *KvList) Swap(i, j int) { l.Keys[i], l.Keys[j] = l.Keys[j], l.Keys[i] l.Vals[i], l.Vals[j] = l.Vals[j], l.Vals[i] } + +type SharedDomains struct { + aggCtx *AggregatorV3Context + roTx kv.Tx + + txNum atomic.Uint64 + blockNum atomic.Uint64 + estSize atomic.Uint64 + trace bool + muMaps sync.RWMutex + walLock sync.RWMutex + + account map[string][]byte + code map[string][]byte + storage *btree2.Map[string, []byte] + commitment map[string][]byte + Account *Domain + Storage *Domain + Code *Domain + Commitment *DomainCommitted + TracesTo *InvertedIndex + LogAddrs *InvertedIndex + LogTopics *InvertedIndex + TracesFrom *InvertedIndex +} + +func NewSharedDomains(a, c, s *Domain, comm *DomainCommitted) *SharedDomains { + sd := &SharedDomains{ + Account: a, + account: map[string][]byte{}, + Code: c, + code: map[string][]byte{}, + Storage: s, + storage: btree2.NewMap[string, []byte](128), + Commitment: comm, + commitment: map[string][]byte{}, + } + + sd.Commitment.ResetFns(sd.branchFn, sd.accountFn, sd.storageFn) + return sd +} + +func (sd *SharedDomains) SetInvertedIndices(tracesTo, tracesFrom, logAddrs, logTopics *InvertedIndex) { + sd.TracesTo = tracesTo + sd.TracesFrom = tracesFrom + sd.LogAddrs = logAddrs + sd.LogTopics = logTopics +} + 
+// aggregator context should call aggCtx.Unwind before this one. +func (sd *SharedDomains) Unwind(ctx context.Context, rwTx kv.RwTx, txUnwindTo uint64) error { + sd.ClearRam(true) + + bn, txn, err := sd.SeekCommitment(0, txUnwindTo) + fmt.Printf("Unwinded domains to block %d, txn %d wanted to %d\n", bn, txn, txUnwindTo) + return err +} + +func (sd *SharedDomains) SeekCommitment(fromTx, toTx uint64) (bn, txn uint64, err error) { + bn, txn, err = sd.Commitment.SeekCommitment(fromTx, toTx, sd.aggCtx.commitment) + if bn > 0 { + //we set bn+1 to correctly start from the next block + bn++ + } + sd.SetBlockNum(bn) + sd.SetTxNum(txn) + return +} + +func (sd *SharedDomains) ClearRam(resetCommitment bool) { + sd.muMaps.Lock() + defer sd.muMaps.Unlock() + log.Debug("ClearRam", "commitment", resetCommitment, "tx", sd.txNum.Load(), "block", sd.blockNum.Load()) + sd.account = map[string][]byte{} + sd.code = map[string][]byte{} + sd.commitment = map[string][]byte{} + if resetCommitment { + sd.Commitment.updates.List(true) + sd.Commitment.patriciaTrie.Reset() + } + + sd.storage = btree2.NewMap[string, []byte](128) + sd.estSize.Store(0) +} + +func (sd *SharedDomains) put(table kv.Domain, key, val []byte) { + sd.muMaps.Lock() + sd.puts(table, key, val) + sd.muMaps.Unlock() +} + +func (sd *SharedDomains) puts(table kv.Domain, key []byte, val []byte) { + keyS := string(key) + switch table { + case kv.AccountsDomain: + if old, ok := sd.account[keyS]; ok { + sd.estSize.Add(uint64(len(val) - len(old))) + } else { + sd.estSize.Add(uint64(len(key) + len(val))) + } + sd.account[keyS] = val + case kv.CodeDomain: + if old, ok := sd.code[keyS]; ok { + sd.estSize.Add(uint64(len(val) - len(old))) + } else { + sd.estSize.Add(uint64(len(key) + len(val))) + } + sd.code[keyS] = val + case kv.StorageDomain: + if old, ok := sd.storage.Set(keyS, val); ok { + sd.estSize.Add(uint64(len(val) - len(old))) + } else { + sd.estSize.Add(uint64(len(key) + len(val))) + } + case kv.CommitmentDomain: + if old, ok 
:= sd.commitment[keyS]; ok { + sd.estSize.Add(uint64(len(val) - len(old))) + } else { + sd.estSize.Add(uint64(len(key) + len(val))) + } + sd.commitment[keyS] = val + default: + panic(fmt.Errorf("sharedDomains put to invalid table %s", table)) + } +} + +// Get returns cached value by key. Cache is invalidated when associated WAL is flushed +func (sd *SharedDomains) Get(table kv.Domain, key []byte) (v []byte, ok bool) { + sd.muMaps.RLock() + v, ok = sd.get(table, key) + sd.muMaps.RUnlock() + return v, ok +} + +func (sd *SharedDomains) get(table kv.Domain, key []byte) (v []byte, ok bool) { + keyS := *(*string)(unsafe.Pointer(&key)) + //keyS := string(key) + switch table { + case kv.AccountsDomain: + v, ok = sd.account[keyS] + case kv.CodeDomain: + v, ok = sd.code[keyS] + case kv.StorageDomain: + v, ok = sd.storage.Get(keyS) + case kv.CommitmentDomain: + v, ok = sd.commitment[keyS] + default: + panic(table) + } + return v, ok +} + +func (sd *SharedDomains) SizeEstimate() uint64 { + return sd.estSize.Load() * 2 // multiply 2 here, to cover data-structures overhead. more precise accounting - expensive. 
+} + +func (sd *SharedDomains) LatestCommitment(prefix []byte) ([]byte, error) { + v0, ok := sd.Get(kv.CommitmentDomain, prefix) + if ok { + return v0, nil + } + v, _, err := sd.aggCtx.GetLatest(kv.CommitmentDomain, prefix, nil, sd.roTx) + if err != nil { + return nil, fmt.Errorf("commitment prefix %x read error: %w", prefix, err) + } + return v, nil +} + +func (sd *SharedDomains) LatestCode(addr []byte) ([]byte, error) { + v0, ok := sd.Get(kv.CodeDomain, addr) + if ok { + return v0, nil + } + v, _, err := sd.aggCtx.GetLatest(kv.CodeDomain, addr, nil, sd.roTx) + if err != nil { + return nil, fmt.Errorf("code %x read error: %w", addr, err) + } + return v, nil +} + +func (sd *SharedDomains) LatestAccount(addr []byte) ([]byte, error) { + var v0, v []byte + var err error + var ok bool + + //defer func() { + // curious := "0da27ef618846cfa981516da2891fe0693a54f8418b85c91c384d2c0f4e14727" + // if bytes.Equal(hexutility.MustDecodeString(curious), addr) { + // fmt.Printf("found %s vDB/File %x vCache %x step %d\n", curious, v, v0, sd.txNum.Load()/sd.Account.aggregationStep) + // } + //}() + v0, ok = sd.Get(kv.AccountsDomain, addr) + if ok { + return v0, nil + } + v, _, err = sd.aggCtx.GetLatest(kv.AccountsDomain, addr, nil, sd.roTx) + if err != nil { + return nil, fmt.Errorf("account %x read error: %w", addr, err) + } + return v, nil +} + +const CodeSizeTableFake = "CodeSize" + +func (sd *SharedDomains) ReadsValid(readLists map[string]*KvList) bool { + sd.muMaps.RLock() + defer sd.muMaps.RUnlock() + + for table, list := range readLists { + switch table { + case string(kv.AccountsDomain): + m := sd.account + for i, key := range list.Keys { + if val, ok := m[key]; ok { + if !bytes.Equal(list.Vals[i], val) { + return false + } + } + } + case string(kv.CodeDomain): + m := sd.code + for i, key := range list.Keys { + if val, ok := m[key]; ok { + if !bytes.Equal(list.Vals[i], val) { + return false + } + } + } + case string(kv.StorageDomain): + m := sd.storage + for i, key := range 
list.Keys { + if val, ok := m.Get(key); ok { + if !bytes.Equal(list.Vals[i], val) { + return false + } + } + } + case CodeSizeTableFake: + m := sd.code + for i, key := range list.Keys { + if val, ok := m[key]; ok { + if binary.BigEndian.Uint64(list.Vals[i]) != uint64(len(val)) { + return false + } + } + } + default: + panic(table) + } + } + + return true +} + +func (sd *SharedDomains) LatestStorage(addrLoc []byte) ([]byte, error) { + //a := make([]byte, 0, len(addr)+len(loc)) + v0, ok := sd.Get(kv.StorageDomain, addrLoc) + if ok { + return v0, nil + } + v, _, err := sd.aggCtx.GetLatest(kv.StorageDomain, addrLoc, nil, sd.roTx) + if err != nil { + return nil, fmt.Errorf("storage %x read error: %w", addrLoc, err) + } + return v, nil +} + +func (sd *SharedDomains) branchFn(pref []byte) ([]byte, error) { + v, err := sd.LatestCommitment(pref) + if err != nil { + return nil, fmt.Errorf("branchFn failed: %w", err) + } + //fmt.Printf("branchFn[sd]: %x: %x\n", pref, v) + if len(v) == 0 { + return nil, nil + } + // skip touchmap + return v[2:], nil +} + +func (sd *SharedDomains) accountFn(plainKey []byte, cell *commitment.Cell) error { + encAccount, err := sd.LatestAccount(plainKey) + if err != nil { + return fmt.Errorf("accountFn failed: %w", err) + } + cell.Nonce = 0 + cell.Balance.Clear() + if len(encAccount) > 0 { + nonce, balance, chash := DecodeAccountBytes(encAccount) + cell.Nonce = nonce + cell.Balance.Set(balance) + if len(chash) > 0 { + copy(cell.CodeHash[:], chash) + } + //fmt.Printf("accountFn[sd]: %x: n=%d b=%d ch=%x\n", plainKey, nonce, balance, chash) + } + + code, err := sd.LatestCode(plainKey) + if err != nil { + return fmt.Errorf("accountFn[sd]: failed to read latest code: %w", err) + } + if len(code) > 0 { + //fmt.Printf("accountFn[sd]: code %x - %x\n", plainKey, code) + sd.Commitment.updates.keccak.Reset() + sd.Commitment.updates.keccak.Write(code) + sd.Commitment.updates.keccak.Read(cell.CodeHash[:]) + } else { + cell.CodeHash = 
commitment.EmptyCodeHashArray + } + cell.Delete = len(encAccount) == 0 && len(code) == 0 + return nil +} + +func (sd *SharedDomains) storageFn(plainKey []byte, cell *commitment.Cell) error { + // Look in the summary table first + //addr, loc := splitKey(plainKey) + enc, err := sd.LatestStorage(plainKey) + if err != nil { + return err + } + //fmt.Printf("storageFn[sd]: %x|%x - %x\n", addr, loc, enc) + cell.StorageLen = len(enc) + copy(cell.Storage[:], enc) + cell.Delete = cell.StorageLen == 0 + return nil +} + +func (sd *SharedDomains) UpdateAccountData(addr []byte, account, prevAccount []byte) error { + sd.Commitment.TouchPlainKey(addr, account, sd.Commitment.TouchAccount) + sd.put(kv.AccountsDomain, addr, account) + return sd.Account.PutWithPrev(addr, nil, account, prevAccount) +} + +func (sd *SharedDomains) UpdateAccountCode(addr, code []byte) error { + sd.Commitment.TouchPlainKey(addr, code, sd.Commitment.TouchCode) + prevCode, _ := sd.LatestCode(addr) + if bytes.Equal(prevCode, code) { + return nil + } + sd.put(kv.CodeDomain, addr, code) + if len(code) == 0 { + return sd.Code.DeleteWithPrev(addr, nil, prevCode) + } + return sd.Code.PutWithPrev(addr, nil, code, prevCode) +} + +func (sd *SharedDomains) UpdateCommitmentData(prefix []byte, data, prev []byte) error { + sd.put(kv.CommitmentDomain, prefix, data) + return sd.Commitment.PutWithPrev(prefix, nil, data, prev) +} + +func (sd *SharedDomains) DeleteAccount(addr, prev []byte) error { + sd.Commitment.TouchPlainKey(addr, nil, sd.Commitment.TouchAccount) + + sd.put(kv.AccountsDomain, addr, nil) + if err := sd.Account.DeleteWithPrev(addr, nil, prev); err != nil { + return err + } + + // commitment delete already has been applied via account + pc, err := sd.LatestCode(addr) + if err != nil { + return err + } + if len(pc) > 0 { + sd.Commitment.TouchPlainKey(addr, nil, sd.Commitment.TouchCode) + sd.put(kv.CodeDomain, addr, nil) + if err := sd.Code.DeleteWithPrev(addr, nil, pc); err != nil { + return err + } + } + + 
// bb, _ := hex.DecodeString("d96d1b15d6bec8e7d37038237b1e913ad99f7dee") + // if bytes.Equal(bb, addr) { + // fmt.Printf("delete account %x \n", addr) + // } + + type pair struct{ k, v []byte } + tombs := make([]pair, 0, 8) + err = sd.IterateStoragePrefix(sd.roTx, addr, func(k, v []byte) { + tombs = append(tombs, pair{k, v}) + }) + if err != nil { + return err + } + + for _, tomb := range tombs { + sd.put(kv.StorageDomain, tomb.k, nil) + sd.Commitment.TouchPlainKey(tomb.k, nil, sd.Commitment.TouchStorage) + err = sd.Storage.DeleteWithPrev(tomb.k, nil, tomb.v) + if err != nil { + return err + } + } + return nil +} + +func (sd *SharedDomains) WriteAccountStorage(addr, loc []byte, value, preVal []byte) error { + composite := addr + if loc != nil { // if caller passed already `composite` key, then just use it. otherwise join parts + composite = make([]byte, 0, len(addr)+len(loc)) + composite = append(append(composite, addr...), loc...) + } + sd.Commitment.TouchPlainKey(composite, value, sd.Commitment.TouchStorage) + sd.put(kv.StorageDomain, composite, value) + if len(value) == 0 { + return sd.Storage.DeleteWithPrev(composite, nil, preVal) + } + return sd.Storage.PutWithPrev(composite, nil, value, preVal) +} + +func (sd *SharedDomains) IndexAdd(table kv.InvertedIdx, key []byte) (err error) { + switch table { + case kv.LogAddrIdx, kv.TblLogAddressIdx: + err = sd.LogAddrs.Add(key) + case kv.LogTopicIdx, kv.TblLogTopicsIdx, kv.LogTopicIndex: + err = sd.LogTopics.Add(key) + case kv.TblTracesToIdx: + err = sd.TracesTo.Add(key) + case kv.TblTracesFromIdx: + err = sd.TracesFrom.Add(key) + default: + panic(fmt.Errorf("unknown shared index %s", table)) + } + return err +} + +func (sd *SharedDomains) SetContext(ctx *AggregatorV3Context) { + sd.aggCtx = ctx +} + +func (sd *SharedDomains) SetTx(tx kv.RwTx) { + sd.roTx = tx + sd.Commitment.SetTx(tx) + sd.Code.SetTx(tx) + sd.Account.SetTx(tx) + sd.Storage.SetTx(tx) + sd.TracesTo.SetTx(tx) + sd.TracesFrom.SetTx(tx) + 
sd.LogAddrs.SetTx(tx) + sd.LogTopics.SetTx(tx) +} + +// SetTxNum sets txNum for all domains as well as common txNum for all domains +// Requires for sd.rwTx because of commitment evaluation in shared domains if aggregationStep is reached +func (sd *SharedDomains) SetTxNum(txNum uint64) { + if txNum%sd.Account.aggregationStep == 0 { // + _, err := sd.Commit(true, sd.trace) + if err != nil { + panic(err) + } + } + + sd.txNum.Store(txNum) + sd.Account.SetTxNum(txNum) + sd.Code.SetTxNum(txNum) + sd.Storage.SetTxNum(txNum) + sd.Commitment.SetTxNum(txNum) + sd.TracesTo.SetTxNum(txNum) + sd.TracesFrom.SetTxNum(txNum) + sd.LogAddrs.SetTxNum(txNum) + sd.LogTopics.SetTxNum(txNum) +} + +func (sd *SharedDomains) TxNum() uint64 { + return sd.txNum.Load() +} + +func (sd *SharedDomains) SetBlockNum(blockNum uint64) { + sd.blockNum.Store(blockNum) +} + +func (sd *SharedDomains) Commit(saveStateAfter, trace bool) (rootHash []byte, err error) { + // if commitment mode is Disabled, there will be nothing to compute on. 
+ mxCommitmentRunning.Inc() + defer mxCommitmentRunning.Dec() + + rootHash, branchNodeUpdates, err := sd.Commitment.ComputeCommitment(trace) + if err != nil { + return nil, err + } + + defer func(t time.Time) { mxCommitmentWriteTook.UpdateDuration(t) }(time.Now()) + + for pref, update := range branchNodeUpdates { + prefix := []byte(pref) + + stateValue, err := sd.LatestCommitment(prefix) + if err != nil { + return nil, err + } + stated := commitment.BranchData(stateValue) + merged, err := sd.Commitment.branchMerger.Merge(stated, update) + if err != nil { + return nil, err + } + if bytes.Equal(stated, merged) { + continue + } + if trace { + fmt.Printf("sd computeCommitment merge [%x] [%x]+[%x]=>[%x]\n", prefix, stated, update, merged) + } + + if err = sd.UpdateCommitmentData(prefix, merged, stated); err != nil { + return nil, err + } + mxCommitmentBranchUpdates.Inc() + } + + if saveStateAfter { + if err := sd.Commitment.storeCommitmentState(sd.blockNum.Load(), rootHash); err != nil { + return nil, err + } + } + return rootHash, nil +} + +// IterateStoragePrefix iterates over key-value pairs of the storage domain that start with given prefix +// Such iteration is not intended to be used in public API, therefore it uses read-write transaction +// inside the domain. Another version of this for public API use needs to be created, that uses +// roTx instead and supports ending the iterations before it reaches the end. 
+func (sd *SharedDomains) IterateStoragePrefix(roTx kv.Tx, prefix []byte, it func(k, v []byte)) error { + sc := sd.Storage.MakeContext() + defer sc.Close() + + // return sc.IteratePrefix(roTx, prefix, it) + sd.Storage.stats.FilesQueries.Add(1) + + var cp CursorHeap + cpPtr := &cp + heap.Init(cpPtr) + var k, v []byte + var err error + + iter := sd.storage.Iter() + if iter.Seek(string(prefix)) { + kx := iter.Key() + v = iter.Value() + k = []byte(kx) + + if len(kx) > 0 && bytes.HasPrefix(k, prefix) { + heap.Push(cpPtr, &CursorItem{t: RAM_CURSOR, key: common.Copy(k), val: common.Copy(v), iter: iter, endTxNum: sd.txNum.Load(), reverse: true}) + } + } + + keysCursor, err := roTx.CursorDupSort(sd.Storage.keysTable) + if err != nil { + return err + } + defer keysCursor.Close() + if k, v, err = keysCursor.Seek(prefix); err != nil { + return err + } + if k != nil && bytes.HasPrefix(k, prefix) { + keySuffix := make([]byte, len(k)+8) + copy(keySuffix, k) + copy(keySuffix[len(k):], v) + step := ^binary.BigEndian.Uint64(v) + txNum := step * sd.Storage.aggregationStep + if v, err = roTx.GetOne(sd.Storage.valsTable, keySuffix); err != nil { + return err + } + heap.Push(cpPtr, &CursorItem{t: DB_CURSOR, key: k, val: v, c: keysCursor, endTxNum: txNum, reverse: true}) + } + + sctx := sd.aggCtx.storage + for _, item := range sctx.files { + gg := NewArchiveGetter(item.src.decompressor.MakeGetter(), sd.Storage.compression) + cursor, err := item.src.bindex.Seek(gg, prefix) + if err != nil { + return err + } + if cursor == nil { + continue + } + cursor.getter = gg + + key := cursor.Key() + if key != nil && bytes.HasPrefix(key, prefix) { + val := cursor.Value() + heap.Push(cpPtr, &CursorItem{t: FILE_CURSOR, key: key, val: val, btCursor: cursor, endTxNum: item.endTxNum, reverse: true}) + } + } + + for cp.Len() > 0 { + lastKey := common.Copy(cp[0].key) + lastVal := common.Copy(cp[0].val) + // Advance all the items that have this key (including the top) + for cp.Len() > 0 && 
bytes.Equal(cp[0].key, lastKey) { + ci1 := heap.Pop(cpPtr).(*CursorItem) + switch ci1.t { + case RAM_CURSOR: + if ci1.iter.Next() { + k = []byte(ci1.iter.Key()) + if k != nil && bytes.HasPrefix(k, prefix) { + ci1.key = common.Copy(k) + ci1.val = common.Copy(ci1.iter.Value()) + heap.Push(cpPtr, ci1) + } + } + case FILE_CURSOR: + if UseBtree || UseBpsTree { + if ci1.btCursor.Next() { + ci1.key = ci1.btCursor.Key() + if ci1.key != nil && bytes.HasPrefix(ci1.key, prefix) { + ci1.val = ci1.btCursor.Value() + heap.Push(cpPtr, ci1) + } + } + } else { + ci1.dg.Reset(ci1.latestOffset) + if !ci1.dg.HasNext() { + break + } + key, _ := ci1.dg.Next(nil) + if key != nil && bytes.HasPrefix(key, prefix) { + ci1.key = key + ci1.val, ci1.latestOffset = ci1.dg.Next(nil) + heap.Push(cpPtr, ci1) + } + } + case DB_CURSOR: + k, v, err = ci1.c.NextNoDup() + if err != nil { + return err + } + + if k != nil && bytes.HasPrefix(k, prefix) { + ci1.key = common.Copy(k) + keySuffix := make([]byte, len(k)+8) + copy(keySuffix, k) + copy(keySuffix[len(k):], v) + if v, err = roTx.GetOne(sd.Storage.valsTable, keySuffix); err != nil { + return err + } + ci1.val = common.Copy(v) + heap.Push(cpPtr, ci1) + } + } + } + if len(lastVal) > 0 { + it(lastKey, lastVal) + } + } + return nil +} + +func (sd *SharedDomains) Close() { + //sd.FinishWrites() + sd.account = nil + sd.code = nil + sd.storage = nil + sd.commitment = nil +} + +// StartWrites - pattern: `defer domains.StartWrites().FinishWrites()` +func (sd *SharedDomains) StartWrites() *SharedDomains { + sd.walLock.Lock() + defer sd.walLock.Unlock() + + sd.Account.StartWrites() + sd.Storage.StartWrites() + sd.Code.StartWrites() + sd.Commitment.StartWrites() + sd.LogAddrs.StartWrites() + sd.LogTopics.StartWrites() + sd.TracesFrom.StartWrites() + sd.TracesTo.StartWrites() + + if sd.account == nil { + sd.account = map[string][]byte{} + } + if sd.commitment == nil { + sd.commitment = map[string][]byte{} + } + if sd.code == nil { + sd.code = map[string][]byte{} 
+ } + if sd.storage == nil { + sd.storage = btree2.NewMap[string, []byte](128) + } + return sd +} + +func (sd *SharedDomains) StartUnbufferedWrites() *SharedDomains { + sd.walLock.Lock() + defer sd.walLock.Unlock() + + sd.Account.StartUnbufferedWrites() + sd.Storage.StartUnbufferedWrites() + sd.Code.StartUnbufferedWrites() + sd.Commitment.StartUnbufferedWrites() + sd.LogAddrs.StartUnbufferedWrites() + sd.LogTopics.StartUnbufferedWrites() + sd.TracesFrom.StartUnbufferedWrites() + sd.TracesTo.StartUnbufferedWrites() + + if sd.account == nil { + sd.account = map[string][]byte{} + } + if sd.commitment == nil { + sd.commitment = map[string][]byte{} + } + if sd.code == nil { + sd.code = map[string][]byte{} + } + if sd.storage == nil { + sd.storage = btree2.NewMap[string, []byte](128) + } + + return sd +} + +func (sd *SharedDomains) FinishWrites() { + sd.walLock.Lock() + defer sd.walLock.Unlock() + + sd.Account.FinishWrites() + sd.Storage.FinishWrites() + sd.Code.FinishWrites() + sd.Commitment.FinishWrites() + sd.LogAddrs.FinishWrites() + sd.LogTopics.FinishWrites() + sd.TracesFrom.FinishWrites() + sd.TracesTo.FinishWrites() +} + +func (sd *SharedDomains) BatchHistoryWriteStart() *SharedDomains { + sd.walLock.RLock() + return sd +} + +func (sd *SharedDomains) BatchHistoryWriteEnd() { + sd.walLock.RUnlock() +} + +func (sd *SharedDomains) rotate() []flusher { + sd.walLock.Lock() + defer sd.walLock.Unlock() + return []flusher{ + sd.Account.Rotate(), + sd.Storage.Rotate(), + sd.Code.Rotate(), + sd.Commitment.Domain.Rotate(), + sd.LogAddrs.Rotate(), + sd.LogTopics.Rotate(), + sd.TracesFrom.Rotate(), + sd.TracesTo.Rotate(), + } +} + +func (sd *SharedDomains) Flush(ctx context.Context, tx kv.RwTx) error { + flushers := sd.rotate() + for _, f := range flushers { + if err := f.Flush(ctx, tx); err != nil { + return err + } + } + return nil +} diff --git a/state/domain_shared_test.go b/state/domain_shared_test.go new file mode 100644 index 000000000..ea21cd9e1 --- /dev/null +++ 
b/state/domain_shared_test.go @@ -0,0 +1,98 @@ +package state + +import ( + "context" + "math/rand" + "testing" + + "github.com/holiman/uint256" + "github.com/stretchr/testify/require" + + "github.com/ledgerwatch/erigon-lib/common/length" +) + +func TestSharedDomain_Unwind(t *testing.T) { + stepSize := uint64(100) + db, agg := testDbAndAggregatorv3(t, stepSize) + + ctx := context.Background() + rwTx, err := db.BeginRw(ctx) + require.NoError(t, err) + defer rwTx.Rollback() + + agg.StartWrites() + defer agg.FinishWrites() + + ac := agg.MakeContext() + defer ac.Close() + d := agg.SharedDomains(ac) + defer agg.CloseSharedDomains() + d.SetTx(rwTx) + + maxTx := stepSize + hashes := make([][]byte, maxTx) + count := 10 + rnd := rand.New(rand.NewSource(0)) + ac.Close() + err = rwTx.Commit() + require.NoError(t, err) + +Loop: + rwTx, err = db.BeginRw(ctx) + require.NoError(t, err) + defer rwTx.Rollback() + + ac = agg.MakeContext() + defer ac.Close() + d = agg.SharedDomains(ac) + defer agg.CloseSharedDomains() + d.SetTx(rwTx) + + i := 0 + k0 := make([]byte, length.Addr) + commitStep := 3 + + for ; i < int(maxTx); i++ { + d.SetTxNum(uint64(i)) + for accs := 0; accs < 256; accs++ { + v := EncodeAccountBytes(uint64(i), uint256.NewInt(uint64(i*10e6)+uint64(accs*10e2)), nil, 0) + k0[0] = byte(accs) + pv, err := d.LatestAccount(k0) + require.NoError(t, err) + + err = d.UpdateAccountData(k0, v, pv) + require.NoError(t, err) + } + + if i%commitStep == 0 { + rh, err := d.Commit(true, false) + require.NoError(t, err) + if hashes[uint64(i)] != nil { + require.Equal(t, hashes[uint64(i)], rh) + } + require.NotNil(t, rh) + hashes[uint64(i)] = rh + } + } + + err = agg.Flush(ctx, rwTx) + require.NoError(t, err) + + unwindTo := uint64(commitStep * rnd.Intn(int(maxTx)/commitStep)) + + acu := agg.MakeContext() + err = acu.Unwind(ctx, unwindTo, rwTx) + require.NoError(t, err) + acu.Close() + + err = rwTx.Commit() + require.NoError(t, err) + if count > 0 { + count-- + } + if count == 0 { + return 
+ } + + goto Loop +} diff --git a/state/domain_test.go b/state/domain_test.go index cf1ed599d..27cc4495d 100644 --- a/state/domain_test.go +++ b/state/domain_test.go @@ -17,36 +17,55 @@ package state import ( + "bytes" "context" "encoding/binary" + "encoding/hex" "fmt" "math" + "math/rand" "os" + "path/filepath" + "sort" "strings" "testing" "time" - "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/holiman/uint256" "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/require" btree2 "github.com/tidwall/btree" + "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/ledgerwatch/erigon-lib/common/length" "github.com/ledgerwatch/erigon-lib/kv" "github.com/ledgerwatch/erigon-lib/kv/mdbx" - "github.com/ledgerwatch/erigon-lib/recsplit" ) -func testDbAndDomain(t *testing.T, logger log.Logger) (string, kv.RwDB, *Domain) { +func testDbAndDomain(t *testing.T, logger log.Logger) (kv.RwDB, *Domain) { + t.Helper() + return testDbAndDomainOfStep(t, 16, logger) +} +func testDbAndDomainOfStep(t *testing.T, aggStep uint64, logger log.Logger) (kv.RwDB, *Domain) { + t.Helper() + return testDbAndDomainOfStepValsDup(t, aggStep, logger, false) +} + +func testDbAndDomainOfStepValsDup(t *testing.T, aggStep uint64, logger log.Logger, dupSortVals bool) (kv.RwDB, *Domain) { t.Helper() - path := t.TempDir() + datadir := t.TempDir() + coldDir := filepath.Join(datadir, "snapshots", "history") + require.NoError(t, os.MkdirAll(filepath.Join(datadir, "snapshots", "warm"), 0740)) + require.NoError(t, os.MkdirAll(coldDir, 0740)) keysTable := "Keys" valsTable := "Vals" historyKeysTable := "HistoryKeys" historyValsTable := "HistoryVals" settingsTable := "Settings" indexTable := "Index" - db := mdbx.NewMDBX(logger).InMem(path).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { - return kv.TableCfg{ + db := mdbx.NewMDBX(logger).InMem(datadir).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { + tcfg := 
kv.TableCfg{ keysTable: kv.TableCfgItem{Flags: kv.DupSort}, valsTable: kv.TableCfgItem{}, historyKeysTable: kv.TableCfgItem{Flags: kv.DupSort}, @@ -54,24 +73,56 @@ func testDbAndDomain(t *testing.T, logger log.Logger) (string, kv.RwDB, *Domain) settingsTable: kv.TableCfgItem{}, indexTable: kv.TableCfgItem{Flags: kv.DupSort}, } + if dupSortVals { + tcfg[valsTable] = kv.TableCfgItem{Flags: kv.DupSort} + } + return tcfg }).MustOpen() t.Cleanup(db.Close) - d, err := NewDomain(path, path, 16, "base", keysTable, valsTable, historyKeysTable, historyValsTable, indexTable, true, false, logger) + salt := uint32(1) + cfg := domainCfg{ + domainLargeValues: AccDomainLargeValues, + hist: histCfg{ + iiCfg: iiCfg{salt: &salt, dir: coldDir, tmpdir: coldDir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressNone, historyLargeValues: AccDomainLargeValues, + }} + d, err := NewDomain(cfg, aggStep, "base", keysTable, valsTable, historyKeysTable, historyValsTable, indexTable, logger) require.NoError(t, err) + d.DisableFsync() + d.compressWorkers = 1 t.Cleanup(d.Close) d.DisableFsync() - return path, db, d + return db, d } -// btree index should work correctly if K < m -func TestCollationBuild(t *testing.T) { +func TestDomain_CollationBuild(t *testing.T) { + // t.Run("compressDomainVals=false, domainLargeValues=false", func(t *testing.T) { + // testCollationBuild(t, false, false) + // }) + // t.Run("compressDomainVals=true, domainLargeValues=false", func(t *testing.T) { + // testCollationBuild(t, true, false) + // }) + t.Run("compressDomainVals=true, domainLargeValues=true", func(t *testing.T) { + testCollationBuild(t, true, true) + }) + t.Run("compressDomainVals=false, domainLargeValues=true", func(t *testing.T) { + testCollationBuild(t, false, true) + }) +} + +func testCollationBuild(t *testing.T, compressDomainVals, domainLargeValues bool) { + t.Helper() + logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, d := 
testDbAndDomain(t, logger) + db, d := testDbAndDomainOfStepValsDup(t, 16, logger, !domainLargeValues) ctx := context.Background() defer d.Close() + d.domainLargeValues = domainLargeValues + d.compression = CompressKeys | CompressVals + tx, err := db.BeginRw(ctx) require.NoError(t, err) defer tx.Rollback() @@ -80,62 +131,143 @@ func TestCollationBuild(t *testing.T) { defer d.FinishWrites() d.SetTxNum(2) - err = d.Put([]byte("key1"), nil, []byte("value1.1")) + + var ( + k1 = []byte("key1") + k2 = []byte("key2") + v1 = []byte("value1.1") + v2 = []byte("value2.1") + p1, p2 []byte + ) + + err = d.PutWithPrev(k1, nil, v1, p1) require.NoError(t, err) d.SetTxNum(3) - err = d.Put([]byte("key2"), nil, []byte("value2.1")) + err = d.PutWithPrev(k2, nil, v2, p2) require.NoError(t, err) + p1, p2 = v1, v2 + v1, v2 = []byte("value1.2"), []byte("value2.2") //nolint + expectedStep1 := uint64(0) + d.SetTxNum(6) - err = d.Put([]byte("key1"), nil, []byte("value1.2")) + err = d.PutWithPrev(k1, nil, v1, p1) require.NoError(t, err) - err = d.Rotate().Flush(ctx, tx) + p1, v1 = v1, []byte("value1.3") + d.SetTxNum(d.aggregationStep + 2) + err = d.PutWithPrev(k1, nil, v1, p1) require.NoError(t, err) - c, err := d.collate(ctx, 0, 0, 7, tx, logEvery) + p1, v1 = v1, []byte("value1.4") + d.SetTxNum(d.aggregationStep + 3) + err = d.PutWithPrev(k1, nil, v1, p1) + require.NoError(t, err) + p1, v1 = v1, []byte("value1.5") + expectedStep2 := uint64(2) + d.SetTxNum(expectedStep2*d.aggregationStep + 2) + err = d.PutWithPrev(k1, nil, v1, p1) require.NoError(t, err) - require.True(t, strings.HasSuffix(c.valuesPath, "base.0-1.kv")) - require.Equal(t, 2, c.valuesCount) - require.True(t, strings.HasSuffix(c.historyPath, "base.0-1.v")) - require.Equal(t, 3, c.historyCount) - require.Equal(t, 2, len(c.indexBitmaps)) - require.Equal(t, []uint64{3}, c.indexBitmaps["key2"].ToArray()) - require.Equal(t, []uint64{2, 6}, c.indexBitmaps["key1"].ToArray()) - sf, err := d.buildFiles(ctx, 0, c, 
background.NewProgressSet()) + err = d.Rotate().Flush(ctx, tx) require.NoError(t, err) - defer sf.Close() - c.Close() + { + c, err := d.collate(ctx, 0, 0, 7, tx) - g := sf.valuesDecomp.MakeGetter() - g.Reset(0) - var words []string - for g.HasNext() { - w, _ := g.Next(nil) - words = append(words, string(w)) + require.NoError(t, err) + require.True(t, strings.HasSuffix(c.valuesPath, "base.0-1.kv")) + require.Equal(t, 2, c.valuesCount) + require.True(t, strings.HasSuffix(c.historyPath, "base.0-1.v")) + require.Equal(t, 3, c.historyCount) + require.Equal(t, 2, len(c.indexBitmaps)) + require.Equal(t, []uint64{3}, c.indexBitmaps["key2"].ToArray()) + require.Equal(t, []uint64{2, 6}, c.indexBitmaps["key1"].ToArray()) + + sf, err := d.buildFiles(ctx, 0, c, background.NewProgressSet()) + require.NoError(t, err) + c.Close() + + g := NewArchiveGetter(sf.valuesDecomp.MakeGetter(), d.compression) + g.Reset(0) + var words []string + for g.HasNext() { + w, _ := g.Next(nil) + words = append(words, string(w)) + } + switch domainLargeValues { + case true: + require.Equal(t, []string{"key1", "value1.2", "key2", "value2.1"}, words) + default: + is := make([]byte, 8) + binary.BigEndian.PutUint64(is, ^expectedStep1) + v1 := string(is) + "value1.2" + //binary.BigEndian.PutUint64(is, ^expectedStep2) + v2 := string(is) + "value2.1" + require.Equal(t, []string{"key1", v1, "key2", v2}, words) + } + // Check index + //require.Equal(t, 2, int(sf.valuesIdx.KeyCount())) + require.Equal(t, 2, int(sf.valuesBt.KeyCount())) + + //r := recsplit.NewIndexReader(sf.valuesIdx) + //defer r.Close() + //for i := 0; i < len(words); i += 2 { + // offset := r.Lookup([]byte(words[i])) + // g.Reset(offset) + // w, _ := g.Next(nil) + // require.Equal(t, words[i], string(w)) + // w, _ = g.Next(nil) + // require.Equal(t, words[i+1], string(w)) + //} + + for i := 0; i < len(words); i += 2 { + c, _ := sf.valuesBt.SeekDeprecated([]byte(words[i])) + require.Equal(t, words[i], string(c.Key())) + require.Equal(t, 
words[i+1], string(c.Value())) + } } - require.Equal(t, []string{"key1", "value1.2", "key2", "value2.1"}, words) - // Check index - require.Equal(t, 2, int(sf.valuesIdx.KeyCount())) + { + c, err := d.collate(ctx, 1, 1*d.aggregationStep, 2*d.aggregationStep, tx) + require.NoError(t, err) + sf, err := d.buildFiles(ctx, 1, c, background.NewProgressSet()) + require.NoError(t, err) + c.Close() + + g := sf.valuesDecomp.MakeGetter() + g.Reset(0) + var words []string + for g.HasNext() { + w, _ := g.Next(nil) + words = append(words, string(w)) + } + require.Equal(t, []string{"key1", "value1.4"}, words) + // Check index + require.Equal(t, 1, int(sf.valuesBt.KeyCount())) + for i := 0; i < len(words); i += 2 { + c, _ := sf.valuesBt.SeekDeprecated([]byte(words[i])) + require.Equal(t, words[i], string(c.Key())) + require.Equal(t, words[i+1], string(c.Value())) + } - r := recsplit.NewIndexReader(sf.valuesIdx) - defer r.Close() - for i := 0; i < len(words); i += 2 { - offset := r.Lookup([]byte(words[i])) - g.Reset(offset) - w, _ := g.Next(nil) - require.Equal(t, words[i], string(w)) - w, _ = g.Next(nil) - require.Equal(t, words[i+1], string(w)) + //require.Equal(t, 1, int(sf.valuesIdx.KeyCount())) + //r := recsplit.NewIndexReader(sf.valuesIdx) + //defer r.Close() + //for i := 0; i < len(words); i += 2 { + // offset := r.Lookup([]byte(words[i])) + // g.Reset(offset) + // w, _ := g.Next(nil) + // require.Equal(t, words[i], string(w)) + // w, _ = g.Next(nil) + // require.Equal(t, words[i+1], string(w)) + //} } } -func TestIterationBasic(t *testing.T) { +func TestDomain_IterationBasic(t *testing.T) { logger := log.New() - _, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomain(t, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -160,23 +292,39 @@ func TestIterationBasic(t *testing.T) { err = d.Put([]byte("addr3"), []byte("loc2"), []byte("value1")) require.NoError(t, err) - var keys, vals []string dc := d.MakeContext() defer dc.Close() 
- err = dc.IteratePrefix([]byte("addr2"), func(k, v []byte) { - keys = append(keys, string(k)) - vals = append(vals, string(v)) - }) - require.NoError(t, err) - require.Equal(t, []string{"addr2loc1", "addr2loc2"}, keys) - require.Equal(t, []string{"value1", "value1"}, vals) + + { + var keys, vals []string + err = dc.IteratePrefix(tx, []byte("addr2"), func(k, v []byte) { + keys = append(keys, string(k)) + vals = append(vals, string(v)) + }) + require.NoError(t, err) + require.Equal(t, []string{"addr2loc1", "addr2loc2"}, keys) + require.Equal(t, []string{"value1", "value1"}, vals) + } + { + var keys, vals []string + iter2, err := dc.IteratePrefix2(tx, []byte("addr2"), []byte("addr3"), -1) + require.NoError(t, err) + for iter2.HasNext() { + k, v, err := iter2.Next() + require.NoError(t, err) + keys = append(keys, string(k)) + vals = append(vals, string(v)) + } + require.Equal(t, []string{"addr2loc1", "addr2loc2"}, keys) + require.Equal(t, []string{"value1", "value1"}, vals) + } } -func TestAfterPrune(t *testing.T) { +func TestDomain_AfterPrune(t *testing.T) { logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomain(t, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) @@ -186,30 +334,47 @@ func TestAfterPrune(t *testing.T) { d.StartWrites() defer d.FinishWrites() + var ( + k1 = []byte("key1") + k2 = []byte("key2") + p1 []byte + p2 []byte + + n1, n2 = []byte("value1.1"), []byte("value2.1") + ) + d.SetTxNum(2) - err = d.Put([]byte("key1"), nil, []byte("value1.1")) + err = d.PutWithPrev(k1, nil, n1, p1) require.NoError(t, err) d.SetTxNum(3) - err = d.Put([]byte("key2"), nil, []byte("value2.1")) + err = d.PutWithPrev(k2, nil, n2, p2) require.NoError(t, err) + p1, p2 = n1, n2 + n1, n2 = []byte("value1.2"), []byte("value2.2") + d.SetTxNum(6) - err = d.Put([]byte("key1"), nil, []byte("value1.2")) + err = d.PutWithPrev(k1, nil, n1, p1) require.NoError(t, err) + p1, n1 
= n1, []byte("value1.3") + d.SetTxNum(17) - err = d.Put([]byte("key1"), nil, []byte("value1.3")) + err = d.PutWithPrev(k1, nil, n1, p1) require.NoError(t, err) + p1 = n1 + d.SetTxNum(18) - err = d.Put([]byte("key2"), nil, []byte("value2.2")) + err = d.PutWithPrev(k2, nil, n2, p2) require.NoError(t, err) + p2 = n2 err = d.Rotate().Flush(ctx, tx) require.NoError(t, err) - c, err := d.collate(ctx, 0, 0, 16, tx, logEvery) + c, err := d.collate(ctx, 0, 0, 16, tx) require.NoError(t, err) sf, err := d.buildFiles(ctx, 0, c, background.NewProgressSet()) @@ -219,40 +384,49 @@ func TestAfterPrune(t *testing.T) { var v []byte dc := d.MakeContext() defer dc.Close() - v, err = dc.Get([]byte("key1"), nil, tx) + v, found, err := dc.GetLatest(k1, nil, tx) + require.Truef(t, found, "key1 not found") require.NoError(t, err) - require.Equal(t, []byte("value1.3"), v) - v, err = dc.Get([]byte("key2"), nil, tx) + require.Equal(t, p1, v) + v, found, err = dc.GetLatest(k2, nil, tx) + require.Truef(t, found, "key2 not found") require.NoError(t, err) - require.Equal(t, []byte("value2.2"), v) + require.Equal(t, p2, v) - err = d.prune(ctx, 0, 0, 16, math.MaxUint64, logEvery) + err = dc.Prune(ctx, tx, 0, 0, 16, math.MaxUint64, logEvery) require.NoError(t, err) isEmpty, err := d.isEmpty(tx) require.NoError(t, err) require.False(t, isEmpty) - v, err = dc.Get([]byte("key1"), nil, tx) + v, found, err = dc.GetLatest(k1, nil, tx) require.NoError(t, err) - require.Equal(t, []byte("value1.3"), v) - v, err = dc.Get([]byte("key2"), nil, tx) + require.Truef(t, found, "key1 not found") + require.Equal(t, p1, v) + + v, found, err = dc.GetLatest(k2, nil, tx) require.NoError(t, err) - require.Equal(t, []byte("value2.2"), v) + require.Truef(t, found, "key2 not found") + require.Equal(t, p2, v) } -func filledDomain(t *testing.T, logger log.Logger) (string, kv.RwDB, *Domain, uint64) { +func filledDomain(t *testing.T, logger log.Logger) (kv.RwDB, *Domain, uint64) { t.Helper() - path, db, d := testDbAndDomain(t, 
logger) + require := require.New(t) + db, d := testDbAndDomain(t, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) - require.NoError(t, err) + require.NoError(err) defer tx.Rollback() d.SetTx(tx) - d.StartWrites() + d.StartUnbufferedWrites() defer d.FinishWrites() txs := uint64(1000) + + dc := d.MakeContext() + defer dc.Close() // keys are encodings of numbers 1..31 // each key changes value on every txNum which is multiple of the key for txNum := uint64(1); txNum <= txs; txNum++ { @@ -264,56 +438,62 @@ func filledDomain(t *testing.T, logger log.Logger) (string, kv.RwDB, *Domain, ui var v [8]byte binary.BigEndian.PutUint64(k[:], keyNum) binary.BigEndian.PutUint64(v[:], valNum) - err = d.Put(k[:], nil, v[:]) - require.NoError(t, err) + prev, _, err := dc.GetLatest(k[:], nil, tx) + require.NoError(err) + err = d.PutWithPrev(k[:], nil, v[:], prev) + + require.NoError(err) } } if txNum%10 == 0 { err = d.Rotate().Flush(ctx, tx) - require.NoError(t, err) + require.NoError(err) } } err = d.Rotate().Flush(ctx, tx) - require.NoError(t, err) + require.NoError(err) err = tx.Commit() - require.NoError(t, err) - return path, db, d, txs + require.NoError(err) + return db, d, txs } func checkHistory(t *testing.T, db kv.RwDB, d *Domain, txs uint64) { t.Helper() + fmt.Printf("txs: %d\n", txs) + t.Helper() + require := require.New(t) ctx := context.Background() var err error + // Check the history - var roTx kv.Tx dc := d.MakeContext() defer dc.Close() + roTx, err := db.BeginRo(ctx) + require.NoError(err) + defer roTx.Rollback() + for txNum := uint64(0); txNum <= txs; txNum++ { - if txNum == 976 { - // Create roTx obnly for the last several txNum, because all history before that - // we should be able to read without any DB access - roTx, err = db.BeginRo(ctx) - require.NoError(t, err) - defer roTx.Rollback() - } for keyNum := uint64(1); keyNum <= uint64(31); keyNum++ { valNum := txNum / keyNum var k [8]byte var v [8]byte - label := fmt.Sprintf("txNum=%d, keyNum=%d", 
txNum, keyNum) binary.BigEndian.PutUint64(k[:], keyNum) binary.BigEndian.PutUint64(v[:], valNum) - val, err := dc.GetBeforeTxNum(k[:], txNum+1, roTx) - require.NoError(t, err, label) + + label := fmt.Sprintf("key %x txNum=%d, keyNum=%d", k, txNum, keyNum) + + val, err := dc.GetAsOf(k[:], txNum+1, roTx) + require.NoError(err, label) if txNum >= keyNum { - require.Equal(t, v[:], val, label) + require.Equal(v[:], val, label) } else { - require.Nil(t, val, label) + require.Nil(val, label) } if txNum == txs { - val, err := dc.Get(k[:], nil, roTx) - require.NoError(t, err) - require.EqualValues(t, v[:], val) + val, found, err := dc.GetLatest(k[:], nil, roTx) + require.True(found, label) + require.NoError(err) + require.EqualValues(v[:], val, label) } } } @@ -323,28 +503,13 @@ func TestHistory(t *testing.T) { logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, d, txs := filledDomain(t, logger) - ctx := context.Background() - tx, err := db.BeginRw(ctx) - require.NoError(t, err) - d.SetTx(tx) - defer tx.Rollback() + db, d, txs := filledDomain(t, logger) + //ctx := context.Background() + //tx, err := db.BeginRw(ctx) + //require.NoError(t, err) + //defer tx.Rollback() - // Leave the last 2 aggregation steps un-collated - for step := uint64(0); step < txs/d.aggregationStep-1; step++ { - func() { - c, err := d.collate(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, tx, logEvery) - require.NoError(t, err) - sf, err := d.buildFiles(ctx, step, c, background.NewProgressSet()) - require.NoError(t, err) - d.integrateFiles(sf, step*d.aggregationStep, (step+1)*d.aggregationStep) - - err = d.prune(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, math.MaxUint64, logEvery) - require.NoError(t, err) - }() - } - err = tx.Commit() - require.NoError(t, err) + collateAndMerge(t, db, nil, d, txs) checkHistory(t, db, d, txs) } @@ -352,7 +517,7 @@ func TestIterationMultistep(t *testing.T) { logger := log.New() logEvery := 
time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomain(t, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -396,27 +561,45 @@ func TestIterationMultistep(t *testing.T) { for step := uint64(0); step <= 2; step++ { func() { - c, err := d.collate(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, tx, logEvery) + c, err := d.collate(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, tx) require.NoError(t, err) sf, err := d.buildFiles(ctx, step, c, background.NewProgressSet()) require.NoError(t, err) d.integrateFiles(sf, step*d.aggregationStep, (step+1)*d.aggregationStep) - err = d.prune(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, math.MaxUint64, logEvery) + + dc := d.MakeContext() + err = dc.Prune(ctx, tx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, math.MaxUint64, logEvery) + dc.Close() require.NoError(t, err) }() } - var keys []string - var vals []string dc := d.MakeContext() defer dc.Close() - err = dc.IteratePrefix([]byte("addr2"), func(k, v []byte) { - keys = append(keys, string(k)) - vals = append(vals, string(v)) - }) - require.NoError(t, err) - require.Equal(t, []string{"addr2loc2", "addr2loc3", "addr2loc4"}, keys) - require.Equal(t, []string{"value1", "value1", "value1"}, vals) + + { + var keys, vals []string + err = dc.IteratePrefix(tx, []byte("addr2"), func(k, v []byte) { + keys = append(keys, string(k)) + vals = append(vals, string(v)) + }) + require.NoError(t, err) + require.Equal(t, []string{"addr2loc2", "addr2loc3", "addr2loc4"}, keys) + require.Equal(t, []string{"value1", "value1", "value1"}, vals) + } + { + var keys, vals []string + iter2, err := dc.IteratePrefix2(tx, []byte("addr2"), []byte("addr3"), -1) + require.NoError(t, err) + for iter2.HasNext() { + k, v, err := iter2.Next() + require.NoError(t, err) + keys = append(keys, string(k)) + vals = append(vals, string(v)) + } + 
require.Equal(t, []string{"addr2loc2", "addr2loc3", "addr2loc4"}, keys) + require.Equal(t, []string{"value1", "value1", "value1"}, vals) + } } func collateAndMerge(t *testing.T, db kv.RwDB, tx kv.RwTx, d *Domain, txs uint64) { @@ -428,36 +611,42 @@ func collateAndMerge(t *testing.T, db kv.RwDB, tx kv.RwTx, d *Domain, txs uint64 var err error useExternalTx := tx != nil if !useExternalTx { - tx, err = db.BeginRw(ctx) + tx, err = db.BeginRwNosync(ctx) require.NoError(t, err) defer tx.Rollback() } d.SetTx(tx) // Leave the last 2 aggregation steps un-collated for step := uint64(0); step < txs/d.aggregationStep-1; step++ { - c, err := d.collate(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, tx, logEvery) + c, err := d.collate(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, tx) require.NoError(t, err) sf, err := d.buildFiles(ctx, step, c, background.NewProgressSet()) require.NoError(t, err) d.integrateFiles(sf, step*d.aggregationStep, (step+1)*d.aggregationStep) - err = d.prune(ctx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, math.MaxUint64, logEvery) + + dc := d.MakeContext() + err = dc.Prune(ctx, tx, step, step*d.aggregationStep, (step+1)*d.aggregationStep, math.MaxUint64, logEvery) + dc.Close() require.NoError(t, err) } var r DomainRanges maxEndTxNum := d.endTxNumMinimax() - maxSpan := d.aggregationStep * StepsInBiggestFile + maxSpan := d.aggregationStep * StepsInColdFile for { if stop := func() bool { dc := d.MakeContext() defer dc.Close() - r = d.findMergeRange(maxEndTxNum, maxSpan) + r = dc.findMergeRange(maxEndTxNum, maxSpan) if !r.any() { return true } valuesOuts, indexOuts, historyOuts, _ := dc.staticFilesInRange(r) valuesIn, indexIn, historyIn, err := d.mergeFiles(ctx, valuesOuts, indexOuts, historyOuts, r, 1, background.NewProgressSet()) require.NoError(t, err) + if valuesIn != nil && valuesIn.decompressor != nil { + fmt.Printf("merge: %s\n", valuesIn.decompressor.FileName()) + } d.integrateMergedFiles(valuesOuts, 
indexOuts, historyOuts, valuesIn, indexIn, historyIn) return false }(); stop { @@ -477,21 +666,27 @@ func collateAndMergeOnce(t *testing.T, d *Domain, step uint64) { ctx := context.Background() txFrom, txTo := (step)*d.aggregationStep, (step+1)*d.aggregationStep - c, err := d.collate(ctx, step, txFrom, txTo, d.tx, logEvery) + c, err := d.collate(ctx, step, txFrom, txTo, d.tx) require.NoError(t, err) sf, err := d.buildFiles(ctx, step, c, background.NewProgressSet()) require.NoError(t, err) d.integrateFiles(sf, txFrom, txTo) - err = d.prune(ctx, step, txFrom, txTo, math.MaxUint64, logEvery) + dc := d.MakeContext() + err = dc.Prune(ctx, d.tx, step, txFrom, txTo, math.MaxUint64, logEvery) + dc.Close() require.NoError(t, err) - var r DomainRanges maxEndTxNum := d.endTxNumMinimax() - maxSpan := d.aggregationStep * StepsInBiggestFile - for r = d.findMergeRange(maxEndTxNum, maxSpan); r.any(); r = d.findMergeRange(maxEndTxNum, maxSpan) { + maxSpan := d.aggregationStep * StepsInColdFile + for { dc := d.MakeContext() + r := dc.findMergeRange(maxEndTxNum, maxSpan) + if !r.any() { + dc.Close() + break + } valuesOuts, indexOuts, historyOuts, _ := dc.staticFilesInRange(r) valuesIn, indexIn, historyIn, err := d.mergeFiles(ctx, valuesOuts, indexOuts, historyOuts, r, 1, background.NewProgressSet()) require.NoError(t, err) @@ -503,21 +698,24 @@ func collateAndMergeOnce(t *testing.T, d *Domain, step uint64) { func TestDomain_MergeFiles(t *testing.T) { logger := log.New() - _, db, d, txs := filledDomain(t, logger) + db, d, txs := filledDomain(t, logger) + rwTx, err := db.BeginRw(context.Background()) + require.NoError(t, err) - collateAndMerge(t, db, nil, d, txs) + collateAndMerge(t, db, rwTx, d, txs) + err = rwTx.Commit() + require.NoError(t, err) checkHistory(t, db, d, txs) } func TestDomain_ScanFiles(t *testing.T) { logger := log.New() - path, db, d, txs := filledDomain(t, logger) - _ = path + db, d, txs := filledDomain(t, logger) collateAndMerge(t, db, nil, d, txs) // Recreate 
domain and re-scan the files txNum := d.txNum d.closeWhatNotInList([]string{}) - d.OpenFolder() + require.NoError(t, d.OpenFolder()) d.SetTxNum(txNum) // Check the history @@ -526,7 +724,7 @@ func TestDomain_ScanFiles(t *testing.T) { func TestDomain_Delete(t *testing.T) { logger := log.New() - _, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomain(t, logger) ctx, require := context.Background(), require.New(t) tx, err := db.BeginRw(ctx) require.NoError(err) @@ -553,7 +751,7 @@ func TestDomain_Delete(t *testing.T) { defer dc.Close() for txNum := uint64(0); txNum < 1000; txNum++ { label := fmt.Sprintf("txNum=%d", txNum) - //val, ok, err := dc.GetBeforeTxNum([]byte("key1"), txNum+1, tx) + //val, ok, err := dc.GetLatestBeforeTxNum([]byte("key1"), txNum+1, tx) //require.NoError(err) //require.True(ok) //if txNum%2 == 0 { @@ -562,7 +760,7 @@ func TestDomain_Delete(t *testing.T) { // require.Nil(val, label) //} //if txNum == 976 { - val, err := dc.GetBeforeTxNum([]byte("key2"), txNum+1, tx) + val, err := dc.GetAsOf([]byte("key2"), txNum+1, tx) require.NoError(err) //require.False(ok, label) require.Nil(val, label) @@ -570,9 +768,9 @@ func TestDomain_Delete(t *testing.T) { } } -func filledDomainFixedSize(t *testing.T, keysCount, txCount uint64, logger log.Logger) (string, kv.RwDB, *Domain, map[string][]bool) { +func filledDomainFixedSize(t *testing.T, keysCount, txCount, aggStep uint64, logger log.Logger) (kv.RwDB, *Domain, map[uint64][]bool) { t.Helper() - path, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomainOfStep(t, aggStep, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -583,25 +781,47 @@ func filledDomainFixedSize(t *testing.T, keysCount, txCount uint64, logger log.L // keys are encodings of numbers 1..31 // each key changes value on every txNum which is multiple of the key - dat := make(map[string][]bool) // K:V is key -> list of bools. 
If list[i] == true, i'th txNum should persists + dat := make(map[uint64][]bool) // K:V is key -> list of bools. If list[i] == true, i'th txNum should persists + var k [8]byte + var v [8]byte + maxFrozenFiles := (txCount / d.aggregationStep) / StepsInColdFile + // key 0: only in frozen file 0 + // key 1: only in frozen file 1 and file 2 + // key 2: in frozen file 2 and in warm files + // other keys: only in warm files for txNum := uint64(1); txNum <= txCount; txNum++ { d.SetTxNum(txNum) - for keyNum := uint64(1); keyNum <= keysCount; keyNum++ { - if keyNum == txNum%d.aggregationStep { - continue + step := txNum / d.aggregationStep + frozenFileNum := step / 32 + for keyNum := uint64(0); keyNum < keysCount; keyNum++ { + if frozenFileNum < maxFrozenFiles { // frozen data + allowInsert := (keyNum == 0 && frozenFileNum == 0) || + (keyNum == 1 && (frozenFileNum == 1 || frozenFileNum == 2)) || + (keyNum == 2 && frozenFileNum == 2) + if !allowInsert { + continue + } + //fmt.Printf("put frozen: %d, step=%d, %d\n", keyNum, step, frozenFileNum) + } else { //warm data + if keyNum == 0 || keyNum == 1 { + continue + } + if keyNum == txNum%d.aggregationStep { + continue + } + //fmt.Printf("put: %d, step=%d\n", keyNum, step) } - var k [8]byte - var v [8]byte + binary.BigEndian.PutUint64(k[:], keyNum) binary.BigEndian.PutUint64(v[:], txNum) + //v[0] = 3 // value marker err = d.Put(k[:], nil, v[:]) require.NoError(t, err) - - if _, ok := dat[fmt.Sprintf("%d", keyNum)]; !ok { - dat[fmt.Sprintf("%d", keyNum)] = make([]bool, txCount+1) + if _, ok := dat[keyNum]; !ok { + dat[keyNum] = make([]bool, txCount+1) } - dat[fmt.Sprintf("%d", keyNum)][txNum] = true + dat[keyNum][txNum] = true } if txNum%d.aggregationStep == 0 { err = d.Rotate().Flush(ctx, tx) @@ -610,7 +830,7 @@ func filledDomainFixedSize(t *testing.T, keysCount, txCount uint64, logger log.L } err = tx.Commit() require.NoError(t, err) - return path, db, d, dat + return db, d, dat } // firstly we write all the data to domain @@ 
-620,9 +840,9 @@ func filledDomainFixedSize(t *testing.T, keysCount, txCount uint64, logger log.L func TestDomain_Prune_AfterAllWrites(t *testing.T) { logger := log.New() keyCount, txCount := uint64(4), uint64(64) - _, db, dom, data := filledDomainFixedSize(t, keyCount, txCount, logger) - + db, dom, data := filledDomainFixedSize(t, keyCount, txCount, 16, logger) collateAndMerge(t, db, nil, dom, txCount) + maxFrozenFiles := (txCount / dom.aggregationStep) / StepsInColdFile ctx := context.Background() roTx, err := db.BeginRo(ctx) @@ -632,18 +852,36 @@ func TestDomain_Prune_AfterAllWrites(t *testing.T) { // Check the history dc := dom.MakeContext() defer dc.Close() + var k, v [8]byte + for txNum := uint64(1); txNum <= txCount; txNum++ { - for keyNum := uint64(1); keyNum <= keyCount; keyNum++ { - var k [8]byte - var v [8]byte + for keyNum := uint64(0); keyNum < keyCount; keyNum++ { + step := txNum / dom.aggregationStep + frozenFileNum := step / 32 + if frozenFileNum < maxFrozenFiles { // frozen data + if keyNum != frozenFileNum { + continue + } + continue + //fmt.Printf("put frozen: %d, step=%d, %d\n", keyNum, step, frozenFileNum) + } else { //warm data + if keyNum == 0 || keyNum == 1 { + continue + } + if keyNum == txNum%dom.aggregationStep { + continue + } + //fmt.Printf("put: %d, step=%d\n", keyNum, step) + } + label := fmt.Sprintf("txNum=%d, keyNum=%d\n", txNum, keyNum) binary.BigEndian.PutUint64(k[:], keyNum) binary.BigEndian.PutUint64(v[:], txNum) - val, err := dc.GetBeforeTxNum(k[:], txNum+1, roTx) + val, err := dc.GetAsOf(k[:], txNum+1, roTx) // during generation such keys are skipped so value should be nil for this call require.NoError(t, err, label) - if !data[fmt.Sprintf("%d", keyNum)][txNum] { + if !data[keyNum][txNum] { if txNum > 1 { binary.BigEndian.PutUint64(v[:], txNum-1) } else { @@ -655,15 +893,14 @@ func TestDomain_Prune_AfterAllWrites(t *testing.T) { } } - var v [8]byte + //warm keys binary.BigEndian.PutUint64(v[:], txCount) - - for keyNum := 
uint64(1); keyNum <= keyCount; keyNum++ { - var k [8]byte - label := fmt.Sprintf("txNum=%d, keyNum=%d\n", txCount, keyNum) + for keyNum := uint64(2); keyNum < keyCount; keyNum++ { + label := fmt.Sprintf("txNum=%d, keyNum=%d\n", txCount-1, keyNum) binary.BigEndian.PutUint64(k[:], keyNum) - storedV, err := dc.Get(k[:], nil, roTx) + storedV, found, err := dc.GetLatest(k[:], nil, roTx) + require.Truef(t, found, label) require.NoError(t, err, label) require.EqualValues(t, v[:], storedV, label) } @@ -673,15 +910,14 @@ func TestDomain_PruneOnWrite(t *testing.T) { logger := log.New() keysCount, txCount := uint64(16), uint64(64) - path, db, d := testDbAndDomain(t, logger) + db, d := testDbAndDomain(t, logger) ctx := context.Background() - defer os.Remove(path) tx, err := db.BeginRw(ctx) require.NoError(t, err) defer tx.Rollback() d.SetTx(tx) - d.StartWrites() + d.StartUnbufferedWrites() defer d.FinishWrites() // keys are encodings of numbers 1..31 @@ -734,7 +970,7 @@ func TestDomain_PruneOnWrite(t *testing.T) { binary.BigEndian.PutUint64(k[:], keyNum) binary.BigEndian.PutUint64(v[:], valNum) - val, err := dc.GetBeforeTxNum(k[:], txNum+1, tx) + val, err := dc.GetAsOf(k[:], txNum+1, tx) require.NoError(t, err) if keyNum == txNum%d.aggregationStep { if txNum > 1 { @@ -759,17 +995,21 @@ func TestDomain_PruneOnWrite(t *testing.T) { label := fmt.Sprintf("txNum=%d, keyNum=%d\n", txCount, keyNum) binary.BigEndian.PutUint64(k[:], keyNum) - storedV, err := dc.Get(k[:], nil, tx) - require.NoError(t, err, label) + storedV, found, err := dc.GetLatest(k[:], nil, tx) + require.Truef(t, found, label) + require.NoErrorf(t, err, label) require.EqualValues(t, v[:], storedV, label) } + + from, to := d.stepsRangeInDB(tx) + require.Equal(t, 3, int(from)) + require.Equal(t, 4, int(to)) + } func TestScanStaticFilesD(t *testing.T) { - logger := log.New() - ii := &Domain{History: &History{InvertedIndex: &InvertedIndex{filenameBase: "test", aggregationStep: 1, logger: logger}, logger: logger}, - 
files: btree2.NewBTreeG[*filesItem](filesItemLess), - logger: logger, + ii := &Domain{History: &History{InvertedIndex: emptyTestInvertedIndex(1)}, + files: btree2.NewBTreeG[*filesItem](filesItemLess), } files := []string{ "test.0-1.kv", @@ -789,3 +1029,649 @@ func TestScanStaticFilesD(t *testing.T) { }) require.Equal(t, 6, len(found)) } + +func TestDomain_CollationBuildInMem(t *testing.T) { + logEvery := time.NewTicker(30 * time.Second) + defer logEvery.Stop() + db, d := testDbAndDomain(t, log.New()) + ctx := context.Background() + defer d.Close() + + tx, err := db.BeginRw(ctx) + require.NoError(t, err) + defer tx.Rollback() + d.SetTx(tx) + d.StartUnbufferedWrites() + defer d.FinishWrites() + + var preval1, preval2, preval3 []byte + maxTx := uint64(10000) + d.aggregationStep = maxTx + + dctx := d.MakeContext() + defer dctx.Close() + + l := []byte("asd9s9af0afa9sfh9afha") + + for i := 0; i < int(maxTx); i++ { + v1 := []byte(fmt.Sprintf("value1.%d", i)) + v2 := []byte(fmt.Sprintf("value2.%d", i)) + s := []byte(fmt.Sprintf("longstorage2.%d", i)) + + if i > 0 { + pv, _, err := dctx.GetLatest([]byte("key1"), nil, tx) + require.NoError(t, err) + require.Equal(t, pv, preval1) + + pv1, _, err := dctx.GetLatest([]byte("key2"), nil, tx) + require.NoError(t, err) + require.Equal(t, pv1, preval2) + + ps, _, err := dctx.GetLatest([]byte("key3"), l, tx) + require.NoError(t, err) + require.Equal(t, ps, preval3) + } + + d.SetTxNum(uint64(i)) + err = d.PutWithPrev([]byte("key1"), nil, v1, preval1) + require.NoError(t, err) + + err = d.PutWithPrev([]byte("key2"), nil, v2, preval2) + require.NoError(t, err) + + err = d.PutWithPrev([]byte("key3"), l, s, preval3) + require.NoError(t, err) + + preval1, preval2, preval3 = v1, v2, s + } + + err = d.Rotate().Flush(ctx, tx) + require.NoError(t, err) + + c, err := d.collate(ctx, 0, 0, maxTx, tx) + + require.NoError(t, err) + require.True(t, strings.HasSuffix(c.valuesPath, "base.0-1.kv")) + require.Equal(t, 3, c.valuesCount) + require.True(t, 
strings.HasSuffix(c.historyPath, "base.0-1.v")) + require.EqualValues(t, 3*maxTx, c.historyCount) + require.Equal(t, 3, len(c.indexBitmaps)) + require.Len(t, c.indexBitmaps["key2"].ToArray(), int(maxTx)) + require.Len(t, c.indexBitmaps["key1"].ToArray(), int(maxTx)) + require.Len(t, c.indexBitmaps["key3"+string(l)].ToArray(), int(maxTx)) + + sf, err := d.buildFiles(ctx, 0, c, background.NewProgressSet()) + require.NoError(t, err) + c.Close() + + g := sf.valuesDecomp.MakeGetter() + g.Reset(0) + var words []string + for g.HasNext() { + w, _ := g.Next(nil) + words = append(words, string(w)) + } + require.EqualValues(t, []string{"key1", string(preval1), "key2", string(preval2), "key3" + string(l), string(preval3)}, words) + // Check index + require.Equal(t, 3, int(sf.valuesBt.KeyCount())) + for i := 0; i < len(words); i += 2 { + c, _ := sf.valuesBt.SeekDeprecated([]byte(words[i])) + require.Equal(t, words[i], string(c.Key())) + require.Equal(t, words[i+1], string(c.Value())) + } + + //require.Equal(t, 3, int(sf.valuesIdx.KeyCount())) + // + //r := recsplit.NewIndexReader(sf.valuesIdx) + //defer r.Close() + //for i := 0; i < len(words); i += 2 { + // offset := r.Lookup([]byte(words[i])) + // g.Reset(offset) + // w, _ := g.Next(nil) + // require.Equal(t, words[i], string(w)) + // w, _ = g.Next(nil) + // require.Equal(t, words[i+1], string(w)) + //} +} + +func TestDomainContext_IteratePrefixAgain(t *testing.T) { + db, d := testDbAndDomain(t, log.New()) + defer db.Close() + defer d.Close() + + tx, err := db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + + d.SetTx(tx) + d.historyLargeValues = true + d.StartUnbufferedWrites() + defer d.FinishWrites() + + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + key := make([]byte, 20) + var loc []byte + value := make([]byte, 32) + first := []byte{0xab, 0xff} + other := []byte{0xcc, 0xfe} + copy(key[:], first) + + values := make(map[string][]byte) + for i := 0; i < 30; i++ { + rnd.Read(key[2:]) 
+ if i == 15 { + copy(key[:2], other) + } + loc = make([]byte, 32) + rnd.Read(loc) + rnd.Read(value) + // if i%5 == 0 { + // d.SetTxNum(uint64(i)) + // } + + if i == 0 || i == 15 { + loc = nil + copy(key[2:], make([]byte, 18)) + } + + values[hex.EncodeToString(common.Append(key, loc))] = common.Copy(value) + err := d.PutWithPrev(key, loc, value, nil) + require.NoError(t, err) + } + + dctx := d.MakeContext() + defer dctx.Close() + + counter := 0 + err = dctx.IteratePrefix(tx, other, func(kx, vx []byte) { + if !bytes.HasPrefix(kx, other) { + return + } + fmt.Printf("%x \n", kx) + counter++ + v, ok := values[hex.EncodeToString(kx)] + require.True(t, ok) + require.Equal(t, v, vx) + }) + require.NoError(t, err) + err = dctx.IteratePrefix(tx, first, func(kx, vx []byte) { + if !bytes.HasPrefix(kx, first) { + return + } + fmt.Printf("%x \n", kx) + counter++ + v, ok := values[hex.EncodeToString(kx)] + require.True(t, ok) + require.Equal(t, v, vx) + }) + require.NoError(t, err) + require.EqualValues(t, len(values), counter) +} + +func TestDomainContext_IteratePrefix(t *testing.T) { + db, d := testDbAndDomain(t, log.New()) + defer db.Close() + defer d.Close() + + tx, err := db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + + d.SetTx(tx) + + d.historyLargeValues = true + d.StartUnbufferedWrites() + defer d.FinishWrites() + + rnd := rand.New(rand.NewSource(time.Now().UnixNano())) + key := make([]byte, 20) + value := make([]byte, 32) + copy(key[:], []byte{0xff, 0xff}) + + dctx := d.MakeContext() + defer dctx.Close() + + values := make(map[string][]byte) + for i := 0; i < 3000; i++ { + rnd.Read(key[2:]) + rnd.Read(value) + + values[hex.EncodeToString(key)] = common.Copy(value) + + err := d.PutWithPrev(key, nil, value, nil) + require.NoError(t, err) + } + + { + counter := 0 + err = dctx.IteratePrefix(tx, key[:2], func(kx, vx []byte) { + if !bytes.HasPrefix(kx, key[:2]) { + return + } + counter++ + v, ok := values[hex.EncodeToString(kx)] + 
require.True(t, ok) + require.Equal(t, v, vx) + }) + require.NoError(t, err) + require.EqualValues(t, len(values), counter) + } + { + counter := 0 + iter2, err := dctx.IteratePrefix2(tx, []byte("addr2"), []byte("addr3"), -1) + require.NoError(t, err) + for iter2.HasNext() { + kx, vx, err := iter2.Next() + require.NoError(t, err) + if !bytes.HasPrefix(kx, key[:2]) { + return + } + counter++ + v, ok := values[hex.EncodeToString(kx)] + require.True(t, ok) + require.Equal(t, v, vx) + } + } +} + +func TestDomainContext_getFromFiles(t *testing.T) { + db, d := testDbAndDomain(t, log.New()) + defer db.Close() + defer d.Close() + + tx, err := db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + + d.SetTx(tx) + d.StartUnbufferedWrites() + d.aggregationStep = 20 + + keys, vals := generateInputData(t, 8, 16, 100) + keys = keys[:20] + + var i int + values := make(map[string][][]byte) + + mc := d.MakeContext() + + for i = 0; i < len(vals); i++ { + d.SetTxNum(uint64(i)) + + for j := 0; j < len(keys); j++ { + buf := EncodeAccountBytes(uint64(i), uint256.NewInt(uint64(i*100_000)), nil, 0) + prev, _, err := mc.GetLatest(keys[j], nil, tx) + require.NoError(t, err) + + err = d.PutWithPrev(keys[j], nil, buf, prev) + require.NoError(t, err) + + if i > 0 && i+1%int(d.aggregationStep) == 0 { + values[hex.EncodeToString(keys[j])] = append(values[hex.EncodeToString(keys[j])], buf) + } + } + } + d.FinishWrites() + defer mc.Close() + + ctx := context.Background() + ps := background.NewProgressSet() + for step := uint64(0); step < uint64(len(vals))/d.aggregationStep; step++ { + dc := d.MakeContext() + + txFrom := step * d.aggregationStep + txTo := (step + 1) * d.aggregationStep + + fmt.Printf("Step %d [%d,%d)\n", step, txFrom, txTo) + + collation, err := d.collate(ctx, step, txFrom, txTo, d.tx) + require.NoError(t, err) + + sf, err := d.buildFiles(ctx, step, collation, ps) + require.NoError(t, err) + + d.integrateFiles(sf, txFrom, txTo) + collation.Close() + + 
logEvery := time.NewTicker(time.Second * 30) + + err = dc.Prune(ctx, tx, step, txFrom, txTo, math.MaxUint64, logEvery) + require.NoError(t, err) + + ranges := dc.findMergeRange(txFrom, txTo) + vl, il, hl, _ := dc.staticFilesInRange(ranges) + + dv, di, dh, err := d.mergeFiles(ctx, vl, il, hl, ranges, 1, ps) + require.NoError(t, err) + + d.integrateMergedFiles(vl, il, hl, dv, di, dh) + + logEvery.Stop() + + dc.Close() + } + + mc = d.MakeContext() + defer mc.Close() + + for key, bufs := range values { + var i int + + beforeTx := d.aggregationStep + for i = 0; i < len(bufs); i++ { + ks, _ := hex.DecodeString(key) + val, err := mc.GetAsOf(ks, beforeTx, tx) + require.NoError(t, err) + require.EqualValuesf(t, bufs[i], val, "key %s, tx %d", key, beforeTx) + beforeTx += d.aggregationStep + } + } +} + +func TestDomain_Unwind(t *testing.T) { + db, d := testDbAndDomain(t, log.New()) + ctx := context.Background() + defer d.Close() + + tx, err := db.BeginRw(ctx) + require.NoError(t, err) + defer tx.Rollback() + d.SetTx(tx) + d.StartWrites() + defer d.FinishWrites() + + var preval1, preval2 []byte + maxTx := uint64(16) + d.aggregationStep = maxTx + + dctx := d.MakeContext() + defer dctx.Close() + + for i := 0; i < int(maxTx); i++ { + v1 := []byte(fmt.Sprintf("value1.%d", i)) + v2 := []byte(fmt.Sprintf("value2.%d", i)) + + //if i > 0 { + // pv, _, err := dctx.GetLatest([]byte("key1"), nil, tx) + // require.NoError(t, err) + // require.Equal(t, pv, preval1) + // + // pv1, _, err := dctx.GetLatest([]byte("key2"), nil, tx) + // require.NoError(t, err) + // require.Equal(t, pv1, preval2) + // + // ps, _, err := dctx.GetLatest([]byte("key3"), l, tx) + // require.NoError(t, err) + // require.Equal(t, ps, preval3) + //} + // + d.SetTxNum(uint64(i)) + err = d.PutWithPrev([]byte("key1"), nil, v1, preval1) + require.NoError(t, err) + + err = d.PutWithPrev([]byte("key2"), nil, v2, preval2) + require.NoError(t, err) + + preval1, preval2 = v1, v2 + } + + err = d.Rotate().Flush(ctx, tx) + 
require.NoError(t, err) + + dc := d.MakeContext() + err = dc.Unwind(ctx, tx, 0, 5, maxTx, math.MaxUint64, nil) + require.NoError(t, err) + dc.Close() + + require.NoError(t, err) + d.MakeContext().IteratePrefix(tx, []byte("key1"), func(k, v []byte) { + fmt.Printf("%s: %s\n", k, v) + }) + return +} + +type upd struct { + txNum uint64 + value []byte +} + +func generateTestData(tb testing.TB, keySize1, keySize2, totalTx, keyTxsLimit, keyLimit uint64) map[string][]upd { + tb.Helper() + + data := make(map[string][]upd) + //seed := time.Now().Unix() + seed := 31 + defer tb.Logf("generated data with seed %d, keys %d", seed, keyLimit) + + r := rand.New(rand.NewSource(0)) + if keyLimit == 1 { + key1 := generateRandomKey(r, keySize1) + data[key1] = generateUpdates(r, totalTx, keyTxsLimit) + return data + } + + for i := uint64(0); i < keyLimit/2; i++ { + key1 := generateRandomKey(r, keySize1) + data[key1] = generateUpdates(r, totalTx, keyTxsLimit) + key2 := key1 + generateRandomKey(r, keySize2-keySize1) + data[key2] = generateUpdates(r, totalTx, keyTxsLimit) + } + + return data +} + +func generateRandomKey(r *rand.Rand, size uint64) string { + key := make([]byte, size) + r.Read(key) + return string(key) +} + +func generateUpdates(r *rand.Rand, totalTx, keyTxsLimit uint64) []upd { + updates := make([]upd, 0) + usedTxNums := make(map[uint64]bool) + + for i := uint64(0); i < keyTxsLimit; i++ { + txNum := generateRandomTxNum(r, totalTx, usedTxNums) + value := make([]byte, 10) + r.Read(value) + + updates = append(updates, upd{txNum: txNum, value: value}) + usedTxNums[txNum] = true + } + sort.Slice(updates, func(i, j int) bool { return updates[i].txNum < updates[j].txNum }) + + return updates +} + +func generateRandomTxNum(r *rand.Rand, maxTxNum uint64, usedTxNums map[uint64]bool) uint64 { + txNum := uint64(r.Intn(int(maxTxNum))) + for usedTxNums[txNum] { + txNum = uint64(r.Intn(int(maxTxNum))) + } + + return txNum +} + +func TestDomain_GetAfterAggregation(t *testing.T) { + db, d := 
testDbAndDomainOfStep(t, 25, log.New()) + defer db.Close() + defer d.Close() + + tx, err := db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + + d.historyLargeValues = false + d.History.compression = CompressKeys | CompressVals + d.domainLargeValues = true // false requires dupsort value table for domain + d.compression = CompressKeys | CompressVals + d.withLocalityIndex = true + + UseBpsTree = true + bufferedWrites := true + + d.SetTx(tx) + if bufferedWrites { + d.StartWrites() + } else { + d.StartUnbufferedWrites() + } + defer d.FinishWrites() + + keySize1 := uint64(length.Addr) + keySize2 := uint64(length.Addr + length.Hash) + totalTx := uint64(3000) + keyTxsLimit := uint64(50) + keyLimit := uint64(200) + + // put some kvs + data := generateTestData(t, keySize1, keySize2, totalTx, keyTxsLimit, keyLimit) + for key, updates := range data { + p := []byte{} + for i := 0; i < len(updates); i++ { + d.SetTxNum(updates[i].txNum) + d.PutWithPrev([]byte(key), nil, updates[i].value, p) + p = common.Copy(updates[i].value) + } + } + d.SetTxNum(totalTx) + + if bufferedWrites { + err = d.Rotate().Flush(context.Background(), tx) + require.NoError(t, err) + } + + // aggregate + collateAndMerge(t, db, tx, d, totalTx) + require.NoError(t, tx.Commit()) + + tx, err = db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + d.SetTx(tx) + + dc := d.MakeContext() + defer dc.Close() + + kc := 0 + for key, updates := range data { + kc++ + for i := 1; i < len(updates); i++ { + v, err := dc.GetAsOf([]byte(key), updates[i].txNum, tx) + require.NoError(t, err) + require.EqualValuesf(t, updates[i-1].value, v, "(%d/%d) key %x, tx %d", kc, len(data), []byte(key), updates[i-1].txNum) + } + if len(updates) == 0 { + continue + } + v, ok, err := dc.GetLatest([]byte(key), nil, tx) + require.NoError(t, err) + require.EqualValuesf(t, updates[len(updates)-1].value, v, "key %x latest", []byte(key)) + require.True(t, ok) + } +} + +func 
TestDomain_PruneAfterAggregation(t *testing.T) { + db, d := testDbAndDomainOfStep(t, 25, log.New()) + defer db.Close() + defer d.Close() + + tx, err := db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + + d.historyLargeValues = false + d.History.compression = CompressKeys | CompressVals + d.domainLargeValues = true // false requires dupsort value table for domain + d.compression = CompressKeys | CompressVals + d.withLocalityIndex = true + + UseBpsTree = true + bufferedWrites := true + + d.SetTx(tx) + if bufferedWrites { + d.StartWrites() + } else { + d.StartUnbufferedWrites() + } + defer d.FinishWrites() + + keySize1 := uint64(length.Addr) + keySize2 := uint64(length.Addr + length.Hash) + totalTx := uint64(5000) + keyTxsLimit := uint64(50) + keyLimit := uint64(200) + + // put some kvs + data := generateTestData(t, keySize1, keySize2, totalTx, keyTxsLimit, keyLimit) + for key, updates := range data { + p := []byte{} + for i := 0; i < len(updates); i++ { + d.SetTxNum(updates[i].txNum) + d.PutWithPrev([]byte(key), nil, updates[i].value, p) + p = common.Copy(updates[i].value) + } + } + d.SetTxNum(totalTx) + + if bufferedWrites { + err = d.Rotate().Flush(context.Background(), tx) + require.NoError(t, err) + } + + // aggregate + collateAndMerge(t, db, tx, d, totalTx) // expected to left 2 latest steps in db + + require.NoError(t, tx.Commit()) + + tx, err = db.BeginRw(context.Background()) + require.NoError(t, err) + defer tx.Rollback() + d.SetTx(tx) + + dc := d.MakeContext() + defer dc.Close() + + prefixes := 0 + err = dc.IteratePrefix(tx, nil, func(k, v []byte) { + upds, ok := data[string(k)] + require.True(t, ok) + prefixes++ + latest := upds[len(upds)-1] + if string(latest.value) != string(v) { + fmt.Printf("opanki %x\n", k) + for li := len(upds) - 1; li >= 0; li-- { + latest := upds[li] + if bytes.Equal(latest.value, v) { + t.Logf("returned value was set with nonce %d/%d (tx %d, step %d)", li+1, len(upds), latest.txNum, 
latest.txNum/d.aggregationStep) + } else { + continue + } + require.EqualValuesf(t, latest.value, v, "key %x txNum %d", k, latest.txNum) + break + } + } + + require.EqualValuesf(t, latest.value, v, "key %x txnum %d", k, latest.txNum) + }) + require.NoError(t, err) + require.EqualValues(t, len(data), prefixes, "seen less keys than expected") + + kc := 0 + for key, updates := range data { + kc++ + for i := 1; i < len(updates); i++ { + v, err := dc.GetAsOf([]byte(key), updates[i].txNum, tx) + require.NoError(t, err) + require.EqualValuesf(t, updates[i-1].value, v, "(%d/%d) key %x, tx %d", kc, len(data), []byte(key), updates[i-1].txNum) + } + if len(updates) == 0 { + continue + } + v, ok, err := dc.GetLatest([]byte(key), nil, tx) + require.NoError(t, err) + require.EqualValuesf(t, updates[len(updates)-1].value, v, "key %x latest", []byte(key)) + require.True(t, ok) + } +} diff --git a/state/gc_test.go b/state/gc_test.go index a159b766d..3b5cc3fe3 100644 --- a/state/gc_test.go +++ b/state/gc_test.go @@ -33,7 +33,12 @@ func TestGCReadAfterRemoveFile(t *testing.T) { // - open new view // - make sure there is no canDelete file hc := h.MakeContext() - _ = hc + if h.withLocalityIndex { + //require.Nil(hc.ic.coldLocality.file) // optimization: don't create LocalityIndex for 1 file + require.NotNil(hc.ic.coldLocality.file) + require.NotNil(hc.ic.warmLocality.file) + } + lastOnFs, _ := h.files.Max() require.False(lastOnFs.frozen) // prepared dataset must have some non-frozen files. or it's bad dataset. 
h.integrateMergedFiles(nil, []*filesItem{lastOnFs}, nil, nil) @@ -51,12 +56,16 @@ func TestGCReadAfterRemoveFile(t *testing.T) { } require.NotNil(lastOnFs.decompressor) - loc := hc.ic.loc // replace of locality index must not affect current HistoryContext, but expect to be closed after last reader - h.localityIndex.integrateFiles(LocalityIndexFiles{}, 0, 0) - require.NotNil(loc.file) + //replace of locality index must not affect current HistoryContext, but expect to be closed after last reader + if h.withLocalityIndex { + h.warmLocalityIdx.integrateFiles(&LocalityIndexFiles{}) + require.NotNil(h.warmLocalityIdx.file) + } hc.Close() require.Nil(lastOnFs.decompressor) - require.NotNil(loc.file) + if h.withLocalityIndex { + require.NotNil(h.warmLocalityIdx.file) + } nonDeletedOnFs, _ := h.files.Max() require.False(nonDeletedOnFs.frozen) @@ -88,11 +97,11 @@ func TestGCReadAfterRemoveFile(t *testing.T) { }) } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) } @@ -170,6 +179,6 @@ func TestDomainGCReadAfterRemoveFile(t *testing.T) { }) } logger := log.New() - _, db, d, txs := filledDomain(t, logger) + db, d, txs := filledDomain(t, logger) test(t, d, db, txs) } diff --git a/state/history.go b/state/history.go index 23fad0251..6dc7cbfbe 100644 --- a/state/history.go +++ b/state/history.go @@ -35,6 +35,8 @@ import ( "golang.org/x/exp/slices" "golang.org/x/sync/errgroup" + "github.com/ledgerwatch/erigon-lib/common/hexutility" + "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/cmp" @@ -63,7 +65,7 @@ type History struct { historyValsTable string // key1+key2+txnNum -> oldValue , stores values BEFORE change 
compressWorkers int - compressVals bool + compression FileCompression integrityFileExtensions []string // not large: @@ -72,29 +74,33 @@ type History struct { // large: // keys: txNum -> key1+key2 // vals: key1+key2+txNum -> value (not DupSort) - largeValues bool // can't use DupSort optimization (aka. prefix-compression) if values size > 4kb + historyLargeValues bool // can't use DupSort optimization (aka. prefix-compression) if values size > 4kb garbageFiles []*filesItem // files that exist on disk, but ignored on opening folder - because they are garbage - wal *historyWAL - logger log.Logger + wal *historyWAL +} + +type histCfg struct { + iiCfg iiCfg + compression FileCompression + historyLargeValues bool + withLocalityIndex bool + withExistenceIndex bool // move to iiCfg } -func NewHistory(dir, tmpdir string, aggregationStep uint64, - filenameBase, indexKeysTable, indexTable, historyValsTable string, - compressVals bool, integrityFileExtensions []string, largeValues bool, logger log.Logger) (*History, error) { +func NewHistory(cfg histCfg, aggregationStep uint64, filenameBase, indexKeysTable, indexTable, historyValsTable string, integrityFileExtensions []string, logger log.Logger) (*History, error) { h := History{ files: btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}), historyValsTable: historyValsTable, - compressVals: compressVals, + compression: cfg.compression, compressWorkers: 1, integrityFileExtensions: integrityFileExtensions, - largeValues: largeValues, - logger: logger, + historyLargeValues: cfg.historyLargeValues, } h.roFiles.Store(&[]ctxItem{}) var err error - h.InvertedIndex, err = NewInvertedIndex(dir, tmpdir, aggregationStep, filenameBase, indexKeysTable, indexTable, true, append(slices.Clone(h.integrityFileExtensions), "v"), logger) + h.InvertedIndex, err = NewInvertedIndex(cfg.iiCfg, aggregationStep, filenameBase, indexKeysTable, indexTable, cfg.withLocalityIndex, cfg.withExistenceIndex, 
append(slices.Clone(h.integrityFileExtensions), "v"), logger) if err != nil { return nil, fmt.Errorf("NewHistory: %s, %w", filenameBase, err) } @@ -106,11 +112,11 @@ func NewHistory(dir, tmpdir string, aggregationStep uint64, // It's ok if some files was open earlier. // If some file already open: noop. // If some file already open but not in provided list: close and remove from `files` field. -func (h *History) OpenList(fNames []string) error { - if err := h.InvertedIndex.OpenList(fNames); err != nil { +func (h *History) OpenList(coldNames, warmNames []string) error { + if err := h.InvertedIndex.OpenList(coldNames, warmNames); err != nil { return err } - return h.openList(fNames) + return h.openList(coldNames) } func (h *History) openList(fNames []string) error { @@ -123,11 +129,11 @@ func (h *History) openList(fNames []string) error { } func (h *History) OpenFolder() error { - files, err := h.fileNamesOnDisk() + coldNames, warmNames, err := h.fileNamesOnDisk() if err != nil { return err } - return h.OpenList(files) + return h.OpenList(coldNames, warmNames) } // scanStateFiles @@ -220,17 +226,17 @@ func (h *History) openFiles() error { return false } - if item.index != nil { - continue - } - idxPath := filepath.Join(h.dir, fmt.Sprintf("%s.%d-%d.vi", h.filenameBase, fromStep, toStep)) - if dir.FileExist(idxPath) { - if item.index, err = recsplit.OpenIndex(idxPath); err != nil { - h.logger.Debug(fmt.Errorf("Hisrory.openFiles: %w, %s", err, idxPath).Error()) - return false + if item.index == nil { + idxPath := filepath.Join(h.dir, fmt.Sprintf("%s.%d-%d.vi", h.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + if item.index, err = recsplit.OpenIndex(idxPath); err != nil { + h.logger.Debug(fmt.Errorf("Hisrory.openFiles: %w, %s", err, idxPath).Error()) + return false + } + totalKeys += item.index.KeyCount() } - totalKeys += item.index.KeyCount() } + } return true }) @@ -278,17 +284,13 @@ func (h *History) Close() { h.reCalcRoFiles() } -func (h *History) 
Files() (res []string) { - h.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.decompressor != nil { - res = append(res, item.decompressor.FileName()) - } +func (hc *HistoryContext) Files() (res []string) { + for _, item := range hc.files { + if item.src.decompressor != nil { + res = append(res, item.src.decompressor.FileName()) } - return true - }) - res = append(res, h.InvertedIndex.Files()...) - return res + } + return append(res, hc.ic.Files()...) } func (h *History) missedIdxFiles() (l []*filesItem) { @@ -304,12 +306,7 @@ func (h *History) missedIdxFiles() (l []*filesItem) { return l } -// BuildMissedIndices - produce .efi/.vi/.kvi from .ef/.v/.kv -func (hc *HistoryContext) BuildOptionalMissedIndices(ctx context.Context) (err error) { - return hc.h.localityIndex.BuildMissedIndices(ctx, hc.ic) -} - -func (h *History) buildVi(ctx context.Context, item *filesItem, p *background.Progress) (err error) { +func (h *History) buildVi(ctx context.Context, item *filesItem, ps *background.ProgressSet) (err error) { search := &filesItem{startTxNum: item.startTxNum, endTxNum: item.endTxNum} iiItem, ok := h.InvertedIndex.files.Get(search) if !ok { @@ -321,15 +318,7 @@ func (h *History) buildVi(ctx context.Context, item *filesItem, p *background.Pr idxPath := filepath.Join(h.dir, fName) //h.logger.Info("[snapshots] build idx", "file", fName) - - p.Name.Store(&fName) - p.Total.Store(uint64(iiItem.decompressor.Count()) * 2) - - count, err := iterateForVi(item, iiItem, p, h.compressVals, func(v []byte) error { return nil }) - if err != nil { - return err - } - return buildVi(ctx, item, iiItem, idxPath, h.tmpdir, count, p, h.compressVals, h.logger) + return buildVi(ctx, item, iiItem, idxPath, h.tmpdir, ps, h.InvertedIndex.compression, h.compression, h.salt, h.logger) } func (h *History) BuildMissedIndices(ctx context.Context, g *errgroup.Group, ps *background.ProgressSet) { @@ -338,81 +327,44 @@ func (h *History) BuildMissedIndices(ctx 
context.Context, g *errgroup.Group, ps for _, item := range missedFiles { item := item g.Go(func() error { - p := &background.Progress{} - ps.Add(p) - defer ps.Delete(p) - return h.buildVi(ctx, item, p) + return h.buildVi(ctx, item, ps) }) } } -func iterateForVi(historyItem, iiItem *filesItem, p *background.Progress, compressVals bool, f func(v []byte) error) (count int, err error) { - var cp CursorHeap - heap.Init(&cp) - g := iiItem.decompressor.MakeGetter() - g.Reset(0) - if g.HasNext() { - g2 := historyItem.decompressor.MakeGetter() - key, _ := g.NextUncompressed() - val, _ := g.NextUncompressed() - heap.Push(&cp, &CursorItem{ - t: FILE_CURSOR, - dg: g, - dg2: g2, - key: key, - val: val, - endTxNum: iiItem.endTxNum, - reverse: false, - }) - } +func buildVi(ctx context.Context, historyItem, iiItem *filesItem, historyIdxPath, tmpdir string, ps *background.ProgressSet, compressIindex, compressHist FileCompression, salt *uint32, logger log.Logger) error { + defer iiItem.decompressor.EnableReadAhead().DisableReadAhead() + defer historyItem.decompressor.EnableReadAhead().DisableReadAhead() - // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. - // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away - // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned - // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. 
Therefore, after the multi-way merge loop - // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind - var valBuf []byte - for cp.Len() > 0 { - lastKey := common.Copy(cp[0].key) - // Advance all the items that have this key (including the top) - //var mergeOnce bool - for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] - keysCount := eliasfano32.Count(ci1.val) - for i := uint64(0); i < keysCount; i++ { - if compressVals { - valBuf, _ = ci1.dg2.Next(valBuf[:0]) - } else { - valBuf, _ = ci1.dg2.NextUncompressed() - } - if err = f(valBuf); err != nil { - return count, err - } - } - count += int(keysCount) - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.NextUncompressed() - ci1.val, _ = ci1.dg.NextUncompressed() - heap.Fix(&cp, 0) - } else { - heap.Remove(&cp, 0) - } + _, fName := filepath.Split(historyIdxPath) + p := ps.AddNew(fName, uint64(iiItem.decompressor.Count()*2)) + defer ps.Delete(p) - p.Processed.Add(1) + var count uint64 + g := NewArchiveGetter(iiItem.decompressor.MakeGetter(), compressIindex) + g.Reset(0) + for g.HasNext() { + select { + case <-ctx.Done(): + return ctx.Err() + default: } + + g.Skip() // key + valBuf, _ := g.Next(nil) + count += eliasfano32.Count(valBuf) + p.Processed.Add(1) } - return count, nil -} -func buildVi(ctx context.Context, historyItem, iiItem *filesItem, historyIdxPath, tmpdir string, count int, p *background.Progress, compressVals bool, logger log.Logger) error { rs, err := recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: count, + KeyCount: int(count), Enums: false, BucketSize: 2000, LeafSize: 8, TmpDir: tmpdir, IndexFile: historyIdxPath, EtlBufLimit: etl.BufferOptimalSize / 2, + Salt: salt, }, logger) if err != nil { return fmt.Errorf("create recsplit: %w", err) @@ -423,25 +375,15 @@ func buildVi(ctx context.Context, historyItem, iiItem *filesItem, historyIdxPath var txKey [8]byte var valOffset uint64 - defer 
iiItem.decompressor.EnableMadvNormal().DisableReadAhead() - defer historyItem.decompressor.EnableMadvNormal().DisableReadAhead() - - g := iiItem.decompressor.MakeGetter() - g2 := historyItem.decompressor.MakeGetter() + g2 := NewArchiveGetter(historyItem.decompressor.MakeGetter(), compressHist) var keyBuf, valBuf []byte for { g.Reset(0) g2.Reset(0) valOffset = 0 for g.HasNext() { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - - keyBuf, _ = g.NextUncompressed() - valBuf, _ = g.NextUncompressed() + keyBuf, _ = g.Next(nil) + valBuf, _ = g.Next(nil) ef, _ := eliasfano32.ReadEliasFano(valBuf) efIt := ef.Iterator() for efIt.HasNext() { @@ -451,14 +393,19 @@ func buildVi(ctx context.Context, historyItem, iiItem *filesItem, historyIdxPath if err = rs.AddKey(historyKey, valOffset); err != nil { return err } - if compressVals { - valOffset, _ = g2.Skip() - } else { - valOffset, _ = g2.SkipUncompressed() - } + //if compressHist { + valOffset, _ = g2.Skip() + //} else { + // valOffset, _ = g2.SkipUncompressed() + //} } p.Processed.Add(1) + select { + case <-ctx.Done(): + return ctx.Err() + default: + } } if err = rs.Build(ctx); err != nil { if rs.Collision() { @@ -500,22 +447,45 @@ func (h *History) FinishWrites() { } func (h *History) Rotate() historyFlusher { - w := h.wal - h.wal = h.newWriter(h.wal.tmpdir, h.wal.buffered, h.wal.discard) - return historyFlusher{h: w, i: h.InvertedIndex.Rotate()} + hf := historyFlusher{} + if h.InvertedIndex.wal != nil { + hf.i = h.InvertedIndex.Rotate() + } + + if h.wal != nil { + w := h.wal + if w.buffered { + if err := w.historyVals.Flush(); err != nil { + panic(err) + } + } + hf.h = w + h.wal = h.newWriter(h.wal.tmpdir, h.wal.buffered, h.wal.discard) + } + return hf } type historyFlusher struct { h *historyWAL i *invertedIndexWAL + d *domainWAL } func (f historyFlusher) Flush(ctx context.Context, tx kv.RwTx) error { - if err := f.i.Flush(ctx, tx); err != nil { - return err + if f.d != nil { + if err := f.d.flush(ctx, 
tx); err != nil { + return err + } } - if err := f.h.flush(ctx, tx); err != nil { - return err + if f.i != nil { + if err := f.i.Flush(ctx, tx); err != nil { + return err + } + } + if f.h != nil { + if err := f.h.flush(ctx, tx); err != nil { + return err + } } return nil } @@ -554,8 +524,8 @@ func (h *History) newWriter(tmpdir string, buffered, discard bool) *historyWAL { discard: discard, autoIncrementBuf: make([]byte, 8), - historyKey: make([]byte, 0, 128), - largeValues: h.largeValues, + historyKey: make([]byte, 128), + largeValues: h.historyLargeValues, } if buffered { w.historyVals = etl.NewCollector(h.historyValsTable, tmpdir, etl.NewSortableBuffer(WALCollectorRAM), h.logger) @@ -580,15 +550,17 @@ func (h *historyWAL) addPrevValue(key1, key2, original []byte) error { return nil } + //defer func() { + // fmt.Printf("addPrevValue: %x tx %x %x lv=%t buffered=%t\n", key1, h.h.InvertedIndex.txNumBytes, original, h.largeValues, h.buffered) + //}() + ii := h.h.InvertedIndex + if h.largeValues { lk := len(key1) + len(key2) + + h.historyKey = append(append(append(h.historyKey[:0], key1...), key2...), h.h.InvertedIndex.txNumBytes[:]...) 
historyKey := h.historyKey[:lk+8] - copy(historyKey, key1) - if len(key2) > 0 { - copy(historyKey[len(key1):], key2) - } - copy(historyKey[lk:], h.h.InvertedIndex.txNumBytes[:]) if !h.buffered { if err := h.h.tx.Put(h.h.historyValsTable, historyKey, original); err != nil { @@ -607,13 +579,14 @@ func (h *historyWAL) addPrevValue(key1, key2, original []byte) error { } return nil } + if len(original) > 2048 { + log.Error("History value is too large while largeValues=false", "h", h.h.historyValsTable, "histo", string(h.historyKey[:len(key1)+len(key2)]), "len", len(original), "max", len(h.historyKey)-8-len(key1)-len(key2)) + panic("History value is too large while largeValues=false") + } lk := len(key1) + len(key2) + h.historyKey = append(append(append(append(h.historyKey[:0], key1...), key2...), h.h.InvertedIndex.txNumBytes[:]...), original...) historyKey := h.historyKey[:lk+8+len(original)] - copy(historyKey, key1) - copy(historyKey[len(key1):], key2) - copy(historyKey[lk:], h.h.InvertedIndex.txNumBytes[:]) - copy(historyKey[lk+8:], original) historyKey1 := historyKey[:lk] historyVal := historyKey[lk:] invIdxVal := historyKey[:lk] @@ -637,7 +610,7 @@ func (h *historyWAL) addPrevValue(key1, key2, original []byte) error { } type HistoryCollation struct { - historyComp *compress.Compressor + historyComp ArchiveWriter indexBitmaps map[string]*roaring64.Bitmap historyPath string historyCount int @@ -653,7 +626,7 @@ func (c HistoryCollation) Close() { } func (h *History) collate(step, txFrom, txTo uint64, roTx kv.Tx) (HistoryCollation, error) { - var historyComp *compress.Compressor + var historyComp ArchiveWriter var err error closeComp := true defer func() { @@ -664,9 +637,12 @@ func (h *History) collate(step, txFrom, txTo uint64, roTx kv.Tx) (HistoryCollati } }() historyPath := filepath.Join(h.dir, fmt.Sprintf("%s.%d-%d.v", h.filenameBase, step, step+1)) - if historyComp, err = compress.NewCompressor(context.Background(), "collate history", historyPath, h.tmpdir, 
compress.MinPatternScore, h.compressWorkers, log.LvlTrace, h.logger); err != nil { + comp, err := compress.NewCompressor(context.Background(), "collate history", historyPath, h.tmpdir, compress.MinPatternScore, h.compressWorkers, log.LvlTrace, h.logger) + if err != nil { return HistoryCollation{}, fmt.Errorf("create %s history compressor: %w", h.filenameBase, err) } + historyComp = NewArchiveWriter(comp, h.compression) + keysCursor, err := roTx.CursorDupSort(h.indexKeysTable) if err != nil { return HistoryCollation{}, fmt.Errorf("create %s history cursor: %w", h.filenameBase, err) @@ -683,9 +659,11 @@ func (h *History) collate(step, txFrom, txTo uint64, roTx kv.Tx) (HistoryCollati } var bitmap *roaring64.Bitmap var ok bool - if bitmap, ok = indexBitmaps[string(v)]; !ok { + + ks := string(v) + if bitmap, ok = indexBitmaps[ks]; !ok { bitmap = bitmapdb.NewBitmap64() - indexBitmaps[string(v)] = bitmap + indexBitmaps[ks] = bitmap } bitmap.Add(txNum) } @@ -698,11 +676,10 @@ func (h *History) collate(step, txFrom, txTo uint64, roTx kv.Tx) (HistoryCollati } slices.Sort(keys) historyCount := 0 - keyBuf := make([]byte, 256) var c kv.Cursor var cd kv.CursorDupSort - if h.largeValues { + if h.historyLargeValues { c, err = roTx.Cursor(h.historyValsTable) if err != nil { return HistoryCollation{}, err @@ -715,37 +692,41 @@ func (h *History) collate(step, txFrom, txTo uint64, roTx kv.Tx) (HistoryCollati } defer cd.Close() } + + keyBuf := make([]byte, 0, 256) for _, key := range keys { bitmap := indexBitmaps[key] it := bitmap.Iterator() - copy(keyBuf, key) - keyBuf = keyBuf[:len(key)+8] + keyBuf = append(append(keyBuf[:0], []byte(key)...), make([]byte, 8)...) 
+ lk := len([]byte(key)) + for it.HasNext() { txNum := it.Next() - binary.BigEndian.PutUint64(keyBuf[len(key):], txNum) + binary.BigEndian.PutUint64(keyBuf[lk:], txNum) //TODO: use cursor range - if h.largeValues { + if h.historyLargeValues { val, err := roTx.GetOne(h.historyValsTable, keyBuf) if err != nil { - return HistoryCollation{}, fmt.Errorf("get %s history val [%x]: %w", h.filenameBase, k, err) + return HistoryCollation{}, fmt.Errorf("getBeforeTxNum %s history val [%x]: %w", h.filenameBase, k, err) } if len(val) == 0 { val = nil } - if err = historyComp.AddUncompressedWord(val); err != nil { + if err = historyComp.AddWord(val); err != nil { return HistoryCollation{}, fmt.Errorf("add %s history val [%x]=>[%x]: %w", h.filenameBase, k, val, err) } } else { - val, err := cd.SeekBothRange(keyBuf[:len(key)], keyBuf[len(key):]) + val, err := cd.SeekBothRange(keyBuf[:lk], keyBuf[lk:]) if err != nil { return HistoryCollation{}, err } if val != nil && binary.BigEndian.Uint64(val) == txNum { + // fmt.Printf("HistCollate [%x]=>[%x]\n", []byte(key), val) val = val[8:] } else { val = nil } - if err = historyComp.AddUncompressedWord(val); err != nil { + if err = historyComp.AddWord(val); err != nil { return HistoryCollation{}, fmt.Errorf("add %s history val [%x]=>[%x]: %w", h.filenameBase, k, val, err) } } @@ -766,6 +747,10 @@ type HistoryFiles struct { historyIdx *recsplit.Index efHistoryDecomp *compress.Decompressor efHistoryIdx *recsplit.Index + efExistence *bloomFilter + + warmLocality *LocalityIndexFiles + coldLocality *LocalityIndexFiles } func (sf HistoryFiles) Close() { @@ -783,7 +768,7 @@ func (sf HistoryFiles) Close() { } } func (h *History) reCalcRoFiles() { - roFiles := ctxFiles(h.files) + roFiles := ctxFiles(h.files, true, false) h.roFiles.Store(&roFiles) } @@ -794,10 +779,13 @@ func (h *History) buildFiles(ctx context.Context, step uint64, collation History if h.noFsync { historyComp.DisableFsync() } - var historyDecomp, efHistoryDecomp 
*compress.Decompressor - var historyIdx, efHistoryIdx *recsplit.Index - var efHistoryComp *compress.Compressor - var rs *recsplit.RecSplit + var ( + historyDecomp, efHistoryDecomp *compress.Decompressor + historyIdx, efHistoryIdx *recsplit.Index + efExistence *bloomFilter + efHistoryComp *compress.Compressor + rs *recsplit.RecSplit + ) closeComp := true defer func() { if closeComp { @@ -895,20 +883,30 @@ func (h *History) buildFiles(ctx context.Context, step uint64, collation History if efHistoryDecomp, err = compress.NewDecompressor(efHistoryPath); err != nil { return HistoryFiles{}, fmt.Errorf("open %s ef history decompressor: %w", h.filenameBase, err) } - efHistoryIdxFileName := fmt.Sprintf("%s.%d-%d.efi", h.filenameBase, step, step+1) - efHistoryIdxPath := filepath.Join(h.dir, efHistoryIdxFileName) - p := ps.AddNew(efHistoryIdxFileName, uint64(len(keys)*2)) - defer ps.Delete(p) - if efHistoryIdx, err = buildIndexThenOpen(ctx, efHistoryDecomp, efHistoryIdxPath, h.tmpdir, len(keys), false /* values */, p, h.logger, h.noFsync); err != nil { - return HistoryFiles{}, fmt.Errorf("build %s ef history idx: %w", h.filenameBase, err) + { + efHistoryIdxFileName := fmt.Sprintf("%s.%d-%d.efi", h.filenameBase, step, step+1) + efHistoryIdxPath := filepath.Join(h.dir, efHistoryIdxFileName) + if efHistoryIdx, err = buildIndexThenOpen(ctx, efHistoryDecomp, h.compression, efHistoryIdxPath, h.tmpdir, false, h.salt, ps, h.logger, h.noFsync); err != nil { + return HistoryFiles{}, fmt.Errorf("build %s ef history idx: %w", h.filenameBase, err) + } + } + if h.InvertedIndex.withExistenceIndex { + existenceIdxFileName := fmt.Sprintf("%s.%d-%d.efei", h.filenameBase, step, step+1) + existenceIdxPath := filepath.Join(h.dir, existenceIdxFileName) + if efExistence, err = buildIndexFilterThenOpen(ctx, efHistoryDecomp, h.compression, existenceIdxPath, h.tmpdir, h.salt, ps, h.logger, h.noFsync); err != nil { + return HistoryFiles{}, fmt.Errorf("build %s ef history idx: %w", h.filenameBase, err) 
+ } + } if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: collation.historyCount, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: h.tmpdir, - IndexFile: historyIdxPath, + KeyCount: collation.historyCount, + Enums: false, + BucketSize: 2000, + LeafSize: 8, + TmpDir: h.tmpdir, + IndexFile: historyIdxPath, + EtlBufLimit: etl.BufferOptimalSize / 2, + Salt: h.salt, }, h.logger); err != nil { return HistoryFiles{}, fmt.Errorf("create recsplit: %w", err) } @@ -919,17 +917,18 @@ func (h *History) buildFiles(ctx context.Context, step uint64, collation History var historyKey []byte var txKey [8]byte var valOffset uint64 - g := historyDecomp.MakeGetter() + g := NewArchiveGetter(historyDecomp.MakeGetter(), h.compression) for { g.Reset(0) valOffset = 0 for _, key := range keys { bitmap := collation.indexBitmaps[key] it := bitmap.Iterator() + kb := []byte(key) for it.HasNext() { txNum := it.Next() binary.BigEndian.PutUint64(txKey[:], txNum) - historyKey = append(append(historyKey[:0], txKey[:]...), key...) + historyKey = append(append(historyKey[:0], txKey[:]...), kb...) 
if err = rs.AddKey(historyKey, valOffset); err != nil { return HistoryFiles{}, fmt.Errorf("add %s history idx [%x]: %w", h.filenameBase, historyKey, err) } @@ -949,6 +948,12 @@ func (h *History) buildFiles(ctx context.Context, step uint64, collation History } rs.Close() rs = nil + + warmLocality, err := h.buildWarmLocality(ctx, efHistoryDecomp, step, ps) + if err != nil { + return HistoryFiles{}, err + } + if historyIdx, err = recsplit.OpenIndex(historyIdxPath); err != nil { return HistoryFiles{}, fmt.Errorf("open idx: %w", err) } @@ -958,13 +963,18 @@ func (h *History) buildFiles(ctx context.Context, step uint64, collation History historyIdx: historyIdx, efHistoryDecomp: efHistoryDecomp, efHistoryIdx: efHistoryIdx, + efExistence: efExistence, + warmLocality: warmLocality, }, nil } func (h *History) integrateFiles(sf HistoryFiles, txNumFrom, txNumTo uint64) { h.InvertedIndex.integrateFiles(InvertedFiles{ - decomp: sf.efHistoryDecomp, - index: sf.efHistoryIdx, + decomp: sf.efHistoryDecomp, + index: sf.efHistoryIdx, + existence: sf.efExistence, + warmLocality: sf.warmLocality, + coldLocality: sf.coldLocality, }, txNumFrom, txNumTo) fi := newFilesItem(txNumFrom, txNumTo, h.aggregationStep) @@ -1001,9 +1011,9 @@ func (h *History) warmup(ctx context.Context, txFrom, limit uint64, tx kv.Tx) er txTo = txFrom + limit } keyBuf := make([]byte, 256) - for ; err == nil && k != nil; k, v, err = historyKeysCursor.Next() { + for ; k != nil; k, v, err = historyKeysCursor.Next() { if err != nil { - return err + return fmt.Errorf("iterate over %s history keys: %w", h.filenameBase, err) } txNum := binary.BigEndian.Uint64(k) if txNum >= txTo { @@ -1019,15 +1029,11 @@ func (h *History) warmup(ctx context.Context, txFrom, limit uint64, tx kv.Tx) er default: } } - if err != nil { - return fmt.Errorf("iterate over %s history keys: %w", h.filenameBase, err) - } - return nil } func (h *History) isEmpty(tx kv.Tx) (bool, error) { - if h.largeValues { + if h.historyLargeValues { k, err := 
kv.FirstKey(tx, h.historyValsTable) if err != nil { return false, err @@ -1049,72 +1055,116 @@ func (h *History) isEmpty(tx kv.Tx) (bool, error) { return k == nil && k2 == nil, nil } -func (h *History) prune(ctx context.Context, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { - historyKeysCursorForDeletes, err := h.tx.RwCursorDupSort(h.indexKeysTable) - if err != nil { - return fmt.Errorf("create %s history cursor: %w", h.filenameBase, err) - } - defer historyKeysCursorForDeletes.Close() - historyKeysCursor, err := h.tx.RwCursorDupSort(h.indexKeysTable) - if err != nil { - return fmt.Errorf("create %s history cursor: %w", h.filenameBase, err) - } - defer historyKeysCursor.Close() - var txKey [8]byte - binary.BigEndian.PutUint64(txKey[:], txFrom) - var k, v []byte - var valsC kv.RwCursor - var valsCDup kv.RwCursorDupSort - if h.largeValues { - valsC, err = h.tx.RwCursor(h.historyValsTable) +type HistoryRecord struct { + TxNum uint64 + Value []byte +} + +// returns up to 2 records: one has txnum <= beforeTxNum, another has txnum > beforeTxNum, if any +func (h *History) unwindKey(key []byte, beforeTxNum uint64, tx kv.RwTx) ([]HistoryRecord, error) { + res := make([]HistoryRecord, 0, 2) + + if h.historyLargeValues { + c, err := tx.RwCursor(h.historyValsTable) if err != nil { - return err + return nil, err } - defer valsC.Close() - } else { - valsCDup, err = h.tx.RwCursorDupSort(h.historyValsTable) + defer c.Close() + + seek := make([]byte, len(key)+8) + copy(seek, key) + binary.BigEndian.PutUint64(seek[len(key):], beforeTxNum) + + kAndTxNum, val, err := c.Seek(seek) if err != nil { - return err - } - defer valsCDup.Close() - } - for k, v, err = historyKeysCursor.Seek(txKey[:]); err == nil && k != nil; k, v, err = historyKeysCursor.Next() { - txNum := binary.BigEndian.Uint64(k) - if txNum >= txTo { - break + return nil, err } - if limit == 0 { - return nil + if len(kAndTxNum) == 0 || !bytes.Equal(kAndTxNum[:len(kAndTxNum)-8], key) { + // need to go back to 
the previous key + kAndTxNum, val, err = c.Prev() + if err != nil { + return nil, err + } + if len(kAndTxNum) == 0 || !bytes.Equal(kAndTxNum[:len(kAndTxNum)-8], key) { + return nil, nil + } } - limit-- - if h.largeValues { - seek := append(common.Copy(v), k...) - if err := valsC.Delete(seek); err != nil { - return err - } - } else { - vv, err := valsCDup.SeekBothRange(v, k) + rec := HistoryRecord{binary.BigEndian.Uint64(kAndTxNum[len(kAndTxNum)-8:]), common.Copy(val)} + switch { + case rec.TxNum < beforeTxNum: + nk, nv, err := c.Next() if err != nil { - return err + return nil, err } - if binary.BigEndian.Uint64(vv) != txNum { - continue + + res = append(res, rec) + if nk != nil && bytes.Equal(nk[:len(nk)-8], key) { + res = append(res, HistoryRecord{binary.BigEndian.Uint64(nk[len(nk)-8:]), common.Copy(nv)}) } - if err = valsCDup.DeleteCurrent(); err != nil { - return err + case rec.TxNum >= beforeTxNum: + pk, pv, err := c.Prev() + if err != nil { + return nil, err } + + if pk != nil && bytes.Equal(pk[:len(pk)-8], key) { + res = append(res, HistoryRecord{binary.BigEndian.Uint64(pk[len(pk)-8:]), common.Copy(pv)}) + } + res = append(res, rec) } + return res, nil + } - // This DeleteCurrent needs to the last in the loop iteration, because it invalidates k and v - if _, _, err = historyKeysCursorForDeletes.SeekBothExact(k, v); err != nil { - return err + c, err := tx.RwCursorDupSort(h.historyValsTable) + if err != nil { + return nil, err + } + defer c.Close() + + var val []byte + var txNum uint64 + aux := hexutility.EncodeTs(beforeTxNum) + val, err = c.SeekBothRange(key, aux) + if err != nil { + return nil, err + } + if val == nil { + return nil, nil + } + txNum = binary.BigEndian.Uint64(val[:8]) + val = val[8:] + + switch { + case txNum <= beforeTxNum: + nk, nv, err := c.NextDup() + if err != nil { + return nil, err } - if err = historyKeysCursorForDeletes.DeleteCurrent(); err != nil { - return err + + res = append(res, HistoryRecord{beforeTxNum, val}) + if nk != nil { 
+ res = append(res, HistoryRecord{binary.BigEndian.Uint64(nv[:8]), nv[8:]}) + if err := c.DeleteCurrent(); err != nil { + return nil, err + } + } + case txNum > beforeTxNum: + pk, pv, err := c.PrevDup() + if err != nil { + return nil, err } + + if pk != nil { + res = append(res, HistoryRecord{binary.BigEndian.Uint64(pv[:8]), pv[8:]}) + if err := c.DeleteCurrent(); err != nil { + return nil, err + } + // this case will be removed by pruning. Or need to implement cleaning through txTo + } + res = append(res, HistoryRecord{beforeTxNum, val}) } - return nil + return res, nil } type HistoryContext struct { @@ -1122,37 +1172,41 @@ type HistoryContext struct { ic *InvertedIndexContext files []ctxItem // have no garbage (canDelete=true, overlaps, etc...) - getters []*compress.Getter + getters []ArchiveGetter readers []*recsplit.IndexReader trace bool + + valsC kv.Cursor + valsCDup kv.CursorDupSort + + _bufTs []byte } func (h *History) MakeContext() *HistoryContext { + files := *h.roFiles.Load() + for i := 0; i < len(files); i++ { + if !files[i].src.frozen { + files[i].src.refcount.Add(1) + } + } - var hc = HistoryContext{ + return &HistoryContext{ h: h, ic: h.InvertedIndex.MakeContext(), - files: *h.roFiles.Load(), - + files: files, trace: false, } - for _, item := range hc.files { - if !item.src.frozen { - item.src.refcount.Add(1) - } - } - - return &hc } -func (hc *HistoryContext) statelessGetter(i int) *compress.Getter { +func (hc *HistoryContext) statelessGetter(i int) ArchiveGetter { if hc.getters == nil { - hc.getters = make([]*compress.Getter, len(hc.files)) + hc.getters = make([]ArchiveGetter, len(hc.files)) } r := hc.getters[i] if r == nil { - r = hc.files[i].src.decompressor.MakeGetter() + g := hc.files[i].src.decompressor.MakeGetter() + r = NewArchiveGetter(g, hc.h.compression) hc.getters[i] = r } return r @@ -1169,38 +1223,164 @@ func (hc *HistoryContext) statelessIdxReader(i int) *recsplit.IndexReader { return r } +func (hc *HistoryContext) Prune(ctx 
context.Context, rwTx kv.RwTx, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { + defer func(t time.Time) { mxPruneTookHistory.UpdateDuration(t) }(time.Now()) + + historyKeysCursorForDeletes, err := rwTx.RwCursorDupSort(hc.h.indexKeysTable) + if err != nil { + return fmt.Errorf("create %s history cursor: %w", hc.h.filenameBase, err) + } + defer historyKeysCursorForDeletes.Close() + historyKeysCursor, err := rwTx.RwCursorDupSort(hc.h.indexKeysTable) + if err != nil { + return fmt.Errorf("create %s history cursor: %w", hc.h.filenameBase, err) + } + defer historyKeysCursor.Close() + + var ( + txKey [8]byte + k, v []byte + valsC kv.RwCursor + valsCDup kv.RwCursorDupSort + ) + + binary.BigEndian.PutUint64(txKey[:], txFrom) + if hc.h.historyLargeValues { + valsC, err = rwTx.RwCursor(hc.h.historyValsTable) + if err != nil { + return err + } + defer valsC.Close() + } else { + valsCDup, err = rwTx.RwCursorDupSort(hc.h.historyValsTable) + if err != nil { + return err + } + defer valsCDup.Close() + } + + seek := make([]byte, 0, 256) + var pruneSize uint64 + for k, v, err = historyKeysCursor.Seek(txKey[:]); err == nil && k != nil; k, v, err = historyKeysCursor.Next() { + txNum := binary.BigEndian.Uint64(k) + if txNum >= txTo { + break + } + if limit == 0 { + return nil + } + limit-- + + if hc.h.historyLargeValues { + seek = append(append(seek[:0], v...), k...) 
+ if err := valsC.Delete(seek); err != nil { + return err + } + } else { + vv, err := valsCDup.SeekBothRange(v, k) + if err != nil { + return err + } + if binary.BigEndian.Uint64(vv) != txNum { + continue + } + if err = valsCDup.DeleteCurrent(); err != nil { + return err + } + } + // This DeleteCurrent needs to the last in the loop iteration, because it invalidates k and v + if _, _, err = historyKeysCursorForDeletes.SeekBothExact(k, v); err != nil { + return err + } + if err = historyKeysCursorForDeletes.DeleteCurrent(); err != nil { + return err + } + + pruneSize++ + mxPruneSizeHistory.Inc() + select { + case <-ctx.Done(): + return ctx.Err() + case <-logEvery.C: + hc.h.logger.Info("[snapshots] prune history", "name", hc.h.filenameBase, "from", txFrom, "to", txTo, + "pruned records", pruneSize) + //"steps", fmt.Sprintf("%.2f-%.2f", float64(txFrom)/float64(d.aggregationStep), float64(txTo)/float64(d.aggregationStep))) + default: + } + } + return nil +} + func (hc *HistoryContext) Close() { - hc.ic.Close() - for _, item := range hc.files { - if item.src.frozen { + if hc.files == nil { // invariant: it's safe to call Close multiple times + return + } + files := hc.files + hc.files = nil + for i := 0; i < len(files); i++ { + if files[i].src.frozen { continue } - refCnt := item.src.refcount.Add(-1) + refCnt := files[i].src.refcount.Add(-1) //if hc.h.filenameBase == "accounts" && item.src.canDelete.Load() { // log.Warn("[history] HistoryContext.Close: check file to remove", "refCnt", refCnt, "name", item.src.decompressor.FileName()) //} //GC: last reader responsible to remove useles files: close it and delete - if refCnt == 0 && item.src.canDelete.Load() { - item.src.closeFilesAndRemove() + if refCnt == 0 && files[i].src.canDelete.Load() { + files[i].src.closeFilesAndRemove() } } for _, r := range hc.readers { r.Close() } + hc.ic.Close() } -func (hc *HistoryContext) getFile(from, to uint64) (it ctxItem, ok bool) { - for _, item := range hc.files { - if item.startTxNum 
== from && item.endTxNum == to { - return item, true +func (hc *HistoryContext) getFileDeprecated(from, to uint64) (it ctxItem, ok bool) { + for i := 0; i < len(hc.files); i++ { + if hc.files[i].startTxNum == from && hc.files[i].endTxNum == to { + return hc.files[i], true + } + } + return it, false +} +func (hc *HistoryContext) getFile(txNum uint64) (it ctxItem, ok bool) { + for i := 0; i < len(hc.files); i++ { + if hc.files[i].startTxNum <= txNum && hc.files[i].endTxNum > txNum { + return hc.files[i], true } } return it, false } func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, error) { - exactStep1, exactStep2, lastIndexedTxNum, foundExactShard1, foundExactShard2 := hc.h.localityIndex.lookupIdxFiles(hc.ic.loc, key, txNum) + if !hc.h.withExistenceIndex { + return hc.getNoStateByLocalityIndex(key, txNum) + } + // Files list of II and History is different + // it means II can't return index of file, but can return TxNum which History will use to find own file + ok, histTxNum := hc.ic.Seek(key, txNum) + if !ok { + return nil, false, nil + } + historyItem, ok := hc.getFile(histTxNum) + if !ok { + return nil, false, fmt.Errorf("hist file not found: key=%x, %s.%d-%d", key, hc.h.filenameBase, histTxNum/hc.h.aggregationStep, histTxNum/hc.h.aggregationStep) + } + reader := hc.statelessIdxReader(historyItem.i) + if reader.Empty() { + return nil, false, nil + } + offset := reader.Lookup2(hc.encodeTs(histTxNum), key) + g := hc.statelessGetter(historyItem.i) + g.Reset(offset) + + v, _ := g.Next(nil) + return v, true, nil +} +func (hc *HistoryContext) getNoStateByLocalityIndex(key []byte, txNum uint64) ([]byte, bool, error) { + exactStep1, exactStep2, lastIndexedTxNum, foundExactShard1, foundExactShard2 := hc.ic.coldLocality.lookupIdxFiles(key, txNum) //fmt.Printf("GetNoState [%x] %d\n", key, txNum) var foundTxNum uint64 @@ -1213,9 +1393,10 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er return true } offset := 
reader.Lookup(key) + g := hc.ic.statelessGetter(item.i) g.Reset(offset) - k, _ := g.NextUncompressed() + k, _ := g.Next(nil) if !bytes.Equal(k, key) { //if bytes.Equal(key, hex.MustDecodeString("009ba32869045058a3f05d6f3dd2abb967e338f6")) { @@ -1223,14 +1404,8 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er //} return true } - eliasVal, _ := g.NextUncompressed() - ef, _ := eliasfano32.ReadEliasFano(eliasVal) - n, ok := ef.Search(txNum) - if hc.trace { - n2, _ := ef.Search(n + 1) - n3, _ := ef.Search(n - 1) - fmt.Printf("hist: files: %s %d<-%d->%d->%d, %x\n", hc.h.filenameBase, n3, txNum, n, n2, key) - } + eliasVal, _ := g.Next(nil) + n, ok := eliasfano32.Seek(eliasVal, txNum) if ok { foundTxNum = n foundEndTxNum = item.endTxNum @@ -1244,7 +1419,7 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er // -- LocaliyIndex opimization -- // check up to 2 exact files if foundExactShard1 { - from, to := exactStep1*hc.h.aggregationStep, (exactStep1+StepsInBiggestFile)*hc.h.aggregationStep + from, to := exactStep1*hc.h.aggregationStep, (exactStep1+StepsInColdFile)*hc.h.aggregationStep item, ok := hc.ic.getFile(from, to) if ok { findInFile(item) @@ -1254,18 +1429,18 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er // findInFile(item) // } //} - //exactShard1, ok := hc.invIndexFiles.Get(ctxItem{startTxNum: exactStep1 * hc.h.aggregationStep, endTxNum: (exactStep1 + StepsInBiggestFile) * hc.h.aggregationStep}) + //exactShard1, ok := hc.invIndexFiles.Get(ctxItem{startTxNum: exactStep1 * hc.h.aggregationStep, endTxNum: (exactStep1 + StepsInColdFile) * hc.h.aggregationStep}) //if ok { // findInFile(exactShard1) //} } if !found && foundExactShard2 { - from, to := exactStep2*hc.h.aggregationStep, (exactStep2+StepsInBiggestFile)*hc.h.aggregationStep + from, to := exactStep2*hc.h.aggregationStep, (exactStep2+StepsInColdFile)*hc.h.aggregationStep item, ok := hc.ic.getFile(from, to) if 
ok { findInFile(item) } - //exactShard2, ok := hc.invIndexFiles.Get(ctxItem{startTxNum: exactStep2 * hc.h.aggregationStep, endTxNum: (exactStep2 + StepsInBiggestFile) * hc.h.aggregationStep}) + //exactShard2, ok := hc.invIndexFiles.Get(ctxItem{startTxNum: exactStep2 * hc.h.aggregationStep, endTxNum: (exactStep2 + StepsInColdFile) * hc.h.aggregationStep}) //if ok { // findInFile(exactShard2) //} @@ -1288,7 +1463,7 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er } if found { - historyItem, ok := hc.getFile(foundStartTxNum, foundEndTxNum) + historyItem, ok := hc.getFileDeprecated(foundStartTxNum, foundEndTxNum) if !ok { return nil, false, fmt.Errorf("hist file not found: key=%x, %s.%d-%d", key, hc.h.filenameBase, foundStartTxNum/hc.h.aggregationStep, foundEndTxNum/hc.h.aggregationStep) } @@ -1299,11 +1474,8 @@ func (hc *HistoryContext) GetNoState(key []byte, txNum uint64) ([]byte, bool, er //fmt.Printf("offset = %d, txKey=[%x], key=[%x]\n", offset, txKey[:], key) g := hc.statelessGetter(historyItem.i) g.Reset(offset) - if hc.h.compressVals { - v, _ := g.Next(nil) - return v, true, nil - } - v, _ := g.NextUncompressed() + + v, _ := g.Next(nil) return v, true, nil } return nil, false, nil @@ -1358,6 +1530,14 @@ func (hs *HistoryStep) MaxTxNum(key []byte) (bool, uint64) { return true, eliasfano32.Max(eliasVal) } +func (hc *HistoryContext) encodeTs(txNum uint64) []byte { + if hc._bufTs == nil { + hc._bufTs = make([]byte, 8) + } + binary.BigEndian.PutUint64(hc._bufTs, txNum) + return hc._bufTs +} + // GetNoStateWithRecent searches history for a value of specified key before txNum // second return value is true if the value is found in the history (even if it is nil) func (hc *HistoryContext) GetNoStateWithRecent(key []byte, txNum uint64, roTx kv.Tx) ([]byte, bool, error) { @@ -1369,23 +1549,40 @@ func (hc *HistoryContext) GetNoStateWithRecent(key []byte, txNum uint64, roTx kv return v, true, nil } - // Value not found in history files, 
look in the recent history - if roTx == nil { - return nil, false, fmt.Errorf("roTx is nil") - } return hc.getNoStateFromDB(key, txNum, roTx) } +func (hc *HistoryContext) valsCursor(tx kv.Tx) (c kv.Cursor, err error) { + if hc.valsC != nil { + return hc.valsC, nil + } + hc.valsC, err = tx.Cursor(hc.h.historyValsTable) + if err != nil { + return nil, err + } + return hc.valsC, nil +} +func (hc *HistoryContext) valsCursorDup(tx kv.Tx) (c kv.CursorDupSort, err error) { + if hc.valsCDup != nil { + return hc.valsCDup, nil + } + hc.valsCDup, err = tx.CursorDupSort(hc.h.historyValsTable) + if err != nil { + return nil, err + } + return hc.valsCDup, nil +} + func (hc *HistoryContext) getNoStateFromDB(key []byte, txNum uint64, tx kv.Tx) ([]byte, bool, error) { - if hc.h.largeValues { - c, err := tx.Cursor(hc.h.historyValsTable) + if hc.h.historyLargeValues { + c, err := hc.valsCursor(tx) if err != nil { return nil, false, err } - defer c.Close() seek := make([]byte, len(key)+8) copy(seek, key) binary.BigEndian.PutUint64(seek[len(key):], txNum) + kAndTxNum, val, err := c.Seek(seek) if err != nil { return nil, false, err @@ -1393,55 +1590,49 @@ func (hc *HistoryContext) getNoStateFromDB(key []byte, txNum uint64, tx kv.Tx) ( if kAndTxNum == nil || !bytes.Equal(kAndTxNum[:len(kAndTxNum)-8], key) { return nil, false, nil } - // val == []byte{},m eans key was created in this txNum and doesn't exists before. + // val == []byte{}, means key was created in this txNum and doesn't exist before. 
return val, true, nil } - c, err := tx.CursorDupSort(hc.h.historyValsTable) + c, err := hc.valsCursorDup(tx) if err != nil { return nil, false, err } - defer c.Close() - seek := make([]byte, len(key)+8) - copy(seek, key) - binary.BigEndian.PutUint64(seek[len(key):], txNum) - val, err := c.SeekBothRange(key, seek[len(key):]) + val, err := c.SeekBothRange(key, hc.encodeTs(txNum)) if err != nil { return nil, false, err } if val == nil { return nil, false, nil } - // `val == []byte{}` means key was created in this txNum and doesn't exists before. + // `val == []byte{}` means key was created in this txNum and doesn't exist before. return val[8:], true, nil } - -func (hc *HistoryContext) WalkAsOf(startTxNum uint64, from, to []byte, roTx kv.Tx, limit int) iter.KV { +func (hc *HistoryContext) WalkAsOf(startTxNum uint64, from, to []byte, roTx kv.Tx, limit int) (iter.KV, error) { hi := &StateAsOfIterF{ from: from, to: to, limit: limit, - hc: hc, - compressVals: hc.h.compressVals, - startTxNum: startTxNum, + hc: hc, + startTxNum: startTxNum, } for _, item := range hc.ic.files { if item.endTxNum <= startTxNum { continue } // TODO: seek(from) - g := item.src.decompressor.MakeGetter() + g := NewArchiveGetter(item.src.decompressor.MakeGetter(), hc.h.compression) g.Reset(0) if g.HasNext() { - key, offset := g.NextUncompressed() + key, offset := g.Next(nil) heap.Push(&hi.h, &ReconItem{g: g, key: key, startTxNum: item.startTxNum, endTxNum: item.endTxNum, txNum: item.endTxNum, startOffset: offset, lastOffset: offset}) } } binary.BigEndian.PutUint64(hi.startTxKey[:], startTxNum) if err := hi.advanceInFiles(); err != nil { - panic(err) + return nil, err } dbit := &StateAsOfIterDB{ - largeValues: hc.h.largeValues, + largeValues: hc.h.historyLargeValues, roTx: roTx, valsTable: hc.h.historyValsTable, from: from, to: to, limit: limit, @@ -1452,7 +1643,7 @@ func (hc *HistoryContext) WalkAsOf(startTxNum uint64, from, to []byte, roTx kv.T if err := dbit.advance(); err != nil { panic(err) } - 
return iter.UnionKV(hi, dbit, limit) + return iter.UnionKV(hi, dbit, limit), nil } // StateAsOfIter - returns state range at given time in history @@ -1464,11 +1655,10 @@ type StateAsOfIterF struct { nextVal []byte nextKey []byte - h ReconHeap - startTxNum uint64 - startTxKey [8]byte - txnKey [8]byte - compressVals bool + h ReconHeap + startTxNum uint64 + startTxKey [8]byte + txnKey [8]byte k, v, kBackup, vBackup []byte } @@ -1481,17 +1671,17 @@ func (hi *StateAsOfIterF) advanceInFiles() error { top := heap.Pop(&hi.h).(*ReconItem) key := top.key var idxVal []byte - if hi.compressVals { - idxVal, _ = top.g.Next(nil) - } else { - idxVal, _ = top.g.NextUncompressed() - } + //if hi.compressVals { + idxVal, _ = top.g.Next(nil) + //} else { + // idxVal, _ = top.g.NextUncompressed() + //} if top.g.HasNext() { - if hi.compressVals { - top.key, _ = top.g.Next(nil) - } else { - top.key, _ = top.g.NextUncompressed() - } + //if hi.compressVals { + top.key, _ = top.g.Next(nil) + //} else { + // top.key, _ = top.g.NextUncompressed() + //} if hi.to == nil || bytes.Compare(top.key, hi.to) < 0 { heap.Push(&hi.h, top) } @@ -1504,27 +1694,23 @@ func (hi *StateAsOfIterF) advanceInFiles() error { if bytes.Equal(key, hi.nextKey) { continue } - ef, _ := eliasfano32.ReadEliasFano(idxVal) - n, ok := ef.Search(hi.startTxNum) + n, ok := eliasfano32.Seek(idxVal, hi.startTxNum) if !ok { continue } hi.nextKey = key binary.BigEndian.PutUint64(hi.txnKey[:], n) - historyItem, ok := hi.hc.getFile(top.startTxNum, top.endTxNum) + historyItem, ok := hi.hc.getFileDeprecated(top.startTxNum, top.endTxNum) if !ok { return fmt.Errorf("no %s file found for [%x]", hi.hc.h.filenameBase, hi.nextKey) } reader := hi.hc.statelessIdxReader(historyItem.i) offset := reader.Lookup2(hi.txnKey[:], hi.nextKey) + g := hi.hc.statelessGetter(historyItem.i) g.Reset(offset) - if hi.compressVals { - hi.nextVal, _ = g.Next(nil) - } else { - hi.nextVal, _ = g.NextUncompressed() - } + hi.nextVal, _ = g.Next(nil) return nil } 
hi.nextKey = nil @@ -1701,11 +1887,10 @@ func (hc *HistoryContext) iterateChangedFrozen(fromTxNum, toTxNum int, asc order } hi := &HistoryChangesIterFiles{ - hc: hc, - compressVals: hc.h.compressVals, - startTxNum: cmp.Max(0, uint64(fromTxNum)), - endTxNum: toTxNum, - limit: limit, + hc: hc, + startTxNum: cmp.Max(0, uint64(fromTxNum)), + endTxNum: toTxNum, + limit: limit, } if fromTxNum >= 0 { binary.BigEndian.PutUint64(hi.startTxKey[:], uint64(fromTxNum)) @@ -1717,10 +1902,10 @@ func (hc *HistoryContext) iterateChangedFrozen(fromTxNum, toTxNum int, asc order if toTxNum >= 0 && item.startTxNum >= uint64(toTxNum) { break } - g := item.src.decompressor.MakeGetter() + g := NewArchiveGetter(item.src.decompressor.MakeGetter(), hc.h.compression) g.Reset(0) if g.HasNext() { - key, offset := g.NextUncompressed() + key, offset := g.Next(nil) heap.Push(&hi.h, &ReconItem{g: g, key: key, startTxNum: item.startTxNum, endTxNum: item.endTxNum, txNum: item.endTxNum, startOffset: offset, lastOffset: offset}) } } @@ -1741,7 +1926,7 @@ func (hc *HistoryContext) iterateChangedRecent(fromTxNum, toTxNum int, asc order dbi := &HistoryChangesIterDB{ endTxNum: toTxNum, roTx: roTx, - largeValues: hc.h.largeValues, + largeValues: hc.h.historyLargeValues, valsTable: hc.h.historyValsTable, limit: limit, } @@ -1771,15 +1956,14 @@ func (hc *HistoryContext) HistoryRange(fromTxNum, toTxNum int, asc order.By, lim } type HistoryChangesIterFiles struct { - hc *HistoryContext - nextVal []byte - nextKey []byte - h ReconHeap - startTxNum uint64 - endTxNum int - startTxKey [8]byte - txnKey [8]byte - compressVals bool + hc *HistoryContext + nextVal []byte + nextKey []byte + h ReconHeap + startTxNum uint64 + endTxNum int + startTxKey [8]byte + txnKey [8]byte k, v, kBackup, vBackup []byte err error @@ -1794,25 +1978,24 @@ func (hi *HistoryChangesIterFiles) advance() error { top := heap.Pop(&hi.h).(*ReconItem) key := top.key var idxVal []byte - if hi.compressVals { - idxVal, _ = top.g.Next(nil) - } else { - 
idxVal, _ = top.g.NextUncompressed() - } + //if hi.compressVals { + idxVal, _ = top.g.Next(nil) + //} else { + // idxVal, _ = top.g.NextUncompressed() + //} if top.g.HasNext() { - if hi.compressVals { - top.key, _ = top.g.Next(nil) - } else { - top.key, _ = top.g.NextUncompressed() - } + //if hi.compressVals { + top.key, _ = top.g.Next(nil) + //} else { + // top.key, _ = top.g.NextUncompressed() + //} heap.Push(&hi.h, top) } if bytes.Equal(key, hi.nextKey) { continue } - ef, _ := eliasfano32.ReadEliasFano(idxVal) - n, ok := ef.Search(hi.startTxNum) //TODO: if startTxNum==0, can do ef.Get(0) + n, ok := eliasfano32.Seek(idxVal, hi.startTxNum) if !ok { continue } @@ -1822,7 +2005,7 @@ func (hi *HistoryChangesIterFiles) advance() error { hi.nextKey = key binary.BigEndian.PutUint64(hi.txnKey[:], n) - historyItem, ok := hi.hc.getFile(top.startTxNum, top.endTxNum) + historyItem, ok := hi.hc.getFileDeprecated(top.startTxNum, top.endTxNum) if !ok { return fmt.Errorf("HistoryChangesIterFiles: no %s file found for [%x]", hi.hc.h.filenameBase, hi.nextKey) } @@ -1830,11 +2013,7 @@ func (hi *HistoryChangesIterFiles) advance() error { offset := reader.Lookup2(hi.txnKey[:], hi.nextKey) g := hi.hc.statelessGetter(historyItem.i) g.Reset(offset) - if hi.compressVals { - hi.nextVal, _ = g.Next(nil) - } else { - hi.nextVal, _ = g.NextUncompressed() - } + hi.nextVal, _ = g.Next(nil) return nil } hi.nextKey = nil @@ -2031,46 +2210,6 @@ func (h *History) DisableReadAhead() { }) } -func (h *History) EnableReadAhead() *History { - h.InvertedIndex.EnableReadAhead() - h.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableReadAhead() - if item.index != nil { - item.index.EnableReadAhead() - } - } - return true - }) - return h -} -func (h *History) EnableMadvWillNeed() *History { - h.InvertedIndex.EnableMadvWillNeed() - h.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableWillNeed() - if item.index 
!= nil { - item.index.EnableWillNeed() - } - } - return true - }) - return h -} -func (h *History) EnableMadvNormalReadAhead() *History { - h.InvertedIndex.EnableMadvNormalReadAhead() - h.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableMadvNormal() - if item.index != nil { - item.index.EnableMadvNormal() - } - } - return true - }) - return h -} - // HistoryStep used for incremental state reconsitution, it isolates only one snapshot interval type HistoryStep struct { compressVals bool @@ -2090,7 +2229,7 @@ func (h *History) MakeSteps(toTxNum uint64) []*HistoryStep { } step := &HistoryStep{ - compressVals: h.compressVals, + compressVals: h.compression&CompressVals != 0, indexItem: item, indexFile: ctxItem{ startTxNum: item.startTxNum, @@ -2145,7 +2284,7 @@ func (hs *HistoryStep) Clone() *HistoryStep { func (hc *HistoryContext) idxRangeRecent(key []byte, startTxNum, endTxNum int, asc order.By, limit int, roTx kv.Tx) (iter.U64, error) { var dbIt iter.U64 - if hc.h.largeValues { + if hc.h.historyLargeValues { if asc { from := make([]byte, len(key)+8) copy(from, key) diff --git a/state/history_test.go b/state/history_test.go index 647e14a39..68b935029 100644 --- a/state/history_test.go +++ b/state/history_test.go @@ -21,10 +21,16 @@ import ( "encoding/binary" "fmt" "math" + "os" + "path/filepath" "strings" "testing" "time" + "github.com/ledgerwatch/log/v3" + "github.com/stretchr/testify/require" + btree2 "github.com/tidwall/btree" + "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/hexutility" "github.com/ledgerwatch/erigon-lib/kv" @@ -33,14 +39,14 @@ import ( "github.com/ledgerwatch/erigon-lib/kv/order" "github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" - "github.com/ledgerwatch/log/v3" - "github.com/stretchr/testify/require" - btree2 "github.com/tidwall/btree" ) -func testDbAndHistory(tb testing.TB, largeValues bool, 
logger log.Logger) (string, kv.RwDB, *History) { +func testDbAndHistory(tb testing.TB, largeValues bool, logger log.Logger) (kv.RwDB, *History) { tb.Helper() path := tb.TempDir() + dir := filepath.Join(path, "snapshots", "history") + require.NoError(tb, os.MkdirAll(filepath.Join(path, "snapshots", "warm"), 0740)) + require.NoError(tb, os.MkdirAll(dir, 0740)) keysTable := "AccountKeys" indexTable := "AccountIndex" valsTable := "AccountVals" @@ -53,12 +59,18 @@ func testDbAndHistory(tb testing.TB, largeValues bool, logger log.Logger) (strin settingsTable: kv.TableCfgItem{}, } }).MustOpen() - h, err := NewHistory(path, path, 16, "hist", keysTable, indexTable, valsTable, false, nil, false, logger) + //TODO: tests will fail if set histCfg.compression = CompressKeys | CompressValues + salt := uint32(1) + cfg := histCfg{ + iiCfg: iiCfg{salt: &salt, dir: dir, tmpdir: dir}, + withLocalityIndex: false, withExistenceIndex: true, compression: CompressNone, historyLargeValues: largeValues, + } + h, err := NewHistory(cfg, 16, "hist", keysTable, indexTable, valsTable, nil, logger) require.NoError(tb, err) h.DisableFsync() tb.Cleanup(db.Close) tb.Cleanup(h.Close) - return path, db, h + return db, h } func TestHistoryCollationBuild(t *testing.T) { @@ -165,11 +177,11 @@ func TestHistoryCollationBuild(t *testing.T) { } } t.Run("large_values", func(t *testing.T) { - _, db, h := testDbAndHistory(t, true, logger) + db, h := testDbAndHistory(t, true, logger) test(t, h, db) }) t.Run("small_values", func(t *testing.T) { - _, db, h := testDbAndHistory(t, false, logger) + db, h := testDbAndHistory(t, false, logger) test(t, h, db) }) } @@ -220,7 +232,10 @@ func TestHistoryAfterPrune(t *testing.T) { h.integrateFiles(sf, 0, 16) - err = h.prune(ctx, 0, 16, math.MaxUint64, logEvery) + hc := h.MakeContext() + err = hc.Prune(ctx, tx, 0, 16, math.MaxUint64, logEvery) + hc.Close() + require.NoError(err) h.SetTx(tx) @@ -236,24 +251,24 @@ func TestHistoryAfterPrune(t *testing.T) { } } 
t.Run("large_values", func(t *testing.T) { - _, db, h := testDbAndHistory(t, true, logger) + db, h := testDbAndHistory(t, true, logger) test(t, h, db) }) t.Run("small_values", func(t *testing.T) { - _, db, h := testDbAndHistory(t, false, logger) + db, h := testDbAndHistory(t, false, logger) test(t, h, db) }) } -func filledHistory(tb testing.TB, largeValues bool, logger log.Logger) (string, kv.RwDB, *History, uint64) { +func filledHistory(tb testing.TB, largeValues bool, logger log.Logger) (kv.RwDB, *History, uint64) { tb.Helper() - path, db, h := testDbAndHistory(tb, largeValues, logger) + db, h := testDbAndHistory(tb, largeValues, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(tb, err) defer tx.Rollback() h.SetTx(tx) - h.StartWrites() + h.StartUnbufferedWrites() defer h.FinishWrites() txs := uint64(1000) @@ -295,7 +310,7 @@ func filledHistory(tb testing.TB, largeValues bool, logger log.Logger) (string, err = tx.Commit() require.NoError(tb, err) - return path, db, h, txs + return db, h, txs } func checkHistoryHistory(t *testing.T, h *History, txs uint64) { @@ -349,18 +364,21 @@ func TestHistoryHistory(t *testing.T) { sf, err := h.buildFiles(ctx, step, c, background.NewProgressSet()) require.NoError(err) h.integrateFiles(sf, step*h.aggregationStep, (step+1)*h.aggregationStep) - err = h.prune(ctx, step*h.aggregationStep, (step+1)*h.aggregationStep, math.MaxUint64, logEvery) + + hc := h.MakeContext() + err = hc.Prune(ctx, tx, step*h.aggregationStep, (step+1)*h.aggregationStep, math.MaxUint64, logEvery) + hc.Close() require.NoError(err) }() } checkHistoryHistory(t, h, txs) } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) @@ -385,20 +403,23 @@ func 
collateAndMergeHistory(tb testing.TB, db kv.RwDB, h *History, txs uint64) { sf, err := h.buildFiles(ctx, step, c, background.NewProgressSet()) require.NoError(err) h.integrateFiles(sf, step*h.aggregationStep, (step+1)*h.aggregationStep) - err = h.prune(ctx, step*h.aggregationStep, (step+1)*h.aggregationStep, math.MaxUint64, logEvery) + + hc := h.MakeContext() + err = hc.Prune(ctx, tx, step*h.aggregationStep, (step+1)*h.aggregationStep, math.MaxUint64, logEvery) + hc.Close() require.NoError(err) } var r HistoryRanges maxEndTxNum := h.endTxNumMinimax() - maxSpan := h.aggregationStep * StepsInBiggestFile + maxSpan := h.aggregationStep * StepsInColdFile for { if stop := func() bool { hc := h.MakeContext() defer hc.Close() - r = h.findMergeRange(maxEndTxNum, maxSpan) + r = hc.findMergeRange(maxEndTxNum, maxSpan) if !r.any() { return true } @@ -415,7 +436,7 @@ func collateAndMergeHistory(tb testing.TB, db kv.RwDB, h *History, txs uint64) { hc := h.MakeContext() defer hc.Close() - err = hc.BuildOptionalMissedIndices(ctx) + err = hc.ic.BuildOptionalMissedIndices(ctx, background.NewProgressSet()) require.NoError(err) err = tx.Commit() @@ -431,11 +452,11 @@ func TestHistoryMergeFiles(t *testing.T) { } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) } @@ -458,11 +479,11 @@ func TestHistoryScanFiles(t *testing.T) { } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) } @@ -606,11 +627,11 @@ func TestIterateChanged(t *testing.T) { 
require.Equal([]string{"ff000000000003cf", "ff000000000001e7"}, vals) } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) } @@ -798,20 +819,18 @@ func TestIterateChanged2(t *testing.T) { }) } t.Run("large_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, true, logger) + db, h, txs := filledHistory(t, true, logger) test(t, h, db, txs) }) t.Run("small_values", func(t *testing.T) { - _, db, h, txs := filledHistory(t, false, logger) + db, h, txs := filledHistory(t, false, logger) test(t, h, db, txs) }) } func TestScanStaticFilesH(t *testing.T) { - logger := log.New() - h := &History{InvertedIndex: &InvertedIndex{filenameBase: "test", aggregationStep: 1, logger: logger}, - files: btree2.NewBTreeG[*filesItem](filesItemLess), - logger: logger, + h := &History{InvertedIndex: emptyTestInvertedIndex(1), + files: btree2.NewBTreeG[*filesItem](filesItemLess), } files := []string{ "test.0-1.v", diff --git a/state/inverted_index.go b/state/inverted_index.go index 5ef8e6ec2..ff8bfae39 100644 --- a/state/inverted_index.go +++ b/state/inverted_index.go @@ -31,9 +31,15 @@ import ( "time" "github.com/RoaringBitmap/roaring/roaring64" - "github.com/c2h5oh/datasize" + "github.com/ledgerwatch/log/v3" + "github.com/spaolacci/murmur3" + btree2 "github.com/tidwall/btree" + "golang.org/x/exp/slices" + "golang.org/x/sync/errgroup" + "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/cmp" + "github.com/ledgerwatch/erigon-lib/common/dbg" "github.com/ledgerwatch/erigon-lib/common/dir" "github.com/ledgerwatch/erigon-lib/compress" "github.com/ledgerwatch/erigon-lib/etl" @@ -43,13 +49,10 @@ import ( "github.com/ledgerwatch/erigon-lib/kv/order" 
"github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" - "github.com/ledgerwatch/log/v3" - btree2 "github.com/tidwall/btree" - "golang.org/x/exp/slices" - "golang.org/x/sync/errgroup" ) type InvertedIndex struct { + iiCfg files *btree2.BTreeG[*filesItem] // thread-safe, but maybe need 1 RWLock for all trees in AggregatorV3 // roFiles derivative from field `file`, but without garbage (canDelete=true, overlaps, etc...) @@ -58,15 +61,20 @@ type InvertedIndex struct { indexKeysTable string // txnNum_u64 -> key (k+auto_increment) indexTable string // k -> txnNum_u64 , Needs to be table with DupSort - dir, tmpdir string // Directory where static files are created + warmDir string // Directory where static files are created filenameBase string aggregationStep uint64 - compressWorkers int integrityFileExtensions []string withLocalityIndex bool - localityIndex *LocalityIndex - tx kv.RwTx + withExistenceIndex bool + + // localityIdx of warm files - storing `steps` where `key` was updated + // - need re-calc when new file created + // - don't need re-calc after files merge - because merge doesn't change `steps` where `key` was updated + warmLocalityIdx *LocalityIndex + coldLocalityIdx *LocalityIndex + tx kv.RwTx garbageFiles []*filesItem // files that exist on disk, but ignored on opening folder - because they are garbage @@ -77,21 +85,30 @@ type InvertedIndex struct { logger log.Logger noFsync bool // fsync is enabled by default, but tests can manually disable + + compression FileCompression + compressWorkers int +} + +type iiCfg struct { + salt *uint32 + dir, tmpdir string } func NewInvertedIndex( - dir, tmpdir string, + cfg iiCfg, aggregationStep uint64, filenameBase string, indexKeysTable string, indexTable string, - withLocalityIndex bool, + withLocalityIndex, withExistenceIndex bool, integrityFileExtensions []string, logger log.Logger, ) (*InvertedIndex, error) { + baseDir := filepath.Dir(cfg.dir) ii := InvertedIndex{ - 
dir: dir, - tmpdir: tmpdir, + iiCfg: cfg, + warmDir: filepath.Join(baseDir, "warm"), files: btree2.NewBTreeGOptions[*filesItem](filesItemLess, btree2.Options{Degree: 128, NoLocks: false}), aggregationStep: aggregationStep, filenameBase: filenameBase, @@ -100,24 +117,36 @@ func NewInvertedIndex( compressWorkers: 1, integrityFileExtensions: integrityFileExtensions, withLocalityIndex: withLocalityIndex, + withExistenceIndex: withExistenceIndex, logger: logger, } ii.roFiles.Store(&[]ctxItem{}) if ii.withLocalityIndex { - var err error - ii.localityIndex, err = NewLocalityIndex(ii.dir, ii.tmpdir, ii.aggregationStep, ii.filenameBase, ii.logger) - if err != nil { - return nil, fmt.Errorf("NewHistory: %s, %w", ii.filenameBase, err) + if err := ii.enableLocalityIndex(); err != nil { + return nil, err } } return &ii, nil } -func (ii *InvertedIndex) fileNamesOnDisk() ([]string, error) { +func (ii *InvertedIndex) enableLocalityIndex() error { + var err error + ii.warmLocalityIdx = NewLocalityIndex(true, ii.warmDir, ii.filenameBase, ii.aggregationStep, ii.tmpdir, ii.salt, ii.logger) + if err != nil { + return fmt.Errorf("NewHistory: %s, %w", ii.filenameBase, err) + } + ii.coldLocalityIdx = NewLocalityIndex(false, ii.dir, ii.filenameBase, ii.aggregationStep, ii.tmpdir, ii.salt, ii.logger) + if err != nil { + return fmt.Errorf("NewHistory: %s, %w", ii.filenameBase, err) + } + return nil +} + +func (ii *InvertedIndex) fileNamesOnDisk() ([]string, []string, error) { files, err := os.ReadDir(ii.dir) if err != nil { - return nil, err + return nil, nil, fmt.Errorf("ReadDir: %w, %s", err, ii.dir) } filteredFiles := make([]string, 0, len(files)) for _, f := range files { @@ -126,27 +155,43 @@ func (ii *InvertedIndex) fileNamesOnDisk() ([]string, error) { } filteredFiles = append(filteredFiles, f.Name()) } - return filteredFiles, nil + + warmFiles := make([]string, 0, len(files)) + files, err = os.ReadDir(ii.warmDir) + if err != nil { + return nil, nil, fmt.Errorf("ReadDir: %w, %s", err, 
ii.dir) + } + for _, f := range files { + if !f.Type().IsRegular() { + continue + } + warmFiles = append(warmFiles, f.Name()) + } + + return filteredFiles, warmFiles, nil } -func (ii *InvertedIndex) OpenList(fNames []string) error { - if err := ii.localityIndex.OpenList(fNames); err != nil { +func (ii *InvertedIndex) OpenList(fNames, warmFNames []string) error { + if err := ii.warmLocalityIdx.OpenList(warmFNames); err != nil { + return err + } + if err := ii.coldLocalityIdx.OpenList(fNames); err != nil { return err } ii.closeWhatNotInList(fNames) ii.garbageFiles = ii.scanStateFiles(fNames) if err := ii.openFiles(); err != nil { - return fmt.Errorf("NewHistory.openFiles: %s, %w", ii.filenameBase, err) + return fmt.Errorf("InvertedIndex.openFiles: %s, %w", ii.filenameBase, err) } return nil } func (ii *InvertedIndex) OpenFolder() error { - files, err := ii.fileNamesOnDisk() + files, warm, err := ii.fileNamesOnDisk() if err != nil { return err } - return ii.OpenList(files) + return ii.OpenList(files, warm) } func (ii *InvertedIndex) scanStateFiles(fileNames []string) (garbageFiles []*filesItem) { @@ -192,24 +237,29 @@ Loop: } addNewFile := true - var subSets []*filesItem - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.isSubsetOf(newFile) { - subSets = append(subSets, item) - continue - } + /* + var subSets []*filesItem + ii.files.Walk(func(items []*filesItem) bool { + for _, item := range items { + if item.isSubsetOf(newFile) { + fmt.Printf("skip is subset %s.%d-%d.ef of %s.%d-%d.ef\n", ii.filenameBase, item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep, ii.filenameBase, newFile.startTxNum/ii.aggregationStep, newFile.endTxNum/ii.aggregationStep) + subSets = append(subSets, item) + continue + } - if newFile.isSubsetOf(item) { - if item.frozen { - addNewFile = false - garbageFiles = append(garbageFiles, newFile) + if newFile.isSubsetOf(item) { + //if item.frozen { + //fmt.Printf("skip2 is subperset %s.%d-%d.ef 
of %s.%d-%d.ef, %t, %t\n", ii.filenameBase, item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep, ii.filenameBase, newFile.startTxNum/ii.aggregationStep, newFile.endTxNum/ii.aggregationStep, item.frozen, newFile.frozen) + //addNewFile = false + //garbageFiles = append(garbageFiles, newFile) + //} + return false } - continue } - } - return true - }) + return true + }) + */ + //for _, subSet := range subSets { // ii.files.Delete(subSet) //} @@ -217,11 +267,10 @@ Loop: ii.files.Set(newFile) } } - return garbageFiles } -func ctxFiles(files *btree2.BTreeG[*filesItem]) (roItems []ctxItem) { +func ctxFiles(files *btree2.BTreeG[*filesItem], requireHashIndex, requireBTreeIndex bool) (roItems []ctxItem) { roFiles := make([]ctxItem, 0, files.Len()) files.Walk(func(items []*filesItem) bool { for _, item := range items { @@ -229,6 +278,14 @@ func ctxFiles(files *btree2.BTreeG[*filesItem]) (roItems []ctxItem) { continue } + // TODO: need somehow handle this case, but indices do not open in tests TestFindMergeRangeCornerCases + //if requireHashIndex && item.index == nil { + // continue + //} + //if requireBTreeIndex && item.bindex == nil { + // continue + //} + // `kill -9` may leave small garbage files, but if big one already exists we assume it's good(fsynced) and no reason to merge again // see super-set file, just drop sub-set files from list for len(roFiles) > 0 && roFiles[len(roFiles)-1].src.isSubsetOf(item) { @@ -251,7 +308,7 @@ func ctxFiles(files *btree2.BTreeG[*filesItem]) (roItems []ctxItem) { } func (ii *InvertedIndex) reCalcRoFiles() { - roFiles := ctxFiles(ii.files) + roFiles := ctxFiles(ii.files, true, false) ii.roFiles.Store(&roFiles) } @@ -267,27 +324,99 @@ func (ii *InvertedIndex) missedIdxFiles() (l []*filesItem) { }) return l } +func (ii *InvertedIndex) missedIdxFilterFiles() (l []*filesItem) { + ii.files.Walk(func(items []*filesItem) bool { + for _, item := range items { + fromStep, toStep := item.startTxNum/ii.aggregationStep, 
item.endTxNum/ii.aggregationStep + if !dir.FileExist(filepath.Join(ii.dir, fmt.Sprintf("%s.%d-%d.efei", ii.filenameBase, fromStep, toStep))) { + l = append(l, item) + } + } + return true + }) + return l +} -func (ii *InvertedIndex) buildEfi(ctx context.Context, item *filesItem, p *background.Progress) (err error) { +func (ii *InvertedIndex) buildEfi(ctx context.Context, item *filesItem, ps *background.ProgressSet) (err error) { fromStep, toStep := item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep fName := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, fromStep, toStep) idxPath := filepath.Join(ii.dir, fName) - p.Name.Store(&fName) - p.Total.Store(uint64(item.decompressor.Count())) - //ii.logger.Info("[snapshots] build idx", "file", fName) - return buildIndex(ctx, item.decompressor, idxPath, ii.tmpdir, item.decompressor.Count()/2, false, p, ii.logger, ii.noFsync) + + return buildIndex(ctx, item.decompressor, CompressNone, idxPath, ii.tmpdir, false, ii.salt, ps, ii.logger, ii.noFsync) +} +func (ii *InvertedIndex) buildIdxFilter(ctx context.Context, item *filesItem, ps *background.ProgressSet) (err error) { + fromStep, toStep := item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep + fName := fmt.Sprintf("%s.%d-%d.efei", ii.filenameBase, fromStep, toStep) + idxPath := filepath.Join(ii.dir, fName) + return buildIdxFilter(ctx, item.decompressor, CompressNone, idxPath, ii.tmpdir, ii.salt, ps, ii.logger, ii.noFsync) +} +func buildIdxFilter(ctx context.Context, d *compress.Decompressor, compressed FileCompression, idxPath, tmpdir string, salt *uint32, ps *background.ProgressSet, logger log.Logger, noFsync bool) error { + g := NewArchiveGetter(d.MakeGetter(), compressed) + _, fileName := filepath.Split(idxPath) + count := d.Count() / 2 + if count < 2 { + return nil + } + + p := ps.AddNew(fileName, uint64(count)) + defer ps.Delete(p) + defer d.EnableReadAhead().DisableReadAhead() + + idxFilter, err := NewBloom(uint64(count), idxPath) + if err 
!= nil { + return err + } + hasher := murmur3.New128WithSeed(*salt) + + key := make([]byte, 0, 256) + g.Reset(0) + for g.HasNext() { + key, _ = g.Next(key[:0]) + hasher.Reset() + hasher.Write(key) //nolint:errcheck + hi, _ := hasher.Sum128() + idxFilter.AddHash(hi) + + // Skip value + g.Skip() + + p.Processed.Add(1) + } + if err := idxFilter.Build(); err != nil { + return err + } + + return nil } // BuildMissedIndices - produce .efi/.vi/.kvi from .ef/.v/.kv func (ii *InvertedIndex) BuildMissedIndices(ctx context.Context, g *errgroup.Group, ps *background.ProgressSet) { - missedFiles := ii.missedIdxFiles() - for _, item := range missedFiles { + for _, item := range ii.missedIdxFiles() { item := item g.Go(func() error { - p := &background.Progress{} - ps.Add(p) - defer ps.Delete(p) - return ii.buildEfi(ctx, item, p) + return ii.buildEfi(ctx, item, ps) + }) + } + + for _, item := range ii.missedIdxFilterFiles() { + item := item + g.Go(func() error { + return ii.buildIdxFilter(ctx, item, ps) + }) + } + + if ii.withLocalityIndex && ii.warmLocalityIdx != nil { + g.Go(func() error { + ic := ii.MakeContext() + defer ic.Close() + from, to := ic.minWarmStep(), ic.maxWarmStep() + if from == to || ic.ii.warmLocalityIdx.exists(from, to) { + return nil + } + if err := ic.ii.warmLocalityIdx.BuildMissedIndices(ctx, from, to, false, ps, func() *LocalityIterator { return ic.iterateKeysLocality(ctx, from, to, nil) }); err != nil { + return err + } + return nil }) } } @@ -313,16 +442,25 @@ func (ii *InvertedIndex) openFiles() error { continue } - if item.index != nil { - continue + if item.index == nil { + idxPath := filepath.Join(ii.dir, fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + if item.index, err = recsplit.OpenIndex(idxPath); err != nil { + ii.logger.Debug("InvertedIndex.openFiles: %w, %s", err, idxPath) + return false + } + totalKeys += item.index.KeyCount() + } } - idxPath := filepath.Join(ii.dir, fmt.Sprintf("%s.%d-%d.efi", 
ii.filenameBase, fromStep, toStep)) - if dir.FileExist(idxPath) { - if item.index, err = recsplit.OpenIndex(idxPath); err != nil { - ii.logger.Debug("InvertedIndex.openFiles: %w, %s", err, idxPath) - return false + if item.bloom == nil && ii.withExistenceIndex { + idxPath := filepath.Join(ii.dir, fmt.Sprintf("%s.%d-%d.efei", ii.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + if item.bloom, err = OpenBloom(idxPath); err != nil { + ii.logger.Debug("InvertedIndex.openFiles: %w, %s", err, idxPath) + return false + } + totalKeys += item.index.KeyCount() } - totalKeys += item.index.KeyCount() } } return true @@ -366,7 +504,8 @@ func (ii *InvertedIndex) closeWhatNotInList(fNames []string) { } func (ii *InvertedIndex) Close() { - ii.localityIndex.Close() + ii.warmLocalityIdx.Close() + ii.coldLocalityIdx.Close() ii.closeWhatNotInList([]string{}) ii.reCalcRoFiles() } @@ -374,15 +513,12 @@ func (ii *InvertedIndex) Close() { // DisableFsync - just for tests func (ii *InvertedIndex) DisableFsync() { ii.noFsync = true } -func (ii *InvertedIndex) Files() (res []string) { - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.decompressor != nil { - res = append(res, item.decompressor.FileName()) - } +func (ic *InvertedIndexContext) Files() (res []string) { + for _, item := range ic.files { + if item.src.decompressor != nil { + res = append(res, item.src.decompressor.FileName()) } - return true - }) + } return res } @@ -420,6 +556,14 @@ func (ii *InvertedIndex) FinishWrites() { func (ii *InvertedIndex) Rotate() *invertedIndexWAL { wal := ii.wal if wal != nil { + if wal.buffered { + if err := wal.index.Flush(); err != nil { + panic(err) + } + if err := wal.indexKeys.Flush(); err != nil { + panic(err) + } + } ii.wal = ii.newWriter(ii.wal.tmpdir, ii.wal.buffered, ii.wal.discard) } return wal @@ -466,19 +610,9 @@ func (ii *invertedIndexWAL) close() { } } -// 3 history + 4 indices = 10 etl collectors, 10*256Mb/8 = 512mb - for all 
indices buffers -var WALCollectorRAM = 2 * (etl.BufferOptimalSize / 8) - -func init() { - v, _ := os.LookupEnv("ERIGON_WAL_COLLETOR_RAM") - if v != "" { - var err error - WALCollectorRAM, err = datasize.ParseString(v) - if err != nil { - panic(err) - } - } -} +// 3_domains * 2 + 3_history * 1 + 4_indices * 2 = 17 etl collectors, 17*(256Mb/8) = 512Mb - for all collectros +var WALCollectorRAM = dbg.EnvDataSize("AGG_WAL_RAM", etl.BufferOptimalSize/8) +var AggTraceFileLife = dbg.EnvString("AGG_TRACE_FILE_LIFE", "") func (ii *InvertedIndex) newWriter(tmpdir string, buffered, discard bool) *invertedIndexWAL { w := &invertedIndexWAL{ii: ii, @@ -521,27 +655,36 @@ func (ii *invertedIndexWAL) add(key, indexKey []byte) error { } func (ii *InvertedIndex) MakeContext() *InvertedIndexContext { - var ic = InvertedIndexContext{ - ii: ii, - files: *ii.roFiles.Load(), - loc: ii.localityIndex.MakeContext(), - } - for _, item := range ic.files { - if !item.src.frozen { - item.src.refcount.Add(1) + files := *ii.roFiles.Load() + for i := 0; i < len(files); i++ { + if !files[i].src.frozen { + files[i].src.refcount.Add(1) } } - return &ic + return &InvertedIndexContext{ + ii: ii, + files: files, + warmLocality: ii.warmLocalityIdx.MakeContext(), + coldLocality: ii.coldLocalityIdx.MakeContext(), + } } func (ic *InvertedIndexContext) Close() { - for _, item := range ic.files { - if item.src.frozen { + if ic.files == nil { // invariant: it's safe to call Close multiple times + return + } + files := ic.files + ic.files = nil + for i := 0; i < len(files); i++ { + if files[i].src.frozen { continue } - refCnt := item.src.refcount.Add(-1) + refCnt := files[i].src.refcount.Add(-1) //GC: last reader responsible to remove useles files: close it and delete - if refCnt == 0 && item.src.canDelete.Load() { - item.src.closeFilesAndRemove() + if refCnt == 0 && files[i].src.canDelete.Load() { + if ic.ii.filenameBase == AggTraceFileLife { + ic.ii.logger.Warn(fmt.Sprintf("[agg] real remove at ctx close: %s", 
files[i].src.decompressor.FileName())) + } + files[i].src.closeFilesAndRemove() } } @@ -549,24 +692,43 @@ func (ic *InvertedIndexContext) Close() { r.Close() } - ic.loc.Close(ic.ii.logger) + ic.warmLocality.Close() + ic.coldLocality.Close() } type InvertedIndexContext struct { ii *InvertedIndex files []ctxItem // have no garbage (overlaps, etc...) - getters []*compress.Getter + getters []ArchiveGetter readers []*recsplit.IndexReader - loc *ctxLocalityIdx + + warmLocality *ctxLocalityIdx + coldLocality *ctxLocalityIdx + + _hasher murmur3.Hash128 } -func (ic *InvertedIndexContext) statelessGetter(i int) *compress.Getter { +func (ic *InvertedIndexContext) statelessHasher() murmur3.Hash128 { + if ic._hasher == nil { + ic._hasher = murmur3.New128WithSeed(*ic.ii.salt) + } + return ic._hasher +} +func (ic *InvertedIndexContext) hashKey(k []byte) (hi, lo uint64) { + hasher := ic.statelessHasher() + ic._hasher.Reset() + _, _ = hasher.Write(k) //nolint:errcheck + return hasher.Sum128() +} + +func (ic *InvertedIndexContext) statelessGetter(i int) ArchiveGetter { if ic.getters == nil { - ic.getters = make([]*compress.Getter, len(ic.files)) + ic.getters = make([]ArchiveGetter, len(ic.files)) } r := ic.getters[i] if r == nil { - r = ic.files[i].src.decompressor.MakeGetter() + g := ic.files[i].src.decompressor.MakeGetter() + r = NewArchiveGetter(g, ic.ii.compression) ic.getters[i] = r } return r @@ -592,6 +754,40 @@ func (ic *InvertedIndexContext) getFile(from, to uint64) (it ctxItem, ok bool) { return it, false } +func (ic *InvertedIndexContext) Seek(key []byte, txNum uint64) (found bool, equalOrHigherTxNum uint64) { + hi, lo := ic.hashKey(key) + + for i := 0; i < len(ic.files); i++ { + if ic.files[i].endTxNum <= txNum { + continue + } + if ic.ii.withExistenceIndex && ic.files[i].src.bloom != nil { + if !ic.files[i].src.bloom.ContainsHash(hi) { + continue + } + } + reader := ic.statelessIdxReader(i) + if reader.Empty() { + continue + } + offset := reader.LookupHash(hi, lo) + + g 
:= ic.statelessGetter(i) + g.Reset(offset) + k, _ := g.Next(nil) + if !bytes.Equal(k, key) { + continue + } + eliasVal, _ := g.Next(nil) + equalOrHigherTxNum, found = eliasfano32.Seek(eliasVal, txNum) + + if found { + return true, equalOrHigherTxNum + } + } + return false, 0 +} + // IdxRange - return range of txNums for given `key` // is to be used in public API, therefore it relies on read-only transaction // so that iteration can be done even when the inverted index is being updated. @@ -697,6 +893,118 @@ func (ic *InvertedIndexContext) iterateRangeFrozen(key []byte, startTxNum, endTx return it, nil } +// [txFrom; txTo) +func (ic *InvertedIndexContext) Prune(ctx context.Context, rwTx kv.RwTx, txFrom, txTo, limit uint64, logEvery *time.Ticker) error { + ii := ic.ii + defer func(t time.Time) { mxPruneTookIndex.UpdateDuration(t) }(time.Now()) + + keysCursor, err := rwTx.RwCursorDupSort(ii.indexKeysTable) + if err != nil { + return fmt.Errorf("create %s keys cursor: %w", ii.filenameBase, err) + } + defer keysCursor.Close() + var txKey [8]byte + binary.BigEndian.PutUint64(txKey[:], txFrom) + k, v, err := keysCursor.Seek(txKey[:]) + if err != nil { + return err + } + if k == nil { + return nil + } + txFrom = binary.BigEndian.Uint64(k) + if limit != math.MaxUint64 && limit != 0 { + txTo = cmp.Min(txTo, txFrom+limit) + } + if txFrom >= txTo { + return nil + } + + collector := etl.NewCollector("snapshots", ii.tmpdir, etl.NewOldestEntryBuffer(etl.BufferOptimalSize), ii.logger) + defer collector.Close() + collector.LogLvl(log.LvlDebug) + + idxCForDeletes, err := rwTx.RwCursorDupSort(ii.indexTable) + if err != nil { + return err + } + defer idxCForDeletes.Close() + idxC, err := rwTx.RwCursorDupSort(ii.indexTable) + if err != nil { + return err + } + defer idxC.Close() + + // Invariant: if some `txNum=N` pruned - it's pruned Fully + // Means: can use DeleteCurrentDuplicates all values of given `txNum` + for ; k != nil; k, v, err = keysCursor.NextNoDup() { + if err != nil { + 
return err + } + + txNum := binary.BigEndian.Uint64(k) + if txNum >= txTo { + break + } + for ; v != nil; _, v, err = keysCursor.NextDup() { + if err != nil { + return err + } + if err := collector.Collect(v, nil); err != nil { + return err + } + } + if ctx.Err() != nil { + return ctx.Err() + } + + // This DeleteCurrent needs to the last in the loop iteration, because it invalidates k and v + if err = rwTx.Delete(ii.indexKeysTable, k); err != nil { + return err + } + } + if err != nil { + return fmt.Errorf("iterate over %s keys: %w", ii.filenameBase, err) + } + + var pruneCount uint64 + if err := collector.Load(rwTx, "", func(key, _ []byte, table etl.CurrentTableReader, next etl.LoadNextFunc) error { + for v, err := idxC.SeekBothRange(key, txKey[:]); v != nil; _, v, err = idxC.NextDup() { + if err != nil { + return err + } + txNum := binary.BigEndian.Uint64(v) + if txNum >= txTo { + break + } + + if _, _, err = idxCForDeletes.SeekBothExact(key, v); err != nil { + return err + } + if err = idxCForDeletes.DeleteCurrent(); err != nil { + return err + } + pruneCount++ + mxPruneSizeIndex.Inc() + + select { + case <-logEvery.C: + ii.logger.Info("[snapshots] prune history", "name", ii.filenameBase, + "to_step", fmt.Sprintf("%.2f", float64(txTo)/float64(ii.aggregationStep)), "prefix", fmt.Sprintf("%x", key[:8]), + "pruned count", pruneCount) + case <-ctx.Done(): + return ctx.Err() + default: + } + } + return nil + }, etl.TransformArgs{}); err != nil { + return err + } + + return nil +} + // FrozenInvertedIdxIter allows iteration over range of tx numbers // Iteration is not implmented via callback function, because there is often // a requirement for interators to be composable (for example, to implement AND and OR for indices) @@ -995,9 +1303,9 @@ func (it *InvertedIterator1) advanceInFiles() { for it.h.Len() > 0 { top := heap.Pop(&it.h).(*ReconItem) key := top.key - val, _ := top.g.NextUncompressed() + val, _ := top.g.Next(nil) if top.g.HasNext() { - top.key, _ = 
top.g.NextUncompressed() + top.key, _ = top.g.Next(nil) heap.Push(&it.h, top) } if !bytes.Equal(key, it.key) { @@ -1103,9 +1411,9 @@ func (ic *InvertedIndexContext) IterateChangedKeys(startTxNum, endTxNum uint64, if item.endTxNum >= endTxNum { ii1.hasNextInDb = false } - g := item.src.decompressor.MakeGetter() + g := NewArchiveGetter(item.src.decompressor.MakeGetter(), ic.ii.compression) if g.HasNext() { - key, _ := g.NextUncompressed() + key, _ := g.Next(nil) heap.Push(&ii1.h, &ReconItem{startTxNum: item.startTxNum, endTxNum: item.endTxNum, g: g, txNum: ^item.endTxNum, key: key}) ii1.hasNextInFiles = true } @@ -1119,7 +1427,14 @@ func (ic *InvertedIndexContext) IterateChangedKeys(startTxNum, endTxNum uint64, return ii1 } -func (ii *InvertedIndex) collate(ctx context.Context, txFrom, txTo uint64, roTx kv.Tx) (map[string]*roaring64.Bitmap, error) { +// collate [stepFrom, stepTo) +func (ii *InvertedIndex) collate(ctx context.Context, stepFrom, stepTo uint64, roTx kv.Tx) (map[string]*roaring64.Bitmap, error) { + txFrom, txTo := stepFrom*ii.aggregationStep, stepTo*ii.aggregationStep + mxRunningCollations.Inc() + start := time.Now() + defer mxRunningCollations.Dec() + defer mxCollateTook.UpdateDuration(start) + keysCursor, err := roTx.CursorDupSort(ii.indexKeysTable) if err != nil { return nil, fmt.Errorf("create %s keys cursor: %w", ii.filenameBase, err) @@ -1129,7 +1444,10 @@ func (ii *InvertedIndex) collate(ctx context.Context, txFrom, txTo uint64, roTx var txKey [8]byte binary.BigEndian.PutUint64(txKey[:], txFrom) var k, v []byte - for k, v, err = keysCursor.Seek(txKey[:]); err == nil && k != nil; k, v, err = keysCursor.Next() { + for k, v, err = keysCursor.Seek(txKey[:]); k != nil; k, v, err = keysCursor.Next() { + if err != nil { + return nil, fmt.Errorf("iterate over %s keys cursor: %w", ii.filenameBase, err) + } txNum := binary.BigEndian.Uint64(k) if txNum >= txTo { break @@ -1148,18 +1466,18 @@ func (ii *InvertedIndex) collate(ctx context.Context, txFrom, txTo 
uint64, roTx default: } } - if err != nil { - return nil, fmt.Errorf("iterate over %s keys cursor: %w", ii.filenameBase, err) - } return indexBitmaps, nil } type InvertedFiles struct { - decomp *compress.Decompressor - index *recsplit.Index + decomp *compress.Decompressor + index *recsplit.Index + existence *bloomFilter + warmLocality *LocalityIndexFiles + coldLocality *LocalityIndexFiles } -func (sf InvertedFiles) Close() { +func (sf InvertedFiles) CleanupOnError() { if sf.decomp != nil { sf.decomp.Close() } @@ -1168,11 +1486,18 @@ func (sf InvertedFiles) Close() { } } +// buildFiles - `step=N` means build file `[N:N+1)` which is equal to [N:N+1) func (ii *InvertedIndex) buildFiles(ctx context.Context, step uint64, bitmaps map[string]*roaring64.Bitmap, ps *background.ProgressSet) (InvertedFiles, error) { - var decomp *compress.Decompressor - var index *recsplit.Index - var comp *compress.Compressor - var err error + start := time.Now() + defer mxBuildTook.UpdateDuration(start) + + var ( + decomp *compress.Decompressor + index *recsplit.Index + existence *bloomFilter + comp *compress.Compressor + err error + ) closeComp := true defer func() { if closeComp { @@ -1187,9 +1512,7 @@ func (ii *InvertedIndex) buildFiles(ctx context.Context, step uint64, bitmaps ma } } }() - txNumFrom := step * ii.aggregationStep - txNumTo := (step + 1) * ii.aggregationStep - datFileName := fmt.Sprintf("%s.%d-%d.ef", ii.filenameBase, txNumFrom/ii.aggregationStep, txNumTo/ii.aggregationStep) + datFileName := fmt.Sprintf("%s.%d-%d.ef", ii.filenameBase, step, step+1) datPath := filepath.Join(ii.dir, datFileName) keys := make([]string, 0, len(bitmaps)) for key := range bitmaps { @@ -1203,9 +1526,10 @@ func (ii *InvertedIndex) buildFiles(ctx context.Context, step uint64, bitmaps ma if err != nil { return InvertedFiles{}, fmt.Errorf("create %s compressor: %w", ii.filenameBase, err) } + writer := NewArchiveWriter(comp, ii.compression) var buf []byte for _, key := range keys { - if err = 
comp.AddUncompressedWord([]byte(key)); err != nil { + if err = writer.AddWord([]byte(key)); err != nil { return InvertedFiles{}, fmt.Errorf("add %s key [%x]: %w", ii.filenameBase, key, err) } bitmap := bitmaps[key] @@ -1216,7 +1540,7 @@ func (ii *InvertedIndex) buildFiles(ctx context.Context, step uint64, bitmaps ma } ef.Build() buf = ef.AppendBytes(buf[:0]) - if err = comp.AddUncompressedWord(buf); err != nil { + if err = writer.AddWord(buf); err != nil { return InvertedFiles{}, fmt.Errorf("add %s val: %w", ii.filenameBase, err) } } @@ -1231,21 +1555,56 @@ func (ii *InvertedIndex) buildFiles(ctx context.Context, step uint64, bitmaps ma return InvertedFiles{}, fmt.Errorf("open %s decompressor: %w", ii.filenameBase, err) } - idxFileName := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, txNumFrom/ii.aggregationStep, txNumTo/ii.aggregationStep) + idxFileName := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, step, step+1) idxPath := filepath.Join(ii.dir, idxFileName) - p := ps.AddNew(idxFileName, uint64(decomp.Count()*2)) - defer ps.Delete(p) - if index, err = buildIndexThenOpen(ctx, decomp, idxPath, ii.tmpdir, len(keys), false /* values */, p, ii.logger, ii.noFsync); err != nil { + if index, err = buildIndexThenOpen(ctx, decomp, ii.compression, idxPath, ii.tmpdir, false, ii.salt, ps, ii.logger, ii.noFsync); err != nil { return InvertedFiles{}, fmt.Errorf("build %s efi: %w", ii.filenameBase, err) } + + if ii.withExistenceIndex { + idxFileName2 := fmt.Sprintf("%s.%d-%d.efei", ii.filenameBase, step, step+1) + idxPath2 := filepath.Join(ii.dir, idxFileName2) + if existence, err = buildIndexFilterThenOpen(ctx, decomp, ii.compression, idxPath2, ii.tmpdir, ii.salt, ps, ii.logger, ii.noFsync); err != nil { + return InvertedFiles{}, fmt.Errorf("build %s efei: %w", ii.filenameBase, err) + } + } + + warmLocality, err := ii.buildWarmLocality(ctx, decomp, step+1, ps) + if err != nil { + return InvertedFiles{}, fmt.Errorf("buildWarmLocality: %w", err) + } + closeComp = false - return 
InvertedFiles{decomp: decomp, index: index}, nil + return InvertedFiles{decomp: decomp, index: index, existence: existence, warmLocality: warmLocality}, nil +} + +func (ii *InvertedIndex) buildWarmLocality(ctx context.Context, decomp *compress.Decompressor, step uint64, ps *background.ProgressSet) (*LocalityIndexFiles, error) { + if !ii.withLocalityIndex { + return nil, nil + } + + ic := ii.MakeContext() // TODO: use existing context + defer ic.Close() + // Here we can make a choise: to index "cold non-indexed file" by warm locality index, or not? + // Let's don't index. Because: speed of new files build is very important - to speed-up pruning + fromStep, toStep := ic.minWarmStep(), step+1 + defer func() { + if ic.ii.filenameBase == AggTraceFileLife { + ii.logger.Warn(fmt.Sprintf("[agg] BuildWarmLocality done: %s.%d-%d", ii.filenameBase, fromStep, toStep)) + } + }() + return ii.warmLocalityIdx.buildFiles(ctx, fromStep, toStep, false, ps, func() *LocalityIterator { + return ic.iterateKeysLocality(ctx, fromStep, toStep, decomp) + }) } func (ii *InvertedIndex) integrateFiles(sf InvertedFiles, txNumFrom, txNumTo uint64) { + ii.warmLocalityIdx.integrateFiles(sf.warmLocality) + fi := newFilesItem(txNumFrom, txNumTo, ii.aggregationStep) fi.decompressor = sf.decomp fi.index = sf.index + fi.bloom = sf.existence ii.files.Set(fi) ii.reCalcRoFiles() @@ -1322,6 +1681,7 @@ func (ii *InvertedIndex) prune(ctx context.Context, txFrom, txTo, limit uint64, collector := etl.NewCollector("snapshots", ii.tmpdir, etl.NewOldestEntryBuffer(etl.BufferOptimalSize), ii.logger) defer collector.Close() + collector.LogLvl(log.LvlDebug) idxCForDeletes, err := ii.tx.RwCursorDupSort(ii.indexTable) if err != nil { @@ -1357,11 +1717,6 @@ func (ii *InvertedIndex) prune(ctx context.Context, txFrom, txTo, limit uint64, if err = ii.tx.Delete(ii.indexKeysTable, k); err != nil { return err } - select { - case <-ctx.Done(): - return ctx.Err() - default: - } } if err != nil { return fmt.Errorf("iterate over 
%s keys: %w", ii.filenameBase, err) @@ -1387,6 +1742,8 @@ func (ii *InvertedIndex) prune(ctx context.Context, txFrom, txTo, limit uint64, select { case <-logEvery.C: ii.logger.Info("[snapshots] prune history", "name", ii.filenameBase, "to_step", fmt.Sprintf("%.2f", float64(txTo)/float64(ii.aggregationStep)), "prefix", fmt.Sprintf("%x", key[:8])) + case <-ctx.Done(): + return ctx.Err() default: } } @@ -1410,43 +1767,6 @@ func (ii *InvertedIndex) DisableReadAhead() { }) } -func (ii *InvertedIndex) EnableReadAhead() *InvertedIndex { - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableReadAhead() - if item.index != nil { - item.index.EnableReadAhead() - } - } - return true - }) - return ii -} -func (ii *InvertedIndex) EnableMadvWillNeed() *InvertedIndex { - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableWillNeed() - if item.index != nil { - item.index.EnableWillNeed() - } - } - return true - }) - return ii -} -func (ii *InvertedIndex) EnableMadvNormalReadAhead() *InvertedIndex { - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - item.decompressor.EnableMadvNormal() - if item.index != nil { - item.index.EnableMadvNormal() - } - } - return true - }) - return ii -} - func (ii *InvertedIndex) collectFilesStat() (filesCount, filesSize, idxSize uint64) { if ii.files == nil { return 0, 0, 0 @@ -1458,7 +1778,8 @@ func (ii *InvertedIndex) collectFilesStat() (filesCount, filesSize, idxSize uint } filesSize += uint64(item.decompressor.Size()) idxSize += uint64(item.index.Size()) - filesCount += 2 + idxSize += uint64(item.bindex.Size()) + filesCount += 3 } return true }) @@ -1467,7 +1788,7 @@ func (ii *InvertedIndex) collectFilesStat() (filesCount, filesSize, idxSize uint func (ii *InvertedIndex) stepsRangeInDBAsStr(tx kv.Tx) string { a1, a2 := ii.stepsRangeInDB(tx) - return fmt.Sprintf("%s: %.1f-%.1f", ii.filenameBase, a1, a2) + return 
fmt.Sprintf("%s: %.1f", ii.filenameBase, a2-a1) } func (ii *InvertedIndex) stepsRangeInDB(tx kv.Tx) (from, to float64) { fst, _ := kv.FirstKey(tx, ii.indexKeysTable) @@ -1478,5 +1799,8 @@ func (ii *InvertedIndex) stepsRangeInDB(tx kv.Tx) (from, to float64) { if len(lst) > 0 { to = float64(binary.BigEndian.Uint64(lst)) / float64(ii.aggregationStep) } + if to == 0 { + to = from + } return from, to } diff --git a/state/inverted_index_test.go b/state/inverted_index_test.go index c23dcb5d0..219fc1e83 100644 --- a/state/inverted_index_test.go +++ b/state/inverted_index_test.go @@ -22,26 +22,27 @@ import ( "fmt" "math" "os" + "path/filepath" "testing" "time" "github.com/ledgerwatch/erigon-lib/common/background" - "github.com/ledgerwatch/erigon-lib/kv/iter" - "github.com/ledgerwatch/erigon-lib/kv/order" - "github.com/ledgerwatch/log/v3" - "github.com/stretchr/testify/require" - btree2 "github.com/tidwall/btree" - "github.com/ledgerwatch/erigon-lib/kv" + "github.com/ledgerwatch/erigon-lib/kv/iter" "github.com/ledgerwatch/erigon-lib/kv/mdbx" + "github.com/ledgerwatch/erigon-lib/kv/order" "github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" + "github.com/ledgerwatch/log/v3" + "github.com/stretchr/testify/require" ) -func testDbAndInvertedIndex(tb testing.TB, aggStep uint64, logger log.Logger) (string, kv.RwDB, *InvertedIndex) { +func testDbAndInvertedIndex(tb testing.TB, aggStep uint64, logger log.Logger) (kv.RwDB, *InvertedIndex) { tb.Helper() path := tb.TempDir() - tb.Cleanup(func() { os.RemoveAll(path) }) + dir := filepath.Join(path, "snapshots", "history") + require.NoError(tb, os.MkdirAll(filepath.Join(path, "snapshots", "warm"), 0740)) + require.NoError(tb, os.MkdirAll(dir, 0740)) keysTable := "Keys" indexTable := "Index" db := mdbx.NewMDBX(logger).InMem(path).WithTableCfg(func(defaultBuckets kv.TableCfg) kv.TableCfg { @@ -51,18 +52,20 @@ func testDbAndInvertedIndex(tb testing.TB, aggStep uint64, logger log.Logger) (s } 
}).MustOpen() tb.Cleanup(db.Close) - ii, err := NewInvertedIndex(path, path, aggStep, "inv" /* filenameBase */, keysTable, indexTable, false, nil, logger) + salt := uint32(1) + cfg := iiCfg{salt: &salt, dir: dir, tmpdir: dir} + ii, err := NewInvertedIndex(cfg, aggStep, "inv" /* filenameBase */, keysTable, indexTable, false, true, nil, logger) require.NoError(tb, err) ii.DisableFsync() tb.Cleanup(ii.Close) - return path, db, ii + return db, ii } func TestInvIndexCollationBuild(t *testing.T) { logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, ii := testDbAndInvertedIndex(t, 16, logger) + db, ii := testDbAndInvertedIndex(t, 16, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -85,6 +88,10 @@ func TestInvIndexCollationBuild(t *testing.T) { err = ii.Add([]byte("key3")) require.NoError(t, err) + ii.SetTxNum(17) + err = ii.Add([]byte("key10")) + require.NoError(t, err) + err = ii.Rotate().Flush(ctx, tx) require.NoError(t, err) err = tx.Commit() @@ -94,7 +101,7 @@ func TestInvIndexCollationBuild(t *testing.T) { require.NoError(t, err) defer roTx.Rollback() - bs, err := ii.collate(ctx, 0, 7, roTx) + bs, err := ii.collate(ctx, 0, 1, roTx) require.NoError(t, err) require.Equal(t, 3, len(bs)) require.Equal(t, []uint64{3}, bs["key2"].ToArray()) @@ -103,7 +110,6 @@ func TestInvIndexCollationBuild(t *testing.T) { sf, err := ii.buildFiles(ctx, 0, bs, background.NewProgressSet()) require.NoError(t, err) - defer sf.Close() g := sf.decomp.MakeGetter() g.Reset(0) @@ -137,7 +143,7 @@ func TestInvIndexAfterPrune(t *testing.T) { logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, ii := testDbAndInvertedIndex(t, 16, logger) + db, ii := testDbAndInvertedIndex(t, 16, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -173,7 +179,7 @@ func TestInvIndexAfterPrune(t *testing.T) { require.NoError(t, err) defer roTx.Rollback() - 
bs, err := ii.collate(ctx, 0, 16, roTx) + bs, err := ii.collate(ctx, 0, 1, roTx) require.NoError(t, err) sf, err := ii.buildFiles(ctx, 0, bs, background.NewProgressSet()) @@ -185,6 +191,10 @@ func TestInvIndexAfterPrune(t *testing.T) { ii.integrateFiles(sf, 0, 16) + from, to := ii.stepsRangeInDB(tx) + require.Equal(t, "0.1", fmt.Sprintf("%.1f", from)) + require.Equal(t, "0.4", fmt.Sprintf("%.1f", to)) + err = ii.prune(ctx, 0, 16, math.MaxUint64, logEvery) require.NoError(t, err) err = tx.Commit() @@ -203,16 +213,20 @@ func TestInvIndexAfterPrune(t *testing.T) { require.NoError(t, err) require.Nil(t, k, table) } + + from, to = ii.stepsRangeInDB(tx) + require.Equal(t, float64(0), from) + require.Equal(t, float64(0), to) } -func filledInvIndex(tb testing.TB, logger log.Logger) (string, kv.RwDB, *InvertedIndex, uint64) { +func filledInvIndex(tb testing.TB, logger log.Logger) (kv.RwDB, *InvertedIndex, uint64) { tb.Helper() return filledInvIndexOfSize(tb, uint64(1000), 16, 31, logger) } -func filledInvIndexOfSize(tb testing.TB, txs, aggStep, module uint64, logger log.Logger) (string, kv.RwDB, *InvertedIndex, uint64) { +func filledInvIndexOfSize(tb testing.TB, txs, aggStep, module uint64, logger log.Logger) (kv.RwDB, *InvertedIndex, uint64) { tb.Helper() - path, db, ii := testDbAndInvertedIndex(tb, aggStep, logger) + db, ii := testDbAndInvertedIndex(tb, aggStep, logger) ctx, require := context.Background(), require.New(tb) tx, err := db.BeginRw(ctx) require.NoError(err) @@ -249,7 +263,7 @@ func filledInvIndexOfSize(tb testing.TB, txs, aggStep, module uint64, logger log require.NoError(err) err = tx.Commit() require.NoError(err) - return path, db, ii, txs + return db, ii, txs } func checkRanges(t *testing.T, db kv.RwDB, ii *InvertedIndex, txs uint64) { @@ -347,7 +361,7 @@ func mergeInverted(tb testing.TB, db kv.RwDB, ii *InvertedIndex, txs uint64) { // Leave the last 2 aggregation steps un-collated for step := uint64(0); step < txs/ii.aggregationStep-1; step++ { func() { - 
bs, err := ii.collate(ctx, step*ii.aggregationStep, (step+1)*ii.aggregationStep, tx) + bs, err := ii.collate(ctx, step, step+1, tx) require.NoError(tb, err) sf, err := ii.buildFiles(ctx, step, bs, background.NewProgressSet()) require.NoError(tb, err) @@ -357,13 +371,13 @@ func mergeInverted(tb testing.TB, db kv.RwDB, ii *InvertedIndex, txs uint64) { var found bool var startTxNum, endTxNum uint64 maxEndTxNum := ii.endTxNumMinimax() - maxSpan := ii.aggregationStep * StepsInBiggestFile + maxSpan := ii.aggregationStep * StepsInColdFile for { if stop := func() bool { ic := ii.MakeContext() defer ic.Close() - found, startTxNum, endTxNum = ii.findMergeRange(maxEndTxNum, maxSpan) + found, startTxNum, endTxNum = ic.findMergeRange(maxEndTxNum, maxSpan) if !found { return true } @@ -387,7 +401,7 @@ func TestInvIndexRanges(t *testing.T) { logger := log.New() logEvery := time.NewTicker(30 * time.Second) defer logEvery.Stop() - _, db, ii, txs := filledInvIndex(t, logger) + db, ii, txs := filledInvIndex(t, logger) ctx := context.Background() tx, err := db.BeginRw(ctx) require.NoError(t, err) @@ -397,7 +411,7 @@ func TestInvIndexRanges(t *testing.T) { // Leave the last 2 aggregation steps un-collated for step := uint64(0); step < txs/ii.aggregationStep-1; step++ { func() { - bs, err := ii.collate(ctx, step*ii.aggregationStep, (step+1)*ii.aggregationStep, tx) + bs, err := ii.collate(ctx, step, step+1, tx) require.NoError(t, err) sf, err := ii.buildFiles(ctx, step, bs, background.NewProgressSet()) require.NoError(t, err) @@ -414,7 +428,7 @@ func TestInvIndexRanges(t *testing.T) { func TestInvIndexMerge(t *testing.T) { logger := log.New() - _, db, ii, txs := filledInvIndex(t, logger) + db, ii, txs := filledInvIndex(t, logger) mergeInverted(t, db, ii, txs) checkRanges(t, db, ii, txs) @@ -422,11 +436,14 @@ func TestInvIndexMerge(t *testing.T) { func TestInvIndexScanFiles(t *testing.T) { logger := log.New() - path, db, ii, txs := filledInvIndex(t, logger) + db, ii, txs := 
filledInvIndex(t, logger) + path := ii.dir // Recreate InvertedIndex to scan the files var err error - ii, err = NewInvertedIndex(path, path, ii.aggregationStep, ii.filenameBase, ii.indexKeysTable, ii.indexTable, false, nil, logger) + salt := uint32(1) + cfg := iiCfg{salt: &salt, dir: path, tmpdir: path} + ii, err = NewInvertedIndex(cfg, ii.aggregationStep, ii.filenameBase, ii.indexKeysTable, ii.indexTable, false, true, nil, logger) require.NoError(t, err) defer ii.Close() @@ -436,7 +453,7 @@ func TestInvIndexScanFiles(t *testing.T) { func TestChangedKeysIterator(t *testing.T) { logger := log.New() - _, db, ii, txs := filledInvIndex(t, logger) + db, ii, txs := filledInvIndex(t, logger) ctx := context.Background() mergeInverted(t, db, ii, txs) roTx, err := db.BeginRo(ctx) @@ -497,11 +514,7 @@ func TestChangedKeysIterator(t *testing.T) { } func TestScanStaticFiles(t *testing.T) { - logger := log.New() - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, - files: btree2.NewBTreeG[*filesItem](filesItemLess), - logger: logger, - } + ii := emptyTestInvertedIndex(1) files := []string{ "test.0-1.ef", "test.1-2.ef", @@ -521,11 +534,7 @@ func TestScanStaticFiles(t *testing.T) { } func TestCtxFiles(t *testing.T) { - logger := log.New() - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, - files: btree2.NewBTreeG[*filesItem](filesItemLess), - logger: logger, - } + ii := emptyTestInvertedIndex(1) files := []string{ "test.0-1.ef", // overlap with same `endTxNum=4` "test.1-2.ef", @@ -541,16 +550,16 @@ func TestCtxFiles(t *testing.T) { ii.scanStateFiles(files) require.Equal(t, 10, ii.files.Len()) - roFiles := ctxFiles(ii.files) + roFiles := ctxFiles(ii.files, true, false) for i, item := range roFiles { if item.src.canDelete.Load() { - require.Failf(t, "deleted file", "%d-%d", item.src.startTxNum, item.src.endTxNum) + require.Failf(t, "deleted file", "%d-%d", item.startTxNum, item.endTxNum) } if i == 0 { continue } if item.src.isSubsetOf(roFiles[i-1].src) 
|| roFiles[i-1].src.isSubsetOf(item.src) { - require.Failf(t, "overlaping files", "%d-%d, %d-%d", item.src.startTxNum, item.src.endTxNum, roFiles[i-1].src.startTxNum, roFiles[i-1].src.endTxNum) + require.Failf(t, "overlaping files", "%d-%d, %d-%d", item.startTxNum, item.endTxNum, roFiles[i-1].startTxNum, roFiles[i-1].endTxNum) } } require.Equal(t, 3, len(roFiles)) diff --git a/state/locality_index.go b/state/locality_index.go index 8d126a087..fb2dedfe8 100644 --- a/state/locality_index.go +++ b/state/locality_index.go @@ -21,15 +21,17 @@ import ( "container/heap" "context" "fmt" - "os" "path/filepath" "regexp" "strconv" "sync/atomic" - "time" + _ "github.com/FastFilter/xorfilter" "github.com/ledgerwatch/erigon-lib/common/assert" + "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/dir" + "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/etl" "github.com/ledgerwatch/erigon-lib/kv/bitmapdb" "github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/log/v3" @@ -37,7 +39,7 @@ import ( const LocalityIndexUint64Limit = 64 //bitmap spend 1 bit per file, stored as uint64 -// LocalityIndex - has info in which .ef files exists given key +// LocalityIndex - has info in which .ef or .kv files exists given key // Format: key -> bitmap(step_number_list) // step_number_list is list of .ef files where exists given key type LocalityIndex struct { @@ -45,36 +47,38 @@ type LocalityIndex struct { dir, tmpdir string // Directory where static files are created aggregationStep uint64 // immutable + salt *uint32 + // preferSmallerFiles forcing files like `32-40.l` have higher priority than `0-40.l`. 
+ // It's used by "warm data indexing": new small "warm index" created after old data + // merged and indexed by "cold index" + preferSmallerFiles bool + file *filesItem - bm *bitmapdb.FixedSizeBitmaps - roFiles atomic.Pointer[ctxItem] - roBmFile atomic.Pointer[bitmapdb.FixedSizeBitmaps] - logger log.Logger + roFiles atomic.Pointer[ctxItem] + logger log.Logger + + noFsync bool // fsync is enabled by default, but tests can manually disable } -func NewLocalityIndex( - dir, tmpdir string, - aggregationStep uint64, - filenameBase string, - logger log.Logger, -) (*LocalityIndex, error) { - li := &LocalityIndex{ - dir: dir, - tmpdir: tmpdir, - aggregationStep: aggregationStep, - filenameBase: filenameBase, - logger: logger, - } - return li, nil +func NewLocalityIndex(preferSmallerFiles bool, dir, filenameBase string, aggregationStep uint64, tmpdir string, salt *uint32, logger log.Logger) *LocalityIndex { + return &LocalityIndex{ + preferSmallerFiles: preferSmallerFiles, + dir: dir, + salt: salt, + tmpdir: tmpdir, + aggregationStep: aggregationStep, + filenameBase: filenameBase, + logger: logger, + } } func (li *LocalityIndex) closeWhatNotInList(fNames []string) { - if li == nil || li.bm == nil { + if li == nil || li.file == nil { return } for _, protectName := range fNames { - if li.bm.FileName() == protectName { + if li.file.bm.FileName() == protectName { return } } @@ -88,7 +92,7 @@ func (li *LocalityIndex) OpenList(fNames []string) error { li.closeWhatNotInList(fNames) _ = li.scanStateFiles(fNames) if err := li.openFiles(); err != nil { - return fmt.Errorf("NewHistory.openFiles: %s, %w", li.filenameBase, err) + return fmt.Errorf("LocalityIndex.openFiles: %s, %w", li.filenameBase, err) } return nil } @@ -98,7 +102,7 @@ func (li *LocalityIndex) scanStateFiles(fNames []string) (uselessFiles []*filesI return nil } - re := regexp.MustCompile("^" + li.filenameBase + ".([0-9]+)-([0-9]+).li$") + re := regexp.MustCompile("^" + li.filenameBase + ".([0-9]+)-([0-9]+).l$") var err 
error for _, name := range fNames { subs := re.FindStringSubmatch(name) @@ -122,21 +126,17 @@ func (li *LocalityIndex) scanStateFiles(fNames []string) (uselessFiles []*filesI continue } - if startStep != 0 { - li.logger.Warn("LocalityIndex must always starts from step 0") - continue - } - if endStep > StepsInBiggestFile*LocalityIndexUint64Limit { + if endStep-startStep > StepsInColdFile*LocalityIndexUint64Limit { li.logger.Warn("LocalityIndex does store bitmaps as uint64, means it can't handle > 2048 steps. But it's possible to implement") continue } startTxNum, endTxNum := startStep*li.aggregationStep, endStep*li.aggregationStep - if li.file == nil { - li.file = newFilesItem(startTxNum, endTxNum, li.aggregationStep) - li.file.frozen = false // LocalityIndex files are never frozen - } else if li.file.endTxNum < endTxNum { - uselessFiles = append(uselessFiles, li.file) + useThisFile := li.file == nil || + (li.file.endTxNum < endTxNum) || // newer + (li.preferSmallerFiles && li.file.endTxNum == endTxNum && li.file.startTxNum < startTxNum) || + (!li.preferSmallerFiles && li.file.startTxNum == startTxNum && li.file.endTxNum < endTxNum) + if useThisFile { li.file = newFilesItem(startTxNum, endTxNum, li.aggregationStep) li.file.frozen = false // LocalityIndex files are never frozen } @@ -150,10 +150,10 @@ func (li *LocalityIndex) openFiles() (err error) { } fromStep, toStep := li.file.startTxNum/li.aggregationStep, li.file.endTxNum/li.aggregationStep - if li.bm == nil { + if li.file.bm == nil { dataPath := filepath.Join(li.dir, fmt.Sprintf("%s.%d-%d.l", li.filenameBase, fromStep, toStep)) if dir.FileExist(dataPath) { - li.bm, err = bitmapdb.OpenFixedSizeBitmaps(dataPath, int((toStep-fromStep)/StepsInBiggestFile)) + li.file.bm, err = bitmapdb.OpenFixedSizeBitmaps(dataPath) if err != nil { return err } @@ -168,25 +168,42 @@ func (li *LocalityIndex) openFiles() (err error) { } } } + if li.file.bloom == nil { + idxPath := filepath.Join(li.dir, fmt.Sprintf("%s.%d-%d.li.lb", 
li.filenameBase, fromStep, toStep)) + if dir.FileExist(idxPath) { + li.file.bloom, err = OpenBloom(idxPath) + if err != nil { + return err + } + } + } li.reCalcRoFiles() return nil } func (li *LocalityIndex) closeFiles() { - if li == nil { + if li == nil || li.file == nil { return } - if li.file != nil && li.file.index != nil { + if li.file.index != nil { li.file.index.Close() - li.file = nil + li.file.index = nil + } + if li.file.bm != nil { + li.file.bm.Close() + li.file.bm = nil } - if li.bm != nil { - li.bm.Close() - li.bm = nil + if li.file.bloom != nil { + li.file.bloom = nil } } func (li *LocalityIndex) reCalcRoFiles() { - if li == nil || li.file == nil { + if li == nil { + return + } + + if li.file == nil { + li.roFiles.Store(nil) return } li.roFiles.Store(&ctxItem{ @@ -195,45 +212,39 @@ func (li *LocalityIndex) reCalcRoFiles() { i: 0, src: li.file, }) - li.roBmFile.Store(li.bm) } func (li *LocalityIndex) MakeContext() *ctxLocalityIdx { if li == nil { return nil } - x := &ctxLocalityIdx{ - file: li.roFiles.Load(), - bm: li.roBmFile.Load(), + file := li.roFiles.Load() + if file != nil && file.src != nil { + file.src.refcount.Add(1) } - if x.file != nil && x.file.src != nil { - x.file.src.refcount.Add(1) + return &ctxLocalityIdx{ + file: file, + aggregationStep: li.aggregationStep, } - return x } -func (out *ctxLocalityIdx) Close(logger log.Logger) { - if out == nil || out.file == nil || out.file.src == nil { +func (lc *ctxLocalityIdx) Close() { + if lc == nil || lc.file == nil || lc.file.src == nil { // invariant: it's safe to call Close multiple times return } - refCnt := out.file.src.refcount.Add(-1) - if refCnt == 0 && out.file.src.canDelete.Load() { - closeLocalityIndexFilesAndRemove(out, logger) + refCnt := lc.file.src.refcount.Add(-1) + if refCnt == 0 && lc.file.src.canDelete.Load() { + closeLocalityIndexFilesAndRemove(lc) } + lc.file = nil } -func closeLocalityIndexFilesAndRemove(i *ctxLocalityIdx, logger log.Logger) { - if i.file.src != nil { - 
i.file.src.closeFilesAndRemove() - i.file.src = nil - } - if i.bm != nil { - i.bm.Close() - if err := os.Remove(i.bm.FilePath()); err != nil { - logger.Trace("os.Remove", "err", err, "file", i.bm.FileName()) - } - i.bm = nil +func closeLocalityIndexFilesAndRemove(i *ctxLocalityIdx) { + if i.file == nil || i.file.src == nil { + return } + i.file.src.closeFilesAndRemove() + i.file.src = nil } func (li *LocalityIndex) Close() { @@ -250,102 +261,180 @@ func (li *LocalityIndex) NewIdxReader() *recsplit.IndexReader { // LocalityIndex return exactly 2 file (step) // prevents searching key in many files -func (li *LocalityIndex) lookupIdxFiles(loc *ctxLocalityIdx, key []byte, fromTxNum uint64) (exactShard1, exactShard2 uint64, lastIndexedTxNum uint64, ok1, ok2 bool) { - if li == nil || loc == nil || loc.bm == nil { +func (lc *ctxLocalityIdx) lookupIdxFiles(key []byte, fromTxNum uint64) (exactShard1, exactShard2 uint64, lastIndexedTxNum uint64, ok1, ok2 bool) { + if lc == nil || lc.file == nil { return 0, 0, 0, false, false } - if loc.reader == nil { - loc.reader = recsplit.NewIndexReader(loc.file.src.index) + if lc.reader == nil { + lc.reader = recsplit.NewIndexReader(lc.file.src.index) } - if fromTxNum >= loc.file.endTxNum { + if fromTxNum >= lc.file.endTxNum { return 0, 0, fromTxNum, false, false } - fromFileNum := fromTxNum / li.aggregationStep / StepsInBiggestFile - fn1, fn2, ok1, ok2, err := loc.bm.First2At(loc.reader.Lookup(key), fromFileNum) + fromFileNum := fromTxNum / lc.aggregationStep / StepsInColdFile + fn1, fn2, ok1, ok2, err := lc.file.src.bm.First2At(lc.reader.Lookup(key), fromFileNum) if err != nil { panic(err) } - return fn1 * StepsInBiggestFile, fn2 * StepsInBiggestFile, loc.file.endTxNum, ok1, ok2 + return fn1 * StepsInColdFile, fn2 * StepsInColdFile, lc.file.endTxNum, ok1, ok2 } -func (li *LocalityIndex) missedIdxFiles(ii *InvertedIndexContext) (toStep uint64, idxExists bool) { - if len(ii.files) == 0 { - return 0, true +// indexedTo - [from, to) +func 
(lc *ctxLocalityIdx) indexedTo() uint64 { + if lc == nil || lc.file == nil { + return 0 } - var item *ctxItem - for i := len(ii.files) - 1; i >= 0; i-- { - if ii.files[i].src.frozen { - item = &ii.files[i] - break - } + return lc.file.endTxNum +} +func (lc *ctxLocalityIdx) indexedFrom() (uint64, bool) { + if lc == nil || lc.file == nil { + return 0, false + } + return lc.file.startTxNum, true +} + +// lookupLatest return latest file (step) +// prevents searching key in many files +func (lc *ctxLocalityIdx) lookupLatest(key []byte) (latestShard uint64, ok bool, err error) { + if lc == nil || lc.file == nil || lc.file.src.index == nil { + return 0, false, nil + } + if lc.reader == nil { + lc.reader = recsplit.NewIndexReader(lc.file.src.index) } - if item != nil { - toStep = item.endTxNum / li.aggregationStep + if lc.reader.Empty() { + return 0, false, nil } - fName := fmt.Sprintf("%s.%d-%d.li", li.filenameBase, 0, toStep) - return toStep, dir.FileExist(filepath.Join(li.dir, fName)) + + hi, lo := lc.reader.Sum(key) + if lc.file.src.bloom != nil && !lc.file.src.bloom.ContainsHash(hi) { + return 0, false, nil + } + + //if bytes.HasPrefix(key, common.FromHex("f29a")) { + // res, _ := lc.file.src.bm.At(lc.reader.Lookup(key)) + // l, _, _ := lc.file.src.bm.LastAt(lc.reader.Lookup(key)) + // fmt.Printf("idx: %x, %d, last: %d\n", key, res, l) + //} + return lc.file.src.bm.LastAt(lc.reader.LookupHash(hi, lo)) } -func (li *LocalityIndex) buildFiles(ctx context.Context, ic *InvertedIndexContext, toStep uint64) (files *LocalityIndexFiles, err error) { - defer ic.ii.EnableMadvNormalReadAhead().DisableReadAhead() - logEvery := time.NewTicker(30 * time.Second) - defer logEvery.Stop() +func (li *LocalityIndex) exists(fromStep, toStep uint64) bool { + return dir.FileExist(filepath.Join(li.dir, fmt.Sprintf("%s.%d-%d.li", li.filenameBase, fromStep, toStep))) && + dir.FileExist(filepath.Join(li.dir, fmt.Sprintf("%s.%d-%d.li.lb", li.filenameBase, fromStep, toStep))) +} - fromStep := 
uint64(0) - count := 0 - it := ic.iterateKeysLocality(toStep * li.aggregationStep) - for it.HasNext() { - _, _ = it.Next() - count++ +func (li *LocalityIndex) buildFiles(ctx context.Context, fromStep, toStep uint64, convertStepsToFileNums bool, ps *background.ProgressSet, makeIter func() *LocalityIterator) (files *LocalityIndexFiles, err error) { + if li == nil { + return nil, nil + } + if toStep < fromStep { + return nil, fmt.Errorf("LocalityIndex.buildFiles: fromStep(%d) < toStep(%d)", fromStep, toStep) } fName := fmt.Sprintf("%s.%d-%d.li", li.filenameBase, fromStep, toStep) idxPath := filepath.Join(li.dir, fName) filePath := filepath.Join(li.dir, fmt.Sprintf("%s.%d-%d.l", li.filenameBase, fromStep, toStep)) + p := ps.AddNew(fName, uint64(1)) + defer ps.Delete(p) + + count := 0 + it := makeIter() + defer it.Close() + //if it.FilesAmount() == 1 { // optimization: no reason to create LocalityIndex for 1 file + // return nil, nil + //} + + for it.HasNext() { + _, _, _ = it.Next() + count++ + } + it.Close() + + p.Total.Store(uint64(count)) + rs, err := recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: count, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: li.tmpdir, - IndexFile: idxPath, + KeyCount: count, + Enums: false, + BucketSize: 2000, + LeafSize: 8, + TmpDir: li.tmpdir, + IndexFile: idxPath, + EtlBufLimit: etl.BufferOptimalSize / 2, + Salt: li.salt, }, li.logger) if err != nil { return nil, fmt.Errorf("create recsplit: %w", err) } defer rs.Close() rs.LogLvl(log.LvlTrace) + if li.noFsync { + rs.DisableFsync() + } - i := uint64(0) + //statelessHasher := murmur3.New128WithSeed(rs.Salt()) + var bloom *bloomFilter for { - dense, err := bitmapdb.NewFixedSizeBitmapsWriter(filePath, int(it.FilesAmount()), uint64(count), li.logger) + p.Processed.Store(0) + i := uint64(0) + maxPossibleValue := int(toStep - fromStep) + baseDataID := fromStep + if convertStepsToFileNums { + maxPossibleValue = int(it.FilesAmount()) + baseDataID = uint64(0) + } + dense, 
err := bitmapdb.NewFixedSizeBitmapsWriter(filePath, maxPossibleValue, baseDataID, uint64(count), li.logger) if err != nil { return nil, err } defer dense.Close() + if li.noFsync { + dense.DisableFsync() + } + + //if count > 0 { + // bloom, err = NewBloom(uint64(count), idxPath+".lb") + // if err != nil { + // return nil, err + // } + //} - it = ic.iterateKeysLocality(toStep * li.aggregationStep) + it = makeIter() + defer it.Close() for it.HasNext() { - k, inFiles := it.Next() - if err := dense.AddArray(i, inFiles); err != nil { + k, inSteps, err := it.Next() + if err != nil { return nil, err } - if err = rs.AddKey(k, 0); err != nil { - return nil, err + //if bytes.HasPrefix(k, common.FromHex("5e7d")) { + // fmt.Printf("build: %x, %d\n", k, inSteps) + //} + + if convertStepsToFileNums { + for j := range inSteps { + inSteps[j] = inSteps[j] / StepsInColdFile + } } - i++ - select { - case <-ctx.Done(): - return nil, ctx.Err() - case <-logEvery.C: - li.logger.Info("[LocalityIndex] build", "name", li.filenameBase, "progress", fmt.Sprintf("%.2f%%", 50+it.Progress()/2)) - default: + //statelessHasher.Reset() + //statelessHasher.Write(k) //nolint:errcheck + //hi, _ := statelessHasher.Sum128() + //bloom.AddHash(hi) + + //wrintf("buld: %x, %d, %d\n", k, i, inFiles) + if err := dense.AddArray(i, inSteps); err != nil { + return nil, err + } + if err = rs.AddKey(k, i); err != nil { + return nil, err } + i++ + p.Processed.Add(1) } + it.Close() if err := dense.Build(); err != nil { return nil, err @@ -353,7 +442,7 @@ func (li *LocalityIndex) buildFiles(ctx context.Context, ic *InvertedIndexContex if err = rs.Build(ctx); err != nil { if rs.Collision() { - li.logger.Debug("Building recsplit. Collision happened. It's ok. Restarting...") + li.logger.Warn("Building recsplit. Collision happened. It's ok. 
Restarting...") rs.ResetNextSalt() } else { return nil, fmt.Errorf("build idx: %w", err) @@ -363,51 +452,69 @@ func (li *LocalityIndex) buildFiles(ctx context.Context, ic *InvertedIndexContex } } + //if bloom != nil { + // if err := bloom.Build(); err != nil { + // return nil, err + // } + // bloom.Close() //TODO: move to defer, and move building and opennig to different funcs + //} + idx, err := recsplit.OpenIndex(idxPath) if err != nil { return nil, err } - bm, err := bitmapdb.OpenFixedSizeBitmaps(filePath, int(it.FilesAmount())) + bm, err := bitmapdb.OpenFixedSizeBitmaps(filePath) if err != nil { return nil, err } - return &LocalityIndexFiles{index: idx, bm: bm}, nil + //if dir.FileExist(idxPath + ".lb") { + // bloom, err = OpenBloom(idxPath + ".lb") + // if err != nil { + // return nil, err + // } + //} + return &LocalityIndexFiles{index: idx, bm: bm, bloom: bloom, fromStep: fromStep, toStep: toStep}, nil } -func (li *LocalityIndex) integrateFiles(sf LocalityIndexFiles, txNumFrom, txNumTo uint64) { +func (li *LocalityIndex) integrateFiles(sf *LocalityIndexFiles) { + if li == nil { + return + } if li.file != nil { li.file.canDelete.Store(true) } - li.file = &filesItem{ - startTxNum: txNumFrom, - endTxNum: txNumTo, - index: sf.index, - frozen: false, + if sf == nil { + return //TODO: support non-indexing of single file + //li.file = nil + //li.bm = nil + } else { + li.file = &filesItem{ + startTxNum: sf.fromStep * li.aggregationStep, + endTxNum: sf.toStep * li.aggregationStep, + index: sf.index, + bm: sf.bm, + bloom: sf.bloom, + frozen: false, + } } - li.bm = sf.bm li.reCalcRoFiles() } -func (li *LocalityIndex) BuildMissedIndices(ctx context.Context, ii *InvertedIndexContext) error { - if li == nil { - return nil - } - toStep, idxExists := li.missedIdxFiles(ii) - if idxExists || toStep == 0 { - return nil - } - fromStep := uint64(0) - f, err := li.buildFiles(ctx, ii, toStep) +func (li *LocalityIndex) BuildMissedIndices(ctx context.Context, fromStep, toStep 
uint64, convertStepsToFileNums bool, ps *background.ProgressSet, makeIter func() *LocalityIterator) error { + f, err := li.buildFiles(ctx, fromStep, toStep, convertStepsToFileNums, ps, makeIter) if err != nil { return err } - li.integrateFiles(*f, fromStep*li.aggregationStep, toStep*li.aggregationStep) + li.integrateFiles(f) return nil } type LocalityIndexFiles struct { index *recsplit.Index bm *bitmapdb.FixedSizeBitmaps + bloom *bloomFilter + + fromStep, toStep uint64 } func (sf LocalityIndexFiles) Close() { @@ -417,81 +524,115 @@ func (sf LocalityIndexFiles) Close() { if sf.bm != nil { sf.bm.Close() } + if sf.bloom != nil { + sf.bloom.Close() + } } type LocalityIterator struct { - hc *InvertedIndexContext - h ReconHeapOlderFirst - files, nextFiles []uint64 - key, nextKey []byte - progress uint64 - hasNext bool + aggStep uint64 + compressVals bool + h ReconHeapOlderFirst + v, nextV, vBackup []uint64 + k, nextK, kBackup []byte + progress uint64 totalOffsets, filesAmount uint64 + involvedFiles []*compress.Decompressor //used in destructor to disable read-ahead + ctx context.Context } func (si *LocalityIterator) advance() { for si.h.Len() > 0 { top := heap.Pop(&si.h).(*ReconItem) key := top.key - _, offset := top.g.NextUncompressed() + var offset uint64 + //if si.compressVals { + offset, _ = top.g.Skip() + //} else { + // offset, _ = top.g.SkipUncompressed() + //} si.progress += offset - top.lastOffset top.lastOffset = offset - inStep := uint32(top.startTxNum / si.hc.ii.aggregationStep) + inStep := top.startTxNum / si.aggStep if top.g.HasNext() { - top.key, _ = top.g.NextUncompressed() + top.key, _ = top.g.Next(nil) heap.Push(&si.h, top) } - inFile := inStep / StepsInBiggestFile - - if !bytes.Equal(key, si.key) { - if si.key == nil { - si.key = key - si.files = append(si.files, uint64(inFile)) - continue - } + if si.k == nil { + si.k = key + si.v = append(si.v, inStep) + continue + } - si.nextFiles, si.files = si.files, si.nextFiles[:0] - si.nextKey = si.key + if 
!bytes.Equal(key, si.k) { + si.nextV, si.v = si.v, si.nextV[:0] + si.nextK = si.k - si.files = append(si.files, uint64(inFile)) - si.key = key - si.hasNext = true + si.v = append(si.v, inStep) + si.k = key return } - si.files = append(si.files, uint64(inFile)) + si.v = append(si.v, inStep) } - si.nextFiles, si.files = si.files, si.nextFiles[:0] - si.nextKey = si.key - si.hasNext = false + si.nextV, si.v = si.v, si.nextV[:0] + si.nextK = si.k + si.k = nil } -func (si *LocalityIterator) HasNext() bool { return si.hasNext } +func (si *LocalityIterator) HasNext() bool { return si.nextK != nil } func (si *LocalityIterator) Progress() float64 { return (float64(si.progress) / float64(si.totalOffsets)) * 100 } func (si *LocalityIterator) FilesAmount() uint64 { return si.filesAmount } -func (si *LocalityIterator) Next() ([]byte, []uint64) { +func (si *LocalityIterator) Next() ([]byte, []uint64, error) { + select { + case <-si.ctx.Done(): + return nil, nil, si.ctx.Err() + default: + } + + //if hi.err != nil { + // return nil, nil, hi.err + //} + //hi.limit-- + + // Satisfy iter.Dual Invariant 2 + si.nextK, si.kBackup, si.nextV, si.vBackup = si.kBackup, si.nextK, si.vBackup, si.nextV si.advance() - return si.nextKey, si.nextFiles + return si.kBackup, si.vBackup, nil +} + +// Close - safe to call multiple times +func (si *LocalityIterator) Close() { + for _, f := range si.involvedFiles { + f.DisableReadAhead() + } + si.involvedFiles = nil } -func (ic *InvertedIndexContext) iterateKeysLocality(uptoTxNum uint64) *LocalityIterator { - si := &LocalityIterator{hc: ic} +// iterateKeysLocality [from, to) +func (ic *InvertedIndexContext) iterateKeysLocality(ctx context.Context, fromStep, toStep uint64, last *compress.Decompressor) *LocalityIterator { + fromTxNum, toTxNum := fromStep*ic.ii.aggregationStep, toStep*ic.ii.aggregationStep + si := &LocalityIterator{ctx: ctx, aggStep: ic.ii.aggregationStep, compressVals: false} + for _, item := range ic.files { - if !item.src.frozen || 
item.startTxNum > uptoTxNum { + if item.endTxNum <= fromTxNum || item.startTxNum >= toTxNum { continue } if assert.Enable { - if (item.endTxNum-item.startTxNum)/ic.ii.aggregationStep != StepsInBiggestFile { + if (item.endTxNum-item.startTxNum)/si.aggStep != StepsInColdFile { panic(fmt.Errorf("frozen file of small size: %s", item.src.decompressor.FileName())) } } - g := item.src.decompressor.MakeGetter() + item.src.decompressor.EnableReadAhead() // disable in destructor of iterator + si.involvedFiles = append(si.involvedFiles, item.src.decompressor) + + g := NewArchiveGetter(item.src.decompressor.MakeGetter(), ic.ii.compression) if g.HasNext() { - key, offset := g.NextUncompressed() + key, offset := g.Next(nil) heapItem := &ReconItem{startTxNum: item.startTxNum, endTxNum: item.endTxNum, g: g, txNum: ^item.endTxNum, key: key, startOffset: offset, lastOffset: offset} heap.Push(&si.h, heapItem) @@ -499,6 +640,23 @@ func (ic *InvertedIndexContext) iterateKeysLocality(uptoTxNum uint64) *LocalityI si.totalOffsets += uint64(g.Size()) si.filesAmount++ } + + if last != nil { + //add last one + last.EnableReadAhead() // disable in destructor of iterator + si.involvedFiles = append(si.involvedFiles, last) + g := NewArchiveGetter(last.MakeGetter(), ic.ii.compression) + if g.HasNext() { + key, offset := g.Next(nil) + + startTxNum, endTxNum := (toStep-1)*ic.ii.aggregationStep, toStep*ic.ii.aggregationStep + heapItem := &ReconItem{startTxNum: startTxNum, endTxNum: endTxNum, g: g, txNum: ^endTxNum, key: key, startOffset: offset, lastOffset: offset} + heap.Push(&si.h, heapItem) + } + si.totalOffsets += uint64(g.Size()) + si.filesAmount++ + } + si.advance() return si } diff --git a/state/locality_index_test.go b/state/locality_index_test.go index ef7049697..9a6c07ee5 100644 --- a/state/locality_index_test.go +++ b/state/locality_index_test.go @@ -3,26 +3,53 @@ package state import ( "context" "encoding/binary" - "math" - "sync/atomic" + "fmt" "testing" "github.com/ledgerwatch/log/v3" 
"github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" + + "github.com/ledgerwatch/erigon-lib/common/background" + "github.com/ledgerwatch/erigon-lib/common/hexutility" ) -func BenchmarkName2(b *testing.B) { - b.Run("1", func(b *testing.B) { - j := atomic.Int32{} - for i := 0; i < b.N; i++ { - j.Add(1) +func TestScanStaticFilesLocality(t *testing.T) { + + t.Run("new", func(t *testing.T) { + ii := emptyTestInvertedIndex(1) + ii.enableLocalityIndex() + files := []string{ + "test.0-1.l", + "test.1-2.l", + "test.0-4.l", + "test.2-3.l", + "test.3-4.l", + "test.4-5.l", } + ii.warmLocalityIdx.scanStateFiles(files) + require.Equal(t, 4, int(ii.warmLocalityIdx.file.startTxNum)) + require.Equal(t, 5, int(ii.warmLocalityIdx.file.endTxNum)) + ii.coldLocalityIdx.scanStateFiles(files) + require.Equal(t, 4, int(ii.coldLocalityIdx.file.startTxNum)) + require.Equal(t, 5, int(ii.coldLocalityIdx.file.endTxNum)) }) - b.Run("2", func(b *testing.B) { - j := &atomic.Int32{} - for i := 0; i < b.N; i++ { - j.Add(1) - } + t.Run("overlap", func(t *testing.T) { + ii := emptyTestInvertedIndex(1) + ii.enableLocalityIndex() + ii.warmLocalityIdx.scanStateFiles([]string{ + "test.0-50.l", + "test.0-70.l", + "test.64-70.l", + }) + require.Equal(t, 64, int(ii.warmLocalityIdx.file.startTxNum)) + require.Equal(t, 70, int(ii.warmLocalityIdx.file.endTxNum)) + ii.coldLocalityIdx.scanStateFiles([]string{ + "test.0-32.l", + "test.0-64.l", + }) + require.Equal(t, 0, int(ii.coldLocalityIdx.file.startTxNum)) + require.Equal(t, 64, int(ii.coldLocalityIdx.file.endTxNum)) }) } @@ -30,52 +57,63 @@ func TestLocality(t *testing.T) { logger := log.New() ctx, require := context.Background(), require.New(t) const Module uint64 = 31 - path, db, ii, txs := filledInvIndexOfSize(t, 300, 4, Module, logger) + aggStep := uint64(4) + coldFiles := uint64(2) + db, ii, txs := filledInvIndexOfSize(t, 300, aggStep, Module, logger) mergeInverted(t, db, ii, txs) - ic := ii.MakeContext() - defer ic.Close() - li, _ := 
NewLocalityIndex(path, path, 4, "inv", logger) - defer li.Close() - err := li.BuildMissedIndices(ctx, ic) - require.NoError(err) + + { //prepare + ii.withLocalityIndex = true + require.NoError(ii.enableLocalityIndex()) + + ic := ii.MakeContext() + g := &errgroup.Group{} + ii.BuildMissedIndices(ctx, g, background.NewProgressSet()) + require.NoError(g.Wait()) + require.NoError(ic.BuildOptionalMissedIndices(ctx, background.NewProgressSet())) + ic.Close() + } + t.Run("locality iterator", func(t *testing.T) { ic := ii.MakeContext() defer ic.Close() - it := ic.iterateKeysLocality(math.MaxUint64) + it := ic.iterateKeysLocality(ctx, 0, coldFiles*StepsInColdFile, nil) require.True(it.HasNext()) - key, bitmap := it.Next() - require.Equal(uint64(2), binary.BigEndian.Uint64(key)) - require.Equal([]uint64{0, 1}, bitmap) + key, bitmap, _ := it.Next() + require.Equal(uint64(1), binary.BigEndian.Uint64(key)) + require.Equal([]uint64{0 * StepsInColdFile, 1 * StepsInColdFile}, bitmap) require.True(it.HasNext()) - key, bitmap = it.Next() - require.Equal(uint64(3), binary.BigEndian.Uint64(key)) - require.Equal([]uint64{0, 1}, bitmap) + key, bitmap, _ = it.Next() + require.Equal(uint64(2), binary.BigEndian.Uint64(key)) + require.Equal([]uint64{0 * StepsInColdFile, 1 * StepsInColdFile}, bitmap) var last []byte for it.HasNext() { - key, _ = it.Next() + key, _, _ = it.Next() last = key } require.Equal(Module, binary.BigEndian.Uint64(last)) }) - files, err := li.buildFiles(ctx, ic, ii.endTxNumMinimax()/ii.aggregationStep) - require.NoError(err) - defer files.Close() - t.Run("locality index: get full bitamp", func(t *testing.T) { - res, err := files.bm.At(0) + t.Run("locality index: getBeforeTxNum full bitamp", func(t *testing.T) { + ic := ii.MakeContext() + defer ic.Close() + + res, err := ic.coldLocality.file.src.bm.At(0) require.NoError(err) require.Equal([]uint64{0, 1}, res) - res, err = files.bm.At(1) + res, err = ic.coldLocality.file.src.bm.At(1) require.NoError(err) 
require.Equal([]uint64{0, 1}, res) - res, err = files.bm.At(32) //too big, must error + res, err = ic.coldLocality.file.src.bm.At(32) //too big, must error require.Error(err) require.Empty(res) }) t.Run("locality index: search from given position", func(t *testing.T) { - fst, snd, ok1, ok2, err := files.bm.First2At(0, 1) + ic := ii.MakeContext() + defer ic.Close() + fst, snd, ok1, ok2, err := ic.coldLocality.file.src.bm.First2At(0, 1) require.NoError(err) require.True(ok1) require.False(ok2) @@ -83,7 +121,9 @@ func TestLocality(t *testing.T) { require.Zero(snd) }) t.Run("locality index: search from given position in future", func(t *testing.T) { - fst, snd, ok1, ok2, err := files.bm.First2At(0, 2) + ic := ii.MakeContext() + defer ic.Close() + fst, snd, ok1, ok2, err := ic.coldLocality.file.src.bm.First2At(0, 2) require.NoError(err) require.False(ok1) require.False(ok2) @@ -91,15 +131,233 @@ func TestLocality(t *testing.T) { require.Zero(snd) }) t.Run("locality index: lookup", func(t *testing.T) { - liCtx := li.MakeContext() - defer liCtx.Close(logger) - var k [8]byte - binary.BigEndian.PutUint64(k[:], 1) - v1, v2, from, ok1, ok2 := li.lookupIdxFiles(liCtx, k[:], 1*li.aggregationStep*StepsInBiggestFile) + ic := ii.MakeContext() + defer ic.Close() + k := hexutility.EncodeTs(1) + v1, v2, from, ok1, ok2 := ic.coldLocality.lookupIdxFiles(k, 1*ic.ii.aggregationStep*StepsInColdFile) require.True(ok1) require.False(ok2) - require.Equal(uint64(1*StepsInBiggestFile), v1) - require.Equal(uint64(0*StepsInBiggestFile), v2) - require.Equal(2*li.aggregationStep*StepsInBiggestFile, from) + require.Equal(uint64(1*StepsInColdFile), v1) + require.Equal(uint64(0*StepsInColdFile), v2) + require.Equal(2*ic.ii.aggregationStep*StepsInColdFile, from) + }) +} + +func TestLocalityDomain(t *testing.T) { + UseBpsTree = true + logger := log.New() + ctx, require := context.Background(), require.New(t) + aggStep := 2 + coldFiles := 3 + coldSteps := coldFiles * StepsInColdFile + txsInColdFile := 
aggStep * StepsInColdFile + keyCount, txCount := uint64(6), coldFiles*txsInColdFile+aggStep*16 + db, dom, data := filledDomainFixedSize(t, keyCount, uint64(txCount), uint64(aggStep), logger) + collateAndMerge(t, db, nil, dom, uint64(txCount)) + + { //prepare + dom.withLocalityIndex = true + require.NoError(dom.enableLocalityIndex()) + + dc := dom.MakeContext() + g := &errgroup.Group{} + dom.BuildMissedIndices(ctx, g, background.NewProgressSet()) + require.NoError(g.Wait()) + err := dc.BuildOptionalMissedIndices(ctx, background.NewProgressSet()) + require.NoError(err) + dc.Close() + } + + _, _ = ctx, data + t.Run("locality iterator", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + require.Equal(0, int(dc.maxColdStep())) // domains have no cold files + var last []byte + + it := dc.hc.ic.iterateKeysLocality(ctx, 0, uint64(coldSteps), nil) + require.True(it.HasNext()) + key, bitmap, _ := it.Next() + require.Equal(uint64(0), binary.BigEndian.Uint64(key)) + require.Equal([]uint64{0 * StepsInColdFile}, bitmap) + require.True(it.HasNext()) + key, bitmap, _ = it.Next() + require.Equal(uint64(1), binary.BigEndian.Uint64(key)) + require.Equal([]uint64{1 * StepsInColdFile, 2 * StepsInColdFile}, bitmap) + + for it.HasNext() { + last, _, _ = it.Next() + } + require.Equal(coldFiles-1, int(binary.BigEndian.Uint64(last))) + + it = dc.hc.ic.iterateKeysLocality(ctx, dc.hc.ic.maxColdStep(), dc.hc.ic.maxWarmStep()+1, nil) + require.True(it.HasNext()) + key, bitmap, _ = it.Next() + require.Equal(2, int(binary.BigEndian.Uint64(key))) + require.Equal([]uint64{uint64(coldSteps), uint64(coldSteps + 8), uint64(coldSteps + 8 + 4), uint64(coldSteps + 8 + 4 + 2)}, bitmap) + require.True(it.HasNext()) + key, bitmap, _ = it.Next() + require.Equal(3, int(binary.BigEndian.Uint64(key))) + require.Equal([]uint64{uint64(coldSteps), uint64(coldSteps + 8), uint64(coldSteps + 8 + 4), uint64(coldSteps + 8 + 4 + 2)}, bitmap) + + last = nil + for it.HasNext() { + last, _, _ = it.Next() + 
} + require.Equal(int(keyCount-1), int(binary.BigEndian.Uint64(last))) + + }) + + t.Run("locality index: bitmap all data check", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + res, err := dc.hc.ic.coldLocality.file.src.bm.At(0) + require.NoError(err) + require.Equal([]uint64{0}, res) + res, err = dc.hc.ic.coldLocality.file.src.bm.At(1) + require.NoError(err) + require.Equal([]uint64{1, 2}, res) + res, err = dc.hc.ic.coldLocality.file.src.bm.At(keyCount) //too big, must error + require.Error(err) + require.Empty(res) + }) + + t.Run("locality index: search from given position", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + fst, snd, ok1, ok2, err := dc.hc.ic.coldLocality.file.src.bm.First2At(1, 1) + require.NoError(err) + require.True(ok1) + require.True(ok2) + require.Equal(1, int(fst)) + require.Equal(2, int(snd)) + + fst, snd, ok1, ok2, err = dc.hc.ic.coldLocality.file.src.bm.First2At(1, 2) + require.NoError(err) + require.True(ok1) + require.False(ok2) + require.Equal(2, int(fst)) + require.Equal(0, int(snd)) + + fst, snd, ok1, ok2, err = dc.hc.ic.coldLocality.file.src.bm.First2At(2, 1) + require.NoError(err) + require.True(ok1) + require.False(ok2) + require.Equal(uint64(2), fst) + require.Zero(snd) + + _, _, ok1, ok2, err = dc.hc.ic.coldLocality.file.src.bm.First2At(0, 1) + require.NoError(err) + require.False(ok1) + require.False(ok2) + }) + t.Run("locality index: bitmap operations", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + _, _, ok1, ok2, err := dc.hc.ic.coldLocality.file.src.bm.First2At(0, 2) + require.NoError(err) + require.False(ok1) + require.False(ok2) + + _, _, ok1, ok2, err = dc.hc.ic.coldLocality.file.src.bm.First2At(2, 3) + require.NoError(err) + require.False(ok1) + require.False(ok2) + + v1, ok1, err := dc.hc.ic.coldLocality.file.src.bm.LastAt(0) + require.NoError(err) + require.True(ok1) + require.Equal(0, int(v1)) + + v1, ok1, err = dc.hc.ic.coldLocality.file.src.bm.LastAt(1) + 
require.NoError(err) + require.True(ok1) + require.Equal(2, int(v1)) + + _, ok1, err = dc.hc.ic.coldLocality.file.src.bm.LastAt(3) + require.NoError(err) + require.False(ok1) + }) + t.Run("locality index: lookup", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + to := dc.hc.ic.coldLocality.indexedTo() + require.Equal(coldFiles*txsInColdFile, int(to)) + + v1, _, from, ok1, ok2 := dc.hc.ic.coldLocality.lookupIdxFiles(hexutility.EncodeTs(0), 0) + require.True(ok1) + require.False(ok2) + require.Equal(uint64(0*StepsInColdFile), v1) + require.Equal(txsInColdFile*coldFiles, int(from)) + + v1, v2, from, ok1, ok2 := dc.hc.ic.coldLocality.lookupIdxFiles(hexutility.EncodeTs(1), 0) + require.True(ok1) + require.True(ok2) + require.Equal(uint64(1*StepsInColdFile), v1) + require.Equal(uint64(2*StepsInColdFile), v2) + require.Equal(txsInColdFile*coldFiles, int(from)) + }) + t.Run("locality index to kv file", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + + for _, f := range dc.files { + g := NewArchiveGetter(f.src.decompressor.MakeGetter(), dc.d.compression) + + for g.HasNext() { + k, _ := g.Next(nil) + g.Skip() // v + + coveredByWarmIdx := f.isSubsetOf(dc.hc.ic.warmLocality.file) + if coveredByWarmIdx { + exactStep, ok, err := dc.hc.ic.warmLocality.lookupLatest(k) + require.NoError(err) + require.True(ok) + comment := fmt.Sprintf("files: %s, %s", f.src.decompressor.FileName(), dc.hc.ic.warmLocality.file.src.bm.FileName()) + exactTxNum := exactStep * dc.d.aggregationStep + require.LessOrEqual(f.startTxNum, exactTxNum, comment) + } + + coveredByColdIdx := f.isSubsetOf(dc.hc.ic.coldLocality.file) + if coveredByColdIdx { + exactSuperStep, ok, err := dc.hc.ic.coldLocality.lookupLatest(k) + require.NoError(err) + require.True(ok) + exactTxNum := exactSuperStep * StepsInColdFile * dc.d.aggregationStep + comment := fmt.Sprintf("files: %s, %s", f.src.decompressor.FileName(), dc.hc.ic.coldLocality.file.src.bm.FileName()) + 
require.GreaterOrEqual(dc.hc.ic.coldLocality.file.endTxNum, exactTxNum, comment) + require.LessOrEqual(f.startTxNum, exactTxNum, comment) + } + } + } + }) + + t.Run("domain.getLatestFromFiles", func(t *testing.T) { + dc := dom.MakeContext() + defer dc.Close() + fmt.Printf("--case0\n") + v, ok, err := dc.getLatestFromFiles(hexutility.EncodeTs(0)) + require.NoError(err) + require.True(ok) + require.Equal(1*txsInColdFile-1, int(binary.BigEndian.Uint64(v))) + + fmt.Printf("--case1\n") + v, ok, err = dc.getLatestFromFiles(hexutility.EncodeTs(1)) + require.NoError(err) + require.NotNil(v) + require.True(ok) + require.Equal(3*txsInColdFile-1, int(binary.BigEndian.Uint64(v))) + + fmt.Printf("--case2\n") + v, ok, err = dc.getLatestFromFiles(hexutility.EncodeTs(2)) + require.NoError(err) + require.True(ok) + require.Equal(221, int(binary.BigEndian.Uint64(v))) + + fmt.Printf("--case5\n") + v, ok, err = dc.getLatestFromFiles(hexutility.EncodeTs(5)) + require.NoError(err) + require.True(ok) + require.Equal(221, int(binary.BigEndian.Uint64(v))) }) } diff --git a/state/merge.go b/state/merge.go index 1a45bb550..41254e5fa 100644 --- a/state/merge.go +++ b/state/merge.go @@ -22,16 +22,17 @@ import ( "context" "encoding/binary" "fmt" - "os" "path/filepath" "strings" - "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/log/v3" "github.com/ledgerwatch/erigon-lib/common" + "github.com/ledgerwatch/erigon-lib/common/background" "github.com/ledgerwatch/erigon-lib/common/cmp" + "github.com/ledgerwatch/erigon-lib/common/dir" "github.com/ledgerwatch/erigon-lib/compress" + "github.com/ledgerwatch/erigon-lib/etl" "github.com/ledgerwatch/erigon-lib/recsplit" "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" ) @@ -105,24 +106,26 @@ type DomainRanges struct { values bool history bool index bool + + aggStep uint64 } func (r DomainRanges) String() string { var b strings.Builder if r.values { - b.WriteString(fmt.Sprintf("Values: [%d, %d)", r.valuesStartTxNum, 
r.valuesEndTxNum)) + b.WriteString(fmt.Sprintf("val:%d-%d", r.valuesStartTxNum/r.aggStep, r.valuesEndTxNum/r.aggStep)) } if r.history { if b.Len() > 0 { b.WriteString(", ") } - b.WriteString(fmt.Sprintf("History: [%d, %d)", r.historyStartTxNum, r.historyEndTxNum)) + b.WriteString(fmt.Sprintf("hist:%d-%d", r.historyStartTxNum/r.aggStep, r.historyEndTxNum/r.aggStep)) } if r.index { if b.Len() > 0 { b.WriteString(", ") } - b.WriteString(fmt.Sprintf("Index: [%d, %d)", r.indexStartTxNum, r.indexEndTxNum)) + b.WriteString(fmt.Sprintf("idx:%d-%d", r.indexStartTxNum/r.aggStep, r.indexEndTxNum/r.aggStep)) } return b.String() } @@ -131,10 +134,13 @@ func (r DomainRanges) any() bool { return r.values || r.history || r.index } -// findMergeRange assumes that all fTypes in d.files have items at least as far as maxEndTxNum +// findMergeRange +// assumes that all fTypes in d.files have items at least as far as maxEndTxNum // That is why only Values type is inspected -func (d *Domain) findMergeRange(maxEndTxNum, maxSpan uint64) DomainRanges { - hr := d.History.findMergeRange(maxEndTxNum, maxSpan) +// +// As any other methods of DomainContext - it can't see any files overlaps or garbage +func (dc *DomainContext) findMergeRange(maxEndTxNum, maxSpan uint64) DomainRanges { + hr := dc.hc.findMergeRange(maxEndTxNum, maxSpan) r := DomainRanges{ historyStartTxNum: hr.historyStartTxNum, historyEndTxNum: hr.historyEndTxNum, @@ -142,26 +148,66 @@ func (d *Domain) findMergeRange(maxEndTxNum, maxSpan uint64) DomainRanges { indexStartTxNum: hr.indexStartTxNum, indexEndTxNum: hr.indexEndTxNum, index: hr.index, + aggStep: dc.d.aggregationStep, } - d.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.endTxNum > maxEndTxNum { - return false + for _, item := range dc.files { + if item.endTxNum > maxEndTxNum { + break + } + endStep := item.endTxNum / dc.d.aggregationStep + spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, 
this corresponds to size of maximally possible merge ending at endStep + span := spanStep * dc.d.aggregationStep + start := item.endTxNum - span + if start < item.startTxNum { + if !r.values || start < r.valuesStartTxNum { + r.values = true + r.valuesStartTxNum = start + r.valuesEndTxNum = item.endTxNum } - endStep := item.endTxNum / d.aggregationStep - spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, this corresponds to size of maximally possible merge ending at endStep - span := cmp.Min(spanStep*d.aggregationStep, maxSpan) - start := item.endTxNum - span - if start < item.startTxNum { - if !r.values || start < r.valuesStartTxNum { - r.values = true - r.valuesStartTxNum = start - r.valuesEndTxNum = item.endTxNum - } + } + } + return r +} + +func (hc *HistoryContext) findMergeRange(maxEndTxNum, maxSpan uint64) HistoryRanges { + var r HistoryRanges + r.index, r.indexStartTxNum, r.indexEndTxNum = hc.ic.findMergeRange(maxEndTxNum, maxSpan) + for _, item := range hc.files { + if item.endTxNum > maxEndTxNum { + continue + } + endStep := item.endTxNum / hc.h.aggregationStep + spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, this corresponds to size of maximally possible merge ending at endStep + span := cmp.Min(spanStep*hc.h.aggregationStep, maxSpan) + start := item.endTxNum - span + foundSuperSet := r.indexStartTxNum == item.startTxNum && item.endTxNum >= r.historyEndTxNum + if foundSuperSet { + r.history = false + r.historyStartTxNum = start + r.historyEndTxNum = item.endTxNum + } else if start < item.startTxNum { + if !r.history || start < r.historyStartTxNum { + r.history = true + r.historyStartTxNum = start + r.historyEndTxNum = item.endTxNum } } - return true - }) + } + + if r.history && r.index { + // history is behind idx: then merge only history + historyIsAgead := r.historyEndTxNum > r.indexEndTxNum + if historyIsAgead { + r.history, r.historyStartTxNum, 
r.historyEndTxNum = false, 0, 0 + return r + } + + historyIsBehind := r.historyEndTxNum < r.indexEndTxNum + if historyIsBehind { + r.index, r.indexStartTxNum, r.indexEndTxNum = false, 0, 0 + return r + } + } return r } @@ -172,67 +218,31 @@ func (d *Domain) findMergeRange(maxEndTxNum, maxSpan uint64) DomainRanges { // 0-1,1-2,2-3: allow merge 0-2 // // 0-2,2-3: nothing to merge -func (ii *InvertedIndex) findMergeRange(maxEndTxNum, maxSpan uint64) (bool, uint64, uint64) { +func (ic *InvertedIndexContext) findMergeRange(maxEndTxNum, maxSpan uint64) (bool, uint64, uint64) { var minFound bool var startTxNum, endTxNum uint64 - ii.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.endTxNum > maxEndTxNum { - continue - } - endStep := item.endTxNum / ii.aggregationStep - spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, this corresponds to size of maximally possible merge ending at endStep - span := cmp.Min(spanStep*ii.aggregationStep, maxSpan) - start := item.endTxNum - span - foundSuperSet := startTxNum == item.startTxNum && item.endTxNum >= endTxNum - if foundSuperSet { - minFound = false + for _, item := range ic.files { + if item.endTxNum > maxEndTxNum { + continue + } + endStep := item.endTxNum / ic.ii.aggregationStep + spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, this corresponds to size of maximally possible merge ending at endStep + span := cmp.Min(spanStep*ic.ii.aggregationStep, maxSpan) + start := item.endTxNum - span + foundSuperSet := startTxNum == item.startTxNum && item.endTxNum >= endTxNum + if foundSuperSet { + minFound = false + startTxNum = start + endTxNum = item.endTxNum + } else if start < item.startTxNum { + if !minFound || start < startTxNum { + minFound = true startTxNum = start endTxNum = item.endTxNum - } else if start < item.startTxNum { - if !minFound || start < startTxNum { - minFound = true - startTxNum = start - 
endTxNum = item.endTxNum - } } } - return true - }) - return minFound, startTxNum, endTxNum -} - -func (ii *InvertedIndex) mergeRangesUpTo(ctx context.Context, maxTxNum, maxSpan uint64, workers int, ictx *InvertedIndexContext, ps *background.ProgressSet) (err error) { - closeAll := true - for updated, startTx, endTx := ii.findMergeRange(maxSpan, maxTxNum); updated; updated, startTx, endTx = ii.findMergeRange(maxTxNum, maxSpan) { - staticFiles, _ := ictx.staticFilesInRange(startTx, endTx) - defer func() { - if closeAll { - for _, i := range staticFiles { - i.decompressor.Close() - i.index.Close() - } - } - }() - - mergedIndex, err := ii.mergeFiles(ctx, staticFiles, startTx, endTx, workers, ps) - if err != nil { - return err - } - defer func() { - if closeAll { - mergedIndex.decompressor.Close() - mergedIndex.index.Close() - } - }() - - ii.integrateMergedFiles(staticFiles, mergedIndex) - if mergedIndex.frozen { - ii.cleanAfterFreeze(mergedIndex.endTxNum) - } } - closeAll = false - return nil + return minFound, startTxNum, endTxNum } type HistoryRanges struct { @@ -258,49 +268,97 @@ func (r HistoryRanges) any() bool { return r.history || r.index } -func (h *History) findMergeRange(maxEndTxNum, maxSpan uint64) HistoryRanges { - var r HistoryRanges - r.index, r.indexStartTxNum, r.indexEndTxNum = h.InvertedIndex.findMergeRange(maxEndTxNum, maxSpan) - h.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.endTxNum > maxEndTxNum { - continue - } - endStep := item.endTxNum / h.aggregationStep - spanStep := endStep & -endStep // Extract rightmost bit in the binary representation of endStep, this corresponds to size of maximally possible merge ending at endStep - span := cmp.Min(spanStep*h.aggregationStep, maxSpan) - start := item.endTxNum - span - foundSuperSet := r.indexStartTxNum == item.startTxNum && item.endTxNum >= r.historyEndTxNum - if foundSuperSet { - r.history = false - r.historyStartTxNum = start - r.historyEndTxNum = item.endTxNum - 
} else if start < item.startTxNum { - if !r.history || start < r.historyStartTxNum { - r.history = true - r.historyStartTxNum = start - r.historyEndTxNum = item.endTxNum - } +func (dc *DomainContext) BuildOptionalMissedIndices(ctx context.Context, ps *background.ProgressSet) (err error) { + if err := dc.hc.ic.BuildOptionalMissedIndices(ctx, ps); err != nil { + return err + } + return nil +} + +func (ic *InvertedIndexContext) BuildOptionalMissedIndices(ctx context.Context, ps *background.ProgressSet) (err error) { + if ic.ii.withLocalityIndex && ic.ii.coldLocalityIdx != nil { + from, to := uint64(0), ic.maxColdStep() + if to == 0 || ic.ii.coldLocalityIdx.exists(from, to) { + return nil + } + defer func() { + if ic.ii.filenameBase == AggTraceFileLife { + ic.ii.logger.Warn(fmt.Sprintf("[agg] BuildColdLocality done: %s.%d-%d", ic.ii.filenameBase, from, to)) } + }() + if err = ic.ii.coldLocalityIdx.BuildMissedIndices(ctx, from, to, true, ps, + func() *LocalityIterator { return ic.iterateKeysLocality(ctx, from, to, nil) }, + ); err != nil { + return err } - return true - }) + } + return nil +} - if r.history && r.index { - // history is behind idx: then merge only history - historyIsAgead := r.historyEndTxNum > r.indexEndTxNum - if historyIsAgead { - r.history, r.historyStartTxNum, r.historyEndTxNum = false, 0, 0 - return r +func (dc *DomainContext) maxColdStep() uint64 { + return dc.maxTxNumInFiles(true) / dc.d.aggregationStep +} +func (ic *InvertedIndexContext) maxColdStep() uint64 { + return ic.maxTxNumInFiles(true) / ic.ii.aggregationStep +} +func (ic *InvertedIndexContext) minWarmStep() uint64 { + return ic.maxTxNumInFiles(true) / ic.ii.aggregationStep +} +func (ic *InvertedIndexContext) maxWarmStep() uint64 { + return ic.maxTxNumInFiles(false) / ic.ii.aggregationStep +} + +func (dc *DomainContext) maxTxNumInFiles(cold bool) uint64 { + if len(dc.files) == 0 { + return 0 + } + var max uint64 + if cold { + for i := len(dc.files) - 1; i >= 0; i-- { + if 
!dc.files[i].src.frozen { + continue + } + max = dc.files[i].endTxNum + break } + } else { + max = dc.files[len(dc.files)-1].endTxNum + } + return cmp.Min(max, dc.hc.maxTxNumInFiles(cold)) +} - historyIsBehind := r.historyEndTxNum < r.indexEndTxNum - if historyIsBehind { - r.index, r.indexStartTxNum, r.indexEndTxNum = false, 0, 0 - return r +func (hc *HistoryContext) maxTxNumInFiles(cold bool) uint64 { + if len(hc.files) == 0 { + return 0 + } + var max uint64 + if cold { + for i := len(hc.files) - 1; i >= 0; i-- { + if !hc.files[i].src.frozen { + continue + } + max = hc.files[i].endTxNum + break } + } else { + max = hc.files[len(hc.files)-1].endTxNum } - return r + return cmp.Min(max, hc.ic.maxTxNumInFiles(cold)) +} +func (ic *InvertedIndexContext) maxTxNumInFiles(cold bool) uint64 { + if len(ic.files) == 0 { + return 0 + } + if !cold { + return ic.files[len(ic.files)-1].endTxNum + } + for i := len(ic.files) - 1; i >= 0; i-- { + if !ic.files[i].src.frozen { + continue + } + return ic.files[i].endTxNum + } + return 0 } // staticFilesInRange returns list of static files with txNum in specified range [startTxNum; endTxNum) @@ -340,10 +398,6 @@ func (dc *DomainContext) staticFilesInRange(r DomainRanges) (valuesFiles, indexF return } -// nolint -func (d *Domain) staticFilesInRange(r DomainRanges, dc *DomainContext) (valuesFiles, indexFiles, historyFiles []*filesItem, startJ int) { - panic("deprecated: use DomainContext.staticFilesInRange") -} func (ic *InvertedIndexContext) staticFilesInRange(startTxNum, endTxNum uint64) ([]*filesItem, int) { files := make([]*filesItem, 0, len(ic.files)) var startJ int @@ -459,46 +513,22 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor if !r.any() { return } - var comp *compress.Compressor - closeItem := true + closeItem := true + var comp ArchiveWriter defer func() { if closeItem { if comp != nil { comp.Close() } if indexIn != nil { - if indexIn.decompressor != nil { - indexIn.decompressor.Close() - } 
- if indexIn.index != nil { - indexIn.index.Close() - } - if indexIn.bindex != nil { - indexIn.bindex.Close() - } + indexIn.closeFilesAndRemove() } if historyIn != nil { - if historyIn.decompressor != nil { - historyIn.decompressor.Close() - } - if historyIn.index != nil { - historyIn.index.Close() - } - if historyIn.bindex != nil { - historyIn.bindex.Close() - } + historyIn.closeFilesAndRemove() } if valuesIn != nil { - if valuesIn.decompressor != nil { - valuesIn.decompressor.Close() - } - if valuesIn.index != nil { - valuesIn.index.Close() - } - if valuesIn.bindex != nil { - valuesIn.bindex.Close() - } + valuesIn.closeFilesAndRemove() } } }() @@ -512,153 +542,290 @@ func (d *Domain) mergeFiles(ctx context.Context, valuesFiles, indexFiles, histor index: r.index}, workers, ps); err != nil { return nil, nil, nil, err } - if r.values { - for _, f := range valuesFiles { - defer f.decompressor.EnableMadvNormal().DisableReadAhead() - } - datFileName := fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) - datPath := filepath.Join(d.dir, datFileName) - if comp, err = compress.NewCompressor(ctx, "merge", datPath, d.tmpdir, compress.MinPatternScore, workers, log.LvlTrace, d.logger); err != nil { - return nil, nil, nil, fmt.Errorf("merge %s history compressor: %w", d.filenameBase, err) + + if !r.values { + closeItem = false + return + } + + for _, f := range valuesFiles { + defer f.decompressor.EnableReadAhead().DisableReadAhead() + } + + datFileName := fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + datPath := filepath.Join(d.dir, datFileName) + compr, err := compress.NewCompressor(ctx, "merge", datPath, d.tmpdir, compress.MinPatternScore, workers, log.LvlTrace, d.logger) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s domain compressor: %w", d.filenameBase, err) + } + + comp = NewArchiveWriter(compr, d.compression) + if 
d.noFsync { + comp.DisableFsync() + } + p := ps.AddNew("merge "+datFileName, 1) + defer ps.Delete(p) + + var cp CursorHeap + heap.Init(&cp) + for _, item := range valuesFiles { + g := NewArchiveGetter(item.decompressor.MakeGetter(), d.compression) + g.Reset(0) + if g.HasNext() { + key, _ := g.Next(nil) + val, _ := g.Next(nil) + heap.Push(&cp, &CursorItem{ + t: FILE_CURSOR, + dg: g, + key: key, + val: val, + endTxNum: item.endTxNum, + reverse: true, + }) } - if d.noFsync { - comp.DisableFsync() + } + // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. + // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away + // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned + // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. Therefore, after the multi-way merge loop + // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind + var keyBuf, valBuf []byte + for cp.Len() > 0 { + lastKey := common.Copy(cp[0].key) + lastVal := common.Copy(cp[0].val) + // Advance all the items that have this key (including the top) + for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { + ci1 := heap.Pop(&cp).(*CursorItem) + if ci1.dg.HasNext() { + ci1.key, _ = ci1.dg.Next(nil) + ci1.val, _ = ci1.dg.Next(nil) + heap.Push(&cp, ci1) + } } - p := ps.AddNew("merege "+datFileName, 1) - defer ps.Delete(p) - var cp CursorHeap - heap.Init(&cp) - for _, item := range valuesFiles { - g := item.decompressor.MakeGetter() - g.Reset(0) - if g.HasNext() { - key, _ := g.NextUncompressed() - var val []byte - if d.compressVals { - val, _ = g.Next(nil) - } else { - val, _ = g.NextUncompressed() + // empty value means deletion + deleted := r.valuesStartTxNum == 0 && len(lastVal) == 0 + if !deleted { + if keyBuf != 
nil { + if err = comp.AddWord(keyBuf); err != nil { + return nil, nil, nil, err + } + if err = comp.AddWord(valBuf); err != nil { + return nil, nil, nil, err } - heap.Push(&cp, &CursorItem{ - t: FILE_CURSOR, - dg: g, - key: key, - val: val, - endTxNum: item.endTxNum, - reverse: true, - }) } + keyBuf = append(keyBuf[:0], lastKey...) + valBuf = append(valBuf[:0], lastVal...) } - keyCount := 0 - // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. - // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away - // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned - // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. Therefore, after the multi-way merge loop - // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind - var keyBuf, valBuf []byte - for cp.Len() > 0 { - lastKey := common.Copy(cp[0].key) - lastVal := common.Copy(cp[0].val) - // Advance all the items that have this key (including the top) - for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] - if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.NextUncompressed() - if d.compressVals { - ci1.val, _ = ci1.dg.Next(ci1.val[:0]) - } else { - ci1.val, _ = ci1.dg.NextUncompressed() - } - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) - } + } + if keyBuf != nil { + if err = comp.AddWord(keyBuf); err != nil { + return nil, nil, nil, err + } + if err = comp.AddWord(valBuf); err != nil { + return nil, nil, nil, err + } + } + if err = comp.Compress(); err != nil { + return nil, nil, nil, err + } + comp.Close() + comp = nil + ps.Delete(p) + + valuesIn = newFilesItem(r.valuesStartTxNum, r.valuesEndTxNum, d.aggregationStep) + valuesIn.frozen = false + if valuesIn.decompressor, err = compress.NewDecompressor(datPath); 
err != nil { + return nil, nil, nil, fmt.Errorf("merge %s decompressor [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + + idxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + idxPath := filepath.Join(d.dir, idxFileName) + // if valuesIn.index, err = buildIndex(valuesIn.decompressor, idxPath, d.dir, false /* values */); err != nil { + if !UseBpsTree { + if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, d.compression, idxPath, d.tmpdir, false, d.salt, ps, d.logger, d.noFsync); err != nil { + return nil, nil, nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + } + + btFileName := fmt.Sprintf("%s.%d-%d.bt", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + btPath := filepath.Join(d.dir, btFileName) + valuesIn.bindex, err = CreateBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesIn.decompressor, d.compression, *d.salt, ps, d.tmpdir, d.logger) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s btindex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + + { + fileName := fmt.Sprintf("%s.%d-%d.kvei", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + if dir.FileExist(filepath.Join(d.dir, fileName)) { + valuesIn.bloom, err = OpenBloom(filepath.Join(d.dir, fileName)) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s bloom [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) } + } + } - // empty value means deletion - deleted := r.valuesStartTxNum == 0 && len(lastVal) == 0 - if !deleted { - if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { - return nil, nil, nil, err - } - keyCount++ // Only counting keys, not values - switch d.compressVals { - case true: - if err = comp.AddWord(valBuf); err != 
nil { - return nil, nil, nil, err - } - default: - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, nil, nil, err - } - } - } - keyBuf = append(keyBuf[:0], lastKey...) - valBuf = append(valBuf[:0], lastVal...) + closeItem = false + d.stats.MergesCount++ + return +} + +func (d *DomainCommitted) mergeFiles(ctx context.Context, oldFiles SelectedStaticFiles, mergedFiles MergedFiles, r DomainRanges, workers int, ps *background.ProgressSet) (valuesIn, indexIn, historyIn *filesItem, err error) { + if !r.any() { + return + } + + domainFiles := oldFiles.commitment + indexFiles := oldFiles.commitmentIdx + historyFiles := oldFiles.commitmentHist + + var comp ArchiveWriter + var closeItem = true + defer func() { + if closeItem { + if comp != nil { + comp.Close() + } + if indexIn != nil { + indexIn.closeFilesAndRemove() + } + if historyIn != nil { + historyIn.closeFilesAndRemove() + } + if valuesIn != nil { + valuesIn.closeFilesAndRemove() } } - if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { - return nil, nil, nil, err + }() + if indexIn, historyIn, err = d.History.mergeFiles(ctx, indexFiles, historyFiles, + HistoryRanges{ + historyStartTxNum: r.historyStartTxNum, + historyEndTxNum: r.historyEndTxNum, + history: r.history, + indexStartTxNum: r.indexStartTxNum, + indexEndTxNum: r.indexEndTxNum, + index: r.index}, workers, ps); err != nil { + return nil, nil, nil, err + } + + if !r.values { + closeItem = false + return + } + + datFileName := fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + datPath := filepath.Join(d.dir, datFileName) + p := ps.AddNew(datFileName, 1) + defer ps.Delete(p) + + cmp, err := compress.NewCompressor(ctx, "merge", datPath, d.dir, compress.MinPatternScore, workers, log.LvlTrace, d.logger) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge %s compressor: %w", d.filenameBase, err) + } + comp = NewArchiveWriter(cmp, d.compression) 
+ + for _, f := range domainFiles { + defer f.decompressor.EnableReadAhead().DisableReadAhead() + } + + var cp CursorHeap + heap.Init(&cp) + for _, item := range domainFiles { + g := NewArchiveGetter(item.decompressor.MakeGetter(), d.compression) + g.Reset(0) + if g.HasNext() { + key, _ := g.Next(nil) + val, _ := g.Next(nil) + heap.Push(&cp, &CursorItem{ + t: FILE_CURSOR, + dg: g, + key: key, + val: val, + endTxNum: item.endTxNum, + reverse: true, + }) + } + } + // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. + // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away + // instead, the pair from the previous iteration is processed first - `keyBuf=>valBuf`. After that, `keyBuf` and `valBuf` are assigned + // to `lastKey` and `lastVal` correspondingly, and the next step of multi-way merge happens. Therefore, after the multi-way merge loop + // (when CursorHeap cp is empty), there is a need to process the last pair `keyBuf=>valBuf`, because it was one step behind + var keyBuf, valBuf []byte + for cp.Len() > 0 { + lastKey := common.Copy(cp[0].key) + lastVal := common.Copy(cp[0].val) + // Advance all the items that have this key (including the top) + for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { + ci1 := heap.Pop(&cp).(*CursorItem) + if ci1.dg.HasNext() { + ci1.key, _ = ci1.dg.Next(nil) + ci1.val, _ = ci1.dg.Next(nil) + heap.Push(&cp, ci1) } - keyCount++ // Only counting keys, not values - if d.compressVals { - if err = comp.AddWord(valBuf); err != nil { + } + // For the rest of types, empty value means deletion + skip := r.valuesStartTxNum == 0 && len(lastVal) == 0 + if !skip { + if keyBuf != nil { + if err = comp.AddWord(keyBuf); err != nil { return nil, nil, nil, err } - } else { - if err = comp.AddUncompressedWord(valBuf); err != nil { + if err = comp.AddWord(valBuf); err != nil { return nil, nil, nil, err } } + keyBuf = append(keyBuf[:0], 
lastKey...) + valBuf = append(valBuf[:0], lastVal...) } - if err = comp.Compress(); err != nil { + } + if keyBuf != nil { + if err = comp.AddWord(keyBuf); err != nil { return nil, nil, nil, err } - comp.Close() - comp = nil - ps.Delete(p) - valuesIn = newFilesItem(r.valuesStartTxNum, r.valuesEndTxNum, d.aggregationStep) - if valuesIn.decompressor, err = compress.NewDecompressor(datPath); err != nil { - return nil, nil, nil, fmt.Errorf("merge %s decompressor [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + //fmt.Printf("last heap key %x\n", keyBuf) + valBuf, err = d.commitmentValTransform(&oldFiles, &mergedFiles, valBuf) + if err != nil { + return nil, nil, nil, fmt.Errorf("merge: 2valTransform [%x] %w", valBuf, err) + } + if err = comp.AddWord(valBuf); err != nil { + return nil, nil, nil, err } + } + if err = comp.Compress(); err != nil { + return nil, nil, nil, err + } + comp.Close() + comp = nil - idxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) - idxPath := filepath.Join(d.dir, idxFileName) - p = ps.AddNew("merge "+idxFileName, uint64(keyCount*2)) - defer ps.Delete(p) - ps.Delete(p) + valuesIn = newFilesItem(r.valuesStartTxNum, r.valuesEndTxNum, d.aggregationStep) + valuesIn.frozen = false + if valuesIn.decompressor, err = compress.NewDecompressor(datPath); err != nil { + return nil, nil, nil, fmt.Errorf("merge %s decompressor [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) + } + ps.Delete(p) - // if valuesIn.index, err = buildIndex(valuesIn.decompressor, idxPath, d.dir, keyCount, false /* values */); err != nil { - if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, idxPath, d.tmpdir, keyCount, false /* values */, p, d.logger, d.noFsync); err != nil { + idxFileName := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, r.valuesStartTxNum/d.aggregationStep, r.valuesEndTxNum/d.aggregationStep) + idxPath := 
filepath.Join(d.dir, idxFileName) + if !UseBpsTree { + if valuesIn.index, err = buildIndexThenOpen(ctx, valuesIn.decompressor, d.compression, idxPath, d.dir, false, d.salt, ps, d.logger, d.noFsync); err != nil { return nil, nil, nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) } + } - btFileName := strings.TrimSuffix(idxFileName, "kvi") + "bt" - p = ps.AddNew(btFileName, uint64(keyCount*2)) - defer ps.Delete(p) - btPath := filepath.Join(d.dir, btFileName) - err = BuildBtreeIndexWithDecompressor(btPath, valuesIn.decompressor, p, d.tmpdir, d.logger) - if err != nil { - return nil, nil, nil, fmt.Errorf("merge %s btindex [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) - } - - bt, err := OpenBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesIn.decompressor) - if err != nil { - return nil, nil, nil, fmt.Errorf("merge %s btindex2 [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) - } - valuesIn.bindex = bt + btPath := strings.TrimSuffix(idxPath, "kvi") + "bt" + valuesIn.bindex, err = CreateBtreeIndexWithDecompressor(btPath, DefaultBtreeM, valuesIn.decompressor, d.compression, *d.salt, ps, d.tmpdir, d.logger) + if err != nil { + return nil, nil, nil, fmt.Errorf("create btindex %s [%d-%d]: %w", d.filenameBase, r.valuesStartTxNum, r.valuesEndTxNum, err) } + closeItem = false - d.stats.MergesCount++ return } func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, startTxNum, endTxNum uint64, workers int, ps *background.ProgressSet) (*filesItem, error) { for _, h := range files { - defer h.decompressor.EnableMadvNormal().DisableReadAhead() + defer h.decompressor.EnableReadAhead().DisableReadAhead() } var outItem *filesItem @@ -675,13 +842,7 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta decomp.Close() } if outItem != nil { - if outItem.decompressor != nil { - outItem.decompressor.Close() - } - if 
outItem.index != nil { - outItem.index.Close() - } - outItem = nil + outItem.closeFilesAndRemove() } } }() @@ -697,14 +858,15 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta if ii.noFsync { comp.DisableFsync() } - p := ps.AddNew("merge "+datFileName, 1) + write := NewArchiveWriter(comp, ii.compression) + p := ps.AddNew(datFileName, 1) defer ps.Delete(p) var cp CursorHeap heap.Init(&cp) for _, item := range files { - g := item.decompressor.MakeGetter() + g := NewArchiveGetter(item.decompressor.MakeGetter(), ii.compression) g.Reset(0) if g.HasNext() { key, _ := g.Next(nil) @@ -720,7 +882,6 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta }) } } - keyCount := 0 // In the loop below, the pair `keyBuf=>valBuf` is always 1 item behind `lastKey=>lastVal`. // `lastKey` and `lastVal` are taken from the top of the multi-way merge (assisted by the CursorHeap cp), but not processed right away @@ -735,7 +896,7 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta // Advance all the items that have this key (including the top) for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] + ci1 := heap.Pop(&cp).(*CursorItem) if mergedOnce { if lastVal, err = mergeEfs(ci1.val, lastVal, nil); err != nil { return nil, fmt.Errorf("merge %s inverted index: %w", ii.filenameBase, err) @@ -745,20 +906,17 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta } //fmt.Printf("multi-way %s [%d] %x\n", ii.indexKeysTable, ci1.endTxNum, ci1.key) if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.NextUncompressed() - ci1.val, _ = ci1.dg.NextUncompressed() + ci1.key, _ = ci1.dg.Next(nil) + ci1.val, _ = ci1.dg.Next(nil) //fmt.Printf("heap next push %s [%d] %x\n", ii.indexKeysTable, ci1.endTxNum, ci1.key) - heap.Fix(&cp, 0) - } else { - heap.Pop(&cp) + heap.Push(&cp, ci1) } } if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { + if err = 
write.AddWord(keyBuf); err != nil { return nil, err } - keyCount++ // Only counting keys, not values - if err = comp.AddUncompressedWord(valBuf); err != nil { + if err = write.AddWord(valBuf); err != nil { return nil, err } } @@ -766,32 +924,40 @@ func (ii *InvertedIndex) mergeFiles(ctx context.Context, files []*filesItem, sta valBuf = append(valBuf[:0], lastVal...) } if keyBuf != nil { - if err = comp.AddUncompressedWord(keyBuf); err != nil { + if err = write.AddWord(keyBuf); err != nil { return nil, err } - keyCount++ // Only counting keys, not values - if err = comp.AddUncompressedWord(valBuf); err != nil { + if err = write.AddWord(valBuf); err != nil { return nil, err } } - if err = comp.Compress(); err != nil { + if err = write.Compress(); err != nil { return nil, err } comp.Close() comp = nil + outItem = newFilesItem(startTxNum, endTxNum, ii.aggregationStep) if outItem.decompressor, err = compress.NewDecompressor(datPath); err != nil { return nil, fmt.Errorf("merge %s decompressor [%d-%d]: %w", ii.filenameBase, startTxNum, endTxNum, err) } ps.Delete(p) - idxFileName := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, startTxNum/ii.aggregationStep, endTxNum/ii.aggregationStep) - idxPath := filepath.Join(ii.dir, idxFileName) - p = ps.AddNew("merge "+idxFileName, uint64(outItem.decompressor.Count()*2)) - defer ps.Delete(p) - if outItem.index, err = buildIndexThenOpen(ctx, outItem.decompressor, idxPath, ii.tmpdir, keyCount, false /* values */, p, ii.logger, ii.noFsync); err != nil { - return nil, fmt.Errorf("merge %s buildIndex [%d-%d]: %w", ii.filenameBase, startTxNum, endTxNum, err) + { + idxFileName := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, startTxNum/ii.aggregationStep, endTxNum/ii.aggregationStep) + idxPath := filepath.Join(ii.dir, idxFileName) + if outItem.index, err = buildIndexThenOpen(ctx, outItem.decompressor, ii.compression, idxPath, ii.tmpdir, false, ii.salt, ps, ii.logger, ii.noFsync); err != nil { + return nil, fmt.Errorf("merge %s buildIndex 
[%d-%d]: %w", ii.filenameBase, startTxNum, endTxNum, err) + } + } + if ii.withExistenceIndex { + idxFileName := fmt.Sprintf("%s.%d-%d.efei", ii.filenameBase, startTxNum/ii.aggregationStep, endTxNum/ii.aggregationStep) + idxPath := filepath.Join(ii.dir, idxFileName) + if outItem.bloom, err = buildIndexFilterThenOpen(ctx, outItem.decompressor, ii.compression, idxPath, ii.tmpdir, ii.salt, ps, ii.logger, ii.noFsync); err != nil { + return nil, err + } } + closeItem = false return outItem, nil } @@ -804,8 +970,7 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi defer func() { if closeIndex { if indexIn != nil { - indexIn.decompressor.Close() - indexIn.index.Close() + indexIn.closeFilesAndRemove() } } }() @@ -814,10 +979,10 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi } if r.history { for _, f := range indexFiles { - defer f.decompressor.EnableMadvNormal().DisableReadAhead() + defer f.decompressor.EnableReadAhead().DisableReadAhead() } for _, f := range historyFiles { - defer f.decompressor.EnableMadvNormal().DisableReadAhead() + defer f.decompressor.EnableReadAhead().DisableReadAhead() } var comp *compress.Compressor @@ -840,12 +1005,7 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi index.Close() } if historyIn != nil { - if historyIn.decompressor != nil { - historyIn.decompressor.Close() - } - if historyIn.index != nil { - historyIn.index.Close() - } + historyIn.closeFilesAndRemove() } } }() @@ -856,29 +1016,31 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi if comp, err = compress.NewCompressor(ctx, "merge", datPath, h.tmpdir, compress.MinPatternScore, workers, log.LvlTrace, h.logger); err != nil { return nil, nil, fmt.Errorf("merge %s history compressor: %w", h.filenameBase, err) } + compr := NewArchiveWriter(comp, h.compression) if h.noFsync { - comp.DisableFsync() + compr.DisableFsync() } - p := ps.AddNew("merge 
"+datFileName, 1) + p := ps.AddNew(datFileName, 1) defer ps.Delete(p) + var cp CursorHeap heap.Init(&cp) for _, item := range indexFiles { - g := item.decompressor.MakeGetter() + g := NewArchiveGetter(item.decompressor.MakeGetter(), h.compression) g.Reset(0) if g.HasNext() { - var g2 *compress.Getter + var g2 ArchiveGetter for _, hi := range historyFiles { // full-scan, because it's ok to have different amount files. by unclean-shutdown. if hi.startTxNum == item.startTxNum && hi.endTxNum == item.endTxNum { - g2 = hi.decompressor.MakeGetter() + g2 = NewArchiveGetter(hi.decompressor.MakeGetter(), h.compression) break } } if g2 == nil { panic(fmt.Sprintf("for file: %s, not found corresponding file to merge", g.FileName())) } - key, _ := g.NextUncompressed() - val, _ := g.NextUncompressed() + key, _ := g.Next(nil) + val, _ := g.Next(nil) heap.Push(&cp, &CursorItem{ t: FILE_CURSOR, dg: g, @@ -901,74 +1063,73 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi lastKey := common.Copy(cp[0].key) // Advance all the items that have this key (including the top) for cp.Len() > 0 && bytes.Equal(cp[0].key, lastKey) { - ci1 := cp[0] + ci1 := heap.Pop(&cp).(*CursorItem) count := eliasfano32.Count(ci1.val) for i := uint64(0); i < count; i++ { if !ci1.dg2.HasNext() { panic(fmt.Errorf("assert: no value??? 
%s, i=%d, count=%d, lastKey=%x, ci1.key=%x", ci1.dg2.FileName(), i, count, lastKey, ci1.key)) } - if h.compressVals { - valBuf, _ = ci1.dg2.Next(valBuf[:0]) - if err = comp.AddWord(valBuf); err != nil { - return nil, nil, err - } - } else { - valBuf, _ = ci1.dg2.NextUncompressed() - if err = comp.AddUncompressedWord(valBuf); err != nil { - return nil, nil, err - } + valBuf, _ = ci1.dg2.Next(valBuf[:0]) + if err = compr.AddWord(valBuf); err != nil { + return nil, nil, err } } keyCount += int(count) if ci1.dg.HasNext() { - ci1.key, _ = ci1.dg.NextUncompressed() - ci1.val, _ = ci1.dg.NextUncompressed() - heap.Fix(&cp, 0) - } else { - heap.Remove(&cp, 0) + ci1.key, _ = ci1.dg.Next(nil) + ci1.val, _ = ci1.dg.Next(nil) + heap.Push(&cp, ci1) } } } - if err = comp.Compress(); err != nil { + if err = compr.Compress(); err != nil { return nil, nil, err } - comp.Close() + compr.Close() comp = nil if decomp, err = compress.NewDecompressor(datPath); err != nil { return nil, nil, err } ps.Delete(p) - p = ps.AddNew("merge "+idxFileName, uint64(2*keyCount)) + p = ps.AddNew(idxFileName, uint64(decomp.Count()/2)) defer ps.Delete(p) if rs, err = recsplit.NewRecSplit(recsplit.RecSplitArgs{ - KeyCount: keyCount, - Enums: false, - BucketSize: 2000, - LeafSize: 8, - TmpDir: h.tmpdir, - IndexFile: idxPath, + KeyCount: keyCount, + Enums: false, + BucketSize: 2000, + LeafSize: 8, + TmpDir: h.tmpdir, + IndexFile: idxPath, + EtlBufLimit: etl.BufferOptimalSize / 2, + Salt: h.salt, }, h.logger); err != nil { return nil, nil, fmt.Errorf("create recsplit: %w", err) } rs.LogLvl(log.LvlTrace) + if h.noFsync { rs.DisableFsync() } - var historyKey []byte - var txKey [8]byte - var valOffset uint64 - g := indexIn.decompressor.MakeGetter() - g2 := decomp.MakeGetter() - var keyBuf []byte + + var ( + txKey [8]byte + historyKey []byte + keyBuf []byte + valOffset uint64 + ) + + g := NewArchiveGetter(indexIn.decompressor.MakeGetter(), h.InvertedIndex.compression) + g2 := NewArchiveGetter(decomp.MakeGetter(), 
h.compression) + for { g.Reset(0) g2.Reset(0) valOffset = 0 for g.HasNext() { - keyBuf, _ = g.NextUncompressed() - valBuf, _ = g.NextUncompressed() + keyBuf, _ = g.Next(nil) + valBuf, _ = g.Next(nil) ef, _ := eliasfano32.ReadEliasFano(valBuf) efIt := ef.Iterator() for efIt.HasNext() { @@ -978,11 +1139,7 @@ func (h *History) mergeFiles(ctx context.Context, indexFiles, historyFiles []*fi if err = rs.AddKey(historyKey, valOffset); err != nil { return nil, nil, err } - if h.compressVals { - valOffset, _ = g2.Skip() - } else { - valOffset, _ = g2.SkipUncompressed() - } + valOffset, _ = g2.Skip() } p.Processed.Add(1) } @@ -1020,17 +1177,24 @@ func (d *Domain) integrateMergedFiles(valuesOuts, indexOuts, historyOuts []*file // `kill -9` may leave some garbage // but it still may be useful for merges, until we finish merge frozen file - if historyIn != nil && historyIn.frozen { - d.files.Walk(func(items []*filesItem) bool { - for _, item := range items { - if item.frozen || item.endTxNum > valuesIn.endTxNum { - continue - } - valuesOuts = append(valuesOuts, item) + d.files.Walk(func(items []*filesItem) bool { + for _, item := range items { + if item.frozen { + continue } - return true - }) - } + if item.startTxNum < valuesIn.startTxNum { + continue + } + if item.endTxNum > valuesIn.endTxNum { + continue + } + if item.startTxNum == valuesIn.startTxNum && item.endTxNum == valuesIn.endTxNum { + continue + } + valuesOuts = append(valuesOuts, item) + } + return true + }) } for _, out := range valuesOuts { if out == nil { @@ -1065,6 +1229,10 @@ func (ii *InvertedIndex) integrateMergedFiles(outs []*filesItem, in *filesItem) panic("must not happen: " + ii.filenameBase) } ii.files.Delete(out) + + if ii.filenameBase == AggTraceFileLife { + ii.logger.Warn(fmt.Sprintf("[agg] mark can delete: %s, triggered by merge of: %s", out.decompressor.FileName(), in.decompressor.FileName())) + } out.canDelete.Store(true) } ii.reCalcRoFiles() @@ -1113,6 +1281,7 @@ func (dc *DomainContext) 
frozenTo() uint64 { return 0 } +// nolint func (hc *HistoryContext) frozenTo() uint64 { if len(hc.files) == 0 { return 0 @@ -1124,6 +1293,8 @@ func (hc *HistoryContext) frozenTo() uint64 { } return 0 } + +// nolint func (ic *InvertedIndexContext) frozenTo() uint64 { if len(ic.files) == 0 { return 0 @@ -1136,20 +1307,23 @@ func (ic *InvertedIndexContext) frozenTo() uint64 { return 0 } -func (d *Domain) cleanAfterFreeze(frozenTo uint64) { - if frozenTo == 0 { +func (d *Domain) cleanAfterFreeze(mergedDomain, mergedHist, mergedIdx *filesItem) { + if mergedHist != nil && mergedHist.frozen { + d.History.cleanAfterFreeze(mergedHist.endTxNum) + } + if mergedDomain == nil { return } - var outs []*filesItem + mergedFrom, mergedTo := mergedDomain.startTxNum, mergedDomain.endTxNum // `kill -9` may leave some garbage // but it may be useful for merges, until merge `frozen` file d.files.Walk(func(items []*filesItem) bool { for _, item := range items { - if item.frozen || item.endTxNum > frozenTo { - continue + if item.startTxNum > mergedFrom && item.endTxNum < mergedTo { + outs = append(outs, item) } - outs = append(outs, item) + //TODO: domain doesn't have .frozen flag. Somehow need delete all earlier sub-sets, but keep largest one. 
} return true }) @@ -1159,16 +1333,24 @@ func (d *Domain) cleanAfterFreeze(frozenTo uint64) { panic("must not happen: " + d.filenameBase) } d.files.Delete(out) + out.canDelete.Store(true) if out.refcount.Load() == 0 { + if d.filenameBase == AggTraceFileLife && out.decompressor != nil { + d.logger.Info(fmt.Sprintf("[agg] cleanAfterFreeze remove: %s\n", out.decompressor.FileName())) + } // if it has no readers (invisible even for us) - it's safe to remove file right here out.closeFilesAndRemove() + } else { + if d.filenameBase == AggTraceFileLife && out.decompressor != nil { + d.logger.Warn(fmt.Sprintf("[agg] cleanAfterFreeze mark as delete: %s, refcnt=%d", out.decompressor.FileName(), out.refcount.Load())) + } } - out.canDelete.Store(true) } - d.History.cleanAfterFreeze(frozenTo) } -// cleanAfterFreeze - mark all small files before `f` as `canDelete=true` +// cleanAfterFreeze - sometime inverted_index may be already merged, but history not yet. and power-off happening. +// in this case we need keep small files, but when history already merged to `frozen` state - then we can cleanup +// all earlier small files, by mark tem as `canDelete=true` func (h *History) cleanAfterFreeze(frozenTo uint64) { if frozenTo == 0 { return @@ -1244,53 +1426,3 @@ func (ii *InvertedIndex) cleanAfterFreeze(frozenTo uint64) { ii.files.Delete(out) } } - -// nolint -func (d *Domain) deleteGarbageFiles() { - for _, item := range d.garbageFiles { - // paranoic-mode: don't delete frozen files - steps := item.endTxNum/d.aggregationStep - item.startTxNum/d.aggregationStep - if steps%StepsInBiggestFile == 0 { - continue - } - f1 := fmt.Sprintf("%s.%d-%d.kv", d.filenameBase, item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep) - os.Remove(filepath.Join(d.dir, f1)) - log.Debug("[snapshots] delete garbage", f1) - f2 := fmt.Sprintf("%s.%d-%d.kvi", d.filenameBase, item.startTxNum/d.aggregationStep, item.endTxNum/d.aggregationStep) - os.Remove(filepath.Join(d.dir, f2)) - 
log.Debug("[snapshots] delete garbage", f2) - } - d.garbageFiles = nil - d.History.deleteGarbageFiles() -} -func (h *History) deleteGarbageFiles() { - for _, item := range h.garbageFiles { - // paranoic-mode: don't delete frozen files - if item.endTxNum/h.aggregationStep-item.startTxNum/h.aggregationStep == StepsInBiggestFile { - continue - } - f1 := fmt.Sprintf("%s.%d-%d.v", h.filenameBase, item.startTxNum/h.aggregationStep, item.endTxNum/h.aggregationStep) - os.Remove(filepath.Join(h.dir, f1)) - log.Debug("[snapshots] delete garbage", f1) - f2 := fmt.Sprintf("%s.%d-%d.vi", h.filenameBase, item.startTxNum/h.aggregationStep, item.endTxNum/h.aggregationStep) - os.Remove(filepath.Join(h.dir, f2)) - log.Debug("[snapshots] delete garbage", f2) - } - h.garbageFiles = nil - h.InvertedIndex.deleteGarbageFiles() -} -func (ii *InvertedIndex) deleteGarbageFiles() { - for _, item := range ii.garbageFiles { - // paranoic-mode: don't delete frozen files - if item.endTxNum/ii.aggregationStep-item.startTxNum/ii.aggregationStep == StepsInBiggestFile { - continue - } - f1 := fmt.Sprintf("%s.%d-%d.ef", ii.filenameBase, item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep) - os.Remove(filepath.Join(ii.dir, f1)) - log.Debug("[snapshots] delete garbage", f1) - f2 := fmt.Sprintf("%s.%d-%d.efi", ii.filenameBase, item.startTxNum/ii.aggregationStep, item.endTxNum/ii.aggregationStep) - os.Remove(filepath.Join(ii.dir, f2)) - log.Debug("[snapshots] delete garbage", f2) - } - ii.garbageFiles = nil -} diff --git a/state/merge_test.go b/state/merge_test.go index 24b63de76..d2da51351 100644 --- a/state/merge_test.go +++ b/state/merge_test.go @@ -4,6 +4,7 @@ import ( "sort" "testing" + "github.com/ledgerwatch/log/v3" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" btree2 "github.com/tidwall/btree" @@ -11,9 +12,16 @@ import ( "github.com/ledgerwatch/erigon-lib/recsplit/eliasfano32" ) +func emptyTestInvertedIndex(aggStep uint64) *InvertedIndex { + salt := 
uint32(1) + logger := log.New() + return &InvertedIndex{iiCfg: iiCfg{salt: &salt, dir: "", tmpdir: ""}, + logger: logger, + filenameBase: "test", aggregationStep: aggStep, files: btree2.NewBTreeG[*filesItem](filesItemLess)} +} func TestFindMergeRangeCornerCases(t *testing.T) { t.Run("> 2 unmerged files", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-2.ef", "test.2-3.ef", @@ -24,7 +32,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { ic := ii.MakeContext() defer ic.Close() - needMerge, from, to := ii.findMergeRange(4, 32) + needMerge, from, to := ic.findMergeRange(4, 32) assert.True(t, needMerge) assert.Equal(t, 0, int(from)) assert.Equal(t, 4, int(to)) @@ -32,7 +40,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { idxF, _ := ic.staticFilesInRange(from, to) assert.Equal(t, 3, len(idxF)) - ii = &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii = emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -43,7 +51,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { ic = ii.MakeContext() defer ic.Close() - needMerge, from, to = ii.findMergeRange(4, 32) + needMerge, from, to = ic.findMergeRange(4, 32) assert.True(t, needMerge) assert.Equal(t, 0, int(from)) assert.Equal(t, 2, int(to)) @@ -56,16 +64,17 @@ func TestFindMergeRangeCornerCases(t *testing.T) { "test.3-4.v", }) h.reCalcRoFiles() - ic = ii.MakeContext() - defer ic.Close() + ic.Close() - r := h.findMergeRange(4, 32) + hc := h.MakeContext() + defer hc.Close() + r := hc.findMergeRange(4, 32) assert.True(t, r.history) assert.Equal(t, 2, int(r.historyEndTxNum)) assert.Equal(t, 2, int(r.indexEndTxNum)) }) t.Run("not equal amount of files", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: 
btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -84,7 +93,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.True(t, r.index) assert.True(t, r.history) assert.Equal(t, 0, int(r.historyStartTxNum)) @@ -92,7 +101,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { assert.Equal(t, 2, int(r.indexEndTxNum)) }) t.Run("idx merged, history not yet", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-2.ef", "test.2-3.ef", @@ -110,14 +119,14 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.True(t, r.history) assert.False(t, r.index) assert.Equal(t, 0, int(r.historyStartTxNum)) assert.Equal(t, 2, int(r.historyEndTxNum)) }) t.Run("idx merged, history not yet, 2", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -139,7 +148,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.False(t, r.index) assert.True(t, r.history) assert.Equal(t, 2, int(r.historyEndTxNum)) @@ -149,7 +158,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { require.Equal(t, 2, len(histFiles)) }) t.Run("idx merged and small files lost", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ 
"test.0-4.ef", }) @@ -167,7 +176,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.False(t, r.index) assert.True(t, r.history) assert.Equal(t, 2, int(r.historyEndTxNum)) @@ -176,7 +185,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { }) t.Run("history merged, but index not and history garbage left", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -195,7 +204,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.True(t, r.index) assert.False(t, r.history) assert.Equal(t, uint64(2), r.indexEndTxNum) @@ -205,7 +214,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { require.Equal(t, 0, len(histFiles)) }) t.Run("history merge progress ahead of idx", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -228,7 +237,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.True(t, r.index) assert.True(t, r.history) assert.Equal(t, 4, int(r.indexEndTxNum)) @@ -238,7 +247,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { require.Equal(t, 3, len(histFiles)) }) t.Run("idx merge progress ahead of history", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -258,7 +267,7 @@ func 
TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.False(t, r.index) assert.True(t, r.history) assert.Equal(t, 2, int(r.historyEndTxNum)) @@ -268,7 +277,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { require.Equal(t, 2, len(histFiles)) }) t.Run("idx merged, but garbage left", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -287,12 +296,12 @@ func TestFindMergeRangeCornerCases(t *testing.T) { hc := h.MakeContext() defer hc.Close() - r := h.findMergeRange(4, 32) + r := hc.findMergeRange(4, 32) assert.False(t, r.index) assert.False(t, r.history) }) t.Run("idx merged, but garbage left2", func(t *testing.T) { - ii := &InvertedIndex{filenameBase: "test", aggregationStep: 1, files: btree2.NewBTreeG[*filesItem](filesItemLess)} + ii := emptyTestInvertedIndex(1) ii.scanStateFiles([]string{ "test.0-1.ef", "test.1-2.ef", @@ -303,7 +312,7 @@ func TestFindMergeRangeCornerCases(t *testing.T) { ii.reCalcRoFiles() ic := ii.MakeContext() defer ic.Close() - needMerge, from, to := ii.findMergeRange(4, 32) + needMerge, from, to := ic.findMergeRange(4, 32) assert.True(t, needMerge) require.Equal(t, 0, int(from)) require.Equal(t, 4, int(to)) diff --git a/state/state_recon.go b/state/state_recon.go index 31babca99..bbc8effc5 100644 --- a/state/state_recon.go +++ b/state/state_recon.go @@ -28,7 +28,7 @@ import ( // Algorithms for reconstituting the state from state history type ReconItem struct { - g *compress.Getter + g ArchiveGetter key []byte txNum uint64 startTxNum uint64 @@ -43,8 +43,8 @@ func (rh ReconHeap) Len() int { return len(rh) } -// Less (part of heap.Interface) compares two links. For persisted links, those with the lower block heights get evicted first. 
This means that more recently persisted links are preferred. -// For non-persisted links, those with the highest block heights get evicted first. This is to prevent "holes" in the block heights that may cause inability to +// Less (part of heap.Interface) compares two links. For persisted links, those with the lower block heights getBeforeTxNum evicted first. This means that more recently persisted links are preferred. +// For non-persisted links, those with the highest block heights getBeforeTxNum evicted first. This is to prevent "holes" in the block heights that may cause inability to // insert headers in the ascending order of their block heights. func (rh ReconHeap) Less(i, j int) bool { c := bytes.Compare(rh[i].key, rh[j].key) @@ -181,8 +181,8 @@ func (hii *HistoryIteratorInc) advance() { hii.nextKey = nil for hii.nextKey == nil && hii.key != nil { val, _ := hii.indexG.NextUncompressed() - ef, _ := eliasfano32.ReadEliasFano(val) - if n, ok := ef.Search(hii.uptoTxNum); ok { + n, ok := eliasfano32.Seek(val, hii.uptoTxNum) + if ok { var txKey [8]byte binary.BigEndian.PutUint64(txKey[:], n) offset := hii.r.Lookup2(txKey[:], hii.key) diff --git a/txpool/pool.go b/txpool/pool.go index 1c39b28bb..d99ba9114 100644 --- a/txpool/pool.go +++ b/txpool/pool.go @@ -40,8 +40,6 @@ import ( "github.com/google/btree" "github.com/hashicorp/golang-lru/v2/simplelru" "github.com/holiman/uint256" - "github.com/ledgerwatch/log/v3" - "github.com/ledgerwatch/erigon-lib/chain" "github.com/ledgerwatch/erigon-lib/common" "github.com/ledgerwatch/erigon-lib/common/assert" @@ -59,6 +57,7 @@ import ( "github.com/ledgerwatch/erigon-lib/kv/mdbx" "github.com/ledgerwatch/erigon-lib/txpool/txpoolcfg" "github.com/ledgerwatch/erigon-lib/types" + "github.com/ledgerwatch/log/v3" ) var (