This repository has been archived by the owner on Apr 7, 2019. It is now read-only.
/
table.go
356 lines (303 loc) · 8.85 KB
/
table.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
/*
* Copyright 2017 Dgraph Labs, Inc. and Contributors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package table
import (
"bytes"
"crypto/sha256"
"encoding/binary"
"fmt"
"io"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
"sync/atomic"
"github.com/ipsn/go-ipfs/gxlibs/github.com/dgraph-io/badger/options"
"github.com/ipsn/go-ipfs/gxlibs/github.com/dgraph-io/badger/y"
"github.com/ipsn/go-ipfs/gxlibs/github.com/ipfs/bbloom"
"github.com/pkg/errors"
)
const fileSuffix = ".sst"
type keyOffset struct {
key []byte
offset int
len int
}
// TableInterface is useful for testing.
type TableInterface interface {
Smallest() []byte
Biggest() []byte
DoesNotHave(key []byte) bool
}
// Table represents a loaded table file with the info we have about it
type Table struct {
sync.Mutex
fd *os.File // Own fd.
tableSize int // Initialized in OpenTable, using fd.Stat().
blockIndex []keyOffset
ref int32 // For file garbage collection. Atomic.
loadingMode options.FileLoadingMode
mmap []byte // Memory mapped.
// The following are initialized once and const.
smallest, biggest []byte // Smallest and largest keys.
id uint64 // file id, part of filename
bf bbloom.Bloom
Checksum []byte
}
// IncrRef increments the refcount (having to do with whether the file should be deleted)
func (t *Table) IncrRef() {
atomic.AddInt32(&t.ref, 1)
}
// DecrRef decrements the refcount and possibly deletes the table
func (t *Table) DecrRef() error {
newRef := atomic.AddInt32(&t.ref, -1)
if newRef == 0 {
// We can safely delete this file, because for all the current files, we always have
// at least one reference pointing to them.
// It's necessary to delete windows files
if t.loadingMode == options.MemoryMap {
y.Munmap(t.mmap)
}
if err := t.fd.Truncate(0); err != nil {
// This is very important to let the FS know that the file is deleted.
return err
}
filename := t.fd.Name()
if err := t.fd.Close(); err != nil {
return err
}
if err := os.Remove(filename); err != nil {
return err
}
}
return nil
}
type block struct {
offset int
data []byte
}
func (b block) NewIterator() *blockIterator {
return &blockIterator{data: b.data}
}
// OpenTable assumes file has only one table and opens it. Takes ownership of fd upon function
// entry. Returns a table with one reference count on it (decrementing which may delete the file!
// -- consider t.Close() instead). The fd has to writeable because we call Truncate on it before
// deleting.
func OpenTable(fd *os.File, mode options.FileLoadingMode, cksum []byte) (*Table, error) {
fileInfo, err := fd.Stat()
if err != nil {
// It's OK to ignore fd.Close() errs in this function because we have only read
// from the file.
_ = fd.Close()
return nil, y.Wrap(err)
}
filename := fileInfo.Name()
id, ok := ParseFileID(filename)
if !ok {
_ = fd.Close()
return nil, errors.Errorf("Invalid filename: %s", filename)
}
t := &Table{
fd: fd,
ref: 1, // Caller is given one reference.
id: id,
loadingMode: mode,
}
t.tableSize = int(fileInfo.Size())
// We first load to RAM, so we can read the index and do checksum.
if err := t.loadToRAM(); err != nil {
return nil, err
}
// Enforce checksum before we read index. Otherwise, if the file was
// truncated, we'd end up with panics in readIndex.
if len(cksum) > 0 && !bytes.Equal(t.Checksum, cksum) {
return nil, fmt.Errorf(
"CHECKSUM_MISMATCH: Table checksum does not match checksum in MANIFEST."+
" NOT including table %s. This would lead to missing data."+
"\n sha256 %x Expected\n sha256 %x Found\n", filename, cksum, t.Checksum)
}
if err := t.readIndex(); err != nil {
return nil, y.Wrap(err)
}
it := t.NewIterator(false)
defer it.Close()
it.Rewind()
if it.Valid() {
t.smallest = it.Key()
}
it2 := t.NewIterator(true)
defer it2.Close()
it2.Rewind()
if it2.Valid() {
t.biggest = it2.Key()
}
switch mode {
case options.LoadToRAM:
// No need to do anything. t.mmap is already filled.
case options.MemoryMap:
t.mmap, err = y.Mmap(fd, false, fileInfo.Size())
if err != nil {
_ = fd.Close()
return nil, y.Wrapf(err, "Unable to map file")
}
case options.FileIO:
t.mmap = nil
default:
panic(fmt.Sprintf("Invalid loading mode: %v", mode))
}
return t, nil
}
// Close closes the open table. (Releases resources back to the OS.)
func (t *Table) Close() error {
if t.loadingMode == options.MemoryMap {
y.Munmap(t.mmap)
}
return t.fd.Close()
}
func (t *Table) read(off int, sz int) ([]byte, error) {
if len(t.mmap) > 0 {
if len(t.mmap[off:]) < sz {
return nil, y.ErrEOF
}
return t.mmap[off : off+sz], nil
}
res := make([]byte, sz)
nbr, err := t.fd.ReadAt(res, int64(off))
y.NumReads.Add(1)
y.NumBytesRead.Add(int64(nbr))
return res, err
}
func (t *Table) readNoFail(off int, sz int) []byte {
res, err := t.read(off, sz)
y.Check(err)
return res
}
func (t *Table) readIndex() error {
if len(t.mmap) != t.tableSize {
panic("Table size does not match the read bytes")
}
readPos := t.tableSize
// Read bloom filter.
readPos -= 4
buf := t.readNoFail(readPos, 4)
bloomLen := int(binary.BigEndian.Uint32(buf))
readPos -= bloomLen
data := t.readNoFail(readPos, bloomLen)
t.bf = *bbloom.JSONUnmarshal(data)
readPos -= 4
buf = t.readNoFail(readPos, 4)
restartsLen := int(binary.BigEndian.Uint32(buf))
readPos -= 4 * restartsLen
buf = t.readNoFail(readPos, 4*restartsLen)
offsets := make([]int, restartsLen)
for i := 0; i < restartsLen; i++ {
offsets[i] = int(binary.BigEndian.Uint32(buf[:4]))
buf = buf[4:]
}
// The last offset stores the end of the last block.
for i := 0; i < len(offsets); i++ {
var o int
if i == 0 {
o = 0
} else {
o = offsets[i-1]
}
ko := keyOffset{
offset: o,
len: offsets[i] - o,
}
t.blockIndex = append(t.blockIndex, ko)
}
// Execute this index read serially, because we already have table data in memory.
var h header
for idx := range t.blockIndex {
ko := &t.blockIndex[idx]
hbuf := t.readNoFail(ko.offset, h.Size())
h.Decode(hbuf)
y.AssertTrue(h.plen == 0)
key := t.readNoFail(ko.offset+len(hbuf), int(h.klen))
ko.key = append([]byte{}, key...)
}
return nil
}
func (t *Table) block(idx int) (block, error) {
y.AssertTruef(idx >= 0, "idx=%d", idx)
if idx >= len(t.blockIndex) {
return block{}, errors.New("block out of index")
}
ko := t.blockIndex[idx]
blk := block{
offset: ko.offset,
}
var err error
blk.data, err = t.read(blk.offset, ko.len)
return blk, err
}
// Size is its file size in bytes
func (t *Table) Size() int64 { return int64(t.tableSize) }
// Smallest is its smallest key, or nil if there are none
func (t *Table) Smallest() []byte { return t.smallest }
// Biggest is its biggest key, or nil if there are none
func (t *Table) Biggest() []byte { return t.biggest }
// Filename is NOT the file name. Just kidding, it is.
func (t *Table) Filename() string { return t.fd.Name() }
// ID is the table's ID number (used to make the file name).
func (t *Table) ID() uint64 { return t.id }
// DoesNotHave returns true if (but not "only if") the table does not have the key. It does a
// bloom filter lookup.
func (t *Table) DoesNotHave(key []byte) bool { return !t.bf.Has(key) }
// ParseFileID reads the file id out of a filename.
func ParseFileID(name string) (uint64, bool) {
name = path.Base(name)
if !strings.HasSuffix(name, fileSuffix) {
return 0, false
}
// suffix := name[len(fileSuffix):]
name = strings.TrimSuffix(name, fileSuffix)
id, err := strconv.Atoi(name)
if err != nil {
return 0, false
}
y.AssertTrue(id >= 0)
return uint64(id), true
}
// IDToFilename does the inverse of ParseFileID
func IDToFilename(id uint64) string {
return fmt.Sprintf("%06d", id) + fileSuffix
}
// NewFilename should be named TableFilepath -- it combines the dir with the ID to make a table
// filepath.
func NewFilename(id uint64, dir string) string {
return filepath.Join(dir, IDToFilename(id))
}
func (t *Table) loadToRAM() error {
if _, err := t.fd.Seek(0, io.SeekStart); err != nil {
return err
}
t.mmap = make([]byte, t.tableSize)
sum := sha256.New()
tee := io.TeeReader(t.fd, sum)
read, err := tee.Read(t.mmap)
if err != nil || read != t.tableSize {
return y.Wrapf(err, "Unable to load file in memory. Table file: %s", t.Filename())
}
t.Checksum = sum.Sum(nil)
y.NumReads.Add(1)
y.NumBytesRead.Add(int64(read))
return nil
}