Permalink
Browse files

EXPERIMENTAL: midx3 format with separate prefix/postfix searching.

This new midx format doesn't take any extra space, but it reduces the
number of page faults we'll take in the common case (common, at
least, when doing a fresh new backup) because we can pack more sha1
prefixes into each page, and we can stop searching if a particular
sha1 prefix isn't actually found in the list.  That means we won't
have to ever read from disk the remaining sha1 bytes for final
comparison.

On the other hand, if a sha1 *does* exist, we now have to fault in
three pages (fanout + prefix + postfix) instead of two (fanout +
sha1), which *may* cause more churn in low-memory situations.  Then
again, in bup, churning through large numbers of
already-existing sha1s should be a much less common case (since 'bup
save' short circuits at the top level of any directory tree that's
unchanged).  Moreover, 'bup server' should almost always benefit, since
clients will rarely send objects the server already has.

That means, on average, this patch should save us a whole bunch of RAM.

'bup memtest' results show that memory usage decreases to about 1/3 of
the old results, which is pretty good.  Of course, 'bup memtest
--existing' results are essentially unchanged, since it still has to
swap in the whole midx file.
  • Loading branch information...
apenwarr committed Aug 26, 2010
1 parent 92944c8 commit 77e06c9ba9236e3b67b8c9756ec4d473ec51ca70
Showing with 106 additions and 35 deletions.
  1. +4 −3 cmd/margin-cmd.py
  2. +30 −6 cmd/midx-cmd.py
  3. +12 −4 lib/bup/_helpers.c
  4. +52 −16 lib/bup/git.py
  5. +8 −6 lib/bup/helpers.py
View
@@ -7,8 +7,8 @@
optspec = """
bup margin
--
-predict Guess object offsets and report the maximum deviation
-ignore-midx Don't use midx files; use only plain pack idx files.
+predict guess object offsets and report the maximum deviation
+ignore-midx with --predict, do each idx file separately
"""
o = options.Options('bup margin', optspec)
(opt, flags, extra) = o.parse(sys.argv[1:])
@@ -31,7 +31,8 @@ def do_predict(ix):
maxdiff = max(maxdiff, abs(diff))
print '%d of %d (%.3f%%) ' % (maxdiff, len(ix), maxdiff*100.0/len(ix))
sys.stdout.flush()
- assert(count+1 == len(ix))
+ if count+1 != len(ix):
+ raise Exception('wrong object count: %d != %d' % (count+1, len(ix)))
if opt.predict:
if opt.ignore_midx:
View
@@ -3,8 +3,10 @@
from bup import options, git
from bup.helpers import *
+PRE_BYTES = git.PRE_BYTES
+POST_BYTES = git.POST_BYTES
PAGE_SIZE=4096
-SHA_PER_PAGE=PAGE_SIZE/20.
+SHA_PER_PAGE=PAGE_SIZE*1.0/PRE_BYTES
def merge(idxlist, bits, table):
@@ -38,7 +40,7 @@ def do_midx(outdir, outfilename, infilenames):
pages = int(total/SHA_PER_PAGE) or 1
bits = int(math.ceil(math.log(pages, 2)))
entries = 2**bits
- log('Table size: %d (%d bits)\n' % (entries*4, bits))
+ log('Table size: %d entries (%d bits)\n' % (entries*4, bits))
table = [0]*entries
@@ -47,14 +49,36 @@ def do_midx(outdir, outfilename, infilenames):
except OSError:
pass
f = open(outfilename + '.tmp', 'w+')
- f.write('MIDX\0\0\0\2')
+ f.write('MIDX\0\0\0\3')
f.write(struct.pack('!I', bits))
assert(f.tell() == 12)
f.write('\0'*4*entries)
+
+ # format:
+ # header (12 bytes)
+ # lookup table
+ # prefixes
+ # postfixes
+ # file indexes
+ ofs_pre = f.tell()
+ ofs_post = ofs_pre + PRE_BYTES*total
+ ofs_which = ofs_post + POST_BYTES*total
+ ofs_end = ofs_which + 4*total
+
+ f.truncate(ofs_end)
+
+ map = mmap_readwrite(f, ofs_end, close=False)
- for e in merge(inp, bits, table):
- f.write(e)
-
+ for i,e in enumerate(merge(inp, bits, table)):
+ map[ofs_pre + i*PRE_BYTES : ofs_pre + (i+1)*PRE_BYTES] \
+ = e[:PRE_BYTES]
+ map[ofs_post + i*POST_BYTES : ofs_post + (i+1)*POST_BYTES] \
+ = e[PRE_BYTES:]
+ # FIXME: set the file index block
+ map.flush()
+ map.close()
+
+ f.seek(ofs_end)
f.write('\0'.join(os.path.basename(p) for p in infilenames))
f.seek(12)
View
@@ -62,15 +62,19 @@ static PyObject *bitmatch(PyObject *self, PyObject *args)
static PyObject *firstword(PyObject *self, PyObject *args)
{
- unsigned char *buf = NULL;
+ unsigned char *buf = NULL, tbuf[4];
int len = 0;
uint32_t v;
if (!PyArg_ParseTuple(args, "t#", &buf, &len))
return NULL;
if (len < 4)
- return NULL;
+ {
+ memset(tbuf, 0, sizeof(tbuf));
+ memcpy(tbuf, buf, len);
+ buf = tbuf;
+ }
v = ntohl(*(uint32_t *)buf);
return Py_BuildValue("I", v);
@@ -79,15 +83,19 @@ static PyObject *firstword(PyObject *self, PyObject *args)
static PyObject *extract_bits(PyObject *self, PyObject *args)
{
- unsigned char *buf = NULL;
+ unsigned char *buf = NULL, tbuf[4];
int len = 0, nbits = 0;
uint32_t v, mask;
if (!PyArg_ParseTuple(args, "t#i", &buf, &len, &nbits))
return NULL;
if (len < 4)
- return NULL;
+ {
+ memset(tbuf, 0, sizeof(tbuf));
+ memcpy(tbuf, buf, len);
+ buf = tbuf;
+ }
mask = (1<<nbits) - 1;
v = ntohl(*(uint32_t *)buf);
View
@@ -7,6 +7,9 @@
from bup.helpers import *
from bup import _helpers
+PRE_BYTES = 6
+POST_BYTES = 20 - PRE_BYTES
+
verbose = 0
ignore_midx = 0
home_repodir = os.path.expanduser('~/.bup')
@@ -17,6 +20,7 @@
_total_searches = 0
_total_steps = 0
+_rare_warned = 0
class GitError(Exception):
@@ -205,36 +209,70 @@ def __init__(self, filename):
self.name = filename
assert(filename.endswith('.midx'))
self.map = mmap_read(open(filename))
- if str(self.map[0:8]) == 'MIDX\0\0\0\1':
+ if str(self.map[0:8]) in ('MIDX\0\0\0\1', 'MIDX\0\0\0\2'):
log('Warning: ignoring old-style midx %r\n' % filename)
self.bits = 0
self.entries = 1
self.fanout = buffer('\0\0\0\0')
- self.shalist = buffer('\0'*20)
+ self.prelist = buffer('\0'*PRE_BYTES)
+ self.postlist = buffer('\0'*POST_BYTES)
self.idxnames = []
else:
- assert(str(self.map[0:8]) == 'MIDX\0\0\0\2')
+ assert(str(self.map[0:8]) == 'MIDX\0\0\0\3')
self.bits = _helpers.firstword(self.map[8:12])
self.entries = 2**self.bits
self.fanout = buffer(self.map, 12, self.entries*4)
shaofs = 12 + self.entries*4
- nsha = self._fanget(self.entries-1)
- self.shalist = buffer(self.map, shaofs, nsha*20)
- self.idxnames = str(self.map[shaofs + 20*nsha:]).split('\0')
+ self.nsha = nsha = self._fanget(self.entries-1)
+ self.prelist = buffer(self.map, shaofs, nsha*PRE_BYTES)
+ self.postlist = buffer(self.map, shaofs + nsha*PRE_BYTES,
+ nsha*POST_BYTES)
+ self.idxnames = str(self.map[shaofs + 24*nsha:]).split('\0')
def _fanget(self, i):
start = i*4
s = self.fanout[start:start+4]
return _helpers.firstword(s)
def _get(self, i):
- return str(self.shalist[i*20:(i+1)*20])
+ return str(self.prelist[i*PRE_BYTES:(i+1)*PRE_BYTES])
+
+ def _get_post(self, i):
+ return str(self.postlist[i*POST_BYTES:(i+1)*POST_BYTES])
+
+ def _exists_verify(self, hash, i):
+ global _total_steps, _rare_warned
+ pre = str(hash[:PRE_BYTES])
+ post = str(hash[PRE_BYTES:])
+ #print 'starting: %d (want %s)' % (i, hash.encode('hex'))
+ while i > 0:
+ _total_steps += 1
+ if self._get(i-1) == pre:
+ i -= 1
+ else:
+ break
+ #print ' now %d' % i
+ for i in xrange(i, self.nsha):
+ _total_steps += 1
+ #print ' testing %d (%s-%s)' % (i, self._get(i).encode('hex'),
+ # self._get_post(i).encode('hex'))
+ if self._get(i) != pre:
+ break
+ if self._get_post(i) == post:
+ return True
+ _rare_warned += 1
+ if _rare_warned <= 5:
+ log('Warning: rare: sha1 prefix %r collision. Please report.\n'
+ % pre.encode('hex'))
+ if _rare_warned == 5:
+ log('Warning: not printing any more collision warnings.\n')
+ return False
def exists(self, hash):
"""Return nonempty if the object exists in the index files."""
global _total_searches, _total_steps
_total_searches += 1
- want = str(hash)
+ want = str(hash)[:PRE_BYTES]
el = extract_bits(want, self.bits)
if el:
start = self._fanget(el-1)
@@ -253,20 +291,21 @@ def exists(self, hash):
mid = start + (hashv-startv)*(end-start-1)/(endv-startv)
#print ' %08x %08x %08x %d %d %d' % (startv, hashv, endv, start, mid, end)
v = self._get(mid)
- #print ' %08x' % self._num(v)
+ #print ' %08x' % _helpers.firstword(v)
if v < want:
start = mid+1
startv = _helpers.firstword(v)
elif v > want:
end = mid
endv = _helpers.firstword(v)
else: # got it!
- return True
+ return self._exists_verify(hash, mid)
return None
def __iter__(self):
for i in xrange(self._fanget(self.entries-1)):
- yield buffer(self.shalist, i*20, 20)
+ yield (str(self.prelist[i*PRE_BYTES:(i+1)*PRE_BYTES]) +
+ str(self.postlist[i*POST_BYTES:(i+1)*POST_BYTES]))
def __len__(self):
return int(self._fanget(self.entries-1))
@@ -401,15 +440,12 @@ def idxmerge(idxlist):
heap = [(next(it), it) for it in iters]
heapq.heapify(heap)
count = 0
- last = None
while heap:
if (count % 10024) == 0:
progress('Reading indexes: %.2f%% (%d/%d)\r'
% (count*100.0/total, count, total))
(e, it) = heap[0]
- if e != last:
- yield e
- last = e
+ yield e
count += 1
e = next(it)
if e:
@@ -468,7 +504,7 @@ def _write(self, bin, type, content):
return bin
def breakpoint(self):
- """Clear byte and object counts and return the last processed id."""
+ """End the current pack and start a new one."""
id = self._end()
self.outbytes = self.count = 0
return id
View
@@ -246,29 +246,31 @@ def slashappend(s):
return s
-def _mmap_do(f, sz, flags, prot):
+def _mmap_do(f, sz, flags, prot, close):
if not sz:
st = os.fstat(f.fileno())
sz = st.st_size
map = mmap.mmap(f.fileno(), sz, flags, prot)
- f.close() # map will persist beyond file close
+ if close:
+ f.close() # map will persist beyond file close
return map
-def mmap_read(f, sz = 0):
+def mmap_read(f, sz=0, close=True):
"""Create a read-only memory mapped region on file 'f'.
If sz is 0, the region will cover the entire file.
"""
- return _mmap_do(f, sz, mmap.MAP_PRIVATE, mmap.PROT_READ)
+ return _mmap_do(f, sz, mmap.MAP_PRIVATE, mmap.PROT_READ, close=close)
-def mmap_readwrite(f, sz = 0):
+def mmap_readwrite(f, sz=0, close=True):
"""Create a read-write memory mapped region on file 'f'.
If sz is 0, the region will cover the entire file.
"""
- return _mmap_do(f, sz, mmap.MAP_SHARED, mmap.PROT_READ|mmap.PROT_WRITE)
+ return _mmap_do(f, sz, mmap.MAP_SHARED, mmap.PROT_READ|mmap.PROT_WRITE,
+ close=close)
def parse_num(s):

0 comments on commit 77e06c9

Please sign in to comment.