Skip to content
Browse files

Merge branch 'experimental'

Conflicts:
	src/bdelta.cpp
	src/bdelta_python.cpp
	src/libbdelta.cpp
  • Loading branch information...
2 parents 0293d8a + 4e65a82 commit fb0a916b878e096b877c38bcedb30afa5eb65bb2 @jjwhitney committed Feb 16, 2012
Showing with 311 additions and 276 deletions.
  1. +18 −8 setup.py
  2. +32 −8 src/bdelta.cpp
  3. +20 −6 src/bdelta.h
  4. +56 −0 src/bdelta.pyx
  5. +0 −89 src/bdelta_python.cpp
  6. +185 −165 src/libbdelta.cpp
View
26 setup.py
@@ -1,10 +1,20 @@
from distutils.core import setup, Extension
+from Cython.Distutils import build_ext
+
+ext_modules = [Extension(
+ "bdelta",
+ ["src/bdelta.pyx", "src/libbdelta.cpp"],
+ define_macros=[('TOKEN_SIZE', '2')],
+)]
+
+setup(
+ name = 'BDelta',
+ version='0.3.0',
+ description='Python Bindings for BDelta',
+ author='John Whitney',
+ author_email='jjw@deltup.org',
+ url='http://deltup.org',
+ cmdclass = {'build_ext': build_ext},
+ ext_modules = ext_modules
+)
-setup(name='bdelta_python',
- version='0.2.3',
- description='Python Bindings for BDelta',
- author='John Whitney',
- author_email='jjw@deltup.org',
- url='http://deltup.org',
- ext_modules=[Extension('bdelta_python', ['src/bdelta_python.cpp'])],
- )
View
40 src/bdelta.cpp
@@ -25,6 +25,11 @@ void *f_read(void *f, void *buf, unsigned place, unsigned num) {
return buf;
}
+void my_pass(BDelta_Instance *b, unsigned blocksize, unsigned minMatchSize, unsigned flags) { // One matching pass followed by a cleanup of the match list.
+ bdelta_pass(b, blocksize, minMatchSize, 0, flags); // maxHoleSize is passed as 0; bdelta_pass() applies no hole-size limit when it is zero.
+ bdelta_clean_matches(b, BDELTA_REMOVE_OVERLAP); // Drops covered matches and trims neighbours whose p2 ranges overlap.
+}
+
int main(int argc, char **argv) {
try {
if (argc != 4) {
@@ -41,15 +46,34 @@ int main(int argc, char **argv) {
FILE *f1 = fopen(argv[1], "rb"),
*f2 = fopen(argv[2], "rb");
- void *b = bdelta_init_alg(size, size2, f_read, f1, f2, 1);
+ BDelta_Instance *b = bdelta_init_alg(size, size2, f_read, f1, f2, 1);
int nummatches;
-#ifdef CARELESSMATCH
- const int MINSIZE = 16;
-#else
- const int MINSIZE = 8;
-#endif
- for (int i = 512; i >= MINSIZE; i /= 2)
- nummatches = bdelta_pass(b, i);
+
+ // List of primes for reference. Taken from Wikipedia.
+ // 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+ // 1-20 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71
+ // 21-40 73 79 83 89 97 101 103 107 109 113 127 131 137 139 149 151 157 163 167 173
+ // 41-60 179 181 191 193 197 199 211 223 227 229 233 239 241 251 257 263 269 271 277 281
+ // 61-80 283 293 307 311 313 317 331 337 347 349 353 359 367 373 379 383 389 397 401 409
+ // 81-100 419 421 431 433 439 443 449 457 461 463 467 479 487 491 499 503 509 521 523 541
+ // 101-120 547 557 563 569 571 577 587 593 599 601 607 613 617 619 631 641 643 647 653 659
+ // 121-140 661 673 677 683 691 701 709 719 727 733 739 743 751 757 761 769 773 787 797 809
+ // 141-160 811 821 823 827 829 839 853 857 859 863 877 881 883 887 907 911 919 929 937 941
+ // 161-180 947 953 967 971 977 983 991 997
+
+ int seq[] = {503, 127, 31, 7, 5, 3, -31, 31, 7, 5, 3, -7, 2};
+ my_pass(b, 997, 1994, 0);
+ my_pass(b, 503, 1006, 0);
+ my_pass(b, 127, 254, 0);
+ my_pass(b, 31, 62, 0);
+ my_pass(b, 7, 14, 0);
+ my_pass(b, 5, 10, 0);
+ my_pass(b, 3, 6, 0);
+ my_pass(b, 13, 26, BDELTA_GLOBAL);
+ my_pass(b, 7, 14, 0);
+ my_pass(b, 5, 10, 0);
+
+ nummatches = bdelta_numMatches(b);
unsigned * copyloc1 = new unsigned[nummatches + 1];
unsigned * copyloc2 = new unsigned[nummatches + 1];
View
26 src/bdelta.h
@@ -17,23 +17,37 @@
extern "C" {
#endif // __cplusplus
+typedef struct _BDelta_Instance BDelta_Instance;
+
// Callback function must return a pointer to the data requested.
// A "fill and forget" buffer is provided, but can be ignored, so
// long as the data persists throughout the life of bdelta_pass().
typedef void *(*bdelta_readCallback)(void *handle, void *buf, unsigned place, unsigned num);
-void *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
+BDelta_Instance *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
bdelta_readCallback cb, void *handle1, void *handle2,
unsigned tokenSize);
-void bdelta_done_alg(void *instance);
+void bdelta_done_alg(BDelta_Instance *b);
+
+void bdelta_pass(BDelta_Instance *b, unsigned blockSize, unsigned minMatchSize, unsigned maxHoleSize, unsigned flags);
+
+void bdelta_swap_inputs(BDelta_Instance *b);
+void bdelta_clean_matches(BDelta_Instance *b, unsigned flags);
-//returns the total number of matches found
-unsigned bdelta_pass(void *instance, unsigned blocksize);
+unsigned bdelta_numMatches(BDelta_Instance *b);
-void bdelta_getMatch(void *instance, unsigned matchNum,
+void bdelta_getMatch(BDelta_Instance *b, unsigned matchNum,
unsigned *p1, unsigned *p2, unsigned *num);
-int bdelta_getError(void *instance);
+int bdelta_getError(BDelta_Instance *b);
+void bdelta_showMatches(BDelta_Instance *b);
+
+// Flags for bdelta_pass()
+#define BDELTA_GLOBAL 1
+#define BDELTA_SIDES_ORDERED 2
+
+// Flags for bdelta_clean_matches()
+#define BDELTA_REMOVE_OVERLAP 1
enum BDELTA_RESULT {
BDELTA_OK = 0,
View
56 src/bdelta.pyx
@@ -0,0 +1,56 @@
+cdef extern from "bdelta.h":
+ ctypedef struct BDelta_Instance:
+ pass
+
+ ctypedef void *(*bdelta_readCallback)(void *handle, void *buf, unsigned place, unsigned num)
+ BDelta_Instance *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
+ bdelta_readCallback cb, void *handle1, void *handle2,
+ unsigned tokenSize)
+ void bdelta_done_alg(BDelta_Instance *b)
+
+ void bdelta_pass(BDelta_Instance *b, unsigned blockSize, unsigned minMatchSize, unsigned maxHoleSize, unsigned flags)
+
+ void bdelta_swap_inputs(BDelta_Instance *b)
+ void bdelta_clean_matches(BDelta_Instance *b, unsigned flags)
+
+ unsigned bdelta_numMatches(BDelta_Instance *b)
+
+ void bdelta_getMatch(BDelta_Instance *b, unsigned matchNum,
+ unsigned *p1, unsigned *p2, unsigned *num)
+
+ int bdelta_getError(BDelta_Instance *b)
+ void bdelta_showMatches(BDelta_Instance *b)
+
+ cdef enum PassFlags:
+ BDELTA_GLOBAL,
+ BDELTA_SIDES_ORDERED
+ cdef enum CleanFlags:
+ BDELTA_REMOVE_OVERLAP
+
+cdef void *readCallback(void *handle, void *buf, unsigned place, unsigned num):  # bdelta_readCallback implementation: returns a pointer into the in-memory buffer; buf and num are ignored.
+ cdef char *str = <bytes>handle  # handle is the encoded bytes object passed to bdelta_init_alg() by BDelta.__cinit__.
+ return str + ((place + 1) * 2);  # 2 bytes per token; the +1 presumably skips the BOM that encode('UTF-16') prepends -- TODO confirm.
+
+cdef class BDelta:  # Extension type that owns a BDelta_Instance plus the two encoded buffers its read callback points into.
+ cdef BDelta_Instance *_b
+ cdef bytes str1, str2  # Kept as attributes so the buffers stay alive while _b may still read from them.
+
+ def __cinit__(self, str1, str2):
+ self.str1 = str1.encode('UTF-16')
+ self.str2 = str2.encode('UTF-16')
+ self._b = bdelta_init_alg(len(str1), len(str2), readCallback, <void*>self.str1, <void*>self.str2, 2)  # NOTE(review): sizes are len() of the original strings, not of the encoded buffers -- assumes one 2-byte token per character; confirm for characters that encode to surrogate pairs.
+
+ def __dealloc__(self):
+ self.str1 = None
+ self.str2 = None
+ bdelta_done_alg(self._b)
+
+ def b_pass(self, blockSize, minMatchSize, maxHoleSize, globalScope = False, sidesOrdered = False):  # Wrapper for bdelta_pass(); the two booleans are folded into its flags bitmask.
+ bdelta_pass(self._b, blockSize, minMatchSize, maxHoleSize,
+ (BDELTA_GLOBAL if globalScope else 0) | (BDELTA_SIDES_ORDERED if sidesOrdered else 0))
+
+ def matches(self):  # Generator: yields one (p1, p2, num) tuple per match, in bdelta_getMatch() index order.
+ cdef unsigned p1, p2, num
+ for i in xrange(bdelta_numMatches(self._b)):
+ bdelta_getMatch(self._b, i, &p1, &p2, &num)
+ yield (int(p1), int(p2), int(num))
View
89 src/bdelta_python.cpp
@@ -1,89 +0,0 @@
-/* Copyright (C) 2010 John Whitney
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Author: John Whitney <jjw@linuxmail.org>
- */
-
-#include "Python.h"
-#define TOKEN_SIZE 2
-#define ALLOW_OVERLAP
-#include "libbdelta.cpp"
-#include "string.h"
-void *mem_read(void *data, void *buf, unsigned place, unsigned num) {
- return ((Token*)data) + place;
-}
-
-void dumpContent(const char *name, char *s, size_t len) {
- printf("%s: ", name);
- for (int i = 0; i < len; ++i)
- if (isprint(s[i]))
- printf("%c", s[i]);
- else
- printf("%i", s[i]);
- printf("\n\n");
-}
-
-PyObject* bdelta_SimpleString(PyObject* self, PyObject* args) {
- Py_UNICODE *a, *b;
- int len_a, len_b;
- int smallestMatch;
-
- if (!PyArg_ParseTuple(args, "u#u#i", &a, &len_a, &b, &len_b, &smallestMatch))
- return NULL;
-
- // Find all matches bigger than "smallestMatch" parameter.
- // We achieve this by using a blocksize of "size / 2" (and leaving CARELESSMATCH undefined).
- smallestMatch /= 2;
-
- PyObject *a16 = PyUnicode_EncodeUTF16(a, len_a, NULL, -1);
- PyObject *b16 = PyUnicode_EncodeUTF16(b, len_b, NULL, -1);
-
-#ifndef NDEBUG
- dumpContent("String 1", PyString_AsString(a16), len_a * 2);
- dumpContent("String 2", PyString_AsString(b16), len_b * 2);
-#endif
- void *string_a = PyString_AsString(a16);
- void *string_b = PyString_AsString(b16);
- void *bi = bdelta_init_alg(len_a, len_b, mem_read, string_a, string_b, 2);
- int nummatches;
- for (int i = 64; i >= smallestMatch; i /= 2)
- nummatches = bdelta_pass(bi, i);
-
- PyObject *ret = PyTuple_New(nummatches);
- for (int i = 0; i < nummatches; ++i) {
- unsigned p1, p2, num;
- bdelta_getMatch(bi, i, &p1, &p2, &num);
-
- PyObject *m = PyTuple_New(3);
- PyTuple_SetItem(m, 0, PyInt_FromLong(p1));
- PyTuple_SetItem(m, 1, PyInt_FromLong(p2));
- PyTuple_SetItem(m, 2, PyInt_FromLong(num));
- PyTuple_SetItem(ret, i, m);
- }
- Py_DECREF(a16);
- Py_DECREF(b16);
- return ret;
-}
-
-static PyMethodDef BDeltaMethods[] =
-{
- {"bdelta_SimpleString", bdelta_SimpleString, METH_VARARGS, "Get the delta of two strings (returns a list of matches)."},
- {NULL, NULL, 0, NULL}
-};
-
-PyMODINIT_FUNC
-
-initbdelta_python(void)
-{
- (void) Py_InitModule("bdelta_python", BDeltaMethods);
-}
-
View
350 src/libbdelta.cpp
@@ -22,11 +22,7 @@ typedef uint16_t Token;
typedef uint32_t Token;
#endif
-// Enables delta chunk statistics
-// #define DO_STATS_DEBUG
-
#include <stdio.h>
-#include <stdlib.h>
#include "bdelta.h"
#include "checksum.h"
#include <list>
@@ -53,7 +49,7 @@ struct Match {
{this->p1 = p1; this->p2 = p2; this->num = num;}
};
-struct BDelta_Instance {
+struct _BDelta_Instance {
bdelta_readCallback cb;
void *handle1, *handle2;
unsigned data1_size, data2_size;
@@ -130,126 +126,85 @@ unsigned match_backward(BDelta_Instance *b, unsigned p1, unsigned p2, unsigned b
// Iterator helper function
template <class T>
inline T prior(T i) {return --i;}
+template <class T>
+inline T next(T i) {return ++i;}
void addMatch(BDelta_Instance *b, unsigned p1, unsigned p2, unsigned num, std::list<Match>::iterator place) {
- while (place != b->matches.begin() && prior(place)->p2 >= p2)
- b->matches.erase(prior(place));
-#ifndef ALLOW_OVERLAP
- if (place != b->matches.begin() && prior(place)->p2 + prior(place)->num > p2)
- prior(place)->num = p2 - prior(place)->p2;
- if (place != b->matches.end() && p2 + num > place->p2)
- num = place->p2 - p2;
-#endif
- // printf("%i, %i, %i, %x, %x\n", p1, p2, num, place, next);
+ while (place != b->matches.begin() && prior(place)->p2 > p2)
+ --place;
+ while (place != b->matches.end() && place->p2 < p2)
+ ++place;
b->matches.insert(place, Match(p1, p2, num));
}
-struct PotentialMatch {
- unsigned p1, p2;
- Hash::Value cksum;
- PotentialMatch() {}
- PotentialMatch(unsigned p1, unsigned p2, Hash::Value cksum)
- {this->p1 = p1; this->p2 = p2; this->cksum = cksum;}
-};
-
template<class T>
T absoluteDifference(T a, T b) {
return std::max(a, b) - std::min(a, b);
}
-struct DistanceFromP1 {
- unsigned place;
- DistanceFromP1(unsigned place) {this->place = place;}
- bool operator() (PotentialMatch m1, PotentialMatch m2) {
- return absoluteDifference(place, m1.p1) < absoluteDifference(place, m2.p1);
- }
-};
-
-void sortTMatches(BDelta_Instance *b, std::list<Match>::iterator place, std::list<PotentialMatch> &matches) {
- unsigned lastf1Place = place != b->matches.begin() ? prior(place)->p1 + prior(place)->num : 0;
- matches.sort(DistanceFromP1(lastf1Place));
-}
-
-#ifdef DO_STATS_DEBUG
-long long stata = 0, statb = 0;
-#endif
-void findMatches(BDelta_Instance *b, Checksums_Instance *h, unsigned start, unsigned end,
- std::list<Match>::iterator place) {
+void findMatches(BDelta_Instance *b, Checksums_Instance *h, unsigned minMatchSize, unsigned start, unsigned end, unsigned place, std::list<Match>::iterator iterPlace) {
const unsigned blocksize = h->blocksize;
STACK_ALLOC(buf1, Token, blocksize);
STACK_ALLOC(buf2, Token, blocksize);
- const unsigned maxPMatch = 256;
- std::list<PotentialMatch> pMatch;
- unsigned processMatchesPos = end;
+ unsigned best1, best2, bestnum = 0;
+ unsigned processMatchesPos;
Token *inbuf = b->read2(buf1, start, blocksize),
*outbuf;
Hash hash = Hash(inbuf, blocksize);
unsigned buf_loc = blocksize;
- Hash::Value lastChecksum = ~hash.getValue();
for (unsigned j = start + blocksize; j <= end; ++j) {
unsigned thisTableIndex = h->tableIndex(hash.getValue());
checksum_entry *c = h->htable[thisTableIndex];
- if (c && hash.getValue() != lastChecksum) {
+ if (c) {
do {
if (c->cksum == hash.getValue()) {
- if (pMatch.size() >= maxPMatch) {
- // Keep the best 16
- sortTMatches(b, place, pMatch);
- pMatch.resize(16);
-#ifdef DO_STATS_DEBUG
- ++statb;
-#endif
+ unsigned p1 = c->loc, p2 = j - blocksize;
+ unsigned fnum = match_forward(b, p1, p2);
+ if (fnum >= blocksize) {
+ unsigned bnum = match_backward(b, p1, p2, blocksize);
+ unsigned num = fnum + bnum;
+ if (num >= minMatchSize) {
+ p1 -= bnum; p2 -= bnum;
+ bool foundBetter;
+ if (bestnum) {
+ double oldValue = double(bestnum) / (absoluteDifference(place, best1) + blocksize * 2),
+ newValue = double(num) / (absoluteDifference(place, p1) + blocksize * 2);
+ foundBetter = newValue > oldValue;
+ } else {
+ foundBetter = true;
+ processMatchesPos = std::min(j + blocksize - 1, end);
+ }
+ if (foundBetter) {
+ best1 = p1;
+ best2 = p2;
+ bestnum = num;
+ }
+
+ }
}
- pMatch.push_back(PotentialMatch(c->loc, j - blocksize, c->cksum));
- processMatchesPos = std::min(j + blocksize / 2, processMatchesPos);
}
++c;
} while (h->tableIndex(c->cksum) == thisTableIndex);
}
- lastChecksum = hash.getValue();
-
- if (j >= processMatchesPos) {
- processMatchesPos = end;
- sortTMatches(b, place, pMatch);
- for (std::list<PotentialMatch>::iterator i = pMatch.begin(); i != pMatch.end(); ++i) {
- unsigned p1 = i->p1, p2 = i->p2;
- unsigned fnum = match_forward(b, p1, p2);
- if (fnum >= blocksize) {
- #ifdef THOROUGH
- for (unsigned betterP1 = p1 - (p1 ? 1 : 0); betterP1; --betterP1) {
- unsigned nfnum = match_forward(b, betterP1, p2);
- if (nfnum > fnum) {
- fnum = nfnum;
- p1 = betterP1;
- } else
- break;
- }
- #endif
- unsigned bnum = match_backward(b, p1, p2, blocksize);
- unsigned num = fnum + bnum;
-#ifndef CARELESSMATCH
- if (num < blocksize * 2)
- break; // I'd like to continue here, but first need to reduce pMatchCount.
-#endif
- p1 -= bnum; p2 -= bnum;
- addMatch(b, p1, p2, num, place);
- if (p2 + num > j) {
- // Fast foward over matched area.
- j = p2 + num - blocksize;
- inbuf = b->read2(buf1, j, blocksize);
- hash = Hash(inbuf, blocksize);
- buf_loc = blocksize;
- j += blocksize;
- }
- #ifdef DO_STATS_DEBUG
- ++stata;
- #endif
- break;
+ if (bestnum && j >= processMatchesPos) {
+ addMatch(b, best1, best2, bestnum, iterPlace);
+ place = best1 + bestnum;
+ unsigned matchEnd = best2 + bestnum;
+ if (matchEnd > j) {
+ if (matchEnd >= end)
+ j = end;
+ else {
+ // Fast forward over matched area.
+ j = matchEnd - blocksize;
+ inbuf = b->read2(buf1, j, blocksize);
+ hash = Hash(inbuf, blocksize);
+ buf_loc = blocksize;
+ j += blocksize;
}
}
- pMatch.clear();
+ bestnum = 0;
}
if (buf_loc == blocksize) {
@@ -263,10 +218,6 @@ void findMatches(BDelta_Instance *b, Checksums_Instance *h, unsigned start, unsi
}
}
-bool comparep1(Range r1, Range r2) {
- return r1.p < r2.p;
-}
-
struct Checksums_Compare {
Checksums_Instance &ci;
Checksums_Compare(Checksums_Instance &h) : ci(h) {}
@@ -282,7 +233,7 @@ struct Checksums_Compare {
}
};
-void *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
+BDelta_Instance *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
bdelta_readCallback cb, void *handle1, void *handle2,
unsigned tokenSize) {
if (tokenSize != sizeof(Token)) {
@@ -300,64 +251,55 @@ void *bdelta_init_alg(unsigned data1_size, unsigned data2_size,
return b;
}
-void bdelta_done_alg(void *instance) {
- BDelta_Instance *b = (BDelta_Instance*)instance;
+void bdelta_done_alg(BDelta_Instance *b) {
b->matches.clear();
delete b;
}
-unsigned bdelta_pass(void *instance, unsigned blocksize) {
- if (verbose) printf("Organizing leftover blocks\n");
-
- Checksums_Instance h(blocksize);
- BDelta_Instance *b = (BDelta_Instance*)instance;
- b->access_int = -1;
-
- Range *unused = new Range[b->matches.size() + 1];
- if (!unused) {b->errorcode = BDELTA_MEM_ERROR; return 0;}
- unsigned numunused = 0;
- for (std::list<Match>::iterator l = b->matches.begin(); l != b->matches.end(); ++l)
- unused[numunused++] = Range(l->p1, l->num);
+struct UnusedRange { // A span [p, p + num) paired with two iterators into the match list.
+ unsigned p, num; // Start offset and length of the span.
+ std::list<Match>::iterator ml, mr; // bdelta_pass() sets both to the source match; get_unused_blocks() rewrites them to the matches bounding a gap.
+ UnusedRange() {} // Does not set p or num; exists so arrays can be created with new UnusedRange[n].
+ UnusedRange(unsigned p, unsigned num, std::list<Match>::iterator ml, std::list<Match>::iterator mr) {
+ this->p = p; this->num = num; this->ml = ml; this->mr = mr;
+ }
+};
- std::sort(unused, unused + numunused, comparep1);
- // Trick loop below into including the free range at the end.
- unused[numunused++] = Range(b->data1_size, b->data1_size);
+bool comparep(UnusedRange r1, UnusedRange r2) { // Sort predicate: ascending by start offset p.
+ return r1.p < r2.p;
+}
+bool comparemrp2(UnusedRange r1, UnusedRange r2) { // Sort predicate: ascending by the p2 of the match each range's mr iterator refers to.
+ return r1.mr->p2 < r2.mr->p2; // Dereferences mr, so both iterators must point at live list elements.
+}
- unsigned last = 0;
- unsigned missing = 0;
- for (unsigned i = 0; i < numunused; ++i) {
- unsigned nextstart = unused[i].p + unused[i].num;
- if (unused[i].p <= last)
- ++missing;
- else
- unused[i - missing] = Range(last, unused[i].p - last);
- last = std::max(last, nextstart);
- }
- numunused -= missing;
+bool compareMatchP2(Match r1, Match r2) { // Ordering predicate for the match list: ascending by p2.
+ return r1.p2 < r2.p2;
+}
+// Adapted from http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2
+unsigned roundUpPowerOf2(unsigned v) { // Rounds v up to a power of two; v == 0 yields 0 because the decrement wraps around.
+ --v; // Ensures an exact power of two maps to itself rather than to the next one.
+ for (int i = 1; i <= 16; i *= 2) // Shift amounts 1, 2, 4, 8, 16 -- sufficient when unsigned is 32 bits wide (assumed).
+ v |= v >> i; // Propagates the highest set bit into every lower bit position.
+ return v + 1; // All-ones below the top bit, plus one, gives the power of two.
+}
+void bdelta_pass_2(BDelta_Instance *b, unsigned blocksize, unsigned minMatchSize, UnusedRange *unused, unsigned numunused, UnusedRange *unused2, unsigned numunused2) {
+ Checksums_Instance h(blocksize);
+ b->access_int = -1;
unsigned numblocks = 0;
for (unsigned i = 0; i < numunused; ++i) {
numblocks += unused[i].num / blocksize;
}
- if (verbose) printf("Starting search for matching blocks of size %i\n", blocksize);
// numblocks = size / blocksize;
- if (verbose) printf("found %i blocks\n", numblocks);
- h.htablesize = 1 << 16;
- while (h.htablesize < numblocks) h.htablesize <<= 1;
- // h.htablesize <<= 2;
- // htablesize >>= 0;
- if (verbose) printf("creating hash table of size %i\n", h.htablesize);
- // h.htablesize = 65536;
+ h.htablesize = std::max((unsigned)2, roundUpPowerOf2(numblocks));
h.htable = new checksum_entry*[h.htablesize];
- if (!h.htable) {b->errorcode = BDELTA_MEM_ERROR; return 0;}
+ if (!h.htable) {b->errorcode = BDELTA_MEM_ERROR; return;}
h.checksums = new checksum_entry[numblocks + 2];
- if (!h.checksums) {b->errorcode = BDELTA_MEM_ERROR; return 0;}
-
- if (verbose) printf("find checksums\n");
+ if (!h.checksums) {b->errorcode = BDELTA_MEM_ERROR; return;}
h.numchecksums = 0;
// unsigned numchecksums = 0;
@@ -366,16 +308,16 @@ unsigned bdelta_pass(void *instance, unsigned blocksize) {
unsigned first = unused[i].p, last = unused[i].p + unused[i].num;
for (unsigned loc = first; loc + blocksize <= last; loc += blocksize) {
Token *read = b->read1(buf, loc, blocksize);
- Hash::Value blocksum = Hash(read, h.blocksize).getValue();
+ Hash::Value blocksum = Hash(read, blocksize).getValue();
// Adjacent checksums are never repeated.
- if (! h.numchecksums || blocksum != h.checksums[h.numchecksums - 1].cksum)
+ //if (! h.numchecksums || blocksum != h.checksums[h.numchecksums - 1].cksum)
h.add(checksum_entry(blocksum, loc));
}
}
+
if (h.numchecksums) {
std::sort(h.checksums, h.checksums + h.numchecksums, Checksums_Compare(h));
-#ifndef THOROUGH
- const unsigned maxIdenticalChecksums = 256;
+ const unsigned maxIdenticalChecksums = 2;
unsigned writeLoc = 0, readLoc, testAhead;
for (readLoc = 0; readLoc < h.numchecksums; readLoc = testAhead) {
for (testAhead = readLoc; testAhead < h.numchecksums && h.checksums[readLoc].cksum == h.checksums[testAhead].cksum; ++testAhead)
@@ -385,9 +327,7 @@ unsigned bdelta_pass(void *instance, unsigned blocksize) {
h.checksums[writeLoc++] = h.checksums[i];
}
h.numchecksums = writeLoc;
-#endif
}
-
h.checksums[h.numchecksums].cksum = std::numeric_limits<Hash::Value>::max(); // If there's only one checksum, we might hit this and not know it,
h.checksums[h.numchecksums].loc = 0; // So we'll just read from the beginning of the file to prevent crashes.
h.checksums[h.numchecksums + 1].cksum = 0;
@@ -396,32 +336,112 @@ unsigned bdelta_pass(void *instance, unsigned blocksize) {
for (int i = h.numchecksums - 1; i >= 0; --i)
h.htable[h.tableIndex(h.checksums[i].cksum)] = &h.checksums[i];
-// if (verbose) printf("%i checksums\n", h.numchecksums);
- if (verbose) printf("compare files\n");
+ for (unsigned i = 0; i < numunused2; ++i)
+ if (unused2[i].num >= blocksize)
+ findMatches(b, &h, minMatchSize, unused2[i].p, unused2[i].p + unused2[i].num, unused[i].p, unused2[i].mr);
+
+ delete [] h.htable;
+ delete [] h.checksums;
+}
+
+void bdelta_swap_inputs(BDelta_Instance *b) { // Exchanges the roles of the two inputs on an existing instance.
+ for (std::list<Match>::iterator l = b->matches.begin(); l != b->matches.end(); ++l)
+ std::swap(l->p1, l->p2); // Each match keeps its length; only the two offsets trade places.
+ std::swap(b->data1_size, b->data2_size);
+ std::swap(b->handle1, b->handle2);
+ b->matches.sort(compareMatchP2); // Re-sorts by the new p2, since the swapped offsets are not necessarily in order.
+}
+
+void bdelta_clean_matches(BDelta_Instance *b, unsigned flags) { // Walks the match list once, removing covered matches and (optionally) trimming overlaps; presumably expects the list ordered by p2 -- verify against callers.
+ // TODO: delete worse match when there's a conflict.
+ std::list<Match>::iterator place = b->matches.begin();
+ while (true) {
+ while (place != b->matches.begin() && place != b->matches.end() && prior(place)->p2 + prior(place)->num >= place->p2 + place->num) // Current match ends no later than its predecessor does.
+ place = b->matches.erase(place); // erase() returns the following element, so the same test is re-run on it.
+
+ if (place == b->matches.end())
+ break;
+
+ if (flags & BDELTA_REMOVE_OVERLAP)
+ if (place != b->matches.begin() && prior(place)->p2 + prior(place)->num > place->p2) { // Predecessor runs past the start of the current match.
+ prior(place)->num = place->p2 - prior(place)->p2; // Shorten the predecessor so it stops where the current match begins.
+ if (! prior(place)->num)
+ b->matches.erase(prior(place)); // A predecessor trimmed to zero length is dropped.
+ }
+ ++place;
+ }
+}
+
+void bdelta_showMatches(BDelta_Instance *b) { // Debug helper: prints every match to stdout as "(p1, p2, num), ".
+ for (std::list<Match>::iterator l = b->matches.begin(); l != b->matches.end(); ++l)
+ printf("(%d, %d, %d), ", l->p1, l->p2, l->num); // NOTE(review): %d is a signed specifier; if the Match fields are unsigned, %u would be the matching one -- confirm.
+ printf ("\n\n");
+}
+
+void get_unused_blocks(UnusedRange *unused, unsigned *numunusedptr) { // Rewrites an array of occupied spans, in place, into the gaps that precede each span.
+ unsigned &numunused = *numunusedptr; // Only read here; the count is never modified.
+
+ unsigned last = 0; // Highest end offset seen so far among the occupied spans.
+ std::list<Match>::iterator lastnext = unused[0].ml;
+ for (unsigned i = 0; i < numunused; ++i) {
+ unsigned nextstart = unused[i].p + unused[i].num; // End of the occupied span, computed before the entry is overwritten.
+
+ std::list<Match>::iterator mr = unused[i].ml;
+ unused[i] = UnusedRange(last, unused[i].p < last ? 0 : unused[i].p - last, lastnext, mr); // Gap from 'last' to this span's start; length 0 when the span begins before 'last'.
+ lastnext = next(mr); // The next gap's left iterator is the list element after this span's match.
-last = 0;
+ last = std::max(last, nextstart);
+ }
+}
+
+void bdelta_pass(BDelta_Instance *b, unsigned blocksize, unsigned minMatchSize, unsigned maxHoleSize, unsigned flags) {
+ // Trick for including the free range at the end.
+ b->matches.push_back(Match(b->data1_size, b->data2_size, 0));
+
+ UnusedRange *unused = new UnusedRange[b->matches.size() + 1],
+ *unused2 = new UnusedRange[b->matches.size() + 1];
+ unsigned numunused = 0, numunused2 = 0;
for (std::list<Match>::iterator l = b->matches.begin(); l != b->matches.end(); ++l) {
- if (l->p2 - last >= blocksize)
- findMatches(b, &h, last, l->p2, l);
- last = l->p2 + l->num;
+ unused[numunused++] = UnusedRange(l->p1, l->num, l, l);
+ unused2[numunused2++] = UnusedRange(l->p2, l->num, l, l);
+ }
+
+ std::sort(unused, unused + numunused, comparep);
+ //std::sort(unused2, unused2 + numunused2, comparep);
+
+ get_unused_blocks(unused, &numunused);
+
+ get_unused_blocks(unused2, &numunused2);
+ //std::sort(unused2, unused2 + numunused2, comparemrp2);
+
+ if (flags & BDELTA_GLOBAL)
+ bdelta_pass_2(b, blocksize, minMatchSize, unused, numunused, unused2, numunused2);
+ else {
+ std::sort(unused, unused + numunused, comparemrp2);
+ for (unsigned i = 0; i < numunused; ++i) {
+ UnusedRange u1 = unused[i], u2 = unused2[i];
+ if (u1.num >= blocksize && u2.num >= blocksize)
+ if (! maxHoleSize || (u1.num <= maxHoleSize && u2.num <= maxHoleSize))
+ if (! (flags & BDELTA_SIDES_ORDERED) || (next(u1.ml) == u1.mr))
+ bdelta_pass_2(b, blocksize, minMatchSize, &u1, 1, &u2, 1);
+ }
}
- if (b->data2_size - last >= blocksize)
- findMatches(b, &h, last, b->data2_size, b->matches.end());
- // printf("afterwards: %i, %i, %i\n", b->matches.first->next->obj->p1, b->matches.first->next->obj->p2, b->matches.first->next->obj->num);
+
+ if (verbose) printf("pass (blocksize: %d, matches: %zu)\n", blocksize, b->matches.size());
+
+ // Get rid of the dummy value we placed at the end.
+ b->matches.pop_back();
+
delete [] unused;
- delete [] h.htable;
- delete [] h.checksums;
-#ifdef DO_STATS_DEBUG
- printf("a = %.lli; b = %.lli\n", stata, statb);
-#endif
- // printf("Found %i matches\n", b->matches.size());
- return b->matches.size();
+ delete [] unused2;
}
+unsigned bdelta_numMatches(BDelta_Instance *b) { // Returns the number of entries currently in the instance's match list.
+ return b->matches.size();
+}
-void bdelta_getMatch(void *instance, unsigned matchNum,
+void bdelta_getMatch(BDelta_Instance *b, unsigned matchNum,
unsigned *p1, unsigned *p2, unsigned *num) {
- BDelta_Instance *b = (BDelta_Instance*)instance;
int &access_int = b->access_int;
std::list<Match>::iterator &accessplace = b->accessplace;
if (access_int == -1) {access_int = 0; accessplace = b->matches.begin();}
@@ -438,6 +458,6 @@ void bdelta_getMatch(void *instance, unsigned matchNum,
*num = accessplace->num;
}
-int bdelta_getError(void *instance) {
- return ((BDelta_Instance*)instance)->errorcode;
+int bdelta_getError(BDelta_Instance *instance) {
+ return instance->errorcode;
}

0 comments on commit fb0a916

Please sign in to comment.
Something went wrong with that request. Please try again.