Permalink
Fetching contributors…
Cannot retrieve contributors at this time
839 lines (741 sloc) 20.3 KB
/*
* Copyright (C) 2011 Joseph Adams <joeyadams3.14159@gmail.com>
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include "bdelta.h"
#include <assert.h>
#include <limits.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
typedef struct
{
unsigned char *cur; /* End of string; insertion point for new bytes */
unsigned char *end; /* End of buffer */
unsigned char *start; /* Beginning of string */
} SB;
/* sb is evaluated multiple times in these macros. */
#define sb_size(sb) ((size_t)((sb)->cur - (sb)->start))
#define sb_avail(sb) ((size_t)((sb)->end - (sb)->cur))
/* sb and need may be evaluated multiple times. */
#define sb_need(sb, need) do { \
if (sb_avail(sb) < (need)) \
if (sb_grow(sb, need) != 0) \
goto out_of_memory; \
} while (0)
static int sb_init(SB *sb)
{
sb->start = malloc(17);
if (sb->start == NULL)
return -1;
sb->cur = sb->start;
sb->end = sb->start + 16;
return 0;
}
static int sb_grow(SB *sb, size_t need)
{
size_t length = sb->cur - sb->start;
size_t alloc = sb->end - sb->start;
unsigned char *tmp;
do {
alloc *= 2;
} while (alloc < length + need);
tmp = realloc(sb->start, alloc + 1);
if (tmp == NULL)
return -1;
sb->start = tmp;
sb->cur = tmp + length;
sb->end = tmp + alloc;
return 0;
}
static int sb_putc(SB *sb, unsigned char c)
{
sb_need(sb, 1);
*sb->cur++ = c;
return 0;
out_of_memory:
return -1;
}
static int sb_write(SB *sb, const void *data, size_t size)
{
sb_need(sb, size);
memcpy(sb->cur, data, size);
sb->cur += size;
return 0;
out_of_memory:
return -1;
}
static void sb_return(SB *sb, void **data_out, size_t *length_out)
{
*sb->cur = 0;
if (data_out)
*data_out = sb->start;
else
free(sb->start);
if (length_out)
*length_out = sb->cur - sb->start;
}
static void sb_discard(SB *sb, void **data_out, size_t *length_out)
{
free(sb->start);
if (data_out)
*data_out = NULL;
if (length_out)
*length_out = 0;
}
/*
* The first byte in a patch is the "patch type", which indicates how the
* patch is formatted. This keeps the patch format flexible while retaining
* backward compatibility. Patches produced with an older version of
* the library can be applied with a newer version.
*
* PT_LITERAL
* Contains nothing more than the content of the new text.
*
* PT_CSI32
* A string of copy, skip, and insert instructions for generating the new
* string from the old.
*
* copy(size): Copy @size bytes from old to new.
* skip(size): Skip @size bytes of old.
* insert(text): Insert @text into new.
*
* The syntax is as follows:
*
* copy: instruction_byte(1) size
* skip: instruction_byte(2) size
* insert: instruction_byte(3) size $size*OCTET
*
* 0 <= size_param_length <= 4
* instruction_byte(op) = op | size_param_length << 2
* size: $size_param_length*OCTET
* -- size is an unsigned integer encoded in big endian.
* -- However, if size_param_length is 0, the operation size is 1.
*
* Simply put, an instruction starts with an opcode and operation size.
* An insert instruction is followed by the bytes to be inserted.
*/
#define PT_LITERAL 10
#define PT_CSI32 11
#define OP_COPY 1
#define OP_SKIP 2
#define OP_INSERT 3
static unsigned int bytes_needed_for_size(uint32_t size)
{
if (size == 1)
return 0;
else if (size <= 0xFF)
return 1;
else if (size <= 0xFFFF)
return 2;
else if (size <= 0xFFFFFF)
return 3;
else
return 4;
}
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode csi32_emit_op(SB *patch_out, int op, uint32_t size, const char **new_)
{
unsigned int i;
unsigned int size_param_length;
size_t need;
uint32_t tmp;
assert(op >= 1 && op <= 3);
if (size == 0)
return BDELTA_OK;
size_param_length = bytes_needed_for_size(size);
need = 1 + size_param_length;
if (op == OP_INSERT)
need += size;
sb_need(patch_out, need);
*patch_out->cur++ = (unsigned int)op | size_param_length << 2;
for (i = size_param_length, tmp = size; i-- > 0; tmp >>= 8)
patch_out->cur[i] = tmp & 0xFF;
patch_out->cur += size_param_length;
switch (op) {
case OP_COPY:
*new_ += size;
break;
case OP_SKIP:
break;
case OP_INSERT:
memcpy(patch_out->cur, *new_, size);
patch_out->cur += size;
*new_ += size;
break;
default:
assert(0);
}
return BDELTA_OK;
out_of_memory:
return BDELTA_MEMORY;
}
/*
* On success, returns 1, advances *sp past the parsed text, and sets *op_out and *size_out.
* On error or EOF, returns 0.
*/
static int csi32_parse_op(
const unsigned char **sp, const unsigned char *e,
int *op_out, uint32_t *size_out)
{
const unsigned char *s = *sp;
int op;
unsigned int i;
unsigned int size_param_length;
uint32_t size;
if (s >= e)
return 0;
op = *s & 3;
size_param_length = *s >> 2;
s++;
if (op == 0 || size_param_length > 4)
return 0;
if (size_param_length == 0) {
size = 1;
} else {
if ((size_t)(e - s) < size_param_length)
return 0;
size = 0;
for (i = 0; i < size_param_length; i++) {
size <<= 8;
size |= *s++ & 0xFF;
}
}
/* Make sure insert data fits in the patch, but don't consume it. */
if (op == OP_INSERT && (size_t)(e - s) < size)
return 0;
*op_out = op;
*size_out = size;
*sp = s;
return 1;
}
/*
* bdelta uses the algorithm described in:
*
* Myers, E. (1986). An O(ND) Difference Algorithm and Its Variations.
* Retrieved from http://www.xmailserver.org/diff2.pdf
*
* The pseudocode in Myers' paper (Figure 2) uses an array called V,
* where (V[k], V[k] - k) is the endpoint of the furthest-reaching
* D-path ending on diagonal k.
*
* The structure below holds the V array for every iteration of the outer loop.
* Because each iteration produces D+1 values, a triangle is formed:
*
* k
* -5 -4 -3 -2 -1 0 1 2 3 4 5
* ----------------------------------
* 0 | 0 (copy 0)
* | \ skip 1
* 1 | 0 1
* | \ skip 1, then copy 1
* 2 | 2 2 3
* D | / insert 1, then copy 2
* 3 | 3 4 5 5
* | \ skip 1, then copy 1
* 4 | 3 4 5 7 7
* | / insert 1
* 5 | 3 4 5 7 - -
*
* @data will literally contain: 0 0 1 2 2 3 3 4 5 5 3 4 5 7 7 3 4 5 7
*
* To convert this to an edit script, we first climb back to the top,
* using the same procedure as was used when the triangle was generated:
*
* If k = -D, climb right (the only way we can go).
* If k = +D, climb left (the only way we can go).
* Otherwise, favor the greater number.
* If the numbers are the same, climb left.
*
* Finally, we convert the descent to the solution to a patch script:
*
* The top number n corresponds to:
* copy n
*
* A descent left from a to b corresponds to:
* insert 1
* copy b-a
*
* A descent right from a to b corresponds to:
* skip 1
* copy b-a-1
*/
typedef struct
{
uint32_t *data;
int solution_d;
int solution_k;
uint32_t *solution_ptr;
} Triangle;
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
* BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
*/
static BDELTAcode build_triangle(
const char *old, uint32_t old_size,
const char *new_, uint32_t new_size,
int d_max,
Triangle *triangle_out)
{
int d, k;
uint32_t x, y;
uint32_t *data;
uint32_t *vprev; /* position within previous row */
uint32_t *v; /* position within current row */
uint32_t *vcur; /* beginning of current row */
size_t data_alloc = 16;
memset(triangle_out, 0, sizeof(*triangle_out));
data = malloc(data_alloc * sizeof(*data));
if (data == NULL)
return BDELTA_MEMORY;
/* Allow dmax < 0 to mean "no limit". */
if (d_max < 0)
d_max = old_size + new_size;
/*
* Compute the farthest-reaching 0-path so the loop after this
* will have a "previous" row to start with.
*/
for (x = 0; x < old_size && x < new_size && old[x] == new_[x]; )
x++;
*data = x;
if (x >= old_size && x >= new_size) {
/* Strings are equal, so return a triangle with one row (a dot). */
assert(x == old_size && x == new_size);
triangle_out->data = data;
triangle_out->solution_d = 0;
triangle_out->solution_k = 0;
triangle_out->solution_ptr = data;
return BDELTA_OK;
}
vprev = data;
vcur = v = data + 1;
/*
* Here is the core of the Myers diff algorithm.
*
* This is a direct translation of the pseudocode in Myers' paper,
* with implementation-specific adaptations:
*
* * Every V array is preserved per iteration of the outer loop.
* This is necessary so we can determine the actual patch, not just
* the length of the shortest edit string. See the coment above
* the definition of Triangle for an in-depth explanation.
*
* * Array items are stored consecutively so as to not waste space.
*
* * The buffer holding the V arrays is expanded dynamically.
*/
for (d = 1; d <= d_max; d++, vprev = vcur, vcur = v) {
/* Ensure that the buffer has enough room for this row. */
if ((size_t)(v - data + d + 1) > data_alloc) {
size_t vprev_idx = vprev - data;
size_t v_idx = v - data;
size_t vcur_idx = vcur - data;
uint32_t *tmp;
do {
data_alloc *= 2;
} while ((size_t)(v - data + d + 1) > data_alloc);
tmp = realloc(data, data_alloc * sizeof(*data));
if (tmp == NULL) {
free(data);
return BDELTA_MEMORY;
}
data = tmp;
/* Relocate pointers to the buffer we just expanded. */
vprev = data + vprev_idx;
v = data + v_idx;
vcur = data + vcur_idx;
}
for (k = -d; k <= d; k += 2, vprev++) {
if (k == -d || (k != d && vprev[-1] < vprev[0]))
x = vprev[0];
else
x = vprev[-1] + 1;
y = x - k;
while (x < old_size && y < new_size && old[x] == new_[y])
x++, y++;
*v++ = x;
if (x >= old_size && y >= new_size) {
/* Shortest edit string found. */
assert(x == old_size && y == new_size);
triangle_out->data = data;
triangle_out->solution_d = d;
triangle_out->solution_k = k;
triangle_out->solution_ptr = v - 1;
return BDELTA_OK;
}
}
}
free(data);
return BDELTA_INTERNAL_DMAX_EXCEEDED;
}
/*
* Trace a solution back to the top, returning a string of instructions
* for descending from the top to the solution.
*
* An instruction is one of the following:
*
* -1: Descend left.
* +1: Descend right.
* 0: Finished. You should be at the solution now.
*
* If memory allocation fails, this function will return NULL.
*/
static signed char *climb_triangle(const Triangle *triangle)
{
signed char *descent;
int d, k;
uint32_t *p;
assert(triangle->solution_d >= 0);
descent = malloc(triangle->solution_d + 1);
if (descent == NULL)
return NULL;
d = triangle->solution_d;
k = triangle->solution_k;
p = triangle->solution_ptr;
descent[d] = 0;
while (d > 0) {
if (k == -d || (k != d && *(p-d-1) < *(p-d))) {
/* Climb right */
k++;
p = p - d;
descent[--d] = -1;
} else {
/* Climb left */
k--;
p = p - d - 1;
descent[--d] = 1;
}
}
return descent;
}
/*
* Generate the actual patch, given data produced by build_triangle and
* climb_triangle. new_ is needed for the content of the insertions.
*
* See the comment above the definition of Triangle. It concisely documents
* how a descent down the triangle corresponds to a patch script.
*
* The resulting patch, including the patch type byte, is appended to patch_out.
*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode descent_to_patch(
const signed char *descent,
const Triangle *triangle,
const char *new_, uint32_t new_size,
SB *patch_out)
{
const char *new_end = new_ + new_size;
uint32_t *p = triangle->data;
uint32_t *p2;
int d = 0;
int k = 0;
int pending_op = 0;
int current_op;
uint32_t pending_length = 0;
uint32_t copy_length;
if (sb_putc(patch_out, PT_CSI32) != 0)
return BDELTA_MEMORY;
if (*p > 0) {
if (csi32_emit_op(patch_out, OP_COPY, *p, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
for (; *descent != 0; descent++, p = p2) {
if (*descent < 0) {
/* Descend left. */
d++;
k--;
p2 = p + d;
current_op = OP_INSERT;
assert(*p2 >= *p);
copy_length = *p2 - *p;
} else {
/* Descend right. */
d++;
k++;
p2 = p + d + 1;
current_op = OP_SKIP;
assert(*p2 > *p);
copy_length = *p2 - *p - 1;
}
if (pending_op == current_op) {
pending_length++;
} else {
if (pending_op != 0) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
pending_op = current_op;
pending_length = 1;
}
if (copy_length > 0) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
pending_op = 0;
if (csi32_emit_op(patch_out, OP_COPY, copy_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
}
assert(d == triangle->solution_d);
assert(k == triangle->solution_k);
assert(p == triangle->solution_ptr);
/* Emit the last pending op, unless it's a skip. */
if (pending_op != 0 && pending_op != OP_SKIP) {
if (csi32_emit_op(patch_out, pending_op, pending_length, &new_) != BDELTA_OK)
return BDELTA_MEMORY;
}
assert(new_ == new_end);
return BDELTA_OK;
}
/*
* Generate a patch using Myers' O(ND) algorithm.
*
* The patch is appended to @patch_out, which must be initialized before calling.
*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_MEMORY: Memory allocation failed
* BDELTA_INTERNAL_INPUTS_TOO_LARGE: Input sizes are too large
* BDELTA_INTERNAL_DMAX_EXCEEDED: d_max exceeded (strings are too different)
*/
static BDELTAcode diff_myers(
const char *old, size_t old_size,
const char *new_, size_t new_size,
SB *patch_out)
{
Triangle triangle;
signed char *descent;
BDELTAcode rc;
/* Make sure old_size + new_size does not overflow int or uint32_t. */
if (old_size >= UINT32_MAX ||
new_size >= UINT32_MAX - old_size ||
old_size >= (unsigned int)INT_MAX ||
new_size >= (unsigned int)INT_MAX - old_size)
return BDELTA_INTERNAL_INPUTS_TOO_LARGE;
rc = build_triangle(old, old_size, new_, new_size, 1000, &triangle);
if (rc != BDELTA_OK)
return rc;
descent = climb_triangle(&triangle);
if (descent == NULL)
goto oom1;
if (descent_to_patch(descent, &triangle, new_, new_size, patch_out) != BDELTA_OK)
goto oom2;
free(descent);
free(triangle.data);
return BDELTA_OK;
oom2:
free(descent);
oom1:
free(triangle.data);
return BDELTA_MEMORY;
}
BDELTAcode bdelta_diff(
const void *old, size_t old_size,
const void *new_, size_t new_size,
void **patch_out, size_t *patch_size_out)
{
SB patch;
if (sb_init(&patch) != 0)
goto out_of_memory;
if (new_size == 0)
goto emit_new_literally;
if (diff_myers(old, old_size, new_, new_size, &patch) != BDELTA_OK)
goto emit_new_literally;
if (sb_size(&patch) > new_size) {
/*
* A literal copy of new is no longer than this patch.
* All that for nothing.
*/
goto emit_new_literally;
}
/*
* Verify that patch, when applied to old, produces the correct text.
* If it doesn't, it's a bug, but fall back to a simple emit
* to avert data corruption.
*/
{
void *result;
size_t result_size;
BDELTAcode rc;
int correct;
rc = bdelta_patch(
old, old_size,
patch.start, patch.cur - patch.start,
&result, &result_size
);
switch (rc) {
case BDELTA_OK:
correct = (result_size == new_size &&
memcmp(result, new_, new_size) == 0);
free(result);
break;
case BDELTA_MEMORY:
goto out_of_memory;
default:
correct = 0;
break;
}
if (!correct) {
assert(0);
goto emit_new_literally;
}
}
sb_return(&patch, patch_out, patch_size_out);
return BDELTA_OK;
emit_new_literally:
if (patch.cur != patch.start) {
free(patch.start);
if (sb_init(&patch) != 0)
goto out_of_memory;
}
if (sb_putc(&patch, PT_LITERAL) != 0 || sb_write(&patch, new_, new_size) != 0)
goto out_of_memory;
sb_return(&patch, patch_out, patch_size_out);
return BDELTA_OK;
out_of_memory:
sb_discard(&patch, patch_out, patch_size_out);
return BDELTA_MEMORY;
}
/*
* Return values:
*
* BDELTA_OK: Success
* BDELTA_PATCH_INVALID: Patch is malformed
* BDELTA_PATCH_MISMATCH: Old string is too small
* BDELTA_MEMORY: Memory allocation failed
*/
static BDELTAcode patch_csi32(
const unsigned char *o, const unsigned char *oe,
const unsigned char *p, const unsigned char *pe,
SB *new_out)
{
int op;
uint32_t size;
while (csi32_parse_op(&p, pe, &op, &size)) {
if ((op == OP_COPY || op == OP_SKIP) && (size_t)(oe - o) < size) {
/* Copy or skip instruction exceeds length of old string. */
return BDELTA_PATCH_MISMATCH;
}
if (op == OP_COPY || op == OP_INSERT)
sb_need(new_out, size);
switch (op) {
case OP_COPY: /* Copy @size bytes from old string. */
memcpy(new_out->cur, o, size);
new_out->cur += size;
o += size;
break;
case OP_SKIP: /* Skip @size bytes of old string. */
o += size;
break;
case OP_INSERT: /* Insert @size new bytes (from the patch script). */
memcpy(new_out->cur, p, size);
new_out->cur += size;
p += size;
break;
default:
assert(0);
}
}
if (p != pe)
return BDELTA_PATCH_INVALID;
return BDELTA_OK;
out_of_memory:
return BDELTA_MEMORY;
}
BDELTAcode bdelta_patch(
const void *old, size_t old_size,
const void *patch, size_t patch_size,
void **new_out, size_t *new_size_out)
{
const unsigned char *o = old;
const unsigned char *oe = o + old_size;
const unsigned char *p = patch;
const unsigned char *pe = p + patch_size;
SB result;
BDELTAcode rc;
if (sb_init(&result) != 0) {
rc = BDELTA_MEMORY;
goto discard;
}
if (p >= pe) {
rc = BDELTA_PATCH_INVALID;
goto discard;
}
switch (*p++) {
case PT_LITERAL:
if (sb_write(&result, p, pe - p) != 0) {
rc = BDELTA_MEMORY;
goto discard;
}
break;
case PT_CSI32:
rc = patch_csi32(o, oe, p, pe, &result);
if (rc != BDELTA_OK)
goto discard;
break;
default:
rc = BDELTA_PATCH_INVALID;
goto discard;
}
sb_return(&result, new_out, new_size_out);
return BDELTA_OK;
discard:
sb_discard(&result, new_out, new_size_out);
return rc;
}
const char *bdelta_strerror(BDELTAcode code)
{
switch (code) {
case BDELTA_OK:
return "Success";
case BDELTA_MEMORY:
return "Could not allocate memory";
case BDELTA_PATCH_INVALID:
return "Patch is invalid";
case BDELTA_PATCH_MISMATCH:
return "Patch applied to wrong data";
case BDELTA_INTERNAL_DMAX_EXCEEDED:
return "Difference threshold exceeded (internal error)";
case BDELTA_INTERNAL_INPUTS_TOO_LARGE:
return "Inputs are too large (internal error)";
default:
return "Invalid error code";
}
}
void bdelta_perror(const char *s, BDELTAcode code)
{
if (s != NULL && *s != '\0')
fprintf(stderr, "%s: %s\n", s, bdelta_strerror(code));
else
fprintf(stderr, "%s\n", bdelta_strerror(code));
}