Skip to content

Commit

Permalink
Merge pull request #24 from martinal/martinal/topic-refactor-diff-algorithms
Browse files Browse the repository at this point in the history

Refactor diff algorithms
  • Loading branch information
Martin Sandve Alnæs committed Mar 8, 2016
2 parents d20e604 + 7164a48 commit c261edd
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 80 deletions.
39 changes: 23 additions & 16 deletions nbdime/diffing/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,18 @@ def is_atomic(x):
return not isinstance(x, (string_types, list, dict))


def diff(a, b, compare=operator.__eq__):
def diff(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
"Compute the diff of two json-like objects, list or dict or string."
# TODO: Providing separate comparison predicate for
# different dict paths will allow more customization

if isinstance(a, list) and isinstance(b, list):
d = diff_lists(a, b, compare=compare)
d = diff_lists(a, b, path=path, compare=compare, predicates=predicates, differs=differs)
elif isinstance(a, dict) and isinstance(b, dict):
d = diff_dicts(a, b, compare=compare)
d = diff_dicts(a, b, path=path, compare=compare, predicates=predicates, differs=differs)
elif isinstance(a, string_types) and isinstance(b, string_types):
d = diff_strings(a, b)
# FIXME: Do we need this string case, and if so do we need to pass on these additional arguments?
d = diff_strings(a, b) #, path=path, predicates=predicates, differs=differs)
else:
raise RuntimeError("Can currently only diff list, dict, or str objects.")

Expand All @@ -42,18 +43,26 @@ def diff(a, b, compare=operator.__eq__):
return d


def diff_lists(a, b, compare=operator.__eq__, shallow_diff=None):
# First make the one-level list diff with custom compare,
def diff_lists(a, b, path="", compare=operator.__eq__, predicates={}, differs={}, shallow_diff=None):

# Keeping compare a valid kwargs for simplicity and to avoid rewriting tests right now
if path in predicates and compare is not operator.__eq__:
raise RuntimeError("Please don't pass compare and predicates at the same time.")

# First make a shallow sequence diff with custom compare,
# unless it's provided for us
if shallow_diff is None:
shallow_diff = diff_sequence(a, b, compare)
shallow_diff = diff_sequence(a, b, predicates.get(path, compare))

# Count consumed items from a, "take" in patch_list
acons = 0
bcons = 0

di = SequenceDiff()

subpath = "/".join((path, "*"))
diffit = differs.get(subpath, diff)

M = len(shallow_diff)
for ie in range(M+1):
if ie < M:
Expand All @@ -77,7 +86,7 @@ def diff_lists(a, b, compare=operator.__eq__, shallow_diff=None):
aval = a[acons+i]
bval = b[bcons+i]
if not is_atomic(aval):
dd = diff(aval, bval, compare=compare)
dd = diffit(aval, bval, path=subpath, compare=compare, predicates=predicates, differs=differs)
if dd:
di.patch(acons+i, dd) # FIXME: Not covered in tests, create test situation

Expand All @@ -96,7 +105,7 @@ def diff_lists(a, b, compare=operator.__eq__, shallow_diff=None):
return di.diff # XXX


def diff_dicts(a, b, compare=operator.__eq__, subdiffs=None):
def diff_dicts(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
"""Compute diff of two dicts with configurable behaviour.
Keys in both a and b will be handled based on
Expand All @@ -107,9 +116,6 @@ def diff_dicts(a, b, compare=operator.__eq__, subdiffs=None):
Items not mentioned in diff are items where compare(x, y) return True.
For other items the diff will contain delete, insert, or replace entries.
"""
if subdiffs is None:
subdiffs = {}

assert isinstance(a, dict) and isinstance(b, dict)
akeys = set(a.keys())
bkeys = set(b.keys())
Expand All @@ -126,13 +132,14 @@ def diff_dicts(a, b, compare=operator.__eq__, subdiffs=None):
bvalue = b[key]
# If types are the same and nonatomic, recurse
if type(avalue) == type(bvalue) and not is_atomic(avalue):
diffit = subdiffs.get(key, diff)
dd = diffit(avalue, bvalue, compare=compare)
subpath = "/".join((path, key))
diffit = differs.get(subpath, diff)
dd = diffit(avalue, bvalue, path=subpath, compare=compare, predicates=predicates, differs=differs)
if dd:
di.patch(key, dd)
else:
#compareit = compare.get(key, operator.__eq__) # TODO: Do like this?
if not compare(avalue, bvalue): # TODO: Use != or not compare() here?
compareit = predicates.get(path, compare)
if not compareit(avalue, bvalue): # TODO: Use != or not compare() here?
di.replace(key, bvalue)

for key in sorted(bkeys - akeys):
Expand Down
91 changes: 37 additions & 54 deletions nbdime/diffing/notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,34 +25,6 @@
__all__ = ["diff_notebooks"]


def __unused__notebook_diff_data():
# TODO: It might be possible to encode the below
# functions more compactly with collections
# of predicate functions and generalize.
# Basically, sequence diffs should be applied with multilevel
# algorithm for paths with more than one predicate,
# and using operator.__eq__ if no match in there.
predicates = {
"/cells": [
compare_cell_source_approximate,
compare_cell_source_exact,
compare_cell_source_and_outputs,
],
"/cells/#/outputs": [
compare_output_data_keys,
compare_output_data,
]
}
diff_algorithms = {
"/": diff_notebooks,
"/cells": diff_cells,
"/cells/#": diff_single_cells,
"/cells/#/source": diff_source,
"/cells/#/outputs": diff_outputs,
"/cells/#/outputs/#": diff_single_outputs,
}


def compare_cell_source_approximate(x, y):
"Compare source of cells x,y with approximate heuristics."
# Cell types must match
Expand Down Expand Up @@ -161,8 +133,9 @@ def compare_output_data(x, y):
return True


def diff_single_outputs(a, b, compare="ignored"):
def __unused_diff_single_outputs(a, b, compare="ignored", path="/cells/*/output/*"):
"Diff a pair of output cells."
assert path == "/cells/*/outputs/*"
# TODO: Handle output diffing with plugins? I.e. image diff, svg diff, json diff, etc.
# FIXME: Use linebased diff of some types of outputs:
# if a.output_type in ("execute_result", "display_data"):
Expand All @@ -171,38 +144,48 @@ def diff_single_outputs(a, b, compare="ignored"):
# a.text
return diff(a, b)


def diff_outputs(a, b, compare="ignored"):
"Diff a pair of lists of outputs from within a single cell."
predicates = [compare_output_data_keys,
compare_output_data]
return diff_sequence_multilevel(a, b, predicates, diff_single_outputs)


def diff_source(a, b, compare="ignored"):
def __unused_diff_source(a, b, path, compare, predicates, differs):
"Diff a pair of sources."
assert path == "/cells/*/source"
# FIXME: Make sure we use linebased diff of sources
# TODO: Use google-diff-patch-match library to diff the sources?
return diff(a, b)


def diff_single_cells(a, b):
return diff_dicts(a, b, subdiffs={"source": diff_source, "outputs": diff_outputs})


def diff_cells(a, b, compare="ignored"):
"Diff cell lists a and b. Argument compare is ignored."
# Old alternative implementation:
# shallow_diff = diff_sequence(a, b, compare_cell_source_and_outputs)
# return diff_lists(a, b, compare=operator.__eq__, shallow_diff=shallow_diff)

# Sequence diffs should be applied with multilevel
# algorithm for paths with more than one predicate,
# and using operator.__eq__ if no match in there.
notebook_predicates = {
# Predicates to compare cells in order of low-to-high precedence
predicates = [compare_cell_source_approximate,
compare_cell_source_exact,
compare_cell_source_and_outputs]
return diff_sequence_multilevel(a, b, predicates, diff_single_cells)
"/cells": [
compare_cell_source_approximate,
compare_cell_source_exact,
compare_cell_source_and_outputs,
],
# Predicates to compare output cells (within one cell) in order of low-to-high precedence
"/cells/*/outputs": [
compare_output_data_keys,
compare_output_data,
]
}


# Registry of custom diff functions keyed by notebook path, where "*" stands
# for "any item of the list at this level". Recursive diffing of substructures
# should pick a rule from here, with diff as fallback when a path has no entry.
#
# Candidate entries that are currently left to the fallback:
#   "/cells/*": diff
#   "/cells/*/source": diff
#   "/cells/*/outputs/*": diff_single_outputs
notebook_differs = dict.fromkeys(
    ("/cells", "/cells/*/outputs"),
    diff_sequence_multilevel,
)


def diff_cells(a, b):
    """Diff the cell lists a and b with the differ registered for "/cells".

    Looks up the "/cells" entry of notebook_differs and calls it with the
    notebook predicates and differs, so nested items are diffed with the
    same rules. This is currently just used by some tests.
    """
    cells_path = "/cells"
    cell_list_differ = notebook_differs[cells_path]
    return cell_list_differ(
        a,
        b,
        path=cells_path,
        predicates=notebook_predicates,
        differs=notebook_differs,
    )


def diff_notebooks(a, b):
"""Compute the diff of two notebooks."""
return diff_dicts(a, b, subdiffs={"cells": diff_cells})
"""Compute the diff of two notebooks using customized heuristics and diff rules."""
return diff(a, b, path="", predicates=notebook_predicates, differs=notebook_differs)
26 changes: 16 additions & 10 deletions nbdime/diffing/snakes.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
Up- and down-conversion is handled by nbformat.
"""

import operator
from ..diff_format import SequenceDiff
from .seq_bruteforce import bruteforce_compute_snakes
from .generic import diff
Expand All @@ -33,12 +34,12 @@ def compute_snakes(A, B, rect, compare):
return snakes


def compute_snakes_multilevel(A, B, rect, predicates, level):
def compute_snakes_multilevel(A, B, rect, compares, level):
"""Compute snakes using a multilevel multi-predicate algorithm.
TODO: Document this algorithm.
"""
compare = predicates[level]
compare = compares[level]
snakes = compute_snakes(A, B, rect, compare)
if level == 0:
return snakes
Expand All @@ -47,7 +48,7 @@ def compute_snakes_multilevel(A, B, rect, predicates, level):
for snake in snakes + [(i1, j1, 0)]:
i, j, n = snake
if i > i0 and j > j0:
newsnakes += compute_snakes_multilevel(A, B, (i0, j0, i, j), predicates, level-1)
newsnakes += compute_snakes_multilevel(A, B, (i0, j0, i, j), compares, level-1)
if n > 0:
if newsnakes[-1][0] == i and newsnakes[-1][1] == j:
snake = newsnakes[-1]
Expand All @@ -61,8 +62,12 @@ def compute_snakes_multilevel(A, B, rect, predicates, level):
return newsnakes


def compute_diff_from_snakes(a, b, snakes, diff_single_item=diff):
# Compute diff from snakes
def compute_diff_from_snakes(a, b, snakes, path="", differs={}):
"Compute diff from snakes."

subpath = path + "/*"
diffit = differs.get(subpath, diff)

di = SequenceDiff()
i0, j0, i1, j1 = 0, 0, len(a), len(b)
for i, j, n in snakes + [(i1, j1, 0)]:
Expand All @@ -71,17 +76,18 @@ def compute_diff_from_snakes(a, b, snakes, diff_single_item=diff):
if j > j0:
di.add(i0, b[j0:j])
for k in range(n):
cd = diff_single_item(a[i + k], b[j + k])
cd = diffit(a[i + k], b[j + k], path=subpath)
if cd:
di.patch(i+k, cd)
# Update corner offsets for next rectangle
i0, j0 = i+n, j+n
return di.diff # XXX


def diff_sequence_multilevel(a, b, predicates, subdiff=diff):
def diff_sequence_multilevel(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
# Invoke multilevel snake computation algorithm
level = len(predicates) - 1
compares = predicates.get(path, [compare])
level = len(compares) - 1
rect = (0, 0, len(a), len(b))
snakes = compute_snakes_multilevel(a, b, rect, predicates, level)
return compute_diff_from_snakes(a, b, snakes, subdiff)
snakes = compute_snakes_multilevel(a, b, rect, compares, level)
return compute_diff_from_snakes(a, b, snakes, path=path, differs=differs)

0 comments on commit c261edd

Please sign in to comment.