Skip to content

Commit

Permalink
Merge pull request #27 from martinal/martinal/topic-refactor-diffing
Browse files Browse the repository at this point in the history
Generalizing and streamlining the previously special cased diff algorithms for notebook parts.
  • Loading branch information
Martin Sandve Alnæs committed Mar 8, 2016
2 parents 5af7f14 + 0a8e244 commit fbcb81f
Show file tree
Hide file tree
Showing 5 changed files with 145 additions and 81 deletions.
116 changes: 80 additions & 36 deletions nbdime/diffing/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,42 @@
from six import string_types
from six.moves import xrange as range
import operator
from collections import defaultdict

from ..diff_format import validate_diff, count_consumed_symbols
from ..diff_format import SequenceDiff, MappingDiff

from .sequences import diff_strings, diff_sequence
from .snakes import compute_snakes_multilevel, compute_diff_from_snakes

__all__ = ["diff"]


def is_atomic(x):
"Return True for values that diff should treat as a single atomic value."
return not isinstance(x, (string_types, list, dict))
return not isinstance(x, (string_types) + (list, dict))


def diff(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
def default_predicates():
return defaultdict(lambda: (operator.__eq__,))


def default_differs():
return defaultdict(lambda: diff)


def diff(a, b, path="", predicates=None, differs=None):
"Compute the diff of two json-like objects, list or dict or string."
# TODO: Providing separate comparison predicate for
# different dict paths will allow more customization

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()

if isinstance(a, list) and isinstance(b, list):
d = diff_lists(a, b, path=path, compare=compare, predicates=predicates, differs=differs)
d = diff_lists(a, b, path=path, predicates=predicates, differs=differs)
elif isinstance(a, dict) and isinstance(b, dict):
d = diff_dicts(a, b, path=path, compare=compare, predicates=predicates, differs=differs)
d = diff_dicts(a, b, path=path, predicates=predicates, differs=differs)
elif isinstance(a, string_types) and isinstance(b, string_types):
# FIXME: Do we need this string case, and if so do we need to pass on these additional arguments?
d = diff_strings(a, b) #, path=path, predicates=predicates, differs=differs)
Expand All @@ -43,69 +56,93 @@ def diff(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
return d


def diff_lists(a, b, path="", compare=operator.__eq__, predicates={}, differs={}, shallow_diff=None):
def diff_sequence_multilevel(a, b, path="", predicates=None, differs=None):
"""Compute diff of two lists with configurable behaviour."""

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()

# Invoke multilevel snake computation algorithm
compares = predicates[path]
snakes = compute_snakes_multilevel(a, b, compares)

# Convert snakes to diff
return compute_diff_from_snakes(a, b, snakes, path=path, predicates=predicates, differs=differs)

# Keeping compare a valid kwargs for simplicity and to avoid rewriting tests right now
if path in predicates and compare is not operator.__eq__:
raise RuntimeError("Please don't pass compare and predicates at the same time.")

def diff_lists(a, b, path="", predicates=None, differs=None, shallow_diff=None):
"""Compute diff of two lists with configurable behaviour."""

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()

# If multiple compares are provided to this path, delegate to multilevel algorithm
compares = predicates[path]
if len(compares) > 1:
assert shallow_diff is None
return diff_sequence_multilevel(a, b, path=path, predicates=predicates, differs=differs)

# First make a shallow sequence diff with custom compare,
# unless it's provided for us
if shallow_diff is None:
shallow_diff = diff_sequence(a, b, predicates.get(path, compare))

# Count consumed items from a, "take" in patch_list
acons = 0
bcons = 0

di = SequenceDiff()
shallow_diff = diff_sequence(a, b, compares[0])

# Next we recurse to diff items in sequence that are considered
# similar by compares[0] in the loop below
subpath = "/".join((path, "*"))
diffit = differs.get(subpath, diff)

# Count consumed items i,j from a,b, (i="take" in patch_list)
i, j = 0, 0
di = SequenceDiff()
M = len(shallow_diff)
for ie in range(M+1):
if ie < M:
# Consume n more unmentioned items before this diff entry
# Note that index can be larger than acons in the case where items
# Note that index can be larger than i in the case where items
# have been deleted from a and then insertions from b occur.
e = shallow_diff[ie]
index = e.key
n = max(0, index - acons)
n = max(0, index - i)
askip, bskip = count_consumed_symbols(e)
else:
# Consume final items after the last diff entry
e = None
n = len(a) - acons
n = len(a) - i
askip, bskip = 0, 0
assert n >= 0
assert len(b) - bcons == n
assert len(b) - j == n

# Recursively diff the n items that have been deemed similar
for i in range(n):
aval = a[acons+i]
bval = b[bcons+i]
for k in range(n):
aval = a[i + k]
bval = b[j + k]
if not is_atomic(aval):
dd = diffit(aval, bval, path=subpath, compare=compare, predicates=predicates, differs=differs)
if dd:
di.patch(acons+i, dd) # FIXME: Not covered in tests, create test situation
cd = diffit(aval, bval, path=subpath, predicates=predicates, differs=differs)
if cd:
di.patch(i + k, cd) # FIXME: Not covered in tests, create test situation

# Keep count of consumed items
acons += n + askip
bcons += n + bskip
i += n + askip
j += n + bskip

# Insert the diff entry unless past the end
# Insert the diff entry from shallow diff unless past the end
# (this either adds or removes items)
if ie < M:
di.append(e)

# Sanity check
assert acons == len(a)
assert bcons == len(b)
assert i == len(a)
assert j == len(b)

return di.diff # XXX


def diff_dicts(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
def diff_dicts(a, b, path="", predicates=None, differs=None):
"""Compute diff of two dicts with configurable behaviour.
Keys in both a and b will be handled based on
Expand All @@ -116,6 +153,11 @@ def diff_dicts(a, b, path="", compare=operator.__eq__, predicates={}, differs={}
Items not mentioned in diff are items where compare(x, y) return True.
For other items the diff will contain delete, insert, or replace entries.
"""
if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()

assert isinstance(a, dict) and isinstance(b, dict)
akeys = set(a.keys())
bkeys = set(b.keys())
Expand All @@ -134,12 +176,14 @@ def diff_dicts(a, b, path="", compare=operator.__eq__, predicates={}, differs={}
if type(avalue) == type(bvalue) and not is_atomic(avalue):
subpath = "/".join((path, key))
diffit = differs.get(subpath, diff)
dd = diffit(avalue, bvalue, path=subpath, compare=compare, predicates=predicates, differs=differs)
dd = diffit(avalue, bvalue, path=subpath, predicates=predicates, differs=differs)
if dd:
di.patch(key, dd)
else:
compareit = predicates.get(path, compare)
if not compareit(avalue, bvalue): # TODO: Use != or not compare() here?
if path in predicates:
# Could also this a warning, but I think it shouldn't be done
raise RuntimeError("Found predicate(s) for path {} pointing to dict entry.".format(path))
if avalue != bvalue:
di.replace(key, bvalue)

for key in sorted(bkeys - akeys):
Expand Down
22 changes: 11 additions & 11 deletions nbdime/diffing/notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,12 @@
import difflib
import operator
from six import string_types
from collections import defaultdict

from ..diff_format import source_as_string

from .sequences import diff_sequence
from .generic import diff, diff_lists, diff_dicts
from .snakes import diff_sequence_multilevel
from .generic import diff, diff_lists, diff_dicts, diff_sequence_multilevel

__all__ = ["diff_notebooks"]

Expand Down Expand Up @@ -133,7 +133,8 @@ def compare_output_data(x, y):
return True


def __unused_diff_single_outputs(a, b, compare="ignored", path="/cells/*/output/*"):
# Keeping these here for the comments and as as a reminder for possible future extension points:
def __unused_diff_single_outputs(a, b, path="/cells/*/output/*"):
"Diff a pair of output cells."
assert path == "/cells/*/outputs/*"
# TODO: Handle output diffing with plugins? I.e. image diff, svg diff, json diff, etc.
Expand All @@ -143,8 +144,7 @@ def __unused_diff_single_outputs(a, b, compare="ignored", path="/cells/*/output/
# 'application/javascript','image/svg+xml'}
# a.text
return diff(a, b)

def __unused_diff_source(a, b, path, compare, predicates, differs):
def __unused_diff_source(a, b, path, predicates, differs):
"Diff a pair of sources."
assert path == "/cells/*/source"
# FIXME: Make sure we use linebased diff of sources
Expand All @@ -155,7 +155,7 @@ def __unused_diff_source(a, b, path, compare, predicates, differs):
# Sequence diffs should be applied with multilevel
# algorithm for paths with more than one predicate,
# and using operator.__eq__ if no match in there.
notebook_predicates = {
notebook_predicates = defaultdict(lambda: [operator.__eq__], {
# Predicates to compare cells in order of low-to-high precedence
"/cells": [
compare_cell_source_approximate,
Expand All @@ -167,17 +167,17 @@ def __unused_diff_source(a, b, path, compare, predicates, differs):
compare_output_data_keys,
compare_output_data,
]
}
})


# Recursive diffing of substructures should pick a rule from here, with diff as fallback
notebook_differs = {
"/cells": diff_sequence_multilevel,
notebook_differs = defaultdict(lambda: diff, {
#"/cells": diff_sequence_multilevel,
#"/cells/*": diff,
#"/cells/*/source": diff,
"/cells/*/outputs": diff_sequence_multilevel,
#"/cells/*/outputs": diff_sequence_multilevel,
#"/cells/*/outputs/*": diff_single_outputs,
}
})


def diff_cells(a, b):
Expand Down
4 changes: 3 additions & 1 deletion nbdime/diffing/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,9 @@


def diff_sequence(a, b, compare=operator.__eq__):
"""Compute a diff of two sequences.
"""Compute a shallow diff of two sequences.
I.e. these algorithms do not recursively diff elements of the sequences.
This is a wrapper for alternative diff implementations.
"""
Expand Down
62 changes: 34 additions & 28 deletions nbdime/diffing/snakes.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,22 +5,20 @@

from __future__ import unicode_literals

"""Tools for diffing notebooks.
All diff tools here currently assumes the notebooks have already been
converted to the same format version, currently v4 at time of writing.
Up- and down-conversion is handled by nbformat.
"""
Utilities for computing 'snakes', or contiguous sequences of equal elements of two sequences.
"""

import operator
from ..diff_format import SequenceDiff
from .seq_bruteforce import bruteforce_compute_snakes
from .generic import diff

__all__ = ["diff_sequence_multilevel"]
__all__ = ["compute_snakes_multilevel"]


def compute_snakes(A, B, rect, compare):
def compute_snakes(A, B, compare, rect=None):
if rect is None:
rect = (0, 0, len(A), len(B))
i0, j0, i1, j1 = rect

# TODO: Implement this using difflib or recursive Myers or simpler
Expand All @@ -34,39 +32,52 @@ def compute_snakes(A, B, rect, compare):
return snakes


def compute_snakes_multilevel(A, B, rect, compares, level):
def compute_snakes_multilevel(A, B, compares, rect=None, level=None):
"""Compute snakes using a multilevel multi-predicate algorithm.
TODO: Document this algorithm.
"""
if level is None:
level = len(compares) - 1
if rect is None:
rect = (0, 0, len(A), len(B))

# Compute initial set of coarse snakes
compare = compares[level]
snakes = compute_snakes(A, B, rect, compare)
snakes = compute_snakes(A, B, compare, rect)
if level == 0:
return snakes

newsnakes = [(0, 0, 0)]
i0, j0, i1, j1 = rect
for snake in snakes + [(i1, j1, 0)]:
i, j, n = snake
if i > i0 and j > j0:
newsnakes += compute_snakes_multilevel(A, B, (i0, j0, i, j), compares, level-1)
# Recurse to compute snakes with less accurate
# compare predicates between the coarse snakes
subrect = (i0, j0, i, j)
newsnakes += compute_snakes_multilevel(A, B, compares, subrect, level-1)
if n > 0:
if newsnakes[-1][0] == i and newsnakes[-1][1] == j:
snake = newsnakes[-1]
newsnakes[-1] = (snake[0], snake[1], snake[2] + n)
li, lj, ln = newsnakes[-1]
if li+ln == i and lj+ln == j:
# Merge contiguous snakes
newsnakes[-1] = (li, lj, ln + n)
else:
# Add new snake
newsnakes.append(snake)
i0 = i + n
j0 = j + n
# Pop empty snake from beginning if it wasn't extended inside the loop
if newsnakes[0][2] == 0:
newsnakes.pop(0)
return newsnakes


def compute_diff_from_snakes(a, b, snakes, path="", differs={}):
def compute_diff_from_snakes(a, b, snakes, path="", predicates=None, differs=None):
"Compute diff from snakes."

subpath = path + "/*"
diffit = differs.get(subpath, diff)
subpath = "/".join((path, "*"))
diffit = differs[subpath]

di = SequenceDiff()
i0, j0, i1, j1 = 0, 0, len(a), len(b)
Expand All @@ -75,19 +86,14 @@ def compute_diff_from_snakes(a, b, snakes, path="", differs={}):
di.remove(i0, i-i0)
if j > j0:
di.add(i0, b[j0:j])

for k in range(n):
cd = diffit(a[i + k], b[j + k], path=subpath)
aval = a[i + k]
bval = b[j + k]
cd = diffit(aval, bval, path=subpath, predicates=predicates, differs=differs)
if cd:
di.patch(i+k, cd)
di.patch(i + k, cd)

# Update corner offsets for next rectangle
i0, j0 = i+n, j+n
return di.diff # XXX


def diff_sequence_multilevel(a, b, path="", compare=operator.__eq__, predicates={}, differs={}):
# Invoke multilevel snake computation algorithm
compares = predicates.get(path, [compare])
level = len(compares) - 1
rect = (0, 0, len(a), len(b))
snakes = compute_snakes_multilevel(a, b, rect, compares, level)
return compute_diff_from_snakes(a, b, snakes, path=path, differs=differs)

0 comments on commit fbcb81f

Please sign in to comment.