Skip to content

Commit

Permalink
Merge pull request #639 from vidartf/cellid-diff
Browse files Browse the repository at this point in the history
Add support for using cell ID in diffing and merging
  • Loading branch information
vidartf committed Nov 1, 2023
2 parents 997be80 + 9884381 commit c1ea9b9
Show file tree
Hide file tree
Showing 56 changed files with 1,113 additions and 258 deletions.
34 changes: 34 additions & 0 deletions nbdime/diffing/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@

class DiffConfig:
"""Set of predicates/differs/other configs to pass around"""

def __init__(self, *, predicates=None, differs=None, atomic_paths=None):
if predicates is None:
from .generic import default_predicates
predicates = default_predicates()

if differs is None:
from .generic import default_differs
differs = default_differs()

self.predicates = predicates
self.differs = differs
self._atomic_paths = atomic_paths or {}

def diff_item_at_path(self, a, b, path):
"""Calculate the diff for path."""
self.differs[path](a, b, path=path, config=self)

def is_atomic(self, x, path=None):
"Return True for values that diff should treat as a single atomic value."
try:
return self._atomic_paths[path]
except KeyError:
return not isinstance(x, (str, list, dict))

def __copy__(self):
return DiffConfig(
predicates=self.predicates.copy(),
differs=self.differs.copy(),
atomic_paths=self._atomic_paths.copy(),
)
77 changes: 38 additions & 39 deletions nbdime/diffing/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,13 @@
from ..diff_format import SequenceDiffBuilder, MappingDiffBuilder, validate_diff
from ..diff_utils import count_consumed_symbols

from .config import DiffConfig
from .sequences import diff_strings_linewise, diff_sequence
from .snakes import compute_snakes_multilevel, compute_diff_from_snakes

__all__ = ["diff"]


def is_atomic(x):
"Return True for values that diff should treat as a single atomic value."
return not isinstance(x, (str, list, dict))


def default_predicates():
return defaultdict(lambda: (operator.__eq__,))

Expand Down Expand Up @@ -74,18 +70,16 @@ def compare_strings_approximate(x, y, threshold=0.7, maxlen=None):
return s.ratio() > threshold


def diff(a, b, path="", predicates=None, differs=None):
def diff(a, b, path="", config=None):
"Compute the diff of two json-like objects, list or dict or string."

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()
if config is None:
config = DiffConfig()

if isinstance(a, list) and isinstance(b, list):
d = diff_lists(a, b, path=path, predicates=predicates, differs=differs)
d = diff_lists(a, b, path=path, config=config)
elif isinstance(a, dict) and isinstance(b, dict):
d = diff_dicts(a, b, path=path, predicates=predicates, differs=differs)
d = diff_dicts(a, b, path=path, config=config)
elif isinstance(a, str) and isinstance(b, str):
# Don't pass differs/predicates as the only possible use case is to
# use a different character differ within each line or predicates
Expand All @@ -100,35 +94,42 @@ def diff(a, b, path="", predicates=None, differs=None):
return d


def diff_sequence_multilevel(a, b, path="", predicates=None, differs=None):
def diff_string_lines(a, b, path="", config=None):
"""Diff two lists of strings (lines)"""

# This is mainly about short-circuiting to avoid full snakes for equal content
# since we know we can rely on __eq__ comparison
if len(a) == len(b) and a == b:
return []

return diff_strings_linewise(a, b)


def diff_sequence_multilevel(a, b, path="", config=None):
"""Compute diff of two lists with configurable behaviour."""

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()
if config is None:
config = DiffConfig()

# Invoke multilevel snake computation algorithm
compares = predicates[path or '/']
compares = config.predicates[path or '/']
snakes = compute_snakes_multilevel(a, b, compares)

# Convert snakes to diff
return compute_diff_from_snakes(a, b, snakes, path=path, predicates=predicates, differs=differs)
return compute_diff_from_snakes(a, b, snakes, path=path, config=config)


def diff_lists(a, b, path="", predicates=None, differs=None, shallow_diff=None):
def diff_lists(a, b, path="", config=None, shallow_diff=None):
"""Compute diff of two lists with configurable behaviour."""

if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()
if config is None:
config = DiffConfig()

# If multiple compares are provided to this path, delegate to multilevel algorithm
compares = predicates[path or '/']
compares = config.predicates[path or '/']
if len(compares) > 1:
assert shallow_diff is None
return diff_sequence_multilevel(a, b, path=path, predicates=predicates, differs=differs)
return diff_sequence_multilevel(a, b, path=path, config=config)

# First make a shallow sequence diff with custom compare,
# unless it's provided for us
Expand All @@ -138,7 +139,7 @@ def diff_lists(a, b, path="", predicates=None, differs=None, shallow_diff=None):
# Next we recurse to diff items in sequence that are considered
# similar by compares[0] in the loop below
subpath = "/".join((path, "*"))
diffit = differs[subpath]
diffit = config.differs[subpath]

# Count consumed items i,j from a,b, (i="take" in patch_list)
i, j = 0, 0
Expand All @@ -165,8 +166,8 @@ def diff_lists(a, b, path="", predicates=None, differs=None, shallow_diff=None):
for k in range(n):
aval = a[i + k]
bval = b[j + k]
if not is_atomic(aval):
cd = diffit(aval, bval, path=subpath, predicates=predicates, differs=differs)
if not config.is_atomic(aval, subpath):
cd = diffit(aval, bval, path=subpath, config=config)
if cd:
di.patch(i + k, cd) # FIXME: Not covered in tests, create test situation

Expand All @@ -186,7 +187,7 @@ def diff_lists(a, b, path="", predicates=None, differs=None, shallow_diff=None):
return di.validated()


def diff_dicts(a, b, path="", predicates=None, differs=None):
def diff_dicts(a, b, path="", config=None):
"""Compute diff of two dicts with configurable behaviour.
Keys in both a and b will be handled based on
Expand All @@ -197,10 +198,8 @@ def diff_dicts(a, b, path="", predicates=None, differs=None):
Items not mentioned in diff are items where compare(x, y) return True.
For other items the diff will contain delete, insert, or replace entries.
"""
if predicates is None:
predicates = default_predicates()
if differs is None:
differs = default_differs()
if config is None:
config = DiffConfig()

if not isinstance(a, dict) or not isinstance(b, dict):
raise TypeError('Arguments to diff_dicts need to be dicts, got %r and %r' % (a, b))
Expand All @@ -218,14 +217,14 @@ def diff_dicts(a, b, path="", predicates=None, differs=None):
avalue = a[key]
bvalue = b[key]
# If types are the same and nonatomic, recurse
if type(avalue) is type(bvalue) and not is_atomic(avalue):
subpath = "/".join((path, key))
diffit = differs[subpath]
dd = diffit(avalue, bvalue, path=subpath, predicates=predicates, differs=differs)
subpath = "/".join((path, key))
if type(avalue) is type(bvalue) and not config.is_atomic(avalue, path=subpath):
diffit = config.differs[subpath]
dd = diffit(avalue, bvalue, path=subpath, config=config)
if dd:
di.patch(key, dd)
else:
if (path or '/') in predicates:
if (path or '/') in config.predicates:
# Could also this a warning, but I think it shouldn't be done
raise RuntimeError(
"Found predicate(s) for path {} pointing to dict entry.".format(
Expand Down
49 changes: 34 additions & 15 deletions nbdime/diffing/notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
from ..diff_format import MappingDiffBuilder, DiffOp
from ..utils import defaultdict2

from .generic import (diff, diff_sequence_multilevel,
compare_strings_approximate)
from .config import DiffConfig
from .generic import (
diff, diff_sequence_multilevel, compare_strings_approximate,
diff_string_lines,
)

__all__ = ["diff_notebooks"]

Expand Down Expand Up @@ -323,6 +326,7 @@ def compare_cell_moderate(x, y):
This is used to align cells in the /cells list
in the second multilevel diff iteration.
"""

# Cell types must match
if x["cell_type"] != y["cell_type"]:
return False
Expand All @@ -349,6 +353,7 @@ def compare_cell_strict(x, y):
This is used to align cells in the /cells list
in the first multilevel diff iteration.
"""

# Cell types must match
if x["cell_type"] != y["cell_type"]:
return False
Expand All @@ -373,8 +378,13 @@ def compare_cell_strict(x, y):
return True


def diff_single_outputs(a, b, path="/cells/*/outputs/*",
predicates=None, differs=None):
def compare_cell_by_ids(x, y):
"""Compare cells x,y strictly using cell IDs"""
# Only consider equal if both have IDs and they match
return 'id' in x and 'id' in y and x['id'] == y['id']


def diff_single_outputs(a, b, path="/cells/*/outputs/*", config=None):
"""DiffOp a pair of output cells."""
assert path == "/cells/*/outputs/*", 'Invalid path for ouput: %r' % path
assert a.output_type == b.output_type, 'cannot diff outputs of different types'
Expand All @@ -396,20 +406,22 @@ def diff_single_outputs(a, b, path="/cells/*/outputs/*",
di.append(e)

# Only diff data:
dd = diff_mime_bundle(a.data, b.data, path=path+"/data")
dd = diff_mime_bundle(a.data, b.data, path=path+"/data", config=config)
if dd:
di.patch("data", dd)

return di.validated()
else:
return diff(a, b)
return diff(a, b, config=config)


def add_mime_diff(key, avalue, bvalue, diffbuilder):
# TODO: Handle output diffing with plugins?
# I.e. image diff, svg diff, json diff, etc.

mimetype = key.lower()
if isinstance(avalue, str) and isinstance(bvalue, str) and avalue == bvalue:
return
if any(mimetype.startswith(tm) for tm in _split_mimes):
dd = diff(avalue, bvalue)
if dd:
Expand All @@ -418,8 +430,7 @@ def add_mime_diff(key, avalue, bvalue, diffbuilder):
diffbuilder.replace(key, bvalue)


def diff_attachments(a, b, path="/cells/*/attachments",
predicates=None, differs=None):
def diff_attachments(a, b, path="/cells/*/attachments", config=None):
"""Diff a pair of attachment collections"""
assert path == "/cells/*/attachments", 'Invalid path for attachment: %r' % path

Expand Down Expand Up @@ -454,8 +465,7 @@ def diff_attachments(a, b, path="/cells/*/attachments",
return di.validated()


def diff_mime_bundle(a, b, path=None,
predicates=None, differs=None):
def diff_mime_bundle(a, b, path=None, config=None):
"""Diff a MIME bundle.
A MIME bundle has MIME types as keys, with values that are
Expand Down Expand Up @@ -517,6 +527,7 @@ def ignored_diff(*args, **kwargs):
compare_cell_approximate,
compare_cell_moderate,
compare_cell_strict,
compare_cell_by_ids,
],
# Predicates to compare output cells (within one cell) in order of low-to-high precedence
"/cells/*/outputs": [
Expand All @@ -529,12 +540,22 @@ def ignored_diff(*args, **kwargs):
notebook_differs = defaultdict2(lambda: diff, {
"/cells": diff_sequence_multilevel,
"/cells/*": diff,
"/cells/*/source": diff_string_lines,
"/cells/*/outputs": diff_sequence_multilevel,
"/cells/*/outputs/*": diff_single_outputs,
"/cells/*/attachments": diff_attachments,
})


notebook_config = DiffConfig(
predicates=notebook_predicates,
differs=notebook_differs,
atomic_paths={
"/cells/*/id": True
}
)


def reset_notebook_differ():
"""Reset the notebook_differs dictionary to default values."""
# As it is a defaultdict2, simply clear all set keys to reset:
Expand Down Expand Up @@ -587,14 +608,12 @@ def set_notebook_diff_targets(sources=True, outputs=True, attachments=True,
def diff_cells(a, b):
"This is currently just used by some tests."
path = "/cells"
return notebook_differs[path](
a, b, path=path, predicates=notebook_predicates, differs=notebook_differs)
return notebook_differs[path](a, b, path=path, config=notebook_config)


def diff_item_at_path(a, b, path):
"""Calculate the diff using the configured notebook differ for path."""
return notebook_differs[path](
a, b, path=path, predicates=notebook_predicates, differs=notebook_differs)
return notebook_differs[path](a, b, path=path, config=notebook_config)


def diff_notebooks(a, b):
Expand All @@ -604,4 +623,4 @@ def diff_notebooks(a, b):
"""
if not (isinstance(a, dict) and isinstance(b, dict)):
raise TypeError("Expected inputs to be dicts, got %r and %r" % (a, b))
return diff(a, b, path="", predicates=notebook_predicates, differs=notebook_differs)
return diff(a, b, path="", config=notebook_config)
17 changes: 11 additions & 6 deletions nbdime/diffing/sequences.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import operator
from collections import defaultdict

from .config import DiffConfig
from .seq_difflib import diff_sequence_difflib
from .seq_bruteforce import diff_sequence_bruteforce
from .seq_myers import diff_sequence_myers
Expand Down Expand Up @@ -37,7 +38,7 @@ def diff_sequence(a, b, compare=operator.__eq__):
raise RuntimeError("Unknown diff_sequence_algorithm {}.".format(diff_sequence_algorithm))


def diff_strings_by_char(a, b, path="", predicates=None, differs=None):
def diff_strings_by_char(a, b, path="", config=None):
"Compute char-based diff of two strings."
assert isinstance(a, str) and isinstance(b, str), (
'Arguments need to be string types. Got %r and %r' % (a, b))
Expand All @@ -52,12 +53,16 @@ def diff_strings_linewise(a, b):
"""
assert isinstance(a, str) and isinstance(b, str), (
'Arguments need to be string types. Got %r and %r' % (a, b))
if a == b:
return []
lines_a = a.splitlines(True)
lines_b = b.splitlines(True)

from .generic import diff_lists, compare_strings_approximate
predicates = defaultdict(lambda: [
compare_strings_approximate,
operator.__eq__])
differs = defaultdict(lambda: diff_strings_by_char)
return diff_lists(lines_a, lines_b, predicates=predicates, differs=differs)
config = DiffConfig(
predicates=defaultdict(lambda: [
compare_strings_approximate,
operator.__eq__]),
differs=defaultdict(lambda: diff_strings_by_char)
)
return diff_lists(lines_a, lines_b, config=config)

0 comments on commit c1ea9b9

Please sign in to comment.