Merge pull request #712 from koordinates/attachment-diffs

Support attachment diffs
koordinates · Sep 15, 2022 · 4c08365 · 4c08365
2 parents e05df57 + a57bbe5
commit 4c08365
Show file tree

Hide file tree

Showing 12 changed files with 410 additions and 33 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -12,6 +12,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
 - Bugfix: directing geojson diff output to the console didn't work in a multi-dataset repo, even if only one dataset had changed. [#702](https://github.com/koordinates/kart/issues/702)
 - Improve argument parsing for `kart diff` and `kart show`. [#706](https://github.com/koordinates/kart/issues/706)
 - Bugfix: don't allow `kart merge` to fast-forward if the user specifies a merge message. [#705](https://github.com/koordinates/kart/issues/705)
+- Kart now shows diffs to attached files that have been created using `kart commit-files`, if any [#583](https://github.com/koordinates/kart/issues/583)
 
 ## 0.11.5
 

diff --git a/kart/base_dataset.py b/kart/base_dataset.py
@@ -188,8 +188,8 @@ class BaseDataset(DatasetDiffMixin, metaclass=BaseDatasetMetaClass):
 
     WORKING_COPY_PART_TYPE = None  # Example: working_copy.PartType.TABULAR
 
-    # Paths are generally relative to self.inner_tree, but datasets may choose to put extra data in the outer
-    # tree also where it will eventually be user-visible (once attachments are fully supported).
+    # Paths are generally relative to self.inner_tree, but table datasets put certain meta items in the outer-tree.
+    # (This is an anti-pattern - when designing a new dataset type, don't put meta items in the outer-tree.)
 
     # Where meta-items are stored - blobs containing metadata about the structure or schema of the dataset.
     META_PATH = "meta/"

diff --git a/kart/base_diff_writer.py b/kart/base_diff_writer.py
@@ -6,14 +6,16 @@
 from pathlib import Path
 
 import click
+import pygit2
 
-from . import diff_util
-from .diff_structs import WORKING_COPY_EDIT
-from .exceptions import CrsError, InvalidOperation
-from .key_filters import RepoKeyFilter
-from .promisor_utils import FetchPromisedBlobsProcess, object_is_promised
-from .repo import KartRepoState
-from .spatial_filter import SpatialFilter
+from kart import diff_util
+from kart.diff_structs import FILES_KEY, WORKING_COPY_EDIT, BINARY_FILE, Delta
+from kart.exceptions import CrsError, InvalidOperation
+from kart.key_filters import RepoKeyFilter
+from kart.promisor_utils import FetchPromisedBlobsProcess, object_is_promised
+from kart.repo import KartRepoState
+from kart.spatial_filter import SpatialFilter
+from kart.serialise_util import b64encode_str
 
 
 L = logging.getLogger("kart.diff_writer")
@@ -115,6 +117,7 @@ def __init__(
 
         self.commit = None
         self.do_convert_to_dataset_format = False
+        self.do_full_file_diffs = False
 
     def include_target_commit_as_header(self):
         """
@@ -126,6 +129,9 @@ def include_target_commit_as_header(self):
     def convert_to_dataset_format(self, do_convert_to_dataset_format=True):
         self.do_convert_to_dataset_format = do_convert_to_dataset_format
 
+    def full_file_diffs(self, do_full_file_diffs=True):
+        self.do_full_file_diffs = do_full_file_diffs
+
     @classmethod
     def _normalize_output_path(cls, output_path):
         if not output_path or output_path == "-":
@@ -254,6 +260,7 @@ def write_diff(self):
         self.has_changes = False
         for ds_path in self.all_ds_paths:
             self.has_changes |= self.write_ds_diff_for_path(ds_path)
+        self.has_changes |= self.write_file_diff(self.get_file_diff())
         self.write_warnings_footer()
 
     def write_ds_diff_for_path(self, ds_path):
@@ -267,7 +274,38 @@ def write_ds_diff(self, ds_path, ds_diff):
         """For outputting ds_diff, the diff of a particular dataset."""
         raise NotImplementedError()
 
-    def get_repo_diff(self):
+    def write_file_diff(self, file_diff):
+        """For outputting file_diff - all the files that have changed, without reference to any dataset."""
+        raise NotImplementedError()
+
+    def _full_file_delta(self, delta, skip_binary_files=False):
+        def get_blob(half_delta):
+            return self.repo[half_delta.value] if half_delta else None
+
+        def is_binary(blob):
+            return blob.is_binary if blob else False
+
+        old_blob = get_blob(delta.old)
+        new_blob = get_blob(delta.new)
+        delta_is_binary = is_binary(old_blob) or is_binary(new_blob)
+
+        if is_binary(old_blob) or is_binary(new_blob):
+            delta.flags |= BINARY_FILE
+            if skip_binary_files:
+                return delta
+
+            blob_to_text = lambda blob: b64encode_str(memoryview(blob))
+        else:
+            blob_to_text = lambda blob: str(memoryview(blob), "utf-8")
+
+        old_half_delta = (delta.old_key, blob_to_text(old_blob)) if delta.old else None
+        new_half_delta = (delta.new_key, blob_to_text(new_blob)) if delta.new else None
+
+        result = Delta(old_half_delta, new_half_delta)
+        result.flags = delta.flags
+        return result
+
+    def get_repo_diff(self, include_files=True):
         """
         Generates a RepoDiff containing an entry for every dataset in the repo (that matches self.repo_key_filter).
         """
@@ -278,6 +316,7 @@ def get_repo_diff(self):
             workdir_diff_cache=self.workdir_diff_cache,
             repo_key_filter=self.repo_key_filter,
             convert_to_dataset_format=self.do_convert_to_dataset_format,
+            include_files=include_files,
         )
 
     def get_dataset_diff(self, ds_path):
@@ -302,6 +341,15 @@ def get_dataset_diff(self, ds_path):
             convert_to_dataset_format=self.do_convert_to_dataset_format,
         )
 
+    def get_file_diff(self):
+        """Returns the DatasetDiff object for the deltas that do not belong to any dataset."""
+        return diff_util.get_file_diff(
+            self.base_rs,
+            self.target_rs,
+            include_wc_diff=False,
+            repo_key_filter=self.repo_key_filter,
+        )
+
     def filtered_dataset_deltas(self, ds_path, ds_diff):
         """
         Yields the key, delta for only those deltas from the given dataset diff that match
@@ -586,6 +634,8 @@ def _get_old_or_new_dataset(self, ds_path):
         so, useful for accessing things that won't change (its path, its type), or for accessing
         things that haven't changed (ie, check the diff first to make sure it hasn't changed).
         """
+        if ds_path == FILES_KEY:
+            return None
         dataset = self.base_rs.datasets().get(ds_path)
         if not dataset:
             dataset = self.target_rs.datasets().get(ds_path)

diff --git a/kart/diff.py b/kart/diff.py
@@ -122,6 +122,11 @@ def feature_count_diff(
     help="Ignores file format differences in any new files when generating the diff - assumes that the new files will "
     "also committed using --convert-to-dataset-format, so the conversion step will remove the format differences.",
 )
+@click.option(
+    "--diff-files",
+    is_flag=True,
+    help="Show changes to file contents (instead of just showing the object IDs of changed files)",
+)
 @click.argument(
     "args",
     metavar="[REVISIONS] [--] [FILTERS]",
@@ -139,6 +144,7 @@ def diff(
     only_feature_count,
     add_feature_count_estimate,
     convert_to_dataset_format,
+    diff_files,
     args,
 ):
     """
@@ -199,6 +205,7 @@ def diff(
         diff_estimate_accuracy=add_feature_count_estimate,
     )
     diff_writer.convert_to_dataset_format(convert_to_dataset_format)
+    diff_writer.full_file_diffs(diff_files)
     diff_writer.write_diff()
 
     if exit_code or output_type == "quiet":

diff --git a/kart/diff_structs.py b/kart/diff_structs.py
@@ -5,6 +5,14 @@
 from .exceptions import InvalidOperation
 
 
+# A pseudo "dataset-path" used as the key for storing files. Files are all individual top-level items in the repo,
+# not pieces of a dataset - and so are addressed like this "path/to/file.txt" rather than something like this
+# "<dataset-path>:feature:<primary-key>". They may be alongside dataset content (ie <dataset-path>/my-attachment.txt)
+# or not (ie <repo-root>/my-attachment.txt) but when outputting diffs, they all go in the <files> area and not in any dataset.
+# Note that this is not a valid dataset-path, so it cannot conflict with any dataset.
+FILES_KEY = "<files>"
+
+
 class Conflict(Exception):
     pass
 
@@ -42,6 +50,7 @@ def get_lazy_value(self):
 
 # Delta flags:
 WORKING_COPY_EDIT = 0x1  # Delta represents a change made in the WC - it is "dirty".
+BINARY_FILE = 0x2  # Delta is a change to a binary file.
 
 
 @dataclass
@@ -182,7 +191,12 @@ def __add__(self, other):
             result.flags = self.flags | other.flags
         return result
 
-    def to_plus_minus_dict(self):
+    def to_plus_minus_dict(self, minimal=False):
+        # NOTE - it might make more sense to have:
+        # insert ++      delete --      update - +      minimal-update +
+        # but this is a breaking change, would need to be co-ordinated.
+        if minimal and self.old and self.new:
+            return {"*": self.new_value}
         result = {}
         if self.old:
             result["-"] = self.old_value

diff --git a/kart/diff_util.py b/kart/diff_util.py
@@ -1,9 +1,14 @@
+import functools
 import logging
+import re
+from pathlib import PurePosixPath
+import subprocess
 
-
-from .diff_structs import DatasetDiff, RepoDiff
-from .key_filters import DatasetKeyFilter, RepoKeyFilter
-from .structure import RepoStructure
+from kart.cli_util import tool_environment
+from kart.diff_structs import FILES_KEY, Delta, DeltaDiff, DatasetDiff, RepoDiff
+from kart.exceptions import SubprocessError
+from kart.key_filters import DatasetKeyFilter, RepoKeyFilter
+from kart.structure import RepoStructure
 
 L = logging.getLogger("kart.diff_util")
 
@@ -41,6 +46,7 @@ def get_repo_diff(
     workdir_diff_cache=None,
     repo_key_filter=RepoKeyFilter.MATCH_ALL,
     convert_to_dataset_format=False,
+    include_files=False,
 ):
     """
     Generates a RepoDiff containing an entry for every dataset in the repo
@@ -52,6 +58,10 @@ def get_repo_diff(
     workdir_diff_cache - not required, but can be provided if a WorkdirDiffCache is already in use
         to save repeated work.
     repo_key_filter - controls which datasets (and PK values) match and are included in the diff.
+    convert_to_dataset_format - whether to show the diff of what would be committed if files were
+       converted to dataset format at commit-time (ie, for point-cloud tiles)
+    include_files - whether to include a DatasetDiff in the result for changes to files that
+       are simply standalone files, rather than part of a dataset's contents.
     """
 
     all_ds_paths = get_all_ds_paths(base_rs, target_rs, repo_key_filter)
@@ -70,7 +80,12 @@ def get_repo_diff(
             ds_filter=repo_key_filter[ds_path],
             convert_to_dataset_format=convert_to_dataset_format,
         )
-    # No need to recurse since self.get_dataset_diff already prunes the dataset diffs.
+    if include_files:
+        file_diff = get_file_diff(base_rs, target_rs, repo_key_filter=repo_key_filter)
+        if file_diff:
+            repo_diff.recursive_set([FILES_KEY, FILES_KEY], file_diff)
+
+    # No need to prune recursively since self.get_dataset_diff already prunes the dataset diffs.
     repo_diff.prune(recurse=False)
     return repo_diff
 
@@ -138,3 +153,112 @@ def get_dataset_diff(
     )
     ds_diff.prune()
     return ds_diff
+
+
+ZEROES = re.compile(r"0+")
+
+
+def get_file_diff(
+    base_rs,
+    target_rs,
+    *,
+    include_wc_diff=False,
+    workdir_diff_cache=None,
+    repo_key_filter=RepoKeyFilter.MATCH_ALL,
+):
+    """
+    Returns a delta-diff for changed files aka attachments.
+    Each delta just contains the old and new file OIDs - any more than this may be unhelpful since it takes
+    CPU time to produce but isn't necessarily easier to consume than OIDs, which are straight-forward to
+    turn into raw files once you know how. (Various diff-writers can transform these OIDs into inline diffs if you
+    set the --diff-files flag).
+    """
+
+    # We don't yet support attachment diffs in the workdir
+    assert not include_wc_diff
+
+    old_tree = base_rs.tree
+    new_tree = target_rs.tree
+    repo = target_rs.repo
+
+    # TODO - make sure this is skipping over datasets efficiently.
+    # TODO - we could turn on rename detection.
+    cmd = [
+        "git",
+        "-C",
+        repo.path,
+        "diff",
+        old_tree.hex,
+        new_tree.hex,
+        "--raw",
+        "--no-renames",
+        "--",
+        ":^.kart.*",  # Top-level hidden kart blobs
+        ":^**/.*dataset*/**",  # Data inside datasets
+    ]
+    try:
+        lines = (
+            subprocess.check_output(
+                cmd,
+                encoding="utf8",
+                env=tool_environment(),
+            )
+            .strip()
+            .splitlines()
+        )
+    except subprocess.CalledProcessError as e:
+        raise SubprocessError(
+            f"There was a problem with git diff: {e}", called_process_error=e
+        )
+
+    attachment_deltas = DeltaDiff()
+
+    for line in lines:
+        parts = line.split()
+        old_sha, new_sha, path = parts[2], parts[3], parts[5]
+        if not path_matches_repo_key_filter(path, repo_key_filter):
+            continue
+        if is_dataset_metadata_xml(old_tree, new_tree, path):
+            continue
+        old_half_delta = (path, old_sha) if not ZEROES.fullmatch(old_sha) else None
+        new_half_delta = (path, new_sha) if not ZEROES.fullmatch(new_sha) else None
+        attachment_deltas.add_delta(Delta(old_half_delta, new_half_delta))
+
+    return attachment_deltas
+
+
+def path_matches_repo_key_filter(path, repo_key_filter):
+    if repo_key_filter.match_all:
+        return True
+    # Return attachments that have a name that we are matching all of.
+    if path in repo_key_filter and repo_key_filter[path].match_all:
+        return True
+    # Return attachments that are inside a folder that we are matching all of.
+    for p, dataset_filter in repo_key_filter.items():
+        if not dataset_filter.match_all:
+            continue
+        if p == path:
+            return True
+        if path.startswith(p) and (p.endswith("/") or path[len(p)] == "/"):
+            return True
+    # Don't return attachments inside a dataset / folder that we are only matching some of
+    # ie, only matching certain features or meta items.
+    return False
+
+
+def is_dataset_metadata_xml(old_tree, new_tree, path):
+    # metadata.xml is sometimes stored in the "attachment" area, alongside the dataset, rather than inside it.
+    # Storing it in this unusual location adds complexity without actually solving any problems, so any datasets designed
+    # after table.v3 don't do this. Since this change already appears in meta-diffs, we suppress it from attachment diffs.
+    if not path.endswith("/metadata.xml"):
+        return False
+    parent_path = str(PurePosixPath(path).parent)
+    for tree in (old_tree, new_tree):
+        try:
+            parent_tree = tree / parent_path
+            for sibling in parent_tree:
+                if sibling.name.startswith(".table-dataset"):
+                    return True
+        except KeyError:
+            continue
+    return False