Skip to content

Commit

Permalink
Merge pull request #712 from koordinates/attachment-diffs
Browse files Browse the repository at this point in the history
Support attachment diffs
  • Loading branch information
olsen232 committed Sep 15, 2022
2 parents e05df57 + a57bbe5 commit 4c08365
Show file tree
Hide file tree
Showing 12 changed files with 410 additions and 33 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Bugfix: directing geojson diff output to the console didn't work in a multi-dataset repo, even if only one dataset had changed. [#702](https://github.com/koordinates/kart/issues/702)
- Improve argument parsing for `kart diff` and `kart show`. [#706](https://github.com/koordinates/kart/issues/706)
- Bugfix: don't allow `kart merge` to fast-forward if the user specifies a merge message. [#705](https://github.com/koordinates/kart/issues/705)
- Kart now shows diffs to attached files that have been created using `kart commit-files`, if any [#583](https://github.com/koordinates/kart/issues/583)

## 0.11.5

Expand Down
4 changes: 2 additions & 2 deletions kart/base_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,8 +188,8 @@ class BaseDataset(DatasetDiffMixin, metaclass=BaseDatasetMetaClass):

WORKING_COPY_PART_TYPE = None # Example: working_copy.PartType.TABULAR

# Paths are generally relative to self.inner_tree, but datasets may choose to put extra data in the outer
# tree also where it will eventually be user-visible (once attachments are fully supported).
# Paths are generally relative to self.inner_tree, but table datasets put certain meta items in the outer-tree.
# (This is an anti-pattern - when designing a new dataset type, don't put meta items in the outer-tree.)

# Where meta-items are stored - blobs containing metadata about the structure or schema of the dataset.
META_PATH = "meta/"
Expand Down
66 changes: 58 additions & 8 deletions kart/base_diff_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,16 @@
from pathlib import Path

import click
import pygit2

from . import diff_util
from .diff_structs import WORKING_COPY_EDIT
from .exceptions import CrsError, InvalidOperation
from .key_filters import RepoKeyFilter
from .promisor_utils import FetchPromisedBlobsProcess, object_is_promised
from .repo import KartRepoState
from .spatial_filter import SpatialFilter
from kart import diff_util
from kart.diff_structs import FILES_KEY, WORKING_COPY_EDIT, BINARY_FILE, Delta
from kart.exceptions import CrsError, InvalidOperation
from kart.key_filters import RepoKeyFilter
from kart.promisor_utils import FetchPromisedBlobsProcess, object_is_promised
from kart.repo import KartRepoState
from kart.spatial_filter import SpatialFilter
from kart.serialise_util import b64encode_str


L = logging.getLogger("kart.diff_writer")
Expand Down Expand Up @@ -115,6 +117,7 @@ def __init__(

self.commit = None
self.do_convert_to_dataset_format = False
self.do_full_file_diffs = False

def include_target_commit_as_header(self):
"""
Expand All @@ -126,6 +129,9 @@ def include_target_commit_as_header(self):
def convert_to_dataset_format(self, do_convert_to_dataset_format=True):
self.do_convert_to_dataset_format = do_convert_to_dataset_format

def full_file_diffs(self, do_full_file_diffs=True):
self.do_full_file_diffs = do_full_file_diffs

@classmethod
def _normalize_output_path(cls, output_path):
if not output_path or output_path == "-":
Expand Down Expand Up @@ -254,6 +260,7 @@ def write_diff(self):
self.has_changes = False
for ds_path in self.all_ds_paths:
self.has_changes |= self.write_ds_diff_for_path(ds_path)
self.has_changes |= self.write_file_diff(self.get_file_diff())
self.write_warnings_footer()

def write_ds_diff_for_path(self, ds_path):
Expand All @@ -267,7 +274,38 @@ def write_ds_diff(self, ds_path, ds_diff):
"""For outputting ds_diff, the diff of a particular dataset."""
raise NotImplementedError()

def get_repo_diff(self):
def write_file_diff(self, file_diff):
"""For outputting file_diff - all the files that have changed, without reference to any dataset."""
raise NotImplementedError()

def _full_file_delta(self, delta, skip_binary_files=False):
def get_blob(half_delta):
return self.repo[half_delta.value] if half_delta else None

def is_binary(blob):
return blob.is_binary if blob else False

old_blob = get_blob(delta.old)
new_blob = get_blob(delta.new)
delta_is_binary = is_binary(old_blob) or is_binary(new_blob)

if is_binary(old_blob) or is_binary(new_blob):
delta.flags |= BINARY_FILE
if skip_binary_files:
return delta

blob_to_text = lambda blob: b64encode_str(memoryview(blob))
else:
blob_to_text = lambda blob: str(memoryview(blob), "utf-8")

old_half_delta = (delta.old_key, blob_to_text(old_blob)) if delta.old else None
new_half_delta = (delta.new_key, blob_to_text(new_blob)) if delta.new else None

result = Delta(old_half_delta, new_half_delta)
result.flags = delta.flags
return result

def get_repo_diff(self, include_files=True):
"""
Generates a RepoDiff containing an entry for every dataset in the repo (that matches self.repo_key_filter).
"""
Expand All @@ -278,6 +316,7 @@ def get_repo_diff(self):
workdir_diff_cache=self.workdir_diff_cache,
repo_key_filter=self.repo_key_filter,
convert_to_dataset_format=self.do_convert_to_dataset_format,
include_files=include_files,
)

def get_dataset_diff(self, ds_path):
Expand All @@ -302,6 +341,15 @@ def get_dataset_diff(self, ds_path):
convert_to_dataset_format=self.do_convert_to_dataset_format,
)

def get_file_diff(self):
"""Returns the DatasetDiff object for the deltas that do not belong to any dataset."""
return diff_util.get_file_diff(
self.base_rs,
self.target_rs,
include_wc_diff=False,
repo_key_filter=self.repo_key_filter,
)

def filtered_dataset_deltas(self, ds_path, ds_diff):
"""
Yields the key, delta for only those deltas from the given dataset diff that match
Expand Down Expand Up @@ -586,6 +634,8 @@ def _get_old_or_new_dataset(self, ds_path):
so, useful for accessing things that won't change (its path, its type), or for accessing
things that haven't changed (ie, check the diff first to make sure it hasn't changed).
"""
if ds_path == FILES_KEY:
return None
dataset = self.base_rs.datasets().get(ds_path)
if not dataset:
dataset = self.target_rs.datasets().get(ds_path)
Expand Down
7 changes: 7 additions & 0 deletions kart/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,11 @@ def feature_count_diff(
help="Ignores file format differences in any new files when generating the diff - assumes that the new files will "
"also committed using --convert-to-dataset-format, so the conversion step will remove the format differences.",
)
@click.option(
"--diff-files",
is_flag=True,
help="Show changes to file contents (instead of just showing the object IDs of changed files)",
)
@click.argument(
"args",
metavar="[REVISIONS] [--] [FILTERS]",
Expand All @@ -139,6 +144,7 @@ def diff(
only_feature_count,
add_feature_count_estimate,
convert_to_dataset_format,
diff_files,
args,
):
"""
Expand Down Expand Up @@ -199,6 +205,7 @@ def diff(
diff_estimate_accuracy=add_feature_count_estimate,
)
diff_writer.convert_to_dataset_format(convert_to_dataset_format)
diff_writer.full_file_diffs(diff_files)
diff_writer.write_diff()

if exit_code or output_type == "quiet":
Expand Down
16 changes: 15 additions & 1 deletion kart/diff_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,14 @@
from .exceptions import InvalidOperation


# A pseudo "dataset-path" used as the key for storing files. Files are all individual top-level items in the repo,
# not pieces of a dataset - and so are addressed like this "path/to/file.txt" rather than something like this
# "<dataset-path>:feature:<primary-key>". They may be alongside dataset content (ie <dataset-path>/my-attachment.txt)
# or not (ie <repo-root>/my-attachment.txt) but when outputting diffs, they all go in the <files> area and not in any dataset.
# Note that this is not a valid dataset-path, so it cannot conflict with any dataset.
FILES_KEY = "<files>"


class Conflict(Exception):
pass

Expand Down Expand Up @@ -42,6 +50,7 @@ def get_lazy_value(self):

# Delta flags:
WORKING_COPY_EDIT = 0x1 # Delta represents a change made in the WC - it is "dirty".
BINARY_FILE = 0x2 # Delta is a change to a binary file.


@dataclass
Expand Down Expand Up @@ -182,7 +191,12 @@ def __add__(self, other):
result.flags = self.flags | other.flags
return result

def to_plus_minus_dict(self):
def to_plus_minus_dict(self, minimal=False):
# NOTE - it might make more sense to have:
# insert ++ delete -- update - + minimal-update +
# but this is a breaking change, would need to be co-ordinated.
if minimal and self.old and self.new:
return {"*": self.new_value}
result = {}
if self.old:
result["-"] = self.old_value
Expand Down
134 changes: 129 additions & 5 deletions kart/diff_util.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import functools
import logging
import re
from pathlib import PurePosixPath
import subprocess


from .diff_structs import DatasetDiff, RepoDiff
from .key_filters import DatasetKeyFilter, RepoKeyFilter
from .structure import RepoStructure
from kart.cli_util import tool_environment
from kart.diff_structs import FILES_KEY, Delta, DeltaDiff, DatasetDiff, RepoDiff
from kart.exceptions import SubprocessError
from kart.key_filters import DatasetKeyFilter, RepoKeyFilter
from kart.structure import RepoStructure

L = logging.getLogger("kart.diff_util")

Expand Down Expand Up @@ -41,6 +46,7 @@ def get_repo_diff(
workdir_diff_cache=None,
repo_key_filter=RepoKeyFilter.MATCH_ALL,
convert_to_dataset_format=False,
include_files=False,
):
"""
Generates a RepoDiff containing an entry for every dataset in the repo
Expand All @@ -52,6 +58,10 @@ def get_repo_diff(
workdir_diff_cache - not required, but can be provided if a WorkdirDiffCache is already in use
to save repeated work.
repo_key_filter - controls which datasets (and PK values) match and are included in the diff.
convert_to_dataset_format - whether to show the diff of what would be committed if files were
converted to dataset format at commit-time (ie, for point-cloud tiles)
include_files - whether to include a DatasetDiff in the result for changes to files that
are simply standalone files, rather than part of a dataset's contents.
"""

all_ds_paths = get_all_ds_paths(base_rs, target_rs, repo_key_filter)
Expand All @@ -70,7 +80,12 @@ def get_repo_diff(
ds_filter=repo_key_filter[ds_path],
convert_to_dataset_format=convert_to_dataset_format,
)
# No need to recurse since self.get_dataset_diff already prunes the dataset diffs.
if include_files:
file_diff = get_file_diff(base_rs, target_rs, repo_key_filter=repo_key_filter)
if file_diff:
repo_diff.recursive_set([FILES_KEY, FILES_KEY], file_diff)

# No need to prune recursively since self.get_dataset_diff already prunes the dataset diffs.
repo_diff.prune(recurse=False)
return repo_diff

Expand Down Expand Up @@ -138,3 +153,112 @@ def get_dataset_diff(
)
ds_diff.prune()
return ds_diff


ZEROES = re.compile(r"0+")


def get_file_diff(
base_rs,
target_rs,
*,
include_wc_diff=False,
workdir_diff_cache=None,
repo_key_filter=RepoKeyFilter.MATCH_ALL,
):
"""
Returns a delta-diff for changed files aka attachments.
Each delta just contains the old and new file OIDs - any more than this may be unhelpful since it takes
CPU time to produce but isn't necessarily easier to consume than OIDs, which are straight-forward to
turn into raw files once you know how. (Various diff-writers can transform these OIDs into inline diffs if you
set the --diff-files flag).
"""

# We don't yet support attachment diffs in the workdir
assert not include_wc_diff

old_tree = base_rs.tree
new_tree = target_rs.tree
repo = target_rs.repo

# TODO - make sure this is skipping over datasets efficiently.
# TODO - we could turn on rename detection.
cmd = [
"git",
"-C",
repo.path,
"diff",
old_tree.hex,
new_tree.hex,
"--raw",
"--no-renames",
"--",
":^.kart.*", # Top-level hidden kart blobs
":^**/.*dataset*/**", # Data inside datasets
]
try:
lines = (
subprocess.check_output(
cmd,
encoding="utf8",
env=tool_environment(),
)
.strip()
.splitlines()
)
except subprocess.CalledProcessError as e:
raise SubprocessError(
f"There was a problem with git diff: {e}", called_process_error=e
)

attachment_deltas = DeltaDiff()

for line in lines:
parts = line.split()
old_sha, new_sha, path = parts[2], parts[3], parts[5]
if not path_matches_repo_key_filter(path, repo_key_filter):
continue
if is_dataset_metadata_xml(old_tree, new_tree, path):
continue
old_half_delta = (path, old_sha) if not ZEROES.fullmatch(old_sha) else None
new_half_delta = (path, new_sha) if not ZEROES.fullmatch(new_sha) else None
attachment_deltas.add_delta(Delta(old_half_delta, new_half_delta))

return attachment_deltas


def path_matches_repo_key_filter(path, repo_key_filter):
if repo_key_filter.match_all:
return True
# Return attachments that have a name that we are matching all of.
if path in repo_key_filter and repo_key_filter[path].match_all:
return True
# Return attachments that are inside a folder that we are matching all of.
for p, dataset_filter in repo_key_filter.items():
if not dataset_filter.match_all:
continue
if p == path:
return True
if path.startswith(p) and (p.endswith("/") or path[len(p)] == "/"):
return True
# Don't return attachments inside a dataset / folder that we are only matching some of
# ie, only matching certain features or meta items.
return False


def is_dataset_metadata_xml(old_tree, new_tree, path):
# metadata.xml is sometimes stored in the "attachment" area, alongside the dataset, rather than inside it.
# Storing it in this unusual location adds complexity without actually solving any problems, so any datasets designed
# after table.v3 don't do this. Since this change already appears in meta-diffs, we suppress it from attachment diffs.
if not path.endswith("/metadata.xml"):
return False
parent_path = str(PurePosixPath(path).parent)
for tree in (old_tree, new_tree):
try:
parent_tree = tree / parent_path
for sibling in parent_tree:
if sibling.name.startswith(".table-dataset"):
return True
except KeyError:
continue
return False

0 comments on commit 4c08365

Please sign in to comment.