Skip to content

Commit

Permalink
Merge pull request #962 from koordinates/quicker-diffs
Browse files Browse the repository at this point in the history
Faster diffs: Don't always run pygit2.Tree.diff_tree
  • Loading branch information
olsen232 committed Jan 31, 2024
2 parents 0247ef9 + 6dbc748 commit fe9958e
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 70 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Prevented committing local changes to linked datasets. [#953](https://github.com/koordinates/kart/pull/953)
- Fixes a bug where even datasets marked as `--no-checkout` are checked out when the working copy is first created. [#954](https://github.com/koordinates/kart/pull/954)
- Fixes a bug where minor changes to auxiliary metadata (.aux.xml) files that Kart is supposed to ignore were preventing branch changes. [#957](https://github.com/koordinates/kart/pull/957)
- Improved performance when the user filters a diff request to only show certain features. [#962](https://github.com/koordinates/kart/pull/962)

## 0.15.0

Expand Down
245 changes: 181 additions & 64 deletions kart/dataset_mixins.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from collections.abc import Iterable
from typing import Callable, Optional

import pygit2

from kart.diff_structs import DatasetDiff, DeltaDiff, Delta
Expand All @@ -6,15 +9,20 @@


class DatasetDiffMixin:
"""Adds diffing of meta-items to a dataset, by delegating to dataset.meta_items()"""
"""
This mixin should be added to a dataset to add diffing functionality.
self.diff_meta() should work "out of the box" as long as self.meta_items() is implemented.
self.diff_subtree() can be called with appropriate args to generate diffs of dataset contents, eg, features.
"""

# Returns the meta-items diff for this dataset.
def diff(
self,
other,
ds_filter=DatasetKeyFilter.MATCH_ALL,
reverse=False,
diff_format=DiffFormat.FULL,
self: "BaseDataset",
other: Optional["BaseDataset"],
ds_filter: DatasetKeyFilter = DatasetKeyFilter.MATCH_ALL,
reverse: bool = False,
diff_format: DiffFormat = DiffFormat.FULL,
):
"""
Generates a Diff from self -> other.
Expand All @@ -25,7 +33,12 @@ def diff(
ds_diff["meta"] = self.diff_meta(other, meta_filter, reverse=reverse)
return ds_diff

def diff_meta(self, other, meta_filter=MetaKeyFilter.MATCH_ALL, reverse=False):
def diff_meta(
self: "BaseDataset",
other: Optional["BaseDataset"],
meta_filter: UserStringKeyFilter = MetaKeyFilter.MATCH_ALL,
reverse: bool = False,
):
"""
Returns a diff from self -> other, but only for meta items.
If reverse is true, generates a diff from other -> self.
Expand All @@ -48,11 +61,11 @@ def diff_meta(self, other, meta_filter=MetaKeyFilter.MATCH_ALL, reverse=False):
return DeltaDiff.diff_dicts(meta_old, meta_new)

def diff_to_working_copy(
self,
workdir_diff_cache,
ds_filter=DatasetKeyFilter.MATCH_ALL,
self: "BaseDataset",
workdir_diff_cache: "WorkdirDiffCache",
ds_filter: DatasetKeyFilter = DatasetKeyFilter.MATCH_ALL,
*,
convert_to_dataset_format=None,
convert_to_dataset_format: bool = None,
):
"""
Generates a diff from self to the working-copy.
Expand All @@ -63,12 +76,16 @@ def diff_to_working_copy(
# Subclasses to override.
pass

def get_raw_diff_for_subtree(self, other, subtree_name, reverse=False):
def get_raw_diff_for_subtree(
self: "BaseDataset",
other: Optional["BaseDataset"],
subtree_name: str,
reverse: bool = False,
):
"""
Get a pygit2.Diff of the diff between some subtree of this dataset, and the same subtree of another dataset
(generally the "same" dataset at a different revision).
(typically actually the "same" dataset, but at a different revision).
"""

flags = pygit2.GIT_DIFF_SKIP_BINARY_CHECK
self_subtree = self.get_subtree(subtree_name)
other_subtree = other.get_subtree(subtree_name) if other else self._empty_tree
Expand All @@ -85,14 +102,15 @@ def get_raw_diff_for_subtree(self, other, subtree_name, reverse=False):
return diff

def diff_subtree(
self,
other,
subtree_name,
key_filter=UserStringKeyFilter.MATCH_ALL,
self: "BaseDataset",
other: Optional["BaseDataset"],
subtree_name: str,
key_filter: UserStringKeyFilter = UserStringKeyFilter.MATCH_ALL,
*,
key_decoder_method,
value_decoder_method,
reverse=False,
key_decoder_method: str,
value_decoder_method: str,
key_encoder_method: Optional[str] = None,
reverse: bool = False,
):
"""
A pattern for datasets to use for diffing some specific subtree. Works as follows:
Expand All @@ -111,42 +129,50 @@ def diff_subtree(
key_filter - deltas are only yielded if they involve at least one key that matches the key filter.
key_decoder_method, value_decoder_method - these must be names of methods that are present in both
self and other - self's methods are used to decode self's items, and other's methods for other's items.
key_encoder_method - optional. A method that is present in both self and other that allows us to go
straight to the keys the user is interested in (if they have requested specific keys in the key_filter).
reverse - normally yields deltas from self -> other, but if reverse is True, yields deltas from other -> self.
"""
# TODO - if the key-filter is very restrictive (ie it has only a few items in) then
# it would be more efficient if we first search for those items and diff only those.

subtree_name = subtree_name.rstrip("/")
raw_diff = self.get_raw_diff_for_subtree(other, subtree_name, reverse=reverse)
# NOTE - we could potentially call diff.find_similar() to detect renames here,

if reverse:
old, new = other, self
if not key_filter.match_all and key_encoder_method is not None:
# Handle the case where we are only interested in a few features.
deltas = self.get_raw_deltas_for_keys(
other, key_encoder_method, key_filter, reverse=reverse
)
else:
old, new = self, other
raw_diff = self.get_raw_diff_for_subtree(
other, subtree_name, reverse=reverse
)
# NOTE - we could potentially call diff.find_similar() to detect renames here
deltas = self.wrap_deltas_from_raw_diff(
raw_diff, lambda path: f"{subtree_name}/{path}"
)

def _no_dataset_error(method_name):
raise RuntimeError(
f"Can't call {method_name} to decode diff deltas: dataset is None"
)

def get_decoder(dataset, method_name):
def get_dataset_attr(dataset, method_name):
if dataset is not None:
return getattr(dataset, method_name)
# This shouldn't happen:
return lambda x: _no_dataset_error(method_name)

path_decoder = lambda path: f"{subtree_name}/{path}"
if reverse:
old, new = other, self
else:
old, new = self, other

yield from self.transform_raw_deltas(
raw_diff.deltas,
deltas,
key_filter,
old_path_transform=path_decoder,
old_key_transform=get_decoder(old, key_decoder_method),
old_value_transform=get_decoder(old, value_decoder_method),
new_path_transform=path_decoder,
new_key_transform=get_decoder(new, key_decoder_method),
new_value_transform=get_decoder(new, value_decoder_method),
old_key_transform=get_dataset_attr(old, key_decoder_method),
old_value_transform=get_dataset_attr(old, value_decoder_method),
new_key_transform=get_dataset_attr(new, key_decoder_method),
new_value_transform=get_dataset_attr(new, value_decoder_method),
)

# We treat UNTRACKED like an ADD since we don't have a staging area -
Expand All @@ -160,78 +186,169 @@ def get_decoder(dataset, method_name):
_INSERT_UPDATE = _INSERT_TYPES + _UPDATE_TYPES
_UPDATE_DELETE = _UPDATE_TYPES + _DELETE_TYPES

def get_raw_deltas_for_keys(
self: "BaseDataset",
other: Optional["BaseDataset"],
key_encoder_method: str,
key_filter: UserStringKeyFilter,
reverse: bool = False,
):
"""
Since we know which keys we are looking for, we can encode those keys and look up those blobs directly.
We output this as a series of deltas, just as we do when we run a normal diff, so that we can
take output from either code path and use it to generate a kart diff using transform_raw_deltas.
"""

def _expand_keys(keys):
# If the user asks for mydataset:feature:123 they might mean str("123") or int(123) - which
# would be encoded differently. We look up both paths to see what we can find.
for key in keys:
yield key
if isinstance(key, str) and key.isdigit():
yield int(key)

encode_fn = getattr(self, key_encoder_method)
paths = set()
for key in _expand_keys(key_filter):
try:
paths.add(encode_fn(key, relative=True))
except TypeError:
# The path encoder for this dataset can't encode that key, so it won't be there.
continue

if reverse:
old, new = other, self
else:
old, new = self, other

def _get_blob(dataset, path):
if dataset is None or dataset.inner_tree is None:
return None
try:
return dataset.inner_tree / path
except KeyError:
return None

for path in paths:
old_blob = _get_blob(old, path)
new_blob = _get_blob(new, path)
if old_blob is None and new_blob is None:
continue
if (
old_blob is not None
and new_blob is not None
and old_blob.oid == new_blob.oid
):
continue
yield RawDiffDelta.of(
path if old_blob else None, path if new_blob else None
)

def wrap_deltas_from_raw_diff(
self: "BaseDataset", raw_diff: pygit2.Diff, path_transform: Callable[[str], str]
):
for delta in raw_diff.deltas:
old_path = path_transform(delta.old_file.path) if delta.old_file else None
new_path = path_transform(delta.new_file.path) if delta.new_file else None
yield RawDiffDelta(delta.status, delta.status_char(), old_path, new_path)

def transform_raw_deltas(
self,
deltas,
key_filter=UserStringKeyFilter.MATCH_ALL,
self: "BaseDataset",
deltas: Iterable["RawDiffDelta"],
key_filter: UserStringKeyFilter = UserStringKeyFilter.MATCH_ALL,
*,
old_path_transform=lambda x: x,
old_key_transform=lambda x: x,
old_value_transform=lambda x: x,
new_path_transform=lambda x: x,
new_key_transform=lambda x: x,
new_value_transform=lambda x: x,
old_key_transform: Callable[[str], str] = lambda x: x,
old_value_transform: Callable[[str], str] = lambda x: x,
new_key_transform: Callable[[str], str] = lambda x: x,
new_value_transform: Callable[[str], str] = lambda x: x,
):
"""
Given a list of deltas - inserts, updates, and deletes -
Given a list of RawDiffDeltas - inserts, updates, and deletes that happened at particular paths -
yields a list of Kart deltas, which look something like ((old_key, old_value), (new_key, new_value)).
A key could be a path, a meta-item name, or a primary key value.
key-filter - deltas are discarded if they don't involve any keys that matches the key filter.
old/new_path_transform - converts the raw-path into a canonical path.
Useful if the raw-path is not relative to the preferred folder, you can tidy it up first.
old/new_key_transform - converts the canonical-path into a key.
old/new_key_transform - converts the path into a key.
old/new_value_transform - converts the canonical-path into a value,
presumably first by loading the file contents at that path.
If any transform is not set, that transform defaults to returning the value it was input.
"""
for d in deltas:
self.L.debug(
"diff(): %s %s %s", d.status_char(), d.old_file.path, d.new_file.path
)
self.L.debug("diff(): %s %s %s", d.status_char, d.old_path, d.new_path)

if d.status not in self._INSERT_UPDATE_DELETE:
# RENAMED, COPIED, IGNORED, TYPECHANGE, UNMODIFIED, UNREADABLE
# We don't enounter these status codes in the diffs we generate.
raise NotImplementedError(f"Delta status: {d.status_char()}")
raise NotImplementedError(f"Delta status: {d.status_char}")

if d.status in self._UPDATE_DELETE:
old_path = old_path_transform(d.old_file.path)
old_key = old_key_transform(old_path)
old_key = old_key_transform(d.old_path)
else:
old_key = None

if d.status in self._INSERT_UPDATE:
new_path = new_path_transform(d.new_file.path)
new_key = new_key_transform(d.new_file.path)
new_key = new_key_transform(d.new_path)
else:
new_key = None

if old_key not in key_filter and new_key not in key_filter:
continue

if d.status in self._INSERT_TYPES:
self.L.debug("diff(): insert %s (%s)", new_path, new_key)
self.L.debug("diff(): insert %s (%s)", d.new_path, new_key)
elif d.status in self._UPDATE_TYPES:
self.L.debug(
"diff(): update %s %s -> %s %s",
old_path,
d.old_path,
old_key,
new_path,
d.new_path,
new_key,
)
elif d.status in self._DELETE_TYPES:
self.L.debug("diff(): delete %s %s", old_path, old_key)
self.L.debug("diff(): delete %s %s", d.old_path, old_key)

if d.status in self._UPDATE_DELETE:
old_half_delta = old_key, old_value_transform(old_path)
old_half_delta = old_key, old_value_transform(d.old_path)
else:
old_half_delta = None

if d.status in self._INSERT_UPDATE:
new_half_delta = new_key, new_value_transform(new_path)
new_half_delta = new_key, new_value_transform(d.new_path)
else:
new_half_delta = None

yield Delta(old_half_delta, new_half_delta)


class RawDiffDelta:
"""
Just like pygit2.DiffDelta, this simply stores the fact that a particular git blob has changed.
Exactly how it is changed is not stored - just the status and the blob paths.
Contrast with diff_structs.Delta, which is higher level - it stores information about
a particular feature or meta-item that has changed, and exposes the values it has changed from and to.
This is needed to fill the same purpose as pygit2.DiffDelta because pygit2.DiffDelta's can't be
created except by running a pygit2 diff - which we don't always want to do when generating diff deltas:
see get_raw_deltas_for_keys.
"""

__slots__ = ["status", "status_char", "old_path", "new_path"]

def __init__(self, status, status_char, old_path, new_path):
self.status = status
self.status_char = status_char
self.old_path = old_path
self.new_path = new_path

@classmethod
def of(cls, old_path, new_path, reverse=False):
if reverse:
old_path, new_path = new_path, old_path

if old_path is None:
return RawDiffDelta(pygit2.GIT_DELTA_ADDED, "A", old_path, new_path)
elif new_path is None:
return RawDiffDelta(pygit2.GIT_DELTA_DELETED, "D", old_path, new_path)
else:
return RawDiffDelta(pygit2.GIT_DELTA_MODIFIED, "M", old_path, new_path)
1 change: 1 addition & 0 deletions kart/tabular/rich_table_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def diff_feature(
"feature",
key_filter=feature_filter,
key_decoder_method="decode_path_to_1pk",
key_encoder_method="encode_1pk_to_path",
value_decoder_method="get_feature_promise_from_path",
reverse=reverse,
)
Expand Down

0 comments on commit fe9958e

Please sign in to comment.