Skip to content

Commit

Permalink
Treat metadata.xml as an attached file...
Browse files Browse the repository at this point in the history
not as a hidden "meta item".
- show it in <files> diff not "<dataset>:meta" diff
- ignore changes to gpkg_metadata when finding WC meta diffs
  (this may be reintroduced later as a <files> diff)
  • Loading branch information
olsen232 committed Sep 19, 2022
1 parent 91e87cd commit 8ee77a2
Show file tree
Hide file tree
Showing 23 changed files with 158 additions and 203 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Improve argument parsing for `kart diff` and `kart show`. [#706](https://github.com/koordinates/kart/issues/706)
- Bugfix: don't allow `kart merge` to fast-forward if the user specifies a merge message. [#705](https://github.com/koordinates/kart/issues/705)
- `diff`, `show`, `create-patch` and `apply` now handle attached files (as could already be created using `kart commit-files`) [#583](https://github.com/koordinates/kart/issues/583)
- `metadata.xml` is now consistently treated as an attached file, rather than as a hidden "meta item". [#583](https://github.com/koordinates/kart/issues/583)

## 0.11.5

Expand Down
3 changes: 1 addition & 2 deletions docs/pages/meta_items.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,7 @@ Here are some example meta items.
}
}

The possible meta items are named: - ``title`` - ``description`` -
``schema.json`` - ``crs/<some-identifier>.wkt`` - ``metadata.xml``
The possible meta items are named: - ``title`` - ``description`` - ``schema.json`` - ``crs/<some-identifier>.wkt``

For a more complete specification, see Kart's
:ref:`Datasets V3`
Expand Down
18 changes: 15 additions & 3 deletions kart/base_dataset.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
import binascii
import functools
import logging
import re
import sys

import click
import pygit2

from kart.core import find_blobs_with_paths_in_tree
from kart.dataset_mixins import DatasetDiffMixin
from kart.exceptions import InvalidOperation, UNSUPPORTED_VERSION, PATCH_DOES_NOT_APPLY
from kart.meta_items import MetaItemFileType, MetaItemVisibility
from kart.serialise_util import ensure_bytes, json_pack


class BaseDatasetMetaClass(type):
Expand Down Expand Up @@ -278,6 +276,16 @@ def _meta_item_paths_grouped_by_definition(self):
result.setdefault(definition, []).append(meta_item_path)
return result

def attachments(self):
    """
    Yields (name, data) tuples for each attached file — that is, each blob
    stored directly inside this dataset's top-level tree.
    """
    tree = self.tree
    if tree is None:
        return
    yield from (
        (entry.name, entry.data)
        for entry in tree
        if entry.type == pygit2.GIT_OBJ_BLOB
    )

def get_attachment(self, name, missing_ok=True):
    """
    Returns the data of the attachment called `name`, looked up in this
    dataset's top-level tree. If missing_ok is True, a missing attachment
    is handled by get_data_at rather than raised here.
    """
    return self.get_data_at(name, from_tree=self.tree, missing_ok=missing_ok)

def apply_meta_diff(
self, meta_diff, object_builder, *, resolve_missing_values_from_ds=None
):
Expand Down Expand Up @@ -423,6 +431,10 @@ def ensure_full_path(self, path):
# A path relative to the inner tree.
return f"{self.inner_path}/{path}"

def full_attachment_path(self, rel_path):
    """Given the path of an attachment relative to this dataset's attachment path, returns its full path from the repo root."""
    return "/".join((self.path, rel_path))

def ensure_rel_path(self, path):
"""Given a full path to something in this dataset, returns its path relative to the dataset inner-path."""
if path.startswith(self.inner_path):
Expand Down
22 changes: 0 additions & 22 deletions kart/diff_util.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
import functools
import logging
import re
from pathlib import PurePosixPath
import subprocess

from kart.cli_util import tool_environment
Expand Down Expand Up @@ -218,8 +216,6 @@ def get_file_diff(
old_sha, new_sha, path = parts[2], parts[3], parts[5]
if not path_matches_repo_key_filter(path, repo_key_filter):
continue
if is_dataset_metadata_xml(old_tree, new_tree, path):
continue
old_half_delta = (path, old_sha) if not ZEROES.fullmatch(old_sha) else None
new_half_delta = (path, new_sha) if not ZEROES.fullmatch(new_sha) else None
attachment_deltas.add_delta(Delta(old_half_delta, new_half_delta))
Expand All @@ -244,21 +240,3 @@ def path_matches_repo_key_filter(path, repo_key_filter):
# Don't return attachments inside a dataset / folder that we are only matching some of
# ie, only matching certain features or meta items.
return False


def is_dataset_metadata_xml(old_tree, new_tree, path):
    """
    Returns True if `path` is a metadata.xml file stored alongside a table.v3 dataset.

    metadata.xml is sometimes stored in the "attachment" area, alongside the dataset,
    rather than inside it. Storing it in this unusual location adds complexity without
    actually solving any problems, so any datasets designed after table.v3 don't do this.
    Since this change already appears in meta-diffs, we suppress it from attachment diffs.
    """
    if not path.endswith("/metadata.xml"):
        return False
    parent_path = str(PurePosixPath(path).parent)
    for tree in (old_tree, new_tree):
        # The parent directory may not exist in one (or both) of the trees.
        try:
            parent_tree = tree / parent_path
        except KeyError:
            continue
        if any(entry.name.startswith(".table-dataset") for entry in parent_tree):
            return True
    return False
3 changes: 0 additions & 3 deletions kart/meta_items.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,9 +187,6 @@ def match_group(self, meta_item_path, match_group):
# JSON representation of the dataset's schema. See kart/tabular/schema.py, datasets_v3.rst
SCHEMA_JSON = MetaItemDefinition("schema.json", SchemaJsonFileType.INSTANCE)

# Any XML metadata about the dataset.
METADATA_XML = MetaItemDefinition("metadata.xml", MetaItemFileType.XML)

# No more than one unnamed CRS definition in a single file named "crs.wkt":
CRS_WKT = MetaItemDefinition("crs.wkt", MetaItemFileType.WKT)

Expand Down
2 changes: 0 additions & 2 deletions kart/point_cloud/v1.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ class PointCloudV1(BaseDataset):
TITLE = meta_items.TITLE
DESCRIPTION = meta_items.DESCRIPTION
TAGS_JSON = meta_items.TAGS_JSON
METADATA_XML = meta_items.METADATA_XML

# Which tile format(s) this dataset requires / allows.
FORMAT_JSON = MetaItemDefinition("format.json", MetaItemFileType.JSON)
Expand All @@ -65,7 +64,6 @@ class PointCloudV1(BaseDataset):
TITLE,
DESCRIPTION,
TAGS_JSON,
METADATA_XML,
FORMAT_JSON,
SCHEMA_JSON,
CRS_WKT,
Expand Down
13 changes: 8 additions & 5 deletions kart/sqlalchemy/adapter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,13 +129,11 @@ def sql_type_to_v2_type(cls, sql_type):
return v2_type, {}

@classmethod
def all_v2_meta_items(
cls, sess, db_schema, table_name, id_salt, include_legacy_items=False
):
def all_v2_meta_items(cls, sess, db_schema, table_name, id_salt):
"""
Returns a dict all V2 meta items for the specified table.
Guaranteed to at least generate the table's V2 schema with key "schema.json", if the table exists at all.
Possibly returns any or all of the title, description, xml metadata, and attached CRS definitions.
Possibly returns any or all of the title, description, and attached CRS definitions.
Varying the id_salt varies the column ids that are generated for the schema.json item -
these are generated deterministically so that running the same command twice in a row produces the same output.
But if the user does something different later, a different salt should be provided.
Expand All @@ -152,10 +150,15 @@ def all_v2_meta_items(
db_schema,
table_name,
id_salt,
include_legacy_items=include_legacy_items,
)
)

@classmethod
def get_metadata_xml(cls, sess, db_schema, table_name):
    """
    Read the XML metadata stored within the database for this table, if any.

    Most database types have nowhere to store XML metadata alongside a table,
    so this base implementation always returns None - the GPKG adapter
    overrides it to read from gpkg_metadata.
    """
    return None

@classmethod
def remove_empty_values(cls, meta_items):
"""
Expand Down
45 changes: 27 additions & 18 deletions kart/sqlalchemy/adapter/gpkg.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
)
from kart.sqlalchemy.gpkg import Db_GPKG
from kart.schema import ColumnSchema, Schema
from kart.serialise_util import ensure_text
from kart.timestamps import datetime_to_iso8601_utc
from kart.utils import ungenerator

Expand Down Expand Up @@ -156,26 +157,34 @@ def all_gpkg_meta_items(cls, v2_obj, table_name):
)

@classmethod
def all_v2_meta_items_including_empty(
cls, sess, db_schema, table_name, id_salt, include_legacy_items=False
):
def all_v2_meta_items_including_empty(cls, sess, db_schema, table_name, id_salt):
"""
Generate all V2 meta items for the given table.
Varying the id_salt varies the ids that are generated for the schema.json item.
"""
assert not db_schema
include_metadata_json = include_legacy_items

gpkg_meta_items = cls._gpkg_meta_items_from_db(sess, table_name)
return cls.all_v2_meta_items_from_gpkg_meta_items(
gpkg_meta_items, id_salt, include_metadata_json
gpkg_meta_items = cls._gpkg_meta_items_from_db(
sess, table_name, skip_keys={"gpkg_metadata", "gpkg_metadata_reference"}
)
return cls.all_v2_meta_items_from_gpkg_meta_items(gpkg_meta_items, id_salt)

@classmethod
def get_metadata_xml(cls, sess, db_schema, table_name):
    """
    Returns the metadata of type text/xml that is contained in gpkg_metadata
    and associated with the given table, if any.

    May return a ListOfConflicts when more than one distinct, useful XML
    metadata entry is attached to the table (see gpkg_to_xml_metadata).
    """
    # GPKG has no database-schema concept, so db_schema must be falsy.
    assert not db_schema

    # Only read the two gpkg_metadata* tables - everything else is handled
    # as ordinary meta items elsewhere.
    gpkg_meta_items = cls._gpkg_meta_items_from_db(
        sess, table_name, keys={"gpkg_metadata", "gpkg_metadata_reference"}
    )
    return cls.gpkg_to_xml_metadata(gpkg_meta_items)

@classmethod
@ungenerator(dict)
def all_v2_meta_items_from_gpkg_meta_items(
cls, gpkg_meta_items, id_salt=None, include_metadata_json=False
):
def all_v2_meta_items_from_gpkg_meta_items(cls, gpkg_meta_items, id_salt=None):
"""
Generate all the V2 meta items from the given gpkg_meta_items lists / dicts -
either loaded from JSON, or generated directly from the database.
Expand All @@ -193,10 +202,6 @@ def all_v2_meta_items_from_gpkg_meta_items(
schema = cls._gpkg_to_v2_schema(gpkg_meta_items, id_salt)
yield "schema.json", schema.to_column_dicts() if schema else None

if include_metadata_json:
yield "metadata/dataset.json", cls.gpkg_to_json_metadata(gpkg_meta_items)
yield "metadata.xml", cls.gpkg_to_xml_metadata(gpkg_meta_items)

gpkg_spatial_ref_sys = gpkg_meta_items.get("gpkg_spatial_ref_sys")
for gsrs in gpkg_spatial_ref_sys:
d = gsrs["definition"]
Expand Down Expand Up @@ -311,8 +316,11 @@ def generate_gpkg_spatial_ref_sys(cls, v2_obj):
@classmethod
def generate_gpkg_metadata(cls, v2_obj, table_name, reference=False):
metadata_xml = v2_obj.get_meta_item("metadata.xml")
if metadata_xml is None:
metadata_xml = ensure_text(v2_obj.get_attachment("metadata.xml"))
if metadata_xml is not None:
return cls.xml_to_gpkg_metadata(metadata_xml, table_name, reference)

v2json = v2_obj.get_meta_item("metadata/dataset.json")
if v2json is not None:
return cls.json_to_gpkg_metadata(v2json, table_name, reference)
Expand Down Expand Up @@ -516,9 +524,8 @@ def gpkg_to_xml_metadata(cls, gpkg_meta_items):
if sole_useful_xml is not None:
return sole_useful_xml

# We can't actually commit a whole list of XML, but we need to return something that makes sense.
# Simply throwing an error here stops dirty-detection working, and stops commands that would fix the situation
# from working, like `kart reset --discard-changes` or `kart create-workingcopy --discard-changes`.
# There's not just one XML metadata attached to this table - there's multiple.
# We leave this to the caller to handle however they choose.
return ListOfConflicts(xml_list)

METADATA_QUERY = """
Expand Down Expand Up @@ -562,7 +569,7 @@ def find_sole_useful_xml(cls, xml_list):

@classmethod
@ungenerator(dict)
def _gpkg_meta_items_from_db(cls, sess, table_name, keys=None):
def _gpkg_meta_items_from_db(cls, sess, table_name, keys=None, skip_keys=None):
"""
Returns metadata from the gpkg_* tables about this GPKG.
"""
Expand Down Expand Up @@ -607,6 +614,8 @@ def _gpkg_meta_items_from_db(cls, sess, table_name, keys=None):
for key, (sql, rtype) in QUERIES.items():
if keys is not None and key not in keys:
continue
if skip_keys is not None and key in skip_keys:
continue
# check table exists, the metadata ones may not
if not key.startswith("sqlite_"):
r = sess.execute(
Expand Down
2 changes: 1 addition & 1 deletion kart/sqlalchemy/adapter/mysql.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def _v2_geometry_type_to_sql_type(cls, column_schema, v2_obj=None):
@classmethod
@ungenerator(dict)
def all_v2_meta_items_including_empty(
cls, sess, db_schema, table_name, id_salt=None, include_legacy_items=False
cls, sess, db_schema, table_name, id_salt=None
):
title = sess.scalar(
"""
Expand Down
2 changes: 1 addition & 1 deletion kart/sqlalchemy/adapter/postgis.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ def _v2_geometry_type_to_sql_type(cls, col, v2_obj=None):
@classmethod
@ungenerator(dict)
def all_v2_meta_items_including_empty(
cls, sess, db_schema, table_name, id_salt, include_legacy_items=False
cls, sess, db_schema, table_name, id_salt=None
):
"""
Generate all V2 meta items for the given table.
Expand Down
2 changes: 1 addition & 1 deletion kart/sqlalchemy/adapter/sqlserver.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def v2_type_to_sql_type(cls, col, v2_obj=None):
@classmethod
@ungenerator(dict)
def all_v2_meta_items_including_empty(
cls, sess, db_schema, table_name, id_salt=None, include_legacy_items=False
cls, sess, db_schema, table_name, id_salt=None
):
"""
Generate all V2 meta items for the given table.
Expand Down
4 changes: 2 additions & 2 deletions kart/tabular/import_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,9 +109,9 @@ def meta_items(self):
"""
raise NotImplementedError()

def attachment_items(self):
def attachments(self):
"""
Returns a dict of all the attachment items that need to be imported.
Yields (name, data) tuples for all the attachments that need to be imported.
These are files that will be imported verbatim to dest_path, but not hidden inside the dataset.
This could be a license or a readme.
"""
Expand Down
11 changes: 11 additions & 0 deletions kart/tabular/sqlalchemy_import_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import sqlalchemy
from kart.exceptions import NO_IMPORT_SOURCE, NO_TABLE, NotFound, NotYetImplemented
from kart.output_util import dump_json_output
from kart.list_of_conflicts import ListOfConflicts
from kart.schema import Schema
from kart.sqlalchemy import DbType, separate_last_path_part, strip_username_and_password
from kart.utils import chunk, ungenerator
from kart.serialise_util import ensure_bytes
from sqlalchemy.orm import sessionmaker

from .import_source import TableImportSource
Expand Down Expand Up @@ -237,6 +239,15 @@ def meta_items_from_db(self):
sess, self.db_schema, self.table, id_salt
)

def attachments(self):
    """
    Yields (name, data) tuples for attachments that should be imported alongside
    the dataset. Currently the only attachment sourced from a database is the
    table's XML metadata, yielded as "metadata.xml" (only the GPKG adapter
    actually returns any - see adapter get_metadata_xml).
    """
    with sessionmaker(bind=self.engine)() as sess:
        metadata_xml = self.db_type.adapter.get_metadata_xml(
            sess, self.db_schema, self.table
        )

    # A ListOfConflicts means multiple conflicting XML entries were found for this
    # table - there is no single metadata.xml to import, so yield nothing.
    if metadata_xml and not isinstance(metadata_xml, ListOfConflicts):
        yield "metadata.xml", ensure_bytes(metadata_xml)

def align_schema_to_existing_schema(self, existing_schema):
aligned_schema = existing_schema.align_to_self(self.schema)
self.meta_overrides["schema.json"] = aligned_schema.to_column_dicts()
Expand Down
4 changes: 0 additions & 4 deletions kart/tabular/table_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,6 @@ def schema(self):
self._schema = super().schema
return self._schema

def full_attachment_path(self, rel_path):
    """Given the path of an attachment relative to this dataset's attachment path, returns its full path from the repo root."""
    return self.path + "/" + rel_path

def decode_path(self, path):
"""
Given a relative path to something inside this dataset eg "feature/49/3e/Bg==""
Expand Down

0 comments on commit 8ee77a2

Please sign in to comment.