Merge pull request #940 from koordinates/import-from-s3
Allow for regular import from S3 URLs
olsen232 committed Nov 16, 2023
2 parents 05dd970 + 452fa55 commit 3bc2c91
Showing 10 changed files with 265 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)
- Now using Python 3.11 to build Kart, and vendored dependencies have been updated to newer versions. [#933](https://github.com/koordinates/kart/pull/933)
- Adds support for importing point-cloud and raster tiles directly from an S3 URL. [#940](https://github.com/koordinates/kart/pull/940)

## 0.14.2

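(With this change, tile import sources can be given directly as S3 URLs, including single-level wildcards. An invocation would look something like `kart point-cloud-import s3://example-bucket/tiles/*.laz`, where the bucket and path are made up for illustration and the exact command names and options are those documented by Kart, not taken from this diff.)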
18 changes: 6 additions & 12 deletions kart/byod/importer.py
@@ -1,13 +1,11 @@
import logging

import click

from kart.fast_import import (
write_blob_to_stream,
)
from kart.lfs_util import dict_to_pointer_file_bytes
from kart.progress_util import progress_bar
from kart.s3_util import expand_s3_glob
from kart.tile.tilename_util import PAM_SUFFIX

L = logging.getLogger(__name__)
@@ -19,19 +17,15 @@ class ByodTileImporter:
while leaving the data in-place on existing hosted storage.
"""

EXTRACT_TILE_METADATA_STEP = "Fetching tile metadata"
ALLOWED_SCHEMES = ("s3",)
ALLOWED_SCHEMES_DESC = "an S3 URL"

def sanity_check_sources(self, sources):
for source in sources:
if not source.startswith("s3://"):
raise click.UsageError(f"SOURCE {source} should be an S3 url")

for source in list(sources):
if "*" in source:
sources.remove(source)
sources += expand_s3_glob(source)
@property
def extracting_tile_metadata_desc(self):
return "Fetching tile metadata"

def extract_tile_metadata(self, tile_location):
# Implemented in subclasses.
raise NotImplementedError()

def get_conversion_func(self, source_metadata):
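With `ALLOWED_SCHEMES` and `ALLOWED_SCHEMES_DESC`, the importer declares which URL schemes it accepts instead of hard-coding an S3 check in this class. A minimal sketch of how shared source-checking code could consume such an allow-list (illustrative only; the function name `check_sources` and the exact error wording are assumptions, not taken from this diff):

import click

from kart.s3_util import expand_s3_glob


def check_sources(importer, sources):
    # Reject any source whose scheme is not in the importer's allow-list.
    for source in sources:
        scheme = source.split("://", 1)[0] if "://" in source else None
        if scheme not in importer.ALLOWED_SCHEMES:
            raise click.UsageError(
                f"SOURCE {source} should be {importer.ALLOWED_SCHEMES_DESC}"
            )
    # Expand single-level S3 wildcards into concrete object URLs.
    expanded = []
    for source in sources:
        if source.startswith("s3://") and "*" in source:
            expanded.extend(expand_s3_glob(source))
        else:
            expanded.append(source)
    return expanded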
2 changes: 1 addition & 1 deletion kart/byod/point_cloud_import.py
@@ -134,4 +134,4 @@ def extract_tile_metadata(self, tile_location):
)
tmp_downloaded_tile.unlink()
result["tile"]["url"] = tile_location
return result
return None, result
4 changes: 3 additions & 1 deletion kart/byod/raster_import.py
@@ -127,4 +127,6 @@ def byod_raster_import(
class ByodRasterImporter(ByodTileImporter, RasterImporter):
def extract_tile_metadata(self, tile_location):
oid_and_size = get_hash_and_size_of_s3_object(tile_location)
return extract_raster_tile_metadata(tile_location, oid_and_size=oid_and_size)
return None, extract_raster_tile_metadata(
tile_location, oid_and_size=oid_and_size
)
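Both `extract_tile_metadata` overrides now return a 2-tuple with `None` as the first element, rather than the bare metadata dict. A hedged caller-side illustration of the new shape (the name `tile_path` for the first element is an assumption; the diff only shows that these BYOD importers return `None` there):

# Illustrative unpacking of the new return shape (names are assumptions).
tile_path, metadata = importer.extract_tile_metadata(tile_location)
if tile_path is None:
    # BYOD case: the tile stays on hosted storage (e.g. S3); only its
    # metadata, including the tile's URL, is recorded locally.
    url = metadata["tile"]["url"]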
10 changes: 4 additions & 6 deletions kart/lfs_util.py
@@ -305,21 +305,19 @@ def get_local_path_from_lfs_hash(repo, lfs_hash):


def copy_file_to_local_lfs_cache(
repo, source_path, conversion_func=None, oid_and_size=None
repo, source_path, conversion_func=None, oid_and_size=None, preserve_original=True
):
"""
Given the path to a file, copies it to the appropriate location in the local LFS cache based on its sha256 hash.
Optionally takes a conversion function which can convert the file while copying it - this saves us doing an extra
copy after the convert operation, if we just write the converted version to where we would copy it.
Optionally takes the oid and size of the source, if this is known, to avoid recomputing it.
"""

lfs_tmp_path = repo.gitdir_path / "lfs" / "objects" / "tmp"
lfs_tmp_path.mkdir(parents=True, exist_ok=True)

tmp_object_path = lfs_tmp_path / str(uuid.uuid4())
tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())
if conversion_func is not None:
conversion_func(source_path, tmp_object_path)
elif not preserve_original:
Path(source_path).rename(tmp_object_path)
else:
try:
reflink(source_path, tmp_object_path)
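The new `preserve_original` flag lets a caller move an already-temporary file (for example a tile just downloaded from S3) into the LFS cache instead of copying it. A condensed sketch of the branching visible in this hunk, assuming the rest of the function (hashing and final placement) is unchanged; the helper name is made up, and `shutil.copyfile` stands in for the reflink-with-copy-fallback in the real code:

import shutil
import uuid
from pathlib import Path


def stage_into_lfs_tmp(repo, source_path, conversion_func=None, preserve_original=True):
    # Condensed sketch of the branching in copy_file_to_local_lfs_cache.
    tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())
    if conversion_func is not None:
        # Convert straight into the cache location - saves an extra copy.
        conversion_func(source_path, tmp_object_path)
    elif not preserve_original:
        # Caller no longer needs the original: move it rather than copying.
        Path(source_path).rename(tmp_object_path)
    else:
        # Default: keep the original intact and copy it into the cache.
        shutil.copyfile(source_path, tmp_object_path)
    return tmp_object_path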
6 changes: 6 additions & 0 deletions kart/repo.py
@@ -802,6 +802,12 @@ def gitdir_file(self, rel_path):
def workdir_file(self, rel_path):
return self.workdir_path / rel_path

@cached_property
def lfs_tmp_path(self):
result = self.gitdir_path / "lfs" / "objects" / "tmp"
result.mkdir(parents=True, exist_ok=True)
return result

def write_gitdir_file(self, rel_path, text):
assert isinstance(text, str)
if not text.endswith("\n"):
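Design note: since `lfs_tmp_path` is a `cached_property` that runs `mkdir(parents=True, exist_ok=True)`, the directory is created lazily on first access, so callers such as `copy_file_to_local_lfs_cache` can use it directly (e.g. `tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())`, as in the `lfs_util.py` hunk above) without creating it themselves.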
2 changes: 0 additions & 2 deletions kart/s3_util.py
@@ -126,7 +126,6 @@ def fetch_from_s3(s3_url, output_path=None, sha256_hash=None):
If sha_256 hash is set, verifies that the downloaded file has the expected hash -
if it does not, deletes the downloaded file and raises a ValueError.
"""
# TODO: handle failure.
bucket, key = parse_s3_url(s3_url)
if output_path is None:
fd, output_path = tempfile.mkstemp()
@@ -183,7 +182,6 @@ def expand_s3_glob(source_spec):
Subdirectories (or the S3 equivalent - S3 is not exactly a directory hierarchy) are not matched -
that is, s3://bucket/path/*.txt matches s3://bucket/path/example.txt but not s3://bucket/path/subpath/example.txt
"""
# TODO: handle any kind of failure, sanity check to make sure we don't match a million objects.
if "*" not in source_spec:
return [source_spec]

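For illustration, single-level glob expansion of this kind can be implemented with boto3 roughly as below. This is a sketch under stated assumptions, not necessarily how `kart/s3_util.py` implements `expand_s3_glob`; the helper name is made up, and it assumes the wildcard only appears in the final path segment, as the docstring describes:

import fnmatch

import boto3


def expand_s3_glob_sketch(source_spec):
    # No wildcard: nothing to expand.
    if "*" not in source_spec:
        return [source_spec]
    # e.g. "s3://bucket/path/*.txt" -> bucket="bucket", prefix="path", pattern="*.txt"
    bucket, key = source_spec[len("s3://"):].split("/", 1)
    prefix, _, pattern = key.rpartition("/")
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    matches = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix + "/" if prefix else ""):
        for obj in page.get("Contents", []):
            key_name = obj["Key"]
            relative = key_name[len(prefix) + 1:] if prefix else key_name
            # Only match direct children, not objects in "subdirectories".
            if "/" not in relative and fnmatch.fnmatch(relative, pattern):
                matches.append(f"s3://{bucket}/{key_name}")
    return matches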
