Merge pull request #940 from koordinates/import-from-s3
Allow for regular import from S3 URLs
olsen232 committed Nov 16, 2023
2 parents 05dd970 + 452fa55 commit 3bc2c91
Showing 10 changed files with 265 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -13,6 +13,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)
- Now using Python 3.11 to build Kart, and vendored dependencies have been updated to newer versions. [#933](https://github.com/koordinates/kart/pull/933)
- Adds support for importing point-cloud and raster tiles directly from an S3 URL. [#940](https://github.com/koordinates/kart/pull/940)

## 0.14.2

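(With this change, tile import sources can be given directly as S3 URLs, including single-level wildcards. An invocation would look something like `kart point-cloud-import s3://example-bucket/tiles/*.laz`, where the bucket and path are made up for illustration and the exact command names and options are those documented by Kart, not taken from this diff.)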
18 changes: 6 additions & 12 deletions kart/byod/importer.py
@@ -1,13 +1,11 @@
import logging

import click

from kart.fast_import import (
write_blob_to_stream,
)
from kart.lfs_util import dict_to_pointer_file_bytes
from kart.progress_util import progress_bar
from kart.s3_util import expand_s3_glob
from kart.tile.tilename_util import PAM_SUFFIX

L = logging.getLogger(__name__)
@@ -19,19 +17,15 @@ class ByodTileImporter:
while leaving the data in-place on existing hosted storage.
"""

EXTRACT_TILE_METADATA_STEP = "Fetching tile metadata"
ALLOWED_SCHEMES = ("s3",)
ALLOWED_SCHEMES_DESC = "an S3 URL"

def sanity_check_sources(self, sources):
for source in sources:
if not source.startswith("s3://"):
raise click.UsageError(f"SOURCE {source} should be an S3 url")

for source in list(sources):
if "*" in source:
sources.remove(source)
sources += expand_s3_glob(source)
@property
def extracting_tile_metadata_desc(self):
return "Fetching tile metadata"

def extract_tile_metadata(self, tile_location):
# Implemented in subclasses.
raise NotImplementedError()

def get_conversion_func(self, source_metadata):
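With `ALLOWED_SCHEMES` and `ALLOWED_SCHEMES_DESC`, the importer declares which URL schemes it accepts instead of hard-coding an S3 check in this class. A minimal sketch of how shared source-checking code could consume such an allow-list (illustrative only; the function name `check_sources` and the exact error wording are assumptions, not taken from this diff):

import click

from kart.s3_util import expand_s3_glob


def check_sources(importer, sources):
    # Reject any source whose scheme is not in the importer's allow-list.
    for source in sources:
        scheme = source.split("://", 1)[0] if "://" in source else None
        if scheme not in importer.ALLOWED_SCHEMES:
            raise click.UsageError(
                f"SOURCE {source} should be {importer.ALLOWED_SCHEMES_DESC}"
            )
    # Expand single-level S3 wildcards into concrete object URLs.
    expanded = []
    for source in sources:
        if source.startswith("s3://") and "*" in source:
            expanded.extend(expand_s3_glob(source))
        else:
            expanded.append(source)
    return expanded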
2 changes: 1 addition & 1 deletion kart/byod/point_cloud_import.py
@@ -134,4 +134,4 @@ def extract_tile_metadata(self, tile_location):
)
tmp_downloaded_tile.unlink()
result["tile"]["url"] = tile_location
return result
return None, result
4 changes: 3 additions & 1 deletion kart/byod/raster_import.py
@@ -127,4 +127,6 @@ def byod_raster_import(
class ByodRasterImporter(ByodTileImporter, RasterImporter):
def extract_tile_metadata(self, tile_location):
oid_and_size = get_hash_and_size_of_s3_object(tile_location)
return extract_raster_tile_metadata(tile_location, oid_and_size=oid_and_size)
return None, extract_raster_tile_metadata(
tile_location, oid_and_size=oid_and_size
)
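Both `extract_tile_metadata` overrides now return a 2-tuple with `None` as the first element, rather than the bare metadata dict. A hedged caller-side illustration of the new shape (the name `tile_path` for the first element is an assumption; the diff only shows that these BYOD importers return `None` there):

# Illustrative unpacking of the new return shape (names are assumptions).
tile_path, metadata = importer.extract_tile_metadata(tile_location)
if tile_path is None:
    # BYOD case: the tile stays on hosted storage (e.g. S3); only its
    # metadata, including the tile's URL, is recorded locally.
    url = metadata["tile"]["url"]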
10 changes: 4 additions & 6 deletions kart/lfs_util.py
@@ -305,21 +305,19 @@ def get_local_path_from_lfs_hash(repo, lfs_hash):


def copy_file_to_local_lfs_cache(
repo, source_path, conversion_func=None, oid_and_size=None
repo, source_path, conversion_func=None, oid_and_size=None, preserve_original=True
):
"""
Given the path to a file, copies it to the appropriate location in the local LFS cache based on its sha256 hash.
Optionally takes a conversion function which can convert the file while copying it - this saves us doing an extra
copy after the convert operation, if we just write the converted version to where we would copy it.
Optionally takes the oid and size of the source, if this is known, to avoid recomputing it.
"""

lfs_tmp_path = repo.gitdir_path / "lfs" / "objects" / "tmp"
lfs_tmp_path.mkdir(parents=True, exist_ok=True)

tmp_object_path = lfs_tmp_path / str(uuid.uuid4())
tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())
if conversion_func is not None:
conversion_func(source_path, tmp_object_path)
elif not preserve_original:
Path(source_path).rename(tmp_object_path)
else:
try:
reflink(source_path, tmp_object_path)
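The new `preserve_original` flag lets a caller move an already-temporary file (for example a tile just downloaded from S3) into the LFS cache instead of copying it. A condensed sketch of the branching visible in this hunk, assuming the rest of the function (hashing and final placement) is unchanged; the helper name is made up, and `shutil.copyfile` stands in for the reflink-with-copy-fallback in the real code:

import shutil
import uuid
from pathlib import Path


def stage_into_lfs_tmp(repo, source_path, conversion_func=None, preserve_original=True):
    # Condensed sketch of the branching in copy_file_to_local_lfs_cache.
    tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())
    if conversion_func is not None:
        # Convert straight into the cache location - saves an extra copy.
        conversion_func(source_path, tmp_object_path)
    elif not preserve_original:
        # Caller no longer needs the original: move it rather than copying.
        Path(source_path).rename(tmp_object_path)
    else:
        # Default: keep the original intact and copy it into the cache.
        shutil.copyfile(source_path, tmp_object_path)
    return tmp_object_path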
6 changes: 6 additions & 0 deletions kart/repo.py
@@ -802,6 +802,12 @@ def gitdir_file(self, rel_path):
def workdir_file(self, rel_path):
return self.workdir_path / rel_path

@cached_property
def lfs_tmp_path(self):
result = self.gitdir_path / "lfs" / "objects" / "tmp"
result.mkdir(parents=True, exist_ok=True)
return result

def write_gitdir_file(self, rel_path, text):
assert isinstance(text, str)
if not text.endswith("\n"):
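Design note: since `lfs_tmp_path` is a `cached_property` that runs `mkdir(parents=True, exist_ok=True)`, the directory is created lazily on first access, so callers such as `copy_file_to_local_lfs_cache` can use it directly (e.g. `tmp_object_path = repo.lfs_tmp_path / str(uuid.uuid4())`, as in the `lfs_util.py` hunk above) without creating it themselves.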
2 changes: 0 additions & 2 deletions kart/s3_util.py
@@ -126,7 +126,6 @@ def fetch_from_s3(s3_url, output_path=None, sha256_hash=None):
If sha_256 hash is set, verifies that the downloaded file has the expected hash -
if it does not, deletes the downloaded file and raises a ValueError.
"""
# TODO: handle failure.
bucket, key = parse_s3_url(s3_url)
if output_path is None:
fd, output_path = tempfile.mkstemp()
@@ -183,7 +182,6 @@ def expand_s3_glob(source_spec):
Subdirectories (or the S3 equivalent - S3 is not exactly a directory hierarchy) are not matched -
that is, s3://bucket/path/*.txt matches s3://bucket/path/example.txt but not s3://bucket/path/subpath/example.txt
"""
# TODO: handle any kind of failure, sanity check to make sure we don't match a million objects.
if "*" not in source_spec:
return [source_spec]

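For illustration, single-level glob expansion of this kind can be implemented with boto3 roughly as below. This is a sketch under stated assumptions, not necessarily how `kart/s3_util.py` implements `expand_s3_glob`; the helper name is made up, and it assumes the wildcard only appears in the final path segment, as the docstring describes:

import fnmatch

import boto3


def expand_s3_glob_sketch(source_spec):
    # No wildcard: nothing to expand.
    if "*" not in source_spec:
        return [source_spec]
    # e.g. "s3://bucket/path/*.txt" -> bucket="bucket", prefix="path", pattern="*.txt"
    bucket, key = source_spec[len("s3://"):].split("/", 1)
    prefix, _, pattern = key.rpartition("/")
    s3 = boto3.client("s3")
    paginator = s3.get_paginator("list_objects_v2")
    matches = []
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix + "/" if prefix else ""):
        for obj in page.get("Contents", []):
            key_name = obj["Key"]
            relative = key_name[len(prefix) + 1:] if prefix else key_name
            # Only match direct children, not objects in "subdirectories".
            if "/" not in relative and fnmatch.fnmatch(relative, pattern):
                matches.append(f"s3://{bucket}/{key_name}")
    return matches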
