Merge pull request #932 from koordinates/dont-push-byod-blobs
Don't push blobs with URLs to the remote
olsen232 committed Nov 3, 2023
2 parents f6873f4 + d3181b5 commit bb753cf
Showing 5 changed files with 78 additions and 33 deletions.
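The substance of the change is in the pre_push hook below: pointer files whose parsed dict contains a "url" key are skipped, on the grounds that any clone of the repo can fetch those blobs directly from the URL instead of from the LFS remote. A minimal sketch of that filtering rule, assuming each pointer file parses to a dict with a standard Git LFS "oid sha256:<hex>" entry, a "size", and an optional "url" (the helper name here is hypothetical, not part of the diff):

def select_blobs_to_push(pointer_dicts):
    # {lfs_oid: file_size} - the same shape the new _lfs_blobs() helper expects.
    to_push = {}
    for pointer_dict in pointer_dicts:
        if pointer_dict.get("url"):
            # Never push a pointer file that carries a URL; whoever needs the
            # blob can fetch it straight from that URL.
            continue
        lfs_oid = pointer_dict["oid"].removeprefix("sha256:")
        to_push[lfs_oid] = pointer_dict["size"]
    return to_push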
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -11,6 +11,7 @@ _When adding new entries to the changelog, please include issue/PR numbers where
- Adds information on referencing and citing Kart to `CITATION`. [#914](https://github.com/koordinates/kart/pull/914)
- Fixes a bug where Kart would misidentify a non-Kart repo as a Kart V1 repo in some circumstances. [#918](https://github.com/koordinates/kart/issues/918)
- Improve schema extraction for point cloud datasets. [#924](https://github.com/koordinates/kart/issues/924)
- Some tweaks to `--dry-run` output of Kart LFS commands. [#932](https://github.com/koordinates/kart/pull/932)

## 0.14.2

78 changes: 51 additions & 27 deletions kart/lfs_commands/__init__.py
@@ -120,25 +120,34 @@ def pre_push(ctx, remote_name, remote_url, dry_run):

start_commits, stop_commits = get_start_and_stop_commits(sys.stdin)

lfs_oids = set()
# This is a dict {lfs_oid: file_size} - see _lfs_blobs() function:
to_push = {}

for commit_id, path_match_result, pointer_blob in rev_list_tile_pointer_files(
repo, start_commits, [f"--remotes={remote_name}", *stop_commits]
):
# Because of the way a Kart repo is laid out, we know that:
# All LFS pointer files are blobs inside **/.*-dataset.v?/tile/** and conversely,
# All blobs inside **/.*-dataset.v?/tile/** are LFS pointer files.
lfs_oids.add(get_hash_from_pointer_file(pointer_blob))
pointer_dict = pointer_file_bytes_to_dict(pointer_blob)
if pointer_dict.get("url"):
# Currently, the rule is that we never push pointer files that contain a URL.
# If anyone - any clone of this repo - needs the blob, they can fetch it directly from the URL.
# We may decide to allow for more complicated flows in a later version of Kart.
continue
lfs_oid = get_hash_from_pointer_file(pointer_dict)
to_push[lfs_oid] = pointer_dict["size"]

if dry_run:
click.echo(
f"Running pre-push with --dry-run: pushing {len(lfs_oids)} LFS blobs"
f"Running pre-push with --dry-run: found {_lfs_blobs(to_push)} to push"
)
for lfs_oid in lfs_oids:
for lfs_oid in to_push:
click.echo(lfs_oid)
return

if lfs_oids:
push_lfs_oids(repo, remote_name, lfs_oids)
if to_push:
push_lfs_oids(repo, remote_name, to_push)


def get_start_and_stop_commits(input_iter):
@@ -282,10 +291,6 @@ def fetch_lfs_blobs_for_commits(
)


def _blobs(count):
return "1 blob" if count == 1 else f"{count} blobs"


def fetch_lfs_blobs_for_pointer_files(
repo, pointer_files, *, remote_name=None, dry_run=False, quiet=False
):
@@ -303,6 +308,10 @@ def fetch_lfs_blobs_for_pointer_files(
urls = []
non_urls = []

# These are dicts {lfs_oid: file_size} - see _lfs_blobs() function:
urls_sizes = {}
non_urls_sizes = {}

for pointer_file in pointer_files:
if isinstance(pointer_file, str):
pointer_blob = repo[pointer_file]
@@ -321,8 +330,10 @@

if url:
urls.append((url, lfs_oid))
urls_sizes[lfs_oid] = pointer_dict["size"]
else:
non_urls.append((pointer_file_oid, lfs_oid))
non_urls_sizes[lfs_oid] = pointer_dict["size"]

if dry_run:
if url:
@@ -333,9 +344,11 @@
if dry_run:
click.echo("Running fetch with --dry-run:")
if urls:
click.echo(f" Found {_blobs(len(urls))} to fetch from specific URLs")
click.echo(f" Found {_lfs_blobs(urls_sizes)} to fetch from specific URLs")
if non_urls:
found_non_urls = f" Found {_blobs(len(non_urls))} to fetch from the remote"
found_non_urls = (
f" Found {_lfs_blobs(non_urls_sizes)} to fetch from the remote"
)
found_non_urls += "" if remote_name else " - but no remote is configured"
click.echo(found_non_urls)
if not urls and not non_urls:
@@ -411,7 +424,7 @@ def _do_fetch_from_remote(repo, pointer_file_and_lfs_oids, remote_name, quiet=Fa
@click.option(
"--dry-run",
is_flag=True,
help="Don't fetch anything, just show what would be fetched",
help="Don't garbage-collect anything, just show what would be garbage-collected",
)
def gc(ctx, dry_run):
"""
@@ -440,11 +453,9 @@ def gc(ctx, dry_run):
if dataset.path not in non_checkout_datasets:
checked_out_lfs_oids.update(dataset.tile_lfs_hashes(spatial_filter))

to_delete = set()
total_size_to_delete = 0

to_delete_once_pushed = set()
total_size_to_delete_once_pushed = 0
# These are dicts {lfs_oid: file_size} - see _lfs_blobs() function.
to_delete = {}
to_delete_once_pushed = {}

for file in (repo.gitdir_path / "lfs" / "objects").glob("**/*"):
if not file.is_file() or not LFS_OID_PATTERN.fullmatch(file.name):
@@ -454,32 +465,45 @@
continue # Can't garbage-collect anything that's currently checked out.

if file.name in unpushed_lfs_oids:
to_delete_once_pushed.add(file)
total_size_to_delete_once_pushed += file.stat().st_size
to_delete_once_pushed[file] = file.stat().st_size
else:
to_delete.add(file)
total_size_to_delete += file.stat().st_size
to_delete[file] = file.stat().st_size

if to_delete_once_pushed:
size_desc = human_readable_bytes(total_size_to_delete_once_pushed)
click.echo(
f"Can't delete {len(to_delete_once_pushed)} LFS blobs ({size_desc}) from the cache since they have not been pushed to a remote"
f"Can't delete {_lfs_blobs(to_delete_once_pushed)} from the cache since they have not been pushed to a remote"
)

size_desc = human_readable_bytes(total_size_to_delete)
if dry_run:
click.echo(
f"Running gc with --dry-run: found {len(to_delete)} LFS blobs ({size_desc}) to delete from the cache"
f"Running gc with --dry-run: found {_lfs_blobs(to_delete)} to delete from the cache"
)
for file in sorted(to_delete, key=lambda f: f.name):
click.echo(file.name)
return

click.echo(f"Deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache...")
click.echo(f"Deleting {_lfs_blobs(to_delete)} from the cache...")
for file in to_delete:
file.unlink()


def _lfs_blobs(file_size_dict):
"""
Returns a string looking something like "5 LFS blobs (1MiB)".
Takes a dict of the form {lfs_oid: file_size_in_bytes}, where the length of the dict is the count of unique blobs.
This is because building a dict like this is a straightforward way of getting a unique set of OIDs along
with a way of finding their total size; maintaining two separate variables - a set of OIDs and a total size -
makes the code more complicated.
"""

count = len(file_size_dict)
total_size = sum(file_size_dict.values())

blobs = "blob" if count == 1 else "blobs"
size_desc = human_readable_bytes(total_size)
return f"{count} LFS {blobs} ({size_desc})"


def human_readable_bytes(num):
for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
if num < 1024:
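For reference, the new _lfs_blobs() helper above is easy to exercise on its own; a short usage sketch, relying on the module's own human_readable_bytes() (whose body is truncated in the hunk above) and using a made-up OID and size:

from kart.lfs_commands import _lfs_blobs

# An empty dict means nothing to push: "0 LFS blobs (0B)", exactly the string
# asserted by the new pre-push test in tests/byod/test_imports.py below.
print(_lfs_blobs({}))

# One entry uses the singular form, e.g. "1 LFS blob (3.5KiB)"; the size
# string comes from human_readable_bytes(). OID and size here are hypothetical.
print(_lfs_blobs({"03e3d4dc" * 8: 3584}))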
24 changes: 22 additions & 2 deletions tests/byod/test_imports.py
@@ -1,11 +1,15 @@
import json
import os
import subprocess

import pytest

from kart.repo import KartRepo


DUMMY_REPO = "git@example.com/example.git"


@pytest.mark.slow
def test_byod_point_cloud_import(
tmp_path,
@@ -101,7 +105,7 @@ def test_byod_point_cloud_import(
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines()[:6] == [
"Running fetch with --dry-run:",
" Found 16 blobs to fetch from specific URLs",
" Found 16 LFS blobs (373KiB) to fetch from specific URLs",
"",
"LFS blob OID: (Pointer file OID):",
"03e3d4dc6fc8e75c65ffdb39b630ffe26e4b95982b9765c919e34fb940e66fc0 (ecb9c281c7e8cc354600d41e88d733faf2e991e1) → s3://kart-bring-your-own-data-poc/auckland-small-laz1.2/auckland_3_2.laz",
@@ -215,7 +219,7 @@ def test_byod_raster_import(
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 2 blobs to fetch from specific URLs",
" Found 2 LFS blobs (627KiB) to fetch from specific URLs",
"",
"LFS blob OID: (Pointer file OID):",
"c4bbea4d7cfd54f4cdbca887a1b358a81710e820a6aed97cdf3337fd3e14f5aa (6864fc3291a79b2ce9e4c89004172aa698b84d7c) → s3://kart-bring-your-own-data-poc/erorisk_si/erorisk_silcdb4.tif",
@@ -237,3 +241,19 @@
"Running fetch with --dry-run:",
" Found nothing to fetch",
]

# Make sure LFS blobs with URLs are not pushed to the remote:
r = cli_runner.invoke(["remote", "add", "origin", DUMMY_REPO])
assert r.exit_code == 0, r.stderr
repo.config[f"lfs.{DUMMY_REPO}/info/lfs.locksverify"] = False

head_sha = repo.head_commit.hex
stdout = subprocess.check_output(
["kart", "lfs+", "pre-push", "origin", "DUMMY_REPO", "--dry-run"],
input=f"main {head_sha} main 0000000000000000000000000000000000000000\n",
encoding="utf8",
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: found 0 LFS blobs (0B) to push"
)
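The input= line in this new test follows git's pre-push hook protocol: each line on stdin is "<local ref> <local oid> <remote ref> <remote oid>", with an all-zero oid marking a ref that does not exist on the other side. A rough illustration of how get_start_and_stop_commits() (seen in the __init__.py diff above) could split such lines into commits to walk from and commits to stop at - a sketch only, not the actual implementation:

ZERO_OID = "0" * 40

def parse_pre_push_lines(lines):
    start_commits, stop_commits = set(), set()
    for line in lines:
        local_ref, local_oid, remote_ref, remote_oid = line.split()
        if local_oid != ZERO_OID:
            start_commits.add(local_oid)    # commits being pushed
        if remote_oid != ZERO_OID:
            stop_commits.add(remote_oid)    # commits the remote already has
    return start_commits, stop_commits

# For the stdin used above, "main <head_sha> main 0000...0000" yields
# ({<head_sha>}, set()): everything reachable from HEAD is a push candidate.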
4 changes: 2 additions & 2 deletions tests/point_cloud/test_imports.py
@@ -174,7 +174,7 @@ def test_import_single_las__convert(
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: pushing 1 LFS blobs"
== "Running pre-push with --dry-run: found 1 LFS blob (3.5KiB) to push"
)

assert (repo_path / "autzen" / "autzen.copc.laz").is_file()
@@ -316,7 +316,7 @@ def test_import_several_laz__convert(
)
assert (
stdout.splitlines()[0]
== "Running pre-push with --dry-run: pushing 16 LFS blobs"
== "Running pre-push with --dry-run: found 16 LFS blobs (410KiB) to push"
)

for x in range(4):
4 changes: 2 additions & 2 deletions tests/point_cloud/test_workingcopy.py
@@ -1102,7 +1102,7 @@ def test_lfs_fetch(cli_runner, data_archive):
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 16 blobs to fetch from the remote",
" Found 16 LFS blobs (410KiB) to fetch from the remote",
"",
"LFS blob OID: (Pointer file OID):",
"0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
@@ -1179,7 +1179,7 @@ def test_lfs_gc(cli_runner, data_archive, monkeypatch):
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run:",
" Found 4 blobs to fetch from the remote",
" Found 4 LFS blobs (82KiB) to fetch from the remote",
"",
"LFS blob OID: (Pointer file OID):",
"0a696f35ab1404bbe9663e52774aaa800b0cf308ad2e5e5a9735d1c8e8b0a8c4 (7e8e0ec75c9c6b6654ebfc3b73f525a53f9db1de)",
