Merge pull request #710 from koordinates/lfs-gc
Add `kart lfs+ gc` - cleans up unreferenced LFS tiles
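
For context, a minimal usage sketch (both invocations appear in the tests below; a remote to refetch from is assumed, output elided):

    kart lfs+ gc --dry-run   # list the LFS blobs that would be deleted, without deleting anything
    kart lfs+ gc             # delete unreferenced LFS blobs that have already been pushed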
olsen232 committed Sep 5, 2022
2 parents 3352b92 + f624e75 commit 266dade
Showing 3 changed files with 206 additions and 48 deletions.
185 changes: 137 additions & 48 deletions kart/lfs_commands/__init__.py
@@ -1,14 +1,15 @@
import os
import itertools
import pygit2
import re
import subprocess
import sys
import tempfile

import click

from kart.cli_util import KartGroup, add_help_subcommand, tool_environment
from kart.exceptions import SubprocessError, InvalidOperation
from kart.lfs_util import get_hash_from_pointer_file, get_local_path_from_lfs_hash
from kart.object_builder import ObjectBuilder
from kart.rev_list_objects import rev_list_tile_pointer_files
@@ -25,53 +26,6 @@ def lfs_plus(ctx, **kwargs):
"""Git-LFS commands re-implemented in Kart to allow for spatial filtering."""


@lfs_plus.command()
@click.pass_context
@click.option(
@@ -139,6 +93,53 @@ def get_start_and_stop_commits(input_iter):
return start_commits, stop_commits


def push_lfs_oids(repo, remote_name, lfs_oids):
"""
Given a list of OIDs of LFS blobs (not the pointer files, but the LFS blobs themselves)
push all of those LFS blobs from the local cache to the given remote.
"""
    # Older versions of git-lfs don't support --stdin, so fall back to passing the OIDs as arguments.
if not _push_lfs_oids_using_stdin(repo, remote_name, lfs_oids):
_push_lfs_oids_using_args(repo, remote_name, lfs_oids)


def _push_lfs_oids_using_stdin(repo, remote_name, lfs_oids):
# TODO - capture progress reporting and do our own.
with tempfile.TemporaryFile() as oid_file:
oid_file.write("\n".join(lfs_oids).encode("utf-8"))
oid_file.write(b"\n")
oid_file.seek(0)

returncode, stdout, stderr = subprocess_tee(
["git-lfs", "push", remote_name, "--object-id", "--stdin"],
env=tool_environment(),
cwd=repo.workdir_path,
stdin=oid_file,
)
if returncode == 0:
return True
elif b"unknown flag: --stdin" in stderr:
return False
else:
raise SubprocessError(
"There was a problem with git-lfs push", exit_code=returncode
)


def _push_lfs_oids_using_args(repo, remote_name, lfs_oids):
try:
# TODO - capture progress reporting and do our own.
subprocess.check_call(
["git-lfs", "push", remote_name, "--object-id", *lfs_oids],
env=tool_environment(),
cwd=repo.workdir_path,
)
except subprocess.CalledProcessError as e:
raise SubprocessError(
f"There was a problem with git-lfs push: {e}", called_process_error=e
)


@lfs_plus.command()
@click.pass_context
@click.option(
@@ -255,3 +256,91 @@ def fetch_lfs_blobs_for_pointer_files(
raise SubprocessError(
f"There was a problem with git-lfs fetch: {e}", called_process_error=e
)


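# git-lfs addresses blobs by SHA-256, so a valid LFS OID is exactly 64 hexadecimal characters.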
LFS_OID_PATTERN = re.compile("[0-9a-fA-F]{64}")


@lfs_plus.command()
@click.pass_context
@click.option(
"--dry-run",
is_flag=True,
help="Don't fetch anything, just show what would be fetched",
)
def gc(ctx, dry_run):
"""
    Delete (garbage-collect) LFS files from the local cache that are not referenced at HEAD.
    Point Cloud tiles are LFS files, and they remain in the local cache until they are explicitly garbage
    collected. Tiles are present but unreferenced if they are part of a commit that was checked out previously,
    but not part of the current commit. The previously checked-out commit could be an earlier revision of the
    current branch, or on another branch entirely.
"""
repo = ctx.obj.repo

remote_name = repo.head_remote_name_or_default
if not remote_name:
raise InvalidOperation(
"LFS files cannot be garbage collected unless there is a remote to refetch them from."
)

unpushed_lfs_oids = set()
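    # Anything reachable from a local branch but from no remote-tracking ref has not been pushed yet.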
for (commit_id, path_match_result, pointer_blob) in rev_list_tile_pointer_files(
repo, ["--branches"], ["--remotes"]
):
unpushed_lfs_oids.add(get_hash_from_pointer_file(pointer_blob))

spatial_filter = repo.spatial_filter
checked_out_lfs_oids = set()
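    # Only tiles inside the spatial filter are checked out at HEAD, so only those need protecting.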
for dataset in repo.datasets("HEAD", filter_dataset_type="point-cloud"):
checked_out_lfs_oids.update(
dataset.tile_lfs_hashes(spatial_filter.transform_for_dataset(dataset))
)

to_delete = set()
total_size_to_delete = 0

to_delete_once_pushed = set()
total_size_to_delete_once_pushed = 0

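    # Scan every file in the local LFS cache and classify it: deletable now, or deletable once pushed.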
for file in (repo.gitdir_path / "lfs" / "objects").glob("**/*"):
if not file.is_file() or not LFS_OID_PATTERN.fullmatch(file.name):
continue # Not an LFS blob at all.

if file.name in checked_out_lfs_oids:
continue # Can't garbage-collect anything that's currently checked out.

if file.name in unpushed_lfs_oids:
to_delete_once_pushed.add(file)
total_size_to_delete_once_pushed += file.stat().st_size
else:
to_delete.add(file)
total_size_to_delete += file.stat().st_size

if to_delete_once_pushed:
size_desc = human_readable_bytes(total_size_to_delete_once_pushed)
click.echo(
f"Can't delete {len(to_delete_once_pushed)} LFS blobs ({size_desc}) from the cache since they have not been pushed to the remote"
)

size_desc = human_readable_bytes(total_size_to_delete)
if dry_run:
click.echo(
f"Running gc with --dry-run: deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache"
)
for file in sorted(to_delete, key=lambda f: f.name):
click.echo(file.name)
return

click.echo(f"Deleting {len(to_delete)} LFS blobs ({size_desc}) from the cache...")
for file in to_delete:
file.unlink()


def human_readable_bytes(num):
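    # e.g. 0 -> "0B", 1024 -> "1.0KiB", 102400 -> "100KiB"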
for unit in ("", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"):
if num < 1024:
return f"{num:.1f}{unit}B" if (unit and num < 10) else f"{num:.0f}{unit}B"
num /= 1024.0
return f"{num:.1f}YiB"
5 changes: 5 additions & 0 deletions kart/point_cloud/v1.py
@@ -84,6 +84,11 @@ def tile_count(self):
"""The total number of features in this dataset."""
return sum(1 for blob in self.tile_pointer_blobs())

def tile_lfs_hashes(self, spatial_filter=SpatialFilter.MATCH_ALL):
"""Returns a generator that yields every LFS hash."""
for blob in self.tile_pointer_blobs(spatial_filter=spatial_filter):
yield get_hash_from_pointer_file(blob)

def tilenames_with_lfs_hashes(
self, spatial_filter=SpatialFilter.MATCH_ALL, fix_extensions=True
):
64 changes: 64 additions & 0 deletions tests/point_cloud/test_workingcopy.py
@@ -1032,3 +1032,67 @@ def test_lfs_fetch(cli_runner, data_archive, monkeypatch):
"d380a98414ab209f36c7fba4734b02f67de519756e341837217716c5b4768339 (f866ac0ecf4326931d10aaa16140e2240eeada90)",
"ec80af6cae31be5318f9380cd953b25469bd8ecda25086deca2b831bbb89168a (c76e89f23f512214063d31e7a9c85657f0cf8fb6)",
]


def test_lfs_gc(cli_runner, data_archive, monkeypatch):
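    # Point-cloud datasets were still gated behind an environment-variable feature flag at this point.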
monkeypatch.setenv("X_KART_POINT_CLOUDS", "1")
with data_archive("point-cloud/auckland.tgz") as repo_path:
        # Delete the auckland_3_* tiles from the working copy.
for file in (repo_path / "auckland").glob("auckland_3_*.copc.laz"):
file.unlink()

r = cli_runner.invoke(["lfs+", "gc", "--dry-run"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running gc with --dry-run: deleting 0 LFS blobs (0B) from the cache"
]

r = cli_runner.invoke(["commit", "-m", "Delete auckland_3_*"])
assert r.exit_code == 0, r.stderr

r = cli_runner.invoke(["lfs+", "gc", "--dry-run"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Can't delete 4 LFS blobs (100KiB) from the cache since they have not been pushed to the remote",
"Running gc with --dry-run: deleting 0 LFS blobs (0B) from the cache",
]

# Simulate pushing the latest commit to the remote (we don't actually have a remote set up):
(repo_path / ".kart" / "refs" / "remotes" / "origin").mkdir(
parents=True, exist_ok=True
)
shutil.copy(
repo_path / ".kart" / "refs" / "heads" / "main",
repo_path / ".kart" / "refs" / "remotes" / "origin" / "main",
)

r = cli_runner.invoke(["lfs+", "gc", "--dry-run"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running gc with --dry-run: deleting 4 LFS blobs (100KiB) from the cache",
"64895828ea03ce9cafaef4f387338aab8d498c8eccaef1503b8b3bd97e57c5a3",
"817b6ddadd95166012143df55fa73dd6c5a8b42b603c33d1b6c38f187261096e",
"d380a98414ab209f36c7fba4734b02f67de519756e341837217716c5b4768339",
"ec80af6cae31be5318f9380cd953b25469bd8ecda25086deca2b831bbb89168a",
]

r = cli_runner.invoke(["lfs+", "gc"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Deleting 4 LFS blobs (100KiB) from the cache..."
]

r = cli_runner.invoke(["lfs+", "gc"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == ["Deleting 0 LFS blobs (0B) from the cache..."]

r = cli_runner.invoke(["lfs+", "fetch", "HEAD^", "--dry-run"])
assert r.exit_code == 0, r.stderr
assert r.stdout.splitlines() == [
"Running fetch with --dry-run: fetching 4 LFS blobs",
"LFS blob OID: (Pointer file OID):",
"64895828ea03ce9cafaef4f387338aab8d498c8eccaef1503b8b3bd97e57c5a3 (ba01f6e0d8a64b920e1d8dbaa563a7a641c164b6)",
"817b6ddadd95166012143df55fa73dd6c5a8b42b603c33d1b6c38f187261096e (364046ba21d4a0154c77a2544348bea9fd6baa93)",
"d380a98414ab209f36c7fba4734b02f67de519756e341837217716c5b4768339 (f866ac0ecf4326931d10aaa16140e2240eeada90)",
"ec80af6cae31be5318f9380cd953b25469bd8ecda25086deca2b831bbb89168a (c76e89f23f512214063d31e7a9c85657f0cf8fb6)",
]
