diff --git a/.gitignore b/.gitignore index 33be5cb20c..f63e9f0c4b 100644 --- a/.gitignore +++ b/.gitignore @@ -26,7 +26,7 @@ innosetup/config.ini .coverage .coverage.* -*.swp +*.sw? pip-wheel-metadata/ .vscode/ diff --git a/dvc/cli.py b/dvc/cli.py index 0d612f85d4..5ab81ad215 100644 --- a/dvc/cli.py +++ b/dvc/cli.py @@ -20,6 +20,7 @@ imp_url, init, install, + ls, lock, metrics, move, @@ -62,6 +63,7 @@ metrics, install, root, + ls, lock, pipeline, daemon, diff --git a/dvc/command/ls.py b/dvc/command/ls.py new file mode 100644 index 0000000000..37c857241f --- /dev/null +++ b/dvc/command/ls.py @@ -0,0 +1,68 @@ +import argparse +import logging + +from dvc.command.base import append_doc_link +from dvc.command.base import CmdBaseNoRepo +from dvc.exceptions import DvcException + + +logger = logging.getLogger(__name__) + + +class CmdList(CmdBaseNoRepo): + def run(self): + from dvc.repo import Repo + + try: + nodes = Repo.ls( + self.args.url, + self.args.target, + rev=self.args.rev, + recursive=self.args.recursive, + outs_only=self.args.outs_only, + ) + if nodes: + logger.info("\n".join(nodes)) + return 0 + except DvcException: + logger.exception("failed to list '{}'".format(self.args.url)) + return 1 + + +def add_parser(subparsers, parent_parser): + LIST_HELP = "List files and DVC outputs in the repo." + list_parser = subparsers.add_parser( + "list", + parents=[parent_parser], + description=append_doc_link(LIST_HELP, "list"), + help=LIST_HELP, + formatter_class=argparse.RawTextHelpFormatter, + ) + list_parser.add_argument( + "url", + help="Supported urls:\n" + "/path/to/file\n" + "/path/to/directory\n" + "C:\\\\path\\to\\file\n" + "C:\\\\path\\to\\directory\n" + "https://github.com/path/to/repo\n" + "git@github.com:path/to/repo.git\n", + ) + list_parser.add_argument( + "-R", + "--recursive", + action="store_true", + help="Recursively list files.", + ) + list_parser.add_argument( + "--outs-only", action="store_true", help="Show only DVC outputs." 
+ ) + list_parser.add_argument( + "--rev", nargs="?", help="Git revision (e.g. branch, tag, SHA)" + ) + list_parser.add_argument( + "target", + nargs="?", + help="Path to directory within the repository to list outputs for", + ) + list_parser.set_defaults(func=CmdList) diff --git a/dvc/exceptions.py b/dvc/exceptions.py index 4af6111080..a5429f1364 100644 --- a/dvc/exceptions.py +++ b/dvc/exceptions.py @@ -294,12 +294,23 @@ def __init__(self, code, reason): class PathMissingError(DvcException): - def __init__(self, path, repo): + default_msg = ( + "The path '{}' does not exist in the target repository '{}'" + " neither as an output nor a git-handled file." + ) + default_msg_output_only = ( + "The path '{}' does not exist in the target repository '{}'" + " as an output." + ) + + def __init__(self, path, repo, output_only=False): msg = ( - "The path '{}' does not exist in the target repository '{}'" - " neither as an output nor a git-handled file." + self.default_msg + if not output_only + else self.default_msg_output_only ) super().__init__(msg.format(path, repo)) + self.output_only = output_only class RemoteCacheRequiredError(DvcException): diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py index 16062042fd..734a841263 100644 --- a/dvc/repo/__init__.py +++ b/dvc/repo/__init__.py @@ -43,6 +43,7 @@ class Repo(object): from dvc.repo.install import install from dvc.repo.add import add from dvc.repo.remove import remove + from dvc.repo.ls import ls from dvc.repo.lock import lock as lock_stage from dvc.repo.move import move from dvc.repo.run import run diff --git a/dvc/repo/ls.py b/dvc/repo/ls.py new file mode 100644 index 0000000000..dba795a9f4 --- /dev/null +++ b/dvc/repo/ls.py @@ -0,0 +1,84 @@ +import os + +from dvc.exceptions import PathMissingError, OutputNotFoundError + + +@staticmethod +def ls(url, target=None, rev=None, recursive=None, outs_only=False): + from dvc.external_repo import external_repo + from dvc.repo import Repo + from dvc.utils import relpath 
+ + with external_repo(url, rev) as repo: + target_path_info = _get_target_path_info(repo, target) + result = [] + if isinstance(repo, Repo): + result.extend(_ls_outs_repo(repo, target_path_info, recursive)) + + if not outs_only: + result.extend(_ls_files_repo(target_path_info, recursive)) + + if target and not result: + raise PathMissingError(target, repo, output_only=outs_only) + + def prettify(path_info): + if path_info == target_path_info: + return path_info.name + return relpath(path_info, target_path_info) + + result = list(set(map(prettify, result))) + result.sort() + return result + + +def _ls_files_repo(target_path_info, recursive=None): + from dvc.compat import fspath + from dvc.ignore import CleanTree + from dvc.path_info import PathInfo + from dvc.scm.tree import WorkingTree + + if not os.path.exists(fspath(target_path_info)): + return [] + + files = [] + tree = CleanTree(WorkingTree(target_path_info)) + try: + for dirpath, dirnames, filenames in tree.walk(target_path_info): + files.extend(map(lambda f: PathInfo(dirpath, f), filenames)) + if not recursive: + files.extend(map(lambda d: PathInfo(dirpath, d), dirnames)) + break + except NotADirectoryError: + if os.path.isfile(fspath(target_path_info)): + return [target_path_info] + + return files + + +def _ls_outs_repo(repo, target_path_info, recursive=None): + from dvc.compat import fspath + from dvc.path_info import PathInfo + + try: + outs = repo.find_outs_by_path(fspath(target_path_info), recursive=True) + except OutputNotFoundError: + return [] + + if recursive: + return [out.path_info for out in outs] + + def get_top_part(path_info): + relpath = path_info.relpath(target_path_info) + if relpath.parts: + return PathInfo(target_path_info, relpath.parts[0]) + return path_info + + return list({get_top_part(out.path_info) for out in outs}) + + +def _get_target_path_info(repo, target=None): + from dvc.path_info import PathInfo + + if not target: + return PathInfo(repo.root_dir) + return 
PathInfo(repo.root_dir, target) diff --git a/scripts/completion/dvc.bash b/scripts/completion/dvc.bash index f505431b84..f84efcf832 100644 --- a/scripts/completion/dvc.bash +++ b/scripts/completion/dvc.bash @@ -9,7 +9,7 @@ #---------------------------------------------------------- _dvc_commands='add cache checkout commit config destroy diff fetch get-url get gc \ - import-url import init install lock metrics move pipeline pull push \ + import-url import init install lock list metrics move pipeline pull push \ remote remove repro root run status unlock unprotect update version' _dvc_options='-h --help -V --version' @@ -31,6 +31,7 @@ _dvc_import_url='-f --file' _dvc_import='-o --out --rev' _dvc_init='--no-scm -f --force' _dvc_install='' +_dvc_list='-R --recursive --outs-only --rev $(compgen -G *)' _dvc_lock='$(compgen -G *.dvc)' _dvc_metrics='add modify rmeove show' _dvc_metrics_add='-t --type -x --xpath $(compgen -G *)' @@ -60,6 +61,26 @@ _dvc_unprotect='$(compgen -G *)' _dvc_update='$(compgen -G *.dvc)' _dvc_version='' +# Params +# $1 - COMP_WORDS[1] +comp_command() { + local options_list="_dvc_$(replace_hyphen $1)" + + COMPREPLY=( $(compgen -W "$_dvc_global_options ${!options_list}" -- "$word") ) +} + +# Params +# $1 - COMP_WORDS[1] +# $2 - COMP_WORDS[2] +comp_subcommand() { + local options_list="_dvc_$(replace_hyphen $1)_$(replace_hyphen $2)" + if [ -z "${!options_list}" ]; then + comp_command $1 + else + COMPREPLY=( $(compgen -W "$_dvc_global_options ${!options_list}" -- "$word") ) + fi +} + # Notes: # # `COMPREPLY` contains what will be rendered after completion is triggered @@ -76,7 +97,6 @@ _dvc() { replace_hyphen() { echo $(echo $1 | sed 's/-/_/g') } - local word="${COMP_WORDS[COMP_CWORD]}" COMPREPLY=() @@ -87,13 +107,9 @@ _dvc() { *) COMPREPLY=($(compgen -W "$_dvc_commands" -- "$word")) ;; esac elif [ "${COMP_CWORD}" -eq 2 ]; then - local options_list="_dvc_$(replace_hyphen ${COMP_WORDS[1]})" - - COMPREPLY=($(compgen -W "$_dvc_global_options 
${!options_list}" -- "$word")) + comp_command ${COMP_WORDS[1]} elif [ "${COMP_CWORD}" -eq 3 ]; then - local options_list="_dvc_$(replace_hyphen ${COMP_WORDS[1]})_$(replace_hyphen ${COMP_WORDS[2]})" - - COMPREPLY=($(compgen -W "$_dvc_global_options ${!options_list}" -- "$word")) + comp_subcommand ${COMP_WORDS[1]} ${COMP_WORDS[2]} fi return 0 diff --git a/scripts/completion/dvc.zsh b/scripts/completion/dvc.zsh index 761c89c243..7ee7386b29 100644 --- a/scripts/completion/dvc.zsh +++ b/scripts/completion/dvc.zsh @@ -27,6 +27,7 @@ _dvc_commands() { "import:Download data from DVC repository and take it under DVC control." "init:Initialize DVC in the current directory." "install:Install DVC git hooks into the repository." + "list:List files." "lock:Lock DVC-file." "metrics:Commands to add, manage, collect and display metrics." "move:Rename or move a DVC controlled data file or a directory." @@ -160,6 +161,14 @@ _dvc_lock=( "*:Stages:_files -g '(*.dvc|Dvcfile)'" ) +_dvc_list=( + "--rev[Git revision (e.g. 
branch, tag, SHA)]:Revision:" + {-R,--recursive}"[Recursively list files.]" + "--outs-only[Show only DVC outputs.]" + "1:URL:" + "2:Target:" +) + _dvc_metrics=( "1:Sub command:(add show diff modify remove)" ) @@ -292,6 +301,7 @@ case $words[1] in init) _arguments $_dvc_global_options $_dvc_init ;; install) _arguments $_dvc_global_options $_dvc_install ;; lock) _arguments $_dvc_global_options $_dvc_lock ;; + list) _arguments $_dvc_global_options $_dvc_list ;; metrics) _arguments $_dvc_global_options $_dvc_metrics ;; move) _arguments $_dvc_global_options $_dvc_move ;; pipeline) _arguments $_dvc_global_options $_dvc_pipeline ;; diff --git a/tests/func/test_ls.py b/tests/func/test_ls.py new file mode 100644 index 0000000000..4286cec65a --- /dev/null +++ b/tests/func/test_ls.py @@ -0,0 +1,355 @@ +import shutil +import os +import pytest + +from dvc.compat import fspath +from dvc.exceptions import PathMissingError +from dvc.scm.base import CloneError +from dvc.repo import Repo + +FS_STRUCTURE = { + "README.md": "content", + "model/script.py": "content", + "model/train.py": "content", + ".gitignore": "content", +} + +DVC_STRUCTURE = { + "structure.xml": "content", + "data/subcontent/data.xml": "content", + "data/subcontent/statistics/data.csv": "content", + "model/people.csv": "content", +} + + +def match_files(files, expected_files): + assert set(files) == set( + map(lambda args: os.path.join(*args), expected_files) + ) + + +def create_dvc_pipeline(tmp_dir, dvc): + script = os.linesep.join( + [ + "from pathlib import Path", + "Path({}).touch()".format(os.path.join("out", "file")), + ] + ) + tmp_dir.scm_gen({"script.py": script}, commit="init") + tmp_dir.dvc_gen({"dep": "content"}, commit="init dvc") + dvc.run( + **{ + "command": "python script.py", + "outs": [os.path.join("out", "file")], + "deps": ["dep"], + "fname": "out.dvc", + } + ) + tmp_dir.scm_add(["out.dvc"], commit="run") + shutil.rmtree("out") + + +def test_ls_repo(tmp_dir, dvc, scm): + 
tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + files = Repo.ls(fspath(tmp_dir)) + match_files( + files, + ( + (".gitignore",), + ("README.md",), + ("structure.xml.dvc",), + ("model",), + ("data",), + ("structure.xml",), + ), + ) + + +def test_ls_repo_recursive(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + files = Repo.ls(fspath(tmp_dir), recursive=True) + match_files( + files, + ( + (".gitignore",), + ("README.md",), + ("structure.xml.dvc",), + ("model", "script.py"), + ("model", "train.py"), + ("model", "people.csv.dvc"), + ("data", "subcontent", "data.xml.dvc"), + ("data", "subcontent", "statistics", "data.csv.dvc"), + ("data", "subcontent", "statistics", "data.csv"), + ("data", "subcontent", "data.xml"), + ("model", "people.csv"), + ("structure.xml",), + ), + ) + + +def test_ls_repo_outs_only_recursive(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + files = Repo.ls(fspath(tmp_dir), recursive=True, outs_only=True) + match_files( + files, + ( + ("data", "subcontent", "statistics", "data.csv"), + ("data", "subcontent", "data.xml"), + ("model", "people.csv"), + ("structure.xml",), + ), + ) + + +def test_ls_repo_with_target_dir(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + files = Repo.ls(fspath(tmp_dir), target="model") + match_files( + files, + (("script.py",), ("train.py",), ("people.csv",), ("people.csv.dvc",)), + ) + + +def test_ls_repo_with_target_dir_outs_only_empty(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + tmp_dir.scm_gen({"folder/.keep": "content"}, commit="add .keep") + + with pytest.raises(PathMissingError): + Repo.ls(fspath(tmp_dir), target="folder", outs_only=True) + + +def test_ls_repo_with_target_subdir(tmp_dir, dvc, 
scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + target = os.path.join("data", "subcontent") + files = Repo.ls(fspath(tmp_dir), target) + match_files(files, (("data.xml",), ("data.xml.dvc",), ("statistics",))) + + +def test_ls_repo_with_target_subdir_outs_only(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + target = os.path.join("data", "subcontent") + files = Repo.ls(fspath(tmp_dir), target, outs_only=True) + match_files(files, (("data.xml",), ("statistics",))) + + +def test_ls_repo_with_target_subdir_outs_only_recursive(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + target = os.path.join("data", "subcontent") + files = Repo.ls(fspath(tmp_dir), target, outs_only=True, recursive=True) + match_files(files, (("data.xml",), ("statistics", "data.csv"))) + + +def test_ls_repo_with_target_file_out(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + target = os.path.join("data", "subcontent", "data.xml") + files = Repo.ls(fspath(tmp_dir), target) + match_files(files, (("data.xml",),)) + + +def test_ls_repo_with_file_target_fs(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + target = "README.md" + files = Repo.ls(fspath(tmp_dir), target, recursive=True) + match_files(files, (("README.md",),)) + + +def test_ls_repo_with_missed_target(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + with pytest.raises(PathMissingError) as exc_info: + Repo.ls(fspath(tmp_dir), target="missed_target") + assert not exc_info.value.output_only + + +def test_ls_repo_with_missed_target_outs_only(tmp_dir, dvc, scm): + tmp_dir.scm_gen(FS_STRUCTURE, commit="init") + tmp_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") 
+ + with pytest.raises(PathMissingError) as exc_info: + Repo.ls( + fspath(tmp_dir), + target="missed_target", + recursive=True, + outs_only=True, + ) + assert exc_info.value.output_only + + +def test_ls_repo_with_removed_dvc_dir(tmp_dir, dvc, scm): + create_dvc_pipeline(tmp_dir, dvc) + + files = Repo.ls(fspath(tmp_dir)) + match_files( + files, (("script.py",), ("dep.dvc",), ("out.dvc",), ("dep",), ("out",)) + ) + + +def test_ls_repo_with_removed_dvc_dir_recursive(tmp_dir, dvc, scm): + create_dvc_pipeline(tmp_dir, dvc) + + files = Repo.ls(fspath(tmp_dir), recursive=True) + match_files( + files, + ( + ("script.py",), + ("dep.dvc",), + ("out.dvc",), + ("dep",), + ("out", "file"), + ), + ) + + +def test_ls_repo_with_removed_dvc_dir_with_target_dir(tmp_dir, dvc, scm): + create_dvc_pipeline(tmp_dir, dvc) + + target = "out" + files = Repo.ls(fspath(tmp_dir), target) + match_files(files, (("file",),)) + + +def test_ls_repo_with_removed_dvc_dir_with_target_file(tmp_dir, dvc, scm): + create_dvc_pipeline(tmp_dir, dvc) + + target = os.path.join("out", "file") + files = Repo.ls(fspath(tmp_dir), target) + match_files(files, (("file",),)) + + +def test_ls_remote_repo(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm_gen(FS_STRUCTURE, commit="init") + erepo_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + url = "file://{}".format(erepo_dir) + files = Repo.ls(url) + match_files( + files, + ( + (".gitignore",), + ("README.md",), + ("structure.xml.dvc",), + ("model",), + ("data",), + ("structure.xml",), + ), + ) + + +def test_ls_remote_repo_recursive(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm_gen(FS_STRUCTURE, commit="init") + erepo_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + url = "file://{}".format(erepo_dir) + files = Repo.ls(url, recursive=True) + match_files( + files, + ( + (".gitignore",), + ("README.md",), + ("structure.xml.dvc",), + ("model", "script.py"), + ("model", "train.py"), + ("model", "people.csv.dvc"), + ("data", "subcontent", "data.xml.dvc"), + ("data", 
"subcontent", "statistics", "data.csv.dvc"), + ("data", "subcontent", "statistics", "data.csv"), + ("data", "subcontent", "data.xml"), + ("model", "people.csv"), + ("structure.xml",), + ), + ) + + +def test_ls_remote_git_only_repo_recursive(git_dir): + with git_dir.chdir(): + git_dir.scm_gen(FS_STRUCTURE, commit="init") + + url = "file://{}".format(git_dir) + files = Repo.ls(url, recursive=True) + match_files( + files, + ( + (".gitignore",), + ("README.md",), + ("model", "script.py"), + ("model", "train.py"), + ), + ) + + +def test_ls_remote_repo_with_target_dir(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm_gen(FS_STRUCTURE, commit="init") + erepo_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + url = "file://{}".format(erepo_dir) + target = "model" + files = Repo.ls(url, target) + match_files( + files, + (("script.py",), ("train.py",), ("people.csv",), ("people.csv.dvc",)), + ) + + +def test_ls_remote_repo_with_rev(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.scm_gen(FS_STRUCTURE, commit="init") + erepo_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + + rev = erepo_dir.scm.list_all_commits()[1] + url = "file://{}".format(erepo_dir) + files = Repo.ls(url, rev=rev) + match_files(files, ((".gitignore",), ("README.md",), ("model",))) + + +def test_ls_remote_repo_with_rev_recursive(erepo_dir): + with erepo_dir.chdir(): + erepo_dir.dvc_gen(DVC_STRUCTURE, commit="dvc") + erepo_dir.scm_gen(FS_STRUCTURE, commit="init") + + rev = erepo_dir.scm.list_all_commits()[1] + url = "file://{}".format(erepo_dir) + files = Repo.ls(url, rev=rev, recursive=True) + match_files( + files, + ( + ("structure.xml.dvc",), + ("model", "people.csv.dvc"), + ("data", "subcontent", "data.xml.dvc"), + ("data", "subcontent", "statistics", "data.csv.dvc"), + ("data", "subcontent", "statistics", "data.csv"), + ("data", "subcontent", "data.xml"), + ("model", "people.csv"), + ("structure.xml",), + ), + ) + + +def test_ls_not_existed_url(): + from time import time + + dirname = 
"__{}_{}".format("not_existed", time()) + with pytest.raises(CloneError): + Repo.ls(dirname, recursive=True) diff --git a/tests/unit/command/test_ls.py b/tests/unit/command/test_ls.py new file mode 100644 index 0000000000..ed6908e8ea --- /dev/null +++ b/tests/unit/command/test_ls.py @@ -0,0 +1,54 @@ +from dvc.cli import parse_args +from dvc.command.ls import CmdList + + +def _test_cli(mocker, *args): + cli_args = parse_args(["list", *args]) + assert cli_args.func == CmdList + + cmd = cli_args.func(cli_args) + m = mocker.patch("dvc.repo.Repo.ls") + + assert cmd.run() == 0 + return m + + +def test_list(mocker): + url = "local_dir" + m = _test_cli(mocker, url) + m.assert_called_once_with( + url, None, recursive=False, rev=None, outs_only=False + ) + + +def test_list_recursive(mocker): + url = "local_dir" + m = _test_cli(mocker, url, "-R") + m.assert_called_once_with( + url, None, recursive=True, rev=None, outs_only=False + ) + + +def test_list_git_ssh_rev(mocker): + url = "git@github.com:repo" + m = _test_cli(mocker, url, "--rev", "123") + m.assert_called_once_with( + url, None, recursive=False, rev="123", outs_only=False + ) + + +def test_list_targets(mocker): + url = "local_dir" + target = "subdir" + m = _test_cli(mocker, url, target) + m.assert_called_once_with( + url, target, recursive=False, rev=None, outs_only=False + ) + + +def test_list_outputs_only(mocker): + url = "local_dir" + m = _test_cli(mocker, url, None, "--outs-only") + m.assert_called_once_with( + url, None, recursive=False, rev=None, outs_only=True + )