From cb87096406f40bb4d559ec88ed59a807df64c871 Mon Sep 17 00:00:00 2001 From: jigold Date: Tue, 9 Jan 2024 16:03:48 -0500 Subject: [PATCH] [hailctl] Fix uploading files from parent directories of cwd (#13812) The bug in #13785 was that we used a `relpath`, but the files that were to be included was the parent directory of the current working directory of the script. To solve this, I added a new option `--mounts` that uses the Docker syntax for `-v src:dest` to specify where the contents of the source should be mounted into the container. Fixes #13785 --- build.yaml | 4 +- hail/python/hailtop/hailctl/batch/submit.py | 66 ++++++++--- .../test/hailtop/hailctl/batch/__init__.py | 0 .../test/hailtop/hailctl/batch/test_submit.py | 111 ++++++++++++++++++ 4 files changed, 166 insertions(+), 15 deletions(-) create mode 100644 hail/python/test/hailtop/hailctl/batch/__init__.py create mode 100644 hail/python/test/hailtop/hailctl/batch/test_submit.py diff --git a/build.yaml b/build.yaml index 7f61c623c4f..a4cbe0d7c36 100644 --- a/build.yaml +++ b/build.yaml @@ -1042,6 +1042,7 @@ steps: --durations=50 \ --ignore=test/hailtop/batch/ \ --ignore=test/hailtop/inter_cloud \ + --ignore=test/hailtop/hailctl/batch/ \ --timeout=120 \ test inputs: @@ -3006,7 +3007,8 @@ steps: --instafail \ --durations=50 \ --timeout=360 \ - /io/test/hailtop/batch/ + /io/test/hailtop/batch/ \ + /io/test/hailtop/hailctl/batch/ inputs: - from: /repo/hail/python/test to: /io/test diff --git a/hail/python/hailtop/hailctl/batch/submit.py b/hail/python/hailtop/hailctl/batch/submit.py index 5f770f330a9..69cf8d30bb3 100644 --- a/hail/python/hailtop/hailctl/batch/submit.py +++ b/hail/python/hailtop/hailctl/batch/submit.py @@ -1,7 +1,12 @@ import orjson import os +import re from shlex import quote as shq from hailtop import pip_version +from typing import Tuple + + +FILE_REGEX = re.compile(r'(?P[^:]+)(:(?P.+))?') async def submit(name, image_name, files, output, script, arguments): @@ -18,43 +23,76 @@ async def submit(name, image_name, files, output, script, arguments): ) files = unpack_comma_delimited_inputs(files) - user_config = get_user_config_path() + user_config = str(get_user_config_path()) + quiet = output != 'text' remote_tmpdir = get_remote_tmpdir('hailctl batch submit') + remote_tmpdir = remote_tmpdir.rstrip('/') + tmpdir_path_prefix = secret_alnum_string() def cloud_prefix(path): + path = path.lstrip('/') return f'{remote_tmpdir}/{tmpdir_path_prefix}/{path}' + def file_input_to_src_dest(file: str) -> Tuple[str, str, str]: + match = FILE_REGEX.match(file) + if match is None: + raise ValueError(f'invalid file specification {file}. Must have the form "src" or "src:dest"') + + result = match.groupdict() + + src = result.get('src') + if src is None: + raise ValueError(f'invalid file specification {file}. Must have a "src" defined.') + src = os.path.abspath(os.path.expanduser(src)) + src = src.rstrip('/') + + dest = result.get('dest') + if dest is not None: + dest = os.path.abspath(os.path.expanduser(dest)) + else: + dest = os.getcwd() + + cloud_file = cloud_prefix(src) + + return (src, dest, cloud_file) + backend = hb.ServiceBackend() b = hb.Batch(name=name, backend=backend) j = b.new_bash_job() j.image(image_name or os.environ.get('HAIL_GENETICS_HAIL_IMAGE', f'hailgenetics/hail:{pip_version()}')) - rel_file_paths = [os.path.relpath(file) for file in files] - local_files_to_cloud_files = [{'from': local, 'to': cloud_prefix(local)} for local in rel_file_paths] + local_files_to_cloud_files = [] + + for file in files: + src, dest, cloud_file = file_input_to_src_dest(file) + local_files_to_cloud_files.append({'from': src, 'to': cloud_file}) + in_file = b.read_input(cloud_file) + j.command(f'mkdir -p {os.path.dirname(dest)}; ln -s {in_file} {dest}') + + script_src, _, script_cloud_file = file_input_to_src_dest(script) + user_config_src, _, user_config_cloud_file = file_input_to_src_dest(user_config) + + await copy_from_dict(files=local_files_to_cloud_files) await copy_from_dict( files=[ - {'from': script, 'to': cloud_prefix(script)}, - {'from': str(user_config), 'to': cloud_prefix(user_config)}, - *local_files_to_cloud_files, + {'from': script_src, 'to': script_cloud_file}, + {'from': user_config_src, 'to': user_config_cloud_file}, ] ) - for file in local_files_to_cloud_files: - local_file = file['from'] - cloud_file = file['to'] - in_file = b.read_input(cloud_file) - j.command(f'ln -s {in_file} {local_file}') - script_file = b.read_input(cloud_prefix(script)) - config_file = b.read_input(cloud_prefix(user_config)) - j.command(f'mkdir -p $HOME/.config/hail && ln -s {config_file} $HOME/.config/hail/config.ini') + script_file = b.read_input(script_cloud_file) + config_file = b.read_input(user_config_cloud_file) j.env('HAIL_QUERY_BACKEND', 'batch') command = 'python3' if script.endswith('.py') else 'bash' script_arguments = " ".join(shq(x) for x in arguments) + + j.command(f'mkdir -p $HOME/.config/hail && ln -s {config_file} $HOME/.config/hail/config.ini') + j.command(f'cd {os.getcwd()}') j.command(f'{command} {script_file} {script_arguments}') batch_handle = await b._async_run(wait=False, disable_progress_bar=quiet) assert batch_handle diff --git a/hail/python/test/hailtop/hailctl/batch/__init__.py b/hail/python/test/hailtop/hailctl/batch/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/hail/python/test/hailtop/hailctl/batch/test_submit.py b/hail/python/test/hailtop/hailctl/batch/test_submit.py new file mode 100644 index 00000000000..aa56da2652a --- /dev/null +++ b/hail/python/test/hailtop/hailctl/batch/test_submit.py @@ -0,0 +1,111 @@ +import os +import pytest +import tempfile + +from typer.testing import CliRunner + +from hailtop.hailctl.batch import cli + + +@pytest.fixture +def runner(): + yield CliRunner(mix_stderr=False) + + +def write_script(dir: str, filename: str): + with open(f'{dir}/test_job.py', 'w') as f: + f.write( + f''' +import hailtop.batch as hb +b = hb.Batch() +j = b.new_job() +j.command('cat {filename}') +b.run(wait=False) +backend.close() +''' + ) + + +def write_hello(filename: str): + os.makedirs(os.path.dirname(filename), exist_ok=True) + with open(filename, 'w') as f: + f.write('hello\n') + + +def test_file_with_no_dest(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/hello.txt') + write_script(dir, f'{dir}/hello.txt') + res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt', 'test_job.py']) + assert res.exit_code == 0 + + +def test_file_in_current_dir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/hello.txt') + write_script(dir, f'/hello.txt') + res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt:/', 'test_job.py']) + assert res.exit_code == 0 + + +def test_file_mount_in_child_dir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/hello.txt') + write_script(dir, '/child/hello.txt') + res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt:/child/', 'test_job.py']) + assert res.exit_code == 0 + + +def test_file_mount_in_child_dir_to_root_dir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/child/hello.txt') + write_script(dir, '/hello.txt') + res = runner.invoke(cli.app, ['submit', '--files', 'child/hello.txt:/', 'test_job.py']) + assert res.exit_code == 0 + + +def test_mount_multiple_files(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/child/hello1.txt') + write_hello(f'{dir}/child/hello2.txt') + write_script(dir, '/hello1.txt') + res = runner.invoke( + cli.app, ['submit', '--files', 'child/hello1.txt:/', '--files', 'child/hello2.txt:/', 'test_job.py'] + ) + assert res.exit_code == 0 + + +def test_dir_mount_in_child_dir_to_child_dir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.chdir(dir) + write_hello(f'{dir}/child/hello1.txt') + write_hello(f'{dir}/child/hello2.txt') + write_script(dir, '/child/hello1.txt') + res = runner.invoke(cli.app, ['submit', '--files', 'child/:/child/', 'test_job.py']) + assert res.exit_code == 0 + + +def test_file_outside_curdir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.mkdir(f'{dir}/working_dir') + os.chdir(f'{dir}/working_dir') + write_hello(f'{dir}/hello.txt') + write_script(dir, '/hello.txt') + res = runner.invoke(cli.app, ['submit', '--files', f'{dir}/hello.txt:/', '../test_job.py']) + assert res.exit_code == 0 + + +def test_dir_outside_curdir(runner: CliRunner): + with tempfile.TemporaryDirectory() as dir: + os.mkdir(f'{dir}/working_dir') + os.chdir(f'{dir}/working_dir') + write_hello(f'{dir}/hello1.txt') + write_hello(f'{dir}/hello2.txt') + write_script(dir, '/hello1.txt') + res = runner.invoke(cli.app, ['submit', '--files', f'{dir}/:/', '../test_job.py']) + assert res.exit_code == 0, (res.exit_code, res.stdout, res.stderr)