Skip to content

Commit

Permalink
[hailctl] Fix uploading files from parent directories of cwd (#13812)
Browse files Browse the repository at this point in the history
The bug in #13785 was that we used a `relpath`, but the files that were
to be included was the parent directory of the current working directory
of the script. To solve this, I added a new option `--mounts` that uses
the Docker syntax for `-v src:dest` to specify where the contents of the
source should be mounted into the container.


Fixes #13785
  • Loading branch information
jigold committed Jan 9, 2024
1 parent 3ddbf5f commit cb87096
Show file tree
Hide file tree
Showing 4 changed files with 166 additions and 15 deletions.
4 changes: 3 additions & 1 deletion build.yaml
Expand Up @@ -1042,6 +1042,7 @@ steps:
--durations=50 \
--ignore=test/hailtop/batch/ \
--ignore=test/hailtop/inter_cloud \
--ignore=test/hailtop/hailctl/batch/ \
--timeout=120 \
test
inputs:
Expand Down Expand Up @@ -3006,7 +3007,8 @@ steps:
--instafail \
--durations=50 \
--timeout=360 \
/io/test/hailtop/batch/
/io/test/hailtop/batch/ \
/io/test/hailtop/hailctl/batch/
inputs:
- from: /repo/hail/python/test
to: /io/test
Expand Down
66 changes: 52 additions & 14 deletions hail/python/hailtop/hailctl/batch/submit.py
@@ -1,7 +1,12 @@
import orjson
import os
import re
from shlex import quote as shq
from hailtop import pip_version
from typing import Tuple


FILE_REGEX = re.compile(r'(?P<src>[^:]+)(:(?P<dest>.+))?')


async def submit(name, image_name, files, output, script, arguments):
Expand All @@ -18,43 +23,76 @@ async def submit(name, image_name, files, output, script, arguments):
)

files = unpack_comma_delimited_inputs(files)
user_config = get_user_config_path()
user_config = str(get_user_config_path())

quiet = output != 'text'

remote_tmpdir = get_remote_tmpdir('hailctl batch submit')
remote_tmpdir = remote_tmpdir.rstrip('/')

tmpdir_path_prefix = secret_alnum_string()

def cloud_prefix(path):
path = path.lstrip('/')
return f'{remote_tmpdir}/{tmpdir_path_prefix}/{path}'

def file_input_to_src_dest(file: str) -> Tuple[str, str, str]:
match = FILE_REGEX.match(file)
if match is None:
raise ValueError(f'invalid file specification {file}. Must have the form "src" or "src:dest"')

result = match.groupdict()

src = result.get('src')
if src is None:
raise ValueError(f'invalid file specification {file}. Must have a "src" defined.')
src = os.path.abspath(os.path.expanduser(src))
src = src.rstrip('/')

dest = result.get('dest')
if dest is not None:
dest = os.path.abspath(os.path.expanduser(dest))
else:
dest = os.getcwd()

cloud_file = cloud_prefix(src)

return (src, dest, cloud_file)

backend = hb.ServiceBackend()
b = hb.Batch(name=name, backend=backend)
j = b.new_bash_job()
j.image(image_name or os.environ.get('HAIL_GENETICS_HAIL_IMAGE', f'hailgenetics/hail:{pip_version()}'))

rel_file_paths = [os.path.relpath(file) for file in files]
local_files_to_cloud_files = [{'from': local, 'to': cloud_prefix(local)} for local in rel_file_paths]
local_files_to_cloud_files = []

for file in files:
src, dest, cloud_file = file_input_to_src_dest(file)
local_files_to_cloud_files.append({'from': src, 'to': cloud_file})
in_file = b.read_input(cloud_file)
j.command(f'mkdir -p {os.path.dirname(dest)}; ln -s {in_file} {dest}')

script_src, _, script_cloud_file = file_input_to_src_dest(script)
user_config_src, _, user_config_cloud_file = file_input_to_src_dest(user_config)

await copy_from_dict(files=local_files_to_cloud_files)
await copy_from_dict(
files=[
{'from': script, 'to': cloud_prefix(script)},
{'from': str(user_config), 'to': cloud_prefix(user_config)},
*local_files_to_cloud_files,
{'from': script_src, 'to': script_cloud_file},
{'from': user_config_src, 'to': user_config_cloud_file},
]
)
for file in local_files_to_cloud_files:
local_file = file['from']
cloud_file = file['to']
in_file = b.read_input(cloud_file)
j.command(f'ln -s {in_file} {local_file}')

script_file = b.read_input(cloud_prefix(script))
config_file = b.read_input(cloud_prefix(user_config))
j.command(f'mkdir -p $HOME/.config/hail && ln -s {config_file} $HOME/.config/hail/config.ini')
script_file = b.read_input(script_cloud_file)
config_file = b.read_input(user_config_cloud_file)

j.env('HAIL_QUERY_BACKEND', 'batch')

command = 'python3' if script.endswith('.py') else 'bash'
script_arguments = " ".join(shq(x) for x in arguments)

j.command(f'mkdir -p $HOME/.config/hail && ln -s {config_file} $HOME/.config/hail/config.ini')
j.command(f'cd {os.getcwd()}')
j.command(f'{command} {script_file} {script_arguments}')
batch_handle = await b._async_run(wait=False, disable_progress_bar=quiet)
assert batch_handle
Expand Down
Empty file.
111 changes: 111 additions & 0 deletions hail/python/test/hailtop/hailctl/batch/test_submit.py
@@ -0,0 +1,111 @@
import os
import pytest
import tempfile

from typer.testing import CliRunner

from hailtop.hailctl.batch import cli


@pytest.fixture
def runner():
yield CliRunner(mix_stderr=False)


def write_script(dir: str, filename: str):
with open(f'{dir}/test_job.py', 'w') as f:
f.write(
f'''
import hailtop.batch as hb
b = hb.Batch()
j = b.new_job()
j.command('cat {filename}')
b.run(wait=False)
backend.close()
'''
)


def write_hello(filename: str):
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'w') as f:
f.write('hello\n')


def test_file_with_no_dest(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/hello.txt')
write_script(dir, f'{dir}/hello.txt')
res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt', 'test_job.py'])
assert res.exit_code == 0


def test_file_in_current_dir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/hello.txt')
write_script(dir, f'/hello.txt')
res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt:/', 'test_job.py'])
assert res.exit_code == 0


def test_file_mount_in_child_dir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/hello.txt')
write_script(dir, '/child/hello.txt')
res = runner.invoke(cli.app, ['submit', '--files', 'hello.txt:/child/', 'test_job.py'])
assert res.exit_code == 0


def test_file_mount_in_child_dir_to_root_dir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/child/hello.txt')
write_script(dir, '/hello.txt')
res = runner.invoke(cli.app, ['submit', '--files', 'child/hello.txt:/', 'test_job.py'])
assert res.exit_code == 0


def test_mount_multiple_files(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/child/hello1.txt')
write_hello(f'{dir}/child/hello2.txt')
write_script(dir, '/hello1.txt')
res = runner.invoke(
cli.app, ['submit', '--files', 'child/hello1.txt:/', '--files', 'child/hello2.txt:/', 'test_job.py']
)
assert res.exit_code == 0


def test_dir_mount_in_child_dir_to_child_dir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.chdir(dir)
write_hello(f'{dir}/child/hello1.txt')
write_hello(f'{dir}/child/hello2.txt')
write_script(dir, '/child/hello1.txt')
res = runner.invoke(cli.app, ['submit', '--files', 'child/:/child/', 'test_job.py'])
assert res.exit_code == 0


def test_file_outside_curdir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.mkdir(f'{dir}/working_dir')
os.chdir(f'{dir}/working_dir')
write_hello(f'{dir}/hello.txt')
write_script(dir, '/hello.txt')
res = runner.invoke(cli.app, ['submit', '--files', f'{dir}/hello.txt:/', '../test_job.py'])
assert res.exit_code == 0


def test_dir_outside_curdir(runner: CliRunner):
with tempfile.TemporaryDirectory() as dir:
os.mkdir(f'{dir}/working_dir')
os.chdir(f'{dir}/working_dir')
write_hello(f'{dir}/hello1.txt')
write_hello(f'{dir}/hello2.txt')
write_script(dir, '/hello1.txt')
res = runner.invoke(cli.app, ['submit', '--files', f'{dir}/:/', '../test_job.py'])
assert res.exit_code == 0, (res.exit_code, res.stdout, res.stderr)

0 comments on commit cb87096

Please sign in to comment.