From 731380b2f6652acfb6f71ee7bbc1a607db4a8232 Mon Sep 17 00:00:00 2001 From: Amir Mohammadi <5738695+183amir@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:06:20 +0200 Subject: [PATCH 1/4] fix: handle sacct failures Some Slurm installations have accounting disabled and you can only get information about the job using scontrol show job. For now, we just handle the failed call here. --- src/gridtk/manager.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/gridtk/manager.py b/src/gridtk/manager.py index 032c52d..5206004 100644 --- a/src/gridtk/manager.py +++ b/src/gridtk/manager.py @@ -38,10 +38,13 @@ def update_job_statuses(grid_ids: Iterable[int]) -> dict[int, dict]: """Retrieve the status of the jobs in the database.""" status = dict() - output = subprocess.check_output( - ["sacct", "-j", ",".join([str(x) for x in grid_ids]), "--json"], - text=True, - ) + try: + output = subprocess.check_output( + ["sacct", "-j", ",".join([str(x) for x in grid_ids]), "--json"], + text=True, + ) + except subprocess.CalledProcessError: + return status for job in json.loads(output)["jobs"]: status[job["job_id"]] = job return status From 1e8c607f57c18dc4ea48cdb29cb3c6cf7f39b417 Mon Sep 17 00:00:00 2001 From: Amir Mohammadi <5738695+183amir@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:08:11 +0200 Subject: [PATCH 2/4] fix: do not delete the logs folder when the database is not empty It can happen that the logs folder is empty (all jobs are pending) but the database is not empty. We should not delete the logs folder in this case. 
--- src/gridtk/manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/gridtk/manager.py b/src/gridtk/manager.py index 5206004..9b86a22 100644 --- a/src/gridtk/manager.py +++ b/src/gridtk/manager.py @@ -203,6 +203,6 @@ def __del__(self): and len(self.list_jobs(update_jobs=False)) == 0 ): Path(self.database).unlink() - if self.logs_dir.exists() and len(os.listdir(self.logs_dir)) == 0: - shutil.rmtree(self.logs_dir) + if self.logs_dir.exists() and len(os.listdir(self.logs_dir)) == 0: + shutil.rmtree(self.logs_dir) self.engine.dispose() From 4aadd84ea34d9def2cdb49848aaf1daa422086dc Mon Sep 17 00:00:00 2001 From: Amir Mohammadi <5738695+183amir@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:13:44 +0200 Subject: [PATCH 3/4] fix: do not recommend installing with pixi When installing with `pixi global install`, the CONDA_PREFIX and PATH env variables are shadowed (see https://github.com/prefix-dev/pixi/issues/1382) and this breaks binary discovery (e.g. python) when a job is submitted. --- README.md | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/README.md b/README.md index 32e8733..8856502 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ installation, submission, monitoring, and various commands provided by GridTK. Before diving into GridTK, ensure you have the following prerequisites: 1. A working Slurm setup. -2. [Pixi](https://pixi.sh/) (recommended) or [pipx](https://pipx.pypa.io/stable/) installed. +2. [pipx](https://pipx.pypa.io/stable/) installed. 3. GridTK installed (instructions provided below). 
## Installation @@ -34,11 +34,6 @@ Before diving into GridTK, ensure you have the following prerequisites: To install GridTK, open your terminal and run the following command: ```bash -# Install gridtk using pixi -$ curl -fsSL https://pixi.sh/install.sh | bash # installs pixi -$ pixi global install gridtk - -# Install gridtk using pipx $ pipx install gridtk ``` It is **not recommended** to install GridTK using `pip install gridtk` in the From 0efd61c38b847734b945eb3ccc52ed5fced60365 Mon Sep 17 00:00:00 2001 From: Amir Mohammadi Date: Thu, 18 Jul 2024 12:20:53 +0200 Subject: [PATCH 4/4] fix: retrieve job status from scontrol when sacct is not available This can happen on slurm installations where accounting is disabled. --- src/gridtk/manager.py | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/gridtk/manager.py b/src/gridtk/manager.py index 9b86a22..e24cd15 100644 --- a/src/gridtk/manager.py +++ b/src/gridtk/manager.py @@ -25,6 +25,7 @@ from collections.abc import Iterable from pathlib import Path +from typing import Any import sqlalchemy @@ -35,6 +36,37 @@ from .tools import job_ids_from_dep_str, parse_array_indexes +def parse_scontrol_output(output: str) -> dict[str, Any]: + """Parse scontrol output and return a dict similar to `sacct --json`.""" + result: dict[str, Any] = dict() + for key_value in output.strip().split(): + if "=" not in key_value: + continue + key, value = key_value.split("=", 1) + result[key] = value + # make results similar to sacct --json + result["state"] = {"current": [result["JobState"]], "reason": result["Reason"]} + result["derived_exit_code"] = { + "return_code": {"number": result["ExitCode"].split(":")[0]} + } + result["nodes"] = result["NodeList"] + if result["nodes"] == "(null)": + result["nodes"] = "None assigned" + return result + + +def job_status_from_scontrol(job_id: int) -> dict: + """Retrieve the status of a job using scontrol.""" + try: + # we don't use --json because it is not 
supported by older versions of scontrol + output = subprocess.check_output( + ["scontrol", "show", "job", str(job_id)], text=True + ) + except subprocess.CalledProcessError: + return dict() + return parse_scontrol_output(output) + + def update_job_statuses(grid_ids: Iterable[int]) -> dict[int, dict]: """Retrieve the status of the jobs in the database.""" status = dict() @@ -44,6 +76,10 @@ def update_job_statuses(grid_ids: Iterable[int]) -> dict[int, dict]: text=True, ) except subprocess.CalledProcessError: + for job_id in grid_ids: + job_status = job_status_from_scontrol(job_id) + if job_status: + status[job_id] = job_status return status for job in json.loads(output)["jobs"]: status[job["job_id"]] = job