import json
import subprocess

from collections.abc import Iterable
from typing import Any

# Fields of ``scontrol show job`` that are needed to build the sacct-like view.
_SCONTROL_REQUIRED_KEYS = ("JobState", "Reason", "ExitCode", "NodeList")


def parse_scontrol_output(output: str) -> dict[str, Any]:
    """Parse ``scontrol show job`` output into a dict similar to ``sacct --json``.

    The output is split on whitespace and every ``Key=Value`` token is stored
    under ``Key`` (tokens without ``=`` are ignored; if a key repeats, the last
    occurrence wins). On top of these raw fields, the following sacct-style
    entries are added:

    * ``state``: ``{"current": [<JobState>], "reason": <Reason>}``
    * ``derived_exit_code``: ``{"return_code": {"number": <exit code>}}`` where
      the exit code is the part of ``ExitCode`` before the ``:``, kept as the
      string printed by scontrol.
    * ``nodes``: ``NodeList``, with ``"(null)"`` replaced by ``"None assigned"``.

    Args:
        output: Text printed by ``scontrol show job <id>``.

    Returns:
        The parsed fields plus the sacct-style entries, or an empty dict when
        the output does not contain all of ``JobState``, ``Reason``,
        ``ExitCode`` and ``NodeList`` (e.g. empty output or an informational
        message instead of a job record).
    """
    result: dict[str, Any] = {}
    for token in output.split():
        key, separator, value = token.partition("=")
        if not separator:
            continue
        result[key] = value

    # Without these fields there is no job record to translate; report
    # "no status" instead of failing with a KeyError.
    if any(key not in result for key in _SCONTROL_REQUIRED_KEYS):
        return {}

    # make results similar to sacct --json
    result["state"] = {"current": [result["JobState"]], "reason": result["Reason"]}
    result["derived_exit_code"] = {
        "return_code": {"number": result["ExitCode"].split(":")[0]}
    }
    nodes = result["NodeList"]
    result["nodes"] = "None assigned" if nodes == "(null)" else nodes
    return result


def job_status_from_scontrol(job_id: int) -> dict:
    """Retrieve the status of a single job using ``scontrol show job``.

    Args:
        job_id: Slurm job id to query.

    Returns:
        The dict built by :func:`parse_scontrol_output`, or an empty dict when
        scontrol exits with a non-zero status or prints no usable job record.
    """
    try:
        # we don't use --json because it is not supported by older versions of
        # scontrol
        output = subprocess.check_output(
            ["scontrol", "show", "job", str(job_id)], text=True
        )
    except subprocess.CalledProcessError:
        return {}
    return parse_scontrol_output(output)


def update_job_statuses(grid_ids: Iterable[int]) -> dict[int, dict]:
    """Retrieve the status of the given jobs from Slurm.

    ``sacct --json`` is queried once for all jobs. If sacct exits with a
    non-zero status, each job is queried individually through scontrol instead
    and jobs for which scontrol yields nothing are left out of the result.

    Args:
        grid_ids: Slurm job ids to look up. Any iterable is accepted, including
            one-shot iterators.

    Returns:
        A mapping from job id to its status dict. With sacct the keys are the
        ``job_id`` values reported by sacct; with the scontrol fallback they
        are the ids that were passed in. Empty when no ids are given.
    """
    # Materialize once: the ids are needed for the sacct call and again for the
    # scontrol fallback, and a generator would be exhausted after the first use.
    job_ids = list(grid_ids)
    if not job_ids:
        return {}

    try:
        output = subprocess.check_output(
            ["sacct", "-j", ",".join(str(job_id) for job_id in job_ids), "--json"],
            text=True,
        )
    except subprocess.CalledProcessError:
        status: dict[int, dict] = {}
        for job_id in job_ids:
            job_status = job_status_from_scontrol(job_id)
            if job_status:
                status[job_id] = job_status
        return status

    return {job["job_id"]: job for job in json.loads(output)["jobs"]}