# Utilization of the cryoem nodes
### August 2023
### J. Halverson

In [1]:
import subprocess
import pandas as pd
from datetime import datetime

In [2]:
start_date = "2023-05-01"
end_date = "2023-08-20"

In [3]:
fields = "jobid,user,account,partition,nnodes,ncpus,alloctres,elapsedraw,nodelist"
cmd = f"sacct -M della -a -X -P -n -S {start_date} -E {end_date} -o {fields} -r cryoem"

In [4]:
output = subprocess.run(cmd,
                        stdout=subprocess.PIPE,
                        shell=True,
                        timeout=100,
                        text=True,
                        check=True)

In [5]:
rows = [row.split("|") for row in output.stdout.split()]
df = pd.DataFrame(rows)
df.columns = fields.split(",")

In [6]:
df.head(2).T

Unnamed: 0,0,1
jobid,47332172,47347657
user,xuelanw,zhonge
account,molbio,cs
partition,cryoem,cryoem
nnodes,3,1
ncpus,84,1
alloctres,"billing=84,cpu=84,gres/gpu=9,mem=375G,node=3","billing=40,cpu=1,gres/gpu=1,mem=512G,node=1"
elapsedraw,173129,162549
nodelist,della-l06g[6-8],della-l09g9


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35464 entries, 0 to 35463
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   jobid       35464 non-null  object
 1   user        35063 non-null  object
 2   account     35063 non-null  object
 3   partition   35063 non-null  object
 4   nnodes      35063 non-null  object
 5   ncpus       35063 non-null  object
 6   alloctres   35063 non-null  object
 7   elapsedraw  35063 non-null  object
 8   nodelist    35063 non-null  object
dtypes: object(9)
memory usage: 2.4+ MB


Only consider jobs that ran for 1 second or more:

In [8]:
df = df[pd.notna(df.elapsedraw) & df.elapsedraw.str.isnumeric()]
df.elapsedraw = df.elapsedraw.astype("int64")
df = df[df.elapsedraw > 0]

In [9]:
df.shape[0]

34599

Expand the nodelist into a Python list for each job. This will allow us to exclude the v100 nodes.

In [10]:
def make_node_list(nodes: str) -> list:
    """Convert a nodelist from sacct to a Python list of node names. For
       example: della-l07g[4-7],della-l08g2 becomes
       ['della-l07g4', 'della-l07g5', 'della-l07g6', 'della-l07g7', 'della-l08g2']
    """
    cmd = f"scontrol show hostname {nodes}"
    try:
        output = subprocess.run(cmd,
                                stdout=subprocess.PIPE,
                                shell=True,
                                timeout=5,
                                text=True,
                                check=True)
    except:
        print(f"Failed to get nodes for {nodes}")
    else:
        return output.stdout.strip().split("\n")

Next line may take several minutes to run -- use cache after first run or only function to jobs with more than 1 node:

In [11]:
cache = False

In [12]:
if cache:
    df = pd.read_json("cache.json")
else:
    df["nodes"] = df["nodelist"].apply(make_node_list)
    df.to_json("cache.json")

Add GPU metrics:

In [13]:
def gpus_per_job(tres: str) -> int:
    """Extract number of GPUs from alloctres."""
    if "gres/gpu=" in tres:
        for part in tres.split(","):
            if "gres/gpu=" in part:
                gpus = int(part.split("=")[-1])
        return gpus
    else:
        return 0

In [14]:
df["gpus"] = df.alloctres.apply(gpus_per_job)
df["gpu-seconds"] = df.apply(lambda row: row["elapsedraw"] * row["gpus"], axis='columns')

In [15]:
df.head(2).T

Unnamed: 0,0,1
jobid,47332172,47347657
user,xuelanw,zhonge
account,molbio,cs
partition,cryoem,cryoem
nnodes,3,1
ncpus,84,1
alloctres,"billing=84,cpu=84,gres/gpu=9,mem=375G,node=3","billing=40,cpu=1,gres/gpu=1,mem=512G,node=1"
elapsedraw,173129,162549
nodelist,della-l06g[6-8],della-l09g9
nodes,"[della-l06g6, della-l06g7, della-l06g8]",[della-l09g9]


Exclude jobs that ran on the v100 nodes.

In [16]:
v100 = [f"della-l06g{i}" for i in range(1, 12)] + \
       ["della-l07g8", "della-l07g9"] + \
       ["della-l08g8", "della-l08g9"] + \
       ["della-l09g8", "della-l09g9"]
v100

['della-l06g1',
 'della-l06g2',
 'della-l06g3',
 'della-l06g4',
 'della-l06g5',
 'della-l06g6',
 'della-l06g7',
 'della-l06g8',
 'della-l06g9',
 'della-l06g10',
 'della-l06g11',
 'della-l07g8',
 'della-l07g9',
 'della-l08g8',
 'della-l08g9',
 'della-l09g8',
 'della-l09g9']

In [17]:
def ran_on_v100(job_nodes: list, v100_nodes: list) -> bool:
    diff = set(job_nodes) - set(v100_nodes)
    return True if len(diff) == 0 else False

In [18]:
df["v100"] = df.nodes.apply(lambda job_nodes: ran_on_v100(job_nodes, v100))

In [19]:
df.head(2).T

Unnamed: 0,0,1
jobid,47332172,47347657
user,xuelanw,zhonge
account,molbio,cs
partition,cryoem,cryoem
nnodes,3,1
ncpus,84,1
alloctres,"billing=84,cpu=84,gres/gpu=9,mem=375G,node=3","billing=40,cpu=1,gres/gpu=1,mem=512G,node=1"
elapsedraw,173129,162549
nodelist,della-l06g[6-8],della-l09g9
nodes,"[della-l06g6, della-l06g7, della-l06g8]",[della-l09g9]


In [20]:
a100_jobs = df[~df.v100]

Maxium available GPU hours on the 21 nodes:

In [21]:
diff = datetime.strptime(end_date, "%Y-%m-%d") - datetime.strptime(start_date, "%Y-%m-%d")
hrs_per_day = 24
nodes = 21
gpus_per_node = 4
max_gpu_hours = diff.days * hrs_per_day * nodes * gpus_per_node

Finally, the usage:

In [22]:
secs_per_hour = 3600
percent_utilization = 100 * a100_jobs["gpu-seconds"].sum() / secs_per_hour / max_gpu_hours
print(f"percent_utilization = {round(percent_utilization, 1)}%")

percent_utilization = 9.8%
