# Should we convert each 80 GB A100 GPU into two 40 GB MIG instances?
### April 22, 2025

In [1]:
import os
import re
import subprocess
import pandas as pd
# wget https://raw.githubusercontent.com/PrincetonUniversity/job_defense_shield/refs/heads/main/efficiency.py
from efficiency import get_stats_dict
from efficiency import get_nodelist
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# Have Slurm print timestamps as seconds ("%s"). The variable is set in
# os.environ before any sacct subprocess is launched, so the child inherits it.
os.environ["SLURM_TIME_FORMAT"] = "%s"

In [3]:
def get_data_from_sacct(clusters: str,
                        start_date: str,
                        end_date: str,
                        partitions: str,
                        fields: str) -> pd.DataFrame:
    """Return a dataframe of the sacct output.

    Args:
        clusters: Value passed to ``sacct -M``.
        start_date: Value passed to ``sacct -S``.
        end_date: Value passed to ``sacct -E``.
        partitions: Extra command-line text inserted verbatim before ``-o``
            (for example ``"-r cryoem"``); may be an empty string.
        fields: Comma-separated field names passed to ``sacct -o``. They
            also become the column names of the result.

    Returns:
        A dataframe of strings with one row per output line and one column
        per requested field. When sacct prints nothing, the frame is empty
        but still carries the requested columns.

    Raises:
        subprocess.CalledProcessError: If sacct exits with a non-zero status.
        subprocess.TimeoutExpired: If sacct runs for more than 100 seconds.

    Note:
        The command is assembled by string interpolation and executed through
        the shell, so every argument must come from a trusted source.
    """
    cmd = f"sacct -M {clusters} -a -X -P -n -S {start_date} -E {end_date} {partitions} -o {fields}"
    output = subprocess.run(cmd,
                            stdout=subprocess.PIPE,
                            shell=True,
                            timeout=100,
                            text=True,
                            check=True)
    # One record per line: split on newlines only, so that a field containing
    # a space is not torn into several bogus rows, and skip blank lines.
    rows = [line.split("|") for line in output.stdout.split("\n") if line]
    # Passing the column names to the constructor keeps an empty result
    # usable (zero rows, named columns) instead of raising on assignment.
    return pd.DataFrame(rows, columns=fields.split(","))

### Ignore "gputest" partition since those jobs also run on cryoem

In [4]:
# Query parameters: every job in the cryoem partition of della since Jan 1.
# NOTE(review): end_date is "now", while the usage cell further down divides
# by a fixed number of days; re-running later would inflate the percentages.
clusters = "della"
partitions = "-r cryoem"
start_date = "2025-01-01"
end_date = "now"
fields = ",".join([
    "jobid",
    "cluster",
    "user",
    "alloctres",
    "elapsedraw",
    "admincomment",
    "ncpus",
])
df = get_data_from_sacct(
    clusters,
    start_date,
    end_date,
    partitions,
    fields,
)

In [5]:
# Keep only jobs whose elapsedraw is present, purely numeric and, once
# converted to an integer, greater than zero.
df = df.loc[df["elapsedraw"].notna()]
df = df.loc[df["elapsedraw"].str.isnumeric()]
df["elapsedraw"] = df["elapsedraw"].astype("int64")
df = df.loc[df["elapsedraw"] > 0]

In [6]:
# Keep only jobs whose ncpus is present, purely numeric and, once converted
# to an integer, greater than zero.
df = df.loc[df["ncpus"].notna()]
df = df.loc[df["ncpus"].str.isnumeric()]
df["ncpus"] = df["ncpus"].astype("int64")
df = df.loc[df["ncpus"] > 0]

In [7]:
def gpus_per_job(tres: str) -> int:
    """Return the GPU count from the first "gres/gpu=<n>" entry in tres, or 0 if there is none."""
    match = re.search(r"gres/gpu=\d+", tres)
    if match is None:
        return 0
    # The matched text is "gres/gpu=<digits>"; keep what follows the "=".
    _, _, count = match.group().partition("=")
    return int(count)

In [8]:
# Number of allocated GPUs per job, parsed from the alloctres string.
df["gpus"] = df["alloctres"].map(gpus_per_job)

In [9]:
# GPU-seconds = elapsed seconds x allocated GPUs. Computed column-wise
# instead of with a per-row apply(): same values, no Python-level loop, and
# it also works when the frame has no rows.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]
df["gpu-hours"] = df["gpu-seconds"] / 3600

In [10]:
# Replace the raw admincomment value with whatever get_stats_dict() returns
# for it (shown in the output below as a dict with a 'nodes' key).
df["admincomment"] = df["admincomment"].apply(get_stats_dict)
# Call get_nodelist() once per job; each result is unpacked below into a
# (job_nodes, error_code) pair.
df["node-tuple"] = df.apply(lambda row: get_nodelist(row["admincomment"],
                                                     row["jobid"],
                                                     row["cluster"],
                                                     verbose=False),
                                                     axis="columns")
cols = ["job_nodes", "error_code"]
df[cols] = pd.DataFrame(df["node-tuple"].tolist(), index=df.index)
# Keep only jobs whose error_code is 0 (presumably "node list extracted
# successfully"; confirm against efficiency.py).
df = df[df["error_code"] == 0]
# Later cells create their own error_code column, so remove this one.
df.drop(columns=["error_code"], inplace=True)

In [11]:
# Show the first two jobs transposed (one column per job).
df.head(2).T

Unnamed: 0,0,1
jobid,61354923,61358355_1303
cluster,della,della
user,rf2366,ab50
alloctres,"billing=28,cpu=28,gres/gpu=4,mem=250G,node=1","billing=15,cpu=8,gres/gpu=1,mem=200000M,node=1"
elapsedraw,900318,5849
admincomment,{'nodes': {'della-l07g5': {'total_memory': 268...,{'nodes': {'della-l07g6': {'total_memory': 209...
ncpus,28,8
gpus,4,1
gpu-seconds,3601272,5849
gpu-hours,1000.353333,1.624722


Ignore V100 jobs

In [12]:
# Hostnames treated as V100 nodes: della-l06g1 through della-l06g11, plus
# the g8 and g9 hosts of l07, l08 and l09.
v100 = {f"della-l06g{i}" for i in range(1, 12)}
v100.update((
    "della-l07g8", "della-l07g9",
    "della-l08g8", "della-l08g9",
    "della-l09g8", "della-l09g9",
))

In [13]:
def ran_on_v100(job_nodes: set, v100_nodes: set) -> bool:
    """Return True when every node of the job is one of the V100 nodes.

    Args:
        job_nodes: Set of node names the job ran on.
        v100_nodes: Collection of node names that are V100 nodes.

    Returns:
        True if job_nodes contains no node outside v100_nodes, otherwise
        False. A job that touched at least one other node is not counted as
        a V100 job. An empty job_nodes yields True.
    """
    # Use the v100_nodes argument, not a module-level global.
    return job_nodes.issubset(v100_nodes)

In [14]:
# Flag each job by whether all of its nodes are in the v100 set.
df["v100"] = df["job_nodes"].map(lambda nodes: ran_on_v100(nodes, v100))

In [15]:
# Count the jobs flagged True versus False in the v100 column.
df.v100.value_counts()

v100
False    53654
True     38557
Name: count, dtype: int64

In [16]:
# Keep only the jobs that were not flagged as V100 jobs.
df = df.loc[~df["v100"]]

### Percent Usage of A100 GPUs on cryoem

In [17]:
# GPU-hours available over the period: days x 24 hours x number of GPUs.
days = 31 + 28 + 31 + 24  # jan, feb, march, part of april
hours_per_day = 24
num_gpus = 20 * 4  # presumably 20 nodes with 4 GPUs each; confirm
available_gpu_hours = days * hours_per_day * num_gpus
pct_usage = df["gpu-hours"].sum() / available_gpu_hours
print(f"Percent Usage = {round(100 * pct_usage)}%")

Percent Usage = 54%


In [18]:
# Call gpu_memory_usage_eff_tuples() once per job. Later cells read the
# result as (items, error_code), taking item[0] as GPU memory used and
# item[2] as GPU utilization (presumably one item per GPU; confirm against
# efficiency.py).
df["gpu-tuple"] = df.apply(lambda row: gpu_memory_usage_eff_tuples(row["admincomment"],
                                                                   row["jobid"],
                                                                   row["cluster"],
                                                                   verbose=False),
                                                                   axis="columns")

In [19]:
# Drop jobs whose gpu-tuple carries a non-zero error code, then record the
# memory (field 0) and utilization (field 2) of the first item of each job.
df["error_code"] = df["gpu-tuple"].map(lambda tpl: tpl[1])
df = df[df["error_code"] == 0]
first_item = df["gpu-tuple"].map(lambda tpl: tpl[0][0])
df["GPU-Mem-Used"] = first_item.map(lambda item: item[0])
df["GPU-Util"] = first_item.map(lambda item: item[2])

In [20]:
# The error_code column has served its purpose; remove it.
df = df.drop(columns=["error_code"])

In [21]:
# Call cpu_memory_usage() once per job; each result is unpacked below into
# (CPU-Mem-Used, mem-alloc, error_code).
df["memory-tuple"] = df.apply(lambda row: cpu_memory_usage(row["admincomment"],
                                                           row["jobid"],
                                                           row["cluster"],
                                                           verbose=False),
                                                           axis="columns")
cols = ["CPU-Mem-Used", "mem-alloc", "error_code"]
df[cols] = pd.DataFrame(df["memory-tuple"].tolist(), index=df.index)
# Keep only the jobs whose error_code is 0.
df = df[df["error_code"] == 0]

In [22]:
# Normalise CPU cores and CPU memory used by the number of allocated GPUs.
df["cores_per_gpu"] = df["ncpus"].div(df["gpus"])
df["CPU-Mem-Used-per-GPU"] = df["CPU-Mem-Used"].div(df["gpus"])

In [23]:
def max_gpu_mem(tpl):
    """Return the largest field-0 value among the items of a gpu-tuple.

    tpl is an (items, error_code) pair; the error code is not used here.
    Field 0 of each item is what the notebook treats as GPU memory used.
    """
    items, _ = tpl
    return max(item[0] for item in items)

In [24]:
def max_gpu_util(tpl):
    """Return the largest field-2 value among the items of a gpu-tuple.

    tpl is an (items, error_code) pair; the error code is not used here.
    Field 2 of each item is what the notebook treats as GPU utilization.
    """
    items, _ = tpl
    return max(item[2] for item in items)

In [25]:
# Per-job maxima over all items of the gpu-tuple.
df["max_gpu_mem"] = df["gpu-tuple"].map(max_gpu_mem)
df["max_gpu_util"] = df["gpu-tuple"].map(max_gpu_util)

In [26]:
# Show the first two jobs that were allocated more than one GPU, transposed.
df[df.gpus > 1].head(2).T

Unnamed: 0,0,669
jobid,61354923,61364300
cluster,della,della
user,rf2366,yz6956
alloctres,"billing=28,cpu=28,gres/gpu=4,mem=250G,node=1","billing=61,cpu=16,gres/gpu=4,mem=768G,node=1"
elapsedraw,900318,116792
admincomment,{'nodes': {'della-l07g5': {'total_memory': 268...,{'nodes': {'della-l08g5': {'total_memory': 824...
ncpus,28,16
gpus,4,4
gpu-seconds,3601272,467168
gpu-hours,1000.353333,129.768889


The nodes with 80 GB A100s have 1000 GB of CPU memory, 48 cores and 4 GPUs. If each GPU were split in half, every MIG instance would have 6 cores, 125 GB of CPU memory and 40 GB of GPU memory. The percentage of the GPU-hours that could run on these instances is:

In [27]:
# Percentage of all GPU-seconds that come from jobs whose max_gpu_util is at most 50.
100 * df[df["max_gpu_util"] <= 50]["gpu-seconds"].sum() / df["gpu-seconds"].sum()

31.009093079624563

In [28]:
# Percentage of all GPU-seconds that come from jobs whose max_gpu_util is above 50.
100 * df[df["max_gpu_util"] > 50]["gpu-seconds"].sum() / df["gpu-seconds"].sum()

68.99090692037544

In [29]:
# mean utilization: max_gpu_util averaged over jobs, weighted by gpu-seconds
df["util_times_seconds"] = df["max_gpu_util"] * df["gpu-seconds"]
# max_gpu_mem weighted by gpu-hours; summed per user in the groupby further down
df["gpu_mem_times_hours"] = df["max_gpu_mem"] * df["gpu-hours"]
df["util_times_seconds"].sum() / df["gpu-seconds"].sum()

61.182203523460736

In [30]:
# Per-user totals: GPU-hours and the GPU-hour-weighted sum of max_gpu_mem.
gp = (
    df.groupby("user")
    .agg({"gpu-hours": "sum", "gpu_mem_times_hours": "sum"})
    .reset_index()
    .rename(columns={"gpu_mem_times_hours": "mem_weight"})
)
# GPU-hour-weighted mean of max_gpu_mem per user.
gp["mean_gpu_mem_GB"] = gp["mem_weight"] / gp["gpu-hours"]
# Each user's share of all GPU-hours, taken before gpu-hours is rounded.
gp["proportion"] = (gp["gpu-hours"] / gp["gpu-hours"].sum()).apply(lambda share: round(share, 2))
gp["gpu-hours"] = gp["gpu-hours"].apply(round)
gp["mean_gpu_mem_GB"] = gp["mean_gpu_mem_GB"].apply(round)
report_cols = ["user", "gpu-hours", "proportion", "mean_gpu_mem_GB"]
x = gp[report_cols].sort_values("gpu-hours", ascending=False).reset_index(drop=True)
x.index = x.index + 1  # number the rows from 1
print(x)

       user  gpu-hours  proportion  mean_gpu_mem_GB
1      jy15      39809        0.33               41
2    zx8205      20617        0.17               19
3      ab50      18577        0.16               61
4    yz6956      11956        0.10               41
5    rf2366       7787        0.07               62
6    mg2332       5507        0.05               28
7    mj7341       3658        0.03               32
8     aelin       2021        0.02               61
9    fa1073       1541        0.01               45
10   mg6942       1536        0.01               63
11   jg0428       1220        0.01               61
12  gghanim       1190        0.01               78
13   jg1427       1172        0.01               38
14   an3411        807        0.01               61
15  jiaweim        552        0.00               49
16  stravis        429        0.00               25
17   cryoem        207        0.00                8
18   la4139        109        0.00               61
19   ar5531 

In [31]:
# Round gpu-hours to one decimal place.
# NOTE(review): this overwrites the column in place, so the GPU-hour sums
# computed in the cells below use the rounded values; confirm that is intended.
df["gpu-hours"] = df["gpu-hours"].apply(lambda x: round(x, 1))

In [32]:
# Write the jobs that accumulated GPU time to a CSV file, without the index.
export_cols = ["jobid", "user", "gpu-hours", "gpus", "max_gpu_mem"]
has_gpu_time = df["gpu-seconds"] > 0
df.loc[has_gpu_time, export_cols].to_csv("cryoem_gpu_mem_jan1_apr25.csv", index=False)

In [33]:
# GPU-hours of jobs whose max_gpu_mem is below 40.
# NOTE(review): the denominator is the total available GPU-hours
# (days * hours_per_day * num_gpus), not the GPU-hours actually used.
fits_gpu_mem = df["max_gpu_mem"] < 40
gpu_hrs = df.loc[fits_gpu_mem, "gpu-hours"].sum()
pct = f"Percentage of GPU-hours that could run on 40 GB GPUs = {round(100 * gpu_hrs / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB GPUs = 23%


In [34]:
# GPU-hours of jobs that fit within half a GPU's share of the node: at most
# 6 cores per GPU, under 125 of CPU memory per GPU and under 40 of GPU memory.
# NOTE(review): the denominator is the total available GPU-hours
# (days * hours_per_day * num_gpus), not the GPU-hours actually used.
fits_cores = df["cores_per_gpu"] <= 6
fits_cpu_mem = df["CPU-Mem-Used-per-GPU"] < 125
fits_gpu_mem = df["max_gpu_mem"] < 40
gpu_hrs = df.loc[fits_cores & fits_cpu_mem & fits_gpu_mem, "gpu-hours"].sum()
pct = f"Percentage of GPU-hours that could run on 40 GB MIG instance (ignoring utilization) = {round(100 * gpu_hrs / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB MIG instance (ignoring utilization) = 20%


In [35]:
# Same resource limits as the previous cell, plus max_gpu_util below 50.
# NOTE(review): this uses "< 50" while the utilization split earlier in the
# notebook uses "<= 50"; the denominator is the total available GPU-hours,
# not the GPU-hours actually used.
fits_cores = df["cores_per_gpu"] <= 6
fits_cpu_mem = df["CPU-Mem-Used-per-GPU"] < 125
fits_util = df["max_gpu_util"] < 50
fits_gpu_mem = df["max_gpu_mem"] < 40
gpu_hrs = df.loc[fits_cores & fits_cpu_mem & fits_util & fits_gpu_mem, "gpu-hours"].sum()
pct = f"Percentage of GPU-hours that could run on 40 GB MIG instance = {round(100 * gpu_hrs / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB MIG instance = 3%
