# Should we convert 80GB A100 GPUs into two 40 GB MIG instances?
### April 21, 2025

In [1]:
import os
import re
import subprocess
import pandas as pd
# wget https://raw.githubusercontent.com/PrincetonUniversity/job_defense_shield/refs/heads/main/efficiency.py
from efficiency import get_stats_dict
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# Have Slurm print timestamps as seconds (strftime "%s") so they can be
# treated as plain numbers instead of formatted dates.
os.environ.update({"SLURM_TIME_FORMAT": "%s"})

In [3]:
def get_data_from_sacct(clusters: str,
                        start_date: str,
                        end_date: str,
                        partitions: str,
                        fields: str) -> pd.DataFrame:
    """Return a dataframe of the sacct output.

    Parameters
    ----------
    clusters : str
        Value passed to ``sacct -M``.
    start_date, end_date : str
        Values passed to ``sacct -S`` and ``sacct -E``.
    partitions : str
        Extra command-line options inserted before ``-o``, for example
        ``"-r gpu,gpu-shared"``. May be an empty string.
    fields : str
        Comma-separated field names passed to ``sacct -o``. The same names
        become the column labels of the result.

    Returns
    -------
    pd.DataFrame
        One row per non-empty output line and one column per requested
        field. All values are strings. An empty frame (with the requested
        columns) is returned when sacct prints nothing.

    Raises
    ------
    subprocess.CalledProcessError
        If sacct exits with a non-zero status.
    subprocess.TimeoutExpired
        If sacct does not finish within 100 seconds.
    """
    # Local import: keeps this cell runnable without touching the import cell.
    import shlex

    # Build an argument vector instead of a shell string so that the
    # arguments are never re-interpreted by a shell.
    cmd = ["sacct", "-M", clusters, "-a", "-X", "-P", "-n",
           "-S", start_date, "-E", end_date,
           *shlex.split(partitions),
           "-o", fields]
    output = subprocess.run(cmd,
                            stdout=subprocess.PIPE,
                            timeout=100,
                            text=True,
                            check=True)
    columns = fields.split(",")
    # Split on line boundaries only: a field value may contain spaces, and
    # splitting on arbitrary whitespace would break such a record apart.
    rows = [line.split("|") for line in output.stdout.splitlines() if line]
    # Passing the labels to the constructor also works when there are no rows.
    return pd.DataFrame(rows, columns=columns)

### Ignore "gputest" partition since those jobs also run on cryoem

In [4]:
# Query parameters for the sacct call.
clusters = "della"
partitions = "-r gpu,gpu-shared"
start_date = "2025-02-01"
# NOTE(review): "now" makes the query window depend on when the notebook is
# run, while the usage cell further down hard-codes the number of days.
end_date = "now"
# jobid, cluster and admincomment feed the job-stats helpers; alloctres,
# elapsedraw and ncpus describe the allocation and the run time.
fields = "jobid,cluster,user,alloctres,elapsedraw,admincomment,ncpus"
df = get_data_from_sacct(clusters, start_date, end_date, partitions, fields)

In [5]:
# Keep only jobs whose elapsed time is present, made of digits, and positive;
# the column is converted from text to int64 along the way.
df = df.loc[df["elapsedraw"].notna()]
df = df.loc[df["elapsedraw"].str.isnumeric()]
df["elapsedraw"] = df["elapsedraw"].astype("int64")
df = df.loc[df["elapsedraw"].gt(0)]

In [6]:
# Keep only jobs whose CPU count is present, made of digits, and positive;
# the column is converted from text to int64 along the way.
df = df.loc[df["ncpus"].notna()]
df = df.loc[df["ncpus"].str.isnumeric()]
df["ncpus"] = df["ncpus"].astype("int64")
df = df.loc[df["ncpus"].gt(0)]

In [7]:
def gpus_per_job(tres: str) -> int:
    """Return the number of allocated GPUs.

    The count is read from the first ``gres/gpu=<digits>`` entry found in
    the TRES string; 0 is returned when there is no such entry.
    """
    first_hit = re.search(r"gres/gpu=\d+", tres)
    if first_hit is None:
        return 0
    # Drop the "gres/gpu=" prefix and keep only the digits.
    return int(first_hit.group(0)[len("gres/gpu="):])

In [8]:
# Number of GPUs allocated to each job, parsed from its TRES string.
df["gpus"] = df["alloctres"].map(gpus_per_job)

In [9]:
# GPU time charged to each job: elapsed seconds times allocated GPUs.
# A column-wise product replaces the row-by-row apply; it gives the same
# values and also behaves sensibly when the frame has no rows.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]
df["gpu-hours"] = df["gpu-seconds"] / 3600

### Percent Usage of Public GPUs on Della

In [10]:
# Length of the analysis window, month by month.
# NOTE(review): this count is fixed, whereas the sacct query ends at "now";
# re-running on a later date changes the numerator but not the denominator.
days_by_month = {"feb": 28, "march": 31, "part of april": 20}
days = sum(days_by_month.values())
hours_per_day = 24
# presumably 20 nodes with 2 GPUs each plus 69 nodes with 4 GPUs each (316 total) - confirm
num_gpus = 20 * 2 + 69 * 4

# Fraction of the available GPU-hours that the jobs consumed.
available_gpu_hours = days * hours_per_day * num_gpus
consumed_gpu_hours = df["gpu-hours"].sum()
pct_usage = consumed_gpu_hours / available_gpu_hours
print(f"Percent Usage = {round(100 * pct_usage)}%")

Percent Usage = 93%


We see that 93% of the available GPU-hours were consumed.

In [11]:
# Replace each job's raw admincomment with whatever get_stats_dict returns
# for it (the sample output further down shows a dict keyed by 'nodes').
# NOTE(review): the column is overwritten, so running this cell a second
# time would feed already-converted values back into get_stats_dict.
df["admincomment"] = df["admincomment"].apply(get_stats_dict)

In [12]:
def _gpu_tuple_for_job(job):
    """Return the gpu_memory_usage_eff_tuples result for one job row."""
    return gpu_memory_usage_eff_tuples(job["admincomment"],
                                       job["jobid"],
                                       job["cluster"],
                                       verbose=False)


# One (items, error_code) result per job.
df["gpu-tuple"] = df.apply(_gpu_tuple_for_job, axis="columns")

In [13]:
# The second element of each gpu-tuple is an error code; keep only jobs
# where it is 0.
df["error_code"] = df["gpu-tuple"].map(lambda pair: pair[1])
df = df[df["error_code"].eq(0)]
# Entries 0 and 2 of the first per-GPU item.
df["GPU-Mem-Used"] = df["gpu-tuple"].map(lambda pair: pair[0][0][0])
df["GPU-Util"] = df["gpu-tuple"].map(lambda pair: pair[0][0][2])

In [14]:
# The GPU error code has served its purpose; remove the column.
df = df.drop(columns=["error_code"])

In [15]:
def _memory_tuple_for_job(job):
    """Return the cpu_memory_usage result for one job row."""
    return cpu_memory_usage(job["admincomment"],
                            job["jobid"],
                            job["cluster"],
                            verbose=False)


df["memory-tuple"] = df.apply(_memory_tuple_for_job, axis="columns")
# Spread each result tuple over three named columns, then keep only the
# jobs whose error code is 0.
cols = ["CPU-Mem-Used", "mem-alloc", "error_code"]
memory_parts = pd.DataFrame(df["memory-tuple"].tolist(), index=df.index)
df[cols] = memory_parts
df = df[df["error_code"].eq(0)]

In [16]:
# Normalise CPU cores and used CPU memory by the number of allocated GPUs.
df["cores_per_gpu"] = df["ncpus"].div(df["gpus"])
df["CPU-Mem-Used-per-GPU"] = df["CPU-Mem-Used"].div(df["gpus"])

In [17]:
def max_gpu_mem(tpl):
    """Return the largest first entry among the per-GPU items.

    ``tpl`` is a pair ``(items, error_code)``; the error code is ignored.
    In this notebook the value feeds the "max_gpu_mem" column.
    """
    per_gpu_items, _ = tpl
    return max(entry[0] for entry in per_gpu_items)

In [18]:
def max_gpu_util(tpl):
    """Return the largest third entry among the per-GPU items.

    ``tpl`` is a pair ``(items, error_code)``; the error code is ignored.
    In this notebook the value feeds the "max_gpu_util" column.
    """
    per_gpu_items, _ = tpl
    return max(entry[2] for entry in per_gpu_items)

In [19]:
# Per-job maxima taken over all of the job's GPUs.
df["max_gpu_mem"] = df["gpu-tuple"].map(max_gpu_mem)
df["max_gpu_util"] = df["gpu-tuple"].map(max_gpu_util)

In [20]:
# Peek at two multi-GPU jobs, transposed so each field gets its own row.
df.loc[df["gpus"] > 1].head(2).T

Unnamed: 0,282,283
jobid,61797715_0,61797715_1
cluster,della,della
user,hyen,hyen
alloctres,"billing=8192,cpu=8,gres/gpu=2,mem=100G,node=1","billing=8192,cpu=8,gres/gpu=2,mem=100G,node=1"
elapsedraw,149,607
admincomment,{'nodes': {'della-l03g1': {'total_memory': 107...,{'nodes': {'della-l03g1': {'total_memory': 107...
ncpus,8,8
gpus,2,2
gpu-seconds,298,1214
gpu-hours,0.082778,0.337222


The nodes with 80 GB A100s have 1000 GB of CPU memory, 48 cores and 4 GPUs. If each GPU is split in half, then each of the 8 resulting MIG instances would have 6 cores, 125 GB of CPU memory and 40 GB of GPU memory. The percentage of the GPU-hours that could run on these instances is:

In [21]:
# Jobs that stay within the per-instance limits described above: at most
# 6 cores per GPU, under 125 of CPU memory per GPU, under 40 of GPU memory,
# and under 50% maximum GPU utilisation.
fits_mig = (
    (df["cores_per_gpu"] <= 6)
    & (df["CPU-Mem-Used-per-GPU"] < 125)
    & (df["max_gpu_util"] < 50)
    & (df["max_gpu_mem"] < 40)
)
# NOTE(review): df is narrowed in place, so other thresholds cannot be tried
# without re-running the earlier cells.
df = df[fits_mig]
# The share is taken relative to all available GPU-hours in the window.
# It is computed outside the f-string because reusing double quotes inside a
# replacement field is a syntax error before Python 3.12.
mig_pct = round(100 * df["gpu-hours"].sum() / (days * hours_per_day * num_gpus))
pct = f"Percentage of GPU-hours that could run on 40 GB MIG instance = {mig_pct}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB MIG instance = 21%


Note that the 21% is a lower bound because some jobs use less than 40 GB of GPU memory and up to 64 CPU-cores and 384 GB of CPU memory. If one uses thresholds of 64 cores, 384 GB CPU, 40 GB GPU, <50% GPU util, then the percentage is 26%. Also, for multinode jobs, we use the max memory usage per GPU and the max utilization per GPU.

Another interesting set of thresholds is to ignore GPU utilization and just look at ≤ 6 cores per GPU, < 125 GB of CPU memory per GPU, and < 40 GB of GPU memory. In this case the number is 63%.

Unfortunately, we do not have SM% or occupancy, which are much better indicators of how much of the GPU is being used. A code with high GPU utilization and low SM% will run just as fast on a MIG instance.

Let's take the value as 24%, roughly midway between the 21% and 26% estimates above.

There are 316 A100 GPUs in the public pool on Della, so 24% corresponds to about 76 GPUs. There are already 40 GPUs with only 40 GB, which can account for about half of the needed 24%. An additional 36 40 GB GPUs are needed to cover the entire 24%.

# Final Recommendation

Convert between 16 and 20 of the 80 GB A100s (4 or 5 nodes) into between 32 and 40 MIG instances of 40 GB each. If all goes well then we could convert more.