# Should we convert 80 GB A100 GPUs into two 40 GB MIG instances?
### April 22, 2025

In [1]:
import os
import re
import subprocess
import pandas as pd
# wget https://raw.githubusercontent.com/PrincetonUniversity/job_defense_shield/refs/heads/main/efficiency.py
from efficiency import get_stats_dict
from efficiency import get_nodelist
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# Make Slurm commands launched from this process report timestamps as
# seconds ("%s" format) instead of formatted date strings.
os.environ["SLURM_TIME_FORMAT"] = "%s"

In [3]:
def get_data_from_sacct(clusters: str,
                        start_date: str,
                        end_date: str,
                        partitions: str,
                        fields: str) -> pd.DataFrame:
    """Run sacct and return its output as a dataframe of strings.

    Parameters
    ----------
    clusters : value passed to ``sacct -M``.
    start_date, end_date : values passed to ``-S`` and ``-E``.
    partitions : extra command-line options inserted before ``-o``
        (e.g. ``"-r cryoem"``); may be an empty string.
    fields : comma-separated sacct field names passed to ``-o``; the same
        names become the dataframe columns.

    Returns
    -------
    One row per line of sacct output and one column per requested field.
    All values are strings. If sacct prints nothing, the result is an empty
    dataframe that still has the requested columns.

    Raises
    ------
    subprocess.CalledProcessError if sacct exits with a non-zero status.
    subprocess.TimeoutExpired if sacct runs longer than 100 seconds.
    """
    import shlex  # local import: only needed to tokenize `partitions`

    columns = fields.split(",")
    # An argument list (no shell) avoids quoting and injection problems.
    cmd = ["sacct", "-M", clusters, "-a", "-X", "-P", "-n",
           "-S", start_date, "-E", end_date,
           *shlex.split(partitions),
           "-o", fields]
    output = subprocess.run(cmd,
                            stdout=subprocess.PIPE,
                            timeout=100,
                            text=True,
                            check=True)
    # sacct -P prints one record per line; split on line breaks only so that
    # a field containing a space does not get cut into separate rows.
    rows = [line.split("|") for line in output.stdout.splitlines() if line]
    return pd.DataFrame(rows, columns=columns)

### Ignore "gputest" partition since those jobs also run on cryoem

In [4]:
# Query parameters for sacct: cryoem partition of della, from Feb 1, 2025
# up to the moment the query runs.
clusters = "della"
partitions = "-r cryoem"  # plain string: there is nothing to interpolate
start_date = "2025-02-01"
end_date = "now"
fields = "jobid,cluster,user,alloctres,elapsedraw,admincomment,ncpus"
df = get_data_from_sacct(clusters, start_date, end_date, partitions, fields)

In [5]:
# Keep only jobs whose elapsed time is a positive whole number of seconds.
df = df[pd.notna(df.elapsedraw)]
# .copy() yields an independent frame, so the dtype conversion below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[df.elapsedraw.str.isnumeric()].copy()
df["elapsedraw"] = df["elapsedraw"].astype("int64")
df = df[df.elapsedraw > 0].copy()

In [6]:
# Keep only jobs whose CPU-core count is a positive whole number.
df = df[pd.notna(df.ncpus)]
# .copy() yields an independent frame, so the dtype conversion below does not
# write into a slice of the previous frame (avoids SettingWithCopyWarning).
df = df[df.ncpus.str.isnumeric()].copy()
df["ncpus"] = df["ncpus"].astype("int64")
df = df[df.ncpus > 0].copy()

In [7]:
def gpus_per_job(tres: str) -> int:
    """Extract the GPU count from an AllocTRES string.

    Looks for the first "gres/gpu=<digits>" entry and returns its number;
    returns 0 when the string has no such entry.
    """
    match = re.search(r"gres/gpu=\d+", tres)
    if match is None:
        return 0
    return int(match.group().replace("gres/gpu=", ""))

In [8]:
# Number of GPUs allocated to each job, parsed from its AllocTRES string.
df["gpus"] = df["alloctres"].map(gpus_per_job)

In [9]:
# GPU time consumed by each job: elapsed seconds times allocated GPUs.
# Both columns are integers, so a vectorized product gives the same values
# as a row-by-row apply without the per-row Python overhead.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]
df["gpu-hours"] = df["gpu-seconds"] / 3600

In [10]:
# Decode each job's AdminComment, then look up the nodes the job ran on.
df["admincomment"] = df["admincomment"].apply(get_stats_dict)
df["node-tuple"] = df.apply(
    lambda row: get_nodelist(row["admincomment"],
                             row["jobid"],
                             row["cluster"],
                             verbose=False),
    axis="columns",
)
# get_nodelist returns a pair, unpacked here into (job_nodes, error_code).
cols = ["job_nodes", "error_code"]
df[cols] = pd.DataFrame(df["node-tuple"].tolist(), index=df.index)
# Keep jobs whose lookup succeeded. A non-inplace drop returns a fresh frame
# instead of mutating a filtered slice in place.
df = df[df["error_code"] == 0].drop(columns=["error_code"])

In [11]:
# Show the first two jobs, one column per job.
df.iloc[:2].transpose()

Unnamed: 0,0,1
jobid,61992202,62003849_274
cluster,della,della
user,mg6942,ab50
alloctres,"billing=16,cpu=16,gres/gpu=1,mem=128G,node=1","billing=15,cpu=8,gres/gpu=1,mem=200000M,node=1"
elapsedraw,259521,8026
admincomment,{'nodes': {'della-l07g3': {'total_memory': 137...,{'nodes': {'della-l08g1': {'total_memory': 209...
ncpus,16,8
gpus,1,1
gpu-seconds,259521,8026
gpu-hours,72.089167,2.229444


Ignore V100 jobs

In [12]:
# Names of the nodes treated as V100 nodes: della-l06g1 through della-l06g11,
# plus slots g8 and g9 of della-l07, della-l08 and della-l09.
v100 = {f"della-l06g{slot}" for slot in range(1, 12)}
v100 |= {f"della-l{rack:02d}g{slot}" for rack in (7, 8, 9) for slot in (8, 9)}

In [13]:
def ran_on_v100(job_nodes: set, v100_nodes: set) -> bool:
    """Return True when every node in job_nodes is one of v100_nodes.

    A job that used at least one node outside v100_nodes returns False.
    An empty job_nodes set returns True, because it contains no node
    outside v100_nodes.
    """
    # Test against the v100_nodes argument, not a module-level variable,
    # so the function works with whatever node set the caller supplies.
    return job_nodes <= v100_nodes

In [14]:
# Flag jobs whose nodes all belong to the V100 node set.
df["v100"] = df["job_nodes"].map(lambda nodes: ran_on_v100(nodes, v100))

In [15]:
# How many jobs ran only on V100 nodes (True) versus not (False).
df["v100"].value_counts()

v100
False    26205
True     16888
Name: count, dtype: int64

In [16]:
# Discard the V100-only jobs. .copy() makes the result an independent frame,
# so columns added in later cells do not write into a slice of the old one.
df = df[~df.v100].copy()

### Percent usage of the A100 GPUs in the cryoem partition

In [17]:
# Length of the window covered by the sacct query. It is derived from the
# query bounds rather than a hand-counted number of days, so the denominator
# stays correct whenever the notebook is re-run with end_date = "now".
window_end = pd.Timestamp.now() if end_date == "now" else pd.Timestamp(end_date)
days = (window_end - pd.Timestamp(start_date)) / pd.Timedelta(days=1)
hours_per_day = 24
num_gpus = 20 * 4  # presumably 20 nodes x 4 GPUs per node -- TODO confirm the node count
# Used GPU-hours divided by the GPU-hours available over the window.
pct_usage = df["gpu-hours"].sum() / (days * hours_per_day * num_gpus)
print(f"Percent Usage = {round(100 * pct_usage)}%")

Percent Usage = 55%


In [18]:
# Per-job GPU memory/utilization data extracted from the decoded AdminComment.
df["gpu-tuple"] = df.apply(
    lambda job: gpu_memory_usage_eff_tuples(job["admincomment"],
                                            job["jobid"],
                                            job["cluster"],
                                            verbose=False),
    axis=1,
)

In [19]:
# Each "gpu-tuple" is (items, error_code); keep the jobs with error_code 0.
df["error_code"] = df["gpu-tuple"].apply(lambda x: x[1])
# .copy() so the column assignments below act on an independent frame
# rather than on a filtered slice (avoids SettingWithCopyWarning).
df = df[df["error_code"] == 0].copy()
# Values taken from the first item only (index 0 and index 2 of that item).
df["GPU-Mem-Used"] = df["gpu-tuple"].apply(lambda tpl: tpl[0][0][0])
df["GPU-Util"]     = df["gpu-tuple"].apply(lambda tpl: tpl[0][0][2])

In [20]:
# Remove the helper column. Rebinding to the returned frame avoids an
# in-place drop on a frame that was produced by filtering.
df = df.drop(columns=["error_code"])

In [21]:
# CPU memory data for each job, extracted from the decoded AdminComment.
df["memory-tuple"] = df.apply(
    lambda row: cpu_memory_usage(row["admincomment"],
                                 row["jobid"],
                                 row["cluster"],
                                 verbose=False),
    axis="columns",
)
# The returned triple is unpacked into (CPU-Mem-Used, mem-alloc, error_code).
cols = ["CPU-Mem-Used", "mem-alloc", "error_code"]
df[cols] = pd.DataFrame(df["memory-tuple"].tolist(), index=df.index)
# Keep jobs with error_code 0. .copy() so columns added in later cells do
# not write into a filtered slice.
df = df[df["error_code"] == 0].copy()

In [22]:
# Normalize CPU cores and CPU memory used by the number of GPUs in the job.
df["cores_per_gpu"] = df["ncpus"].div(df["gpus"])
df["CPU-Mem-Used-per-GPU"] = df["CPU-Mem-Used"].div(df["gpus"])

In [23]:
def max_gpu_mem(tpl):
    """Return the largest index-0 value among the items in tpl[0].

    tpl is an (items, error_code) pair; the error code is ignored here.
    Elsewhere in this notebook the index-0 value is stored as "GPU-Mem-Used".
    """
    per_gpu, _ = tpl
    return max(entry[0] for entry in per_gpu)

In [24]:
def max_gpu_util(tpl):
    """Return the largest index-2 value among the items in tpl[0].

    tpl is an (items, error_code) pair; the error code is ignored here.
    Elsewhere in this notebook the index-2 value is stored as "GPU-Util".
    """
    per_gpu, _ = tpl
    return max(entry[2] for entry in per_gpu)

In [25]:
# Per-job maxima across all of the job's GPUs.
df["max_gpu_mem"] = df["gpu-tuple"].map(max_gpu_mem)
df["max_gpu_util"] = df["gpu-tuple"].map(max_gpu_util)

In [26]:
# Show the first two multi-GPU jobs, one column per job.
df.loc[df["gpus"] > 1].head(2).transpose()

Unnamed: 0,651,652
jobid,62006995,62007217
cluster,della,della
user,gghanim,gghanim
alloctres,"billing=40,cpu=24,gres/gpu=2,mem=512G,node=1","billing=40,cpu=24,gres/gpu=2,mem=512G,node=1"
elapsedraw,173102,165626
admincomment,{'nodes': {'della-l09g4': {'total_memory': 549...,{'nodes': {'della-l09g3': {'total_memory': 549...
ncpus,24,24
gpus,2,2
gpu-seconds,346204,331252
gpu-hours,96.167778,92.014444


The nodes with 80 GB A100s have 1000 GB of CPU memory, 48 cores, and 4 GPUs. If each GPU is split in half, each of the 8 resulting MIG instances would have 6 cores, 125 GB of CPU memory, and 40 GB of GPU memory. The percentage of the GPU-hours that could run on these instances is:

In [27]:
# Share (%) of the GPU-seconds used by jobs whose max GPU utilization is at most 50.
100 * df.loc[df["max_gpu_util"] <= 50, "gpu-seconds"].sum() / df["gpu-seconds"].sum()

27.162304386135663

In [28]:
# Share (%) of the GPU-seconds used by jobs whose max GPU utilization is above 50.
100 * df.loc[df["max_gpu_util"] > 50, "gpu-seconds"].sum() / df["gpu-seconds"].sum()

72.83769561386434

In [29]:
# Mean of each job's max GPU utilization, weighted by the job's GPU-seconds.
df["util_times_seconds"] = df["max_gpu_util"].mul(df["gpu-seconds"])
df["util_times_seconds"].sum() / df["gpu-seconds"].sum()

64.63392923014177

In [30]:
# Keep the jobs whose max per-GPU memory value is below 40.
df = df[(df["max_gpu_mem"] < 40)]
# NOTE: the denominator is days * hours_per_day * num_gpus (the GPU-hours
# available), not the GPU-hours actually used by jobs.
# Single quotes inside the replacement field keep this line valid on Python
# versions before 3.12, which reject a nested double quote in an f-string.
pct = f"Percentage of GPU-hours that could run on 40 GB GPUs = {round(100 * df['gpu-hours'].sum() / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB GPUs = 30%


In [31]:
# Keep the jobs that fit the per-instance limits on cores, CPU memory and
# GPU memory; GPU utilization is not considered here.
df = df[(df["cores_per_gpu"] <= 6) & (df["CPU-Mem-Used-per-GPU"] < 125) & (df["max_gpu_mem"] < 40)]
# NOTE: the denominator is days * hours_per_day * num_gpus (the GPU-hours
# available), not the GPU-hours actually used by jobs.
# Single quotes inside the replacement field keep this line valid on Python
# versions before 3.12, which reject a nested double quote in an f-string.
pct = f"Percentage of GPU-hours that could run on 40 GB MIG instance (ignoring utilization) = {round(100 * df['gpu-hours'].sum() / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB MIG instance (ignoring utilization) = 27%


In [32]:
# Keep the jobs that fit the per-instance limits on cores, CPU memory, GPU
# memory and GPU utilization. The utilization test here is strict (< 50),
# whereas the earlier utilization breakdown used <= 50.
df = df[(df["cores_per_gpu"] <= 6) & (df["CPU-Mem-Used-per-GPU"] < 125) & (df["max_gpu_util"] < 50) & (df["max_gpu_mem"] < 40)]
# NOTE: the denominator is days * hours_per_day * num_gpus (the GPU-hours
# available), not the GPU-hours actually used by jobs.
# Single quotes inside the replacement field keep this line valid on Python
# versions before 3.12, which reject a nested double quote in an f-string.
pct = f"Percentage of GPU-hours that could run on 40 GB MIG instance = {round(100 * df['gpu-hours'].sum() / (days * hours_per_day * num_gpus))}%"
print(pct)

Percentage of GPU-hours that could run on 40 GB MIG instance = 5%
