# Cleaning sacct data for Stellar

<pre>$ ssh stellar
$ export SLURM_TIME_FORMAT="%s"
$ sacct -M stellar -a -X -P -S 2023-01-01T00:00:00 -E 2023-12-31T23:59:59 -o cluster,start,end,elapsedraw,timelimitraw,ncpus,nnodes,cputimeraw,alloctres,nodelist,admincomment > stellar.2023</pre>

## Working directory

On della, see /home/jdh4/wentzlaff_job_data_2023

## Cleaning

In [1]:
import os
import re
import subprocess
import numpy as np
import pandas as pd
from efficiency import get_stats_dict
from efficiency import cpu_efficiency
from efficiency import gpu_efficiency
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# Load the sacct dump written by the command at the top of this notebook;
# sacct was run with -P, so fields are separated by "|".
df = pd.read_csv("stellar.2023", sep="|")
# Show the first two jobs transposed so that every column is visible.
df.head(2).T

Unnamed: 0,0,1
Cluster,stellar,stellar
Start,1672399940.0,1672573066.0
End,1672573056,1672746193
ElapsedRaw,173116,173127
TimelimitRaw,2880,2880
NCPUS,1536,1536
NNodes,16,16
CPUTimeRAW,265906176,265923072
AllocTRES,"billing=2812,cpu=1536,mem=11250G,node=16","billing=2812,cpu=1536,mem=11250G,node=16"
NodeList,"stellar-i08n[1-5,7-17]","stellar-i08n[1-5,7-17]"


In [3]:
# Normalise the sacct header names (e.g. "ElapsedRaw") to lower case.
df.columns = df.columns.str.lower()

In [4]:
# Map the lower-cased sacct field names onto the column names used in the rest
# of this notebook. Keys that are not present in the frame are ignored by
# DataFrame.rename.
renamings = {
    "user": "netid",
    "cputimeraw": "cpu-seconds",
    "nnodes": "nodes",
    "ncpus": "cores",
    "timelimitraw": "limit-minutes",
}
df = df.rename(columns=renamings)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 805503 entries, 0 to 805502
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   cluster        805503 non-null  object 
 1   start          786649 non-null  float64
 2   end            805503 non-null  int64  
 3   elapsedraw     805503 non-null  int64  
 4   limit-minutes  805503 non-null  int64  
 5   cores          805503 non-null  int64  
 6   nodes          805503 non-null  int64  
 7   cpu-seconds    805503 non-null  int64  
 8   alloctres      785881 non-null  object 
 9   nodelist       805503 non-null  object 
 10  admincomment   786640 non-null  object 
dtypes: float64(1), int64(6), object(4)
memory usage: 67.6+ MB


In [6]:
# Clean the elapsedraw field and keep only jobs that actually ran.
# Missing values are dropped regardless of dtype: a float64 column holding NaN
# would otherwise make the int64 cast below raise.
df = df[df["elapsedraw"].notna()]
if df["elapsedraw"].dtype == "object":
    # The column was read as strings; keep only purely numeric entries.
    df = df[df["elapsedraw"].str.isnumeric()]
# assign() returns a new frame, which avoids setting a column on a filtered
# slice (SettingWithCopyWarning) and keeps the column in its original position.
df = df.assign(elapsedraw=df["elapsedraw"].astype("int64"))
# Jobs with zero elapsed time never ran.
df = df[df["elapsedraw"] > 0]

In [7]:
# Clean the start field.
# Missing start times are dropped regardless of dtype: when the column is read
# as float64 with NaN, the int64 cast below would raise unless every such row
# had already been removed by an earlier filter.
df = df[pd.notna(df["start"])]
if df["start"].dtype == "object":
    # The column was read as strings; keep only purely numeric entries.
    df = df[df["start"].str.isnumeric()]
# assign() returns a new frame, which avoids setting a column on a filtered
# slice (SettingWithCopyWarning) and keeps the column in its original position.
df = df.assign(start=df["start"].astype("int64"))

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 687625 entries, 0 to 805502
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   cluster        687625 non-null  object
 1   start          687625 non-null  int64 
 2   end            687625 non-null  int64 
 3   elapsedraw     687625 non-null  int64 
 4   limit-minutes  687625 non-null  int64 
 5   cores          687625 non-null  int64 
 6   nodes          687625 non-null  int64 
 7   cpu-seconds    687625 non-null  int64 
 8   alloctres      687625 non-null  object
 9   nodelist       687625 non-null  object
 10  admincomment   687616 non-null  object
dtypes: int64(7), object(4)
memory usage: 63.0+ MB


In [9]:
def gpus_per_job(tres: str) -> int:
    """Return the number of allocated GPUs.

    Parameters
    ----------
    tres : str
        The AllocTRES string of a job, e.g.
        "billing=8,cpu=4,gres/gpu=2,mem=16G,node=1".

    Returns
    -------
    int
        The value of the first "gres/gpu=<n>" entry, or 0 when there is no
        such entry or when `tres` is not a string (e.g. NaN/None for a job
        without an AllocTRES value).
    """
    # A missing AllocTRES is read by pandas as NaN (a float); treat it as no GPUs
    # instead of letting the regex raise TypeError.
    if not isinstance(tres, str):
        return 0
    match = re.search(r"gres/gpu=(\d+)", tres)
    return int(match.group(1)) if match else 0

In [10]:
# Number of GPUs allocated to each job, parsed from AllocTRES.
df["gpus"] = df["alloctres"].apply(gpus_per_job)
# GPU-seconds = elapsed seconds x allocated GPUs. Both columns are integers, so
# a vectorized product gives the same values as a row-wise apply without
# calling a Python function for every row.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]

In [11]:
# Replace each raw AdminComment value with the result of get_stats_dict (imported
# from the local `efficiency` module). Presumably this decodes the job-stats
# payload into a dict -- confirm against efficiency.py.
df["admincomment"] = df["admincomment"].apply(get_stats_dict)

## CPU and GPU efficiency

In [12]:
# The sacct query above did not request a job id, but the efficiency helpers
# take one as an argument; use -1 as a placeholder for every job.
df["jobid"] = -1

In [13]:
def _cpu_eff_for_row(row):
    """Call cpu_efficiency with the fields of a single job row."""
    return cpu_efficiency(row["admincomment"],
                          row["elapsedraw"],
                          row["jobid"],
                          row["cluster"],
                          single=True)


# One cpu_efficiency result per job; reduced to a plain number in a later cell.
df["cpu-eff-tuple"] = df.apply(_cpu_eff_for_row, axis="columns")

In [14]:
def clean_eff_tuple(tpl):
    """Reduce an (efficiency, error_code) pair to the efficiency alone.

    Returns NaN when the error code is truthy, otherwise the efficiency value
    unchanged.
    """
    value, failed = tpl
    return np.nan if failed else value

In [15]:
# Reduce each (efficiency, error_code) tuple to the efficiency alone; jobs with
# a truthy error code become NaN.
df["cpu-eff"] = df["cpu-eff-tuple"].apply(clean_eff_tuple)

In [16]:
def _gpu_eff_for_row(row):
    """Call gpu_efficiency with the fields of a single job row."""
    return gpu_efficiency(row["admincomment"],
                          row["elapsedraw"],
                          row["jobid"],
                          row["cluster"],
                          single=True,
                          verbose=False)


# One gpu_efficiency result per job; reduced to a plain number in a later cell.
df["gpu-eff-tuple"] = df.apply(_gpu_eff_for_row, axis="columns")

In [17]:
# Reduce each (efficiency, error_code) tuple to the efficiency alone; jobs with
# a truthy error code become NaN.
df["gpu-eff"] = df["gpu-eff-tuple"].apply(clean_eff_tuple)

In [18]:
df.head(2).T

Unnamed: 0,0,1
cluster,stellar,stellar
start,1672399940,1672573066
end,1672573056,1672746193
elapsedraw,173116,173127
limit-minutes,2880,2880
cores,1536,1536
nodes,16,16
cpu-seconds,265906176,265923072
alloctres,"billing=2812,cpu=1536,mem=11250G,node=16","billing=2812,cpu=1536,mem=11250G,node=16"
nodelist,"stellar-i08n[1-5,7-17]","stellar-i08n[1-5,7-17]"


In [19]:
df[["nodes", "cores", "gpus", "cpu-eff", "gpu-eff"]].describe()

Unnamed: 0,nodes,cores,gpus,cpu-eff,gpu-eff
count,687625.0,687625.0,687625.0,269542.0,31328.0
mean,1.665389,89.656233,0.058272,62.850998,15.053001
std,5.17862,646.157794,0.246596,30.411529,27.418795
min,1.0,1.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,39.7,0.0
50%,1.0,1.0,0.0,67.3,1.4
75%,1.0,4.0,0.0,93.4,16.0
max,144.0,18432.0,4.0,99.9,100.0


## CPU and GPU memory

In [20]:
def _cpu_mem_for_row(row):
    """Call cpu_memory_usage with the fields of a single job row."""
    return cpu_memory_usage(row["admincomment"],
                            row["jobid"],
                            row["cluster"],
                            verbose=False)


# One cpu_memory_usage result per job; unpacked as (used, alloc, err) later on.
df["cpu-mem-tuple"] = df.apply(_cpu_mem_for_row, axis="columns")

CPU memory usage > 100%: -1 stellar 87600263168 31457280000
CPU memory usage > 100%: -1 stellar 55828275200 34359738368
CPU memory usage > 100%: -1 stellar 44024221696 34359738368
CPU memory usage > 100%: -1 stellar 1130527821824 1127428915200
CPU memory usage > 100%: -1 stellar 41761218560 34359738368
CPU memory usage > 100%: -1 stellar 368970825728 360777252864
CPU memory usage > 100%: -1 stellar 537174888448 536870912000


In [21]:
def _gpu_mem_for_row(row):
    """Call gpu_memory_usage_eff_tuples with the fields of a single job row."""
    return gpu_memory_usage_eff_tuples(row["admincomment"],
                                       row["jobid"],
                                       row["cluster"],
                                       verbose=False)


# One gpu_memory_usage_eff_tuples result per job; unpacked as (gpus, err) later on.
df["gpu-mem-tuple"] = df.apply(_gpu_mem_for_row, axis="columns")

In [22]:
def cpu_mem_util(tpl):
    """Return CPU memory utilization as a whole-number percentage.

    `tpl` is a (used, alloc, err) triple. The result is
    round(100 * used / alloc); NaN is returned instead when `err` is truthy,
    when `alloc` is 0, or when the rounded percentage is above 100.
    """
    mem_used, mem_alloc, failed = tpl
    if not failed and mem_alloc != 0:
        percent = round(100 * mem_used / mem_alloc)
        if percent <= 100:
            return percent
    return np.nan


def cpu_mem_alloc(tpl):
    """Return the allocated CPU memory from a (used, alloc, err) triple.

    NaN is returned when `err` is truthy; otherwise `alloc` is passed through
    unchanged.
    """
    _, mem_alloc, failed = tpl
    return np.nan if failed else mem_alloc

def gpu_mem_util(tpl):
    """Return GPU memory utilization as a whole-number percentage.

    `tpl` is a (gpus, err) pair where `gpus` is an iterable of per-GPU
    sequences whose first two items are the used and the allocated memory.
    The result is round(100 * total_used / total_alloc) over all GPUs.

    NaN is returned when `err` is truthy or when the total allocated memory is
    zero (including an empty `gpus`), mirroring the `alloc == 0` guard in
    cpu_mem_util instead of raising ZeroDivisionError.
    """
    gpus, err = tpl
    if err:
        return np.nan
    used = 0
    alloc = 0
    for gpu in gpus:
        used += gpu[0]
        alloc += gpu[1]
    if alloc == 0:
        # No GPUs reported, or nothing allocated: utilization is undefined.
        return np.nan
    return round(100 * used / alloc)

In [23]:
# Derive the scalar memory columns from the raw tuples:
# (new column, source tuple column, reducing function).
memory_columns = [
    ("cpu-mem-util", "cpu-mem-tuple", cpu_mem_util),
    ("cpu-mem-alloc", "cpu-mem-tuple", cpu_mem_alloc),
    ("gpu-mem-util", "gpu-mem-tuple", gpu_mem_util),
]
for new_col, tuple_col, reducer in memory_columns:
    df[new_col] = df[tuple_col].apply(reducer)

In [24]:
df.head(15).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
cluster,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar,stellar
start,1672399940,1672573066,1672746201,1673092206,1672177580,1672491840,1672365636,1672434522,1672435269,1672436043,1672436076,1672436114,1672437011,1672437037,1672457911
end,1672573056,1672746193,1672919328,1673265331,1672782704,1672596302,1672625137,1672694051,1672606295,1672607051,1672607085,1672607132,1672694424,1672694445,1672717423
elapsedraw,173116,173127,173127,173125,605124,104462,259501,259529,171026,171008,171009,171018,257413,257408,259512
limit-minutes,2880,2880,2880,2880,10080,1800,4320,4320,2880,2880,2880,2880,4320,4320,4320
cores,1536,1536,1536,1536,16,1920,96,960,384,384,384,384,384,384,960
nodes,16,16,16,16,1,20,1,10,4,4,4,4,4,4,10
cpu-seconds,265906176,265923072,265923072,265920000,9681984,200567040,24912096,249147840,65673984,65667072,65667456,65670912,98846592,98844672,249131520
alloctres,"billing=2812,cpu=1536,mem=11250G,node=16","billing=2812,cpu=1536,mem=11250G,node=16","billing=2812,cpu=1536,mem=11250G,node=16","billing=2812,cpu=1536,mem=11250G,node=16","billing=64,cpu=16,mem=256G,node=1","billing=1920,cpu=1920,mem=1920G,node=20","billing=175,cpu=96,mem=720000M,node=1","billing=960,cpu=960,mem=3840G,node=10","billing=703,cpu=384,mem=2812.50G,node=4","billing=703,cpu=384,mem=2812.50G,node=4","billing=703,cpu=384,mem=2812.50G,node=4","billing=703,cpu=384,mem=2812.50G,node=4","billing=703,cpu=384,mem=2812.50G,node=4","billing=703,cpu=384,mem=2812.50G,node=4","billing=960,cpu=960,mem=3840G,node=10"
nodelist,"stellar-i08n[1-5,7-17]","stellar-i08n[1-5,7-17]","stellar-i08n[1-5,7-17]","stellar-k07n[2,4-5,20-21],stellar-k08n[1-8,11-13]",stellar-k09n1,"stellar-i03n21,stellar-i04n[1-19]",stellar-i08n19,"stellar-i10n[1-2],stellar-k07n[2,4-5],stellar-...","stellar-m08n21,stellar-m09n[1-3]",stellar-m09n[14-17],stellar-i07n[1-4],stellar-i07n[5-8],stellar-i07n[9-12],stellar-i07n[13-16],"stellar-k07n1,stellar-k08n[7-15]"


In [25]:
 df[["cpu-eff",  "cpu-mem-util", "cpu-mem-alloc", "gpus", "gpu-seconds", "gpu-eff", "gpu-mem-util"]].describe()

Unnamed: 0,cpu-eff,cpu-mem-util,cpu-mem-alloc,gpus,gpu-seconds,gpu-eff,gpu-mem-util
count,269542.0,269471.0,269542.0,687625.0,687625.0,31328.0,31328.0
mean,62.850998,16.907745,1035.791042,0.058272,160.266311,15.053001,73.01213
std,30.411529,26.192279,4253.584018,0.246596,4788.305164,27.418795,31.424176
min,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,39.7,0.0,16.0,0.0,0.0,0.0,42.0
50%,67.3,6.0,16.0,0.0,0.0,1.4,92.0
75%,93.4,17.0,703.0,0.0,0.0,16.0,92.0
max,99.9,100.0,72000.0,4.0,656678.0,100.0,100.0


In [26]:
df["cpu-seconds"].sum() / 3600

262750797.77222222

In [27]:
df["gpu-seconds"].sum() / 3600

30611.978333333333

## Node type

In [28]:
def expand_nodelist(nodes: int, nodelist: str) -> list:
    """Convert a nodelist from sacct to a Python list of node names. For
       example: della-l07g[4-7],della-l08g2 becomes
       ['della-l07g4', 'della-l07g5', 'della-l07g6', 'della-l07g7', 'della-l08g2']

       Single-node jobs (nodes == 1) are returned as [nodelist] without
       running any command. Otherwise `scontrol show hostname` performs the
       expansion, so scontrol must be available on the machine running this.

       Returns None, after printing a message, when scontrol cannot be run,
       exits with a non-zero status or takes longer than 5 seconds.
    """
    if nodes == 1:
        return [nodelist]
    # Pass the arguments as a list (no shell) so that the brackets and commas in
    # the nodelist are handed to scontrol verbatim instead of being open to
    # shell globbing or injection.
    cmd = ["scontrol", "show", "hostname", nodelist]
    try:
        output = subprocess.run(cmd,
                                stdout=subprocess.PIPE,
                                timeout=5,
                                text=True,
                                check=True)
    except (subprocess.SubprocessError, OSError):
        # SubprocessError covers a non-zero exit and the timeout; OSError covers
        # a missing scontrol binary. KeyboardInterrupt is deliberately not
        # caught so a long run over many jobs can still be interrupted.
        print(f"Failed to get nodes for {nodelist}")
        return None
    return output.stdout.strip().split("\n")

In [29]:
# Expand each job's compact nodelist into a Python list of node names. Multi-node
# jobs invoke `scontrol` once per row, so this cell must run where scontrol is available.
df["nodelist-expanded"] = df.apply(lambda row: expand_nodelist(row["nodes"], row["nodelist"]), axis="columns")

In [30]:
# Number of AMD nodes in each stellar-m0<rack> rack; the nodes in a rack are
# numbered n1..n<count>.
amd_nodes_per_rack = {1: 20, 2: 34, 3: 34, 4: 32, 5: 33, 6: 34}
# Set of all AMD node names, e.g. "stellar-m01n1".
amd = {
    f"stellar-m0{rack}n{idx}"
    for rack, count in amd_nodes_per_rack.items()
    for idx in range(1, count + 1)
}

In [31]:
print(len(amd))

187


In [32]:
def node_type(nodelist, gpus, jobid):
    """Classify a job as "bigmem", "gpu", "amd" or "intel".

    Parameters
    ----------
    nodelist : list of str or None
        Expanded list of node names the job ran on.
    gpus : int
        Number of GPUs allocated to the job.
    jobid
        Only used in the diagnostic printouts.

    Returns
    -------
    str or None
        "bigmem" when the job ran on exactly ["stellar-bigmem"]; "gpu" when
        GPUs were allocated or the job ran on a single stellar-m01g1..6 node;
        "amd" when at least one node is in the module-level `amd` set;
        otherwise "intel". None is returned for a CPU job whose nodelist is
        None (the nodelist could not be expanded), rather than raising
        TypeError.

    Jobs that look like they mix node types are printed for inspection.
    """
    if nodelist == ["stellar-bigmem"]:
        return "bigmem"
    if gpus > 0:
        return "gpu"
    if nodelist is None:
        # Nothing to classify without node names.
        return None
    if any(nodelist == [f"stellar-m01g{i}"] for i in range(1, 7)):
        # alloctres was missing gres/gpu for these jobs
        return "gpu"
    joined = "".join(nodelist)
    # Direct membership test: unlike comparing len(set(nodelist) - amd) with
    # len(nodelist), duplicated node names cannot be mistaken for AMD nodes.
    if any(node in amd for node in nodelist):
        # check for intel nodes being used for amd jobs
        if "k" in joined:
            print(nodelist, jobid)
        return "amd"
    # check for amd nodes being used for intel jobs
    if any(f"m0{i}" in joined for i in range(1, 7)):
        print(nodelist, jobid)
    return "intel"

In [33]:
# Classify every job by the kind of node it ran on, using the expanded node
# names and the GPU count; jobid is only passed through for diagnostic printouts.
df["node-type"] = df.apply(lambda row: node_type(row["nodelist-expanded"], row["gpus"], row["jobid"]), axis="columns")

In [34]:
df["node-type"].value_counts()

node-type
amd       487500
intel     160651
gpu        38442
bigmem      1032
Name: count, dtype: int64

## Write to JSON

In [35]:
# Columns to export, in output order; the intermediate tuple columns and the
# raw alloctres/nodelist/admincomment fields are left out.
cols = [
    "cluster",
    "start",
    "end",
    "elapsedraw",
    "limit-minutes",
    "cores",
    "nodes",
    "cpu-seconds",
    "cpu-eff",
    "cpu-mem-util",
    "cpu-mem-alloc",
    "gpus",
    "gpu-seconds",
    "gpu-eff",
    "gpu-mem-util",
    "node-type",
]
export = df[cols]
export.to_json("stellar_2023.json")

## Definitions

<pre>end minus start is the run time in seconds (this should equal elapsedraw)
limit-minutes is the run time limit in minutes
cpu-seconds is the number of CPU-cores multiplied by elapsedraw
cpu-eff is the CPU efficiency or CPU utilization (varies from 0-100%)
cpu-mem-util is the CPU memory utilization (used/allocated) rounded to a whole percent; it varies from 0-100% (jobs whose rounded value exceeds 100% are recorded as missing)
cpu-mem-alloc is the total allocated CPU memory in GB for the job
gpus is the number of GPUs allocated for the job
gpu-seconds is the number of GPUs multiplied by elapsedraw
gpu-eff is the GPU efficiency or GPU utilization (varies from 0-100%)
gpu-mem-util is the GPU memory utilization (varies from 0-100%)</pre>