# Cleaning sacct data for Tiger

<pre>$ ssh tiger
$ export SLURM_TIME_FORMAT="%s"
$ sacct -M tiger2 -a -X -P -S 2023-01-01T00:00:00 -E 2023-12-31T23:59:59 -o cluster,start,end,elapsedraw,timelimitraw,ncpus,nnodes,cputimeraw,alloctres,nodelist,admincomment > tiger.2023</pre>

## Working directory

On della, see /home/jdh4/wentzlaff_job_data_2023

## Cleaning

In [1]:
import os
import re
import subprocess
import numpy as np
import pandas as pd
from efficiency import get_stats_dict
from efficiency import cpu_efficiency
from efficiency import gpu_efficiency
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# Load the pipe-delimited sacct dump written by the command at the top of this
# notebook (sacct -P ... > tiger.2023), then preview the first two jobs.
# The preview is transposed so that each sacct field is shown on its own row.
df = pd.read_csv("tiger.2023", sep="|")
df.head(2).T

Unnamed: 0,0,1
Cluster,tiger2,tiger2
Start,1672344064.0,1672356142.0
End,1672581955,1672576343
ElapsedRaw,237891,220201
TimelimitRaw,4260,4260
NCPUS,2,2
NNodes,1,1
CPUTimeRAW,475782,440402
AllocTRES,"billing=25,cpu=2,mem=120G,node=1","billing=21,cpu=2,mem=100G,node=1"
NodeList,tiger-i26c2n9,tiger-i26c2n10


In [3]:
# Normalize the sacct header names (e.g. "ElapsedRaw") to lower case
df.columns = list(map(str.lower, df.columns))

In [4]:
# Map lower-cased sacct field names to the column names used in the rest of
# this notebook. Keys that are not present in df are ignored by rename().
renamings = {
    "user": "netid",
    "cputimeraw": "cpu-seconds",
    "nnodes": "nodes",
    "ncpus": "cores",
    "timelimitraw": "limit-minutes",
}
df = df.rename(columns=renamings)

In [5]:
# Column dtypes and non-null counts before any cleaning is applied
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 857281 entries, 0 to 857280
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   cluster        857281 non-null  object 
 1   start          804992 non-null  float64
 2   end            857281 non-null  int64  
 3   elapsedraw     857281 non-null  int64  
 4   limit-minutes  857281 non-null  int64  
 5   cores          857281 non-null  int64  
 6   nodes          857281 non-null  int64  
 7   cpu-seconds    857281 non-null  int64  
 8   alloctres      804754 non-null  object 
 9   nodelist       857281 non-null  object 
 10  admincomment   804973 non-null  object 
dtypes: float64(1), int64(6), object(4)
memory usage: 71.9+ MB


In [6]:
# When pandas could not parse elapsedraw as numbers the column arrives as
# strings; in that case keep only rows whose value is present and all digits.
if df["elapsedraw"].dtype == 'object':
    df = df[pd.notna(df["elapsedraw"])]
    df = df[df["elapsedraw"].str.isnumeric()]
df["elapsedraw"] = df["elapsedraw"].astype("int64")
# Discard jobs that accumulated no run time
df = df[df["elapsedraw"] > 0]

In [7]:
# Drop rows with a missing start time regardless of dtype. When the column is
# parsed as float64 the missing values are NaN, and astype("int64") raises on
# NaN, so relying on an earlier filter to have removed those rows is fragile.
df = df[pd.notna(df["start"])]
if df["start"].dtype == 'object':
    # clean start field: keep only values made up entirely of digits
    df = df[df["start"].str.isnumeric()]
df["start"] = df["start"].astype("int64")

In [8]:
# Re-check dtypes and non-null counts after the elapsedraw/start cleaning
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 765425 entries, 0 to 857280
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   cluster        765425 non-null  object
 1   start          765425 non-null  int64 
 2   end            765425 non-null  int64 
 3   elapsedraw     765425 non-null  int64 
 4   limit-minutes  765425 non-null  int64 
 5   cores          765425 non-null  int64 
 6   nodes          765425 non-null  int64 
 7   cpu-seconds    765425 non-null  int64 
 8   alloctres      765425 non-null  object
 9   nodelist       765425 non-null  object
 10  admincomment   765406 non-null  object
dtypes: int64(7), object(4)
memory usage: 70.1+ MB


In [9]:
def gpus_per_job(tres: str) -> int:
    """Return the number of allocated GPUs.

    Args:
        tres: The AllocTRES string reported by sacct, for example
            "billing=8,cpu=4,gres/gpu=2,mem=16G,node=1". A missing value
            (None, or the float NaN pandas uses for an empty field) is
            accepted and treated as "no GPUs".

    Returns:
        The integer that follows the first "gres/gpu=" entry, or 0 when
        there is no such entry or the input is not a string.
    """
    if not isinstance(tres, str):
        # An empty AllocTRES field is read by pandas as NaN (a float); passing
        # that to the regex would raise TypeError.
        return 0
    match = re.search(r"gres/gpu=(\d+)", tres)
    return int(match.group(1)) if match else 0

In [10]:
# Number of GPUs allocated to each job, parsed from its AllocTRES string
df["gpus"] = df["alloctres"].apply(gpus_per_job)
# GPU-seconds = elapsed seconds x allocated GPUs. A column-wise product gives
# the same integer result as a row-by-row apply without a Python call per job.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]

In [11]:
# Replace each raw admincomment value with the result of efficiency.get_stats_dict.
# NOTE(review): presumably this decodes the per-job stats payload into a dict
# that the efficiency/memory helpers below consume -- confirm in efficiency.py.
df["admincomment"] = df["admincomment"].apply(get_stats_dict)

## CPU and GPU efficiency

In [12]:
# jobid was not among the fields requested from sacct, but the efficiency
# helpers called below are passed a jobid argument, so use a placeholder.
# NOTE(review): assumes the helpers only use jobid for messages -- confirm.
df["jobid"] = -1

In [13]:
# Row by row, hand the job's stats and elapsed time to efficiency.cpu_efficiency
# and keep whatever it returns (unpacked later by clean_eff_tuple).
df["cpu-eff-tuple"] = df.apply(
    lambda job: cpu_efficiency(
        job["admincomment"],
        job["elapsedraw"],
        job["jobid"],
        job["cluster"],
        single=True,
    ),
    axis=1,
)

In [14]:
def clean_eff_tuple(tpl):
    """Collapse an (efficiency, error_code) pair into a single value.

    Returns the efficiency when error_code is falsy; otherwise returns NaN
    so that failed calculations are excluded from later statistics.
    """
    value, failed = tpl
    return np.nan if failed else value

In [15]:
# Keep only the efficiency value; tuples that carry an error code become NaN
df["cpu-eff"] = df["cpu-eff-tuple"].apply(clean_eff_tuple)

In [16]:
# Row by row, hand the job's stats and elapsed time to efficiency.gpu_efficiency
# and keep whatever it returns (unpacked later by clean_eff_tuple).
df["gpu-eff-tuple"] = df.apply(
    lambda job: gpu_efficiency(
        job["admincomment"],
        job["elapsedraw"],
        job["jobid"],
        job["cluster"],
        single=True,
        verbose=False,
    ),
    axis=1,
)

In [17]:
# Keep only the efficiency value; tuples that carry an error code become NaN
df["gpu-eff"] = df["gpu-eff-tuple"].apply(clean_eff_tuple)

In [18]:
# Preview the first two jobs after adding the efficiency columns (transposed)
df.head(2).T

Unnamed: 0,0,1
cluster,tiger2,tiger2
start,1672344064,1672356142
end,1672581955,1672576343
elapsedraw,237891,220201
limit-minutes,4260,4260
cores,2,2
nodes,1,1
cpu-seconds,475782,440402
alloctres,"billing=25,cpu=2,mem=120G,node=1","billing=21,cpu=2,mem=100G,node=1"
nodelist,tiger-i26c2n9,tiger-i26c2n10


In [19]:
# Summary statistics; the count row shows how many jobs have a usable
# (non-NaN) cpu-eff / gpu-eff value.
df[["nodes", "cores", "gpus", "cpu-eff", "gpu-eff"]].describe()

Unnamed: 0,nodes,cores,gpus,cpu-eff,gpu-eff
count,765425.0,765425.0,765425.0,341293.0,1203.0
mean,1.890835,42.144943,0.034578,61.636323,81.481712
std,3.767816,157.497636,0.905928,34.405438,11.100336
min,1.0,1.0,0.0,0.0,0.0
25%,1.0,1.0,0.0,30.0,80.3
50%,1.0,1.0,0.0,69.5,86.6
75%,1.0,4.0,0.0,94.7,87.2
max,128.0,5120.0,24.0,99.9,95.2


## CPU and GPU memory

In [20]:
# Row by row, ask efficiency.cpu_memory_usage for the job's CPU memory figures;
# the returned tuple is unpacked later by cpu_mem_util and cpu_mem_alloc.
df["cpu-mem-tuple"] = df.apply(
    lambda job: cpu_memory_usage(
        job["admincomment"],
        job["jobid"],
        job["cluster"],
        verbose=False,
    ),
    axis=1,
)

In [21]:
# Row by row, ask efficiency.gpu_memory_usage_eff_tuples for the job's per-GPU
# memory figures; the returned tuple is unpacked later by gpu_mem_util.
df["gpu-mem-tuple"] = df.apply(
    lambda job: gpu_memory_usage_eff_tuples(
        job["admincomment"],
        job["jobid"],
        job["cluster"],
        verbose=False,
    ),
    axis=1,
)

In [22]:
def cpu_mem_util(tpl):
    """Return CPU memory utilization as a whole-number percentage.

    tpl is a (used, allocated, error) triple. NaN is returned when the error
    flag is set, when the allocation is zero, or when the computed percentage
    exceeds 100.
    """
    used, alloc, err = tpl
    if not err and alloc != 0:
        percent = round(100 * used / alloc)
        if percent <= 100:
            return percent
    return np.nan


def cpu_mem_alloc(tpl):
    """Return the allocated CPU memory from a (used, allocated, error) triple.

    NaN is returned when the error flag is set.
    """
    _, allocated, failed = tpl
    return np.nan if failed else allocated

def gpu_mem_util(tpl):
    """Return overall GPU memory utilization as a whole-number percentage.

    Args:
        tpl: A (gpus, error) pair. gpus is an iterable with one entry per
            GPU; element 0 of each entry is summed as "used" and element 1
            as "allocated".
            NOTE(review): presumably both are in the same memory unit --
            confirm against efficiency.gpu_memory_usage_eff_tuples.

    Returns:
        round(100 * total_used / total_allocated), or NaN when the error
        flag is set or the total allocation is zero (including an empty
        GPU list), mirroring the zero-allocation guard in cpu_mem_util.
    """
    gpus, err = tpl
    if err:
        return np.nan
    used = sum(gpu[0] for gpu in gpus)
    alloc = sum(gpu[1] for gpu in gpus)
    if alloc == 0:
        # Nothing was allocated (or no GPUs were reported); a percentage is
        # undefined, so report NaN rather than dividing by zero.
        return np.nan
    return round(100 * used / alloc)

In [23]:
# Reduce the raw memory tuples to scalar columns:
#   cpu-mem-util  -> used/allocated CPU memory as a percentage (NaN on error)
#   cpu-mem-alloc -> allocated CPU memory (NaN on error)
#   gpu-mem-util  -> used/allocated GPU memory summed over the job's GPUs
df["cpu-mem-util"] = df["cpu-mem-tuple"].apply(cpu_mem_util)
df["cpu-mem-alloc"] = df["cpu-mem-tuple"].apply(cpu_mem_alloc)
df["gpu-mem-util"] = df["gpu-mem-tuple"].apply(gpu_mem_util)

In [24]:
# Preview the first 15 jobs (transposed so each column is shown as a row)
df.head(15).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
cluster,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2,tiger2
start,1672344064,1672356142,1672336647,1672444889,1672316882,1672318017,1672467126,1672336697,1672336704,1672336707,1672345072,1672345440,1672347997,1672350191,1672350191
end,1672581955,1672576343,1672853505,1672703796,1672608746,1672642893,1672567561,1672590507,1672581101,1672577127,1672585663,1672584957,1672604244,1672592271,1672587692
elapsedraw,237891,220201,516858,258907,291864,324876,100435,253810,244397,240420,240591,239517,256247,242080,237501
limit-minutes,4260,4260,8640,4310,6719,6719,4260,4320,4320,4320,4320,4320,4320,4320,4320
cores,2,2,800,2,2,2,4,120,120,120,120,120,120,120,120
nodes,1,1,20,1,1,1,1,3,3,3,3,3,3,3,3
cpu-seconds,475782,440402,413486400,517814,583728,649752,401740,30457200,29327640,28850400,28870920,28742040,30749640,29049600,28500120
alloctres,"billing=25,cpu=2,mem=120G,node=1","billing=21,cpu=2,mem=100G,node=1","billing=800,cpu=800,mem=3125G,node=20","billing=21,cpu=2,mem=100G,node=1","billing=25,cpu=2,mem=120G,node=1","billing=25,cpu=2,mem=120G,node=1","billing=46,cpu=4,mem=220G,node=1","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3","billing=120,cpu=120,mem=480000M,node=3"
nodelist,tiger-i26c2n9,tiger-i26c2n10,"tiger-h19c2n[20-23],tiger-h20c1n[9-16],tiger-h...",tiger-h20c1n4,tiger-i26c2n13,tiger-h22c1n16,tiger-h19c2n14,tiger-h26c1n[7-9],tiger-h26c1n[12-14],tiger-h26c2n[10-12],"tiger-h19c1n7,tiger-h19c2n16,tiger-h24c2n23",tiger-h25c1n[1-3],"tiger-h25c1n4,tiger-h26c1n11,tiger-h26c2n24",tiger-h20c1n[19-21],tiger-h24c2n[12-14]


In [25]:
 # Summary statistics for the derived efficiency, memory and GPU columns
 df[["cpu-eff",  "cpu-mem-util", "cpu-mem-alloc", "gpus", "gpu-seconds", "gpu-eff", "gpu-mem-util"]].describe()

Unnamed: 0,cpu-eff,cpu-mem-util,cpu-mem-alloc,gpus,gpu-seconds,gpu-eff,gpu-mem-util
count,341293.0,341230.0,341293.0,765425.0,765425.0,1203.0,1203.0
mean,61.636323,15.667145,334.39888,0.034578,40.134958,81.481712,17.940981
std,34.405438,22.370998,870.406122,0.905928,2231.380164,11.100336,10.184439
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,0.0,16.0,0.0,0.0,80.3,7.0
50%,69.5,6.0,16.0,0.0,0.0,86.6,27.0
75%,94.7,16.0,128.0,0.0,0.0,87.2,27.0
max,99.9,100.0,19200.0,24.0,346896.0,95.2,30.0


In [26]:
# Total CPU-hours over all retained jobs (cpu-seconds / 3600)
df["cpu-seconds"].sum() / 3600

119795110.50055556

In [27]:
# Total GPU-hours over all retained jobs (gpu-seconds / 3600)
df["gpu-seconds"].sum() / 3600

8533.416666666666

## Node type

In [28]:
def expand_nodelist(nodes: int, nodelist: str) -> list:
    """Convert a nodelist from sacct to a Python list of node names. For
       example: della-l07g[4-7],della-l08g2 becomes
       ['della-l07g4', 'della-l07g5', 'della-l07g6', 'della-l07g7', 'della-l08g2']

       Args:
           nodes: Number of nodes allocated to the job. When this is 1 the
               nodelist is returned as a one-element list without calling
               scontrol.
           nodelist: The NodeList string reported by sacct.

       Returns:
           A list of node names, or None if "scontrol show hostname" could
           not be run, exited with a non-zero status or exceeded the
           5-second timeout (a message is printed in that case).
    """
    if nodes == 1:
        return [nodelist]
    # Pass the arguments as a list and do not go through a shell: the square
    # brackets in a nodelist would otherwise be subject to shell globbing, and
    # an unexpected value could be interpreted as shell syntax.
    cmd = ["scontrol", "show", "hostname", nodelist]
    try:
        output = subprocess.run(cmd,
                                stdout=subprocess.PIPE,
                                timeout=5,
                                text=True,
                                check=True)
    except (subprocess.SubprocessError, OSError):
        # SubprocessError covers a non-zero exit status and the timeout;
        # OSError covers scontrol not being installed. KeyboardInterrupt is
        # deliberately not caught so a long run can still be interrupted.
        print(f"Failed to get nodes for {nodelist}")
        return None
    return output.stdout.strip().split("\n")

In [29]:
# Expand every job's nodelist into a Python list of node names. Multi-node
# jobs each invoke "scontrol show hostname", so this cell must run on a host
# where scontrol is available and can take a long time.
df["nodelist-expanded"] = df.apply(lambda row: expand_nodelist(row["nodes"], row["nodelist"]), axis="columns")

In [30]:
# Set of the sixteen node names tiger-i19g1 ... tiger-i19g16
# (presumably the P100 GPU nodes, going by the variable name)
p100 = {f"tiger-i19g{idx}" for idx in range(1, 17)}

In [31]:
# Sanity check: number of node names in the p100 set
print(len(p100))

16


In [32]:
def node_type(nodelist, gpus, jobid):
    """Classify a job as "gpu" or "cpu".

    A job is "gpu" when it allocated at least one GPU, or when removing the
    p100 node names from its node list leaves fewer entries than the list
    had. Every other job is "cpu".

    As a diagnostic, the node list and jobid are printed when the job looks
    mixed. Node names are told apart by how many times the letter "g"
    occurs: names in p100 ("tiger-i19gN") contain it twice, whereas names
    such as "tiger-h19c2n20" contain it once.
    """
    if gpus > 0:
        return "gpu"
    remaining = set(nodelist) - p100
    touches_p100 = len(remaining) < len(nodelist)
    # For a job on p100 nodes, flag any single-"g" (cpu) node in the list;
    # for a job off the p100 nodes, flag any double-"g" (gpu) node.
    suspicious_g_count = 1 if touches_p100 else 2
    if any(name.count("g") == suspicious_g_count for name in nodelist):
        print(nodelist, jobid)
    return "gpu" if touches_p100 else "cpu"

In [33]:
# Label every job "gpu" or "cpu" from its expanded node list and GPU count
df["node-type"] = df.apply(lambda row: node_type(row["nodelist-expanded"], row["gpus"], row["jobid"]), axis="columns")

In [34]:
# Number of jobs in each node-type category
df["node-type"].value_counts()

node-type
cpu    764205
gpu      1220
Name: count, dtype: int64

## Write to JSON

In [35]:
# Columns to export; intermediate columns (raw tuples, admincomment, node
# lists, placeholder jobid) are intentionally left out.
cols = [
    "cluster",
    "start",
    "end",
    "elapsedraw",
    "limit-minutes",
    "cores",
    "nodes",
    "cpu-seconds",
    "cpu-eff",
    "cpu-mem-util",
    "cpu-mem-alloc",
    "gpus",
    "gpu-seconds",
    "gpu-eff",
    "gpu-mem-util",
    "node-type",
]
df[cols].to_json("tiger_2023.json")

## Definitions

<pre>end minus start is the run time in seconds (this should equal elapsedraw)
limit-minutes is the run time limit in minutes
cpu-seconds is the number of CPU-cores multiplied by elapsedraw
cpu-eff is the CPU efficiency or CPU utilization (varies from 0-100%)
cpu-mem-util is the CPU memory utilization (used/allocated); it varies from 0-100%
cpu-mem-alloc is the total allocated CPU memory in GB for the job
gpus is the number of GPUs allocated for the job
gpu-seconds is the number of GPUs multiplied by elapsedraw
gpu-eff is the GPU efficiency or GPU utilization (varies from 0-100%)
gpu-mem-util is the GPU memory utilization (varies from 0-100%)
node-type is "gpu" if the job allocated GPUs or ran on one of the GPU nodes, otherwise "cpu"</pre>