# Cleaning sacct data for Traverse

<pre>$ ssh traverse
$ export SLURM_TIME_FORMAT="%s"
$ sacct -M traverse -a -X -P -S 2023-01-01T00:00:00 -E 2023-12-31T23:59:59 -o cluster,start,end,elapsedraw,timelimitraw,ncpus,nnodes,cputimeraw,alloctres,nodelist,admincomment > traverse.2023</pre>

## Working directory

On della, see /home/jdh4/wentzlaff_job_data_2023

## Cleaning

In [1]:
import os
import re
import subprocess
import numpy as np
import pandas as pd
from efficiency import get_stats_dict
from efficiency import cpu_efficiency
from efficiency import gpu_efficiency
from efficiency import cpu_memory_usage
from efficiency import gpu_memory_usage_eff_tuples

In [2]:
# "traverse.2023" is the file written by the sacct command shown at the top of
# this notebook; its fields are separated by "|", hence sep="|".
df = pd.read_csv("traverse.2023", sep="|")
# Transpose a two-row sample so every column is visible as its own line.
df.head(2).T

Unnamed: 0,0,1
Cluster,traverse,traverse
Start,1672557622.0,1672557763.0
End,1672557643,1672557770
ElapsedRaw,21,7
TimelimitRaw,2880,2880
NCPUS,4,4
NNodes,1,1
CPUTimeRAW,84,28
AllocTRES,"billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1"
NodeList,traverse-k05g10,traverse-k05g10


In [3]:
# Normalize sacct's mixed-case field names (e.g. "CPUTimeRAW") to lower case.
df.columns = df.columns.str.lower()

In [4]:
# Map the lower-cased sacct field names to the column names used in the rest
# of this notebook.  Keys with no matching column are ignored by rename().
renamings = {
    "user": "netid",
    "cputimeraw": "cpu-seconds",
    "nnodes": "nodes",
    "ncpus": "cores",
    "timelimitraw": "limit-minutes",
}
df = df.rename(columns=renamings)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86994 entries, 0 to 86993
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   cluster        86994 non-null  object 
 1   start          82644 non-null  float64
 2   end            86994 non-null  int64  
 3   elapsedraw     86994 non-null  int64  
 4   limit-minutes  86994 non-null  int64  
 5   cores          86994 non-null  int64  
 6   nodes          86994 non-null  int64  
 7   cpu-seconds    86994 non-null  int64  
 8   alloctres      82460 non-null  object 
 9   nodelist       86994 non-null  object 
 10  admincomment   82614 non-null  object 
dtypes: float64(1), int64(6), object(4)
memory usage: 7.3+ MB


In [6]:
# pandas falls back to the object dtype when the column holds anything that is
# not a plain integer; in that case discard the offending rows first.
if df["elapsedraw"].dtype == 'object':
    # missing values must go first: .str.isnumeric() yields NaN for them,
    # which cannot be used as a boolean mask
    df = df[df["elapsedraw"].notna()]
    df = df[df["elapsedraw"].str.isnumeric()]
df = df.astype({"elapsedraw": "int64"})
# keep only jobs that actually ran for a non-zero amount of time
df = df[df["elapsedraw"] > 0]

In [7]:
# Drop rows without a start time regardless of how pandas typed the column.
# When "start" is parsed as float64 (numeric values plus empty fields) the
# object-dtype branch below is skipped, and any NaN still present would make
# the int64 cast raise.
df = df[pd.notna(df["start"])]
if df["start"].dtype == 'object':
    # clean start field: keep only rows whose value consists of digits
    df = df[df["start"].str.isnumeric()]
# astype() returns a new frame, which avoids assigning into a filtered slice.
df = df.astype({"start": "int64"})

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 82318 entries, 0 to 86993
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   cluster        82318 non-null  object
 1   start          82318 non-null  int64 
 2   end            82318 non-null  int64 
 3   elapsedraw     82318 non-null  int64 
 4   limit-minutes  82318 non-null  int64 
 5   cores          82318 non-null  int64 
 6   nodes          82318 non-null  int64 
 7   cpu-seconds    82318 non-null  int64 
 8   alloctres      82318 non-null  object
 9   nodelist       82318 non-null  object
 10  admincomment   82288 non-null  object
dtypes: int64(7), object(4)
memory usage: 7.5+ MB


In [9]:
def gpus_per_job(tres: str) -> int:
    """Return the number of GPUs allocated to a job.

    Parameters
    ----------
    tres : str
        The AllocTRES field, e.g.
        "billing=32,cpu=4,gres/gpu=1,mem=64G,node=1".  Non-string values
        (NaN or None, as pandas produces for an empty field) are accepted.

    Returns
    -------
    int
        The integer following the first "gres/gpu=" entry, or 0 when the
        field is missing or contains no such entry.
    """
    # An empty field is read by pandas as NaN (a float); handing that to the
    # regex would raise TypeError, so treat it as "no GPUs".
    if not isinstance(tres, str):
        return 0
    match = re.search(r"gres/gpu=(\d+)", tres)
    return int(match.group(1)) if match else 0

In [10]:
# Number of GPUs per job, parsed from the AllocTRES string.
df["gpus"] = df["alloctres"].apply(gpus_per_job)
# GPU-seconds = elapsed seconds x allocated GPUs.  Both operands are integer
# columns, so a vectorized product yields the same values as a row-wise
# apply without constructing a Series object for every row.
df["gpu-seconds"] = df["elapsedraw"] * df["gpus"]

In [11]:
# Replace each raw AdminComment value with the result of get_stats_dict (from
# the local efficiency module).  The column still contains some missing values
# at this point, and those are handed to the helper as well.
# NOTE(review): the exact return type of get_stats_dict is not visible here.
df["admincomment"] = df["admincomment"].apply(get_stats_dict)

## CPU and GPU efficiency

In [12]:
# The sacct query at the top of this notebook did not request a job ID field,
# but the efficiency helpers called below take one as an argument, so a
# placeholder value is supplied.
# NOTE(review): assumes the helpers use jobid only for reporting -- confirm
# in the efficiency module.
df["jobid"] = -1

In [13]:
# Compute the CPU efficiency of every job, one row at a time, with the helper
# imported from the local efficiency module.  Each result is kept as a tuple;
# clean_eff_tuple later unpacks it as (efficiency, error_code).
df["cpu-eff-tuple"] = df.apply(lambda row: cpu_efficiency(row["admincomment"],
                                                          row["elapsedraw"],
                                                          row["jobid"],
                                                          row["cluster"],
                                                          single=True),
                                                          axis="columns")



In [14]:
def clean_eff_tuple(tpl):
    """Collapse an (efficiency, error_code) pair into a single value.

    Returns the efficiency when error_code is falsy and NaN otherwise, so
    that failed calculations are excluded from numeric summaries.
    """
    value, failed = tpl
    return np.nan if failed else value

In [15]:
df["cpu-eff"] = df["cpu-eff-tuple"].apply(clean_eff_tuple)

In [16]:
# Compute the GPU efficiency of every job, one row at a time, with the helper
# imported from the local efficiency module (verbose output disabled).  Each
# result is kept as a tuple; clean_eff_tuple later unpacks it as
# (efficiency, error_code).
# NOTE(review): jobs without GPUs presumably come back with a non-zero error
# code and therefore become NaN in "gpu-eff" -- confirm in the helper.
df["gpu-eff-tuple"] = df.apply(lambda row: gpu_efficiency(row["admincomment"],
                                                          row["elapsedraw"],
                                                          row["jobid"],
                                                          row["cluster"],
                                                          single=True,
                                                          verbose=False),
                                                          axis="columns")

In [17]:
df["gpu-eff"] = df["gpu-eff-tuple"].apply(clean_eff_tuple)

In [18]:
df.head(2).T

Unnamed: 0,0,1
cluster,traverse,traverse
start,1672557622,1672557763
end,1672557643,1672557770
elapsedraw,21,7
limit-minutes,2880,2880
cores,4,4
nodes,1,1
cpu-seconds,84,28
alloctres,"billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1"
nodelist,traverse-k05g10,traverse-k05g10


In [19]:
df[["cores", "gpus", "cpu-eff", "gpu-eff"]].describe()

Unnamed: 0,cores,gpus,cpu-eff,gpu-eff
count,82318.0,82318.0,59534.0,57594.0
mean,62.364975,4.298064,52.807078,25.282653
std,220.275712,6.965476,29.763709,31.291436
min,4.0,0.0,0.0,0.0
25%,4.0,2.0,36.3,1.6
50%,4.0,4.0,49.8,14.8
75%,32.0,4.0,84.8,33.5
max,4096.0,128.0,100.0,100.0


## CPU and GPU memory

In [20]:
# Compute CPU memory usage for every job, one row at a time.  Each result is
# a tuple that cpu_mem_util / cpu_mem_alloc unpack as (used, alloc, err).
# NOTE(review): the saved output of this cell consists only of "IOPub message
# rate exceeded" notices, which suggests something is printed for a large
# number of rows; consider running the cell under %%capture.
df["cpu-mem-tuple"] = df.apply(lambda row: cpu_memory_usage(row["admincomment"],
                                                            row["jobid"],
                                                            row["cluster"]),
                                                            axis="columns")



IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [21]:
# Compute GPU memory usage for every job, one row at a time.  Each result is
# a tuple that gpu_mem_util unpacks as (gpus, err), where gpus is a sequence
# of per-GPU tuples whose first two items are read as used and allocated
# memory.
# NOTE(review): the saved output of this cell consists only of "IOPub message
# rate exceeded" notices, which suggests something is printed for a large
# number of rows; consider running the cell under %%capture.
df["gpu-mem-tuple"] = df.apply(lambda row: gpu_memory_usage_eff_tuples(row["admincomment"],
                                                                       row["jobid"],
                                                                       row["cluster"]),
                                                                       axis="columns")



IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)





In [22]:
def cpu_mem_util(tpl):
    """Return CPU memory utilization as a whole-number percentage.

    Parameters
    ----------
    tpl : tuple
        A (used, alloc, err) triple where used and alloc are in the same
        units and err is truthy when the measurement failed.

    Returns
    -------
    int or float
        round(100 * used / alloc), or NaN when err is truthy or alloc is
        zero.  The value is not clamped, so it exceeds 100 whenever used is
        larger than alloc.
    """
    used, alloc, err = tpl
    # A zero allocation has no meaningful utilization and would otherwise
    # raise ZeroDivisionError, aborting the whole column computation.
    if err or not alloc:
        return np.nan
    return round(100 * used / alloc)


def cpu_mem_alloc(tpl):
    """Extract the allocated CPU memory from a (used, alloc, err) triple.

    Returns alloc unchanged when err is falsy and NaN otherwise.
    """
    _, allocated, failed = tpl
    return np.nan if failed else allocated

def gpu_mem_util(tpl):
    """Return GPU memory utilization of a job as a whole-number percentage.

    Parameters
    ----------
    tpl : tuple
        A (gpus, err) pair.  gpus is an iterable of per-GPU sequences whose
        first item is the memory used and whose second item is the memory
        allocated; any further items are ignored.  err is truthy when the
        measurement failed.

    Returns
    -------
    int or float
        round(100 * total_used / total_alloc) over all GPUs of the job, or
        NaN when err is truthy, when gpus is empty, or when the total
        allocation is zero.
    """
    gpus, err = tpl
    if err:
        return np.nan
    used = sum(gpu[0] for gpu in gpus)
    alloc = sum(gpu[1] for gpu in gpus)
    # An empty GPU list (or all-zero allocations) would otherwise raise
    # ZeroDivisionError; report it as "unknown" instead.
    if not alloc:
        return np.nan
    return round(100 * used / alloc)

In [23]:
# Reduce the raw helper tuples to one scalar per job (NaN where the helper
# reported an error).
cpu_mem = df["cpu-mem-tuple"]
df["cpu-mem-util"] = cpu_mem.map(cpu_mem_util)
df["cpu-mem-alloc"] = cpu_mem.map(cpu_mem_alloc)
df["gpu-mem-util"] = df["gpu-mem-tuple"].map(gpu_mem_util)

In [24]:
df.head(15).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
cluster,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse,traverse
start,1672557622,1672557763,1672557815,1672557944,1672557979,1672558211,1672558255,1672576431,1672576780,1672577131,1672577460,1672578217,1672578411,1672578557,1672579424
end,1672557643,1672557770,1672557821,1672557951,1672558156,1672558245,1672559678,1672576445,1672576794,1672577172,1672577500,1672578232,1672578451,1672578599,1672579779
elapsedraw,21,7,6,7,177,34,1423,14,14,41,40,15,40,42,355
limit-minutes,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880,2880
cores,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
nodes,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
cpu-seconds,84,28,24,28,708,136,5692,56,56,164,160,60,160,168,1420
alloctres,"billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1","billing=32,cpu=4,gres/gpu=1,mem=64G,node=1"
nodelist,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10,traverse-k05g10


In [25]:
 df[["cpu-eff",  "cpu-mem-util", "cpu-mem-alloc", "gpus", "gpu-seconds", "gpu-eff", "gpu-mem-util"]].describe()

Unnamed: 0,cpu-eff,cpu-mem-util,cpu-mem-alloc,gpus,gpu-seconds,gpu-eff,gpu-mem-util
count,59534.0,59534.0,59534.0,82318.0,82318.0,57594.0,57594.0
mean,52.807078,25.056741,115.935533,4.298064,32038.07,25.282653,21.951766
std,29.763709,15.99385,439.609651,6.965476,321822.2,31.291436,30.677634
min,0.0,0.0,1.0,0.0,0.0,0.0,3.0
25%,36.3,18.0,32.0,2.0,132.0,1.6,7.0
50%,49.8,24.0,32.0,4.0,652.0,14.8,7.0
75%,84.8,31.0,40.0,4.0,4492.0,33.5,12.0
max,100.0,185.0,7812.0,128.0,10383410.0,100.0,100.0


In [26]:
df["cpu-seconds"].sum() / 3600

16801995.68

In [27]:
df["gpu-seconds"].sum() / 3600

732585.9786111111

In [28]:
# Columns written to the exported file, in output order.  The raw alloctres,
# nodelist and admincomment columns and the intermediate *-tuple columns are
# not exported.
cols = [
    "cluster",
    "start",
    "end",
    "elapsedraw",
    "limit-minutes",
    "cores",
    "nodes",
    "cpu-seconds",
    "cpu-eff",
    "cpu-mem-util",
    "cpu-mem-alloc",
    "gpus",
    "gpu-seconds",
    "gpu-eff",
    "gpu-mem-util",
]
df[cols].to_json("traverse_2023.json")

## Definitions

<pre>end minus start is the run time in seconds (this should equal elapsedraw)
limit-minutes is the run time limit in minutes
cpu-seconds is the number of CPU-cores multiplied by elapsedraw
cpu-eff is the CPU efficiency or CPU utilization (varies from 0-100%)
cpu-mem-util is the CPU memory utilization (100 x used/allocated, rounded to the nearest percent); it is expected to lie in 0-100%, but see the exception noted below
cpu-mem-alloc is the total allocated CPU memory in GB for the job
gpus is the number of GPUs allocated for the job
gpu-seconds is the number of GPUs multiplied by elapsedraw
gpu-eff is the GPU efficiency or GPU utilization (varies from 0-100%)
gpu-mem-util is the GPU memory utilization (varies from 0-100%)</pre>

One job has an erroneous CPU memory utilization greater than 100% (its recorded usage of 89 GB exceeds its 48 GB allocation):

In [29]:
df[df["cpu-mem-util"] > 100]

Unnamed: 0,cluster,start,end,elapsedraw,limit-minutes,cores,nodes,cpu-seconds,alloctres,nodelist,...,jobid,cpu-eff-tuple,cpu-eff,gpu-eff-tuple,gpu-eff,cpu-mem-tuple,gpu-mem-tuple,cpu-mem-util,cpu-mem-alloc,gpu-mem-util
82518,traverse,1701890226,1701908548,18322,300,32,1,586304,"billing=32,cpu=32,gres/gpu=1,mem=48G,node=1",traverse-k05g5,...,-1,"(48.1, 0)",48.1,"(31.1, 0)",31.1,"(89.0, 48.0, 0)","([(27.4, 32.0, 31.1)], 0)",185.0,48.0,86.0
