# Tally slurm total GPU hours for an account

This assumes that you have run

```bash
sacct -p pli-c --allusers --json > sacct_pli.json    
sacct -p pli --allusers --json > sacct_other.json    
```

on the server, printing out all jobs for the `pli` account.

Or for a larger time window, add `-S 2024-01-01 `

In [141]:
%load_ext autoreload
%autoreload 2

import json
from pathlib import Path
from pandas import DataFrame as DF
import matplotlib.pyplot as plt
from slurm_analyzer import SLURMAnalyzer
import seaborn as sns
import pandas as pd
from datetime import datetime
import tabulate

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [142]:
dpath = Path("..", 'data')
assert dpath.is_dir()


In [143]:
df = pd.concat([
    SLURMAnalyzer().parse(json.loads((dpath / "pli_7.json").read_text())),
    SLURMAnalyzer().parse(json.loads((dpath / "pli_c_7.json").read_text()))
])

In [163]:
# clean
# remove strange values (negative wait times etc.) by requiring that jobs run at least 10 min
df = df.query("elapsed > 600")

In [178]:
def by_time(df, title=""):
    tab = []

    def add(label, query):
        sdf = df.query(query)
        tab.append((label, sdf.wait_time_h.mean(), len(sdf)))
    
    add("Last 7 days", "age_days <= 7")
    add("Last 30 days", "age_days <= 30")
    add("Last 60 days", "age_days <= 60")

    if title:
        print(title)
    print(tabulate.tabulate(tab, headers=["Period", "Wait time (h)", "Jobs"]))


# Core partition

## Large jobs 

In [181]:
for nodes in [1, 2, 4]:
    by_time(df.query(f"partition == 'pli-c' and {nodes} <= n_nodes and elapsed_h > 24"), f">= {nodes} nodes, >= 24h runtime")
    print()

>= 1 nodes, >= 24h runtime
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           5.10251      61
Last 30 days          4.65745     154
Last 60 days          7.12816     443

>= 2 nodes, >= 24h runtime
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           17.2819      15
Last 30 days          14.4519      46
Last 60 days          19.0333     109

>= 4 nodes, >= 24h runtime
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days        0.00486111       2
Last 30 days      12.1466          21
Last 60 days       8.69215         37



## Smaller jobs

In [192]:
for t in [1, 10, 24]:
    by_time(df.query(f"partition == 'pli-c' and gpu_time_h >= {t}"), f">= {t} GPU hours")

>= 1 GPU hours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           2.90843     794
Last 30 days          9.07424    7445
Last 60 days          9.86799   13413
>= 10 GPU hours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           7.33164     248
Last 30 days          4.45485    1007
Last 60 days          4.30135    1963
>= 24 GPU hours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           7.4221      149
Last 30 days          5.97688     545
Last 60 days          5.7936     1084


# Campus partition

In [190]:
for t in [1, 10, 24]:
    by_time(df.query(f"partition == 'pli-c' and gpu_time_h >= {t}"), f">= {t} GPU ours")

>= 1 GPU ours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           2.90843     794
Last 30 days          9.07424    7445
Last 60 days          9.86799   13413
>= 10 GPU ours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           7.33164     248
Last 30 days          4.45485    1007
Last 60 days          4.30135    1963
>= 24 GPU ours
Period          Wait time (h)    Jobs
------------  ---------------  ------
Last 7 days           7.4221      149
Last 30 days          5.97688     545
Last 60 days          5.7936     1084
