In [1]:
import json
import os
import yaml
from pathlib import Path
from dask.distributed import Client
import re

In [2]:
# Mode switch read by the two `if not use_local:` cells below: when True,
# app_root is derived from the current working directory and dask_scheduler is
# left as None; when False, both are read from files on disk.
use_local=True

In [3]:
# Resolve app_root, the directory that is prepended to sys.path in the next cell.
if not use_local:
    # open() does not expand "~" (that is a shell feature), so the home
    # directory has to be resolved explicitly before opening the file.
    config_path = os.path.expanduser('~/.dlio_profiler/configuration.yaml')
    with open(config_path, 'r') as file:
        dlp_yaml = yaml.safe_load(file)
    # The "app" entry of the configuration holds the application root.
    app_root = dlp_yaml["app"]
else:
    # Local mode: two directory levels above the current working directory.
    app_root = str(Path(os.getcwd()).parent.parent)

In [4]:
import sys
# Put app_root first on the module search path, then display the resulting
# list as the cell output.
# NOTE(review): the saved output of the following cell shows dlp_analyzer being
# imported from the venv's site-packages, not from app_root — confirm which
# copy is intended.
sys.path.insert(0, app_root)
sys.path

['/usr/WS2/haridev',
 '/usr/WS2/haridev/scr-dlio/examples',
 '/usr/workspace/iopp/projects/digio',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python39.zip',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9/lib-dynload',
 '',
 '/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages']

In [5]:
import dlp_analyzer
# Show which installation of dlp_analyzer was actually picked up.
print(dlp_analyzer.__file__)
# get_dlp_configuration was listed twice in the original import; each name is
# now imported exactly once.
from dlp_analyzer.main import (
    DLPAnalyzer,
    get_dlp_configuration,
    reset_dask_cluster,
    setup_dask_cluster,
    setup_logging,
    update_dlp_configuration,
)


/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/__init__.py


In [6]:
# Decide which Dask scheduler the analyzer should use. In local mode no
# address is supplied (None); otherwise the address is read from the current
# user's scheduler JSON file under <app_root>/dlp_analyzer/dask/run_dir.
if use_local:
    dask_scheduler = None
else:
    dask_run_dir = os.path.join(app_root, "dlp_analyzer", "dask", "run_dir")
    with open(
        os.path.join(dask_run_dir, f"scheduler_{os.getenv('USER')}.json"),
        "r",
    ) as f:
        dask_scheduler = json.load(f)["address"]

In [7]:
# Label of the application under analysis; the trailing comment lists the
# values that have been used.
# NOTE(review): no other cell visible in this notebook reads app_name.
app_name = "dlio_scr" # dlio dlio_scr

In [8]:
def get_conditions(json_object):
    """Classify one trace event into the three flags handed to the analyzer.

    Args:
        json_object: mapping for a single event; must provide "cat" and "name".

    Returns:
        Tuple ``(app_io_cond, compute_cond, io_cond)`` of booleans:
        - app_io_cond: "cat" contains "reader" or "checkpoint";
        - compute_cond: "name" contains "compute";
        - io_cond: "cat" is exactly "POSIX".

    Raises:
        KeyError: if "cat" or "name" is missing from ``json_object``.
    """
    category = json_object["cat"]
    event_name = json_object["name"]
    is_app_io = any(tag in category for tag in ("reader", "checkpoint"))
    # The original author tagged the next two rules with "Cosmoflow".
    is_compute = "compute" in event_name
    is_posix_io = category == "POSIX"
    return is_app_io, is_compute, is_posix_io

In [9]:
# Register the analyzer settings used by every DLPAnalyzer(...) call below.
# NOTE(review): host_pattern targets "lassen" host names, while one saved log
# line further down reports hostname "corona171" — presumably harmless because
# skip_hostname=True, but confirm.
conf = update_dlp_configuration(
    dask_scheduler=dask_scheduler,
    verbose=True,
    workers=64,
    log_file=f"./dlp_{os.getenv('USER')}.log",
    rebuild_index=False,
    time_approximate=True,
    host_pattern=r'lassen(\d+)',
    time_granularity=30e6,
    skip_hostname=True,
    conditions=get_conditions,
)

In [10]:
# Initialise dlp_analyzer logging. Called without arguments, so it presumably
# uses the log_file/verbose values registered via update_dlp_configuration —
# TODO confirm.
setup_logging()

In [11]:
# Bring up the Dask client used by the analyzer. The saved output reports a
# client with 64 workers, matching workers=64 in the configuration cell.
setup_dask_cluster()

[INFO] [14:56:21] Initialized Client with 64 workers and link http://127.0.0.1:35903/status [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:668]


In [12]:
def _max_total_dur_seconds(df, query_expr):
    """Return the largest per-(pid, tid) total of ``dur`` among matching events.

    Args:
        df: lazily evaluated event table (the ``.events`` of a DLPAnalyzer)
            supporting ``query``/``groupby``/``agg``/``max``/``compute`` and
            providing "pid", "tid" and "dur" columns.
        query_expr: expression passed unchanged to ``df.query`` to select events.

    Returns:
        One-element Series indexed by "dur": the maximum over (pid, tid) groups
        of the summed ``dur``, divided by 1e6 (assumes ``dur`` is recorded in
        microseconds, giving seconds — TODO confirm). The value is missing when
        no event matches.
    """
    matching = df.query(query_expr)
    # Use the string name "sum": passing the builtin ``sum`` to agg is
    # deprecated in recent pandas and resolves to the same aggregation anyway.
    per_thread_total = matching.groupby(["pid", "tid"]).agg({"dur": "sum"})
    slowest_thread = per_thread_total.max()
    return slowest_thread.compute() / 1e6


def get_checkpoint_time(df, pattern='PyTorchCheckpointing.checkpoint'):
    """Time spent in application-level checkpoint events by the slowest (pid, tid).

    Args:
        df: event table, see ``_max_total_dur_seconds``.
        pattern: text looked up in the event ``name`` with ``str.contains``
            (interpreted as a regular expression by pandas' default).

    Returns:
        One-element Series indexed by "dur" (see ``_max_total_dur_seconds``).
    """
    return _max_total_dur_seconds(df, f"name.str.contains({pattern!r})")


def get_posix_pfs_time(df, pattern='/p/lustre2/haridev/dlio/scr/checkpoints'):
    """POSIX time on files under the Lustre checkpoint path, slowest (pid, tid).

    Args:
        df: event table, see ``_max_total_dur_seconds``.
        pattern: text looked up in ``filename`` with ``str.contains``
            (interpreted as a regular expression by pandas' default).

    Returns:
        One-element Series indexed by "dur" (see ``_max_total_dur_seconds``).
    """
    return _max_total_dur_seconds(
        df, f"cat == 'POSIX' and filename.str.contains({pattern!r})"
    )


def get_posix_ssd_time(df, pattern='/l/ssd/haridev/scr/checkpoints'):
    """POSIX time on files under the node-local SSD checkpoint path, slowest (pid, tid).

    Args:
        df: event table, see ``_max_total_dur_seconds``.
        pattern: text looked up in ``filename`` with ``str.contains``
            (interpreted as a regular expression by pandas' default).

    Returns:
        One-element Series indexed by "dur" (see ``_max_total_dur_seconds``).
    """
    return _max_total_dur_seconds(
        df, f"cat == 'POSIX' and filename.str.contains({pattern!r})"
    )

In [13]:
# Baseline over PFS
# `filename` is a glob over this run's gzipped trace files. Only the analyzer's
# `.events` table is kept; it is what the get_*_time helpers query.
# NOTE(review): `filename` is reassigned by every load cell below, so it always
# refers to the most recently executed one.
filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_base/trace*.pfw.gz"
analyzer_base_pfs = DLPAnalyzer(filename).events

[INFO] [14:56:22] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:56:22] Total size of all files are <dask.bag.core.Item object at 0x155540e9cee0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:56:23] Loading 8994 batches out of 16 files and has 147302750 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:59:20] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:59:20] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [14]:
# Three timings for the baseline-PFS run, each a one-element Series keyed
# "dur": application checkpoint time, POSIX time on the Lustre checkpoint path,
# and POSIX time on the node-local SSD checkpoint path.
base_ckp_time = get_checkpoint_time(analyzer_base_pfs)
base_posix_lustre_time = get_posix_pfs_time(analyzer_base_pfs)
base_posix_ssd_time = get_posix_ssd_time(analyzer_base_pfs)

In [15]:
# Display the baseline-PFS timings; the saved output shows <NA> for the SSD
# path, i.e. no POSIX events matched that path in this run.
base_ckp_time,base_posix_lustre_time,base_posix_ssd_time

(dur    103.057664
 dtype: double[pyarrow],
 dur    72.63898
 dtype: double[pyarrow],
 dur    <NA>
 dtype: double[pyarrow])

In [16]:
# Baseline over SSD
# Load this run's traces (glob over gzipped trace files) and keep only the
# `.events` table for the timing helpers and the size checks further down.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/megatron_deepspeed/2024-03-21-23-06-04/trace*.pfw.gz"
analyzer_base_ssd = DLPAnalyzer(filename).events

[INFO] [14:59:28] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:59:28] Total size of all files are <dask.bag.core.Item object at 0x15516e3fd730> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:59:29] Loading 8976 batches out of 16 files and has 146988160 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [15:02:50] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [15:02:50] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [22]:
# Checkpoint, Lustre-path and SSD-path timings for the baseline-SSD run; the
# trailing tuple displays them. The saved output shows <NA> for the Lustre path.
base_ssd_ckp_time = get_checkpoint_time(analyzer_base_ssd)
base_ssd_posix_lustre_time = get_posix_pfs_time(analyzer_base_ssd)
base_ssd_posix_ssd_time = get_posix_ssd_time(analyzer_base_ssd)
base_ssd_ckp_time, base_ssd_posix_lustre_time, base_ssd_posix_ssd_time

(dur    126.296083
 dtype: double[pyarrow],
 dur    <NA>
 dtype: double[pyarrow],
 dur    97.497736
 dtype: double[pyarrow])

In [18]:
# SCR on SSD and with Async Flush to PFS
# Load this run's traces and keep only the `.events` table.
filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_scr_finer/trace*.pfw.gz"
analyzer_scr_basic = DLPAnalyzer(filename).events

[INFO] [15:02:57] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [15:02:57] Total size of all files are <dask.bag.core.Item object at 0x155540ec8640> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [15:02:57] Loading 9132 batches out of 16 files and has 149432067 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [15:06:12] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [15:06:12] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [19]:
# Checkpoint, Lustre-path and SSD-path timings for the SCR async-flush run;
# the trailing tuple displays them.
scr_ssd_async_pfs_ckp_time = get_checkpoint_time(analyzer_scr_basic)
scr_ssd_async_pfs_posix_lustre_time = get_posix_pfs_time(analyzer_scr_basic)
scr_ssd_async_pfs_posix_ssd_time = get_posix_ssd_time(analyzer_scr_basic)
scr_ssd_async_pfs_ckp_time, scr_ssd_async_pfs_posix_lustre_time, scr_ssd_async_pfs_posix_ssd_time

(dur    378.215081
 dtype: double[pyarrow],
 dur    34.150455
 dtype: double[pyarrow],
 dur    327.636878
 dtype: double[pyarrow])

In [20]:
# SCR on SSD and no flush
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-03-20-10-52-41/trace*.pfw.gz"
analyzer_scr_no_flush = DLPAnalyzer(filename).events

[INFO] [15:06:18] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [15:06:18] Total size of all files are <dask.bag.core.Item object at 0x15534519da30> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [15:06:18] Loading 9124 batches out of 16 files and has 149369304 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [15:09:20] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [15:09:20] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [21]:

# Checkpoint, Lustre-path and SSD-path timings for the SCR no-flush run; the
# trailing tuple displays them.
scr_ssd_no_flush_ckp_time = get_checkpoint_time(analyzer_scr_no_flush)
scr_ssd_no_flush_posix_lustre_time = get_posix_pfs_time(analyzer_scr_no_flush)
scr_ssd_no_flush_posix_ssd_time = get_posix_ssd_time(analyzer_scr_no_flush)
scr_ssd_no_flush_ckp_time, scr_ssd_no_flush_posix_lustre_time, scr_ssd_no_flush_posix_ssd_time

(dur    349.139191
 dtype: double[pyarrow],
 dur    19.608824
 dtype: double[pyarrow],
 dur    308.222378
 dtype: double[pyarrow])

In [59]:
# SCR bypass
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-03-21-22-44-18/trace*.pfw.gz"
analyzer_scr_bypass = DLPAnalyzer(filename).events

[INFO] [14:41:28] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:41:28] Total size of all files are <dask.bag.core.Item object at 0x15515505dd60> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:41:28] Loading 8977 batches out of 16 files and has 146963672 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:45:16] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:45:16] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [60]:

# Checkpoint, Lustre-path and SSD-path timings for the SCR bypass run; the
# trailing tuple displays them.
scr_bypass_ckp_time = get_checkpoint_time(analyzer_scr_bypass)
scr_bypass_posix_lustre_time = get_posix_pfs_time(analyzer_scr_bypass)
scr_bypass_posix_ssd_time = get_posix_ssd_time(analyzer_scr_bypass)
scr_bypass_ckp_time, scr_bypass_posix_lustre_time, scr_bypass_posix_ssd_time

(dur    145.122746
 dtype: double[pyarrow],
 dur    118.42119
 dtype: double[pyarrow],
 dur    0.152704
 dtype: double[pyarrow])

In [56]:
# SCR ssd single copy
# Load this run's traces and keep only the `.events` table.
# NOTE(review): the saved log for this cell reports an index over 8 trace
# files, whereas most other runs report 16 — confirm the run is complete
# before comparing absolute totals against them.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-04-08-09-33-09/trace*.pfw.gz"
analyzer_scr_ssd_single = DLPAnalyzer(filename).events

[INFO] [14:38:15] Created index for 8 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:38:15] Total size of all files are <dask.bag.core.Item object at 0x155142eb1c70> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:38:16] Loading 4488 batches out of 8 files and has 73467844 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:39:54] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:39:54] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [57]:

# Checkpoint, Lustre-path and SSD-path timings for the SCR SSD single-copy run.
scr_ssd_single_ckp_time = get_checkpoint_time(analyzer_scr_ssd_single)
scr_ssd_single_posix_lustre_time = get_posix_pfs_time(analyzer_scr_ssd_single)
scr_ssd_single_posix_ssd_time = get_posix_ssd_time(analyzer_scr_ssd_single)

In [58]:
# Display the SCR SSD single-copy timings; the saved output shows <NA> for the
# Lustre path.
scr_ssd_single_ckp_time, scr_ssd_single_posix_lustre_time, scr_ssd_single_posix_ssd_time

(dur    153.369808
 dtype: double[pyarrow],
 dur    <NA>
 dtype: double[pyarrow],
 dur    121.076746
 dtype: double[pyarrow])

In [53]:
# SCR pfs single copy
# Load this run's traces and keep only the `.events` table.
# NOTE(review): the saved log for this cell contains one "Processing ... failed
# with Invalid control character" error, so at least one trace line was not
# parsed — confirm that the dropped event does not matter for the totals.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-04-08-10-52-57/trace*.pfw.gz"
analyzer_scr_pfs_single = DLPAnalyzer(filename).events

[INFO] [14:31:46] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:31:46] Total size of all files are <dask.bag.core.Item object at 0x1552a3a705b0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:31:47] Loading 8986 batches out of 16 files and has 147099195 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
ERROR:root:Processing {"id":"6930","name":"close","cat":"POSIX","pid":"0","tid":"953748","ts":"1712599257392616","dur":"4012","ph":"X","args":{"hostname":"corona171","ret":0,"fname":"","fd":39}} failed with Invalid control character at: line 1 column 162 (char 161)
[INFO] [14:35:48] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:35:48] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/pytho

In [54]:

# Checkpoint, Lustre-path and SSD-path timings for the SCR PFS single-copy run.
scr_pfs_single_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single)
scr_pfs_single_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single)
scr_pfs_single_posix_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single)

In [55]:
# Display the SCR PFS single-copy timings.
scr_pfs_single_ckp_time, scr_pfs_single_posix_lustre_time, scr_pfs_single_posix_ssd_time

(dur    167.23401
 dtype: double[pyarrow],
 dur    30.059165
 dtype: double[pyarrow],
 dur    121.617282
 dtype: double[pyarrow])

In [50]:
# SCR pfs single copy large buffer
# Load this run's traces and keep only the `.events` table.
# NOTE(review): the saved log for this cell reports an index over 15 trace
# files (other runs: 16) and an "ERROR: database disk image is malformed"
# line — confirm the inputs are complete before trusting the totals.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-06-17-15-11-39/trace*.pfw.gz"
analyzer_scr_pfs_single_large = DLPAnalyzer(filename).events

[INFO] [14:25:42] Created index for 15 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:25:42] Total size of all files are <dask.bag.core.Item object at 0x1552eaf53100> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:25:44] Loading 8424 batches out of 15 files and has 137896085 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
ERROR: database disk image is malformed
[INFO] [14:29:29] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:29:29] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [51]:

# Checkpoint, Lustre-path and SSD-path timings for the large-buffer run.
# Despite the `analyzer_` prefix these hold timing Series, not event tables.
analyzer_scr_pfs_single_large_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single_large)
analyzer_scr_pfs_single_large_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single_large)
analyzer_scr_pfs_single_large_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single_large)

In [52]:
# Display the large-buffer timings.
analyzer_scr_pfs_single_large_ckp_time, analyzer_scr_pfs_single_large_posix_lustre_time, analyzer_scr_pfs_single_large_ssd_time

(dur    164.652538
 dtype: double[pyarrow],
 dur    30.810418
 dtype: double[pyarrow],
 dur    116.153655
 dtype: double[pyarrow])

In [47]:

# SCR pfs single copy large buffer 1 thread
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-01-15-47-25/trace*.pfw.gz"
analyzer_scr_pfs_single_large_one_thread = DLPAnalyzer(filename).events

[INFO] [14:09:00] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:09:00] Total size of all files are <dask.bag.core.Item object at 0x155170320310> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:09:01] Loading 8978 batches out of 16 files and has 147041895 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:13:04] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:13:04] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [48]:

# Checkpoint, Lustre-path and SSD-path timings for the large-buffer, 1-thread
# run (timing Series despite the `analyzer_` prefix).
analyzer_scr_pfs_large_one_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single_large_one_thread)
analyzer_scr_pfs_large_one_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single_large_one_thread)
analyzer_scr_pfs_large_one_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single_large_one_thread)

In [49]:
# Display the large-buffer, 1-thread timings.
analyzer_scr_pfs_large_one_ckp_time, analyzer_scr_pfs_large_one_posix_lustre_time, analyzer_scr_pfs_large_one_ssd_time

(dur    159.972029
 dtype: double[pyarrow],
 dur    41.068813
 dtype: double[pyarrow],
 dur    116.591521
 dtype: double[pyarrow])

In [45]:
# SCR pfs single copy large buffer 2 thread
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-01-16-18-21/trace*.pfw.gz"
analyzer_scr_pfs_single_large_2_thread = DLPAnalyzer(filename).events

[INFO] [14:04:34] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:04:34] Total size of all files are <dask.bag.core.Item object at 0x155170e307f0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:04:34] Loading 8994 batches out of 16 files and has 147218602 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:08:46] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:08:46] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [46]:

# Checkpoint, Lustre-path and SSD-path timings for the large-buffer, 2-thread
# run; the trailing tuple displays them.
analyzer_scr_pfs_large_2_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single_large_2_thread)
analyzer_scr_pfs_large_2_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single_large_2_thread)
analyzer_scr_pfs_large_2_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single_large_2_thread)
analyzer_scr_pfs_large_2_ckp_time, analyzer_scr_pfs_large_2_posix_lustre_time, analyzer_scr_pfs_large_2_ssd_time

(dur    159.779753
 dtype: double[pyarrow],
 dur    34.782902
 dtype: double[pyarrow],
 dur    120.074487
 dtype: double[pyarrow])

In [43]:


# SCR pfs single copy max buffer 1 thread
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-01-16-47-05/trace*.pfw.gz"
analyzer_scr_pfs_single_max_1_thread = DLPAnalyzer(filename).events

[INFO] [14:00:38] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [14:00:38] Total size of all files are <dask.bag.core.Item object at 0x15514c3af820> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [14:00:38] Loading 8992 batches out of 16 files and has 147114386 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [14:04:09] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [14:04:09] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [44]:

# Checkpoint, Lustre-path and SSD-path timings for the max-buffer, 1-thread
# run; the trailing tuple displays them.
analyzer_scr_pfs_max_1_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single_max_1_thread)
analyzer_scr_pfs_max_1_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single_max_1_thread)
analyzer_scr_pfs_max_1_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single_max_1_thread)
analyzer_scr_pfs_max_1_ckp_time, analyzer_scr_pfs_max_1_posix_lustre_time, analyzer_scr_pfs_max_1_ssd_time

(dur    156.93601
 dtype: double[pyarrow],
 dur    41.359542
 dtype: double[pyarrow],
 dur    121.022714
 dtype: double[pyarrow])

In [29]:
# SCR pfs single copy max buffer 1 thread no core affinity
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-16-11-57-48/trace*.pfw.gz"
analyzer_scr_pfs_single_max_nc = DLPAnalyzer(filename).events

[INFO] [12:40:49] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [12:40:49] Total size of all files are <dask.bag.core.Item object at 0x155344ad27f0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [12:40:49] Loading 8994 batches out of 16 files and has 147244848 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [12:44:30] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [12:44:30] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [42]:

# Checkpoint, Lustre-path and SSD-path timings for the max-buffer, 1-thread,
# no-core-affinity run; the trailing tuple displays them.
analyzer_scr_pfs_single_max_nc_ckp_time = get_checkpoint_time(analyzer_scr_pfs_single_max_nc)
analyzer_scr_pfs_single_max_nc_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_single_max_nc)
analyzer_scr_pfs_single_max_nc_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_single_max_nc)
analyzer_scr_pfs_single_max_nc_ckp_time, analyzer_scr_pfs_single_max_nc_posix_lustre_time,analyzer_scr_pfs_single_max_nc_ssd_time

(dur    148.042881
 dtype: double[pyarrow],
 dur    22.997724
 dtype: double[pyarrow],
 dur    113.803828
 dtype: double[pyarrow])

In [35]:
# SCR pfs single copy max buffer 2 thread no core affinity
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-16-13-23-52/trace*.pfw.gz"
analyzer_scr_pfs_2_max_nc = DLPAnalyzer(filename).events

[INFO] [13:52:11] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [13:52:11] Total size of all files are <dask.bag.core.Item object at 0x155173087820> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [13:52:11] Loading 8994 batches out of 16 files and has 147219578 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [13:55:47] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [13:55:47] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [36]:

# Checkpoint, Lustre-path and SSD-path timings for the max-buffer, 2-thread,
# no-core-affinity run; the trailing tuple displays them.
analyzer_scr_pfs_2_max_nc_ckp_time = get_checkpoint_time(analyzer_scr_pfs_2_max_nc)
analyzer_scr_pfs_2_max_nc_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_2_max_nc)
analyzer_scr_pfs_2_max_nc_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_2_max_nc)
analyzer_scr_pfs_2_max_nc_ckp_time, analyzer_scr_pfs_2_max_nc_posix_lustre_time, analyzer_scr_pfs_2_max_nc_ssd_time

(dur    150.306512
 dtype: double[pyarrow],
 dur    16.773334
 dtype: double[pyarrow],
 dur    120.014858
 dtype: double[pyarrow])

In [23]:
# SCR pfs single copy max buffer 16 thread no core affinity
# Load this run's traces and keep only the `.events` table.
filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/scr_megatron_deepspeed/2024-07-16-15-37-25/trace*.pfw.gz"
analyzer_scr_pfs_16_max_nc = DLPAnalyzer(filename).events

[INFO] [16:17:40] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [16:17:40] Total size of all files are <dask.bag.core.Item object at 0x1552f08ad820> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [16:17:41] Loading 8994 batches out of 16 files and has 147279950 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [16:21:34] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [16:21:34] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [25]:

# Checkpoint, Lustre-path and SSD-path timings for the max-buffer, 16-thread,
# no-core-affinity run; the trailing tuple displays them.
analyzer_scr_pfs_16_max_nc_ckp_time = get_checkpoint_time(analyzer_scr_pfs_16_max_nc)
analyzer_scr_pfs_16_max_nc_posix_lustre_time = get_posix_pfs_time(analyzer_scr_pfs_16_max_nc)
analyzer_scr_pfs_16_max_nc_ssd_time = get_posix_ssd_time(analyzer_scr_pfs_16_max_nc)
analyzer_scr_pfs_16_max_nc_ckp_time, analyzer_scr_pfs_16_max_nc_posix_lustre_time, analyzer_scr_pfs_16_max_nc_ssd_time

(dur    145.843344
 dtype: double[pyarrow],
 dur    14.289911
 dtype: double[pyarrow],
 dur    113.770895
 dtype: double[pyarrow])

In [35]:
# Comma-separated summary of the first eight runs: one row per metric, one
# column per run, using the same column widths as before (20 for the row
# label, 15 for each value).
def _as_seconds(series):
    """Return the single value of a one-element timing Series as a float.

    A missing value (no matching events in that run) is reported as 0.0, which
    replaces the zeros that used to be hard-coded for those runs.
    """
    # float(series) on a one-element Series is deprecated in pandas; select
    # the element explicitly instead.
    return float(series.fillna(0.0).iloc[0])


summary_columns = [
    'base', 'base_ssd', 'scr_ssd_a_pfs', 'scr_ssd_nf',
    'scr_bypass', 'scr_ssd_nf_one', 'scr_pfs_one', 'scr_pfs_one_large',
]
# Each list is ordered like summary_columns.
summary_rows = {
    'checkpoint time': [
        base_ckp_time, base_ssd_ckp_time, scr_ssd_async_pfs_ckp_time,
        scr_ssd_no_flush_ckp_time, scr_bypass_ckp_time, scr_ssd_single_ckp_time,
        scr_pfs_single_ckp_time, analyzer_scr_pfs_single_large_ckp_time,
    ],
    'lustre time': [
        base_posix_lustre_time, base_ssd_posix_lustre_time,
        scr_ssd_async_pfs_posix_lustre_time, scr_ssd_no_flush_posix_lustre_time,
        scr_bypass_posix_lustre_time, scr_ssd_single_posix_lustre_time,
        scr_pfs_single_posix_lustre_time,
        analyzer_scr_pfs_single_large_posix_lustre_time,
    ],
    'ssd time': [
        base_posix_ssd_time, base_ssd_posix_ssd_time,
        scr_ssd_async_pfs_posix_ssd_time, scr_ssd_no_flush_posix_ssd_time,
        scr_bypass_posix_ssd_time, scr_ssd_single_posix_ssd_time,
        scr_pfs_single_posix_ssd_time, analyzer_scr_pfs_single_large_ssd_time,
    ],
}

print(", ".join([f"{'case':20}"] + [f"{label:15}" for label in summary_columns]))
for metric, timings in summary_rows.items():
    print(", ".join([f"{metric:20}"] + [f"{_as_seconds(t):15}" for t in timings]))

case                , base           , base_ssd       , scr_ssd_a_pfs  , scr_ssd_nf     , scr_bypass     , scr_ssd_nf_one , scr_pfs_one    , scr_pfs_one_large
checkpoint time     ,      103.057664,      126.296083,      378.215081,      349.139191,      145.122746,      153.369808,       167.23401,      164.652538
lustre time         ,        72.63898,               0,     1471.540838,       19.608824,       118.42119,             0.0,      1060.07277,     1127.496048
ssd time            ,               0,       97.497736,      837.766052,      308.222378,        0.152704,      121.076746,      472.086514,      632.798778


### Things to try (3/27/24)
- Remove redundancy schemes: SCR_COPY_TYPE=SINGLE
  - (Potentially) Less frequent redundancy schemes:


### Things to try (4/17/24)
- Compare Base SSD with SCR SSD no flush
  - Check size
  - Check files

In [15]:
# Materialise (via .compute()) the POSIX events of the baseline-SSD run whose
# filename matches the node-local SSD checkpoint path, then display the summed
# "size" column divided by 1024**3 (GiB, assuming "size" is in bytes — TODO
# confirm).
base_ssd_checkpoints = analyzer_base_ssd.query("cat == 'POSIX' and filename.str.contains('/l/ssd/haridev/scr/checkpoints')").compute()
base_ssd_checkpoints["size"].sum()/1024**3


577.07171420753

In [16]:
# Same selection and total as the previous cell, for an SCR run.
# NOTE(review): the variable is named scr_ssd_nf_* ("no flush") but it is built
# from analyzer_scr_ssd_single, not analyzer_scr_no_flush. The summary table
# labels this run "scr_ssd_nf_one", so this is presumably intentional —
# confirm. The saved load log for that run also lists 8 trace files versus 16
# for the baseline, which may explain the roughly halved total shown below.
scr_ssd_nf_checkpoints = analyzer_scr_ssd_single.query("cat == 'POSIX' and filename.str.contains('/l/ssd/haridev/scr/checkpoints')").compute()
scr_ssd_nf_checkpoints["size"].sum()/1024**3

288.5385148981586

In [17]:
import pandas as pd
def print_full(x):
    """Print ``x`` with pandas' display limits lifted for the duration of the call.

    Rows, columns and column width are unlimited, the display width is 2000
    characters and floats are rendered with ``'{:20,.2f}'``.

    The options are applied through ``pd.option_context``, so the values that
    were active before the call are restored afterwards (rather than being
    reset to the pandas defaults), even if printing raises.

    Args:
        x: object to print; typically a DataFrame or Series.
    """
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', None,
    ):
        print(x)

### Check files and sizes

In [18]:
# Per file-name family, report the largest per-process total of data volume
# (GiB) and time (s) for the baseline-SSD checkpoint events.
# Removed from the original cell: `print_full(1000)` (it only printed the
# literal 1000 and restored the display options before the table was shown),
# a groupby whose result was discarded, and a repeated `import re` (already
# imported in the first cell).
# The derived columns are added in place because later cells query `fname`.
base_ssd_checkpoints["fname"] = base_ssd_checkpoints["filename"].apply(lambda x: str(os.path.basename(x)))
# Strip digits and dashes so numbered files collapse into one family name.
# Raw string: "\d" in a normal string literal is an invalid escape sequence.
base_ssd_checkpoints["fname_regex"] = base_ssd_checkpoints["fname"].apply(lambda x: re.sub(r"[\d\-]+", "", x))
base_ssd_checkpoints["size_gb"] = base_ssd_checkpoints["size"] / 1024 ** 3
base_ssd_checkpoints["dur_sec"] = base_ssd_checkpoints["dur"] / 1e6
# Sum per (family, pid), then take the maximum over processes for each family.
base_ssd_checkpoints.groupby(["fname_regex", "pid"])[["size_gb","dur_sec"]].sum().groupby(["fname_regex"]).max()

1000


Unnamed: 0_level_0,size_gb,dur_sec
fname_regex,Unnamed: 1_level_1,Unnamed: 2_level_1
layer.pt,37.060777,39.932368
model.pt,0.000172,0.027605
optimizer.pt,31.434213,68.512567
scr_base_megatron_deepspeed,0.0,2.5e-05


In [19]:
# Derive helper columns on the already-computed SCR SSD checkpoint frame.
# fname: basename of the file path.
scr_ssd_nf_checkpoints["fname"] = scr_ssd_nf_checkpoints["filename"].apply(lambda x: str(os.path.basename(x)))
# fname_regex: basename with every run of digits/dashes removed, so that e.g.
# "model-1-600-0.pt" collapses to "model.pt". The pattern is a raw string because
# "\d" in a plain literal is an invalid escape sequence.
scr_ssd_nf_checkpoints["fname_regex"] = scr_ssd_nf_checkpoints["fname"].apply(lambda x: re.sub(r"[\d\-]+", "", x))
scr_ssd_nf_checkpoints["size_gb"] = scr_ssd_nf_checkpoints["size"] / 1024 ** 3
# dur scaled by 1e6 (seconds if `dur` is in microseconds — TODO confirm).
scr_ssd_nf_checkpoints["dur_sec"] = scr_ssd_nf_checkpoints["dur"] / 1e6
# Sum per (file family, pid), then keep the largest per-process total for each family.
scr_ssd_nf_checkpoints.groupby(["fname_regex", "pid"])[["size_gb","dur_sec"]].sum().groupby(["fname_regex"]).max()

1000


Unnamed: 0_level_0,size_gb,dur_sec
fname_regex,Unnamed: 1_level_1,Unnamed: 2_level_1
.scr,0.0,0.00023
.scrconf,0.0,3.5e-05
filemap_,0.002303,20.196445
flush.scr,1.1e-05,0.780726
halt.scr,0.0,0.720644
haridev,0.0,3.7e-05
index.scr,0.0,2.8e-05
layer.pt,37.060777,32.954393
model.pt,0.000172,0.001463
nodes.scr,0.0,0.000327


#### We see a lot of extra files, which I think is expected.
- For files that are written, we see the same write cost.

### Check operations for model files

In [20]:
# Baseline SSD: number of events per POSIX call name for files whose basename contains 'model'.
base_ssd_checkpoints.query("fname.str.contains('model')")["name"].value_counts()

name
write         288
open64         96
__fxstat64     96
lseek64        96
close          96
Name: count, dtype: int64[pyarrow]

In [21]:
# SCR SSD: number of events per POSIX call name for files whose basename contains 'model'.
scr_ssd_nf_checkpoints.query("fname.str.contains('model')")["name"].value_counts()


name
__xstat       192
write         144
access         96
open64         48
__fxstat64     48
lseek64        48
close          48
Name: count, dtype: int64[pyarrow]

**INVESTIGATION** Why is the number of write calls smaller (about 1/2 of the baseline)?

In [27]:
# Baseline SSD: per model file, total write duration, total written size and number of write calls.
# Aggregations are given as string aliases; passing the builtin `sum` to agg is deprecated in pandas.
(
    base_ssd_checkpoints
    .query("fname.str.contains('model') and name == 'write'")
    .groupby("filename")
    .agg({"dur": "sum", "size_gb": "sum", "name": "count"})
)

Unnamed: 0_level_0,dur,size_gb,name
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-100-0.pt,43,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-100-1.pt,13419,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-100-10.pt,60,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-100-11.pt,56,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-100-12.pt,55,0.000029,3
...,...,...,...
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-600-5.pt,160,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-600-6.pt,41,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-600-7.pt,48,0.000029,3
/l/ssd/haridev/scr/checkpoints/scr_base_megatron_deepspeed/model-1-600-8.pt,43,0.000029,3


In [28]:
# SCR SSD: per model file, total write duration, total written size and number of write calls.
# Aggregations are given as string aliases; passing the builtin `sum` to agg is deprecated in pandas.
(
    scr_ssd_nf_checkpoints
    .query("fname.str.contains('model') and name == 'write'")
    .groupby("filename")
    .agg({"dur": "sum", "size_gb": "sum", "name": "count"})
)


Unnamed: 0_level_0,dur,size_gb,name
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-0.pt,43,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-1.pt,41,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-2.pt,30,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-3.pt,32,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-4.pt,113,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-5.pt,33,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-6.pt,32,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.1/model-1-100-7.pt,226,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.2/model-1-200-0.pt,292,2.9e-05,3
/l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.2/model-1-200-1.pt,37,2.9e-05,3


##### Check per model file

In [29]:
# Baseline SSD: call-name counts for the single file model-1-600-0.pt.
base_ssd_checkpoints.query("fname.str.contains('model-1-600-0.pt')")["name"].value_counts()

name
write         3
open64        1
__fxstat64    1
lseek64       1
close         1
Name: count, dtype: int64[pyarrow]

In [53]:
# SCR SSD: call-name counts for the single file model-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('model-1-600-0.pt')")["name"].value_counts()

name
__xstat       4
write         3
access        2
open64        1
__fxstat64    1
lseek64       1
close         1
Name: count, dtype: int64[pyarrow]

In [54]:
# SCR SSD: show every recorded event row for model-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('model-1-600-0.pt')")

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,...,io_time,app_io_time,total_time,filename,phase,size,fname,fname_regex,size_gb,dur_sec
8159,open64,POSIX,0,335916,1001811003,1001811038,35,,33.0,corona174,...,35,,35,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,3.5e-05
8160,__fxstat64,POSIX,0,335916,1001811048,1001811050,2,,33.0,corona174,...,2,,2,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,2e-06
8161,lseek64,POSIX,0,335916,1001811065,1001811066,1,,33.0,corona174,...,1,,1,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,1e-06
8162,write,POSIX,0,335916,1001811279,1001811289,10,,33.0,corona174,...,10,,10,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,320.0,model-1-600-0.pt,model.pt,0.0,1e-05
8163,write,POSIX,0,335916,1001811303,1001811322,19,,33.0,corona174,...,19,,19,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,30102.0,model-1-600-0.pt,model.pt,2.8e-05,1.9e-05
8164,write,POSIX,0,335916,1001811356,1001811360,4,,33.0,corona174,...,4,,4,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,405.0,model-1-600-0.pt,model.pt,0.0,4e-06
8165,close,POSIX,0,335916,1001811378,1001811388,10,,33.0,corona174,...,10,,10,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,1e-05
4124,access,POSIX,0,335916,1025776223,1025776228,5,,34.0,corona174,...,5,,5,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,5e-06
4125,__xstat,POSIX,0,335916,1025776246,1025776250,4,,34.0,corona174,...,4,,4,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,4e-06
4265,access,POSIX,0,335916,1025892888,1025892893,5,,34.0,corona174,...,5,,5,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,5e-06


In [52]:
# Baseline SSD: summed dur_sec per call name for model-1-600-0.pt.
base_ssd_checkpoints.query("fname.str.contains('model-1-600-0.pt')").groupby("name")["dur_sec"].sum()

name
__fxstat64    0.000002
close         0.000009
lseek64       0.000001
open64        0.000111
write         0.000151
Name: dur_sec, dtype: double[pyarrow]

In [55]:
# SCR SSD: summed dur_sec per call name for model-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('model-1-600-0.pt')").groupby("name")["dur_sec"].sum()

name
__fxstat64    0.000002
__xstat       0.000027
access         0.00001
close          0.00001
lseek64       0.000001
open64        0.000035
write         0.000033
Name: dur_sec, dtype: double[pyarrow]

In [70]:
# Largest value of the `trange` column in the SCR SSD checkpoint frame.
scr_ssd_nf_checkpoints["trange"].max()

35.0

In [71]:
# Baseline SSD: distinct transfer sizes (and how often each occurs) for model-1-600-0.pt.
base_ssd_checkpoints.query("fname.str.contains('model-1-600-0.pt')")["size"].value_counts()

size
320      1
30102    1
405      1
Name: count, dtype: int64[pyarrow]

In [72]:
# SCR SSD: distinct transfer sizes (and how often each occurs) for model-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('model-1-600-0.pt')")["size"].value_counts()

size
320      1
30102    1
405      1
Name: count, dtype: int64[pyarrow]

In [73]:
# SCR SSD: show every recorded event row for model-1-600-0.pt (same query as the earlier events cell).
scr_ssd_nf_checkpoints.query("fname.str.contains('model-1-600-0.pt')")

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,...,io_time,app_io_time,total_time,filename,phase,size,fname,fname_regex,size_gb,dur_sec
8159,open64,POSIX,0,335916,1001811003,1001811038,35,,33.0,corona174,...,35,,35,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,3.5e-05
8160,__fxstat64,POSIX,0,335916,1001811048,1001811050,2,,33.0,corona174,...,2,,2,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,2e-06
8161,lseek64,POSIX,0,335916,1001811065,1001811066,1,,33.0,corona174,...,1,,1,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,1e-06
8162,write,POSIX,0,335916,1001811279,1001811289,10,,33.0,corona174,...,10,,10,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,320.0,model-1-600-0.pt,model.pt,0.0,1e-05
8163,write,POSIX,0,335916,1001811303,1001811322,19,,33.0,corona174,...,19,,19,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,30102.0,model-1-600-0.pt,model.pt,2.8e-05,1.9e-05
8164,write,POSIX,0,335916,1001811356,1001811360,4,,33.0,corona174,...,4,,4,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,405.0,model-1-600-0.pt,model.pt,0.0,4e-06
8165,close,POSIX,0,335916,1001811378,1001811388,10,,33.0,corona174,...,10,,10,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,1e-05
4124,access,POSIX,0,335916,1025776223,1025776228,5,,34.0,corona174,...,5,,5,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,5e-06
4125,__xstat,POSIX,0,335916,1025776246,1025776250,4,,34.0,corona174,...,4,,4,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,4e-06
4265,access,POSIX,0,335916,1025892888,1025892893,5,,34.0,corona174,...,5,,5,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,,model-1-600-0.pt,model.pt,,5e-06


- Many extra operations
  - original app used 7 I/O calls across 5 ops
  - SCR has 13 I/O calls across 7 ops 

*Question* why do we do extra calls on data produced as outputs?

### Check operations for layer files

In [74]:
# Baseline SSD: number of events per POSIX call name for files whose basename contains 'layer'.
base_ssd_checkpoints.query("fname.str.contains('layer')")["name"].value_counts()

name
write         2640
open64         528
__fxstat64     528
lseek64        528
close          528
Name: count, dtype: int64[pyarrow]

In [75]:
# SCR SSD: number of events per POSIX call name for files whose basename contains 'layer'.
scr_ssd_nf_checkpoints.query("fname.str.contains('layer')")["name"].value_counts()


name
write         1320
__xstat       1056
access         528
open64         264
__fxstat64     264
lseek64        264
close          264
Name: count, dtype: int64[pyarrow]

**INVESTIGATION** Again why do we see half the writes. 

##### Check per layer file

In [76]:
# Baseline SSD: call-name counts over all layer-* files whose basename also contains '1-600-0.pt'.
base_ssd_checkpoints.query("fname.str.contains('layer-') and fname.str.contains('1-600-0.pt')")["name"].value_counts()

name
write         220
open64         44
__fxstat64     44
lseek64        44
close          44
Name: count, dtype: int64[pyarrow]

In [77]:
# SCR SSD: call-name counts over all layer-* files whose basename also contains '1-600-0.pt'.
scr_ssd_nf_checkpoints.query("fname.str.contains('layer-') and fname.str.contains('1-600-0.pt')")["name"].value_counts()

name
write         220
__xstat       176
access         88
open64         44
__fxstat64     44
lseek64        44
close          44
Name: count, dtype: int64[pyarrow]

In [78]:
# Baseline SSD: distinct transfer sizes over all layer-* files whose basename also contains '1-600-0.pt'.
base_ssd_checkpoints.query("fname.str.contains('layer-') and fname.str.contains('1-600-0.pt')")["size"].value_counts()

size
384          44
129761280    44
64           44
20971520     44
487          44
Name: count, dtype: int64[pyarrow]

In [79]:
# SCR SSD: distinct transfer sizes over all layer-* files whose basename also contains '1-600-0.pt'.
scr_ssd_nf_checkpoints.query("fname.str.contains('layer-') and fname.str.contains('1-600-0.pt')")["size"].value_counts()

size
384          44
129761280    44
64           44
20971520     44
487          44
Name: count, dtype: int64[pyarrow]

In [80]:
# Baseline SSD: distinct transfer sizes for the single file layer-0-1-600-0.pt.
base_ssd_checkpoints.query("fname.str.contains('layer-0-1-600-0.pt')")["size"].value_counts()

size
384          1
129761280    1
64           1
20971520     1
487          1
Name: count, dtype: int64[pyarrow]

In [81]:
# SCR SSD: distinct transfer sizes for the single file layer-0-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('layer-0-1-600-0.pt')")["size"].value_counts()

size
384          1
129761280    1
64           1
20971520     1
487          1
Name: count, dtype: int64[pyarrow]

In [82]:
# Baseline SSD: call-name counts for the single file layer-0-1-600-0.pt.
base_ssd_checkpoints.query("fname.str.contains('layer-0-1-600-0.pt')")["name"].value_counts()

name
write         5
open64        1
__fxstat64    1
lseek64       1
close         1
Name: count, dtype: int64[pyarrow]

In [83]:
# SCR SSD: call-name counts for the single file layer-0-1-600-0.pt.
scr_ssd_nf_checkpoints.query("fname.str.contains('layer-0-1-600-0.pt')")["name"].value_counts()

name
write         5
__xstat       4
access        2
open64        1
__fxstat64    1
lseek64       1
close         1
Name: count, dtype: int64[pyarrow]

Same Behavior here.

# Next Steps
1. Check why PFS is so much cost in SCR. We looked at flush being the bulk of the cost.
2. INVESTIGATE: Without async I/O the ssd is 121.076746 and with async I/O is 472.086514

In [16]:
# Display three previously computed metrics for the SCR PFS single run as one tuple:
# checkpoint time, POSIX time on Lustre, POSIX time on SSD.
scr_pfs_single_ckp_time, scr_pfs_single_posix_lustre_time, scr_pfs_single_posix_ssd_time

(dur    167.23401
 dtype: double[pyarrow],
 dur    1060.07277
 dtype: double[pyarrow],
 dur    472.086514
 dtype: double[pyarrow])

In [17]:
def get_checkpoint_time(df):
    """Return the largest per-process total checkpoint duration.

    Selects the events whose ``name`` matches
    ``PyTorchCheckpointing.checkpoint``, sums ``dur`` per ``pid`` and keeps
    the maximum over all processes, i.e. the slowest process.

    Parameters
    ----------
    df : lazy dataframe
        Event frame exposing ``query``, ``groupby`` and a final ``compute``.

    Returns
    -------
    The computed maximum divided by 1e6 (seconds if ``dur`` is recorded in
    microseconds — TODO confirm the unit against the tracer).
    """
    checkpoint_events = df.query("name.str.contains('PyTorchCheckpointing.checkpoint')")
    # "sum" is passed as a string alias; handing the builtin `sum` to agg is deprecated in pandas.
    slowest_process_dur = checkpoint_events.groupby(["pid"]).agg({"dur": "sum"}).max()
    return slowest_process_dur.compute() / 1e6

def get_posix_pfs(df, path='/p/lustre2/haridev/dlio/scr/checkpoints'):
    """Return the POSIX events whose filename matches the PFS checkpoint path.

    Parameters
    ----------
    df : dataframe-like
        Event frame exposing ``query`` with ``cat`` and ``filename`` columns.
    path : str, optional
        Pattern handed to ``filename.str.contains``. Defaults to the Lustre
        checkpoint directory used throughout this notebook. It is embedded in
        a single-quoted query literal, so it must not contain a single quote.

    Returns
    -------
    The filtered frame exactly as returned by ``df.query``; nothing is
    aggregated or computed here.
    """
    return df.query(f"cat == 'POSIX' and filename.str.contains('{path}')")

def get_posix_ssd_time(df, path='/l/ssd/haridev/scr/checkpoints'):
    """Return the largest per-process total POSIX duration on the SSD path.

    Selects POSIX events whose filename matches ``path``, sums ``dur`` per
    ``pid`` and keeps the maximum over all processes.

    Parameters
    ----------
    df : lazy dataframe
        Event frame exposing ``query``, ``groupby`` and a final ``compute``.
    path : str, optional
        Pattern handed to ``filename.str.contains``. Defaults to the SSD
        checkpoint directory used throughout this notebook. It is embedded in
        a single-quoted query literal, so it must not contain a single quote.

    Returns
    -------
    The computed maximum divided by 1e6 (seconds if ``dur`` is recorded in
    microseconds — TODO confirm the unit against the tracer).
    """
    posix_ssd_events = df.query(f"cat == 'POSIX' and filename.str.contains('{path}')")
    # "sum" is passed as a string alias; handing the builtin `sum` to agg is deprecated in pandas.
    slowest_process_dur = posix_ssd_events.groupby(["pid"]).agg({"dur": "sum"}).max()
    return slowest_process_dur.compute() / 1e6

In [28]:
# Lustre-side POSIX events of the SCR PFS single run (frame returned by get_posix_pfs).
scr_pfs_single_posix_lustre_df = get_posix_pfs(analyzer_scr_pfs_single)
# Per call name: sum `dur` per process, keep the largest per-process total, scale by 1e6.
# Displayed next to the previously computed overall Lustre time for comparison.
# "sum" is given as a string alias rather than the builtin function.
(
    scr_pfs_single_posix_lustre_df.groupby(["name", "pid"]).agg({"dur": "sum"}).groupby("name").max().compute() / 1e6,
    scr_pfs_single_posix_lustre_time,
)

(                   dur
 name                  
 __lxstat      1.416893
 __xstat       0.314929
 __xstat64     0.001076
 access        0.023796
 chmod         1.043387
 chown         1.463341
 close            1.201
 fsync        10.568369
 ftruncate     0.015173
 lseek         0.000066
 mkdir         0.010559
 open         12.207076
 read          0.114711
 write      1032.015623,
 dur    1060.07277
 dtype: double[pyarrow])

In [32]:
# SCR PFS single run: total `size` per call name, scaled by 1024**3.
# "sum" is given as a string alias rather than the builtin function.
scr_pfs_single_posix_lustre_df.groupby(["name"]).agg({"size": "sum"}).compute() / 1024 ** 3

Unnamed: 0_level_0,size
name,Unnamed: 1_level_1
mkdir,0.0
__xstat64,0.0
access,0.0
open,0.0
write,576.791053
fsync,0.0
close,0.0
read,0.000106
lseek,0.0
ftruncate,0.0


In [33]:
# Lustre-side POSIX events of the baseline PFS run (frame returned by get_posix_pfs).
analyzer_base_pfs_posix_lustre_df = get_posix_pfs(analyzer_base_pfs)
# Total `size` per call name, scaled by 1024**3.
# "sum" is given as a string alias rather than the builtin function.
analyzer_base_pfs_posix_lustre_df.groupby(["name"]).agg({"size": "sum"}).compute() / 1024 ** 3

Unnamed: 0_level_0,size
name,Unnamed: 1_level_1
mkdir,0.0
__xstat64,0.0
open64,0.0
__fxstat64,0.0
lseek64,0.0
write,577.071714
close,0.0


In [34]:
# Baseline PFS: per call name, sum `dur` per process, keep the largest per-process total, scale by 1e6.
# "sum" is given as a string alias rather than the builtin function.
(
    analyzer_base_pfs_posix_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__fxstat64,0.156578
__xstat64,0.005139
close,0.179031
lseek64,0.000629
mkdir,0.005487
open64,2.708844
write,69.628272


In [41]:
# Baseline PFS: median `size` per call name, scaled by 1024**2.
analyzer_base_pfs_posix_lustre_df.compute().groupby(["name"]).agg({"size": "median"}) / 1024 **2

Unnamed: 0_level_0,size
name,Unnamed: 1_level_1
__fxstat64,
__xstat64,
close,
lseek64,
mkdir,
open64,
write,0.000671


In [45]:
# SCR PFS single run: summary statistics of the `size` of individual write calls.
scr_pfs_single_posix_lustre_df.compute().query("name == 'write'")["size"].describe()

count            18936.0
mean     32706203.922634
std       4315202.438485
min                 28.0
25%           33554432.0
50%           33554432.0
75%           33554432.0
max           33554432.0
Name: size, dtype: double[pyarrow]

In [46]:
# Baseline PFS: summary statistics of the `size` of individual write calls.
analyzer_base_pfs_posix_lustre_df.compute().query("name == 'write'")["size"].describe()

count              3792.0
mean     163403490.240506
std      392716773.891337
min                  64.0
25%                 384.0
50%                 704.0
75%           129761280.0
max          1875123200.0
Name: size, dtype: double[pyarrow]

In [None]:
# Bare reference: displays the object's repr when the cell is run (no output was saved for this cell).
analyzer_scr_pfs_single_large

1. Change env SCR_FILE_BUF_SIZE

In [38]:
# Lustre-side POSIX events of the analyzer_scr_pfs_single_large run (frame returned by get_posix_pfs).
analyzer_scr_pfs_single_large_lustre_df = get_posix_pfs(analyzer_scr_pfs_single_large)
# Summary statistics of the `size` of individual write calls.
analyzer_scr_pfs_single_large_lustre_df.compute().query("name == 'write'")["size"].describe()

count              4478.0
mean     121846320.648727
std       36950676.885976
min                  13.0
25%           134217728.0
50%           134217728.0
75%           134217728.0
max           134217728.0
Name: size, dtype: double[pyarrow]

In [39]:
# analyzer_scr_pfs_single_large run: per call name, sum `dur` per process, keep the largest
# per-process total, scale by 1e6. "sum" is given as a string alias rather than the builtin function.
(
    analyzer_scr_pfs_single_large_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__lxstat,1.488029
__xstat,0.372969
__xstat64,0.003926
access,0.059052
chmod,1.334396
chown,1.930512
close,3.063757
fsync,18.598632
ftruncate,0.029949
lseek,6e-05


# The bandwidth is still very bad.

Fix MAX_THREADS in AXL, recompile, and run.

In [18]:

# Lustre-side POSIX events of the analyzer_scr_pfs_single_large_one_thread run (frame returned by get_posix_pfs).
analyzer_scr_pfs_single_large_one_thread_lustre_df = get_posix_pfs(analyzer_scr_pfs_single_large_one_thread)
# Summary statistics of the `size` of individual write calls.
analyzer_scr_pfs_single_large_one_thread_lustre_df.compute().query("name == 'write'")["size"].describe()

count              5266.0
mean     117665428.209077
std       41553463.989748
min                  28.0
25%           134217728.0
50%           134217728.0
75%           134217728.0
max           134217728.0
Name: size, dtype: double[pyarrow]

In [19]:
# analyzer_scr_pfs_single_large_one_thread run: per call name, sum `dur` per process, keep the largest
# per-process total, scale by 1e6. "sum" is given as a string alias rather than the builtin function.
(
    analyzer_scr_pfs_single_large_one_thread_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__lxstat,0.35929
__xstat,0.319354
__xstat64,0.001142
access,0.029981
chmod,0.703554
chown,0.750761
close,0.511923
fsync,5.289375
ftruncate,0.01644
lseek,6.6e-05


In [24]:

# Lustre-side POSIX events of the analyzer_scr_pfs_single_max_1_thread run (frame returned by get_posix_pfs).
analyzer_scr_pfs_single_max_1_thread_lustre_df = get_posix_pfs(analyzer_scr_pfs_single_max_1_thread)
# Summary statistics of the `size` of individual write calls.
analyzer_scr_pfs_single_max_1_thread_lustre_df.compute().query("name == 'write'")["size"].describe()

count              1098.0
mean     564322536.554645
std      784396934.569683
min                  13.0
25%               30827.0
50%           150733735.0
75%          1875123200.0
max          1875123200.0
Name: size, dtype: double[pyarrow]

In [27]:

# Same run: summary statistics of the `size` of individual read calls.
analyzer_scr_pfs_single_max_1_thread_lustre_df.compute().query("name == 'read'")["size"].describe()

count         194.0
mean     588.871134
std      904.400588
min             4.0
25%            20.0
50%            20.0
75%         1024.75
max          3136.0
Name: size, dtype: double[pyarrow]

In [None]:
# analyzer_scr_pfs_single_large run (repeated here for side-by-side reading): per call name, sum `dur`
# per process, keep the largest per-process total, scale by 1e6.
# "sum" is given as a string alias rather than the builtin function.
(
    analyzer_scr_pfs_single_large_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__lxstat,1.488029
__xstat,0.372969
__xstat64,0.003926
access,0.059052
chmod,1.334396
chown,1.930512
close,3.063757
fsync,18.598632
ftruncate,0.029949
lseek,6e-05


In [25]:
# analyzer_scr_pfs_single_max_1_thread run: per call name, sum `dur` per process, keep the largest
# per-process total, scale by 1e6. "sum" is given as a string alias rather than the builtin function.
(
    analyzer_scr_pfs_single_max_1_thread_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__lxstat,0.369047
__xstat,0.342443
__xstat64,0.000973
access,0.032467
chmod,0.670344
chown,0.812952
close,0.295399
fsync,4.21661
ftruncate,0.015716
lseek,6.4e-05


In [None]:
# Baseline PFS (repeated here for side-by-side reading): per call name, sum `dur` per process,
# keep the largest per-process total, scale by 1e6.
# "sum" is given as a string alias rather than the builtin function.
(
    analyzer_base_pfs_posix_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

Unnamed: 0_level_0,dur
name,Unnamed: 1_level_1
__fxstat64,0.156578
__xstat64,0.005139
close,0.179031
lseek64,0.000629
mkdir,0.005487
open64,2.708844
write,69.628272


In [None]:
# analyzer_scr_pfs_single_max_1_thread run (same aggregation as the earlier cell; no output saved here):
# per call name, sum `dur` per process, keep the largest per-process total, scale by 1e6.
# "sum" is given as a string alias rather than the builtin function.
(
    analyzer_scr_pfs_single_max_1_thread_lustre_df
    .groupby(["name", "pid"])
    .agg({"dur": "sum"})
    .groupby("name")
    .max()
    .compute()
    / 1e6
)

In [26]:
# Same run: total `size` over all write calls, scaled by 1024**3.
analyzer_scr_pfs_single_max_1_thread_lustre_df.compute().query("name == 'write'")["size"].sum() / 1024**3

577.071816788055

Ok got it down to 233 from 1095.066957 but the baseline is 73

## Things to Consider

Optimal tradeoff in checkpoint frequency vs cost. Theoretical. Insight for readers.

Flushing code to check the buffer size passed to read and write.
LDMS counters on Corona Kathleen.

Impact on SSD performance

Utilization and Power of the system

### Theoretical calculation.

Bandwidth per SSD = 2.5 GB/s <br>
Bandwidth Lustre file system per compute node = 4.8 GB/s <br>
Scaling SSD factor = 2 <br>
Scaling PFS factor = 1.6 <br>

1 node (277 GB)  <br>
PFS checkpointing = 277 / 4.8 = 57.7 seconds  <br>
SSD checkpointing = 277 / 2.5 = 110.8 seconds <br>
Async Flush to PFS checkpointing = read from SSD + write to PFS = 110.8 + 57.7 = 168.5 seconds <br>

2 node (554 GB)  <br>
PFS checkpointing = 554 / (4.8*1.6) = 72.1 seconds  <br>
SSD checkpointing = 554 / (2.5* 2) = 110.8 seconds <br>
Async Flush to PFS checkpointing = read from SSD + write to PFS = 110.8 + 72.1 = 182.9 seconds <br>

4 node (277 * 4 GB)  <br>
PFS checkpointing = 277 * 4 / (4.8 * 1.6^3) = 56.3 seconds  <br>
SSD checkpointing = 277 * 4 / (2.5 * 2^3) = 55.4 seconds <br>
Async Flush to PFS checkpointing = read from SSD + write to PFS = 55.4 + 56.3 = 111.7 seconds <br>

8 node (277 * 8 GB)  <br>
PFS checkpointing = 277 * 8 / (4.8 * 1.6^7) = 17.2 seconds  <br>
SSD checkpointing = 277 * 8 / (2.5 * 2^7) = 6.9 seconds <br>
Async Flush to PFS checkpointing = read from SSD + write to PFS = 6.9 + 17.2 = 24.1 seconds <br>





NO Impact on SSD Checkpointing

### Look at differences in write time and size

In [18]:
# Baseline PFS: POSIX write events whose filename matches the Lustre checkpoint directory.
write_base_pfs = analyzer_base_pfs.query("cat == 'POSIX' and name == 'write' and filename.str.contains('/p/lustre2/haridev/dlio/scr/checkpoints')")
# Per file: first sum `dur` and `size` per (file, pid); then take the largest per-process `dur`
# and the total `size` across processes.
# Aggregations are given as string aliases rather than the builtin `sum` / `max` functions.
file_base_pfs = (
    write_base_pfs
    .groupby(["filename", "pid"])
    .agg({"dur": "sum", "size": "sum"})
    .groupby(["filename"])
    .agg({"dur": "max", "size": "sum"})
)
# Most expensive files first.
file_base_pfs.sort_values("dur", ascending=False).compute()

Unnamed: 0_level_0,dur,size
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-100-4.pt,8463940,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-100-11.pt,7793586,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-100-12.pt,7782774,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-100-13.pt,7073267,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-500-15.pt,7043377,5625371495
...,...,...
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-300-7.pt,363,30827
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-400-2.pt,359,30827
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-600-9.pt,353,30827
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-400-15.pt,352,30827


In [19]:
# analyzer_scr_pfs_single_max_1_thread run: POSIX write events whose filename matches the Lustre checkpoint directory.
write_max_scr_pfs = analyzer_scr_pfs_single_max_1_thread.query("cat == 'POSIX' and name == 'write' and filename.str.contains('/p/lustre2/haridev/dlio/scr/checkpoints')")
# Per file: first sum `dur` and `size` per (file, pid); then take the largest per-process `dur`
# and the total `size` across processes.
# Aggregations are given as string aliases rather than the builtin `sum` / `max` functions.
file_max_scr_pfs = (
    write_max_scr_pfs
    .groupby(["filename", "pid"])
    .agg({"dur": "sum", "size": "sum"})
    .groupby(["filename"])
    .agg({"dur": "max", "size": "sum"})
)
# Most expensive files first.
file_max_scr_pfs.sort_values("dur", ascending=False).compute()

Unnamed: 0_level_0,dur,size
filename,Unnamed: 1_level_1,Unnamed: 2_level_1
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt,24088469,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-2.pt,23878186,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-3.pt,23764038,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-600-3.pt,23639717,5625371495
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-7.pt,23513002,5625371495
...,...,...
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/scr.dataset.67/rank2file,684,111
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-400-13.pt,668,30827
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-500-1.pt,602,30827
/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/model-1-300-6.pt,544,30827


## Optimizer files are ~3x more expensive to write with SCR.

In [23]:
# analyzer_scr_pfs_single_max_1_thread run: all events for one optimizer file, ordered by call name (descending).
analyzer_scr_pfs_single_max_1_thread.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt'").sort_values("name", ascending=False).compute()

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
9897,write,POSIX,1,3853615,830134069,838627427,8493358,,27.0,corona171,,8493358,,8493358,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
6147,write,POSIX,1,3853615,839300610,846203179,6902569,,27.0,corona171,,6902569,,6902569,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
13799,write,POSIX,1,3853615,846881772,855574310,8692538,,28.0,corona171,,8692538,,8692538,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
13802,write,POSIX,1,3853615,855574360,855574364,4,,28.0,corona171,,4,,4,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1895.0
14567,open,POSIX,1,3853615,825047710,825052965,5255,,27.0,corona171,,5255,,5255,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
363,fsync,POSIX,1,3853615,856027863,856042669,14806,,28.0,corona171,,14806,,14806,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
375,close,POSIX,1,3853615,856042685,856043777,1092,,28.0,corona171,,1092,,1092,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
6901,chown,POSIX,1,3852848,1001834334,1001839017,4683,,33.0,corona171,,4683,,4683,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
6899,chmod,POSIX,1,3852848,1001832367,1001834304,1937,,33.0,corona171,,1937,,1937,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
80,__xstat,POSIX,1,3852848,70675420,70676363,943,,2.0,corona171,,943,,943,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,


In [24]:
# Baseline PFS: all events for the same optimizer file, ordered by call name (descending).
analyzer_base_pfs.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt'").sort_values("name", ascending=False).compute()

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
7728,write,POSIX,1,2664211,646979271,646992079,12808,,21.0,corona171,,12808,,12808,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,704.0
4637,write,POSIX,1,2664211,646992100,648714292,1722192,,21.0,corona171,,1722192,,1722192,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1009254400.0
9428,write,POSIX,1,2664211,649337877,649337896,19,,21.0,corona171,,19,,19,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,64.0
14030,write,POSIX,1,2664211,649337911,650571690,1233779,,21.0,corona171,,1233779,,1233779,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1009254400.0
14031,write,POSIX,1,2664211,650839693,650839718,25,,21.0,corona171,,25,,25,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,64.0
14032,write,POSIX,1,2664211,650839761,651706031,866270,,21.0,corona171,,866270,,866270,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,865075200.0
14033,write,POSIX,1,2664211,656548294,656548320,26,,21.0,corona171,,26,,26,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,64.0
14034,write,POSIX,1,2664211,656548370,657338538,790168,,21.0,corona171,,790168,,790168,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,865075200.0
14035,write,POSIX,1,2664211,657924870,657926229,1359,,21.0,corona171,,1359,,1359,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1587392.0
14036,write,POSIX,1,2664211,657926282,659631724,1705442,,21.0,corona171,,1705442,,1705442,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0


In [27]:
# Baseline PFS: total `size` and `dur` over the write calls to that optimizer file.
analyzer_base_pfs.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt' and name == 'write'")[["size","dur"]].sum().compute()

size    5625371495
dur        6332092
dtype: uint64[pyarrow]

In [28]:
# analyzer_scr_pfs_single_max_1_thread run: total `size` and `dur` over the write calls to that optimizer file.
analyzer_scr_pfs_single_max_1_thread.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt' and name == 'write'")[["size","dur"]].sum().compute()

size    5625371495
dur       24088469
dtype: uint64[pyarrow]

## Check core affinity

In [31]:
# analyzer_scr_pfs_single_max_nc run: total `size` and `dur` over the write calls to the same optimizer file.
analyzer_scr_pfs_single_max_nc.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt' and name == 'write'")[["size","dur"]].sum().compute()

size    5625371495
dur       13399167
dtype: uint64[pyarrow]

In [38]:
# analyzer_scr_pfs_2_max_nc run: all events for the same optimizer file, ordered by call name (descending).
analyzer_scr_pfs_2_max_nc.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt'").sort_values("name", ascending=False).compute()

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
12550,write,POSIX,1,952133,725711508,729564000,3852492,,24.0,corona171,,3852492,,3852492,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
2489,write,POSIX,1,952133,729922050,733046138,3124088,,24.0,corona171,,3124088,,3124088,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
7182,write,POSIX,1,952133,733394194,739090644,5696450,,24.0,corona171,,5696450,,5696450,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200.0
7185,write,POSIX,1,952133,739090757,739090761,4,,24.0,corona171,,4,,4,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1895.0
14565,open,POSIX,1,952133,723314734,723323289,8555,,24.0,corona171,,8555,,8555,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
9854,fsync,POSIX,1,952133,739316847,739321320,4473,,24.0,corona171,,4473,,4473,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
9861,close,POSIX,1,952133,739321388,739322073,685,,24.0,corona171,,685,,685,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
6907,chown,POSIX,1,950547,869379583,869381848,2265,,28.0,corona171,,2265,,2265,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
6906,chmod,POSIX,1,950547,869377934,869379553,1619,,28.0,corona171,,1619,,1619,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,
80,__xstat,POSIX,1,950547,60347581,60348247,666,,2.0,corona171,,666,,666,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,


In [39]:
# analyzer_scr_pfs_single_max_nc run: total `size` and `dur` over the write calls to the same optimizer file
# (same query as the earlier analyzer_scr_pfs_single_max_nc cell).
analyzer_scr_pfs_single_max_nc.query("filename == '/p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/optimizer-1-400-1.pt' and name == 'write'")[["size","dur"]].sum().compute()

size    5625371495
dur       13399167
dtype: uint64[pyarrow]

## Current State

| Case            | CHKP (sec) | FLUSH    |
| --------------- | ------- | -------- |
| Base PFS        | 103     |          |
| Base SSD        | 126     |          |
| SCR PFS         | 378     | 1024     |
| SCR SSD         | 349     |          |
| SCR Single      | 167     | 1023     |
| SCR 1 Th        | 159     | 996      |
| SCR 2 Th        | 159     | 996      |
| SCR Max Buf     | 156     | 224      |
| SCR 1 Th Core   | 148     | 22       |
| SCR 2 Th Core   | 150     | 16       |
| SCR 16 Th Core  | 150     | 14       |


## Next Steps

1. Scale the Base PFS and SCR 16 Th Core
2. Identify Dynamic Intent-driven configurations for SCR.
   1. Use cases
      1. Megatron Deepspeed
      2. MuMMI workflow: Currently they do this synchronously and manually.
      3. Earthquake Model (SW4)  pankajakshan1@llnl.gov
      4. Marble (no access) and LiDO
   2. Plugin
      1. SCR need checkpoint API to tune checkpoint.
      2. int SCR_Start_output(char* name, int flags);

3. Dynamic recovery of diverged models to previous better checkpoints

## Types of optimizations
1. SCR Init: 
   1. Core affinity to allow Async Flushing
      1. Else configure PThreads synchronicity to match core affinity? Potentially configure SCR_CACHE_BYPASS, SCR_FLUSH_ASYNC
      2. **Intent 1** process-core mapping: Contains
         1. Total cores available on node.
         2. Cores used by workload. 
   2. Workload specific buffer size for transfers 
      1. **Intent 2** top 5 accesses based on distribution. Configure SCR_MPI_BUF_SIZE and SCR_FILE_BUF_SIZE
   3. Which Redundancy schemes to use? 
      1. **Question:** When to use which scheme? Potentially  **Intent 3**
         1. It depends on the fault model.
            1. Storage-node fault tolerance: don't use SINGLE. SINGLE has the lowest cost.
            2. BUDDY and DUO are the most expensive for I/O but the most fault tolerant, as you can tolerate half of the nodes going away.
            3. For both XOR and RS, larger sets require less storage, but they also increase the probability that a given set will suffer multiple failures simultaneously.
            4. Computationally, XOR is more expensive than Partner, but it requires less storage space. Whereas Partner must store two full dataset files, XOR stores one full dataset file plus one XOR parity segment
            5. Partner scheme is slower than Single, and it requires twice the storage space. However, it is capable of withstanding failures that disable a storage device.
            6. RS and XOR can be done outside the snapshot. (this would need SCR)
   4. **Intent 4** Clean start? SCR_CACHE_PURGE and SCR_DISTRIBUTE
   5. **Intent 5** Fault tolerance factor (number of redundancies needed ). SCR_SET_SIZE and/or SCR_SET_FAILURES
   6. **Intent 6: Workload level** Min and Max length of history maintained for the workload. Configure SCR_PREFIX_PURGE and SCR_PREFIX_SIZE
      1. **Question:** Can checkpoints have replace semantics? For example, keep only the latest 5. I would assume this would be a popular use case. Is this what SCR_PREFIX_SIZE controls?
   7. **Intent 7** System level intent to describe the number of targets available on a system
      1. Configure lustre stripe count and SCR_FLUSH_WIDTH
2. SCR Checkpoint Start Output
   1. Output specific buffer size. Output scoped **Intent 2** 
   2. Flushing rate vs checkpointing rate.
      1. Monitor rates to dynamically tune threads, checkpoint frequency (SCR_FLUSH, SCR_CHECKPOINT_INTERVAL), cache size (SCR_CACHE_SIZE)
      2. **Intent 8** Min and Max number of checkpoints required for the workload.
   3. **Intent 9**: Current scope distribution of buffer sizes could drive the stripe size and count for next checkpoint 
   4. **Question**: What is the purpose of SCR_COPY_TYPE=FILE or the checkpoint descriptor?
      1. It just means read from a configuration file. (MAYBE not useful)
   5. **Question**: What is the purpose of SCR_GROUP? (Maybe not user-driven.)
      1. It describes which nodes belong to which group, i.e., whether they belong to the same failure domain.
      2. By default, each compute node is a group.
      3. Power is shared between two nodes. 
3.  Inter Checkpoint
    1.  **Intent 10** Checkpoint rate. We can track the distribution of the time between two consecutive checkpoints to configure SCR_CHECKPOINT_SECONDS.
        1.  What the user expects. There should be a checkpoint every hour. 
        2.  MAYBE does not do anything right now.
    2.  **Question**: How to configure or use SCR_CHECKPOINT_OVERHEAD?
        1.  Maybe part of flush routine. 
        2.  Look at `SCR_Need_checkpoint` and figure out what these variables (SCR_CHECKPOINT_OVERHEAD, SCR_CHECKPOINT_SECONDS) do.
4.  SCR Finalize
    1.  **Intent 11** Whether the workload is a workflow or has multiple apps. Configure SCR_FLUSH_POSTSTAGE

### Main Idea: Explore runtime I/O intents and use them to configure SCR.
1. Capture outputs and maintain profiling statistics: use the DFTracer API and push events to a profiling service for aggregation of larger-than-process-scoped intents
   1. Profiling statistics at different levels
      1. In situ or process-scoped intents (process local, checkpointing output)
      2. distributed nodes, node-local
   2. Extract intents at different levels and communicate with workload for node-level and cluster level intents.
2. In the workloads, use intents to map to configurations.