In [1]:
import json
import os
import yaml
from pathlib import Path
from dask.distributed import Client

In [2]:
use_local=True

In [3]:
if not use_local:
    with open(f'~/.dlio_profiler/configuration.yaml', 'r') as file:
        dlp_yaml = yaml.safe_load(file)
        app_root = dlp_yaml["app"]
else:
    app_root = str(Path(os.getcwd()).parent.parent)

In [4]:
import sys
sys.path.insert(0, app_root)
sys.path

['/usr/WS2/haridev',
 '/usr/WS2/haridev/scr-dlio/examples',
 '/usr/workspace/iopp/projects/digio',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python39.zip',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9/lib-dynload',
 '',
 '/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages']

In [5]:
import dlp_analyzer
print(dlp_analyzer.__file__)
from dlp_analyzer.main import DLPAnalyzer,get_dlp_configuration,update_dlp_configuration,setup_logging,setup_dask_cluster, reset_dask_cluster, get_dlp_configuration


/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/__init__.py


In [6]:
if not use_local:
    dask_run_dir = os.path.join(app_root, "dlp_analyzer", "dask", "run_dir")
    with open (os.path.join(dask_run_dir, f"scheduler_{os.getenv('USER')}.json"), "r") as f:
        dask_scheduler = json.load(f)["address"]
else:
    dask_scheduler = None

In [7]:
app_name = "dlio_scr" # dlio dlio_scr

In [8]:
def get_conditions(json_object):
    app_io_cond = "reader" in json_object["cat"] or "checkpoint" in json_object["cat"]
    compute_cond = "compute" in json_object["name"] # Cosmoflow
    io_cond = "POSIX" == json_object["cat"] # Cosmoflow
    return app_io_cond, compute_cond, io_cond

In [9]:
condition_fn = None #
if app_name == "dlio":
    filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_base/trace*.pfw.gz"
    condition_fn = get_conditions
elif app_name == "dlio_scr":
    filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_scr/trace*.pfw.gz"
    filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_scr_finer/trace*.pfw.gz"
    condition_fn = get_conditions

In [10]:
conf = update_dlp_configuration(dask_scheduler=dask_scheduler, verbose=True, workers=64,
                                log_file=f"./dlp_{os.getenv('USER')}.log", rebuild_index=False, time_approximate=True, 
                                host_pattern=r'lassen(\d+)', time_granularity=30e6, skip_hostname=True, conditions=condition_fn)

In [11]:
setup_logging()

In [12]:
setup_dask_cluster()

[INFO] [22:10:25] Initialized Client with 64 workers and link http://127.0.0.1:8787/status [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:668]


In [13]:
analyzer = DLPAnalyzer(filename)

[INFO] [22:10:26] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [22:10:26] Total size of all files are <dask.bag.core.Item object at 0x1554a9b1f4f0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [22:10:28] Loading 9132 batches out of 16 files and has 149432067 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [22:14:36] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [22:14:36] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [14]:
items = analyzer.summary()
items

[INFO] [22:00:42] Total number of events in the workload are 149432035 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:520]
[INFO] [22:00:46] Approximate True 1563728774, 1015223244.0, 19695081.0, 2740876489.0,                995528163.0, 0.0, 2721181408.0, 0.0 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:474]


In [15]:
checkpoint_events = analyzer.events.query("name.str.contains('checkpoint')").compute()
checkpoint_events

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
12252,checkpoint_start_1_100,checkpoint,0,2363474,345336825,345341531,4706,,11.0,corona171,,,4706,4706,,3,
3112,checkpoint_end_1_100,checkpoint,0,2363474,368831508,401679869,32848361,,12.0,corona171,,,32848361,32848361,,3,
3113,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,2363474,345336790,401679961,56343171,,11.0,corona171,,,56343171,56343171,,3,
1292,checkpoint_start_1_200,checkpoint,0,2363474,553776958,553809392,32434,,18.0,corona171,,,32434,32434,,3,
11938,checkpoint_end_1_200,checkpoint,0,2363474,577214697,612213782,34999085,,19.0,corona171,,,34999085,34999085,,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12202,checkpoint_end_1_500,checkpoint,9,1883505,1212240498,1266736860,54496362,,40.0,corona173,,,54496362,54496362,,3,
12203,SCRPyTorchCheckpointing.checkpoint,checkpoint,9,1883505,1198878023,1266736974,67858951,,39.0,corona173,,,67858951,67858951,,3,
9355,checkpoint_start_1_600,checkpoint,9,1883505,1419930384,1419953645,23261,,47.0,corona173,,,23261,23261,,3,
4584,checkpoint_end_1_600,checkpoint,9,1883505,1433499678,1487615505,54115827,,47.0,corona173,,,54115827,54115827,,3,


In [28]:
checkpoint_events.query("name.str.contains('checkpoint_end')")["dur"].sum() / 1e6

4665.911755

In [29]:
checkpoint_events.query("name.str.contains('SCRPyTorchCheckpointing.checkpoint')")["dur"].sum() / 1e6

6049.440787

In [42]:
app_time = analyzer.events.query("name == 'SCRPyTorchCheckpointing.checkpoint'").compute()
app_time_files = app_time.groupby(["trange","pid"]).agg({"dur":sum}).\
                  groupby(["trange"]).agg({"dur":max}).sum()
app_time_files / 1e6

dur    378.282453
dtype: double[pyarrow]

In [15]:
posix_ssd_time = analyzer.events.query("cat == 'POSIX' and filename.str.contains('ssd')").compute()
posix_ssd_time_files = posix_ssd_time.groupby(["trange","pid","tid"]).agg({"dur":sum}).\
                  groupby(["trange"]).agg({"dur":max}).sum()
posix_ssd_time_files / 1e6

dur    419.682159
dtype: double[pyarrow]

In [41]:
posix_lustre_time = analyzer.events.query("cat == 'POSIX' and filename.str.contains('/p/lustre2/haridev/dlio/scr/checkpoints')").compute()
posix_lustre_time_files = posix_lustre_time.groupby(["trange","pid"]).agg({"dur":sum}).\
                  groupby(["trange"]).agg({"dur":max}).sum()
posix_lustre_time_files / 1e6

dur    1483.017034
dtype: double[pyarrow]

In [34]:
posix_lustre_time = analyzer.events.query("pid == 0 and cat == 'POSIX' and filename.str.contains('/p/lustre2/haridev/dlio/scr/checkpoints')").compute()
posix_lustre_time_files = posix_lustre_time["dur"].sum()
posix_lustre_time_files / 1e6

1471.540838

In [35]:
posix_lustre_time_files = posix_lustre_time["size"].sum()
posix_lustre_time_files / 1024**3

79.91119105741382

In [37]:
posix_lustre_time.query("name == 'read'")[["size"]].sum() / 1024**3

size    11.41595
dtype: double[pyarrow]

In [47]:
import pandas as pd
pd.set_option('display.max_colwidth',1000)
posix_lustre_time.query("name == 'write' and not filename.str.contains('pt')")[["name","pid","dur","filename","size"]].sort_values("dur", ascending=False)["dur"].sum() / 1e6

1.240783

In [48]:

posix_lustre_time.query("name == 'write' and filename.str.contains('pt')")[["name","pid","dur","filename","size"]].sort_values("dur", ascending=False)["dur"].sum()/ 1e6

4199.216728

In [27]:
posix_lustre_time.query("name == 'write'")[["size"]].sum() / 1024**3

size    68.495242
dtype: double[pyarrow]

In [None]:
app_time

In [None]:
app_time = analyzer.events.query("name == 'SCRPyTorchCheckpointing.checkpoint'").compute()
app_time_files = app_time.groupby(["trange","pid","tid"]).agg({"dur":sum}).\
                  groupby(["trange"]).agg({"dur":max}).sum()
app_time_files / 1e6

In [26]:
checkpoint_events = analyzer.events.query("name == 'write' and filename.str.contains('ssd')")
checkpoint_events = checkpoint_events.compute()
checkpoint_events["dur"].sum() / 1e6

1893.115273

In [30]:
checkpoint_events = analyzer.events.query("name == 'write' and filename.str.contains('lustre')")
checkpoint_events = checkpoint_events.compute()
checkpoint_events["dur"].sum() / 1e6

4200.457511

In [38]:
import pandas
pandas.options.display.max_columns = 2000
pandas.set_option('display.max_colwidth',1000)
analyzer.events.query("name == 'write'")["filename"].unique().compute()

0                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/nodes.scr
1                                                                   /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/halt.scr
2                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/flush.scr
3                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/index.scr
4                                            /l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.20/model-1-600-0.pt
                                                                             ...                                                                       
2165    /l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.d

In [32]:
checkpoint_files = checkpoint_events.groupby(["filename","trange","pid","tid"]).agg({"dur":sum}).\
                  groupby(["filename","trange"]).agg({"dur":max}).\
                  groupby(["filename"]).agg({"dur":sum})

In [40]:
checkpoint_files.min(), checkpoint_files.max() / 1e6, checkpoint_files.sum() / 1e6, checkpoint_files.mean() / 1e6, checkpoint_files.count()

(dur    5
 dtype: uint64[pyarrow],
 dur    16.372719
 dtype: double[pyarrow],
 dur    1428.036601
 dtype: double[pyarrow],
 dur    1.000026
 dtype: double[pyarrow],
 dur    1428
 dtype: int64)

In [None]:
(dur    347
 dtype: uint64[pyarrow],
 dur    8463940
 dtype: uint64[pyarrow],
 dur    965268.883333
 dtype: double[pyarrow])