In [1]:
import json
import os
import yaml
from pathlib import Path
from dask.distributed import Client

In [2]:
# True: derive app_root from the current working directory and leave dask_scheduler as None.
# False: read app_root from the dlio_profiler YAML configuration and the scheduler address
# from the per-user scheduler JSON file.
use_local = True

In [3]:
# Resolve the application root that the next cell prepends to sys.path.
if not use_local:
    # open() does not expand "~", so the home directory has to be resolved explicitly;
    # otherwise the path is looked up relative to a literal "~" directory.
    config_path = os.path.expanduser("~/.dlio_profiler/configuration.yaml")
    with open(config_path, "r") as file:
        dlp_yaml = yaml.safe_load(file)
    # The "app" entry of the configuration holds the application root.
    app_root = dlp_yaml["app"]
else:
    # Local mode: the root is two directory levels above the current working directory.
    app_root = str(Path(os.getcwd()).parent.parent)

In [4]:
import sys
# Put app_root at the front of the module search path so it is consulted before other entries.
sys.path.insert(0, app_root)
# Display the resulting search path for inspection.
sys.path

['/usr/WS2/haridev',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python39.zip',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9/lib-dynload',
 '',
 '/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages']

In [5]:
import dlp_analyzer
# Show which dlp_analyzer installation was actually picked up from sys.path.
print(dlp_analyzer.__file__)
# get_dlp_configuration used to be listed twice in this import; each name now appears once.
from dlp_analyzer.main import (
    DLPAnalyzer,
    get_dlp_configuration,
    reset_dask_cluster,
    setup_dask_cluster,
    setup_logging,
    update_dlp_configuration,
)


/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/__init__.py


In [6]:
# Choose the Dask scheduler address handed to update_dlp_configuration.
if use_local:
    # Local mode: no external scheduler address.
    dask_scheduler = None
else:
    # Remote mode: read the address from the per-user scheduler file under app_root.
    dask_run_dir = os.path.join(app_root, "dlp_analyzer", "dask", "run_dir")
    scheduler_file = os.path.join(dask_run_dir, f"scheduler_{os.getenv('USER')}.json")
    with open(scheduler_file, "r") as f:
        dask_scheduler = json.load(f)["address"]

In [8]:
app_name = "dlio_scr" # selects the trace set below; handled values are "dlio" and "dlio_scr"

In [9]:
def get_conditions(json_object):
    """Derive three boolean flags from a trace event.

    Args:
        json_object: Event record indexed by string keys; must provide
            "cat" and "name".

    Returns:
        A tuple ``(app_io_cond, compute_cond, io_cond)`` where
        ``app_io_cond`` is True when "cat" contains "reader" or "checkpoint",
        ``compute_cond`` is True when "name" contains "compute", and
        ``io_cond`` is True when "cat" equals "POSIX".

    Raises:
        KeyError: If "cat" or "name" is missing from ``json_object``.
    """
    category = json_object["cat"]
    is_app_io = any(marker in category for marker in ("reader", "checkpoint"))
    is_compute = "compute" in json_object["name"]  # Cosmoflow
    is_posix_io = category == "POSIX"  # Cosmoflow
    return is_app_io, is_compute, is_posix_io

In [10]:
# Trace-file glob for each supported app_name.
trace_globs = {
    "dlio": "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_base/trace*.pfw.gz",
    "dlio_scr": "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_scr/trace*.pfw.gz",
}
if app_name not in trace_globs:
    # Fail here with a clear message instead of a NameError on `filename`
    # (or a stale value from an earlier run) when the analyzer is created.
    raise ValueError(
        f"Unknown app_name {app_name!r}; expected one of {sorted(trace_globs)}"
    )
filename = trace_globs[app_name]
# Every supported application uses the same event-classification function.
condition_fn = get_conditions

In [11]:
# Register the analyzer configuration; one keyword per line for easier tuning.
# NOTE(review): host_pattern targets 'lassen' hosts while the event tables further down
# report 'corona' hostnames; presumably harmless because skip_hostname=True, please confirm.
conf = update_dlp_configuration(
    dask_scheduler=dask_scheduler,
    verbose=True,
    workers=64,
    log_file=f"./dlp_{os.getenv('USER')}.log",
    rebuild_index=False,
    time_approximate=True,
    host_pattern=r'lassen(\d+)',
    time_granularity=30e6,
    skip_hostname=True,
    conditions=condition_fn,
)

In [12]:
# Set up logging. Called without arguments, so it presumably picks up the settings
# registered through update_dlp_configuration (e.g. log_file); confirm in dlp_analyzer.main.
setup_logging()

In [13]:
# Set up the Dask cluster; the log line below reports the client link and worker count.
setup_dask_cluster()

[INFO] [16:07:40] Initialized Client with 64 workers and link http://127.0.0.1:45227/status [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:668]


In [14]:
# Build the analyzer for the trace files selected by the `filename` glob.
analyzer = DLPAnalyzer(filename)

[INFO] [16:08:22] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [16:08:22] Total size of all files are <dask.bag.core.Item object at 0x1553df7cafd0> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [16:08:23] Loading 9131 batches out of 16 files and has 149399610 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [16:11:26] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [16:11:26] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [15]:
# Compute the workload summary and show it as the cell result.
items = analyzer.summary()
items

[INFO] [16:11:26] Total number of events in the workload are 149399578 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:520]
[INFO] [16:11:28] Approximate True 1265128597, 721835447.0, 19822127.0, 1804425785.0,                702013320.0, 0.0, 1784603658.0, 0.0 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:474]


In [26]:
# Select the events named 'SCRPyTorchCheckpointing.checkpoint' and compute them eagerly.
app_time = analyzer.events.query("name == 'SCRPyTorchCheckpointing.checkpoint'").compute()
# Sum durations per (trange, pid, tid), keep the largest total within each trange,
# then add those maxima up across all tranges.
# Aggregations are named by string: passing the builtins sum/max to agg is deprecated
# in recent pandas and emits a FutureWarning.
app_time_files = (
    app_time.groupby(["trange", "pid", "tid"]).agg({"dur": "sum"})
    .groupby(["trange"]).agg({"dur": "max"})
    .sum()
)
# Scale by 1e6 for display (presumably microseconds to seconds; confirm the trace time unit).
app_time_files / 1e6

dur    324.336308
dtype: double[pyarrow]

In [24]:
# Display the checkpoint events held in app_time.
app_time

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
3106,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,227602,312908612,367925208,55016596,,10.0,corona174,,,55016596,55016596,,3,
11975,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,227602,482741178,537330743,54589565,,16.0,corona174,,,54589565,54589565,,3,
4355,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,227602,652763690,709546747,56783057,,21.0,corona174,,,56783057,56783057,,3,
13133,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,227602,823810123,875827183,52017060,,27.0,corona174,,,52017060,52017060,,3,
5372,SCRPyTorchCheckpointing.checkpoint,checkpoint,0,227602,991188713,1044800768,53612055,,33.0,corona174,,,53612055,53612055,,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2290,SCRPyTorchCheckpointing.checkpoint,checkpoint,9,229479,482739261,537289202,54549941,,16.0,corona175,,,54549941,54549941,,3,
11054,SCRPyTorchCheckpointing.checkpoint,checkpoint,9,229479,652761857,709521177,56759320,,21.0,corona175,,,56759320,56759320,,3,
3436,SCRPyTorchCheckpointing.checkpoint,checkpoint,9,229479,823795234,875827179,52031945,,27.0,corona175,,,52031945,52031945,,3,
12206,SCRPyTorchCheckpointing.checkpoint,checkpoint,9,229479,991186879,1044844242,53657363,,33.0,corona175,,,53657363,53657363,,3,


In [31]:
# 'write' events whose filename contains 'ssd', computed eagerly and shown as the cell result.
checkpoint_events = (
    analyzer.events
    .query("name == 'write' and filename.str.contains('ssd')")
    .compute()
)
checkpoint_events

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
234,write,POSIX,0,227602,58772234,58772260,26,,1.0,corona174,,26,,26,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,30827
242,write,POSIX,0,227602,58840187,58854589,14402,,1.0,corona174,,14402,,14402,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,33554432
244,write,POSIX,0,227602,58887847,58902143,14296,,1.0,corona174,,14296,,14296,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,33554432
246,write,POSIX,0,227602,58935762,58950980,15218,,1.0,corona174,,15218,,15218,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,33554432
248,write,POSIX,0,227602,58984651,58998509,13858,,1.0,corona174,,13858,,13858,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,33554432
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4502,write,POSIX,9,229479,1195227241,1195227531,290,,39.0,corona175,,290,,290,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,1048576
4505,write,POSIX,9,229479,1195228316,1195228581,265,,39.0,corona175,,265,,265,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,1048576
4508,write,POSIX,9,229479,1195229490,1195229784,294,,39.0,corona175,,294,,294,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,1048576
4511,write,POSIX,9,229479,1195230641,1195230908,267,,39.0,corona175,,267,,267,/l/ssd/haridev/scr/checkpoints/scr_megatron_de...,2,1048576


In [38]:
import pandas
# Raise the pandas display limits so wide frames and long file paths are not truncated.
pandas.set_option('display.max_columns', 2000)
pandas.set_option('display.max_colwidth', 1000)
# List the distinct filenames that appear in 'write' events.
write_events = analyzer.events.query("name == 'write'")
write_events["filename"].unique().compute()

0                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/nodes.scr
1                                                                   /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/halt.scr
2                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/flush.scr
3                                                                  /p/lustre2/haridev/dlio/scr/checkpoints/scr_megatron_deepspeed_medium/.scr/index.scr
4                                            /l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.dataset.20/model-1-600-0.pt
                                                                             ...                                                                       
2165    /l/ssd/haridev/scr/checkpoints/scr_megatron_deepspeed/haridev/scr.defjobid/scr.d

In [32]:
# Per-file write time: sum durations per (filename, trange, pid, tid), keep the largest
# total within each (filename, trange), then sum those maxima per filename.
# Aggregations are named by string: passing the builtins sum/max to agg is deprecated
# in recent pandas and emits a FutureWarning.
checkpoint_files = (
    checkpoint_events.groupby(["filename", "trange", "pid", "tid"]).agg({"dur": "sum"})
    .groupby(["filename", "trange"]).agg({"dur": "max"})
    .groupby(["filename"]).agg({"dur": "sum"})
)

In [40]:
# Summary statistics over the per-file durations: min, max, total, mean, and count.
# NOTE(review): min is shown in raw units while max/sum/mean are divided by 1e6;
# confirm whether that asymmetry is intended.
(
    checkpoint_files.min(),
    checkpoint_files.max() / 1e6,
    checkpoint_files.sum() / 1e6,
    checkpoint_files.mean() / 1e6,
    checkpoint_files.count(),
)

(dur    5
 dtype: uint64[pyarrow],
 dur    16.372719
 dtype: double[pyarrow],
 dur    1428.036601
 dtype: double[pyarrow],
 dur    1.000026
 dtype: double[pyarrow],
 dur    1428
 dtype: int64)

In [None]:
(dur    347
 dtype: uint64[pyarrow],
 dur    8463940
 dtype: uint64[pyarrow],
 dur    965268.883333
 dtype: double[pyarrow])