In [1]:
import json
import os
import yaml
from pathlib import Path
from dask.distributed import Client

In [2]:
# Toggle: True derives app_root from the working directory and runs without an
# external Dask scheduler; False reads both from the user's configuration files.
use_local = True

In [3]:
# Resolve the application root that is later placed on sys.path.
if use_local:
    # Local mode: the root is two directory levels above the current working directory.
    app_root = str(Path(os.getcwd()).parent.parent)
else:
    # open() does not expand "~" on its own, so the home directory has to be
    # resolved explicitly; otherwise this branch fails with FileNotFoundError.
    with open(os.path.expanduser('~/.dlio_profiler/configuration.yaml'), 'r') as file:
        dlp_yaml = yaml.safe_load(file)
    # The "app" key of the YAML configuration holds the application root.
    app_root = dlp_yaml["app"]

In [4]:
import sys

# Give app_root top import priority. The guard keeps the cell idempotent:
# re-running it no longer stacks duplicate entries at the front of sys.path.
if sys.path[:1] != [app_root]:
    sys.path.insert(0, app_root)
sys.path

['/usr/WS2/haridev',
 '/usr/WS2/haridev/scr-dlio/examples',
 '/usr/workspace/iopp/projects/digio',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python39.zip',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9',
 '/collab/usr/gapps/python/build/spack-toss4.1/var/spack/environments/python/._view/75prb56irmif5ejtirjthpx6kq3gqo52/lib/python3.9/lib-dynload',
 '',
 '/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages']

In [5]:
import dlp_analyzer
# Show which installation of dlp_analyzer was actually resolved from sys.path.
print(dlp_analyzer.__file__)
# Each helper is imported exactly once (get_dlp_configuration used to be listed twice).
from dlp_analyzer.main import (
    DLPAnalyzer,
    get_dlp_configuration,
    reset_dask_cluster,
    setup_dask_cluster,
    setup_logging,
    update_dlp_configuration,
)


/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/__init__.py


In [6]:
# Decide which Dask scheduler address to hand to the analyzer configuration.
if use_local:
    # Local mode passes no scheduler address.
    dask_scheduler = None
else:
    # Otherwise the address is read from the per-user scheduler JSON file
    # kept under <app_root>/dlp_analyzer/dask/run_dir.
    dask_run_dir = os.path.join(app_root, "dlp_analyzer", "dask", "run_dir")
    with open(
        os.path.join(dask_run_dir, f"scheduler_{os.getenv('USER')}.json"),
        "r",
    ) as f:
        dask_scheduler = json.load(f)["address"]

In [7]:
# Which run to analyse; the values handled further down are "dlio" and "dlio_scr".
app_name = "dlio"

In [8]:
def get_conditions(json_object):
    """Classify one trace event by its "cat" and "name" fields.

    Args:
        json_object: mapping that provides the "cat" and "name" entries of an event.

    Returns:
        A tuple ``(app_io_cond, compute_cond, io_cond)`` of booleans:
        ``app_io_cond`` is True when "cat" contains "reader" or "checkpoint",
        ``compute_cond`` is True when "name" contains "compute",
        ``io_cond`` is True when "cat" is exactly "POSIX".

    Raises:
        KeyError: if "cat" or "name" is missing from ``json_object``.
    """
    category = json_object["cat"]
    app_io_cond = any(marker in category for marker in ("reader", "checkpoint"))
    compute_cond = "compute" in json_object["name"]
    io_cond = category == "POSIX"
    return app_io_cond, compute_cond, io_cond

In [9]:
# Select the trace files (glob pattern) and the event-classification callback
# for the run named by app_name.
condition_fn = None
if app_name == "dlio":
    # Alternative baseline trace set (previously assigned and immediately overwritten):
    #   /g/g92/haridev/projects/scr-dlio/logs/n2_p8_base/trace*.pfw.gz
    filename = "/usr/WS2/haridev/scr-dlio/scripts/hydra_log/megatron_deepspeed/2024-03-21-23-06-04/trace*.pfw.gz" #base_ssd
    condition_fn = get_conditions
elif app_name == "dlio_scr":
    filename = "/g/g92/haridev/projects/scr-dlio/logs/n2_p8_scr/trace*.pfw.gz"
    condition_fn = get_conditions
else:
    # Fail here rather than with a NameError on `filename` in a later cell.
    raise ValueError(f"Unknown app_name {app_name!r}; expected 'dlio' or 'dlio_scr'")

In [10]:
# Push the analysis settings into dlp_analyzer's configuration; the returned
# object is kept in `conf`.
# NOTE(review): host_pattern targets "lassen" while the recorded events carry
# "corona" hostnames; possibly irrelevant with skip_hostname=True — confirm.
conf = update_dlp_configuration(
    dask_scheduler=dask_scheduler,
    verbose=True,
    workers=64,
    log_file=f"./dlp_{os.getenv('USER')}.log",
    rebuild_index=False,
    time_approximate=True,
    host_pattern=r'lassen(\d+)',
    time_granularity=30e6,
    skip_hostname=True,
    conditions=condition_fn,
)

In [11]:
# Initialise logging through dlp_analyzer's helper (called without arguments;
# presumably it picks up the settings stored by update_dlp_configuration — confirm).
setup_logging()

In [12]:
# Bring up the Dask client through dlp_analyzer's helper; the recorded run
# logged a Client initialised with 64 workers.
setup_dask_cluster()

[INFO] [23:34:35] Initialized Client with 64 workers and link http://127.0.0.1:33691/status [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:668]


In [13]:
# Build the analyzer over the trace files matched by `filename`. This is the
# expensive step: the recorded run indexed 16 files and needed several minutes
# before the events were reported as loaded.
analyzer = DLPAnalyzer(filename)

[INFO] [23:35:24] Created index for 16 files [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:365]
[INFO] [23:35:24] Total size of all files are <dask.bag.core.Item object at 0x1554aa953430> bytes [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:367]
[INFO] [23:35:26] Loading 8976 batches out of 16 files and has 146988160 lines overall [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:380]
[INFO] [23:39:51] Loaded events [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:422]
[INFO] [23:39:51] Loaded plots with slope threshold: 45 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:428]


In [14]:
# Run the analyzer's workload summary, keep its return value in `items`,
# and echo it as the cell result.
items = analyzer.summary()
items

[INFO] [23:39:52] Total number of events in the workload are 146988128 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:520]
[INFO] [23:39:55] Approximate True 1053490919, 578312926.0, 19345779.0, 1643333036.0,                558967147.0, 0.0, 1623987257.0, 0.0 [/usr/workspace/haridev/scr-dlio/venv/lib/python3.9/site-packages/dlp_analyzer/main.py:474]


In [15]:
# Materialise every event whose name contains "checkpoint".
checkpoint_events = (
    analyzer.events
    .query("name.str.contains('checkpoint')")
    .compute()
)
checkpoint_events

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
5025,PyTorchCheckpointing.checkpoint,checkpoint,0,3087224,263431381,284667111,21235730,,8.0,corona171,,,21235730,21235730,,3,
14086,PyTorchCheckpointing.checkpoint,checkpoint,0,3087224,407680258,428585959,20905701,,13.0,corona171,,,20905701,20905701,,3,
6763,PyTorchCheckpointing.checkpoint,checkpoint,0,3087224,551051497,571609525,20558028,,18.0,corona171,,,20558028,20558028,,3,
15824,PyTorchCheckpointing.checkpoint,checkpoint,0,3087224,695603022,716841668,21238646,,23.0,corona171,,,21238646,21238646,,3,
8501,PyTorchCheckpointing.checkpoint,checkpoint,0,3087224,839879584,861462518,21582934,,27.0,corona171,,,21582934,21582934,,3,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13206,PyTorchCheckpointing.checkpoint,checkpoint,9,2086952,407679190,419959775,12280585,,13.0,corona173,,,12280585,12280585,,3,
5443,PyTorchCheckpointing.checkpoint,checkpoint,9,2086952,551056177,563706609,12650432,,18.0,corona173,,,12650432,12650432,,3,
14064,PyTorchCheckpointing.checkpoint,checkpoint,9,2086952,695612185,708282008,12669823,,23.0,corona173,,,12669823,12669823,,3,
6301,PyTorchCheckpointing.checkpoint,checkpoint,9,2086952,839885189,852444819,12559630,,27.0,corona173,,,12559630,12559630,,3,


In [17]:
# Total POSIX "dur" for pid 0 on files whose path contains "ssd", divided by 1e6.
# NOTE(review): the result is stored under the posix_lustre_* names even though
# the filter selects "ssd" paths; the names are kept as-is for other cells.
posix_lustre_time = analyzer.events.query(
    "pid == 0 and cat == 'POSIX' and filename.str.contains('ssd')"
).compute()
posix_lustre_time_files = posix_lustre_time["dur"].sum()
posix_lustre_time_files / 1e6

97.497736

In [16]:
# Total POSIX "dur" for pid 0 on files under the Lustre checkpoint directory,
# divided by 1e6.
posix_lustre_time = analyzer.events.query(
    "pid == 0 and cat == 'POSIX' and filename.str.contains('/p/lustre2/haridev/dlio/scr/checkpoints')"
).compute()
posix_lustre_time_files = posix_lustre_time["dur"].sum()
posix_lustre_time_files / 1e6

69.864806

In [21]:
# POSIX events on files whose path contains "ssd", across all pids.
posix_ssd_time = analyzer.events.query("cat == 'POSIX' and filename.str.contains('ssd')").compute()
# 1) sum "dur" per (trange, pid); 2) keep the largest per-pid total within each
# trange; 3) add those maxima over all tranges.
# Aggregations are named by string: passing the builtins sum/max to .agg is
# deprecated in recent pandas and emits a FutureWarning.
posix_ssd_time_files = (
    posix_ssd_time
    .groupby(["trange", "pid"]).agg({"dur": "sum"})
    .groupby(["trange"]).agg({"dur": "max"})
    .sum()
)
posix_ssd_time_files / 1e6

dur    103.825482
dtype: double[pyarrow]

In [19]:
# Application-level checkpoint events (exact name match), across all pids.
app_time = analyzer.events.query("name == 'PyTorchCheckpointing.checkpoint'").compute()
# 1) sum "dur" per (trange, pid); 2) keep the largest per-pid total within each
# trange; 3) add those maxima over all tranges.
# Aggregations are named by string: passing the builtins sum/max to .agg is
# deprecated in recent pandas and emits a FutureWarning.
app_time_files = (
    app_time
    .groupby(["trange", "pid"]).agg({"dur": "sum"})
    .groupby(["trange"]).agg({"dur": "max"})
    .sum()
)
app_time_files / 1e6

dur    127.709373
dtype: double[pyarrow]

In [35]:
# Materialise every event named exactly "write".
# NOTE(review): this rebinds `checkpoint_events`, which an earlier cell used for
# events whose name contains "checkpoint"; from here on it holds write events.
checkpoint_events = (
    analyzer.events
    .query("name == 'write'")
    .compute()
)
checkpoint_events

Unnamed: 0,name,cat,pid,tid,ts,te,dur,tinterval,trange,hostname,compute_time,io_time,app_io_time,total_time,filename,phase,size
12910,write,POSIX,0,2664209,250005384,250225091,219707,,8.0,corona171,,219707,,219707,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,30827
1943,write,POSIX,0,2664209,250852746,250865085,12339,,8.0,corona171,,12339,,12339,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,704
15516,write,POSIX,0,2664209,250865108,252595636,1730528,,8.0,corona171,,1730528,,1730528,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1009254400
4568,write,POSIX,0,2664209,253668019,253668037,18,,8.0,corona171,,18,,18,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,64
4569,write,POSIX,0,2664209,253668074,254603165,935091,,8.0,corona171,,935091,,935091,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1009254400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14879,write,POSIX,9,314836,929319731,929319758,27,,30.0,corona173,,27,,27,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,64
14880,write,POSIX,9,314836,929319806,930213492,893686,,30.0,corona173,,893686,,893686,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,865075200
14881,write,POSIX,9,314836,930797822,930799120,1298,,31.0,corona173,,1298,,1298,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1587392
14882,write,POSIX,9,314836,930799172,932698114,1898942,,31.0,corona173,,1898942,,1898942,/p/lustre2/haridev/dlio/scr/checkpoints/scr_me...,2,1875123200


In [36]:
# Per-file duration:
#   1) sum "dur" per (filename, trange, pid, tid);
#   2) keep the largest of those totals per (filename, trange);
#   3) add the per-trange maxima for each filename.
# Aggregations are named by string: passing the builtins sum/max to .agg is
# deprecated in recent pandas and emits a FutureWarning.
checkpoint_files = (
    checkpoint_events
    .groupby(["filename", "trange", "pid", "tid"]).agg({"dur": "sum"})
    .groupby(["filename", "trange"]).agg({"dur": "max"})
    .groupby(["filename"]).agg({"dur": "sum"})
)

In [39]:
# Summary statistics over the per-file durations.
# NOTE(review): min is shown unscaled while max, sum and mean are divided by
# 1e6, so the displayed tuple mixes two units.
(
    checkpoint_files.min(),
    checkpoint_files.max() / 1e6,
    checkpoint_files.sum() / 1e6,
    checkpoint_files.mean() / 1e6,
    checkpoint_files.count(),
)

(dur    347
 dtype: uint64[pyarrow],
 dur    8.46394
 dtype: double[pyarrow],
 dur    694.993596
 dtype: double[pyarrow],
 dur    0.965269
 dtype: double[pyarrow],
 dur    720
 dtype: int64)

In [None]:
# Unscaled min / max / mean of the per-file durations.
(
    checkpoint_files.min(),
    checkpoint_files.max(),
    checkpoint_files.mean(),
)