In [1]:
%load_ext autoreload
%load_ext autotime

time: 268 µs (started: 2024-03-22 13:30:26 -07:00)


In [5]:
%%writefile char_comp.py

import dask.dataframe as dd
import numpy as np
import pandas as pd

# Bin edges handed to ``pd.cut`` when bucketing the ``size`` column
# (presumably bytes -- TODO confirm). The commented-out edges are finer
# subdivisions that are currently disabled.
XFER_SIZE_BINS = [
    -np.inf,
    2.0 ** 12,  # 4 * 1024
    # 2.0 ** 14,  # 16 * 1024
    2.0 ** 16,  # 64 * 1024
    # 2.0 ** 18,  # 256 * 1024
    2.0 ** 20,  # 1 * 1024 * 1024
    # 2.0 ** 22,  # 4 * 1024 * 1024
    2.0 ** 24,  # 16 * 1024 * 1024
    # 2.0 ** 26,  # 64 * 1024 * 1024
    np.inf,
]


def calc_job_time(ddf):
    """Return the largest ``tend`` minus the smallest ``tstart`` over all rows."""
    first_start = ddf['tstart'].min()
    last_end = ddf['tend'].max()
    return last_end - first_start


def calc_read_size(ddf):
    """Return the sum of ``size`` over rows whose ``io_cat`` equals 1.

    NOTE(review): unlike ``calc_size`` no ``cat == 0`` filter is applied
    here -- confirm that is intended.
    """
    is_read = ddf['io_cat'] == 1
    read_rows = ddf[is_read]
    return read_rows['size'].sum()


def calc_write_size(ddf):
    """Return the sum of ``size`` over rows whose ``io_cat`` equals 2.

    NOTE(review): unlike ``calc_size`` no ``cat == 0`` filter is applied
    here -- confirm that is intended.
    """
    is_write = ddf['io_cat'] == 2
    write_rows = ddf[is_write]
    return write_rows['size'].sum()


def calc_num_files(ddf):
    """Count distinct ``file_name`` values among rows with ``cat == 0`` and ``io_cat`` in {1, 2, 3}."""
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))
    file_names = ddf[keep]['file_name']
    return file_names.nunique()


def calc_num_procs(ddf):
    """Count distinct ``proc_name`` values among rows with ``cat == 0`` and ``io_cat`` in {1, 2, 3}."""
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))
    proc_names = ddf[keep]['proc_name']
    return proc_names.nunique()


def calc_fpp(ddf):
    """Return the files that were touched by exactly one process.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2, 3}. The
    result is a one-column frame (``proc_name``, always 1) indexed by
    ``file_name``.
    """
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))
    procs_per_file = ddf[keep].groupby(['file_name'])['proc_name'].nunique().to_frame()
    return procs_per_file.query('proc_name == 1')


def calc_acc_pat(ddf):
    """Return the value counts of ``acc_pat`` over rows with ``cat == 0`` and ``io_cat`` in {1, 2}."""
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))
    patterns = ddf[keep]['acc_pat']
    return patterns.value_counts()


def calc_size(ddf):
    """Return the sum of ``size`` over rows with ``cat == 0`` and ``io_cat`` in {1, 2}."""
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))
    sizes = ddf[keep]['size']
    return sizes.sum()


def calc_ops_dist(ddf):
    """Return how many rows fall in each ``io_cat`` among rows with ``cat == 0`` and ``io_cat`` in {1, 2, 3}."""
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))
    categories = ddf[keep]['io_cat']
    return categories.value_counts()


def calc_xfer_dist(ddf):
    """Return the number of operations per transfer-size bucket.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2}; each
    partition's ``size`` values are bucketed with ``pd.cut`` against
    ``XFER_SIZE_BINS`` before the buckets are counted.
    """
    keep = (ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))
    binned_sizes = ddf[keep]['size'].map_partitions(pd.cut, XFER_SIZE_BINS)
    return binned_sizes.value_counts()


def calc_agg_bw(ddf):
    """Return total ``size`` divided by total ``duration`` over rows with ``cat == 0`` and ``io_cat`` in {1, 2}."""
    rw_ops = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))]
    total_size = rw_ops['size'].sum()
    total_duration = rw_ops['duration'].sum()
    return total_size / total_duration


def char_summary_tasks(ddf):
    """Build every characterization metric for ``ddf``.

    Returns a list with one entry per ``calc_*`` function, in the fixed
    order listed below; nothing is computed here beyond what the
    individual functions do.
    """
    metric_builders = (
        calc_job_time,
        calc_read_size,
        calc_write_size,
        calc_num_files,
        calc_num_procs,
        calc_fpp,
        calc_acc_pat,
        calc_size,
        calc_ops_dist,
        calc_xfer_dist,
        calc_agg_bw,
    )
    return [build(ddf) for build in metric_builders]


def cm1_issue1_file_size_per_rank(ddf):
    """Summarize read/write sizes grouped by the range of ranks touching each file.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2}. For every
    ``(file_name, io_cat)`` pair the mean/sum of ``size`` and the min/max/count
    of ``rank`` are computed; files are then regrouped by the label
    ``"<rank_min>-<rank_max>"`` together with ``io_cat``.

    Returns a frame indexed by ``(rank_rank, io_cat)`` with ``size_mean``
    divided by 1024**2 and ``size_sum`` divided by 1024**3.
    """
    # Aggregations are named by string: passing the Python builtins
    # sum/min/max to groupby.agg is deprecated in recent pandas releases.
    ddf0 = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
        .groupby(['file_name', 'io_cat']) \
        .agg({'size': ['mean', 'sum'], 'rank': ['min', 'max', 'count']})

    # Flatten the two-level column index into names such as 'size_mean'.
    ddf0.columns = ['_'.join(tup).rstrip('_') for tup in ddf0.columns.values]

    ddf0 = ddf0.assign(rank_rank=lambda x: x['rank_min'].astype(str) + '-' + x['rank_max'].astype(str)) \
        .reset_index() \
        .groupby(['rank_rank', 'io_cat']) \
        .agg({'size_mean': 'mean', 'size_sum': 'sum'})

    ddf0['size_mean'] = ddf0['size_mean'] / 1024 ** 2
    ddf0['size_sum'] = ddf0['size_sum'] / 1024 ** 3

    return ddf0


def cm1_issue3_rank_0_write_low_bw(ddf):
    """Return per-rank totals and bandwidth for rows with ``cat == 0`` and ``io_cat == 2``.

    The result is indexed by ``rank`` with columns ``size`` and ``duration``
    (sums) and ``bw`` = size / duration / 1024**3. No rank is filtered out
    here despite the function name.
    """
    # 'sum' is passed by name: handing the builtin ``sum`` to groupby.agg is
    # deprecated in recent pandas releases.
    return ddf[(ddf['cat'] == 0) & (ddf['io_cat'] == 2)] \
        .groupby(['rank']) \
        .agg({'size': 'sum', 'duration': 'sum'}) \
        .assign(bw=lambda x: x['size'] / x['duration'] / 1024 ** 3)


def hacc_issue1_open_close(ddf):
    """Count calls per ``(file_name, func_id)`` for functions whose id matches ``open|close``.

    The count is taken over the ``index`` column of the matching rows.
    """
    is_open_or_close = ddf['func_id'].str.contains('open|close')
    matching_calls = ddf[is_open_or_close]
    return matching_calls.groupby(['file_name', 'func_id'])['index'].count()


def montagep_issue1_io_size_per_app(ddf):
    """Return the summed ``size`` per ``(app, io_cat)`` for rows with ``cat == 0`` and ``io_cat`` in {1, 2}."""
    rw_ops = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))]
    # 'sum' is passed by name: handing the builtin ``sum`` to groupby.agg is
    # deprecated in recent pandas releases.
    return rw_ops.groupby(['app', 'io_cat']).agg({'size': 'sum'})


def montagep_issue2_io_size_per_app_per_time(ddf, n_bins=434, bin_width=1e7):
    """Return the summed ``size`` per ``(app, time_bin)``, largest first.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2}. Each row's
    ``tmid`` is placed into a bin with ``np.digitize`` using the edges
    ``np.arange(n_bins) * bin_width``.

    Parameters
    ----------
    ddf : Dask DataFrame with ``cat``, ``io_cat``, ``app``, ``tmid`` and ``size`` columns.
    n_bins : int, default 434
        Number of bin edges. The default is the value that used to be
        hard-coded (presumably sized for one specific trace -- TODO confirm).
    bin_width : float, default 1e7
        Spacing between edges, in the units of ``tmid``.

    Returns
    -------
    Frame indexed by ``(app, time_bin)`` with a single ``size`` column,
    sorted by ``size`` in descending order.
    """
    bin_edges = np.arange(n_bins) * bin_width

    def assign_time_bin(df):
        # Build a new frame instead of writing into the partition we were
        # handed; mutating map_partitions inputs in place is unsafe.
        return df.assign(time_bin=np.digitize(df['tmid'], bins=bin_edges))

    # 'sum' is passed by name: handing the builtin ``sum`` to groupby.agg is
    # deprecated in recent pandas releases.
    return ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
        .map_partitions(assign_time_bin) \
        .groupby(['app', 'time_bin']) \
        .agg({'size': 'sum'}) \
        .sort_values('size', ascending=False)


def generic_issue_bw_by_rank(ddf):
    """Return per-rank totals and bandwidth for rows with ``cat == 0`` and ``io_cat`` in {1, 2}.

    The result is indexed by ``rank`` with columns ``size`` and ``duration``
    (sums) and ``bw`` = size / duration / 1024**3.
    """
    # 'sum' is passed by name: handing the builtin ``sum`` to groupby.agg is
    # deprecated in recent pandas releases.
    return ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
        .groupby(['rank']) \
        .agg({'size': 'sum', 'duration': 'sum'}) \
        .assign(bw=lambda x: x['size'] / x['duration'] / 1024 ** 3)


def generic_issue_low_bw(ddf):
    """Return operation count, totals and bandwidth per ``(size_bin, io_cat)``.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2}. ``size`` is
    bucketed with ``pd.cut`` against ``XFER_SIZE_BINS``; for each bucket and
    ``io_cat`` the ``index`` count and the ``size``/``duration`` sums are
    taken, ``bw`` = size / duration / 1024**3 is added, and rows containing
    NaN are dropped.
    """

    def assign_size_bin(df):
        # Build a new frame instead of writing into the partition we were
        # handed; mutating map_partitions inputs in place is unsafe.
        return df.assign(size_bin=pd.cut(df['size'], XFER_SIZE_BINS))

    ddf0 = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))]

    # 'sum' is passed by name: handing the builtin ``sum`` to groupby.agg is
    # deprecated in recent pandas releases.
    return ddf0 \
        .map_partitions(assign_size_bin) \
        .groupby(['size_bin', 'io_cat']) \
        .agg({'index': 'count', 'size': 'sum', 'duration': 'sum'}) \
        .assign(bw=lambda x: x['size'] / x['duration'] / 1024 ** 3) \
        .dropna()


def generic_issue_metadata_access_per(ddf):
    """Return, per ``io_cat``, the largest per-process total ``duration``.

    Rows are restricted to ``cat == 0`` and ``io_cat`` in {1, 2, 3}.
    ``duration`` is summed for every ``(proc_name, io_cat)`` pair and the
    maximum of those sums is taken within each ``io_cat``. The result is a
    Series named ``duration`` indexed by ``io_cat``.
    """
    # Select 'duration' before summing: only that column is used afterwards,
    # and summing every column would also try to add up the string columns.
    return ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))] \
        .groupby(['proc_name', 'io_cat'])['duration'] \
        .sum() \
        .reset_index() \
        .groupby('io_cat')['duration'] \
        .max()

Overwriting char_comp.py
time: 4.81 ms (started: 2024-03-22 13:35:34 -07:00)


In [6]:
%%writefile -a char_comp.py

import dask
import time
from dask.distributed import Client
from dask_jobqueue import LSFCluster

# Worker layout requested from LSF: `n_workers` worker processes, each with
# `n_threads_per_worker` threads.
n_workers = 8
n_threads_per_worker = 16

# Directive fragments passed to LSFCluster as `job_directives_skip`.
lsf_directives_to_skip = ['-n', '-R', '-M', '-P', '-W 00:30']
# Directive fragments passed to LSFCluster as `job_extra_directives`.
lsf_extra_directives = ['-nnodes 1', '-G asccasc', '-q pdebug', '-W 120']

# Optional settings left unset here: death_timeout, local_directory, and the
# scheduler's dashboard_address / host.
cluster = LSFCluster(
    cores=n_threads_per_worker * n_workers,
    job_directives_skip=lsf_directives_to_skip,
    job_extra_directives=lsf_extra_directives,
    memory="1600GB",
    processes=n_workers,
    scheduler_options={},
    use_stdin=True,
)

client = Client(cluster)

# Ask the cluster for the full set of worker processes.
cluster.scale(n_workers)

def _wait_until_workers_alive(client, sleep_seconds=2):
    """Block until the scheduler reports at least ``n_workers`` workers.

    Polls ``client.scheduler_info()`` every ``sleep_seconds`` seconds while
    the client status is ``'running'``, nudging the module-level ``cluster``
    to start its workers on every round.

    Parameters
    ----------
    client : dask.distributed.Client
    sleep_seconds : int or float, default 2
        Pause between two polls.

    Returns
    -------
    bool
        True when the expected number of workers is connected, False when
        polling stopped because the client was no longer running.
    """

    def _count_workers():
        return len(client.scheduler_info()['workers'])

    current_n_workers = _count_workers()
    while client.status == 'running' and current_n_workers < n_workers:
        print(f"Waiting for workers ({current_n_workers}/{n_workers})")
        # Try to force cluster to boot workers (private dask API).
        cluster._correct_state()
        # Wait, then refresh the count so the loop condition sees fresh data.
        time.sleep(sleep_seconds)
        current_n_workers = _count_workers()

    if current_n_workers < n_workers:
        # The loop ended because the client stopped running; do not claim success.
        print(f"Stopped waiting: client status is {client.status!r} ({current_n_workers}/{n_workers} workers)")
        return False

    print('All workers alive')
    return True

# Print the dashboard URL first so progress can be followed while waiting.
print('client dashboard', client.dashboard_link)

# Block until the requested workers have connected to the scheduler.
_wait_until_workers_alive(client)

Appending to char_comp.py
time: 2.25 ms (started: 2024-03-22 13:35:35 -07:00)


In [7]:
%%writefile -a char_comp.py

# Maps an application label to the directory of its Parquet trace.
# Commented-out entries are alternate traces/locations that are currently
# disabled; the driver loop has dedicated branches for 'cm1', 'hacc' and
# 'montagep' that only run when those entries are enabled.
app_traces = {
    # 'cm1': '/usr/workspace/iopp/wisio_logs/recorder_cm1_32_4/_parquet',
    # 'hacc': '/usr/workspace/iopp/wisio_logs/recorder_hacc_32_0/_parquet',
    # 'lbann_cosmoflow': '/usr/workspace/iopp/wisio_logs/recorder_lbann_cosmoflow_32/_parquet',
    # 'lbann_jag': '/usr/workspace/iopp/wisio_logs/recorder_lbann_jag_32/_parquet',
    # 'montagep': '/usr/workspace/iopp/wisio_logs/recorder_montage_pegasus_32/_parquet',
    # 'cm1': '/p/gpfs1/iopp/wisio_logs/recorder_cm1_32_4/_parquet',
    # 'hacc': '/p/gpfs1/iopp/wisio_logs/recorder_hacc_32_0/_parquet',
    # 'lbann_cosmoflow': '/p/gpfs1/iopp/wisio_logs/recorder_lbann_cosmoflow_32/_parquet',
    # 'lbann_jag': '/p/gpfs1/iopp/wisio_logs/recorder_lbann_jag_32/_parquet',
    # 'montagep': '/p/gpfs1/iopp/wisio_logs/recorder_montage_pegasus_32/_parquet',
    'flash': '/p/gpfs1/iopp/wisio_logs/recorder_flash_sedov3d_hdf5/_parquet',
    'mummi_wemul': '/p/gpfs1/iopp/wisio_logs/recorder_mummi_wemul_32_32/_parquet',
    'genome': '/p/gpfs1/iopp/wisio_logs/recorder_genome_pegasus_32/_parquet',
}

def _run_timed(app, label, tasks):
    """Compute ``tasks`` one by one, print each task's wall time, return the total."""
    batch_started = time.perf_counter()
    for position, task in enumerate(tasks, start=1):
        task_started = time.perf_counter()
        dask.compute(task)
        print(f"{app} {label} {position}/{len(tasks)} completed {time.perf_counter() - task_started}")
    return time.perf_counter() - batch_started


# Issue queries that have a dedicated list for a given application.
_APP_ISSUE_BUILDERS = {
    'cm1': (
        generic_issue_low_bw,
        generic_issue_metadata_access_per,
        cm1_issue1_file_size_per_rank,
        cm1_issue3_rank_0_write_low_bw,
    ),
    'hacc': (
        generic_issue_bw_by_rank,
        generic_issue_low_bw,
        generic_issue_metadata_access_per,
        hacc_issue1_open_close,
    ),
    'montagep': (
        generic_issue_low_bw,
        generic_issue_metadata_access_per,
        montagep_issue1_io_size_per_app,
        montagep_issue2_io_size_per_app_per_time,
    ),
}

# Issue queries used for every other application.
_DEFAULT_ISSUE_BUILDERS = (
    generic_issue_bw_by_rank,
    generic_issue_low_bw,
    generic_issue_metadata_access_per,
)

for app, trace in app_traces.items():

    ddf = dd.read_parquet(trace)

    # Task graphs are built before the clock starts, so only compute time is measured.
    char_tasks = char_summary_tasks(ddf)
    char_elapsed = _run_timed(app, 'char', char_tasks)

    issue_builders = _APP_ISSUE_BUILDERS.get(app, _DEFAULT_ISSUE_BUILDERS)
    app_tasks = [build(ddf) for build in issue_builders]
    app_elapsed = _run_timed(app, 'issue', app_tasks)

    print(f"{app} total {char_elapsed + app_elapsed}")

    # Restart the workers between applications and wait for them to come back.
    client.restart()

    _wait_until_workers_alive(client)

Appending to char_comp.py
time: 3.69 ms (started: 2024-03-22 13:35:35 -07:00)


In [216]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

# Open the Montage/Pegasus trace as a Dask DataFrame; the bare `ddf` on the
# last line displays its column/dtype layout.
ddf = dd.read_parquet('/usr/workspace/iopp/wisio_logs/recorder_montage_pegasus_32/_parquet')
ddf

Unnamed: 0_level_0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,hostname,app,proc_name,file_name,size,acc_pat,bandwidth,duration,tmid,file_id,proc_id
npartitions=36,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,int64,int64,int32,int32,int32,int32,float32,float32,string,int32,string,string,string,string,int64,int32,float32,float32,int64,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


time: 121 ms (started: 2024-01-23 14:46:16 -08:00)


In [217]:
# Materialize the whole trace with .compute() and display it.
df = ddf.compute()
df

Unnamed: 0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,...,app,proc_name,file_name,size,acc_pat,bandwidth,duration,tmid,file_id,proc_id
0,1,132672,0,321552,0,3,2.137661,2.137760,fopen64,0,...,mProject,mProject#lassen192#132672#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,0,0,0.000000,0.000099,21377103,4007557435994691998,2303625805466626068
1,2,132672,0,321552,0,1,2.137835,2.138117,fread,0,...,mProject,mProject#lassen192#132672#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,2,0,0.006764,0.000282,21379759,4007557435994691998,2303625805466626068
2,3,132672,0,321552,0,3,2.138125,2.138141,fclose,0,...,mProject,mProject#lassen192#132672#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,0,0,0.000000,0.000016,21381327,4007557435994691998,2303625805466626068
3,4,132672,0,321552,0,3,2.138159,2.138171,fopen64,0,...,mProject,mProject#lassen192#132672#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,0,0,0.000000,0.000012,21381647,4007557435994691998,2303625805466626068
4,5,132672,0,321552,0,1,2.138231,2.138246,fread,0,...,mProject,mProject#lassen192#132672#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,2,0,0.127157,0.000015,21382382,4007557435994691998,2303625805466626068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
286057,2383210,63830,0,321552,0,2,18.125608,18.125610,fwrite,0,...,mAdd,mAdd#lassen734#63830#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,2880,0,2746.582031,0.000001,181256095,4007557436607364090,369428171584890900
286058,2383211,63830,0,321552,0,3,18.125628,18.126268,fclose,0,...,mAdd,mAdd#lassen734#63830#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,0,0,0.000000,0.000640,181259478,4007557436607364090,369428171584890900
286059,2383212,63830,0,321552,0,1,18.126276,18.131586,fread,0,...,mAdd,mAdd#lassen734#63830#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,2880,0,0.517237,0.005310,181289317,4007557438554618451,369428171584890900
286060,2383213,63830,0,321552,0,2,18.131601,18.131601,fwrite,0,...,mAdd,mAdd#lassen734#63830#35184372410384,/p/gpfs1/haridev/iopp/montage-workflow/scratch...,2880,0,0.000000,0.000000,181316017,4007557438554618451,369428171584890900


time: 4.07 s (started: 2024-01-23 14:46:19 -08:00)


In [222]:
# Span between the earliest `tstart` and the latest `tend` in the materialized trace.
df['tend'].max() - df['tstart'].min()

433.14532

time: 26.5 ms (started: 2024-01-23 14:51:13 -08:00)


In [237]:
def assign_time_bin(df):
    """Return a copy of ``df`` with a ``time_bin`` column derived from ``tmid``.

    Bin edges are ``np.arange(434) * 1e7``. A new frame is returned rather
    than writing into the partition that ``map_partitions`` passes in.
    """
    return df.assign(time_bin=np.digitize(df['tmid'], bins=np.arange(434) * 1e7))

# Summed `size` per (app, time_bin), largest first. 'sum' is passed by name
# because the builtin `sum` is deprecated as a groupby.agg argument.
tddf = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
    .map_partitions(assign_time_bin) \
    .groupby(['app', 'time_bin']) \
    .agg({'size': 'sum'}) \
    .sort_values('size', ascending=False) \
    .compute()
tddf

Unnamed: 0_level_0,Unnamed: 1_level_0,size
app,time_bin,Unnamed: 2_level_1
mBackground,2,17334437622
mDiff,1,17270523880
mBackground,1,13486359370
mDiff,2,11724523812
mProject,432,8871062400
mProject,...,...
mProject,362,2880
mConcatFit,1,627
mBgModel,13,56
mBgModel,14,56


time: 1.66 s (started: 2024-01-23 14:55:56 -08:00)


In [243]:
# Total `size` per app over time bins up to and including 39, divided by
# 1024**3 and sorted largest first.
tsize = (
    tddf.query('time_bin <= 39')
    .groupby('app')
    .sum()
    .sort_values('size', ascending=False)
    .assign(size=lambda totals: totals['size'] / 1024 ** 3)
)
tsize

Unnamed: 0_level_0,size
app,Unnamed: 1_level_1
mDiff,90.46462
mBackground,28.87584
mAdd,11.23693
mFitplane,3.346388
mViewer,2.317439
mProject,2.08702
mImgtbl,0.0005167807
mConcatFit,5.839393e-07
mBgModel,1.564622e-07


time: 8.48 ms (started: 2024-01-23 14:58:31 -08:00)


In [218]:
# Summed `size` per (app, io_cat) for rows with cat == 0 and io_cat in {1, 2}.
# 'sum' is passed by name because the builtin `sum` is deprecated as a
# groupby.agg argument.
ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
    .groupby(['app', 'io_cat']) \
    .agg({'size': 'sum'}) \
    .compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,size
app,io_cat,Unnamed: 2_level_1
mProject,1,2241185620
mProject,2,15968079360
mDiff,1,89762095364
mDiff,2,7373554560
mFitplane,1,3593157014
mConcatFit,2,627
mBgModel,2,168
mBackground,1,15502961152
mBackground,2,15502233600
mImgtbl,1,553344


time: 1.22 s (started: 2024-01-23 14:47:20 -08:00)


In [59]:
# Total `size` over total `duration` for rows with cat == 0 and io_cat in
# {1, 2}, divided by 1024**3.
rw_ops = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))]
(rw_ops['size'].sum() / rw_ops['duration'].sum()).compute() / 1024 ** 3

34.212866328769124

time: 41.3 ms (started: 2024-01-23 09:21:12 -08:00)


In [208]:
# Summed `size` and `duration` per rank for rows with cat == 0 and io_cat in
# {1, 2}. 'sum' is passed by name because the builtin `sum` is deprecated as
# a groupby.agg argument.
ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
    .groupby(['rank']) \
    .agg({'size': 'sum', 'duration': 'sum'}) \
    .compute()

Unnamed: 0_level_0,size,duration
rank,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1325400064,2.630960
1,1325400064,2.507308
2,1325400064,2.687630
3,1325400064,2.526605
4,1325400064,2.598199
...,...,...
1275,1325400064,2.041826
1276,1325400064,2.170107
1277,1325400064,2.152200
1278,1325400064,1.905573


time: 64.7 ms (started: 2024-01-23 13:53:02 -08:00)


In [205]:
# Number of calls per (file_name, func_id) for functions whose id matches
# 'open|close'.
# NOTE(review): the recorded output lists hacc_dir paths, so `ddf` was
# presumably bound to the HACC trace when this cell ran -- confirm.
is_open_or_close = ddf['func_id'].str.contains('open|close')
ddf[is_open_or_close].groupby(['file_name', 'func_id'])['index'].count().compute()

file_name                                                           func_id
/dev/shm/job2154496201-35619-OMPI_COLL_IBM-0-collshm-comm4-master0  close      1280
                                                                    open       2528
/dev/shm/job2154496201-35619-OMPI_COLL_IBM-0-collshm-comm5-master0  close         1
                                                                    open          1
/dev/shm/job2154496201-35619-OMPI_COLL_IBM-0-collshm-comm5-master1  close         1
                                                                               ... 
/p/gpfs1/iopp/temp/hacc_dir/test-Part00001277-of-00001280.data      open64        4
/p/gpfs1/iopp/temp/hacc_dir/test-Part00001278-of-00001280.data      close         4
                                                                    open64        4
/p/gpfs1/iopp/temp/hacc_dir/test-Part00001279-of-00001280.data      close         4
                                                                    open64        4


time: 78.9 ms (started: 2024-01-23 13:50:47 -08:00)


In [109]:
# Mean `size` per (file_name, io_cat) for rows with cat == 0 and io_cat in
# {1, 2}, divided by 1024**2.
# NOTE(review): the recorded output lists cm1 paths, so `ddf` was presumably
# bound to the CM1 trace when this cell ran -- confirm.
mean_size_per_file = (
    ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))]
    .groupby(['file_name', 'io_cat'])['size']
    .mean()
)
mean_size_per_file.compute() / 1024 ** 2

file_name                                                                                io_cat
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/LANDUSE.TBL          1         16.000000
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1_config.txt       2          0.001960
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1out_000001_s.dat  2         14.166667
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1out_000001_u.dat  2          8.062500
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1out_000001_v.dat  2          8.062500
                                                                                                     ...    
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_v.dat  2          8.062500
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_w.dat  2          8.062500
/p/gpfs1/iopp/temp/cm1r20.3.2846

time: 43 ms (started: 2024-01-23 09:56:43 -08:00)


In [136]:
# Per (file_name, io_cat): mean/sum of `size` and min/max/count of `rank`.
# Aggregations are named by string because the builtins sum/min/max are
# deprecated as groupby.agg arguments.
ddf0 = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
    .groupby(['file_name', 'io_cat']) \
    .agg({'size': ['mean', 'sum'], 'rank': ['min', 'max', 'count']})

# Flatten the two-level column index into names such as 'size_mean'.
ddf0.columns = ['_'.join(tup).rstrip('_') for tup in ddf0.columns.values]

# Regroup the files by the "<rank_min>-<rank_max>" label and io_cat.
ddf0 = ddf0.assign(rank_rank=lambda x: x['rank_min'].astype(str) + '-' + x['rank_max'].astype(str)) \
    .reset_index() \
    .groupby(['rank_rank', 'io_cat']) \
    .agg({'size_mean': 'mean', 'size_sum': 'sum'})

ddf0['size_mean'] = ddf0['size_mean'] / 1024 ** 2
ddf0['size_sum'] = ddf0['size_sum'] / 1024 ** 3

ddf0.compute()

Unnamed: 0_level_0,Unnamed: 1_level_0,size_mean,size_sum
rank_rank,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1
0-1279,1,16.0,20.015625
0-0,2,0.541073,1.146321
0-0,1,16.0,0.015625


time: 99.9 ms (started: 2024-01-23 10:11:21 -08:00)


In [112]:
import functools as ft

# Per (file_name, io_cat): mean/sum of `size` and min/max/count of `rank`,
# materialized as a pandas frame with two-level columns. Aggregations are
# named by string because the builtins sum/min/max are deprecated as
# groupby.agg arguments.
fpr = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2]))] \
    .groupby(['file_name', 'io_cat']) \
    .agg({'size': ['mean', 'sum'], 'rank': ['min', 'max', 'count']}) \
    .compute()
fpr['size', 'mean'] = fpr['size', 'mean'] / 1024 ** 2
fpr['size', 'sum'] = fpr['size', 'sum'] / 1024 ** 3
# Label each file with the "<rank_min>-<rank_max>" range of ranks that touched it.
fpr['rank', 'rank'] = fpr['rank', 'min'].astype(str) + '-' + fpr['rank', 'max'].astype(str)
fpr

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size,rank,rank,rank,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum,min,max,count,rank
file_name,io_cat,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/namelist.input,1,16.000000,20.015625,0,1279,1281,0-1279
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/proc.info,2,0.117163,0.000114,0,0,1,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1_config.txt,2,0.001960,0.000002,0,0,1,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/LANDUSE.TBL,1,16.000000,0.015625,0,0,1,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1out_stats.ctl,2,0.005274,0.001859,0,0,361,0-0
...,...,...,...,...,...,...,...
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_s.dat,2,14.916667,0.043701,0,0,3,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_w.dat,2,8.062500,0.015747,0,0,2,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_x.dat,2,0.005829,0.000006,0,0,1,0-0
/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_files/les_ConvBoundLayer/cm1rst_000002_u.dat,2,8.062500,0.007874,0,0,1,0-0


time: 53.7 ms (started: 2024-01-23 09:58:27 -08:00)


In [115]:
# Show `fpr` with file_name / io_cat moved from the index into ordinary columns.
fpr.reset_index()

Unnamed: 0_level_0,file_name,io_cat,size,size,rank,rank,rank,rank
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,mean,sum,min,max,count,rank
0,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,1,16.000000,20.015625,0,1279,1281,0-1279
1,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,0.117163,0.000114,0,0,1,0-0
2,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,0.001960,0.000002,0,0,1,0-0
3,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,1,16.000000,0.015625,0,0,1,0-0
4,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,0.005274,0.001859,0,0,361,0-0
...,...,...,...,...,...,...,...,...
769,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,14.916667,0.043701,0,0,3,0-0
770,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,8.062500,0.015747,0,0,2,0-0
771,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,0.005829,0.000006,0,0,1,0-0
772,/p/gpfs1/iopp/temp/cm1r20.3.2846827/config_fil...,2,8.062500,0.007874,0,0,1,0-0


time: 12.4 ms (started: 2024-01-23 09:59:11 -08:00)


In [118]:
# Regroup the per-file summary by rank range and io_cat. 'sum' is passed by
# name because the builtin `sum` is deprecated as a groupby.agg argument.
fpr.groupby([('rank', 'rank'), 'io_cat']).agg({('size', 'mean'): 'mean', ('size', 'sum'): 'sum'})

Unnamed: 0_level_0,Unnamed: 1_level_0,size,size
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,sum
"(rank, rank)",io_cat,Unnamed: 2_level_2,Unnamed: 3_level_2
0-0,1,16.0,0.015625
0-0,2,0.541073,1.146321
0-1279,1,16.0,20.015625


time: 11.2 ms (started: 2024-01-23 10:00:31 -08:00)


In [180]:
# Largest per-process total `duration` over rows with cat == 0 and io_cat in
# {1, 2, 3}. `duration` is selected before summing so the other (including
# string) columns are not summed needlessly.
io_time = ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))] \
    .groupby('proc_name')['duration'] \
    .sum() \
    .max()
print('io_time', io_time.compute())

# Per-rank totals and bandwidth for rows with cat == 0 and io_cat == 2.
# 'sum' is passed by name because the builtin `sum` is deprecated as a
# groupby.agg argument.
ddf[(ddf['cat'] == 0) & (ddf['io_cat'] == 2)] \
    .groupby(['rank']) \
    .agg({'size': 'sum', 'duration': 'sum'}) \
    .assign(bw=lambda x: x['size'] / x['duration'] / 1024 ** 3) \
    .compute()

io_time 4.1151


Unnamed: 0_level_0,size,duration,bw
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1230852944,0.579398,1.97847


time: 409 ms (started: 2024-01-23 10:37:46 -08:00)


In [189]:
# Per io_cat, the largest per-process total `duration` (rows with cat == 0
# and io_cat in {1, 2, 3}). `duration` is selected before summing so the
# other (including string) columns are not summed needlessly.
ddf[(ddf['cat'] == 0) & (ddf['io_cat'].isin([1, 2, 3]))] \
    .groupby(['proc_name', 'io_cat'])['duration'] \
    .sum() \
    .reset_index() \
    .groupby('io_cat')['duration'] \
    .max() \
    .compute()

io_cat
1    0.001981
2    0.579398
3    3.534376
Name: duration, dtype: float32

time: 315 ms (started: 2024-01-23 10:43:23 -08:00)
