# VANI

**Initial notebook configuration**

In [1]:
# Suppress every FutureWarning for the rest of the kernel session
import warnings
warnings.simplefilter("ignore", FutureWarning)

**Import libraries**

In [2]:
# Import VANI and the libraries used alongside it (dask, NumPy, pandas)
import dask
import numpy as np
import pandas as pd
from vani import Analyzer, ClusterOptions, ClusterType

**Initialize VANI Analyzer**

In [3]:
# Describe the cluster backend (LSF) first, then hand it to the analyzer,
# which is asked for 4 workers with debug output enabled.
co = ClusterOptions(
    cluster_type=ClusterType.LSF,
)
vn = Analyzer(
    n_workers=4,
    cluster_options=co,
    debug=True,
)

  0%|          | 0/3 [00:00<?, ?it/s]

{'cls': <class 'distributed.scheduler.Scheduler'>, 'options': {'protocol': 'tcp://', 'interface': None, 'host': 'lassen708', 'dashboard_address': 'lassen708:8264', 'security': None}}
#!/usr/bin/env bash

#BSUB -J dask-worker
#BSUB -G asccasc
#BSUB -q pdebug
#BSUB -W 02:00
#BSUB -o vani.log
#BSUB -e vani.log
JOB_ID=${LSB_JOBID%.*}

/usr/workspace/iopp/.conda/envs/jupyter/bin/python -m distributed.cli.dask_worker tcp://192.168.66.200:39417 --nthreads 1 --nprocs 4 --memory-limit 29.80GiB --name name --nanny --death-timeout 300

All 4 workers alive

**Analyze logs**

In [4]:
# Analysis configuration
# Parquet log directories of the available application runs. Select one by
# name through LOG_NAME instead of commenting assignments in and out.
LOG_DIRS = {
    "hacc": "/p/gpfs1/iopp/parquet_app_logs/hacc/nodes-32/workflow-0",
    "cm1": "/p/gpfs1/iopp/parquet_app_logs/cm1/nodes-32/workflow-4",
    "lbann-cosmoflow": "/p/gpfs1/iopp/parquet_app_logs/lbann-cosmoflow/nodes-32",
    "montage_pegasus": "/p/gpfs1/iopp/recorder_app_logs/montage_pegasus/nodes-32/_parquet",
    "genome_pegasus": "/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet",
}
LOG_NAME = "hacc"
log_dir = LOG_DIRS[LOG_NAME]

# Do the analysis: yields the read/write I/O records and the job time
# (job_time is used further down as the upper end of the time bins).
io_df_read_write, job_time = vn.analyze_parquet_logs(log_dir)

  0%|          | 0/7 [00:00<?, ?it/s]

Reading logs took 5.930038163438439 seconds
Job time computation took 3.2905118530616164 seconds


In [23]:
# Preview the first rows of the read/write records.
# NOTE(review): the recorded output already contains a `tbin` column, so this
# cell was executed after the binning cell further down -- on a fresh
# top-to-bottom run that column will not exist yet at this point.
io_df_read_write.head()

Unnamed: 0,index,rank,thread_id,cat,tstart,tend,func_id,level,arg_count,args_1,...,args_7,args_8,args_9,args_10,duration,filename,size,count,bandwidth,tbin
35,36,0,312784,0,18.552891,18.677382,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.124491,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,514.094471,18.723384
38,39,0,312784,0,18.677414,18.720806,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.043392,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1474.920088,18.723384
41,42,0,312784,0,18.720913,18.840509,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.119596,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,535.132801,18.723384
44,45,0,312784,0,18.84062,18.879042,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.038422,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1665.728356,18.723384
47,48,0,312784,0,18.879133,18.925989,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.046856,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1365.889115,18.723384


In [26]:
# Total number of read/write records (non-null `index` values), evaluated on the cluster.
io_df_read_write["index"].count().compute()

25600

In [5]:
# Number of evenly spaced bin start values to generate over the job.
n_bins = 10

# Bin starts from 0 up to and including job_time, plus the spacing between them.
# NOTE(review): because the endpoint is included, the last entry equals job_time
# and the binning loop treats it as the start of one more bin that extends past
# the end of the job -- confirm this is intended.
tbins, tbins_step = np.linspace(start=0, stop=job_time, num=n_bins, retstep=True)
tbins

array([ 0.        ,  3.7446768 ,  7.4893536 , 11.23403041, 14.97870721,
       18.72338401, 22.46806081, 26.21273761, 29.95741442, 33.70209122])

In [6]:
%%time

io_df_read_write["tbin"] = 0

for tbin in tbins:
    print(tbin, tbin + tbins_step)
    tstart_cond = io_df_read_write["tstart"].ge(tbin - 1)
    tend_cond = io_df_read_write["tend"].lt(tbin + tbins_step)
    io_df_read_write["tbin"] = io_df_read_write["tbin"].mask(tstart_cond & tend_cond, tbin)

0.0 3.744676801893446
3.744676801893446 7.489353603786892
7.489353603786892 11.234030405680338
11.234030405680338 14.978707207573784
14.978707207573784 18.72338400946723
18.72338400946723 22.46806081136068
22.468060811360676 26.21273761325412
26.21273761325412 29.957414415147568
29.957414415147568 33.702091217041016
33.702091217041016 37.44676801893446
CPU times: user 261 ms, sys: 0 ns, total: 261 ms
Wall time: 252 ms


In [45]:
%%time

len(io_df_read_write[io_df_read_write["tbin"] == 0].compute())

CPU times: user 48.1 s, sys: 652 ms, total: 48.7 s
Wall time: 1min 11s


0

In [39]:
%%time

for tbin in tbins:
    tbin_binned = io_df_read_write[io_df_read_write["tbin"] == tbin]
    tbin_ct = tbin_binned["index"].count().compute()
    print(tbin, tbin_ct)


0.0 0
3.744676801893446 361
7.489353603786892 943
11.234030405680338 4735
14.978707207573784 4089
18.72338400946723 7178
22.468060811360676 7327
26.21273761325412 567
29.957414415147568 400
33.702091217041016 0
CPU times: user 6min 59s, sys: 6.67 s, total: 7min 6s
Wall time: 11min 23s


In [7]:
%%time

tbin_tasks = []

for tbin in tbins:
    tbin_binned = io_df_read_write[io_df_read_write["tbin"] == tbin]
    tbin_ct = tbin_binned["index"].count()
    tbin_tasks.append(tbin_ct)
    
tbin_task_results, = dask.compute(tbin_tasks)

tbin_task_results

CPU times: user 18.8 s, sys: 366 ms, total: 19.2 s
Wall time: 36.8 s


[0, 361, 943, 4735, 4089, 7178, 7327, 567, 400, 0]

In [17]:
# argpartition moves the 5 smallest counts to the front, so the positions from 5
# onward hold the indices of the remaining, larger counts (in no particular order).
n_dropped = 5
worst_tbin_task_results_ixs = np.argpartition(tbin_task_results, n_dropped)[n_dropped:]
print(worst_tbin_task_results_ixs)
print([tbin_task_results[ix] for ix in worst_tbin_task_results_ixs])

# Bin start values of the selected ("worst") bins.
worst_tbins = list(tbins[worst_tbin_task_results_ixs])
worst_tbins

[2 4 6 3 5]
[943, 4089, 7327, 4735, 7178]


[7.489353603786892,
 14.978707207573784,
 22.468060811360676,
 11.234030405680338,
 18.72338400946723]

In [31]:
# Keep only the records labelled with one of the selected ("worst") bins.
in_worst_bin = io_df_read_write["tbin"].isin(worst_tbins)
io_df_read_write_worst = io_df_read_write[in_worst_bin]

In [25]:
%%time
# Number of records kept in the first worst-bin subset.
io_df_read_write_worst["index"].count().compute()

24272

In [29]:
%%time
tstart_task = io_df_read_write_worst["tstart"].min()
tend_task = io_df_read_write_worst["tend"].max()
res = dask.compute(tstart_task, tend_task)
res

CPU times: user 18.4 s, sys: 319 ms, total: 18.7 s
Wall time: 37.9 s


(6.499417304992676, 25.40230369567871)

In [30]:
# Re-bin: n_bins evenly spaced bin starts spanning the subset's [tstart, tend] range.
(tstart, tend) = res
tbins, tbins_step = np.linspace(start=tstart, stop=tend, num=n_bins, retstep=True)
tbins

array([ 6.4994173 ,  8.59973802, 10.70005873, 12.80037944, 14.90070015,
       17.00102086, 19.10134157, 21.20166228, 23.30198299, 25.4023037 ])

In [32]:
%%time

io_df_read_write_worst["tbin"] = 0

for tbin in tbins:
    print(tbin, tbin + tbins_step)
    tstart_cond = io_df_read_write_worst["tstart"].ge(tbin - 1)
    tend_cond = io_df_read_write_worst["tend"].lt(tbin + tbins_step)
    io_df_read_write_worst["tbin"] = io_df_read_write_worst["tbin"].mask(tstart_cond & tend_cond, tbin)

6.499417304992676 8.599738015068901
8.599738015068901 10.700058725145126
10.700058725145128 12.800379435221355
12.800379435221355 14.90070014529758
14.90070014529758 17.001020855373806
17.001020855373806 19.10134156545003
19.101341565450035 21.20166227552626
21.20166227552626 23.301982985602486
23.301982985602486 25.40230369567871
25.40230369567871 27.502624405754936
CPU times: user 235 ms, sys: 20.1 ms, total: 255 ms
Wall time: 247 ms


In [33]:
# Preview the first rows of the subset after its `tbin` column was re-assigned.
io_df_read_write_worst.head()

Unnamed: 0,index,rank,thread_id,cat,tstart,tend,func_id,level,arg_count,args_1,...,args_7,args_8,args_9,args_10,duration,filename,size,count,bandwidth,tbin
35,36,0,312784,0,18.552891,18.677382,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.124491,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,514.094471,19.101342
38,39,0,312784,0,18.677414,18.720806,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.043392,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1474.920088,19.101342
41,42,0,312784,0,18.720913,18.840509,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.119596,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,535.132801,19.101342
44,45,0,312784,0,18.84062,18.879042,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.038422,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1665.728356,19.101342
47,48,0,312784,0,18.879133,18.925989,write,0,3,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,...,,,,,0.046856,/p/gpfs1/iopp/temp/hacc_dir/test-Part00000000-...,67108864.0,1.0,1365.889115,19.101342


In [34]:
%%time

tbin_tasks = []

for tbin in tbins:
    tbin_binned = io_df_read_write_worst[io_df_read_write_worst["tbin"] == tbin]
    tbin_ct = tbin_binned["index"].count()
    tbin_tasks.append(tbin_ct)
    
tbin_task_results, = dask.compute(tbin_tasks)

tbin_task_results

CPU times: user 21.3 s, sys: 368 ms, total: 21.7 s
Wall time: 40.9 s


[39, 566, 2752, 2269, 2415, 2077, 3207, 4997, 5558, 390]

In [35]:
# argpartition moves the 5 smallest counts to the front, so the positions from 5
# onward hold the indices of the remaining, larger counts (in no particular order).
n_dropped = 5
worst_tbin_task_results_ixs = np.argpartition(tbin_task_results, n_dropped)[n_dropped:]
print(worst_tbin_task_results_ixs)
print([tbin_task_results[ix] for ix in worst_tbin_task_results_ixs])

# Bin start values of the selected ("worst") bins.
worst_tbins = list(tbins[worst_tbin_task_results_ixs])
worst_tbins

[4 6 7 8 2]
[2415, 3207, 4997, 5558, 2752]


[14.90070014529758,
 19.101341565450035,
 21.20166227552626,
 23.301982985602486,
 10.700058725145128]

In [36]:
# Second narrowing step: keep only the records in the newly selected bins.
in_worst_bin = io_df_read_write_worst["tbin"].isin(worst_tbins)
io_df_read_write_worst_2 = io_df_read_write_worst[in_worst_bin]

In [37]:
%%time
# Number of records kept in the second worst-bin subset.
io_df_read_write_worst_2["index"].count().compute()

18929

In [38]:
%%time
tstart_task = io_df_read_write_worst_2["tstart"].min()
tend_task = io_df_read_write_worst_2["tend"].max()
res = dask.compute(tstart_task, tend_task)
res

CPU times: user 19.5 s, sys: 324 ms, total: 19.8 s
Wall time: 39.1 s


(9.700082778930664, 25.068326950073242)

In [39]:
# Re-bin: n_bins evenly spaced bin starts spanning the second subset's [tstart, tend] range.
(tstart, tend) = res
tbins, tbins_step = np.linspace(start=tstart, stop=tend, num=n_bins, retstep=True)
tbins

array([ 9.70008278, 11.40766546, 13.11524815, 14.82283084, 16.53041352,
       18.23799621, 19.94557889, 21.65316158, 23.36074426, 25.06832695])

In [40]:
%%time

io_df_read_write_worst_2["tbin"] = 0

for tbin in tbins:
    print(tbin, tbin + tbins_step)
    tstart_cond = io_df_read_write_worst_2["tstart"].ge(tbin - 1)
    tend_cond = io_df_read_write_worst_2["tend"].lt(tbin + tbins_step)
    io_df_read_write_worst_2["tbin"] = io_df_read_write_worst_2["tbin"].mask(tstart_cond & tend_cond, tbin)

9.700082778930664 11.407665464613173
11.407665464613173 13.115248150295681
13.115248150295681 14.82283083597819
14.822830835978191 16.5304135216607
16.530413521660698 18.23799620734321
18.23799620734321 19.94557889302572
19.94557889302572 21.65316157870823
21.653161578708225 23.360744264390735
23.360744264390732 25.068326950073242
25.068326950073242 26.775909635755752
CPU times: user 230 ms, sys: 19.8 ms, total: 250 ms
Wall time: 241 ms


In [41]:
%%time

tbin_tasks = []

for tbin in tbins:
    tbin_binned = io_df_read_write_worst_2[io_df_read_write_worst_2["tbin"] == tbin]
    tbin_ct = tbin_binned["index"].count()
    tbin_tasks.append(tbin_ct)
    
tbin_task_results, = dask.compute(tbin_tasks)

tbin_task_results

CPU times: user 21.7 s, sys: 256 ms, total: 22 s
Wall time: 41 s


[372, 2380, 0, 1814, 599, 1396, 3235, 3627, 5068, 436]

In [42]:
# argpartition moves the 5 smallest counts to the front, so the positions from 5
# onward hold the indices of the remaining, larger counts (in no particular order).
n_dropped = 5
worst_tbin_task_results_ixs = np.argpartition(tbin_task_results, n_dropped)[n_dropped:]
print(worst_tbin_task_results_ixs)
print([tbin_task_results[ix] for ix in worst_tbin_task_results_ixs])

# Bin start values of the selected ("worst") bins.
worst_tbins = list(tbins[worst_tbin_task_results_ixs])
worst_tbins

[3 1 7 8 6]
[1814, 2380, 3627, 5068, 3235]


[14.822830835978191,
 11.407665464613173,
 21.653161578708225,
 23.360744264390732,
 19.94557889302572]

In [43]:
# Third narrowing step: keep only the records in the newly selected bins.
in_worst_bin = io_df_read_write_worst_2["tbin"].isin(worst_tbins)
io_df_read_write_worst_3 = io_df_read_write_worst_2[in_worst_bin]

In [44]:
%%time
# Number of records kept in the third worst-bin subset.
io_df_read_write_worst_3["index"].count().compute()

CPU times: user 19.7 s, sys: 424 ms, total: 20.1 s
Wall time: 40.1 s


16124

In [50]:
# Outer limits implied by the binning rule for the selected bins: the smallest
# bin start minus the slack of 1, and the largest bin start plus one bin width.
(min(worst_tbins) - 1, max(worst_tbins) + tbins_step)

(10.407665464613173, 25.068326950073242)

In [51]:
%%time
tstart_task = io_df_read_write_worst_3["tstart"].min()
tend_task = io_df_read_write_worst_3["tend"].max()
res = dask.compute(tstart_task, tend_task)
res

CPU times: user 19.9 s, sys: 409 ms, total: 20.3 s
Wall time: 40.6 s


(10.412095069885254, 24.68063735961914)

In [52]:
# Re-bin: n_bins evenly spaced bin starts spanning the third subset's [tstart, tend] range.
(tstart, tend) = res
tbins, tbins_step = np.linspace(start=tstart, stop=tend, num=n_bins, retstep=True)
tbins

array([10.41209507, 11.99748866, 13.58288225, 15.16827583, 16.75366942,
       18.33906301, 19.9244566 , 21.50985018, 23.09524377, 24.68063736])

In [53]:
%%time

io_df_read_write_worst_3["tbin"] = 0

for tbin in tbins:
    print(tbin, tbin + tbins_step)
    tstart_cond = io_df_read_write_worst_3["tstart"].ge(tbin - 1)
    tend_cond = io_df_read_write_worst_3["tend"].lt(tbin + tbins_step)
    io_df_read_write_worst_3["tbin"] = io_df_read_write_worst_3["tbin"].mask(tstart_cond & tend_cond, tbin)

10.412095069885254 11.997488657633463
11.997488657633463 13.582882245381672
13.582882245381674 15.168275833129883
15.168275833129883 16.753669420878094
16.753669420878094 18.339063008626304
18.339063008626304 19.924456596374515
19.92445659637451 21.509850184122723
21.509850184122723 23.095243771870933
23.095243771870933 24.680637359619144
24.68063735961914 26.26603094736735
CPU times: user 258 ms, sys: 0 ns, total: 258 ms
Wall time: 249 ms


In [54]:
%%time

tbin_tasks = []

for tbin in tbins:
    tbin_binned = io_df_read_write_worst_3[io_df_read_write_worst_3["tbin"] == tbin]
    tbin_ct = tbin_binned["index"].count()
    tbin_tasks.append(tbin_ct)
    
tbin_task_results, = dask.compute(tbin_tasks)

tbin_task_results

CPU times: user 22.1 s, sys: 515 ms, total: 22.6 s
Wall time: 43.1 s


[487, 1888, 203, 1610, 0, 0, 2702, 3831, 4226, 1169]

In [55]:
# argpartition moves the 5 smallest counts to the front, so the positions from 5
# onward hold the indices of the remaining, larger counts (in no particular order).
n_dropped = 5
worst_tbin_task_results_ixs = np.argpartition(tbin_task_results, n_dropped)[n_dropped:]
print(worst_tbin_task_results_ixs)
print([tbin_task_results[ix] for ix in worst_tbin_task_results_ixs])

# Bin start values of the selected ("worst") bins.
worst_tbins = list(tbins[worst_tbin_task_results_ixs])
worst_tbins

[3 1 7 8 6]
[1610, 1888, 3831, 4226, 2702]


[15.168275833129883,
 11.997488657633463,
 21.509850184122723,
 23.095243771870933,
 19.92445659637451]

In [56]:
# Fourth narrowing step: keep only the records in the newly selected bins.
in_worst_bin = io_df_read_write_worst_3["tbin"].isin(worst_tbins)
io_df_read_write_worst_4 = io_df_read_write_worst_3[in_worst_bin]

In [57]:
%%time
io_df_read_write_worst_3["index"].count().compute()

CPU times: user 20.2 s, sys: 384 ms, total: 20.6 s
Wall time: 42 s


16124