In [1]:
%run init.ipynb

In [2]:
import sys
sys.path.append("../")

import dask
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import json
import math
import numpy as np
import os
import pandas as pd
from dask import compute, delayed
from dask.dataframe import DataFrame
from dask.distributed import Client, LocalCluster, fire_and_forget, wait, worker_client
from vani.core.analysis import Analysis
from vani.core.metrics import filter_asymptote_delayed, filter_delayed, flatten_delayed, merge_delayed, sort_delayed
from vani.utils.file_utils import ensure_dir
from vani.utils.json_encoders import NpEncoder
from vani.utils.logger import create_logger, format_log

In [3]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.LSFCluster
Dashboard: http://192.168.66.200:8788/status,

0,1
Dashboard: http://192.168.66.200:8788/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://192.168.66.200:46777,Workers: 0
Dashboard: http://192.168.66.200:8788/status,Total threads: 0
Started: Just now,Total memory: 0 B


In [4]:
# Sub-directory names.
CACHE_DIR = "cached"  # not referenced by any code visible in this notebook
METRICS_DIR = "metrics"  # not referenced either; the metrics helpers below spell out "metrics" literally
INDEX_DIR = "indexed"  # parent folder ({log_dir}/indexed/<fg_index>) of the re-indexed parquet copies


def compute_metrics_file_id(ddf: DataFrame, fg_index: str, log_dir: str):
    """Build the delayed tasks that produce per-file metrics for ``ddf``.

    Three tasks are created: one that collects the unique index values of
    ``ddf``, one that writes them to ``filenames.json`` in ``log_dir``, and one
    that computes the metrics for every collected value.

    Args:
        ddf: Dask DataFrame handed to the delayed helpers.
        fg_index: Suffix used to build the dask key names of the three tasks.
        log_dir: Directory passed to the filename load/save helpers.

    Returns:
        ``[unique_filenames_task, save_filenames_task, [metrics_task]]``; the
        metrics task is wrapped in its own single-element list.
    """
    def task_key(stage: str) -> str:
        return f"{stage}-{fg_index}"

    filenames_task = unique_filenames_delayed(
        ddf=ddf,
        log_dir=log_dir,
        dask_key_name=task_key("unique-filenames"),
    )
    save_task = save_filenames_delayed(
        filenames=filenames_task,
        log_dir=log_dir,
        dask_key_name=task_key("save-filenames"),
    )
    # NOTE(review): fg_index is not forwarded, so metrics_filenames_delayed runs
    # with its default fg_index='file_id' - confirm this is intended.
    metrics_task = metrics_filenames_delayed(
        ddf=ddf,
        filenames=filenames_task,
        dask_key_name=task_key("metrics"),
    )
    return [filenames_task, save_task, [metrics_task]]


def load_global_min_max(log_dir: str):
    """Return the parsed contents of ``{log_dir}/global.json``."""
    global_json_path = f"{log_dir}/global.json"
    with open(global_json_path) as handle:
        return json.load(handle)


@delayed
def metrics_filenames_delayed(ddf: DataFrame, filenames: list, fg_index='file_id'):
    """Compute the flattened metrics of every entry in ``filenames``.

    For each filename a delayed chain ``Analysis.target_ddf_delayed`` ->
    ``filter_delayed`` -> ``flatten_delayed`` is built, using the filename as
    both ``start`` and ``stop``. All chains are then evaluated with a nested
    ``dask.compute`` call.

    Args:
        ddf: DataFrame handed to ``Analysis.target_ddf_delayed``.
        filenames: Values to process, one chain per value.
        fg_index: Passed to ``filter_delayed`` and embedded in the key names.

    Returns:
        The value of ``dask.compute(tasks)``. The task list is passed as one
        argument, so this is a 1-tuple whose single element is the list of
        per-filename results.
    """
    print('calculating metrics for filenames', len(filenames))

    def build_chain(name):
        # target selection -> filter -> flatten for a single filename
        selected = Analysis.target_ddf_delayed(
            ddf=ddf, start=name, stop=name,
            dask_key_name=f"target-ddf-{fg_index}-{name}",
        )
        filtered = filter_delayed(
            ddf=selected, fg_index=fg_index, start=name, stop=name,
            dask_key_name=f"filter-{fg_index}-{name}",
        )
        return flatten_delayed(filtered, dask_key_name=f"flatten-{fg_index}-{name}")

    flatten_tasks = [build_chain(name) for name in filenames]
    print('num of tasks created', len(flatten_tasks))
    # An alternative that submitted the tasks through worker_client() and
    # gathered the futures was tried and left disabled in the original code.
    metrics = dask.compute(flatten_tasks)
    print('computed metrics', type(metrics))
    print('computed metrics', len(metrics))
    return metrics


def read_and_index_logs(prefix: str, fg_index: str, log_dir: str, use_cache=True):
    """Build the delayed pipeline that loads the logs indexed by ``fg_index``.

    If ``use_cache`` is true and ``{log_dir}/indexed/{fg_index}/_metadata``
    exists, the already-indexed parquet copy is read and persisted. Otherwise
    the raw parquet files in ``log_dir`` are read, indexed, persisted,
    repartitioned and saved under ``{log_dir}/indexed/{fg_index}``.

    Args:
        prefix: First half of the suffix appended to every dask key name.
        fg_index: Column to index by; second half of the key-name suffix.
        log_dir: Directory containing the raw ``*.parquet`` files.
        use_cache: Whether an existing indexed copy may be reused.

    Returns:
        List of delayed objects. On the cached path it is ``[read, persisted]``;
        otherwise ``[read, indexed, persisted, save, repartitioned]``.
    """
    suffix = f"{prefix}-{fg_index}"
    cached_dir = f"{log_dir}/{INDEX_DIR}/{fg_index}"
    cache_usable = use_cache and os.path.exists(f"{cached_dir}/_metadata")

    if not cache_usable:
        raw_d = read_parquet_delayed(
            log_dir=log_dir,
            dask_key_name=f"read-parquet-{suffix}",
        )
        indexed_d = set_ddf_index_delayed(
            ddf=raw_d, fg_index=fg_index,
            dask_key_name=f"set-index-{suffix}",
        )
        persisted_d = persist_ddf_delayed(
            ddf=indexed_d,
            dask_key_name=f"persist-ddf-{suffix}",
        )
        repartitioned_d = repartition_delayed(
            ddf=persisted_d,
            dask_key_name=f"repartition-ddf-{suffix}",
        )
        saved_d = save_ddf_delayed(
            ddf=repartitioned_d, log_dir=log_dir, fg_index=fg_index,
            dask_key_name=f"save-ddf-{suffix}",
        )
        # The repartitioned frame is deliberately last: callers take [-1].
        return [raw_d, indexed_d, persisted_d, saved_d, repartitioned_d]

    cached_d = read_parquet_delayed(
        log_dir=cached_dir, index=[fg_index],
        dask_key_name=f"read-parquet-{suffix}",
    )
    persisted_d = persist_ddf_delayed(
        ddf=cached_d,
        dask_key_name=f"persist-ddf-{suffix}",
    )
    return [cached_d, persisted_d]


@delayed
def read_parquet_delayed(log_dir: str, index: list = None):
    """Open every ``*.parquet`` file in ``log_dir`` as a dask DataFrame.

    With a non-empty ``index`` the frame is read with that index and
    ``calculate_divisions=True``; otherwise it is read with ``index=False``.
    """
    pattern = f"{log_dir}/*.parquet"
    if not index:
        print("Index not specified")
        return dd.read_parquet(pattern, index=False)
    print("Index specified", index)
    return dd.read_parquet(pattern, calculate_divisions=True, index=index)


@delayed
def repartition_delayed(ddf: DataFrame, partition_size='128MB'):
    """Return ``ddf`` repartitioned to the requested ``partition_size``."""
    repartitioned = ddf.repartition(partition_size=partition_size)
    return repartitioned


@delayed
def persist_ddf_delayed(ddf: DataFrame):
    """Persist ``ddf``, wait for the persisted collection, and return it."""
    persisted = ddf.persist()
    wait(persisted)
    return persisted


@delayed
def save_ddf_delayed(ddf: DataFrame, log_dir: str, fg_index: str):
    """Write ``ddf`` as parquet under ``{log_dir}/indexed/{fg_index}``.

    Returns:
        The directory path the frame was written to.
    """
    target_dir = f"{log_dir}/{INDEX_DIR}/{fg_index}"
    ddf.to_parquet(target_dir)
    return target_dir


@delayed
def save_filenames_delayed(filenames: list, log_dir: str):
    """Write the sorted ``filenames`` to ``{log_dir}/filenames.json``.

    Values are serialised with ``NpEncoder``. Nothing is returned.
    """
    ordered = sorted(filenames)
    with open(f"{log_dir}/filenames.json", "w") as handle:
        json.dump(ordered, handle, cls=NpEncoder)


@delayed
def set_ddf_index_delayed(ddf: DataFrame, fg_index: str):
    """Return ``ddf`` re-indexed on the single column ``fg_index``."""
    index_columns = [fg_index]
    return ddf.set_index(index_columns)


@delayed
def unique_filenames_delayed(ddf: DataFrame, log_dir: str):
    """Return the unique index values of ``ddf``, using a JSON cache if present.

    If ``{log_dir}/filenames.json`` exists, its parsed contents are returned.
    Otherwise the unique values of ``ddf.index`` are computed and returned; that
    result is not written to the cache file here.
    """
    cache_path = f"{log_dir}/filenames.json"
    if not os.path.exists(cache_path):
        return ddf.index.unique().compute()
    with open(cache_path, "r") as handle:
        return json.load(handle)


In [5]:
# Directory whose *.parquet files are read by the cells below.
# Alternative location (disabled):
# log_dir = "/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet"
log_dir = "/p/vast1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet"

In [12]:
# Open every parquet file directly under log_dir as one dask DataFrame
# (no `index=` argument is passed, unlike the indexed loaders above).
gen_ddf = dd.read_parquet(f"{log_dir}/*.parquet")

In [13]:
%%time

# Two-stage aggregation of the un-indexed log:
#   1. with dask: total duration and size per (filename, proc_id, io_cat);
#   2. with pandas, after .compute(): per (filename, io_cat), the largest
#      per-process duration and the summed size.
# Aggregations are named by string rather than by passing the builtins
# sum/max, which is the form documented by pandas and dask and avoids the
# pandas deprecation warning for builtin callables.
simple_agg_ddf = (
    gen_ddf
    .groupby(['filename', 'proc_id', 'io_cat'])
    .agg({'duration': 'sum', 'size': 'sum'})
    .compute()
    .groupby(['filename', 'io_cat'])
    .agg({'duration': 'max', 'size': 'sum'})
)

simple_agg_ddf

CPU times: user 2min 7s, sys: 8.13 s, total: 2min 15s
Wall time: 3min 23s


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,size
filename,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1
,0,0.049799,0
,3,42.286640,0
%p,1,156.813416,0
%p,3,1.012680,0
./chr1-AFR,3,0.065015,0
...,...,...,...
tmpzzrle2co/chr6.NA21137,3,0.000096,0
tmpzzrle2co/chr6.NA21141,3,0.000115,0
tmpzzrle2co/chr6.NA21142,3,0.000094,0
tmpzzrle2co/chr6.NA21143,3,0.000095,0


In [8]:
%%time
# Build the delayed read/index pipeline for the 'tmid' index, run all of its
# stages, and keep the last computed element: the persisted frame when the
# cached copy is used, the repartitioned frame otherwise.
tmid_ddf_d = read_and_index_logs('genome', 'tmid', log_dir=log_dir)
tmid_ddf = compute(*tmid_ddf_d)[-1]
tmid_ddf

CPU times: user 6min 48s, sys: 17.4 s, total: 7min 5s
Wall time: 13min 57s


Unnamed: 0_level_0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,hostname,app,filename,size,acc_pat,bandwidth,duration,file_id,proc_id
npartitions=2647,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
,int64,int64,int32,int32,int32,int32,float32,float32,object,int32,object,object,object,int64,int32,float32,float32,int64,int64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
%%time
# Same pipeline as for 'tmid', indexed by 'proc_id'; the last computed element
# of the returned task list is kept as proc_ddf.
proc_ddf_d = read_and_index_logs('genome', 'proc_id', log_dir=log_dir)
proc_ddf = compute(*proc_ddf_d)[-1]
proc_ddf

CPU times: user 12.7 s, sys: 703 ms, total: 13.4 s
Wall time: 1min 4s


Unnamed: 0_level_0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,hostname,app,filename,size,acc_pat,bandwidth,duration,tmid,file_id
npartitions=2646,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
109922137388615755,int64,int64,int32,int32,int32,int32,float32,float32,object,int32,object,object,object,int64,int32,float32,float32,int64,int64
109922137388615755,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8997616529868589131,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8997616529868589131,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [6]:
%%time
# Same pipeline, indexed by 'file_id'; the last computed element of the
# returned task list is kept as file_ddf.
file_ddf_d = read_and_index_logs('genome', 'file_id', log_dir=log_dir)
file_ddf = compute(*file_ddf_d)[-1]
file_ddf

CPU times: user 12.3 s, sys: 464 ms, total: 12.8 s
Wall time: 59.2 s


Unnamed: 0_level_0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,hostname,app,filename,size,acc_pat,bandwidth,duration,tmid,proc_id
npartitions=2497,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
-9223371986035283781,int64,int64,int32,int32,int32,int32,float32,float32,object,int32,object,object,object,int64,int32,float32,float32,int64,int64
-8701742508436229974,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8658693104815296183,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9223369538921024184,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [7]:
%%time

# Total duration per file id, summed over every row regardless of process or
# io_cat. The result of .compute() is a pandas DataFrame despite the _ddf name.
# The aggregation is named by string instead of passing the builtin sum.
file_agg_dur_ddf = (
    file_ddf
    .groupby([file_ddf.index.name])
    .agg({'duration': 'sum'})
    .compute()
)

file_agg_dur_ddf

CPU times: user 9.96 s, sys: 823 ms, total: 10.8 s
Wall time: 18.9 s


Unnamed: 0_level_0,duration
file_id,Unnamed: 1_level_1
-9223371986035283781,0.000150
-9223363969247508558,0.000089
-9223362727731233329,0.000122
-9223362290314498489,0.000158
-9223362230599577161,0.000153
...,...
9223361494309915981,0.000240
9223362658809780944,0.000161
9223364712007579008,0.000158
9223368457304021926,0.000089


In [11]:
# Rank the file ids by summed duration, largest first.
file_agg_dur_ddf.sort_values('duration', ascending=False)

Unnamed: 0_level_0,duration
file_id,Unnamed: 1_level_1
-5561148475055268842,41481.980469
6142509188972423790,7695.101074
1917887208199751214,1486.319336
1917887209186244162,1025.085693
1917887210832504031,1021.207825
...,...
5804224772713050805,0.000002
-3731914350092375371,0.000002
7472729269373847073,0.000002
3349545840571170173,0.000002


In [10]:
# Spot-check the summed duration of a single file id.
file_agg_dur_ddf.loc[-9223336117017391838]

duration    0.00211
Name: -9223336117017391838, dtype: float32

In [7]:
%%time

# Two-stage aggregation on the file_id-indexed frame:
#   1. with dask: total duration and size per (file_id, proc_id, io_cat);
#   2. with pandas, after .compute(): per (file_id, io_cat), the largest
#      per-process duration and the summed size.
# Aggregations are named by string instead of passing the builtins sum/max.
file_agg_ddf = (
    file_ddf
    .groupby([file_ddf.index.name, 'proc_id', 'io_cat'])
    .agg({'duration': 'sum', 'size': 'sum'})
    .compute()
    .groupby([file_ddf.index.name, 'io_cat'])
    .agg({'duration': 'max', 'size': 'sum'})
)

file_agg_ddf

CPU times: user 41.5 s, sys: 5.46 s, total: 46.9 s
Wall time: 2min 2s


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,size
file_id,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1
-9223371986035283781,3,0.000150,0
-9223363969247508558,3,0.000089,0
-9223362727731233329,3,0.000122,0
-9223362290314498489,3,0.000158,0
-9223362230599577161,3,0.000153,0
...,...,...,...
9223361494309915981,3,0.000240,0
9223362658809780944,3,0.000161,0
9223364712007579008,3,0.000158,0
9223368457304021926,3,0.000089,0


In [23]:
%%time
# Rank the (file_id, io_cat) rows by duration, largest first.
file_agg_ddf.sort_values('duration', ascending=False)

CPU times: user 11.4 s, sys: 357 ms, total: 11.7 s
Wall time: 10.1 s


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,size
file_id,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1
-5561148475055268842,1,156.813416,0
-3622180793669760841,3,57.132751,0
6142509188972423790,3,42.286640,0
8664006721246125357,3,40.971352,0
-339614686162845625,3,40.939384,0
...,...,...,...
3909313310083485717,3,0.000002,0
8570473682234320595,3,0.000002,0
3349376653219446141,3,0.000002,0
5804224772713050805,3,0.000002,0


In [9]:
# All io_cat rows of one file id (first level of the MultiIndex).
file_agg_ddf.loc[(-5561148475055268842,)]

Unnamed: 0_level_0,duration,size
io_cat,Unnamed: 1_level_1,Unnamed: 2_level_1
1,156.813416,0
3,1.01268,0


In [21]:
%%time
# Collapse io_cat: sum duration and size per file id (index level 0), then
# rank by duration, largest first.
file_agg_ddf.groupby(level=0).sum().sort_values('duration', ascending=False)

CPU times: user 13 s, sys: 890 ms, total: 13.9 s
Wall time: 11.7 s


Unnamed: 0_level_0,duration,size
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-5561148475055268842,157.826096,0
-3622180793669760841,57.132751,0
6142509188972423790,42.336437,0
8664006721246125357,40.971352,0
-339614686162845625,40.939384,0
...,...,...
3909313310083485717,0.000002,0
3349455173811551613,0.000002,0
5804224772713050805,0.000002,0
3349545840571170173,0.000002,0


In [14]:
%%time
# Same per-file totals as above, ranked by size instead of duration.
file_agg_ddf.groupby(level=0).sum().sort_values('size', ascending=False)

CPU times: user 10.1 s, sys: 851 ms, total: 10.9 s
Wall time: 9.19 s


Unnamed: 0_level_0,duration,size
file_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1917887208708658513,4.386764,581538824192
1917887209186244162,9.539897,490165305344
1917887209094521718,5.175895,459639775232
1917887210832504031,9.776803,446941102080
1917887207623702483,4.357019,434235506688
...,...,...
-1234504638971010941,0.000089,0
-1234504595045973535,0.000102,0
-1234500915525040556,0.000088,0
-1234498041349688167,0.000102,0


In [19]:
%%time
# Repeat the two-stage aggregation for a single file id (the one with the
# largest summed size in the ranking above) to cross-check its totals.
# Aggregations are named by string instead of passing the builtins sum/max.
file_ddf0 = (
    file_ddf.loc[1917887208708658513]
    .groupby([file_ddf.index.name, 'proc_id', 'io_cat'])
    .agg({'duration': 'sum', 'size': 'sum'})
    .compute()
    .groupby([file_ddf.index.name, 'io_cat'])
    .agg({'duration': 'max', 'size': 'sum'})
)

file_ddf0

CPU times: user 134 ms, sys: 21.8 ms, total: 156 ms
Wall time: 316 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,size
file_id,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1
1917887208708658513,1,2.064055,2193899520


In [16]:
# Ratio between the two sizes reported for file id 1917887208708658513:
# 581538824192 from the full aggregation vs 2193899520 from the single-file one.
581538824192 / 2193899520

265.07085620402523

In [17]:
# Distinct filename values recorded under this file id.
file_ddf.loc[1917887208708658513]['filename'].unique().compute()

0    /p/gpfs1/iopp/temp/1000-genome-haridev/scratch...
Name: filename, dtype: object

In [13]:
# Total duration over every row of one file id.
file_ddf.loc[-339614686162845625]['duration'].sum().compute()

40.939384

In [22]:
# Total duration over all rows whose filename equals the given path, selected
# with a boolean filter on the filename column rather than an index lookup.
file_ddf[file_ddf['filename'] == '/p/gpfs1/iopp/temp/1000-genome-haridev/scratch/run_dir/individuals']['duration'].sum().compute()

1486.3193

In [None]:
# Per-file totals of the single-file aggregate, ranked by duration.
file_ddf0.groupby(level=0).sum().sort_values('duration', ascending=False)

In [None]:
%%time

# Total duration and size per (proc_id, io_cat), computed with dask and
# returned as a pandas DataFrame.
# Aggregations are named by string instead of passing the builtin sum.
proc_agg_ddf = (
    proc_ddf
    .groupby([proc_ddf.index.name, 'io_cat'])
    .agg({'duration': 'sum', 'size': 'sum'})
    .compute()
)

proc_agg_ddf

In [131]:
# Totals for a single process id: summed duration and size plus the number of
# rows (count of the 'index' data column).
# Aggregations are named by string instead of passing the builtin sum.
(
    proc_ddf.loc[109922137388615755]
    .groupby([proc_ddf.index.name])
    .agg({'duration': 'sum', 'size': 'sum', 'index': 'count'})
    .compute()
)

Unnamed: 0_level_0,duration,size,index
proc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
109922137388615755,87.82843,43136656892,179192


In [90]:
%%time
# Rank the (proc_id, io_cat) aggregates by duration, largest first.
proc_agg_ddf.sort_values('duration', ascending=False)

CPU times: user 4.35 ms, sys: 258 µs, total: 4.61 ms
Wall time: 3.18 ms


Unnamed: 0_level_0,Unnamed: 1_level_0,duration,size,index
proc_id,io_cat,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8712563415903635531,3,1303.128052,-22,6291385
5225694538768582731,3,1292.321411,-22,6291385
6865660408414341195,3,1272.713745,-22,6291385
8719823684455368779,3,1272.629639,-22,6291383
6865659738399443019,3,1269.471436,-22,6291381
...,...,...,...,...
7523492661922287236,0,0.000033,0,10
7523492661921631876,0,0.000033,0,10
7523492661921894020,0,0.000032,0,10
7523492661922614916,0,0.000030,0,10


In [96]:
# Aggregated rows (one per io_cat) of a single process id.
proc_agg_ddf.loc[8712563415903635531]

Unnamed: 0_level_0,duration,size,index
io_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,15.197469,213515417632,1913262
2,41.582199,1761046427,623739
3,1303.128052,-22,6291385


In [91]:
# Recompute the per-io_cat totals of one process id straight from the indexed
# frame, to compare with the row of proc_agg_ddf for the same id.
# Aggregations are named by string instead of passing the builtin sum.
(
    proc_ddf.loc[8712563415903635531]
    .groupby(['io_cat'])
    .agg({'duration': 'sum', 'size': 'sum', 'index': 'count'})
    .compute()
)

Unnamed: 0_level_0,duration,size,index
io_cat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.597726,46023704576,45320
2,16.321741,992450437,13382
3,17.531164,0,142905


In [92]:
# Materialise every row of one process id as a pandas DataFrame for inspection.
proc_ddf.loc[8712563415903635531].compute()

Unnamed: 0_level_0,index,proc,rank,thread_id,cat,io_cat,tstart,tend,func_id,level,hostname,app,filename,size,acc_pat,bandwidth,duration,tmid,file_id
proc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
8712563415903635531,10083320,15940,0,31131248,0,3,1930.808594,1930.808594,__xstat,0,lassen330,individuals_merge,/p/gpfs1/iopp/temp/1000-genome-haridev/scratch...,0,0,0.000000,6.000000e-06,19308086128,1917887209460786633
8712563415903635531,10083321,15940,0,31131248,0,3,1930.808716,1930.808716,open,0,lassen330,individuals_merge,tmp0xd2b1y8/chr8.HG04238,0,0,0.000000,8.700000e-05,19308087052,-6161323545895660353
8712563415903635531,10083322,15940,0,31131248,0,3,1930.808716,1930.808716,__fxstat,0,lassen330,individuals_merge,/p/gpfs1/iopp/temp/1000-genome-haridev/scratch...,0,0,0.000000,9.999999e-07,19308087761,2663128172333801359
8712563415903635531,10083323,15940,0,31131248,0,3,1930.808838,1930.808838,lseek,0,lassen330,individuals_merge,/p/gpfs1/iopp/temp/1000-genome-haridev/scratch...,0,0,0.000000,1.900000e-06,19308087875,2663128172333801359
8712563415903635531,10083324,15940,0,31131248,0,2,1930.808838,1930.808838,write,0,lassen330,individuals_merge,/p/gpfs1/iopp/temp/1000-genome-haridev/scratch...,690,0,20.563602,3.200000e-05,19308088184,2663128172333801359
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8712563415903635531,10284922,15940,0,31131248,0,1,2376.487549,2376.487549,readdir,0,lassen330,individuals_merge,%p,0,0,0.000000,0.000000e+00,23764875429,-5561148475055268842
8712563415903635531,10284923,15940,0,31131248,0,1,2376.487549,2376.487549,readdir,0,lassen330,individuals_merge,%p,0,0,0.000000,4.000000e-06,23764875470,-5561148475055268842
8712563415903635531,10284924,15940,0,31131248,0,3,2376.487549,2376.487549,closedir,0,lassen330,individuals_merge,%p,0,0,0.000000,1.900000e-06,23764875569,-5561148475055268842
8712563415903635531,10284925,15940,0,31131248,0,3,2376.870361,2376.875000,rmdir,0,lassen330,individuals_merge,,0,0,0.000000,4.465800e-03,23768726518,6142509188972423790


In [95]:
# Ratio of the row counts summed over io_cat for process 8712563415903635531:
# 8828386 from proc_agg_ddf vs 201607 from the direct recomputation.
8828386 / 201607

43.79007673344675

In [7]:
%%time

# Collect the distinct index values, sort them, and write them to
# {log_dir}/filenames.json with NpEncoder.
# NOTE(review): `indexed_ddf` is not assigned in any cell visible in this
# notebook - confirm which frame it refers to before re-running.
filenames = indexed_ddf.index.unique().compute()
filenames = list(filenames)
filenames.sort()
with open(f"{log_dir}/filenames.json", "w") as file:
    json.dump(filenames, file, cls=NpEncoder)

CPU times: user 42.1 s, sys: 1.18 s, total: 43.3 s
Wall time: 1min 4s


In [6]:
import dask.array as da
import dask.bag as db
import dask.dataframe as dd
import math
import numpy as np
import time
from dask.distributed import as_completed
from dask.diagnostics import ProgressBar
from time import sleep

def clear_logs(dask_scheduler):
    """Empty the scheduler's ``log``, ``transition_log`` and ``events`` containers.

    Passed to ``client.run_on_scheduler`` before each measured run
    (as suggested in #3898).
    """
    for attr_name in ("log", "transition_log", "events"):
        getattr(dask_scheduler, attr_name).clear()
    dask_scheduler.events.clear()

def read_global_json(log_dir):
    """Return the contents of ``{log_dir}/filenames.json`` as a NumPy array.

    Despite the function name, the file read is ``filenames.json``.
    """
    with open(f"{log_dir}/filenames.json") as handle:
        return np.array(json.load(handle))

def proc_metrics(index, filenames, ddf):
    """Aggregate the rows of ``ddf`` selected by ``filenames`` and save them.

    The selection is materialised as a pandas DataFrame, grouped by the
    ``index`` and ``io_cat`` columns, aggregated, flattened to single-level
    column names and written to parquet.

    Args:
        index: Identifier used only to name the output file.
        filenames: Index labels selected from ``ddf`` with ``.loc``.
        ddf: Dask DataFrame to slice.
            NOTE(review): assumed to carry an ``index`` data column in addition
            to its own index - confirm against the caller.

    Returns:
        Path of the written file, ``{log_dir}/metrics/file_id/{index}.parquet``.
        ``log_dir`` is read from the notebook's global namespace.
    """
    worker = dask.distributed.get_worker()
    worker.log_event("filenames", len(filenames))
    worker.log_event("filenames_shape", np.array(filenames).shape)

    filename = f"{log_dir}/metrics/file_id/{index}.parquet"
    target_ddf = ddf.loc[filenames].reset_index().compute()

    # Every aggregation is given as a list so the result has two-level
    # (column, function) labels. With scalar specs the aggregated 'index'
    # column clashed with the 'index' group key in reset_index(), and the
    # '_'.join below split plain string labels into single characters.
    aggregate = target_ddf.groupby(['index', 'io_cat']).agg({
        'duration': ['sum'],
        'size': ['sum'],
        'index': ['count'],
        'filename': ['min'],
    })
    aggregate = aggregate.reset_index()
    # Flatten ('duration', 'sum') -> 'duration_sum'; the group keys become
    # 'index_' and 'io_cat_', the same scheme cal_metrics_file produces.
    aggregate.columns = ['_'.join(col) for col in aggregate.columns.values]
    aggregate.to_parquet(filename)

    return filename

In [9]:
%%time

# Variant 1: slice the lazy id array directly, one task per bucket.
# NOTE(review): n_el is hard-coded and is passed as the array shape below, so it
# presumably must equal the number of entries in filenames.json - confirm.
n_el = 21260259
# ceil(sqrt(n_el)) is used both as the bucket length and as the bucket count.
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
# Not consumed by the tasks built in this cell.
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
# Lazy 1-D integer array backed by the JSON-loading task.
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)

# Each task receives one slice of the id array and returns its length.
delayed_result = [
    dask.delayed(lambda i, filenames: len(filenames))(i, cm1_files_a[i*n_buckets:(i+1)*n_buckets])
    for i in range(0, n_buckets)
]

result = dask.delayed(list)(delayed_result)
# result.visualize("mapped_direct.png")

CPU times: user 3.24 s, sys: 182 ms, total: 3.42 s
Wall time: 3.24 s


In [10]:
%%time

# Clear the scheduler-side logs, evaluate `result`, and show the number of
# buckets plus the first and last bucket values.
client.run_on_scheduler(clear_logs)
metrics = result.compute()

print(len(metrics))
print(metrics[0])
print(metrics[-1])

4611
4611
3549
CPU times: user 26.8 s, sys: 724 ms, total: 27.6 s
Wall time: 1min 8s


In [11]:
%%time

# Variant 2: rechunk the id array into n_buckets-sized blocks and map one task
# over each block instead of slicing by hand.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
# Not consumed by the tasks built in this cell.
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)
cm1_files_a = cm1_files_a.rechunk(n_buckets)

# Disabled experiment: count matching rows for the first 10 blocks only.
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel()[:10])
# ]
# Each task receives one block of ids and returns its length.
delayed_result = [
    dask.delayed(lambda i, filenames: len(filenames))(i, filenames)
    for i, filenames in enumerate(cm1_files_a.blocks.ravel())
]

result2 = dask.delayed(list)(delayed_result)
# result2.visualize("mapped_rechunk.png")

CPU times: user 4.87 s, sys: 69 ms, total: 4.94 s
Wall time: 4.9 s


In [12]:
%%time

# Clear the scheduler-side logs, evaluate `result2`, and show the number of
# blocks plus the first and last block values.
client.run_on_scheduler(clear_logs)
metrics2 = result2.compute()

print(len(metrics2))
print(metrics2[0])
print(metrics2[-1])

4611
4611
3549
CPU times: user 34 s, sys: 663 ms, total: 34.7 s
Wall time: 1min 15s


In [13]:
%%time

# Variant 3: manual slicing as in variant 1, but each task now looks its ids up
# in the persisted frame and returns the number of matching rows.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)

# The lambda calls .compute() itself, i.e. a nested compute inside each task.
delayed_result = [
    dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, cm1_files_a[i*n_buckets:(i+1)*n_buckets], cm1_persisted_d)
    for i in range(0, n_buckets)
]

result3 = dask.delayed(list)(delayed_result)

CPU times: user 3.75 s, sys: 223 ms, total: 3.98 s
Wall time: 3.8 s


In [14]:
%%time

# Clear the scheduler-side logs, evaluate `result3`, and show the number of
# buckets plus the first and last row counts.
client.run_on_scheduler(clear_logs)
metrics3 = result3.compute()

print(len(metrics3))
print(metrics3[0])
print(metrics3[-1])

4611
4612
3549
CPU times: user 2min 2s, sys: 2.73 s, total: 2min 5s
Wall time: 2min 59s


In [15]:
%%time

# Variant 4: rechunked blocks as in variant 2, with the per-block row count
# lookup of variant 3.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)
cm1_files_a = cm1_files_a.rechunk(n_buckets)

# Disabled experiment: same lookup restricted to the first 10 blocks.
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel()[:10])
# ]
# The lambda calls .compute() itself, i.e. a nested compute inside each task.
delayed_result = [
    dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
    for i, filenames in enumerate(cm1_files_a.blocks.ravel())
]

result4 = dask.delayed(list)(delayed_result)

CPU times: user 5.35 s, sys: 79.2 ms, total: 5.43 s
Wall time: 5.38 s


In [16]:
%%time

# Clear the scheduler-side logs, evaluate `result4`, and show the number of
# blocks plus the first and last row counts.
client.run_on_scheduler(clear_logs)
metrics4 = result4.compute()

print(len(metrics4))
print(metrics4[0])
print(metrics4[-1])

4611
4612
3549
CPU times: user 2min 40s, sys: 3.5 s, total: 2min 44s
Wall time: 3min 28s


In [17]:
%%time

# Variant 5: express the per-block lookup with Array.map_blocks instead of a
# list of delayed tasks.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)
cm1_files_a = cm1_files_a.rechunk(n_buckets)

# Earlier variants, kept disabled for reference.
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel()[:10])
# ]
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel())
# ]

# NOTE(review): the mapped function returns the result of .compute() on a
# count rather than an array block, and result5 is never evaluated in the
# visible cells - confirm this works as intended before relying on it.
result5 = cm1_files_a.map_blocks(lambda filenames, ddf: ddf.loc[filenames].index.count().compute(), cm1_persisted_d)

CPU times: user 3.25 s, sys: 186 ms, total: 3.44 s
Wall time: 7.61 s


In [18]:
%%time

# Only the scheduler-side logs are cleared here; evaluating `result5` and
# printing its summary are disabled.
client.run_on_scheduler(clear_logs)
# metrics5 = result5.compute()

# print(len(metrics5))
# print(metrics5[0])
# print(metrics5[-1])

CPU times: user 63 ms, sys: 48 µs, total: 63.1 ms
Wall time: 61.7 ms


In [8]:
%%time

def proc_metrics(ddf, filenames):
    """Aggregate the rows of ``ddf`` selected by ``filenames``.

    NOTE: this redefines ``proc_metrics`` with a different signature from the
    earlier ``proc_metrics(index, filenames, ddf)`` and shadows it.

    Args:
        ddf: Dask DataFrame to slice with ``.loc``.
        filenames: Index labels to select.

    Returns:
        The computed aggregate per (``index``, ``io_cat``): summed ``duration``
        and ``size``, row count of ``index`` and minimum ``filename``.
    """
    selected = ddf.loc[filenames].reset_index()
    # Aggregations are named by string instead of passing the builtins
    # sum/min, which is the form documented by dask and pandas.
    return selected.groupby(['index', 'io_cat']).agg({
        'duration': 'sum',
        'size': 'sum',
        'index': 'count',
        'filename': 'min',
    }).compute()

# Variant 6: one proc_metrics task per rechunked block of ids.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)
cm1_files_a = cm1_files_a.rechunk(n_buckets)

# Earlier variant, kept disabled for reference.
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel()[:10])
# ]
# The enumerate counter `i` is not used by the task.
delayed_result = [
    dask.delayed(proc_metrics)(cm1_persisted_d, filenames)
    for i, filenames in enumerate(cm1_files_a.blocks.ravel())
]

result6 = dask.delayed(list)(delayed_result)

CPU times: user 3.97 s, sys: 155 ms, total: 4.12 s
Wall time: 4.09 s


In [None]:
%%time

# Clear the scheduler-side logs, evaluate `result6`, and show the number of
# blocks plus the first and last per-block aggregates.
client.run_on_scheduler(clear_logs)
metrics6 = result6.compute()

print(len(metrics6))
print(metrics6[0])
print(metrics6[-1])

In [None]:
%%time

def proc_metrics(ddf):
    """Build the per-(``index``, ``io_cat``) aggregate of an already-sliced frame.

    NOTE: this redefines ``proc_metrics`` again; here the caller does the
    ``.loc`` selection and no ``.compute()`` is called on the aggregate.

    Args:
        ddf: Frame holding only the rows to aggregate.

    Returns:
        The aggregate of summed ``duration`` and ``size``, row count of
        ``index`` and minimum ``filename``.
    """
    # Aggregations are named by string instead of passing the builtins
    # sum/min, which is the form documented by dask and pandas.
    return ddf.reset_index().groupby(['index', 'io_cat']).agg({
        'duration': 'sum',
        'size': 'sum',
        'index': 'count',
        'filename': 'min',
    })

# Variant 7: split the work in two phases - first compute the per-block .loc
# slices through the client, then wrap each slice in a proc_metrics task.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))
# n_buckets = 32

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)
cm1_files_a = cm1_files_a.rechunk(n_buckets)

# Earlier variant, kept disabled for reference.
# delayed_result = [
#     dask.delayed(lambda i, filenames, ddf: ddf.loc[filenames].index.count().compute())(i, filenames, cm1_persisted_d)
#     for i, filenames in enumerate(cm1_files_a.blocks.ravel()[:10])
# ]

# Phase 1: one .loc slice per block, computed synchronously and timed.
t0 = time.time()
task_ddfs_d = [
    dask.delayed(lambda ddf, filenames: ddf.loc[filenames])(cm1_persisted_d, filenames)
    for i, filenames in enumerate(cm1_files_a.blocks.ravel())
]
task_ddfs = client.compute(task_ddfs_d, sync=True)
print("Task ddf", time.time()-t0)

# Phase 2: the blocks are enumerated again only to produce the positions `i`
# used to index task_ddfs; `filenames` is unused here.
delayed_result = [
    dask.delayed(proc_metrics)(task_ddfs[i])
    for i, filenames in enumerate(cm1_files_a.blocks.ravel())
]

result7 = dask.delayed(list)(delayed_result)

In [None]:
%%time

# Clear the scheduler-side logs, evaluate `result7`, and show the number of
# blocks plus the first and last per-block aggregates.
client.run_on_scheduler(clear_logs)
metrics7 = result7.compute()

print(len(metrics7))
print(metrics7[0])
print(metrics7[-1])

In [None]:
%%time

def proc_metrics(ddf):
    """Build the per-(``index``, ``io_cat``) aggregate of an already-sliced frame.

    NOTE: ``proc_metrics`` is defined several times in this notebook; this
    definition takes only the pre-selected frame and does not call
    ``.compute()`` on the aggregate.

    Args:
        ddf: Frame holding only the rows to aggregate.

    Returns:
        The aggregate of summed ``duration`` and ``size``, row count of
        ``index`` and minimum ``filename``.
    """
    # Aggregations are named by string instead of passing the builtins
    # sum/min, which is the form documented by dask and pandas.
    return ddf.reset_index().groupby(['index', 'io_cat']).agg({
        'duration': 'sum',
        'size': 'sum',
        'index': 'count',
        'filename': 'min',
    })

# Variant 8: one task per file id, submitted bucket by bucket; results are
# gathered eagerly into `tasks`, keyed by future key.
# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)

tasks = {}
for bucket in range(0, n_buckets):
    start = bucket * n_buckets
    # n_buckets ** 2 can exceed n_el, so the last bucket is clamped; integer
    # indexing past the end of the array would raise IndexError.
    stop = min((bucket + 1) * n_buckets, n_el)

    target_ddfs = [
        dask.delayed(lambda ddf, filename: ddf.loc[[filename]])(cm1_persisted_d, cm1_files_a[i], dask_key_name=f"target-ddf-{i}")
        for i in range(start, stop)
    ]
    print('target_ddfs', len(target_ddfs))
    # target_ddfs is positioned 0..len-1 within the bucket while `i` is the
    # global file position used in the key name, so the two are paired with
    # zip instead of indexing target_ddfs[i], which failed from bucket 1 on.
    delayed_result = [
        dask.delayed(proc_metrics)(target_ddf, dask_key_name=f"metric-{i}")
        for i, target_ddf in zip(range(start, stop), target_ddfs)
    ]
    print('delayed_result', len(delayed_result))
    t0 = time.time()
    bucket_futures = client.compute(delayed_result, sync=False)
    print('bucket_futures', len(bucket_futures))
    for future in as_completed(bucket_futures):
        tasks[future.key] = future.result()
        now = time.time()
        print('\rCompleted', len(tasks), now-t0)


In [None]:
%%time

client.run_on_scheduler(clear_logs)
# No `result8` graph is built anywhere in the visible cells: the preceding
# bucket loop gathers every finished result into the `tasks` dict, so the
# metrics are collected from there (in completion order) instead.
metrics8 = list(tasks.values())

print(len(metrics8))
print(metrics8[0])
print(metrics8[-1])

In [7]:
%%time

# NOTE(review): n_el is hard-coded and is passed as the array shape - confirm it
# matches filenames.json.
n_el = 21260259
# In this cell n_buckets is used as the chunk length (range step and slice
# width below), not as a chunk count.
# NOTE(review): the expression evaluates left to right as (n_el / 64) * 4,
# i.e. n_el / 16; if n_el / (64 * 4) was intended, parentheses are missing.
n_buckets = math.ceil(n_el*1.0/ 64*4) 
# n_buckets = math.ceil(math.sqrt(n_el))

# NOTE(review): `indexed_dir` is not assigned in any cell visible here.
cm1_indexed_d = dask.delayed(lambda indexed_dir: dd.read_parquet(f"{indexed_dir}/*.parquet", calculate_divisions=True, index=['file_id']))(indexed_dir)
cm1_persisted_d = dask.delayed(lambda ddf: ddf.persist())(cm1_indexed_d)
cm1_files_d = dask.delayed(lambda log_dir: read_global_json(log_dir))(log_dir)
cm1_files_a = da.from_delayed(cm1_files_d, shape=(n_el,), dtype=int)

def cal_metrics_file(ddf, index, file_lists, log_dir):
    """Aggregate per-file I/O metrics for one chunk and write them to parquet.

    Rows are grouped by the frame's (reset) index and ``io_cat``; for every
    group the duration/size/bandwidth sums, the row count, the min/max
    ``proc_id`` and the smallest ``filename`` are computed. The two-level
    column labels are flattened by joining both levels with ``_``, so the
    group keys end up as ``index_`` and ``io_cat_`` (trailing underscore kept
    so the written schema stays unchanged).

    Args:
        ddf: Frame holding the rows of this chunk. ``reset_index()`` must
            yield a column called ``index`` (i.e. the index is unnamed).
            NOTE(review): the loader sets ``index=['file_id']`` -- confirm
            the name is dropped before this function is reached.
        index: Chunk number; used as the output file stem.
        file_lists: Unused; retained so existing callers keep working.
        log_dir: Base directory; output goes to
            ``{log_dir}/metrics/file_id/{index}.parquet``.

    Returns:
        The path of the parquet file that was written.
    """
    output_dir = f"{log_dir}/metrics/file_id"
    filename = f"{output_dir}/{index}.parquet"
    # Make sure the destination exists; exist_ok tolerates concurrent workers.
    os.makedirs(output_dir, exist_ok=True)
    # String aliases instead of the builtins sum/min/max: same result and same
    # column labels, without the pandas >= 2.1 deprecation warning.
    aggregate = ddf.reset_index().groupby(['index', 'io_cat']).agg({
        'duration': 'sum',
        'size': 'sum',
        'bandwidth': 'sum',
        'index': 'count',
        'proc_id': ['min', 'max'],
        'filename': 'min',
    })
    aggregate = aggregate.reset_index()
    # Flatten ('duration', 'sum') -> 'duration_sum'; group keys become 'index_' / 'io_cat_'.
    aggregate.columns = ['_'.join(col) for col in aggregate.columns.values]
    aggregate.to_parquet(filename)
    return filename

# Imported here so the timing below works on a fresh kernel (the notebook's
# import cell does not bring in `time`).
import time

filter_group_index = 'file_id'

futures = []
target_ddfs = []
# `n_buckets` is used as the step/slice width here, i.e. file ids per chunk.
file_range = range(0, n_el, n_buckets)

# Stage 1: build one delayed `.loc` selection per chunk of file ids.
for index, file_index in enumerate(file_range):
    selected_files = cm1_files_a[file_index:file_index+n_buckets]
    # The lambda ignores its third argument; cm1_persisted_d is passed only so
    # that the selection task depends on the persist task.
    target_ddf = dask.delayed(lambda ddf, files, _: ddf.loc[files])(cm1_indexed_d, selected_files, cm1_persisted_d, dask_key_name=f"target_ddf_{index}_{filter_group_index}")
    target_ddfs.append(target_ddf)

# Run all selections and wait for them (sync=True blocks until done).
t0 = time.time()
target_ddfs_ = client.compute(target_ddfs, sync=True)
print("Target ddf", len(target_ddfs_), time.time()-t0)

# Stage 2: submit one aggregation task per selected chunk without waiting.
# The file-id list is not forwarded (an empty list is passed): scattering it
# did not help, because the list of futures is larger than the list of ids.
for index, target_ddf in enumerate(target_ddfs_):
    print(f"processing {index} of {len(target_ddfs_)}", end='\r')
    cal_metrics = dask.delayed(cal_metrics_file)(target_ddf, index, [], log_dir, dask_key_name=f"cal_metrics_{index}_{filter_group_index}")
    futures.append(client.compute(cal_metrics, sync=False))

Target ddf 16 42.540334701538086
processing 0 of 16

  ("('read-parquet-7fc378b9b14dcc5f1c4fa663b16a2bc8' ... 6802749], None)
Consider scattering large objects ahead of time
with client.scatter to reduce scheduler burden and 
keep data on workers

    future = client.submit(func, big_data)    # bad

    big_future = client.scatter(big_data)     # good
    future = client.submit(func, big_future)  # good
  % (format_bytes(len(b)), s)


CPU times: user 1min 32s, sys: 1.58 s, total: 1min 34s
Wall time: 1min 54s


In [None]:
# Display the list of submitted futures to inspect their current state.
futures

In [8]:
%%time
import time
from dask.distributed import as_completed
# Wall-clock reference for the progress readout below.
start_time = time.time()
# Task results, appended in completion order (not submission order).
metrics = []
for future in as_completed(futures):
    # Despite the name, this is the elapsed time in seconds since start_time,
    # sampled before the result is fetched.
    end_time = time.time() - start_time
    # Each result is whatever the submitted task returned.
    metrics.append(future.result())
    # The trailing number is in minutes (seconds / 60); '\r' overwrites the line.
    print(f"Completed {len(metrics)} of {len(futures)} in {end_time/60}", end='\r')

Completed 7 of 16 in 3.8645090897878014

KeyboardInterrupt: 

In [None]:
# Cancel every submitted future, e.g. after interrupting the collection loop.
for f in futures:
    f.cancel()

In [None]:
# Display the futures again to inspect their state.
futures

In [16]:
# Number of submitted futures whose status is currently 'finished'.
finished = sum(1 for fut in futures if fut.status == 'finished')
finished

4609

In [11]:
# Display the results collected so far (the values returned by the finished tasks).
metrics

['/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/0.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/1.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/7.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/3.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/15.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/4.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/17.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/10.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/13.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/5.parquet',
 '/p/gpfs1/iopp/recorder_app_logs/genome_pegas

In [12]:
# Read one written metrics file back as a dask dataframe to check its schema.
# NOTE(review): the absolute path is hard-coded; it appears to correspond to
# f"{log_dir}/metrics/file_id/0.parquet" -- confirm and derive it from log_dir.
file_ddf_0 = dd.read_parquet("/p/gpfs1/iopp/recorder_app_logs/genome_pegasus/nodes-32/_parquet/metrics/file_id/0.parquet")

# Lazy repr: shows column names and dtypes without loading the data.
file_ddf_0

Unnamed: 0_level_0,index_,io_cat_,duration_sum,size_sum,bandwidth_sum,index_count,proc_id_min,proc_id_max,filename_min
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
,int64,int64,float32,int64,float32,int64,int64,int64,object
,...,...,...,...,...,...,...,...,...


In [13]:
# Load and show the first rows of the metrics file.
file_ddf_0.head()

Unnamed: 0,index_,io_cat_,duration_sum,size_sum,bandwidth_sum,index_count,proc_id_min,proc_id_max,filename_min
0,12707,3,9.4e-05,0,0.0,1,3612142438820350027,3612142438820350027,tmpz6yqykx9/chr5.HG00123
1,12818,3,9.9e-05,0,0.0,1,3612142438820350027,3612142438820350027,tmpz6yqykx9/chr5.HG00142
2,13160,3,9.1e-05,0,0.0,1,3612142438820350027,3612142438820350027,tmpz6yqykx9/chr5.HG00253
3,13377,3,0.000119,0,0.0,1,3612142438820350027,3612142438820350027,tmpz6yqykx9/chr5.HG00290
4,13712,3,9.3e-05,0,0.0,1,3612142438820350027,3612142438820350027,tmpz6yqykx9/chr5.HG00366
