In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

In [2]:
# Application root: the parent of the directory the notebook is run from.
app_root = str(Path.cwd().parent)

In [3]:
def load_profile(args):
    """Parse one line of a trace file into a flat event record.

    Args:
        args: A ``(line, path)`` pair. ``line`` is the raw text of one line;
            ``path`` is accepted so the function can be mapped over a bag read
            with ``include_path=True`` but is not used.

    Returns:
        dict: On success, a record with the keys ``pid``, ``tid``, ``ts``
        (as ``int``), ``func_id`` (taken from ``name``) and ``cat``; when the
        event carries an ``args`` object, also ``dur`` (``args.time`` as
        ``float``) and ``freq`` (``args.count``).
        An empty dict is returned for missing/blank lines, for lines starting
        with ``[``, and for any line that fails to parse. A failed line never
        yields a partially filled record, so callers can safely filter on the
        presence of a single key such as ``"ts"``.
    """
    line, _path = args
    # Skip None, empty and bare-newline lines, and lines that start with "[".
    if not line or line == "\n" or line[0] == "[":
        return {}
    try:
        # Replace every non-ASCII character with "#" before JSON decoding.
        ascii_line = "".join(ch if ord(ch) < 128 else "#" for ch in line)
        val = json.loads(ascii_line)
        record = {
            "pid": val["pid"],
            "tid": val["tid"],
            "ts": int(val["ts"]),
        }
        if "args" in val:
            record["dur"] = float(val["args"]["time"])
            record["freq"] = val["args"]["count"]
        record["func_id"] = val["name"]
        record["cat"] = val["cat"]
    except Exception as error:
        # Best effort: report the bad line and drop it entirely instead of
        # returning whatever fields happened to be parsed before the failure.
        logging.error(f"Processing {line} failed with {error}")
        return {}
    return record

In [4]:
# Start a local Dask cluster and connect a client to it.
# Number of worker processes requested from the local cluster.
workers=4
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
# NOTE(review): no logging level is configured in the visible cells, so this
# INFO message may not be displayed — confirm.
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

In [5]:

# Path of the trace file to analyse, built relative to app_root.
file=f"{app_root}/tests/output/simple_test_1MB_1K.pfw"
# Bare expression so the notebook displays the resolved path.
file

'/home/cc/dfprofiler/tests/output/simple_test_1MB_1K.pfw'

In [6]:
# Expand the path with glob; despite its name, file_pattern holds the list of
# matching file paths, not the pattern itself.
file_pattern = glob(file)
file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_1K.pfw']

In [7]:
# Read the trace files line by line (each item is passed to load_profile as a
# pair, since include_path=True), parse each line, and keep only the records
# that carry a "ts" key.
pfw_bag = (
    dask.bag.read_text(file_pattern, include_path=True)
    .map(load_profile)
    .filter(lambda x: "ts" in x)
)
pfw_bag

dask.bag<filter-lambda, npartitions=1>

In [8]:
# Column name -> dtype mapping used as the `meta` of the events dataframe.
columns = dict(
    pid="uint64[pyarrow]",
    tid="uint64[pyarrow]",
    ts="uint64[pyarrow]",
    dur="float32[pyarrow]",
    freq="uint64[pyarrow]",
    func_id="string[pyarrow]",
    cat="string[pyarrow]",
)

In [9]:
# Convert the bag of parsed records into a dataframe, using `columns` as meta.
events = pfw_bag.to_dataframe(meta=columns)

In [10]:
# Collapse the dataframe to a single partition and persist it on the cluster.
# NOTE: this rebinds `events` to a value derived from itself, so re-running the
# cell operates on the already repartitioned/persisted frame.
events = events.repartition(npartitions=1).persist()
# Block until the persisted frame has finished computing; the result is unused.
_ = wait(events)

In [12]:
# Show the events whose func_id contains "write".
events.query("func_id.str.contains('write')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
83,50915,50915,1000000,0.05222,37722,ext4_da_write_end,ext4
88,50915,50915,1000000,0.027251,37632,ext4_da_write_begin,ext4
118,50915,50915,1000000,0.000905,147,write,c
122,50915,50915,1000000,0.000618,147,write,sys
125,50915,50915,1000000,0.000312,147,ext4_file_write_iter,ext4
130,50916,50916,0,0.000254,2,write,c
131,50916,50916,0,0.00025,2,write,sys
133,50916,50916,0,0.000247,2,ext4_file_write_iter,ext4
144,50915,50915,0,0.000121,23,write,c
148,50915,50915,0,6.9e-05,23,write,sys


In [11]:
# List the distinct func_id values present in the events.
events["func_id"].unique().compute()



0                          malloc
1                            free
2                         realloc
3                            read
4                        vfs_read
5                          calloc
6                           close
7                          openat
8                            mmap
9                        vfs_open
10              vfs_getattr_nosec
11                      vfs_statx
12                         mremap
13                 ext4_file_open
14                      vfs_mkdir
15                         fileno
16                fileno_unlocked
17                           open
18                         open64
19                          lseek
20                        lseek64
21             mark_page_accessed
22                     readlinkat
23                   vfs_readlink
24                         mmap64
25                          fcntl
26                         fdopen
27                       shm_open
28                     shm_unlink
29            

In [17]:
# Roll the raw events up to one row per (cat, func_id) in three stages:
#   1. per (func_id, cat, pid, tid, ts): sum freq and dur;
#   2. per (func_id, cat, ts), i.e. across pid/tid: sum freq, keep the largest dur;
#   3. per (cat, func_id), i.e. across ts: sum both columns.
# The aggregations are given as the string aliases "sum"/"max" rather than the
# Python builtins, which is the documented form for groupby aggregations.
functions = (
    events.groupby(["func_id", "cat", "pid", "tid", "ts"])[["freq", "dur"]]
    .sum()
    .groupby(["func_id", "cat", "ts"])
    .agg({"freq": "sum", "dur": "max"})
    .groupby(["cat", "func_id"])
    .sum()
    .reset_index()  # turn the (cat, func_id) index back into regular columns
)
functions.compute()

Unnamed: 0,cat,func_id,freq,dur
0,c,calloc,10322,0.021258
1,c,close,4784,0.007825
2,c,fcntl,35,4.7e-05
3,c,fdopen,3,1.7e-05
4,c,fileno,27,6.5e-05
5,c,fileno_unlocked,27,5.8e-05
6,c,free,62639,0.1343
7,c,lseek,1075,0.002169
8,c,lseek64,1075,0.001859
9,c,malloc,68307,0.12872


In [19]:
# Aggregated row for func_id "write" in category "c".
num_writes = functions.query("func_id == 'write' and cat == 'c'")
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur
21,c,write,1077,0.006886


In [23]:
# Aggregated rows in category "ext4" whose func_id contains "ext4_file_write_iter".
num_writes_ext4 = functions.query("func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'")
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur
28,ext4,ext4_file_write_iter,1026,0.002348


In [24]:
# Aggregated rows in category "os_cache" whose func_id contains "mark_buffer_dirty".
num_writes_os_cache = functions.query("func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
32,os_cache,mark_buffer_dirty,262194,0.124409


In [25]:
# Aggregated row for func_id "read" in category "c".
num_reads = functions.query("func_id == 'read' and cat == 'c'")
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur
17,c,read,6810,0.618551


In [26]:
# Aggregated rows in category "ext4" whose func_id contains "read".
# NOTE(review): the recorded output of this cell has no rows — confirm whether
# that is expected for this trace or the filter needs a different name/category.
num_reads_ext4 = functions.query("func_id.str.contains('read') and cat == 'ext4'")
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur


In [27]:
# Aggregated rows in category "os_cache" whose func_id contains
# "mark_page_accessed". This cell follows the num_reads / num_reads_ext4 cells,
# so the result uses the num_reads_ prefix and its own name; it therefore does
# not overwrite the mark_buffer_dirty result held in num_writes_os_cache.
num_reads_os_cache = functions.query("func_id.str.contains('mark_page_accessed') and cat == 'os_cache'")
num_reads_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
33,os_cache,mark_page_accessed,6250,0.002136
