# Analysis for DFProfiler

This notebook loads a DFProfiler trace file (`.pfw`) into a Dask DataFrame and summarizes per-function call counts and durations.

## Imports

In [2]:
import json
import logging
import os
from glob import glob
from pathlib import Path

import dask
import dask.bag as db

In [3]:

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [4]:
# Project root: the parent of the directory the notebook is executed from.
app_root = str(Path.cwd().parent)

## Setup Dask Local Cluster

In [5]:
# Number of local Dask workers to start.
workers = 4

# Launch a scheduler and workers locally, then connect a client to that
# cluster; the client overrides the default scheduler.
cluster = LocalCluster(n_workers=workers)
client = Client(cluster)

# NOTE(review): the root logger defaults to WARNING, so this INFO record is
# only shown if logging has been configured to a lower level elsewhere.
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

## Select Trace File

In [6]:

# Path (also usable as a glob pattern) of the trace to analyse, relative to
# the project root.
file = f"{app_root}/tests/output/simple_test_1MB_1K.pfw"

# glob() returns the list of existing paths that match, so despite its name
# file_pattern holds concrete file paths.
file_pattern = glob(file)

# Fail here with a clear message instead of displaying an empty list and
# erroring later when the files are read.
if not file_pattern:
    raise FileNotFoundError(f"No trace files match {file}")

file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_1K.pfw']

## Function to load trace data

In [7]:
def load_profile(args):
    """Parse one line of a trace file into a flat event record.

    Parameters
    ----------
    args : tuple
        A ``(line, path)`` pair. Only ``line`` (one JSON object as text) is
        used; ``path`` is accepted so the function can be mapped over input
        that carries the source path alongside each line.

    Returns
    -------
    dict
        On success, a dict with keys ``pid``, ``tid``, ``ts`` (int),
        ``func_id`` (from ``name``) and ``cat``; ``dur`` (float, from
        ``args.time``) and ``freq`` (from ``args.count``) are added when the
        event has an ``args`` object.
        An empty dict is returned for ``None``/blank lines, for the ``[`` /
        ``]`` array delimiter lines, and for any line that fails to parse.
        A failure never yields a partially filled record.
    """
    line, path = args  # path is unpacked but intentionally unused
    if not line:
        return {}

    stripped = line.strip()
    # Skip blank lines and the JSON array delimiters surrounding the events.
    if not stripped or stripped[0] in "[]":
        return {}

    try:
        # Replace every non-ASCII character with '#' before decoding.
        ascii_line = ''.join(ch if ord(ch) < 128 else '#' for ch in stripped)
        val = json.loads(ascii_line)

        # Build the record in a local so that an exception part-way through
        # cannot leak a half-populated dict to the caller.
        record = {
            "pid": val["pid"],
            "tid": val["tid"],
            "ts": int(val["ts"]),
        }
        if "args" in val:
            record["dur"] = float(val["args"]["time"])
            record["freq"] = val["args"]["count"]
        record["func_id"] = val["name"]
        record["cat"] = val["cat"]
    except Exception as error:
        logging.error(f"Processing {line} failed with {error}")
        return {}
    return record

## Create Dask Dataframe

In [8]:
# Read the trace files line by line as (line, path) pairs, parse each pair
# with load_profile, and keep only the records that carry a timestamp (this
# drops the empty dicts load_profile returns for skipped lines).
pfw_bag = (
    db.read_text(file_pattern, include_path=True)
    .map(load_profile)
    .filter(lambda record: "ts" in record)
)
pfw_bag

dask.bag<filter-lambda, npartitions=1>

In [9]:
# Column name -> dtype mapping used as the schema (meta) of the events
# DataFrame; all dtypes are pyarrow-backed.
columns = dict(
    pid="uint64[pyarrow]",
    tid="uint64[pyarrow]",
    ts="uint64[pyarrow]",
    dur="float32[pyarrow]",
    freq="uint64[pyarrow]",
    func_id="string[pyarrow]",
    cat="string[pyarrow]",
)

In [10]:
# Convert the bag of event dicts into a Dask DataFrame, using `columns` as
# the declared schema.
events = pfw_bag.to_dataframe(meta=columns)

In [11]:
# Collapse the frame to a single partition and persist it on the cluster so
# the queries below do not re-parse the trace.
events = (
    events
    .repartition(npartitions=1)
    .persist()
)
# Block until the persisted data has finished computing.
_ = wait(events)

## Analysis

In [12]:
# Events whose function name contains "write".
(
    events
    .query("func_id.str.contains('write')")
    .compute()
)

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
2086,347784,347784,346000,0.000027,7,write,c
2089,347784,347784,347000,0.000025,6,write,c
2094,347784,347784,345000,0.000021,5,write,c
2095,347784,347784,346000,0.000019,7,write,sys
2097,347784,347784,347000,0.000017,6,write,sys
...,...,...,...,...,...,...,...
30142,347784,347784,1813000,0.000002,1,ext4_file_write_iter,ext4
30148,347784,347784,1824000,0.000002,1,ext4_file_write_iter,ext4
30161,347784,347784,1817000,0.000002,1,ext4_file_write_iter,ext4
30167,347784,347784,1822000,0.000002,1,ext4_file_write_iter,ext4


In [13]:
# Every event recorded at timestamp 1140000.
(
    events
    .query("ts == 1140000")
    .compute()
)

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
12565,347784,347784,1140000,0.000207,118,ext4_da_write_end,ext4
13606,347784,347784,1140000,0.000105,1,read,c
13829,347784,347784,1140000,0.000103,1,read,sys
13968,347784,347784,1140000,0.000102,1,rw_verify_area,vfs
14879,347784,347784,1140000,9.4e-05,119,ext4_da_write_begin,ext4
15231,347784,347784,1140000,9.1e-05,118,mark_buffer_dirty,block
16311,347784,347784,1140000,5.8e-05,118,mark_buffer_dirty,os_cache
16688,347784,347784,1140000,4.8e-05,119,ext4_da_reserve_space,ext4
17197,347784,347784,1140000,2.8e-05,8,"std::chrono::duration<double, std::ratio<1l, 1...",app
17709,347784,347784,1140000,2.5e-05,6,"std::chrono::duration<double, std::ratio<1l, 1...",app


In [14]:
# Events whose function name contains "read".
(
    events
    .query("func_id.str.contains('read')")
    .compute()
)

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
11,347784,347784,13000,0.00071,53,read,c
12,347784,347784,13000,0.000639,53,read,sys
14,347784,347784,14000,0.000395,32,read,c
15,347784,347784,14000,0.000351,32,read,sys
124,347784,347784,2000,0.000023,6,read,c
...,...,...,...,...,...,...,...
25190,347784,347784,1952000,0.000096,1,read,sys
25193,347784,347784,1997000,0.000096,1,read,sys
25200,347784,347784,2213000,0.000096,1,read,sys
25216,347784,347784,2286000,0.000096,1,read,sys


In [15]:
# Every event recorded at timestamp 2875000.
(
    events
    .query("ts == 2875000")
    .compute()
)

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat


In [16]:
# Distinct function names present in the trace.
(
    events["func_id"]
    .unique()
    .compute()
)



0                                                malloc
1                                                  free
2                                                calloc
3                                               realloc
4                                             [unknown]
                            ...                        
56    unsigned int std::__detail::__to_chars_len<uns...
57    std::__cxx11::basic_string<char, std::char_tra...
58                                 ext4_file_write_iter
59                                           MPI_Reduce
60                              Timer::getElapsedTime()
Name: func_id, Length: 61, dtype: string

In [17]:
# Distinct function names within the 'app' category.
(
    events
    .query("cat == 'app'")["func_id"]
    .unique()
    .compute()
)

0                                             [unknown]
1                                         __libc_malloc
2     std::iterator_traits<char const*>::difference_...
3           std::char_traits<char>::length(char const*)
4     std::iterator_traits<char const*>::iterator_ca...
5     bool __gnu_cxx::__is_null_pointer<char const>(...
6     std::chrono::duration<long, std::ratio<1l, 100...
7     std::remove_reference<std::__cxx11::basic_stri...
8     void std::__detail::__to_chars_10_impl<unsigne...
9     std::chrono::duration<double, std::ratio<1l, 1...
10    std::chrono::duration<long, std::ratio<1l, 100...
11    std::chrono::duration<double, std::ratio<1l, 1...
12    std::chrono::time_point<std::chrono::_V2::syst...
13           std::chrono::duration_values<long>::zero()
14                                  Timer::resumeTime()
15    unsigned int std::__detail::__to_chars_len<uns...
16    std::__cxx11::basic_string<char, std::char_tra...
17                                              

In [18]:
# Aggregate the events in three steps:
#   1. per (func_id, cat, pid, tid, ts): total freq and dur;
#   2. per (func_id, cat, ts): sum freq and take the largest dur over pid/tid;
#   3. per (cat, func_id): sum over all timestamps.
# Aggregations are named by string ("sum" / "max") rather than by passing the
# Python builtins, which pandas has deprecated as groupby aggregators.
functions = (
    events
    .groupby(["func_id", "cat", "pid", "tid", "ts"])[["freq", "dur"]]
    .sum()
    .groupby(["func_id", "cat", "ts"])
    .agg({"freq": "sum", "dur": "max"})
    .groupby(["cat", "func_id"])
    .sum()
)
# Turn the (cat, func_id) group keys back into regular columns.
functions = functions.reset_index()
functions.compute()

Unnamed: 0,cat,func_id,freq,dur
0,app,Timer::getElapsedTime(),4,0.000012
1,app,Timer::resumeTime(),2050,0.006192
2,app,[unknown],6,0.000027
3,app,__libc_malloc,3,0.020369
4,app,bool __gnu_cxx::__is_null_pointer<char const>(...,1,0.000003
...,...,...,...,...
66,vfs,vfs_open,1552,0.001808
67,vfs,vfs_readlink,14,0.000015
68,vfs,vfs_statfs.part.0.isra.0,3,0.000002
69,vfs,vfs_statx,32,0.000144


In [19]:
# Aggregated row for the 'write' function in the 'c' category.
num_writes = functions.query(
    "func_id == 'write' and cat == 'c'"
)
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur
40,c,write,1058,0.008778


In [20]:
# Aggregated rows for ext4_file_write_iter in the 'ext4' category.
num_writes_ext4 = functions.query(
    "func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'"
)
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur
45,ext4,ext4_file_write_iter,1024,0.002678


In [21]:
# Aggregated rows for mark_buffer_dirty in the 'os_cache' category.
num_writes_os_cache = functions.query(
    "func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'"
)
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
50,os_cache,mark_buffer_dirty,262144,0.129568


In [22]:
# Aggregated row for the 'read' function in the 'c' category.
num_reads = functions.query(
    "func_id == 'read' and cat == 'c'"
)
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur
36,c,read,4010,0.124408


In [23]:
# Aggregated rows in the 'ext4' category whose function name contains "read".
num_reads_ext4 = functions.query(
    "func_id.str.contains('read') and cat == 'ext4'"
)
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur


In [24]:
# Aggregated rows for mark_page_accessed in the 'os_cache' category.
# Stored under its own name so that num_writes_os_cache (the mark_buffer_dirty
# result) is not silently overwritten with different data.
num_reads_os_cache = functions.query(
    "func_id.str.contains('mark_page_accessed') and cat == 'os_cache'"
)
num_reads_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
51,os_cache,mark_page_accessed,1,1e-06


In [25]:
# Evaluate the smallest and largest timestamp together in one dask.compute call.
min_ts, max_ts = dask.compute(
    events["ts"].min(),
    events["ts"].max(),
)

In [27]:
# Span between the first and last event timestamp, divided by 1e6.
# NOTE(review): presumably converts microseconds to seconds — confirm the unit of "ts".
(max_ts - min_ts) / 1e6

np.float64(2.398)

In [28]:
# Sum of the freq column over every event in the trace.
(
    events["freq"]
    .sum()
    .compute()
)

np.int64(1438183)