# Analysis for DFProfiler

This notebook loads a DFProfiler trace file (`.pfw`) into a Dask DataFrame and summarizes call counts and durations per function.

## Imports

In [14]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob

In [15]:

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [16]:
# Project root: the parent of the current working directory.
app_root = str(Path.cwd().parent)

## Setup Dask Local Cluster

In [17]:
# Start a local Dask scheduler with `workers` workers and connect a client to it.
workers = 4
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")
# The root logger defaults to WARNING, so the INFO line above is normally not
# shown; display the client so the dashboard link appears in the cell output.
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 33705 instead


## Locate Trace Files

In [19]:

# Resolve the trace file(s) to analyze; `file` is passed through glob, so it
# may contain wildcards. `file_pattern` holds the list of matching paths.
file = f"{app_root}/tests/output/simple_test_1MB_1K.pfw"
file_pattern = glob(file)
# Fail fast with a clear message instead of building a pipeline over no files.
assert file_pattern, f"No trace files match {file}"
file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_1K.pfw']

## Function to load trace data

In [20]:
def load_profile(args):
    """Parse one line of a trace file into a flat event record.

    Args:
        args: A ``(line, path)`` pair, as produced by
            ``dask.bag.read_text(..., include_path=True)``. ``path`` is not used.

    Returns:
        dict: On success, a dict with keys ``pid``, ``tid``, ``ts`` (int),
        ``func_id`` (taken from the event's ``name``) and ``cat``. ``dur``
        (float, from ``args["time"]``) and ``freq`` (from ``args["count"]``)
        are included when the event has an ``args`` object. An empty dict is
        returned for blank lines, the ``[`` / ``]`` array delimiters, and any
        line that fails to parse, so callers can filter on ``"ts" in record``.
    """
    line, _path = args
    if not line:
        return {}
    stripped = line.strip()
    # Skip blank lines and the JSON array delimiters that wrap the events.
    if not stripped or stripped[0] in "[]":
        return {}
    try:
        # Replace non-ASCII characters with '#' before handing the line to json.
        ascii_line = ''.join(ch if ord(ch) < 128 else '#' for ch in stripped)
        val = json.loads(ascii_line)
        record = {
            "pid": val["pid"],
            "tid": val["tid"],
            "ts": int(val["ts"]),
        }
        if "args" in val:
            record["dur"] = float(val["args"]["time"])
            record["freq"] = val["args"]["count"]
        record["func_id"] = val["name"]
        record["cat"] = val["cat"]
    except Exception as error:
        logging.error(f"Processing {line} failed with {error}")
        # Drop the whole line: a half-filled record that already contains "ts"
        # would otherwise pass the downstream `"ts" in x` filter.
        return {}
    return record

## Create Dask Dataframe

In [21]:
# Read the trace files as text, parse every line with load_profile, and keep
# only the records that carry a "ts" key (load_profile returns {} otherwise).
pfw_bag = (
    dask.bag.read_text(file_pattern, include_path=True)
    .map(load_profile)
    .filter(lambda x: "ts" in x)
)
pfw_bag

dask.bag<filter-lambda, npartitions=1>

In [22]:
# Column name -> dtype string, used as `meta` when the bag becomes a DataFrame.
columns = dict(
    pid="uint64[pyarrow]",
    tid="uint64[pyarrow]",
    ts="uint64[pyarrow]",
    dur="float32[pyarrow]",
    freq="uint64[pyarrow]",
    func_id="string[pyarrow]",
    cat="string[pyarrow]",
)

In [23]:
# Turn the bag of event dicts into a Dask DataFrame with the dtypes declared in `columns`.
events = pfw_bag.to_dataframe(meta=columns)

In [24]:
# Collapse the frame into a single partition, keep it persisted on the cluster,
# and block until that work has finished.
events = (
    events
    .repartition(npartitions=1)
    .persist()
)
_ = wait(events)

## Analysis

In [25]:
# Show every event whose func_id contains the substring 'write'.
events.query("func_id.str.contains('write')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
4190,4176,4176,556000000,0.00001,2,write,c
4224,4176,4176,556000000,0.000007,2,write,sys
4497,4176,4176,568000000,0.000006,1,write,c
4508,4176,4176,569000000,0.000005,1,write,c
4516,4176,4176,568000000,0.000004,1,write,sys
...,...,...,...,...,...,...,...
26559,4176,4176,3115000000,0.00001,1,write,sys
26574,4176,4176,3265000000,0.000007,1,write,c
26580,4175,4175,3241000000,0.000007,1,write,c
26587,4176,4176,3265000000,0.000005,1,write,sys


In [26]:
# List the distinct function names present in the trace.
events["func_id"].unique().compute()



0                          malloc
1                            free
2                          calloc
3                         realloc
4                            read
5                            open
6                          open64
7                          fileno
8                  rw_verify_area
9                 fileno_unlocked
10              vfs_getattr_nosec
11                         openat
12                 ext4_file_open
13                          close
14                       vfs_open
15                           mmap
16                       shm_open
17                     shm_unlink
18             mark_page_accessed
19                          lseek
20                        lseek64
21                     vfs_unlink
22                          fcntl
23                         fdopen
24                      vfs_statx
25                     readlinkat
26                   vfs_readlink
27                         mremap
28                         mmap64
29            

In [27]:
# Per-function totals, built in three aggregation steps:
#   1. sum freq and dur per (func_id, cat, pid, tid, ts);
#   2. per (func_id, cat, ts), i.e. across pid/tid: add up freq, keep the largest dur;
#   3. sum over ts to get one row per (cat, func_id).
# Aggregations are named by string ("sum", "max") rather than by passing the
# Python builtins, and reset_index is part of the chain so that re-running the
# cell always starts from `events`.
functions = (
    events
    .groupby(["func_id", "cat", "pid", "tid", "ts"])[["freq", "dur"]]
    .sum()
    .groupby(["func_id", "cat", "ts"])
    .agg({"freq": "sum", "dur": "max"})
    .groupby(["cat", "func_id"])
    .sum()
    .reset_index()
)
functions.compute()

Unnamed: 0,cat,func_id,freq,dur
0,c,calloc,10322,0.03607
1,c,close,4784,0.012817
2,c,fcntl,35,7.2e-05
3,c,fdopen,3,3e-05
4,c,fileno,27,0.000151
5,c,fileno_unlocked,27,0.00013
6,c,free,62623,0.186737
7,c,lseek,1075,0.003423
8,c,lseek64,1075,0.002745
9,c,malloc,68290,0.239962


In [28]:
# Aggregated row for func_id 'write' in category 'c'.
num_writes = functions.query("func_id == 'write' and cat == 'c'")
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur
21,c,write,1076,0.009275


In [29]:
# Aggregated rows in category 'ext4' whose func_id contains 'ext4_file_write_iter'.
num_writes_ext4 = functions.query("func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'")
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur
28,ext4,ext4_file_write_iter,1026,0.002762


In [30]:
# Aggregated rows in category 'os_cache' whose func_id contains 'mark_buffer_dirty'.
num_writes_os_cache = functions.query("func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
32,os_cache,mark_buffer_dirty,262195,0.132587


In [31]:
# Aggregated row for func_id 'read' in category 'c'.
num_reads = functions.query("func_id == 'read' and cat == 'c'")
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur
17,c,read,6810,0.794254


In [32]:
# Aggregated rows in category 'ext4' whose func_id contains 'read'.
num_reads_ext4 = functions.query("func_id.str.contains('read') and cat == 'ext4'")
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur


In [33]:
# Aggregated rows in category 'os_cache' whose func_id contains 'mark_page_accessed'.
# Stored under its own name so it does not overwrite the 'mark_buffer_dirty'
# result that an earlier cell assigns to `num_writes_os_cache`.
num_reads_os_cache = functions.query("func_id.str.contains('mark_page_accessed') and cat == 'os_cache'")
num_reads_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
33,os_cache,mark_page_accessed,6184,0.003235


In [34]:
# Earliest and latest event timestamps, evaluated in a single dask.compute call.
min_ts, max_ts = dask.compute(
    events["ts"].min(),
    events["ts"].max(),
)

In [37]:
# Span between the earliest and latest event timestamp, scaled down by 1e9.
# NOTE(review): presumably converts ts to seconds — confirm the unit of `ts`.
(max_ts - min_ts) / 1e9

np.float64(3.31)

In [38]:
# Total of the freq column over all events.
events["freq"].sum().compute()

np.int64(1284246)