# Analysis for DFProfiler

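This notebook loads a DFProfiler trace file (`.pfw`) into a Dask dataframe and summarizes the recorded function calls (call counts and durations per function and category).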

## Imports

In [1]:
import json
import logging
import os
from glob import glob
from pathlib import Path

import dask
import dask.bag as db

In [2]:

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
# Project root: the parent of the directory the notebook is run from.
app_root = str(Path.cwd().parent)

## Setup Dask Local Cluster

In [4]:
# Number of Dask workers to start on this machine.
workers=4
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
# NOTE(review): this is an INFO-level message on the root logger; unless logging
# is configured elsewhere to show INFO, it will likely not be displayed -- confirm.
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

## Select Trace File

In [5]:

# Path of the trace file to analyse, located under the project root.
file=f"{app_root}/tests/output/simple_test_1MB_1K.pfw"
# glob() returns the list of existing paths that match `file`.
file_pattern = glob(file)
# Display the matched paths.
file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_1K.pfw']

## Function to load trace data

In [6]:
def load_profile(args):
    """Parse one line of a trace file into a flat event record.

    Args:
        args: A ``(line, path)`` pair. Only ``line`` is used; ``path`` is
            accepted so the function can be mapped over a bag created with
            ``include_path=True``.

    Returns:
        dict: On success, a record with the keys ``pid``, ``tid``, ``ts``
        (converted to ``int``), ``func_id`` (taken from ``name``) and ``cat``.
        When the event has an ``args`` object, ``dur`` (``args["time"]`` as
        ``float``) and ``freq`` (``args["count"]``) are included as well.
        An empty dict is returned for ``None``/empty/newline-only lines, for
        lines starting with ``[``, and for lines that cannot be processed.
        A failure never yields a partially filled record, so callers that
        filter on ``"ts" in record`` only ever see complete events.
    """
    line, _path = args
    # Nothing to parse: missing/blank line, or a line that starts with "["
    # (presumably the opening bracket of the trace's JSON array).
    if not line or line == "\n" or line[0] == "[":
        return {}
    try:
        # Replace every non-ASCII character with '#' before parsing.
        ascii_line = "".join(ch if ord(ch) < 128 else "#" for ch in line)
        val = json.loads(ascii_line)
        record = {
            "pid": val["pid"],
            "tid": val["tid"],
            "ts": int(val["ts"]),
        }
        # Duration and call count are only present on events that carry args.
        if "args" in val:
            record["dur"] = float(val["args"]["time"])
            record["freq"] = val["args"]["count"]
        record["func_id"] = val["name"]
        record["cat"] = val["cat"]
    except Exception as error:
        logging.error(f"Processing {line} failed with {error}")
        # Drop the whole event rather than returning whatever was filled in
        # before the failure.
        return {}
    return record

## Create Dask Dataframe

In [7]:
# Read the trace files line by line; include_path=True makes every element a
# (line, path) pair, which is the argument shape load_profile expects.
# load_profile returns an empty dict for lines it skips, so keep only the
# records that carry a "ts" key.
# `db` is the explicitly imported dask.bag module: reaching it as `dask.bag`
# only works if some other import happened to load the submodule first.
pfw_bag = (
    db.read_text(file_pattern, include_path=True)
    .map(load_profile)
    .filter(lambda x: "ts" in x)
)
pfw_bag

dask.bag<filter-lambda, npartitions=1>

In [8]:
# Column name -> dtype for the events dataframe, in the order the columns
# should appear.
columns = dict(
    pid="uint64[pyarrow]",
    tid="uint64[pyarrow]",
    ts="uint64[pyarrow]",
    dur="float32[pyarrow]",
    freq="uint64[pyarrow]",
    func_id="string[pyarrow]",
    cat="string[pyarrow]",
)

In [9]:
# Turn the bag of record dicts into a Dask dataframe, using `columns` as the
# column-name -> dtype metadata.
events = pfw_bag.to_dataframe(meta=columns)

In [10]:
# Collapse the dataframe to a single partition and persist it so that the
# queries below reuse the computed result.
events = events.repartition(npartitions=1).persist()
# Block until the persisted dataframe has finished computing.
_ = wait(events)

## Analysis

In [11]:
# All events whose func_id contains 'write'.
events.query("func_id.str.contains('write')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
9395,17085,17085,1140000,0.000023,2,write,c
9496,17085,17085,1141000,0.000021,2,write,c
9810,17085,17085,1138000,0.000015,1,write,c
9852,17085,17085,1165000,0.000014,3,write,c
9859,17085,17085,1128000,0.000014,1,write,c
...,...,...,...,...,...,...,...
51318,17085,17085,3138000,0.000001,1,ext4_file_write_iter,ext4
51327,17085,17085,2875000,0.000001,1,ext4_file_write_iter,ext4
51330,17085,17085,3096000,0.000001,1,ext4_file_write_iter,ext4
51363,17085,17085,3890000,0.000001,1,ext4_file_write_iter,ext4


In [15]:
# All events recorded at ts 1140000 (a timestamp at which a 'write' event was seen).
events.query("ts == 1140000").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
7266,17085,17085,1140000,0.000282,25,malloc,c
7626,17085,17085,1140000,0.000136,12,free,c
8377,17085,17085,1140000,5.4e-05,40,kfree,kmem
8396,17085,17085,1140000,5.3e-05,39,kmalloc_trace,kmem
9395,17085,17085,1140000,2.3e-05,2,write,c
9911,17085,17085,1140000,1.4e-05,2,write,sys
10146,17085,17085,1140000,1.1e-05,2,rw_verify_area,vfs
11276,17085,17085,1140000,4e-06,2,rw_verify_area,vfs


In [16]:
# All events whose func_id contains 'read'.
events.query("func_id.str.contains('read')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
145,17085,17085,6000,0.00003,4,read,c
154,17085,17085,6000,0.000019,4,read,sys
156,17085,17085,5000,0.000018,2,read,c
171,17085,17085,5000,0.000013,2,read,sys
253,17085,17085,52000,0.000623,48,read,c
...,...,...,...,...,...,...,...
47648,17085,17085,2875000,0.000002,1,read,sys
47679,17085,17085,2870000,0.000002,1,read,sys
47789,17085,17085,3062000,0.000002,1,read,sys
47793,17085,17085,2893000,0.000002,1,read,sys


In [17]:
# All events recorded at ts 2875000 (a timestamp at which a 'read' event was seen).
events.query("ts == 2875000").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
34848,17085,17085,2875000,8.5e-05,234,kmem_cache_alloc,kmem
34851,17085,17085,2875000,8.5e-05,118,mark_buffer_dirty,block
34896,17085,17085,2875000,8.4e-05,117,ext4_da_write_begin,ext4
37245,17085,17085,2875000,7.1e-05,118,ext4_da_write_end,ext4
38699,17085,17085,2875000,5.4e-05,118,mark_buffer_dirty,os_cache
40382,17085,17085,2875000,4.3e-05,117,ext4_da_reserve_space,ext4
41686,17085,17085,2875000,3.7e-05,120,kmem_cache_free,kmem
43604,17085,17085,2875000,5e-06,1,write,c
44179,17085,17085,2875000,4e-06,1,read,c
44687,17085,17085,2875000,4e-06,7,kmalloc_slab,kmem


In [12]:
# Distinct function names present in the trace.
events["func_id"].unique().compute()



0                          malloc
1                            free
2                           kfree
3                   kmalloc_trace
4                kmem_cache_alloc
5                         realloc
6                          calloc
7                 kmem_cache_free
8                    kmalloc_slab
9               vfs_getattr_nosec
10                           read
11                         fileno
12                fileno_unlocked
13                 rw_verify_area
14                           open
15                         open64
16    kmem_cache_free_bulk.part.0
17                 ext4_file_open
18                         openat
19          kmem_cache_alloc_bulk
20                          close
21                       vfs_open
22                           mmap
23           kmem_cache_alloc_lru
24             mark_page_accessed
25                          lseek
26                       shm_open
27                     shm_unlink
28                        lseek64
29            

In [13]:
# Reduce the events to one row per (cat, func_id) in three stages:
#   1. sum freq and dur per (func_id, cat, pid, tid, ts);
#   2. per (func_id, cat, ts), add up freq across pid/tid but keep only the
#      largest dur;
#   3. sum freq and dur over all ts per (cat, func_id).
# The aggregations are named with strings ("sum"/"max"), which is the
# documented form, instead of passing the Python builtins sum/max.
per_thread = events.groupby(["func_id", "cat", "pid", "tid", "ts"])[["freq", "dur"]].sum()
per_ts = per_thread.groupby(["func_id", "cat", "ts"]).agg({"freq": "sum", "dur": "max"})
functions = per_ts.groupby(["cat", "func_id"]).sum().reset_index()
functions.compute()

Unnamed: 0,cat,func_id,freq,dur
0,block,block_devnode,3,0.000005
1,block,block_dirty_folio,2152,0.001395
2,block,block_uevent,3,0.000004
3,block,kfree,6,0.000015
4,block,kmem_cache_free,2071,0.002106
...,...,...,...,...
68,vfs,vfs_getattr_nosec,1830,0.001532
69,vfs,vfs_open,118,0.000106
70,vfs,vfs_statfs.part.0.isra.0,1,0.000002
71,vfs,vfs_statx,27,0.000055


In [14]:
# Aggregated freq/dur for the 'write' function in category 'c'.
num_writes = functions.query("func_id == 'write' and cat == 'c'")
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur
27,c,write,1075,0.00699


In [29]:
# Aggregated freq/dur for functions containing 'ext4_file_write_iter' in category 'ext4'.
num_writes_ext4 = functions.query("func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'")
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur
28,ext4,ext4_file_write_iter,1026,0.002762


In [30]:
# Aggregated freq/dur for functions containing 'mark_buffer_dirty' in category 'os_cache'.
num_writes_os_cache = functions.query("func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
32,os_cache,mark_buffer_dirty,262195,0.132587


In [31]:
# Aggregated freq/dur for the 'read' function in category 'c'.
num_reads = functions.query("func_id == 'read' and cat == 'c'")
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur
17,c,read,6810,0.794254


In [32]:
# Aggregated freq/dur for functions containing 'read' in category 'ext4'
# (the recorded output for this trace has no matching rows).
num_reads_ext4 = functions.query("func_id.str.contains('read') and cat == 'ext4'")
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur


In [33]:
# Aggregated freq/dur for functions containing 'mark_page_accessed' in category
# 'os_cache'. This lookup sits with the read counters (num_reads,
# num_reads_ext4), so it is stored under a read-oriented name; the previous
# name `num_writes_os_cache` silently overwrote the 'mark_buffer_dirty' result
# computed earlier in the notebook.
num_reads_os_cache = functions.query("func_id.str.contains('mark_page_accessed') and cat == 'os_cache'")
num_reads_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
33,os_cache,mark_page_accessed,6184,0.003235


In [34]:
# Smallest and largest timestamp in the trace, evaluated with one dask.compute call.
min_ts, max_ts = dask.compute(events["ts"].min(), events["ts"].max())

In [37]:
# Span between the first and last timestamp, divided by 1e9
# (presumably converting nanoseconds to seconds -- TODO confirm the unit of ts).
(max_ts - min_ts) / 1e9

np.float64(3.31)

In [38]:
# Sum of the freq column over all events.
events.freq.sum().compute()

np.int64(1284246)