# Analysis for DFProfiler

This notebook loads a DFProfiler trace (`.pfw.gz`) into a Dask DataFrame and computes summary statistics over the recorded events.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import zindex_py as zindex

In [2]:

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
# Project root: the parent of the current working directory.
app_root = str(Path.cwd().parent)

In [4]:
# Send every record (DEBUG and above) to a stream handler, tagging each message
# with its timestamp, level and the source location that emitted it.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
    handlers=[logging.StreamHandler()],
    level=logging.DEBUG,
)

## Set Up Dask Local Cluster

In [5]:
# Number of local Dask workers to start.
workers=4
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
# Creating the Client registers it as the default scheduler, so the later
# .compute() / .persist() calls run on this cluster.
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

2024-07-21 22:12:22,279 [DEBUG]: Using selector: EpollSelector in /usr/lib/python3.10/asyncio/selector_events.py:54
2024-07-21 22:12:23,189 [INFO]: Initialized Client with 4 workers and link http://127.0.0.1:8787/status in /tmp/ipykernel_38903/1637310826.py:4


## Start Analysis

In [6]:

# Glob pattern for the trace file(s) to analyse, rooted at the project directory.
file = f"{app_root}/tests/output/simple_test_1MB_128K.pfw.gz"
# glob() returns matches in arbitrary order; sort them so the files are always
# processed in the same order from run to run.
file_pattern = sorted(glob(file))
file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz']

## Functions to load trace data

In [7]:
def create_index(filename):
    """Build the zindex index for ``filename`` unless one already exists.

    The index is expected next to the trace file as ``<filename>.zindex``.

    Args:
        filename: Path of the trace file to index.

    Returns:
        ``filename`` unchanged, so the function can be mapped over a
        collection of paths.
    """
    index_path = f"{filename}.zindex"
    if os.path.exists(index_path):
        # Index already present: nothing to do.
        return filename
    # NOTE(review): "\b" in this non-raw literal is the backspace character
    # (0x08), not the regex word-boundary escape. Confirm which one zindex is
    # meant to receive before changing it.
    status = zindex.create_index(
        filename,
        index_file=f"file:{index_path}",
        regex="id:\b([0-9]+)",
        numeric=True,
        unique=True,
        debug=False,
        verbose=False,
    )
    logging.debug(f"Creating Index for {filename} returned {status}")
    return filename

def get_linenumber(filename):
    """Ask zindex for the maximum line number of an indexed trace file.

    Args:
        filename: Path of the trace file; its index is read from
            ``<filename>.zindex``.

    Returns:
        tuple: ``(filename, line_number)`` where ``line_number`` is the value
        returned by ``zindex.get_max_line``.
    """
    line_count = zindex.get_max_line(
        filename,
        index_file=f"{filename}.zindex",
        debug=False,
        verbose=False,
    )
    logging.debug(f" The {filename} has {line_count} lines")
    return filename, line_count

def get_size(filename):
    """Return a size figure, in bytes, for a trace file.

    * ``.pfw`` files: the on-disk size reported by ``os.stat``.
    * ``.pfw.gz`` files: the line count reported by ``zindex.get_max_line``
      multiplied by 256, a fixed per-line byte estimate.

    Args:
        filename: Path to a ``.pfw`` or ``.pfw.gz`` trace file.

    Returns:
        int: The size in bytes.

    Raises:
        ValueError: If ``filename`` has neither supported suffix.
    """
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
        # Estimate only: every line is assumed to take 256 bytes.
        size = line_number * 256
    else:
        # Previously `size` was left unbound here and the function failed with
        # an UnboundLocalError; fail with an explicit message instead.
        raise ValueError(f"Unsupported trace file {filename}: expected a .pfw or .pfw.gz suffix")
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line, batch_size=16 * 1024):
    """Yield inclusive line ranges that together cover ``[0, max_line - 1]``.

    Args:
        filename: Path of the trace file; passed through unchanged in every
            yielded tuple.
        max_line: Number of lines to cover. Nothing is yielded when it is 0.
        batch_size: Maximum number of lines per batch. Defaults to 16384,
            the value that used to be hard-coded.

    Yields:
        tuple: ``(filename, start, end)`` with ``start`` and ``end`` both
        inclusive; the last batch is truncated at ``max_line - 1``.
    """
    for start in range(0, max_line, batch_size):
        end = min(start + batch_size, max_line) - 1
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    """Fetch the lines numbered ``start`` through ``end`` from an indexed trace.

    Args:
        filename: Path of the trace file; its index is read from
            ``<filename>.zindex``.
        start: First line number to fetch (inclusive).
        end: Last line number to fetch (inclusive).

    Returns:
        Whatever ``zindex.zquery`` returns for the range query; the result
        must support ``len()``.
    """
    line_query = f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};"
    rows = zindex.zquery(
        filename,
        index_file=f"{filename}.zindex",
        raw=line_query,
        debug=False,
        verbose=False,
    )
    logging.debug(f"Read {len(rows)} json lines for [{start}, {end}]")
    return rows

In [8]:
def load_profile(line):
    """Parse one JSON trace line into a flat event record.

    Args:
        line: One line of a trace file. ``None``, empty strings, a bare
            newline, and lines starting with ``[`` are skipped.

    Returns:
        dict: Keys ``pid``, ``tid``, ``ts_us``, ``func_id`` and ``cat``, plus
        ``dur_sec``, ``freq``, ``size_bytes`` and ``filename`` when the event
        has an ``args`` object. An empty dict is returned for skipped lines
        and for lines that fail to parse, so a half-filled record can never
        reach downstream filters.
    """
    if not line or line == "\n" or line[0] == "[":
        return {}
    try:
        # Replace every non-ASCII character with '#' before decoding.
        ascii_line = ''.join(ch if ord(ch) < 128 else '#' for ch in line)
        val = json.loads(ascii_line)
        record = {
            "pid": val["pid"],
            "tid": val["tid"],
            "ts_us": int(val["ts"]),
        }
        if "args" in val:
            args = val["args"]
            record["dur_sec"] = float(args["time"])
            record["freq"] = args["freq"]
            record["size_bytes"] = args["size_sum"]
            record["filename"] = args["fname"]
        record["func_id"] = val["name"]
        record["cat"] = val["cat"]
    except Exception as error:
        # Best-effort parsing: log the bad line and drop it entirely rather
        # than returning whichever fields happened to be set before the error.
        logging.error(f"Processing {line} failed with {error}")
        return {}
    return record

## Create Dask Dataframe

In [9]:
if len(file_pattern) > 0:
    # Build (or reuse) the zindex index of every trace file.
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")
    # NOTE(review): total_size is a lazy dask Item that is never computed, so
    # the log line below prints the object's repr rather than a byte count.
    # Confirm get_size runs cleanly before adding .compute() here.
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    logging.info(f"Total size of all files are {total_size} bytes")
    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")

    # Split every file into (filename, start, end) line ranges.
    json_line_delayed = []
    total_lines = 0
    for filename, max_line in max_line_numbers:
        total_lines += max_line
        json_line_delayed.extend(generate_line_batches(filename, max_line))

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")
    # One delayed task per line range; each task becomes one bag partition.
    json_line_bags = [
        dask.delayed(load_indexed_gzip_files)(filename, start, end)
        for filename, start, end in json_line_delayed
    ]
    # from_delayed is the bag constructor for a list of Delayed objects;
    # dask.bag.concat is meant for combining existing bags.
    json_lines = dask.bag.from_delayed(json_line_bags)
    # Keep only records that parsed far enough to carry a timestamp.
    pfw_bag = json_lines.map(load_profile).filter(lambda x: "ts_us" in x)
    # Pull one record to trigger loading of the first partition.
    pfw_bag.take(1)
else:
    # Without this, later cells fail with a NameError on pfw_bag and no hint why.
    logging.warning(f"No trace files matched {file}; pfw_bag was not created")

2024-07-21 22:12:27,369 [INFO]: Created index for 1 files in /tmp/ipykernel_38903/405840175.py:3


2024-07-21 22:12:27,376 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x79abac26d9f0> bytes in /tmp/ipykernel_38903/405840175.py:5
2024-07-21 22:12:27,448 [INFO]: Max lines per file are [('/home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz', 5613)] in /tmp/ipykernel_38903/405840175.py:7
2024-07-21 22:12:27,452 [DEBUG]: Created a batch for /home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz from [0, 5612] lines in /tmp/ipykernel_38903/2639451476.py:31
2024-07-21 22:12:27,454 [INFO]: Loading 1 batches out of 1 files and has 5613 lines overall in /tmp/ipykernel_38903/405840175.py:15


In [22]:
# Column name -> dtype for the events DataFrame (all pyarrow-backed dtypes).
columns = {
    'pid': "uint64[pyarrow]",
    'tid': "uint64[pyarrow]",
    'ts_us': "uint64[pyarrow]",
    'dur_sec': "float32[pyarrow]",
    'freq': "uint64[pyarrow]",
    'size_bytes': "uint64[pyarrow]",
    'func_id': "string[pyarrow]",
    'filename': "string[pyarrow]",
    'cat': "string[pyarrow]",
}

In [23]:
# Convert the bag of event dicts into a Dask DataFrame, using `columns` as the
# column-name -> dtype metadata.
events = pfw_bag.to_dataframe(meta=columns)

In [24]:
# Collapse the frame into a single partition and persist it on the cluster;
# wait() blocks until the persisted result has finished computing.
events = events.repartition(npartitions=1).persist()
_ = wait(events)

In [25]:
# Preview the first rows of the events frame.
events.head()

Unnamed: 0,pid,tid,ts_us,dur_sec,freq,size_bytes,func_id,filename,cat
0,37818,37818,0,1e-06,1,,close,/sys/devices/system/cpu/cpu15/cpufreq/cpuinfo_...,sys
1,37818,37818,0,1.3e-05,1,64.0,read,/sys/bus/pci/devices/0000:85:0f.7/config,sys
2,37818,37818,0,1e-06,1,,close,/sys/devices/system/cpu/cpu12/cache/index0/size,sys
3,37818,37818,0,2e-06,1,8.0,read,/sys/devices/system/cpu/cpu15/cpufreq/cpuinfo_...,sys
4,37818,37818,0,1e-06,1,,close,/sys/devices/system/cpu/cpu17/topology/die_cpus,sys


## Analysis

In [26]:
# All events whose function name contains "write", in any category.
events.query("func_id.str.contains('write')").compute()

Unnamed: 0,pid,tid,ts_us,dur_sec,freq,size_bytes,func_id,filename,cat
859,37818,37818,0,0.082555,112015,,ext4_da_write_begin,,ext4
936,37818,37818,0,0.000099,30,,write,,c
1425,37818,37818,0,0.000065,30,240,write,,sys
2491,37818,37818,0,0.002582,437,,write,,c
2607,37818,37818,0,0.001801,437,458227712,write,/home/cc/dfprofiler/build/data/file_0_0.dat,sys
...,...,...,...,...,...,...,...,...,...
5585,37818,37818,27000000,0.14544,80467,,ext4_da_write_end,,ext4
5588,37818,37818,27000000,0.000016,3,71,write,,sys
5590,37818,37818,27000000,0.00197,315,,write,,c
5602,37818,37818,27000000,0.06789,80467,,ext4_da_write_begin,,ext4


In [27]:
# Sum of size_bytes over every event, divided by 1024**3 (bytes -> GiB).
total_io = events["size_bytes"].sum().compute() / (1024**3)
total_io

np.float64(31.676917175762355)

In [29]:
# Distinct values of the filename column across all events.
filenames = events["filename"].unique().compute()
filenames



0       /sys/devices/system/cpu/cpu15/cpufreq/cpuinfo_...
1                /sys/bus/pci/devices/0000:85:0f.7/config
2         /sys/devices/system/cpu/cpu12/cache/index0/size
3         /sys/devices/system/cpu/cpu17/topology/die_cpus
4                /sys/bus/pci/devices/0000:85:11.2/config
                              ...                        
1922    /sys/bus/pci/devices/0000:00:11.5/current_link...
1923    /sys/devices/system/cpu/cpu18/cache/index5/sha...
1924    /home/cc/spack/opt/spack/linux-ubuntu22.04-sky...
1925    /sys/devices/system/cpu/cpu32/cache/index4/sha...
1926                  /tmp/ompi.ebpf.1000/jf.0/2967207936
Name: filename, Length: 1927, dtype: string

In [30]:
# Time and call count attributed to events that moved data (size_bytes > 0):
#   1. sum freq and dur_sec per (pid, tid, ts_us);
#   2. per ts_us, add up freq across threads but keep only the largest dur_sec;
#   3. sum the per-ts_us values into one total for each column.
# Aggregations are named by string ("sum"/"max") instead of passing the Python
# builtins, which is the documented form for groupby aggregation.
read_write_time = (
    events.query("size_bytes > 0")
    .groupby(["pid", "tid", "ts_us"])[["freq", "dur_sec"]]
    .sum()
    .groupby(["ts_us"])
    .agg({"freq": "sum", "dur_sec": "max"})
    .sum()
    .compute()
)
read_write_time["dur_sec"], read_write_time["freq"]

(1.675211951136589, 35457.0)

In [31]:
# Data volume (total_io) divided by the aggregated read/write duration.
bandwidth = total_io / read_write_time['dur_sec']
print("Bandwidth is {} GB/s".format(bandwidth))

Bandwidth is 18.909199611589667 GB/s


In [32]:
# Every event recorded with ts_us equal to 26000000.
events.query("ts_us == 26000000").compute()

Unnamed: 0,pid,tid,ts_us,dur_sec,freq,size_bytes,func_id,filename,cat
5547,37818,37818,26000000,0.001643,379,397410304.0,write,/home/cc/dfprofiler/build/data/file_0_0.dat,sys
5548,37818,37818,26000000,0.000698,379,,lseek64,,c
5549,37818,37818,26000000,0.081353,96986,,ext4_da_write_begin,,ext4
5550,37818,37818,26000000,0.040467,97000,,ext4_da_reserve_space,,ext4
5551,37818,37818,26000000,0.037071,379,397410304.0,read,/home/cc/dfprofiler/build/data/file_0_0.dat,sys
5552,37818,37818,26000000,0.053109,97006,,mark_buffer_dirty,,os_cache
5553,37818,37818,26000000,0.07879,97014,,mark_buffer_dirty,,block
5554,37818,37818,26000000,0.000483,758,,rw_verify_area,,vfs
5555,37818,37818,26000000,0.173263,97020,,ext4_da_write_end,,ext4
5556,37818,37818,26000000,0.001427,379,,mark_buffer_dirty,,vfs


In [33]:
# All events whose function name contains "read", in any category.
events.query("func_id.str.contains('read')").compute()

Unnamed: 0,pid,tid,ts_us,dur_sec,freq,size_bytes,func_id,filename,cat
1,37818,37818,0,0.000013,1,64,read,/sys/bus/pci/devices/0000:85:0f.7/config,sys
3,37818,37818,0,0.000002,1,8,read,/sys/devices/system/cpu/cpu15/cpufreq/cpuinfo_...,sys
5,37818,37818,0,0.000013,1,64,read,/sys/bus/pci/devices/0000:85:11.2/config,sys
13,37818,37818,0,0.000002,1,9,read,/sys/bus/pci/devices/0000:01:00.1/class,sys
16,37818,37818,0,0.000009,1,64,read,/sys/bus/pci/devices/0000:00:05.2/config,sys
...,...,...,...,...,...,...,...,...,...
5557,37818,37818,26000000,0.037668,379,,read,,c
5567,37818,37818,26000000,0.018482,188,197132288,read,/home/cc/dfprofiler/build/data/file_0_0.dat,sys
5573,37818,37818,26000000,0.018684,187,,read,,c
5579,37818,37818,27000000,0.031017,315,330301440,read,/home/cc/dfprofiler/build/data/file_0_0.dat,sys


In [34]:
# Every event recorded with ts_us equal to 24000000.
events.query("ts_us == 24000000").compute()

Unnamed: 0,pid,tid,ts_us,dur_sec,freq,size_bytes,func_id,filename,cat
5515,37818,37818,24000000,0.081968,147784,,mark_buffer_dirty,,os_cache
5516,37818,37818,24000000,0.058051,577,,read,,c
5517,37818,37818,24000000,0.001194,577,,lseek,,c
5518,37818,37818,24000000,0.003649,578,,write,,c
5519,37818,37818,24000000,0.000367,577,,lseek,,sys
5520,37818,37818,24000000,0.001403,578,,ext4_file_write_iter,,ext4
5521,37818,37818,24000000,0.125021,147783,,ext4_da_write_begin,,ext4
5522,37818,37818,24000000,0.062439,147783,,ext4_da_reserve_space,,ext4
5523,37818,37818,24000000,0.001049,577,,lseek64,,c
5524,37818,37818,24000000,0.002207,578,,mark_buffer_dirty,,vfs


In [35]:
# Distinct function names present in the trace.
events["func_id"].unique().compute()

0                        close
1                         read
2                       openat
3                     vfs_open
4     vfs_statfs.part.0.isra.0
5                       open64
6                       statfs
7            mark_buffer_dirty
8               ext4_file_open
9                         open
10                  vfs_unlink
11         ext4_da_write_begin
12                       write
13                      malloc
14       ext4_da_reserve_space
15                    shm_open
16                       fcntl
17                     realloc
18                   vfs_statx
19                      fileno
20                      mmap64
21           vfs_getattr_nosec
22                       lseek
23                      calloc
24              PMPI_Comm_size
25              rw_verify_area
26                   PMPI_Init
27      _Z10gen_randomB5cxx11i
28                      fdopen
29              posix_memalign
30                vfs_readlink
31        ext4_file_write_iter
32      

In [36]:
# Distinct function names recorded under the 'app' category.
events.query("cat == 'app'")["func_id"].unique().compute()

0    _Z10gen_randomB5cxx11i
1                     _init
2                      main
3                     _fini
Name: func_id, dtype: string

In [37]:
# Distinct function names recorded under the 'mpi' category.
events.query("cat == 'mpi'")["func_id"].unique().compute()

0    PMPI_Comm_size
1         PMPI_Init
2    PMPI_Comm_rank
3     PMPI_Finalize
4       PMPI_Reduce
Name: func_id, dtype: string

In [38]:
# Per-function totals, built in three steps:
#   1. sum freq and dur_sec per (func_id, cat, pid, tid, ts_us);
#   2. per (func_id, cat, ts_us), add up freq across threads but keep only the
#      largest dur_sec;
#   3. sum over ts_us to get one row per (cat, func_id).
# Aggregations are named by string ("sum"/"max") instead of passing the Python
# builtins, which is the documented form for groupby aggregation.
functions = (
    events.groupby(["func_id", "cat", "pid", "tid", "ts_us"])[["freq", "dur_sec"]]
    .sum()
    .groupby(["func_id", "cat", "ts_us"])
    .agg({"freq": "sum", "dur_sec": "max"})
    .groupby(["cat", "func_id"])
    .sum()
)
functions = functions.reset_index()
functions.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
0,app,_Z10gen_randomB5cxx11i,1,0.018685
1,app,_fini,1,3e-06
2,app,_init,1,6e-06
3,app,main,1,6e-06
4,block,mark_buffer_dirty,4152677,3.251503
5,c,calloc,3448,0.009626
6,c,close,1541,0.004579
7,c,fcntl,4,9e-06
8,c,fdopen,2,9e-06
9,c,fileno,7,2.8e-05


In [39]:
# Aggregated row for `write` in the 'c' category.
num_writes = functions.query("func_id == 'write' and cat == 'c'")
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
25,c,write,16253,0.100557


In [40]:
# Aggregated row(s) for functions containing `ext4_file_write_iter` in the
# 'ext4' category.
num_writes_ext4 = functions.query("func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'")
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
30,ext4,ext4_file_write_iter,16231,0.037478


In [41]:
# Aggregated row(s) for functions containing `mark_buffer_dirty` in the
# 'os_cache' category.
num_writes_os_cache = functions.query("func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
36,os_cache,mark_buffer_dirty,4154656,2.143389


In [42]:
# Aggregated row for `read` in the 'c' category.
num_reads = functions.query("func_id == 'read' and cat == 'c'")
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
21,c,read,19198,1.634286


In [43]:
# Aggregated row(s) for functions containing `read` in the 'ext4' category.
# The recorded output for this trace is an empty frame.
num_reads_ext4 = functions.query("func_id.str.contains('read') and cat == 'ext4'")
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur_sec


In [44]:
# Aggregated row(s) for functions containing `mark_page_accessed` in the
# 'os_cache' category. Stored under its own name so it no longer overwrites
# `num_writes_os_cache`, which holds the `mark_buffer_dirty` result.
num_reads_os_cache = functions.query("func_id.str.contains('mark_page_accessed') and cat == 'os_cache'")
num_reads_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur_sec
37,os_cache,mark_page_accessed,1,1e-06


In [45]:
# Evaluate the smallest and largest timestamp together in one dask.compute call.
min_ts, max_ts = dask.compute(events["ts_us"].min(), events["ts_us"].max())

In [46]:
# Span between the first and last event timestamp, divided by 1e6
# (presumably microseconds -> seconds, going by the ts_us column name).
print(f"Application Time {(max_ts - min_ts) / 1e6}")

Application Time 27.0


In [47]:
# Sum of the freq column over every event in the trace.
total_calls = events.freq.sum().compute()
total_calls

np.int64(21039362)

In [48]:
# Sum of the freq column restricted to events in the 'sys' category.
system_calls = events.query("cat == 'sys'")["freq"].sum().compute()
system_calls

np.int64(55236)

In [49]:
# Ratio of all recorded calls to calls in the 'sys' category.
print(f"Events captured per system call {total_calls/system_calls}")

Events captured per system call 380.89944963429645
