# Analysis for DFProfiler

This is a simple analysis notebook for dfprofiler.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import zindex_py as zindex

In [2]:

import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
# Project root: the parent of the directory the notebook is started from.
app_root = str(Path.cwd().parent)

In [4]:
# Configure root logging: DEBUG level, a single stream handler, and a format
# that appends the emitting source path and line number to every message.
logging.basicConfig(
    level=logging.DEBUG,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [5]:
# Start a local Dask cluster with a fixed number of workers and log the
# dashboard link of the connected client.
workers=4
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

2024-07-21 04:50:41,504 [DEBUG]: Using selector: EpollSelector in /usr/lib/python3.10/asyncio/selector_events.py:54
2024-07-21 04:50:42,411 [INFO]: Initialized Client with 4 workers and link http://127.0.0.1:8787/status in /tmp/ipykernel_375374/1637310826.py:4


## Start Analysis

In [6]:

# Path (glob pattern) of the trace to analyse, relative to the project root.
file=f"{app_root}/tests/output/simple_test_1MB_128K.pfw.gz"
# Despite its name, `file_pattern` holds the list of paths the pattern matched.
file_pattern = glob(file)
# Show the matched paths as the cell output.
file_pattern

['/home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz']

## Functions to load trace data

In [7]:
def create_index(filename):
    """Make sure a zindex index exists next to ``filename``; return ``filename``.

    The index path is ``<filename>.zindex``. When that path already exists
    nothing is done. Otherwise ``zindex.create_index`` is called and the value
    it returns is logged at DEBUG level.
    """
    index_path = f"{filename}.zindex"
    if os.path.exists(index_path):
        return filename
    # NOTE(review): the regex below is not a raw string, so Python converts the
    # backslash-b into a backspace character instead of passing a regex
    # word-boundary escape -- confirm which one zindex is meant to receive.
    result = zindex.create_index(
        filename,
        index_file=f"file:{index_path}",
        regex="id:\b([0-9]+)",
        numeric=True,
        unique=True,
        debug=False,
        verbose=False,
    )
    logging.debug(f"Creating Index for {filename} returned {result}")
    return filename

def get_linenumber(filename):
    """Return ``(filename, line_count)`` for an indexed trace file.

    The count is whatever ``zindex.get_max_line`` reports for the index at
    ``<filename>.zindex``; it is also logged at DEBUG level.
    """
    max_line = zindex.get_max_line(
        filename,
        index_file=f"{filename}.zindex",
        debug=False,
        verbose=False,
    )
    logging.debug(f" The {filename} has {max_line} lines")
    return filename, max_line

def get_size(filename):
    """Return a size in bytes for a trace file.

    * ``*.pfw``    -- the on-disk size reported by ``os.stat``.
    * ``*.pfw.gz`` -- the line count from the file's ``.zindex`` index
      multiplied by 256.
      NOTE(review): 256 looks like an assumed average bytes-per-line -- confirm.

    The size is also logged (in GB) at DEBUG level.

    Raises:
        ValueError: if ``filename`` ends with neither ``.pfw`` nor ``.pfw.gz``
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # The former call to get_dft_configuration() was dropped: that name is not
    # defined in this notebook and its result was never used.
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
        size = line_number * 256
    else:
        raise ValueError(f"Unsupported trace file extension: {filename}")
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line, batch_size=16 * 1024):
    """Yield ``(filename, start, end)`` line ranges covering ``0 .. max_line - 1``.

    Args:
        filename: Passed through unchanged as the first element of every tuple.
        max_line: Number of lines to cover; ``0`` yields nothing.
        batch_size: Maximum number of lines per range. Defaults to ``16 * 1024``,
            the value that used to be hard-coded.

    Yields:
        Tuples ``(filename, start, end)`` where ``start`` and ``end`` are both
        inclusive and the final range is clipped to ``max_line - 1``.
    """
    for start in range(0, max_line, batch_size):
        # `end` is inclusive, hence the -1 on both candidates.
        end = min(start + batch_size - 1, max_line - 1)
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    """Query the ``.zindex`` index of ``filename`` for lines ``start``..``end``.

    Both bounds are inclusive. Returns the result of ``zindex.zquery`` unchanged
    and logs how many entries it contained at DEBUG level.
    """
    line_query = f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};"
    rows = zindex.zquery(
        filename,
        index_file=f"{filename}.zindex",
        raw=line_query,
        debug=False,
        verbose=False,
    )
    logging.debug(f"Read {len(rows)} json lines for [{start}, {end}]")
    return rows

In [8]:
def load_profile(line):
    """Parse one JSON trace line into a flat event record.

    The returned dict carries ``pid``, ``tid``, ``ts`` (as int), ``func_id``
    (taken from ``name``) and ``cat``. ``dur`` (float of ``args.time``) and
    ``freq`` (``args.count``) are added when the event has an ``args`` object.

    ``None``, an empty line, a bare newline and any line starting with ``[``
    give an empty dict. If decoding or a key lookup fails, the error is logged
    and the fields filled in up to that point are returned.
    """
    record = {}
    if line is None or line == "" or len(line) == 0 or line[0] == "[" or line == "\n":
        return record
    try:
        # Replace every non-ASCII character with '#' before decoding.
        ascii_line = "".join("#" if ord(ch) >= 128 else ch for ch in line)
        event = json.loads(ascii_line)
        for key in ("pid", "tid"):
            record[key] = event[key]
        record["ts"] = int(event["ts"])
        if "args" in event:
            event_args = event["args"]
            record["dur"] = float(event_args["time"])
            record["freq"] = event_args["count"]
        record["func_id"] = event["name"]
        record["cat"] = event["cat"]
    except Exception as error:
        logging.error(f"Processing {line} failed with {error}")
    return record

## Create Dask Dataframe

In [9]:
if file_pattern:
    # Build any missing .zindex files first; the steps below read them.
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")

    # NOTE(review): there is no .compute() here, so `total_size` is a lazy dask
    # Item and the log line prints its repr rather than a number of bytes.
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    logging.info(f"Total size of all files are {total_size} bytes")

    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")

    # One (filename, start, end) triple per batch of lines, across all files.
    json_line_delayed = [
        (trace_file, first, last)
        for trace_file, line_count in max_line_numbers
        for _, first, last in generate_line_batches(trace_file, line_count)
    ]
    total_lines = sum(line_count for _, line_count in max_line_numbers)

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")

    # One delayed load per batch; nout is the number of lines in that batch.
    json_line_bags = [
        dask.delayed(load_indexed_gzip_files, nout=last - first + 1)(trace_file, first, last)
        for trace_file, first, last in json_line_delayed
    ]
    json_lines = dask.bag.concat(json_line_bags)
    # Keep only records that load_profile filled in at least as far as "ts".
    pfw_bag = json_lines.map(load_profile).filter(lambda x: "ts" in x)
    pfw_bag.take(1)

2024-07-21 04:50:58,419 [INFO]: Created index for 1 files in /tmp/ipykernel_375374/4222768969.py:3
2024-07-21 04:50:58,424 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x7a4d740b79d0> bytes in /tmp/ipykernel_375374/4222768969.py:5
2024-07-21 04:50:58,496 [INFO]: Max lines per file are [('/home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz', 676)] in /tmp/ipykernel_375374/4222768969.py:7
2024-07-21 04:50:58,498 [DEBUG]: Created a batch for /home/cc/dfprofiler/tests/output/simple_test_1MB_128K.pfw.gz from [0, 675] lines in /tmp/ipykernel_375374/2639451476.py:31
2024-07-21 04:50:58,502 [INFO]: Loading 1 batches out of 1 files and has 676 lines overall in /tmp/ipykernel_375374/4222768969.py:15


In [10]:
# Column name -> dtype, passed as `meta` when the bag is turned into a dataframe.
columns = {
    "pid": "uint64[pyarrow]",
    "tid": "uint64[pyarrow]",
    "ts": "uint64[pyarrow]",
    "dur": "float32[pyarrow]",
    "freq": "uint64[pyarrow]",
    "func_id": "string[pyarrow]",
    "cat": "string[pyarrow]",
}

In [11]:
# Convert the bag of parsed records into a Dask dataframe with the dtypes in `columns`.
events = pfw_bag.to_dataframe(meta=columns)

In [12]:
# Collapse to a single partition, persist it on the cluster, and block until done.
events = events.repartition(npartitions=1).persist()
_ = wait(events)

In [13]:
# Preview the first rows of the events dataframe.
events.head()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
0,374281,374281,0,1e-06,1,mark_page_accessed,os_cache
1,374281,374281,0,0.081344,112117,ext4_da_write_begin,ext4
2,374281,374281,0,7e-06,2,statfs,sys
3,374281,374281,0,3.2e-05,8,open,c
4,374281,374281,0,0.000903,439,lseek,c


## Analysis

In [14]:
# All events whose function name contains "write".
events.query("func_id.str.contains('write')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
1,374281,374281,0,0.081344,112117,ext4_da_write_begin,ext4
12,374281,374281,0,0.000063,31,write,sys
13,374281,374281,0,0.000955,437,ext4_file_write_iter,ext4
23,374281,374281,0,0.002245,437,write,sys
29,374281,374281,0,0.000098,31,write,c
...,...,...,...,...,...,...,...
653,374596,374596,26000000,0.003141,633,write,sys
656,374596,374596,26000000,0.122913,162037,ext4_da_write_begin,ext4
659,374596,374596,26000000,0.000009,1,write,sys
661,374596,374596,27000000,0.000006,2,write,sys


In [15]:
# Events recorded at exactly ts == 1140000.
events.query("ts == 1140000").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat


In [16]:
# All events whose function name contains "read" (also matches e.g. readlinkat).
events.query("func_id.str.contains('read')").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat
6,374281,374281,0,0.00002,14,readlinkat,sys
30,374281,374281,0,0.064739,3423,read,c
47,374281,374281,0,0.000015,14,vfs_readlink,vfs
63,374281,374281,0,0.060318,3429,read,sys
67,374281,374281,1000000,0.05893,587,read,sys
...,...,...,...,...,...,...,...
619,374596,374596,24000000,0.01065,108,read,c
622,374596,374596,25000000,0.059488,622,read,sys
628,374596,374596,25000000,0.060474,622,read,c
641,374596,374596,26000000,0.060221,633,read,sys


In [17]:
# Events recorded at exactly ts == 2875000.
events.query("ts == 2875000").compute()

Unnamed: 0,pid,tid,ts,dur,freq,func_id,cat


In [18]:
# Distinct function names present in the trace.
events["func_id"].unique().compute()



0           mark_page_accessed
1          ext4_da_write_begin
2                       statfs
3                         open
4                        lseek
5                      lseek64
6                   readlinkat
7               posix_memalign
8                         free
9                       openat
10                       close
11                   PMPI_Init
12                       write
13        ext4_file_write_iter
14              PMPI_Comm_size
15                   ftruncate
16                      munmap
17                      fileno
18    vfs_statfs.part.0.isra.0
19                      calloc
20                     realloc
21                  vfs_unlink
22                      mmap64
23      _Z10gen_randomB5cxx11i
24              ext4_file_open
25                        read
26                        mmap
27                      malloc
28           mark_buffer_dirty
29                      open64
30             fileno_unlocked
31                       _init
32      

In [19]:
# Distinct function names in the 'app' category.
events.query("cat == 'app'")["func_id"].unique().compute()

0    _Z10gen_randomB5cxx11i
1                     _init
2                     _fini
3                      main
Name: func_id, dtype: string

In [30]:
# Distinct function names in the 'mpi' category.
events.query("cat == 'mpi'")["func_id"].unique().compute()

0         PMPI_Init
1    PMPI_Comm_size
2    PMPI_Comm_rank
3       PMPI_Reduce
4     PMPI_Finalize
Name: func_id, dtype: string

In [20]:
# Step 1: total freq and dur for each (func_id, cat, pid, tid, ts) combination.
per_thread = events.groupby(["func_id", "cat", "pid", "tid", "ts"])[["freq", "dur"]].sum()
# Step 2: per (func_id, cat, ts), add up freq over pid/tid and keep the largest dur.
per_ts = per_thread.groupby(["func_id", "cat", "ts"]).agg({"freq": sum, "dur": max})
# Step 3: sum over ts, leaving one row per (cat, func_id) with plain columns.
functions = per_ts.groupby(["cat", "func_id"]).sum().reset_index()
functions.compute()

Unnamed: 0,cat,func_id,freq,dur
0,app,_Z10gen_randomB5cxx11i,2,0.018887
1,app,_fini,2,7e-06
2,app,_init,2,6e-06
3,app,main,2,1.1e-05
4,block,mark_buffer_dirty,4425030,3.137173
5,c,calloc,6896,0.009679
6,c,close,3082,0.00419
7,c,fcntl,8,9e-06
8,c,fdopen,4,9e-06
9,c,fileno,14,2.4e-05


In [21]:
# Aggregated row for the 'write' function in category 'c'.
num_writes = functions.query("func_id == 'write' and cat == 'c'")
num_writes.compute()

Unnamed: 0,cat,func_id,freq,dur
25,c,write,17374,0.117144


In [22]:
# Aggregated rows in category 'ext4' whose name contains 'ext4_file_write_iter'.
num_writes_ext4 = functions.query("func_id.str.contains('ext4_file_write_iter') and cat == 'ext4'")
num_writes_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur
30,ext4,ext4_file_write_iter,17286,0.036637


In [23]:
# Aggregated rows in category 'os_cache' whose name contains 'mark_buffer_dirty'.
num_writes_os_cache = functions.query("func_id.str.contains('mark_buffer_dirty') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
36,os_cache,mark_buffer_dirty,4425080,2.050056


In [24]:
# Aggregated row for the 'read' function in category 'c'.
num_reads = functions.query("func_id == 'read' and cat == 'c'")
num_reads.compute()

Unnamed: 0,cat,func_id,freq,dur
21,c,read,23257,1.623251


In [25]:
# Aggregated rows in category 'ext4' whose name contains 'read'.
num_reads_ext4 = functions.query("func_id.str.contains('read') and cat == 'ext4'")
num_reads_ext4.compute()

Unnamed: 0,cat,func_id,freq,dur


In [26]:
# Aggregated rows in category 'os_cache' whose name contains 'mark_page_accessed'.
# NOTE(review): this rebinds `num_writes_os_cache`, which an earlier cell assigned
# to the mark_buffer_dirty query; the name looks copy-pasted -- confirm whether a
# separate variable name was intended.
num_writes_os_cache = functions.query("func_id.str.contains('mark_page_accessed') and cat == 'os_cache'")
num_writes_os_cache.compute()

Unnamed: 0,cat,func_id,freq,dur
37,os_cache,mark_page_accessed,2,1e-06


In [27]:
# Smallest and largest timestamp, evaluated together in one dask.compute call.
min_ts, max_ts = dask.compute(events["ts"].min(), events["ts"].max())

In [28]:
# Timestamp span scaled by 1e6 (seconds if `ts` is in microseconds -- TODO confirm units).
(max_ts - min_ts) / 1e6

np.float64(27.0)

In [29]:
# Sum of the `freq` column over every event row.
events.freq.sum().compute()

np.int64(22497203)