# Analysis for DFProfiler

This is a simple analysis notebook for dfprofiler.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import math
import zindex_py as zindex
import numpy as np
import intervals as I

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
app_root = str(Path(os.getcwd()).parent)

In [4]:
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [5]:
workers=16
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 35303 instead
2024-10-21 17:19:46,130 [INFO]: Initialized Client with 16 workers and link http://127.0.0.1:35303/status in /var/tmp/haridev/ipykernel_3017468/3142773904.py:4


## Start Analysis

In [6]:
is_trace = True
file=f"{app_root}/tests/output/ops-8192_ts-4m/write.pfw.gz"
file_pattern = glob(file)
file_pattern

['/usr/WS2/haridev/datacrumbs/tests/output/ops-8192_ts-4m/write.pfw.gz']

## Function to load trace data

In [7]:
def create_index(filename):
    index_file = f"{filename}.zindex"
    if not os.path.exists(index_file):
        status = zindex.create_index(filename, index_file=f"file:{index_file}",
                                     regex="id:\b([0-9]+)", numeric=True, unique=True, debug=False, verbose=False)
        logging.debug(f"Creating Index for {filename} returned {status}")
    return filename

def get_linenumber(filename):
    index_file = f"{filename}.zindex"
    line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
    logging.debug(f" The {filename} has {line_number} lines")
    return (filename, line_number)

def get_size(filename):
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file,debug=False, verbose=False)
        size = line_number * 256
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line):
    batch_size = 16*1024
    for start in range(0, max_line, batch_size):
        end =  min((start + batch_size - 1) , (max_line - 1))
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    index_file = f"{filename}.zindex"
    json_lines = zindex.zquery(filename, index_file=index_file,
                          raw=f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};", debug=False, verbose=False)
    logging.debug(f"Read {len(json_lines)} json lines for [{start}, {end}]")
    return json_lines

In [8]:
def load_profile(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            if "pid" in d:
                d["pid"] = val["pid"]
            if "tid" in d:
                d["tid"] = val["tid"]
            if "ts" in d:
                d["ts_us"] = int(val["ts"])
            d["filename"] = "NA"
            if "args" in val:
                if "time" in val["args"]:
                    d["dur_sec"] = float(val["args"]["time"])
                if "freq" in val["args"]:
                    d["freq"] = val["args"]["freq"]
                if "size_sum" in val["args"]:
                    d["size_bytes"] = val["args"]["size_sum"]
                if "fname" in val["args"] and val["args"]["fname"]:
                    d["filename"] = val["args"]["fname"]
            d["func_id"] = val["name"]
            d["cat"] = val["cat"]
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d

def get_np_type(num, dtype):
    return hex(num)[2:]

def load_trace(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            d["name"] = val["name"]
            d["cat"] = val["cat"]
            if "pid" in val:
                d["pid"] = val["pid"]
            if "tid" in val:
                d["tid"] = val["tid"]
            d["ts"] = 0
            d["dur"] = 0
            if "ts" in val:
                d["ts"] = int(val["ts"])
                d["te"] = int(val["ts"])
            if "dur" in val:
                d["dur"] = int(val["dur"])
            if "args" in val and "hhash" in val["args"]:                    
                d["hhash"] = get_np_type(val["args"]["hhash"], np.int64)
            interval = I.empty()
            if d["dur"] > 0:
                d["te"] = int(val["ts"]) + d["dur"]
                interval = I.closedopen(d["ts"], d["ts"] + d["dur"])
            d["interval"] = I.to_string(interval)
            if val["ph"] != "M":
                d["type"] = 0    
                if "args" in val:                    
                    if "hhash" in val["args"]:
                        d["hhash"] = get_np_type(val["args"]["hhash"], np.int64)
                    if "size_sum" in val["args"]:
                        d["size"] = val["args"]["size_sum"]
                    if "fhash" in val["args"]:
                        d["fhash"] = get_np_type(val["args"]["fhash"], np.int64)
            else:
                if val["name"] == "FH":
                    d["type"] = 1
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = get_np_type(val["args"]["value"], np.int64)
                elif val["name"] == "HH":
                    d["type"] = 2
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = get_np_type(val["args"]["value"], np.int64)
            
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d

## Create Dask Dataframe

In [9]:
if len(file_pattern) > 0:
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2))
    logging.info(f"Total size of all files are {total_size} bytes")
    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")
    json_line_delayed = []
    total_lines = 0
    for filename, max_line in max_line_numbers:
        total_lines += max_line
        for _, start, end in generate_line_batches(filename, max_line):
            json_line_delayed.append((filename, start, end))

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")
    json_line_bags = []
    for filename, start, end in json_line_delayed:
        num_lines = end - start + 1
        json_line_bags.append(dask.delayed(load_indexed_gzip_files, nout=num_lines)(filename, start, end))
    json_lines = dask.bag.concat(json_line_bags)
    if is_trace:
        pfw_bag = json_lines.map(load_trace).filter(lambda x: "name" in x)
    else:
        pfw_bag = json_lines.map(load_profile).filter(lambda x: "func_id" in x)
    pfw_bag.take(1)

2024-10-21 17:19:46,264 [INFO]: Created index for 1 files in /var/tmp/haridev/ipykernel_3017468/864066620.py:3
2024-10-21 17:19:46,351 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x1555213b8040> bytes in /var/tmp/haridev/ipykernel_3017468/864066620.py:6
2024-10-21 17:19:46,425 [INFO]: Max lines per file are [('/usr/WS2/haridev/datacrumbs/tests/output/ops-8192_ts-4m/write.pfw.gz', 6013218)] in /var/tmp/haridev/ipykernel_3017468/864066620.py:8
2024-10-21 17:19:46,427 [INFO]: Loading 368 batches out of 1 files and has 6013218 lines overall in /var/tmp/haridev/ipykernel_3017468/864066620.py:16


In [10]:
if is_trace:
    columns = {'hhash': "string[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
                'cat': "string[pyarrow]", 'name': "string[pyarrow]", 'type':  "uint8[pyarrow]",
            'ts': "uint64[pyarrow]", 'te': "uint64[pyarrow]", 'dur': "uint64[pyarrow]", 'interval': "string[pyarrow]", 
             'size': "string[pyarrow]", 'fhash': "string[pyarrow]", 'hash': "string[pyarrow]", 
           }
else:
    columns = {'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
            'ts_us': "uint64[pyarrow]", 'dur_sec': "float32[pyarrow]", 
            'freq': "uint64[pyarrow]", 'size_bytes': "uint64[pyarrow]", 'name': "string[pyarrow]", 
            'filename': "string[pyarrow]", 
            'cat': "string[pyarrow]"}

In [11]:
events = pfw_bag.to_dataframe(meta=columns)

In [12]:
events = events.repartition(npartitions=n_partition).persist()
_ = wait(events)

In [13]:
fhash = events.query("type == 1")[["name","hash"]]
hhash = events.query("type == 2")[["name","hash"]]
event = events.query("type == 0")
fhashes = fhash.query("name.str.contains('file_0')").compute()["hash"]
fhashes = fhashes.to_list()


## Analysis

In [14]:

interesting_events = event.query("fhash.isin(@value)", local_dict={"value": fhashes})
interesting_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
7265,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,openat,0,138677,138680,3,"[138677,138680)",,9277debc67b02019e012acf9bf32eb36,
14835,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,180823,187702,6879,"[180823,187702)",4194304.0,9277debc67b02019e012acf9bf32eb36,
5956,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,229330,236130,6800,"[229330,236130)",4194304.0,9277debc67b02019e012acf9bf32eb36,
13457,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,272526,278794,6268,"[272526,278794)",4194304.0,9277debc67b02019e012acf9bf32eb36,
4578,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,319657,329788,10131,"[319657,329788)",4194304.0,9277debc67b02019e012acf9bf32eb36,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
14821,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,629260891,629269742,8851,"[629260891,629269742)",4194304.0,9277debc67b02019e012acf9bf32eb36,
14969,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,629480343,629492216,11873,"[629480343,629492216)",4194304.0,9277debc67b02019e012acf9bf32eb36,
15044,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,629608705,629619671,10966,"[629608705,629619671)",4194304.0,9277debc67b02019e012acf9bf32eb36,
15377,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,630215483,630227433,11950,"[630215483,630227433)",4194304.0,9277debc67b02019e012acf9bf32eb36,


In [15]:

# interesting_events["interval"] = interesting_events.apply(lambda x: I.to_string(I.closed(x["ts"], x["ts"]+x["dur"])), axis=1)

In [22]:
def group_func(df):
    val = I.empty()
    for index, value in df.items():
        if str(value) != 'NA':
            pad_interval = I.from_string(str(value), int)
            val = val.union(pad_interval)
    logging.debug(f"Grouped Range into {val}")
    return I.to_string(val)
def union_portions():
    return dd.Aggregation(
        'union_portions',
        chunk=lambda s: s.apply(group_func),
        agg=lambda s: s.apply(group_func)
    )
relevant_intervals = interesting_events.reduction(chunk=lambda s: s.apply(group_func), aggregate=lambda s1: s1.apply(group_func))["interval"].compute()
relevant_intervals = I.from_string(relevant_intervals, int)
relevant_intervals_list = list(relevant_intervals)
relevant_intervals_list[:10], len(relevant_intervals_list)

([[138677,138680),
  [180823,187702),
  [229330,236130),
  [272526,278794),
  [319657,329788),
  [374972,385069),
  [421370,429159),
  [466234,473716),
  [511394,519068),
  [554724,563509)],
 1317)

In [23]:
min_ts = relevant_intervals_list[0].lower
max_te = relevant_intervals_list[-1].upper
min_ts, max_te

(138677, 632192918)

In [24]:
filtered_events = event.query(f"ts >= {min_ts} and te <= {max_te} and dur > 0")
filtered_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
7263,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,ima_file_check,0,138677,138678,1,"[138677,138678)",,,
7265,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,openat,0,138677,138680,3,"[138677,138680)",,9277debc67b02019e012acf9bf32eb36,
7266,2ed896eec0538d57033359a0a9fa34c1,8830,8830,c,open64,0,138677,138683,6,"[138677,138683)",,,
7267,2ed896eec0538d57033359a0a9fa34c1,8830,8830,c,open,0,138677,138684,7,"[138677,138684)",,,
7273,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,__vfs_getxattr,0,138705,138706,1,"[138705,138706)",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
213,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,page_counter_try_charge,0,632182131,632182152,21,"[632182131,632182152)",,,
215,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_send_write_pages,0,632183383,632191203,7820,"[632183383,632191203)",,,
216,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_file_write_iter,0,632183383,632191204,7821,"[632183383,632191204)",,,
217,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_flush_writepages,0,632191238,632191239,1,"[632191238,632191239)",,,


In [25]:
len(event)

6012980

In [26]:
def contains(x):
    x["valid"] = "0"
    if x["ts"] is not np.nan and x["dur"] is not np.nan:
        val = relevant_intervals.overlaps(I.from_string(str(x["interval"]), int))           
        x["valid"] = "1" if val else "0"  
    return x
filtered_events["valid"] = "0"
valid_events = filtered_events.apply(contains, axis=1, meta=filtered_events).query("valid == '1'").persist()
_ = wait(valid_events)

In [28]:
valid_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash,valid
7263,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,ima_file_check,0,138677,138678,1,"[138677,138678)",,,,1
7265,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,openat,0,138677,138680,3,"[138677,138680)",,9277debc67b02019e012acf9bf32eb36,,1
7266,2ed896eec0538d57033359a0a9fa34c1,8830,8830,c,open64,0,138677,138683,6,"[138677,138683)",,,,1
7267,2ed896eec0538d57033359a0a9fa34c1,8830,8830,c,open,0,138677,138684,7,"[138677,138684)",,,,1
14832,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_send_write_pages,0,180823,187696,6873,"[180823,187696)",,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15374,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_send_write_pages,0,630215483,630227428,11945,"[630215483,630227428)",,,,1
15375,2ed896eec0538d57033359a0a9fa34c1,8830,8830,kernel,fuse_file_write_iter,0,630215483,630227430,11947,"[630215483,630227430)",,,,1
15376,2ed896eec0538d57033359a0a9fa34c1,8830,8830,vfs,vfs_write,0,630215483,630227431,11948,"[630215483,630227431)",,,,1
15377,2ed896eec0538d57033359a0a9fa34c1,8830,8830,sys,write,0,630215483,630227433,11950,"[630215483,630227433)",4194304.0,9277debc67b02019e012acf9bf32eb36,,1


In [41]:
valid_events.groupby(["name", "cat"])["dur"].sum().compute() / 1e6

name                     cat   
fuse_file_write_iter     kernel    15.088548
fuse_send_write_pages    kernel     15.11327
open64                   c          0.000006
write                    sys       15.121911
file_free_rcu            kernel     0.000001
__free_pages             kernel     0.000012
free_unref_page          kernel      0.00001
ima_file_check           kernel     0.000001
open                     c          0.000007
openat                   sys        0.000003
vfs_write                vfs       15.089591
__update_blocked_fair    kernel     0.000013
update_blocked_averages  kernel      0.00002
close                    sys        0.001675
Name: dur, dtype: double[pyarrow]