# Analysis for DataCrumbs

This is a simple analysis notebook for Datacrumbs.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import math
import zindex_py as zindex
import numpy as np
import intervals as I
import pandas as pd
from tqdm.notebook import trange, tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
app_root = str(Path(os.getcwd()).parent)

In [4]:
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [7]:
workers=16
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39457 instead
2024-11-08 10:50:12,354 [INFO]: Initialized Client with 16 workers and link http://127.0.0.1:39457/status in /var/tmp/haridev/ipykernel_3664668/3142773904.py:4


## Start Analysis

In [50]:

import os 
is_trace = True
ops="openat"
folder="/usr/workspace/haridev/xio/"
output=f"{folder}/output/jslines"
file=f"{folder}/ops-32_files-8/RAW-BUF.pfw.gz"
# file=f"{app_root}/tests/output/ops-64_ts-64m/RAW-BUFFERED.pfw.gz"
output_file=f"{output}/all_"+os.path.basename(os.path.dirname(file))+"-"+ os.path.basename(file) + ".jsonl"
file_pattern = glob(file)
file_pattern, output_file

(['/usr/workspace/haridev/xio//ops-32_files-8/RAW-BUF.pfw.gz'],
 '/usr/workspace/haridev/xio//output/jslines/all_ops-32_files-8-RAW-BUF.pfw.gz.jsonl')

## Function to load trace data

In [9]:
def create_index(filename):
    index_file = f"{filename}.zindex"
    if not os.path.exists(index_file):
        status = zindex.create_index(filename, index_file=f"file:{index_file}",
                                     regex="id:\b([0-9]+)", numeric=True, unique=True, debug=False, verbose=False)
        logging.debug(f"Creating Index for {filename} returned {status}")
    return filename

def get_linenumber(filename):
    index_file = f"{filename}.zindex"
    line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
    logging.debug(f" The {filename} has {line_number} lines")
    return (filename, line_number)

def get_size(filename):
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file,debug=False, verbose=False)
        size = line_number * 256
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line):
    batch_size = 16*1024
    for start in range(0, max_line, batch_size):
        end =  min((start + batch_size - 1) , (max_line - 1))
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    index_file = f"{filename}.zindex"
    json_lines = zindex.zquery(filename, index_file=index_file,
                          raw=f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};", debug=False, verbose=False)
    logging.debug(f"Read {len(json_lines)} json lines for [{start}, {end}]")
    return json_lines

In [10]:
def load_profile(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            if "pid" in d:
                d["pid"] = val["pid"]
            if "tid" in d:
                d["tid"] = val["tid"]
            if "ts" in d:
                d["ts_us"] = int(val["ts"])
            d["filename"] = "NA"
            if "args" in val:
                if "time" in val["args"]:
                    d["dur_sec"] = float(val["args"]["time"])
                if "freq" in val["args"]:
                    d["freq"] = val["args"]["freq"]
                if "size_sum" in val["args"]:
                    d["size_bytes"] = val["args"]["size_sum"]
                if "fname" in val["args"] and val["args"]["fname"]:
                    d["filename"] = val["args"]["fname"]
            d["func_id"] = val["name"]
            d["cat"] = val["cat"]
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d


def load_trace(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            d["name"] = val["name"]
            d["cat"] = val["cat"]
            if "pid" in val:
                d["pid"] = val["pid"]
            if "tid" in val:
                d["tid"] = val["tid"]
            d["ts"] = 0
            d["dur"] = 0
            if "ts" in val:
                d["ts"] = int(val["ts"])
                d["te"] = int(val["ts"])
            d["dur"] = 1
            if "dur" in val:
                d["dur"] = int(val["dur"])
            if "args" in val and "hhash" in val["args"]:                    
                d["hhash"] = val["args"]["hhash"]
            if "ts" in val:
                interval = I.closedopen(d["ts"], d["ts"] + 1)
                if d["dur"] > 0:
                    d["te"] = int(val["ts"]) + d["dur"]
                    interval = I.closedopen(d["ts"], d["ts"] + d["dur"])
                d["interval"] = I.to_string(interval)
            if val["ph"] != "M":
                d["type"] = 0    
                if "args" in val:                    
                    if "hhash" in val["args"]:
                        d["hhash"] = val["args"]["hhash"]
                    if "size_sum" in val["args"]:
                        d["size"] = val["args"]["size_sum"]
                    if "fhash" in val["args"]:
                        d["fhash"] = val["args"]["fhash"]
            else:
                if val["name"] == "FH":
                    d["type"] = 1
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
                elif val["name"] == "HH":
                    d["type"] = 2
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
            
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d

## Create Dask Dataframe

In [11]:
if len(file_pattern) > 0:
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2))
    logging.info(f"Total size of all files are {total_size} bytes")
    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")
    json_line_delayed = []
    total_lines = 0
    for filename, max_line in max_line_numbers:
        total_lines += max_line
        for _, start, end in generate_line_batches(filename, max_line):
            json_line_delayed.append((filename, start, end))

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")
    json_line_bags = []
    for filename, start, end in json_line_delayed:
        num_lines = end - start + 1
        json_line_bags.append(dask.delayed(load_indexed_gzip_files, nout=num_lines)(filename, start, end))
    json_lines = dask.bag.concat(json_line_bags)
    if is_trace:
        pfw_bag = json_lines.map(load_trace).filter(lambda x: "name" in x)
    else:
        pfw_bag = json_lines.map(load_profile).filter(lambda x: "func_id" in x)
    pfw_bag.take(1)

2024-11-08 10:50:17,308 [INFO]: Created index for 1 files in /var/tmp/haridev/ipykernel_3664668/864066620.py:3
2024-11-08 10:50:17,487 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x1554c0d99d60> bytes in /var/tmp/haridev/ipykernel_3664668/864066620.py:6
2024-11-08 10:50:17,569 [INFO]: Max lines per file are [('/usr/workspace/haridev/xio//ops-32_files-8/RAW-BUF.pfw.gz', 4906748)] in /var/tmp/haridev/ipykernel_3664668/864066620.py:8
2024-11-08 10:50:17,571 [INFO]: Loading 300 batches out of 1 files and has 4906748 lines overall in /var/tmp/haridev/ipykernel_3664668/864066620.py:16


In [12]:
if is_trace:
    columns = {'hhash': "string[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
                'cat': "string[pyarrow]", 'name': "string[pyarrow]", 'type':  "uint8[pyarrow]",
            'ts': "uint64[pyarrow]", 'te': "uint64[pyarrow]", 'dur': "uint64[pyarrow]", 'interval': "string[pyarrow]", 
             'size': "uint64[pyarrow]", 'fhash': "string[pyarrow]", 'hash': "string[pyarrow]", 
           }
else:
    columns = {'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
            'ts_us': "uint64[pyarrow]", 'dur_sec': "float32[pyarrow]", 
            'freq': "uint64[pyarrow]", 'size_bytes': "uint64[pyarrow]", 'name': "string[pyarrow]", 
            'filename': "string[pyarrow]", 
            'cat': "string[pyarrow]"}

In [13]:
events = pfw_bag.to_dataframe(meta=columns)

In [14]:
events = events.repartition(npartitions=n_partition).persist()
_ = wait(events)

In [15]:
fhash = events.query("type == 1")[["name","hash"]]
hhash = events.query("type == 2")[["name","hash"]]
event = events.query("type == 0")
fhashes = fhash.query("name.str.contains('file_0')").compute()["hash"]
fhashes = fhashes.to_list()


In [16]:
fhash.query("name.str.contains('file_0')").compute()

Unnamed: 0,name,hash
1780,/home/cc/datacrumbs/build/data/file_0_0.dat,2337428835aa42fa0d2764000f669460
4468,/home/cc/datacrumbs/build/data/file_0_1.dat,288f886adbe25d76fe445beb9911af78
7179,/home/cc/datacrumbs/build/data/file_0_2.dat,ec6c04037b2c4f483dc3c933ec088cc8
9858,/home/cc/datacrumbs/build/data/file_0_3.dat,176264bd47969d9f55d0a6d5b8021e21
12604,/home/cc/datacrumbs/build/data/file_0_4.dat,edea10a5c821e90758563992efb303dc
15252,/home/cc/datacrumbs/build/data/file_0_5.dat,c072c6a531057c46d034ace752f465e5
1555,/home/cc/datacrumbs/build/data/file_0_6.dat,df67ba39da3433bf9d33ddd244ea8a7a
4278,/home/cc/datacrumbs/build/data/file_0_7.dat,4399451135d75f5c5be27cd4057b34c8


In [17]:
event.query("fhash.isin(@value)", local_dict={"value": fhashes}).compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1781,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,openat,0,460296305,460586748,290443,"[460296305,460586748)",,2337428835aa42fa0d2764000f669460,
1861,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,461657534,461898426,240892,"[461657534,461898426)",1024,2337428835aa42fa0d2764000f669460,
1897,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,461929060,461989691,60631,"[461929060,461989691)",1024,2337428835aa42fa0d2764000f669460,
1958,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,463052970,463166181,113211,"[463052970,463166181)",1024,2337428835aa42fa0d2764000f669460,
1976,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,463185949,463209344,23395,"[463185949,463209344)",1024,2337428835aa42fa0d2764000f669460,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9825,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157325455056,157325483484,28428,"[157325455056,157325483484)",16384,4399451135d75f5c5be27cd4057b34c8,
9921,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157327418547,157327499293,80746,"[157327418547,157327499293)",16384,4399451135d75f5c5be27cd4057b34c8,
9983,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157329298722,157329325714,26992,"[157329298722,157329325714)",16384,4399451135d75f5c5be27cd4057b34c8,
10047,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157331188671,157331286554,97883,"[157331188671,157331286554)",16384,4399451135d75f5c5be27cd4057b34c8,


## Analysis

In [18]:

interesting_events = event.query("fhash.isin(@value)", local_dict={"value": fhashes}).sort_values("ts")
interesting_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1781,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,openat,0,460296305,460586748,290443,"[460296305,460586748)",,2337428835aa42fa0d2764000f669460,
1861,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,461657534,461898426,240892,"[461657534,461898426)",1024,2337428835aa42fa0d2764000f669460,
1897,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,461929060,461989691,60631,"[461929060,461989691)",1024,2337428835aa42fa0d2764000f669460,
1958,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,463052970,463166181,113211,"[463052970,463166181)",1024,2337428835aa42fa0d2764000f669460,
1976,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,463185949,463209344,23395,"[463185949,463209344)",1024,2337428835aa42fa0d2764000f669460,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9825,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157325455056,157325483484,28428,"[157325455056,157325483484)",16384,4399451135d75f5c5be27cd4057b34c8,
9921,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157327418547,157327499293,80746,"[157327418547,157327499293)",16384,4399451135d75f5c5be27cd4057b34c8,
9983,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157329298722,157329325714,26992,"[157329298722,157329325714)",16384,4399451135d75f5c5be27cd4057b34c8,
10047,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157331188671,157331286554,97883,"[157331188671,157331286554)",16384,4399451135d75f5c5be27cd4057b34c8,


In [19]:
interesting_events["combined_name"] = interesting_events["name"] + "-" + interesting_events["cat"]
ts_events = interesting_events[["size"]].compute().reset_index().drop("index", axis=1)
ts_events["size"].fillna(value=1.0)

0            1
1         1024
2         1024
3         1024
4         1024
         ...  
12411    16384
12412    16384
12413    16384
12414    16384
12415        1
Name: size, Length: 12416, dtype: uint64[pyarrow]

In [31]:
interesting_intervals  = interesting_events[["interval","name"]].compute()
interesting_intervals

Unnamed: 0,interval,name
1781,"[460296305,460586748)",openat
1861,"[461657534,461898426)",write
1897,"[461929060,461989691)",read
1958,"[463052970,463166181)",write
1976,"[463185949,463209344)",read
...,...,...
9825,"[157325455056,157325483484)",read
9921,"[157327418547,157327499293)",read
9983,"[157329298722,157329325714)",read
10047,"[157331188671,157331286554)",read


In [32]:

# interesting_events["interval"] = interesting_events.apply(lambda x: I.to_string(I.closed(x["ts"], x["ts"]+x["dur"])), axis=1)

In [25]:
def group_func(df):
    val = I.empty()
    for index, value in df.items():
        if str(value) != 'NA':
            pad_interval = I.from_string(str(value), int)
            val = val.union(pad_interval)
    logging.debug(f"Grouped Range into {val}")
    return I.to_string(val)
def union_portions():
    return dd.Aggregation(
        'union_portions',
        chunk=lambda s: s.apply(group_func),
        agg=lambda s: s.apply(group_func)
    )
relevant_intervals = interesting_events[["interval"]].reduction(chunk=lambda s: s.apply(group_func), aggregate=lambda s1: s1.apply(group_func))["interval"].compute()
relevant_intervals = I.from_string(relevant_intervals, int)
relevant_intervals_list = list(relevant_intervals)
relevant_intervals_list[:10], len(relevant_intervals_list)

([[460296305,460586748),
  [461657534,461898426),
  [461929060,461989691),
  [463052970,463166181),
  [463185949,463209344),
  [464270483,464347630),
  [464367094,464391244),
  [465452044,465529545),
  [465542671,465557657),
  [466617931,466787067)],
 12416)

In [26]:
min_ts = relevant_intervals_list[0].lower
max_te = relevant_intervals_list[-1].upper
min_ts, max_te

(460296305, 157331295261)

In [28]:
filtered_events = event.query(f"ts >= {min_ts - 1e5} and te <= {max_te + 1e5} and dur > 0")
filtered_events.compute()
# filtered_events = event

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1587,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,__mod_node_page_state,0,460196345,460196775,430,"[460196345,460196775)",,,
1590,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,next_uptodate_page,0,460198409,460198925,516,"[460198409,460198925)",,,
1591,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,__mod_node_page_state,0,460201504,460201812,308,"[460201504,460201812)",,,
1592,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,__mod_lruvec_page_state,0,460200687,460202161,1474,"[460200687,460202161)",,,
1593,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,page_add_file_rmap,0,460199794,460202467,2673,"[460199794,460202467)",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10056,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_node_page_state,0,157331383211,157331383943,732,"[157331383211,157331383943)",,,
10057,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_zone_page_state,0,157331385911,157331386579,668,"[157331385911,157331386579)",,,
10058,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_node_page_state,0,157331388446,157331389308,862,"[157331388446,157331389308)",,,
10059,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_zone_page_state,0,157331391133,157331391930,797,"[157331391133,157331391930)",,,


In [49]:
filtered_events["combined_name"] = filtered_events["name"] + "-" + filtered_events["cat"]
event_batch_per_sys_call = []
rows = list(interesting_intervals.iterrows())
count = 0
ops_map = {}
ops_counter = 0
for index, row in tqdm(rows):
    interval = I.from_string(row["interval"], int)
    ops = row["name"]
    a_overlaps_b = f"(ts >= {interval.lower} and ts <=  {interval.upper}) or (te >= {interval.lower} and te <=  {interval.upper})"
    b_overlaps_a = f"({interval.lower} >= ts and {interval.lower} <=  te) or ({interval.upper} >= ts and {interval.upper} <=  te)"
    batch = filtered_events.query(f"{a_overlaps_b} or {b_overlaps_a}")[["combined_name", "dur"]].groupby("combined_name").sum().compute()
    if ops in ops_map:
        op_value = ops_map[ops]
    else:
        ops_counter += 1
        op_value = ops_counter
        ops_map[ops] = op_value
    batch.loc['op'] = [op_value]
    event_batch_per_sys_call.append(batch)
    count += 1


  0%|          | 0/12416 [00:00<?, ?it/s]

In [45]:
merged_df = None
count = 0
for batch in tqdm(event_batch_per_sys_call):
    if merged_df is not None:
        merged_df = merged_df.merge(batch, how='outer', on="combined_name",suffixes=('', f"_{count}"))
    else:
        merged_df = batch
    count += 1

  0%|          | 0/11 [00:00<?, ?it/s]

In [48]:
dataset = merged_df.transpose()
df = dataset.reset_index().drop("index", axis=1)
df["op_name"] = "UNKNOWN"
for key, value in ops_map.items():
    df["op_name"] = df["op_name"].mask(df["op"].eq(value), key)
df["op_name"].unique()

array(['openat', 'write', 'read'], dtype=object)

In [39]:
final_dataset = df.merge(ts_events, how='outer', left_index=True, right_index=True)
final_dataset["BW"] = final_dataset[f"size"] / (1024**2) / (final_dataset[f"{ops}-sys"] / 1e9)
final_dataset.drop([f"{ops}-sys", "size"], inplace=True, axis=1)
final_dataset.columns

Index(['__alloc_pages-kernel', '__ext4_check_dir_entry-kernel',
       '__ext4_ext_check-kernel', '__ext4_ext_dirty-kernel',
       '__ext4_find_entry-kernel', '__ext4_get_inode_loc-kernel',
       '__ext4_handle_dirty_metadata-kernel',
       '__ext4_journal_ensure_credits-kernel',
       '__ext4_journal_get_write_access-kernel',
       '__ext4_journal_start_sb-kernel',
       ...
       'release_pages-kernel', 'security_file_alloc-kernel',
       'security_file_open-kernel', 'security_file_truncate-kernel',
       'should_fail_alloc_page-kernel', 'truncate_inode_pages_range-kernel',
       'truncate_pagecache-kernel', 'update_blocked_averages-kernel',
       'vfs_open-kernel', 'BW'],
      dtype='object', length=186)

In [40]:
output_file
final_dataset["op"] = ops

In [41]:
final_dataset.to_json(path_or_buf=f"{output_file}",orient='records', lines=True)


In [11]:
output_file="/usr/workspace/haridev/xio/output/jslines/write_ops-64_ts-64m-RAW-DIRECT.pfw.gz.jsonl"
ops="write"

In [12]:
from glob import glob
files = glob(f"{output_file}")
final_dataset_l = []
for file in files:
    final_dataset_l.append(dd.read_json(file))
final_dataset = dd.concat(final_dataset_l).compute().reset_index().drop("index", axis=1)
final_dataset["BW"] = final_dataset[f"transfer_size"] / (1024**2) / (final_dataset[f"{ops}-sys"]/1e9)
final_dataset["op"] = ops
final_dataset.to_json(path_or_buf=f"{output_file}",orient='records', lines=True)


## make the number of relevance features dynamic.

1. Add up the importance score to reach 95%.
2. Add Transfer size
3. Split features into layers and do this analysis per layer.
4. Correlation
   1. correlation matrix.
   2. PCA
   3. Lasso Regression (L1)
   4. Auto regression
5. SHAPLEY value (feature importance)
   1. Tree SHAP
6. How portable are the interfaces (do not overfit)

1. Tanzima for better models


## models
- sequential training: gradient boost
- 