# Analysis for DataCrumbs

This is a simple analysis notebook for Datacrumbs.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import math
import zindex_py as zindex
import numpy as np
import intervals as I
import pandas as pd
from tqdm.notebook import trange, tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
app_root = str(Path(os.getcwd()).parent)

In [4]:
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [5]:
workers=16
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

Perhaps you already have a cluster running?
Hosting the HTTP server on port 46859 instead
2025-02-27 10:50:36,672 [INFO]: Initialized Client with 16 workers and link http://127.0.0.1:46859/status in /var/tmp/haridev/ipykernel_2967325/3142773904.py:4




## Start Analysis

In [35]:

import os 
is_trace = True
ops="openat"
folder="/usr/workspace/haridev/xio/"
output=f"{folder}/output/jslines"
file=f"/usr/workspace/haridev/datacrumbs_new/data/trace-posix.pfw.gz"
# file=f"{app_root}/tests/output/ops-64_ts-64m/RAW-BUFFERED.pfw.gz"
output_file=f"{output}/all_"+os.path.basename(os.path.dirname(file))+"-"+ os.path.basename(file) + ".jsonl"
file_pattern = glob(file)
file_pattern, output_file

(['/usr/workspace/haridev/datacrumbs_new/data/trace-posix.pfw.gz'],
 '/usr/workspace/haridev/xio//output/jslines/all_data-trace-posix.pfw.gz.jsonl')

## Function to load trace data

In [36]:
def create_index(filename):
    index_file = f"{filename}.zindex"
    if os.path.exists(index_file):
        os.remove(index_file)
    if not os.path.exists(index_file):
        status = zindex.create_index(filename, index_file=f"file:{index_file}",
                                     regex="id:\b([0-9]+)", numeric=True, unique=True, debug=False, verbose=False)
        logging.debug(f"Creating Index for {filename} returned {status}")
    return filename

def get_linenumber(filename):
    index_file = f"{filename}.zindex"
    line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
    logging.debug(f" The {filename} has {line_number} lines")
    return (filename, line_number)

def get_size(filename):
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file,debug=False, verbose=False)
        size = line_number * 256
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line):
    batch_size = 16*1024
    for start in range(0, max_line, batch_size):
        end =  min((start + batch_size - 1) , (max_line - 1))
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    index_file = f"{filename}.zindex"
    json_lines = zindex.zquery(filename, index_file=index_file,
                          raw=f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};", debug=False, verbose=False)
    logging.debug(f"Read {len(json_lines)} json lines for [{start}, {end}]")
    return json_lines

In [53]:
def load_profile(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            if "pid" in d:
                d["pid"] = val["pid"]
            if "tid" in d:
                d["tid"] = val["tid"]
            if "ts" in d:
                d["ts_us"] = int(val["ts"])
            d["filename"] = "NA"
            if "args" in val:
                if "time" in val["args"]:
                    d["dur_sec"] = float(val["args"]["time"])
                if "freq" in val["args"]:
                    d["freq"] = val["args"]["freq"]
                if "size_sum" in val["args"]:
                    d["size_bytes"] = val["args"]["size_sum"]
                if "fname" in val["args"] and val["args"]["fname"]:
                    d["filename"] = val["args"]["fname"]
            d["func_id"] = val["name"]
            d["cat"] = val["cat"]
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d


def load_trace(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            d["name"] = val["name"]
            d["cat"] = val["cat"]
            if "pid" in val:
                d["pid"] = val["pid"]
            if "tid" in val:
                d["tid"] = val["tid"]
            d["ts"] = 0
            d["dur"] = 0
            if "ts" in val:
                d["ts"] = int(val["ts"])
                d["te"] = int(val["ts"])
            d["dur"] = 1
            if "dur" in val:
                d["dur"] = int(val["dur"])
            if "args" in val and "hhash" in val["args"]:                    
                d["hhash"] = val["args"]["hhash"]
            if "ts" in val:
                interval = I.closedopen(d["ts"], d["ts"] + 1)
                if d["dur"] > 0:
                    d["te"] = int(val["ts"]) + d["dur"]
                    interval = I.closedopen(d["ts"], d["ts"] + d["dur"])
                d["interval"] = I.to_string(interval)
            if val["ph"] != "M":
                d["type"] = 0    
                if "args" in val:                    
                    if "hhash" in val["args"]:
                        d["hhash"] = val["args"]["hhash"]
                    if "size_sum" in val["args"]:
                        d["size"] = val["args"]["size_sum"]
                    if "fhash" in val["args"]:
                        d["fhash"] = val["args"]["fhash"]
            else:
                if val["name"] == "FH":
                    d["type"] = 1
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
                elif val["name"] == "HH":
                    d["type"] = 2
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
            
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d

## Create Dask Dataframe

In [54]:
if len(file_pattern) > 0:
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2))
    logging.info(f"Total size of all files are {total_size} bytes")
    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")
    json_line_delayed = []
    total_lines = 0
    for filename, max_line in max_line_numbers:
        total_lines += max_line
        for _, start, end in generate_line_batches(filename, max_line):
            json_line_delayed.append((filename, start, end))

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")
    json_line_bags = []
    for filename, start, end in json_line_delayed:
        num_lines = end - start + 1
        json_line_bags.append(dask.delayed(load_indexed_gzip_files, nout=num_lines)(filename, start, end))
    json_lines = dask.bag.concat(json_line_bags)
    if is_trace:
        pfw_bag = json_lines.map(load_trace).filter(lambda x: "name" in x)
    else:
        pfw_bag = json_lines.map(load_profile).filter(lambda x: "func_id" in x)
    pfw_bag.take(1)

2025-02-27 11:06:30,900 [INFO]: Created index for 1 files in /var/tmp/haridev/ipykernel_2967325/864066620.py:3
2025-02-27 11:06:30,945 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x15537f475df0> bytes in /var/tmp/haridev/ipykernel_2967325/864066620.py:6
2025-02-27 11:06:30,980 [INFO]: Max lines per file are [('/usr/workspace/haridev/datacrumbs_new/data/trace-posix.pfw.gz', 434565)] in /var/tmp/haridev/ipykernel_2967325/864066620.py:8
2025-02-27 11:06:30,982 [INFO]: Loading 27 batches out of 1 files and has 434565 lines overall in /var/tmp/haridev/ipykernel_2967325/864066620.py:16


In [55]:
if is_trace:
    columns = {'hhash': "string[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
                'cat': "string[pyarrow]", 'name': "string[pyarrow]", 'type':  "uint64[pyarrow]",
            'ts': "uint64[pyarrow]", 'te': "uint64[pyarrow]", 'dur': "uint64[pyarrow]", 'interval': "string[pyarrow]", 
             'size': "uint64[pyarrow]", 'fhash': "string[pyarrow]", 'hash': "string[pyarrow]", 
           }
else:
    columns = {'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
            'ts_us': "uint64[pyarrow]", 'dur_sec': "float32[pyarrow]", 
            'freq': "uint64[pyarrow]", 'size_bytes': "uint64[pyarrow]", 'name': "string[pyarrow]", 
            'filename': "string[pyarrow]", 
            'cat': "string[pyarrow]"}

In [56]:
events = pfw_bag.to_dataframe(meta=columns)

In [57]:
events = events.repartition(npartitions=n_partition).persist()
_ = wait(events)

In [58]:
fhash = events.query("type == 1")[["name","hash"]]
hhash = events.query("type == 2")[["name","hash"]]
event = events.query("type == 0")
fhashes = fhash.query("name.str.contains('test')").compute()["hash"]
fhashes = fhashes.to_list()
fhashes


['4.3964026171542656e+18',
 '4.928435996063191e+18',
 '1.7310318003223237e+19',
 '8.279003197236494e+18',
 '5.20262030860475e+18',
 '4.159259069983187e+18']

In [59]:
fhash.compute()

Unnamed: 0,name,hash
7658,/usr/lib/x86_64-linux-gnu/openmpi/lib/openmpi3...,2.3763919348293345e+18
7659,tls/avx512_1/x86_64/libpmix.so.2,1.9667812553246912e+18
7660,haswell/libkrb5support.so.0,6.052585383886541e+18
7661,tls/avx512_1/libnettle.so.8,1.8315315124802222e+19
7662,/usr/lib/x86_64-linux-gnu/openmpi/lib/openmpi3...,1.4045793221426905e+19
...,...,...
8576,/usr/lib/x86_64-linux-gnu/openmpi/lib/libpsm_i...,1.4892821240287767e+18
8577,tls/libpsm2.so.2,9.85292844982091e+18
8578,tls/avx512_1/libmca_common_sm.so.40,1.1108149334883035e+19
8579,tls/avx512_1/x86_64/libnettle.so.8,1.9990554279044598e+18


In [60]:
fhash.query("name.str.contains('bin')").compute()

Unnamed: 0,name,hash
7685,glibc-hwcaps/x86-64-v2/libinfinipath.so.4,1.7077593364157571e+19
7705,tls/haswell/x86_64/libinfinipath.so.4,1.3592906208522828e+19
7714,haswell/avx512_1/x86_64/libinfinipath.so.4,1.4465143953582332e+19
7850,glibc-hwcaps/x86-64-v4/libinfinipath.so.4,5.647878578155325e+18
7918,haswell/libinfinipath.so.4,8.398307815387045e+17
7921,avx512_1/x86_64/libinfinipath.so.4,4.412321801763152e+18
7925,tls/haswell/libinfinipath.so.4,1.5672826572810867e+18
7968,tls/avx512_1/x86_64/libinfinipath.so.4,6.035178886405087e+18
8042,libinfinipath.so.4,3.8779058478142807e+18
8097,avx512_1/libinfinipath.so.4,7.583678685796221e+18


In [61]:
event.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,6918,7784,866,"[6918,7784)",,,
2,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,14083,14969,886,"[14083,14969)",,,
3,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,18404,18751,347,"[18404,18751)",,,
4,ad0234829205b9033196ba818f7a872b,42899,42899,app1,_init,0,17764,20763,2999,"[17764,20763)",,,
5,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,23971,24276,305,"[23971,24276)",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7653,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,649825197,649825541,344,"[649825197,649825541)",,,
7654,ad0234829205b9033196ba818f7a872b,42899,42899,c,free,0,649824785,649827064,2279,"[649824785,649827064)",,,
7655,ad0234829205b9033196ba818f7a872b,42899,42899,app1,main,0,23404,649828233,649804829,"[23404,649828233)",,,
7656,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,649832916,649833106,190,"[649832916,649833106)",,,


In [62]:
event.query("fhash.isin(@value)", local_dict={"value": fhashes}).compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
12886,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73644575,73671801,27226,"[73644575,73671801)",,4.159259069983187e+18,
12921,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73710257,73710821,564,"[73710257,73710821)",,4.159259069983187e+18,
12956,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73725351,73741880,16529,"[73725351,73741880)",,4.928435996063191e+18,
12991,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73764982,73765391,409,"[73764982,73765391)",,4.928435996063191e+18,
13071,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73834309,73850976,16667,"[73834309,73850976)",,5.20262030860475e+18,
13106,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73873479,73873896,417,"[73873479,73873896)",,5.20262030860475e+18,
13209,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73929075,73944740,15665,"[73929075,73944740)",,4.3964026171542656e+18,
13244,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73966183,73966584,401,"[73966183,73966584)",,4.3964026171542656e+18,
343,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,149421404,149479875,58471,"[149421404,149479875)",,8.279003197236494e+18,
378,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,149578471,149580300,1829,"[149578471,149580300)",,8.279003197236494e+18,


## Analysis

In [63]:

interesting_events = event.query("fhash.isin(@value)", local_dict={"value": fhashes}).sort_values("ts")
interesting_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
12886,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73644575,73671801,27226,"[73644575,73671801)",,4.159259069983187e+18,
12921,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73710257,73710821,564,"[73710257,73710821)",,4.159259069983187e+18,
12956,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73725351,73741880,16529,"[73725351,73741880)",,4.928435996063191e+18,
12991,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73764982,73765391,409,"[73764982,73765391)",,4.928435996063191e+18,
13071,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73834309,73850976,16667,"[73834309,73850976)",,5.20262030860475e+18,
13106,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73873479,73873896,417,"[73873479,73873896)",,5.20262030860475e+18,
13209,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,73929075,73944740,15665,"[73929075,73944740)",,4.3964026171542656e+18,
13244,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,73966183,73966584,401,"[73966183,73966584)",,4.3964026171542656e+18,
343,ad0234829205b9033196ba818f7a872b,42899,42899,sys,openat,0,149421404,149479875,58471,"[149421404,149479875)",,8.279003197236494e+18,
378,ad0234829205b9033196ba818f7a872b,42899,42899,sys,close,0,149578471,149580300,1829,"[149578471,149580300)",,8.279003197236494e+18,


In [64]:
interesting_events["combined_name"] = interesting_events["name"] + "-" + interesting_events["cat"]
ts_events = interesting_events[["size"]].compute().reset_index().drop("index", axis=1)
ts_events["size"].fillna(value=1.0)

0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
Name: size, dtype: uint64[pyarrow]

In [65]:
interesting_intervals  = interesting_events[["interval","name"]].compute()
interesting_intervals

Unnamed: 0,interval,name
12886,"[73644575,73671801)",openat
12921,"[73710257,73710821)",close
12956,"[73725351,73741880)",openat
12991,"[73764982,73765391)",close
13071,"[73834309,73850976)",openat
13106,"[73873479,73873896)",close
13209,"[73929075,73944740)",openat
13244,"[73966183,73966584)",close
343,"[149421404,149479875)",openat
378,"[149578471,149580300)",close


In [66]:

# interesting_events["interval"] = interesting_events.apply(lambda x: I.to_string(I.closed(x["ts"], x["ts"]+x["dur"])), axis=1)

In [67]:
def group_func(df):
    val = I.empty()
    for index, value in df.items():
        if str(value) != 'NA':
            pad_interval = I.from_string(str(value), int)
            val = val.union(pad_interval)
    logging.debug(f"Grouped Range into {val}")
    return I.to_string(val)
def union_portions():
    return dd.Aggregation(
        'union_portions',
        chunk=lambda s: s.apply(group_func),
        agg=lambda s: s.apply(group_func)
    )
relevant_intervals = interesting_events[["interval"]].reduction(chunk=lambda s: s.apply(group_func), aggregate=lambda s1: s1.apply(group_func))["interval"].compute()
relevant_intervals = I.from_string(relevant_intervals, int)
relevant_intervals_list = list(relevant_intervals)
relevant_intervals_list[:10], len(relevant_intervals_list)

([[73644575,73671801),
  [73710257,73710821),
  [73725351,73741880),
  [73764982,73765391),
  [73834309,73850976),
  [73873479,73873896),
  [73929075,73944740),
  [73966183,73966584),
  [149421404,149479875),
  [149578471,149580300)],
 12)

In [68]:
min_ts = relevant_intervals_list[0].lower
max_te = relevant_intervals_list[-1].upper
min_ts, max_te

(73644575, 149735087)

In [69]:
filtered_events = event.query(f"ts >= {min_ts - 1e5} and te <= {max_te + 1e5} and dur > 0")
filtered_events.compute()
# filtered_events = event

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
12789,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,73546608,73546863,255,"[73546608,73546863)",,,
12790,ad0234829205b9033196ba818f7a872b,42899,42899,c,free,0,73546279,73548200,1921,"[73546279,73548200)",,,
12791,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,73549235,73549421,186,"[73549235,73549421)",,,
12792,ad0234829205b9033196ba818f7a872b,42899,42899,c,malloc,0,73548908,73550762,1854,"[73548908,73550762)",,,
12793,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,copy_to_page,0,73551886,73552073,187,"[73551886,73552073)",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
468,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,apparmor_file_permission,0,149809629,149812043,2414,"[149809629,149812043)",,,
469,ad0234829205b9033196ba818f7a872b,42899,42899,kernel,security_file_permission,0,149808645,149812784,4139,"[149808645,149812784)",,,
470,ad0234829205b9033196ba818f7a872b,42899,42899,vfs,rw_verify_area,0,149807865,149813599,5734,"[149807865,149813599)",,,
471,ad0234829205b9033196ba818f7a872b,42899,42899,vfs,vfs_write,0,149806851,149820639,13788,"[149806851,149820639)",,,


In [70]:
filtered_events["combined_name"] = filtered_events["name"] + "-" + filtered_events["cat"]
event_batch_per_sys_call = []
rows = list(interesting_intervals.iterrows())
count = 0
ops_map = {}
ops_counter = 0
for index, row in tqdm(rows):
    interval = I.from_string(row["interval"], int)
    ops = row["name"]
    a_overlaps_b = f"(ts >= {interval.lower} and ts <=  {interval.upper}) or (te >= {interval.lower} and te <=  {interval.upper})"
    b_overlaps_a = f"({interval.lower} >= ts and {interval.lower} <=  te) or ({interval.upper} >= ts and {interval.upper} <=  te)"
    batch = filtered_events.query(f"{a_overlaps_b} or {b_overlaps_a}")[["combined_name", "dur"]].groupby("combined_name").sum().compute()
    if ops in ops_map:
        op_value = ops_map[ops]
    else:
        ops_counter += 1
        op_value = ops_counter
        ops_map[ops] = op_value
    batch.loc['op'] = [op_value]
    event_batch_per_sys_call.append(batch)
    count += 1


  0%|          | 0/12 [00:00<?, ?it/s]

In [71]:
merged_df = None
count = 0
for batch in tqdm(event_batch_per_sys_call):
    if merged_df is not None:
        merged_df = merged_df.merge(batch, how='outer', on="combined_name",suffixes=('', f"_{count}"))
    else:
        merged_df = batch
    count += 1

  0%|          | 0/12 [00:00<?, ?it/s]

In [72]:
dataset = merged_df.transpose()
df = dataset.reset_index().drop("index", axis=1)
df["op_name"] = "UNKNOWN"
for key, value in ops_map.items():
    df["op_name"] = df["op_name"].mask(df["op"].eq(value), key)
df["op_name"].unique()

array(['openat', 'close'], dtype=object)

In [39]:
final_dataset = df.merge(ts_events, how='outer', left_index=True, right_index=True)
final_dataset["BW"] = final_dataset[f"size"] / (1024**2) / (final_dataset[f"{ops}-sys"] / 1e9)
final_dataset.drop([f"{ops}-sys", "size"], inplace=True, axis=1)
final_dataset.columns

Index(['__alloc_pages-kernel', '__ext4_check_dir_entry-kernel',
       '__ext4_ext_check-kernel', '__ext4_ext_dirty-kernel',
       '__ext4_find_entry-kernel', '__ext4_get_inode_loc-kernel',
       '__ext4_handle_dirty_metadata-kernel',
       '__ext4_journal_ensure_credits-kernel',
       '__ext4_journal_get_write_access-kernel',
       '__ext4_journal_start_sb-kernel',
       ...
       'release_pages-kernel', 'security_file_alloc-kernel',
       'security_file_open-kernel', 'security_file_truncate-kernel',
       'should_fail_alloc_page-kernel', 'truncate_inode_pages_range-kernel',
       'truncate_pagecache-kernel', 'update_blocked_averages-kernel',
       'vfs_open-kernel', 'BW'],
      dtype='object', length=186)

In [40]:
output_file
final_dataset["op"] = ops

In [41]:
final_dataset.to_json(path_or_buf=f"{output_file}",orient='records', lines=True)


In [11]:
output_file="/usr/workspace/haridev/xio/output/jslines/write_ops-64_ts-64m-RAW-DIRECT.pfw.gz.jsonl"
ops="write"

In [12]:
from glob import glob
files = glob(f"{output_file}")
final_dataset_l = []
for file in files:
    final_dataset_l.append(dd.read_json(file))
final_dataset = dd.concat(final_dataset_l).compute().reset_index().drop("index", axis=1)
final_dataset["BW"] = final_dataset[f"transfer_size"] / (1024**2) / (final_dataset[f"{ops}-sys"]/1e9)
final_dataset["op"] = ops
final_dataset.to_json(path_or_buf=f"{output_file}",orient='records', lines=True)


## make the number of relevance features dynamic.

1. Add up the importance score to reach 95%.
2. Add Transfer size
3. Split features into layers and do this analysis per layer.
4. Correlation
   1. correlation matrix.
   2. PCA
   3. Lasso Regression (L1)
   4. Auto regression
5. SHAPLEY value (feature importance)
   1. Tree SHAP
6. How portable are the interfaces (do not overfit)

1. Tanzima for better models


## models
- sequential training: gradient boost
- 