# Analysis for DataCrumbs

This is a simple analysis notebook for Datacrumbs.

## Imports

In [1]:
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import math
import zindex_py as zindex
import numpy as np
import intervals as I
import pandas as pd
from tqdm.notebook import trange, tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [3]:
app_root = str(Path(os.getcwd()).parent)

In [4]:
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [None]:
workers=16
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

## Start Analysis

In [34]:

import os 
is_trace = True
ops="read"
folder="/usr/workspace/haridev/xio/"
output=f"{folder}/output/jslines"
file=f"{folder}/ops-32_files-8/RAW-BUF.pfw.gz"
# file=f"{app_root}/tests/output/ops-64_ts-64m/RAW-BUFFERED.pfw.gz"
output_file=f"{output}/{ops}_"+os.path.basename(os.path.dirname(file))+"-"+ os.path.basename(file) + ".jsonl"
file_pattern = glob(file)
file_pattern, output_file

(['/usr/workspace/haridev/xio//ops-32_files-8/RAW-BUF.pfw.gz'],
 '/usr/workspace/haridev/xio//output/jslines/read_ops-32_files-8-RAW-BUF.pfw.gz.jsonl')

## Function to load trace data

In [35]:
def create_index(filename):
    index_file = f"{filename}.zindex"
    if not os.path.exists(index_file):
        status = zindex.create_index(filename, index_file=f"file:{index_file}",
                                     regex="id:\b([0-9]+)", numeric=True, unique=True, debug=False, verbose=False)
        logging.debug(f"Creating Index for {filename} returned {status}")
    return filename

def get_linenumber(filename):
    index_file = f"{filename}.zindex"
    line_number = zindex.get_max_line(filename, index_file=index_file, debug=False, verbose=False)
    logging.debug(f" The {filename} has {line_number} lines")
    return (filename, line_number)

def get_size(filename):
    if filename.endswith('.pfw'):
        size = os.stat(filename).st_size
    elif filename.endswith('.pfw.gz'):
        index_file = f"{filename}.zindex"
        line_number = zindex.get_max_line(filename, index_file=index_file,debug=False, verbose=False)
        size = line_number * 256
    logging.debug(f" The {filename} has {size/1024**3} GB size")
    return int(size)


def generate_line_batches(filename, max_line):
    batch_size = 16*1024
    for start in range(0, max_line, batch_size):
        end =  min((start + batch_size - 1) , (max_line - 1))
        logging.debug(f"Created a batch for {filename} from [{start}, {end}] lines")
        yield filename, start, end

def load_indexed_gzip_files(filename, start, end):
    index_file = f"{filename}.zindex"
    json_lines = zindex.zquery(filename, index_file=index_file,
                          raw=f"select a.line from LineOffsets a where a.line >= {start} AND a.line <= {end};", debug=False, verbose=False)
    logging.debug(f"Read {len(json_lines)} json lines for [{start}, {end}]")
    return json_lines

In [36]:
def load_profile(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            if "pid" in d:
                d["pid"] = val["pid"]
            if "tid" in d:
                d["tid"] = val["tid"]
            if "ts" in d:
                d["ts_us"] = int(val["ts"])
            d["filename"] = "NA"
            if "args" in val:
                if "time" in val["args"]:
                    d["dur_sec"] = float(val["args"]["time"])
                if "freq" in val["args"]:
                    d["freq"] = val["args"]["freq"]
                if "size_sum" in val["args"]:
                    d["size_bytes"] = val["args"]["size_sum"]
                if "fname" in val["args"] and val["args"]["fname"]:
                    d["filename"] = val["args"]["fname"]
            d["func_id"] = val["name"]
            d["cat"] = val["cat"]
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d


def load_trace(line):
    d = {}
    if line is not None and line !="" and len(line) > 0 and "[" != line[0] and line != "\n" :
        try:
            unicode_line = ''.join([i if ord(i) < 128 else '#' for i in line])
            val = json.loads(unicode_line)
            d["name"] = val["name"]
            d["cat"] = val["cat"]
            if "pid" in val:
                d["pid"] = val["pid"]
            if "tid" in val:
                d["tid"] = val["tid"]
            d["ts"] = 0
            d["dur"] = 0
            if "ts" in val:
                d["ts"] = int(val["ts"])
                d["te"] = int(val["ts"])
            d["dur"] = 1
            if "dur" in val:
                d["dur"] = int(val["dur"])
            if "args" in val and "hhash" in val["args"]:                    
                d["hhash"] = val["args"]["hhash"]
            if "ts" in val:
                interval = I.closedopen(d["ts"], d["ts"] + 1)
                if d["dur"] > 0:
                    d["te"] = int(val["ts"]) + d["dur"]
                    interval = I.closedopen(d["ts"], d["ts"] + d["dur"])
                d["interval"] = I.to_string(interval)
            if val["ph"] != "M":
                d["type"] = 0    
                if "args" in val:                    
                    if "hhash" in val["args"]:
                        d["hhash"] = val["args"]["hhash"]
                    if "size_sum" in val["args"]:
                        d["size"] = val["args"]["size_sum"]
                    if "fhash" in val["args"]:
                        d["fhash"] = val["args"]["fhash"]
            else:
                if val["name"] == "FH":
                    d["type"] = 1
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
                elif val["name"] == "HH":
                    d["type"] = 2
                    if "args" in val:
                        if "name" in val["args"]:
                            d["name"] = val["args"]["name"]
                        if "value" in val["args"]:
                            d["hash"] = val["args"]["value"]
            
        except Exception as error:
            logging.error(f"Processing {line} failed with {error}")
    return d

## Create Dask Dataframe

In [37]:
if len(file_pattern) > 0:
    dask.bag.from_sequence(file_pattern).map(create_index).compute()
    logging.info(f"Created index for {len(file_pattern)} files")
    total_size = dask.bag.from_sequence(file_pattern).map(get_size).sum()
    n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2))
    logging.info(f"Total size of all files are {total_size} bytes")
    max_line_numbers = dask.bag.from_sequence(file_pattern).map(get_linenumber).compute()
    logging.info(f"Max lines per file are {max_line_numbers}")
    json_line_delayed = []
    total_lines = 0
    for filename, max_line in max_line_numbers:
        total_lines += max_line
        for _, start, end in generate_line_batches(filename, max_line):
            json_line_delayed.append((filename, start, end))

    logging.info(f"Loading {len(json_line_delayed)} batches out of {len(file_pattern)} files and has {total_lines} lines overall")
    json_line_bags = []
    for filename, start, end in json_line_delayed:
        num_lines = end - start + 1
        json_line_bags.append(dask.delayed(load_indexed_gzip_files, nout=num_lines)(filename, start, end))
    json_lines = dask.bag.concat(json_line_bags)
    if is_trace:
        pfw_bag = json_lines.map(load_trace).filter(lambda x: "name" in x)
    else:
        pfw_bag = json_lines.map(load_profile).filter(lambda x: "func_id" in x)
    pfw_bag.take(1)

2024-11-06 16:11:16,603 [INFO]: Created index for 1 files in /var/tmp/haridev/ipykernel_132416/864066620.py:3
2024-11-06 16:11:16,715 [INFO]: Total size of all files are <dask.bag.core.Item object at 0x15534e3f2670> bytes in /var/tmp/haridev/ipykernel_132416/864066620.py:6
2024-11-06 16:11:16,772 [INFO]: Max lines per file are [('/usr/workspace/haridev/xio//ops-32_files-8/RAW-BUF.pfw.gz', 4906748)] in /var/tmp/haridev/ipykernel_132416/864066620.py:8
2024-11-06 16:11:16,773 [INFO]: Loading 300 batches out of 1 files and has 4906748 lines overall in /var/tmp/haridev/ipykernel_132416/864066620.py:16


In [38]:
if is_trace:
    columns = {'hhash': "string[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
                'cat': "string[pyarrow]", 'name': "string[pyarrow]", 'type':  "uint8[pyarrow]",
            'ts': "uint64[pyarrow]", 'te': "uint64[pyarrow]", 'dur': "uint64[pyarrow]", 'interval': "string[pyarrow]", 
             'size': "uint64[pyarrow]", 'fhash': "string[pyarrow]", 'hash': "string[pyarrow]", 
           }
else:
    columns = {'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]",
            'ts_us': "uint64[pyarrow]", 'dur_sec': "float32[pyarrow]", 
            'freq': "uint64[pyarrow]", 'size_bytes': "uint64[pyarrow]", 'name': "string[pyarrow]", 
            'filename': "string[pyarrow]", 
            'cat': "string[pyarrow]"}

In [39]:
events = pfw_bag.to_dataframe(meta=columns)

In [40]:
events = events.repartition(npartitions=n_partition).persist()
_ = wait(events)

In [41]:
fhash = events.query("type == 1")[["name","hash"]]
hhash = events.query("type == 2")[["name","hash"]]
event = events.query("type == 0")
fhashes = fhash.query("name.str.contains('file_0')").compute()["hash"]
fhashes = fhashes.to_list()


In [42]:
fhash.query("name.str.contains('file_0')").compute()

Unnamed: 0,name,hash
1780,/home/cc/datacrumbs/build/data/file_0_0.dat,2337428835aa42fa0d2764000f669460
4468,/home/cc/datacrumbs/build/data/file_0_1.dat,288f886adbe25d76fe445beb9911af78
7179,/home/cc/datacrumbs/build/data/file_0_2.dat,ec6c04037b2c4f483dc3c933ec088cc8
9858,/home/cc/datacrumbs/build/data/file_0_3.dat,176264bd47969d9f55d0a6d5b8021e21
12604,/home/cc/datacrumbs/build/data/file_0_4.dat,edea10a5c821e90758563992efb303dc
15252,/home/cc/datacrumbs/build/data/file_0_5.dat,c072c6a531057c46d034ace752f465e5
1555,/home/cc/datacrumbs/build/data/file_0_6.dat,df67ba39da3433bf9d33ddd244ea8a7a
4278,/home/cc/datacrumbs/build/data/file_0_7.dat,4399451135d75f5c5be27cd4057b34c8


In [43]:
event.query("fhash.isin(@value)", local_dict={"value": fhashes}).compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1781,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,openat,0,460296305,460586748,290443,"[460296305,460586748)",,2337428835aa42fa0d2764000f669460,
1861,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,461657534,461898426,240892,"[461657534,461898426)",1024,2337428835aa42fa0d2764000f669460,
1897,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,461929060,461989691,60631,"[461929060,461989691)",1024,2337428835aa42fa0d2764000f669460,
1958,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,write,0,463052970,463166181,113211,"[463052970,463166181)",1024,2337428835aa42fa0d2764000f669460,
1976,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,463185949,463209344,23395,"[463185949,463209344)",1024,2337428835aa42fa0d2764000f669460,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9825,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157325455056,157325483484,28428,"[157325455056,157325483484)",16384,4399451135d75f5c5be27cd4057b34c8,
9921,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157327418547,157327499293,80746,"[157327418547,157327499293)",16384,4399451135d75f5c5be27cd4057b34c8,
9983,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157329298722,157329325714,26992,"[157329298722,157329325714)",16384,4399451135d75f5c5be27cd4057b34c8,
10047,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157331188671,157331286554,97883,"[157331188671,157331286554)",16384,4399451135d75f5c5be27cd4057b34c8,


## Analysis

In [44]:

interesting_events = event.query("fhash.isin(@value) and name == @ops", local_dict={"value": fhashes, "ops": ops}).sort_values("ts")
interesting_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1897,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,461929060,461989691,60631,"[461929060,461989691)",1024,2337428835aa42fa0d2764000f669460,
1976,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,463185949,463209344,23395,"[463185949,463209344)",1024,2337428835aa42fa0d2764000f669460,
2035,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,464367094,464391244,24150,"[464367094,464391244)",1024,2337428835aa42fa0d2764000f669460,
2094,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,465542671,465557657,14986,"[465542671,465557657)",1024,2337428835aa42fa0d2764000f669460,
2215,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,sys,read,0,466807632,466862256,54624,"[466807632,466862256)",1024,2337428835aa42fa0d2764000f669460,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9763,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157323589364,157323670567,81203,"[157323589364,157323670567)",16384,4399451135d75f5c5be27cd4057b34c8,
9825,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157325455056,157325483484,28428,"[157325455056,157325483484)",16384,4399451135d75f5c5be27cd4057b34c8,
9921,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157327418547,157327499293,80746,"[157327418547,157327499293)",16384,4399451135d75f5c5be27cd4057b34c8,
9983,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,sys,read,0,157329298722,157329325714,26992,"[157329298722,157329325714)",16384,4399451135d75f5c5be27cd4057b34c8,


In [45]:
interesting_events["combined_name"] = interesting_events["name"] + "-" + interesting_events["cat"]
ts_events = interesting_events[["size"]].compute().reset_index().drop("index", axis=1)
ts_events

Unnamed: 0,size
0,1024
1,1024
2,1024
3,1024
4,1024
...,...
6139,16384
6140,16384
6141,16384
6142,16384


In [46]:
interesting_intervals  = interesting_events["interval"].compute()
interesting_intervals

1897           [461929060,461989691)
1976           [463185949,463209344)
2035           [464367094,464391244)
2094           [465542671,465557657)
2215           [466807632,466862256)
                    ...             
9763     [157323589364,157323670567)
9825     [157325455056,157325483484)
9921     [157327418547,157327499293)
9983     [157329298722,157329325714)
10047    [157331188671,157331286554)
Name: interval, Length: 6144, dtype: string

In [47]:

# interesting_events["interval"] = interesting_events.apply(lambda x: I.to_string(I.closed(x["ts"], x["ts"]+x["dur"])), axis=1)

In [48]:
def group_func(df):
    val = I.empty()
    for index, value in df.items():
        if str(value) != 'NA':
            pad_interval = I.from_string(str(value), int)
            val = val.union(pad_interval)
    logging.debug(f"Grouped Range into {val}")
    return I.to_string(val)
def union_portions():
    return dd.Aggregation(
        'union_portions',
        chunk=lambda s: s.apply(group_func),
        agg=lambda s: s.apply(group_func)
    )
relevant_intervals = interesting_events.reduction(chunk=lambda s: s.apply(group_func), aggregate=lambda s1: s1.apply(group_func))["interval"].compute()
relevant_intervals = I.from_string(relevant_intervals, int)
relevant_intervals_list = list(relevant_intervals)
relevant_intervals_list[:10], len(relevant_intervals_list)

([[461929060,461989691),
  [463185949,463209344),
  [464367094,464391244),
  [465542671,465557657),
  [466807632,466862256),
  [468020294,468044363),
  [469198751,469221494),
  [470410811,470464302),
  [471677069,471701819),
  [472857249,472879576)],
 6144)

In [49]:
min_ts = relevant_intervals_list[0].lower
max_te = relevant_intervals_list[-1].upper
min_ts, max_te

(461929060, 157331286554)

In [50]:
filtered_events = event.query(f"ts >= {min_ts - 1e5} and te <= {max_te + 1e5} and dur > 0")
filtered_events.compute()

Unnamed: 0,hhash,pid,tid,cat,name,type,ts,te,dur,interval,size,fhash,hash
1831,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,profile_tick,0,461831507,461832550,1043,"[461831507,461832550)",,,
1832,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,__update_blocked_fair,0,461837120,461838451,1331,"[461837120,461838451)",,,
1833,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,update_blocked_averages,0,461835227,461839381,4154,"[461835227,461839381)",,,
1836,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,unlock_page,0,461845806,461846603,797,"[461845806,461846603)",,,
1837,ecd9cccc050c9e893ab33b1a228fe76d,102273,102273,kernel,ext4_journal_check_start,0,461851191,461851967,776,"[461851191,461851967)",,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10052,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_node_page_state,0,157331371262,157331372292,1030,"[157331371262,157331372292)",,,
10053,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_zone_page_state,0,157331374708,157331375801,1093,"[157331374708,157331375801)",,,
10054,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_node_page_state,0,157331377696,157331378572,876,"[157331377696,157331378572)",,,
10055,ecd9cccc050c9e893ab33b1a228fe76d,103600,103600,kernel,__mod_zone_page_state,0,157331380404,157331381160,756,"[157331380404,157331381160)",,,


In [None]:
filtered_events["combined_name"] = filtered_events["name"] + "-" + filtered_events["cat"]
event_batch_per_sys_call = []
for interval in tqdm(relevant_intervals_list):
    a_overlaps_b = f"(ts >= {interval.lower} and ts <=  {interval.upper}) or (te >= {interval.lower} and te <=  {interval.upper})"
    b_overlaps_a = f"({interval.lower} >= ts and {interval.lower} <=  te) or ({interval.upper} >= ts and {interval.upper} <=  te)"
    batch = filtered_events.query(f"{a_overlaps_b} or {b_overlaps_a}")[["combined_name", "dur"]].groupby("combined_name").sum()
    event_batch_per_sys_call.append(batch.compute())


  0%|          | 0/6144 [00:00<?, ?it/s]

In [29]:
merged_df = None
count = 0
for batch in tqdm(event_batch_per_sys_call):
    if merged_df is not None:
        merged_df = merged_df.merge(batch, how='outer', on="combined_name",suffixes=('', f"_{count}"))
    else:
        merged_df = batch
    count += 1

  0%|          | 0/6400 [00:00<?, ?it/s]



In [30]:
dataset = merged_df.transpose()
df = dataset.reset_index().drop("index", axis=1)
df

combined_name,__alloc_pages-kernel,__bio_add_page-kernel,__bio_iov_iter_get_pages-kernel,__bio_split_to_limits-kernel,__blk_bios_map_sg-kernel,__ext4_get_inode_loc-kernel,__ext4_handle_dirty_metadata-kernel,__ext4_journal_get_write_access-kernel,__ext4_journal_start_sb-kernel,__ext4_journal_stop-kernel,...,security_file_permission-kernel,should_fail_alloc_page-kernel,should_fail_bio-kernel,submit_bio-kernel,submit_bio_noacct-kernel,submit_bio_noacct_nocheck-kernel,try_grab_page-kernel,update_blocked_averages-kernel,vfs_read-vfs,vm_normal_page-kernel
0,,221,7964,,282,2050,226,265,1060,320,...,2141,,198,10392,9557,8190,,,150886,
1,,515,13507,,217,,,,,,...,4005,,197,9151,8423,7089,,,165743,
2,,221,5583,,295,2154,220,262,943,363,...,1806,,201,9353,8645,7362,,,131196,
3,,600,13399,,697,,,,,,...,3836,,549,24332,22364,18886,,,198183,
4,,224,5641,,253,2020,193,255,943,287,...,1973,,201,10189,9417,8138,,,135656,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6395,,,,,,,,,,,...,1629,,,,,,,,451540,
6396,,,,,,2962,264,387,1380,364,...,1670,,,,,,,,479202,
6397,,,,,,,,,,,...,2067,,,,,,,,445734,
6398,,,,,,2907,264,317,1959,319,...,1245,,,,,,,,1523898,


In [31]:
final_dataset = df.merge(ts_events, how='outer', left_index=True, right_index=True)
final_dataset["BW"] = final_dataset[f"size"] / (1024**2) / (final_dataset[f"{ops}-sys"] / 1e9)
final_dataset.drop([f"{ops}-sys", "size"], inplace=True, axis=1)
final_dataset.columns

Index(['__alloc_pages-kernel', '__bio_add_page-kernel',
       '__bio_iov_iter_get_pages-kernel', '__bio_split_to_limits-kernel',
       '__blk_bios_map_sg-kernel', '__ext4_get_inode_loc-kernel',
       '__ext4_handle_dirty_metadata-kernel',
       '__ext4_journal_get_write_access-kernel',
       '__ext4_journal_start_sb-kernel', '__ext4_journal_stop-kernel',
       '__ext4_mark_inode_dirty-kernel', '__find_get_block-kernel',
       '__get_user_pages-kernel', '__mod_lruvec_page_state-kernel',
       '__mod_node_page_state-kernel', '__mod_zone_page_state-kernel',
       '__submit_bio-kernel', '__update_blocked_fair-kernel',
       'aa_file_perm-kernel', 'apparmor_file_permission-kernel',
       'bio_alloc_bioset-kernel', 'bio_associate_blkg-kernel',
       'bio_associate_blkg_from_css-kernel', 'bio_integrity_prep-kernel',
       'bio_iov_iter_get_pages-kernel', 'bio_set_pages_dirty-kernel',
       'bio_split_rw-kernel', 'bio_to_wbt_flags-kernel',
       'blk_cgroup_bio_start-kernel', 'b

In [32]:
output_file
final_dataset["op"] = ops

In [33]:
final_dataset.to_json(path_or_buf=f"{output_file}",orient='records', lines=True)


In [None]:
files = glob(f"{output_file}")
final_dataset_l = []
for file in files:
    final_dataset_l.append(dd.read_json(file))
final_dataset = dd.concat(final_dataset_l).compute().reset_index().drop("index", axis=1)
final_dataset

## make the number of relevance features dynamic.

1. Add up the importance score to reach 95%.
2. Add Transfer size
3. Split features into layers and do this analysis per layer.
4. Correlation
   1. correlation matrix.
   2. PCA
   3. Lasso Regression (L1)
   4. Auto regression
5. SHAPLEY value (feature importance)
   1. Tree SHAP
6. How portable are the interfaces (do not overfit)

1. Tanzima for better models


## models
- sequential training: gradient boost
- 