# Analysis for DataCrumbs

This is a simple analysis notebook for Datacrumbs.

## Imports

In [2]:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
import logging
import json
import dask
import os
from pathlib import Path
from glob import glob
import math
import zindex_py as zindex
import numpy as np
import intervals as I
import pandas as pd
from tqdm.notebook import trange, tqdm
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt



In [3]:
import dask.dataframe as dd
from dask.distributed import Client, LocalCluster, progress, wait, get_client
from dask.distributed import Future, get_client

## Project Variables

In [4]:
app_root = str(Path(os.getcwd()).parent)

In [5]:
logging.basicConfig(
    level=logging.INFO,
    handlers=[
        logging.StreamHandler(),
    ],
    format="%(asctime)s [%(levelname)s]: %(message)s in %(pathname)s:%(lineno)d",
)

## Setup Dask Local Cluster

In [6]:
workers=16
cluster = LocalCluster(n_workers=workers)  # Launches a scheduler and workers locally
client = Client(cluster)  # Connect to distributed cluster and override default
logging.info(f"Initialized Client with {workers} workers and link {client.dashboard_link}")

2024-11-06 16:12:09,316 [INFO]: Initialized Client with 16 workers and link http://127.0.0.1:44411/status in /var/tmp/haridev/ipykernel_173228/3142773904.py:4


## Start Analysis

In [14]:

import os 
is_trace = True
ops="write"
folder="/usr/workspace/haridev/xio/"
output=f"{folder}/output/jslines"
file=f"{folder}/ops-32_files-8/RAW-DIRECT.pfw.gz"
# file=f"{app_root}/tests/output/ops-64_ts-64m/RAW-BUFFERED.pfw.gz"
output_file=f"{output}/*ops-32_files-8-RAW-DIRECT.pfw.gz.jsonl"
file_pattern = glob(file)
file_pattern, output_file

(['/usr/workspace/haridev/xio//ops-32_files-8/RAW-DIRECT.pfw.gz'],
 '/usr/workspace/haridev/xio//output/jslines/*ops-32_files-8-RAW-DIRECT.pfw.gz.jsonl')

In [15]:
files = glob(f"{output_file}")
final_dataset_l = []
for file in files:
    final_dataset_l.append(dd.read_json(file))
final_dataset = dd.concat(final_dataset_l).compute().reset_index().drop("index", axis=1)
final_dataset

Unnamed: 0,__alloc_pages-kernel,__bio_add_page-kernel,__bio_iov_iter_get_pages-kernel,__bio_split_to_limits-kernel,__blk_bios_map_sg-kernel,__ext4_ext_dirty-kernel,__ext4_get_inode_loc-kernel,__ext4_handle_dirty_metadata-kernel,__ext4_journal_get_create_access-kernel,__ext4_journal_get_write_access-kernel,...,follow_page_pte-kernel,free_unref_page_list-kernel,main-app,make_vfsgid-kernel,make_vfsuid-kernel,mark_page_accessed-os_cache,release_pages-kernel,try_grab_page-kernel,vfs_read-vfs,vm_normal_page-kernel
0,,1904.0,25548.0,,1553.0,23140.0,29007.0,6775.0,,8202.0,...,,,,,,,,,,
1,,1106.0,14339.0,,842.0,,15138.0,2693.0,,3213.0,...,,,,,,,,,,
2,,1107.0,14401.0,,822.0,,17639.0,2222.0,,3083.0,...,,,,,,,,,,
3,,524.0,13608.0,,663.0,,14700.0,2632.0,,3001.0,...,,,,,,,,,,
4,,399.0,6208.0,,346.0,23543.0,19596.0,3788.0,,4629.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,,,,,,,,,,,...,,,,273.0,350.0,,,,451540.0,
12796,,,,,,,2962.0,264.0,,387.0,...,,,,448.0,472.0,,,,479202.0,
12797,,,,,,,,,,,...,,,,298.0,348.0,,,,445734.0,
12798,,,,,,,2907.0,264.0,,317.0,...,,,,424.0,439.0,,,,1523898.0,


In [24]:
final_dataset["vfs_read-vfs"].unique()

array([     nan,  150886.,  165743., ...,  445734., 1523898.,  465574.])

In [17]:
clean_data = final_dataset.fillna(value=0)
clean_data

Unnamed: 0,__alloc_pages-kernel,__bio_add_page-kernel,__bio_iov_iter_get_pages-kernel,__bio_split_to_limits-kernel,__blk_bios_map_sg-kernel,__ext4_ext_dirty-kernel,__ext4_get_inode_loc-kernel,__ext4_handle_dirty_metadata-kernel,__ext4_journal_get_create_access-kernel,__ext4_journal_get_write_access-kernel,...,follow_page_pte-kernel,free_unref_page_list-kernel,main-app,make_vfsgid-kernel,make_vfsuid-kernel,mark_page_accessed-os_cache,release_pages-kernel,try_grab_page-kernel,vfs_read-vfs,vm_normal_page-kernel
0,0.0,1904.0,25548.0,0.0,1553.0,23140.0,29007.0,6775.0,0.0,8202.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1106.0,14339.0,0.0,842.0,0.0,15138.0,2693.0,0.0,3213.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1107.0,14401.0,0.0,822.0,0.0,17639.0,2222.0,0.0,3083.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,524.0,13608.0,0.0,663.0,0.0,14700.0,2632.0,0.0,3001.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,399.0,6208.0,0.0,346.0,23543.0,19596.0,3788.0,0.0,4629.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,273.0,350.0,0.0,0.0,0.0,451540.0,0.0
12796,0.0,0.0,0.0,0.0,0.0,0.0,2962.0,264.0,0.0,387.0,...,0.0,0.0,0.0,448.0,472.0,0.0,0.0,0.0,479202.0,0.0
12797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,298.0,348.0,0.0,0.0,0.0,445734.0,0.0
12798,0.0,0.0,0.0,0.0,0.0,0.0,2907.0,264.0,0.0,317.0,...,0.0,0.0,0.0,424.0,439.0,0.0,0.0,0.0,1523898.0,0.0


In [45]:
function_mapping = { 
    "vfs" : {   
                "vfs_write", "__vfs_getxattr", "rw_verify_area", "__vfs_getxattr", "file_modified_flags", "file_modified", "vfs_read", 
    },
    "ext4": {   
                "__ext4_get_inode_loc", "__ext4_handle_dirty_metadata",
                "__ext4_journal_get_write_access", "__ext4_journal_start_sb", "__ext4_journal_stop", "__ext4_mark_inode_dirty", "ext4_block_write_begin",
                "ext4_buffered_write_iter", "ext4_cache_extents", "ext4_claim_free_clusters", "ext4_da_get_block_prep", "ext4_da_reserve_space",
                "ext4_da_write_begin", "ext4_da_write_end", "ext4_dirty_inode", "ext4_es_find_extent_range", "ext4_es_insert_delayed_block",
                "ext4_es_insert_extent", "ext4_es_lookup_extent", "ext4_ext_map_blocks", "ext4_fc_track_inode", "ext4_file_write_iter",
                "ext4_fill_raw_inode", "ext4_find_extent", "ext4_generic_write_checks", "ext4_get_group_desc", "ext4_get_inode_loc",
                "ext4_get_reserved_space", "ext4_has_free_clusters", "ext4_inode_csum", "ext4_inode_csum_set", "ext4_inode_table",
                "ext4_journal_check_start", "ext4_mark_iloc_dirty", "ext4_nonda_switch", "ext4_reserve_inode_write", "ext4_xattr_block_get",
                "ext4_xattr_get", "ext4_xattr_ibody_get", "ext4_xattr_security_get", "__ext4_ext_dirty", "ext4_block_bitmap",
                "ext4_block_bitmap_csum_set", "ext4_dio_alignment", "ext4_dio_write_checks", "ext4_dio_write_end_io", "ext4_dio_write_iter",
                "ext4_es_cache_extent", "ext4_es_delayed_clu", "ext4_ext_correct_indexes", "ext4_ext_find_goal", "ext4_ext_get_access",
                "ext4_ext_index_trans_blocks", "ext4_ext_insert_extent", "ext4_ext_next_allocated_block", "ext4_ext_search_left", "ext4_ext_search_right",
                "ext4_ext_try_to_merge", "ext4_ext_try_to_merge_right", "ext4_ext_try_to_merge_up", "ext4_fc_track_range", "ext4_free_group_clusters",
                "ext4_free_group_clusters_set", "ext4_get_group_info", "ext4_get_group_no_and_offset", "ext4_group_desc_csum", "ext4_group_desc_csum_set",
                "ext4_handle_inode_extension", "ext4_inode_block_valid", "ext4_inode_extension_cleanup", "ext4_inode_journal_mode", "ext4_inode_to_goal_block",
                "ext4_iomap_begin", "ext4_iomap_end", "ext4_map_blocks", "ext4_mb_collect_stats", "ext4_mb_find_by_goal",
                "ext4_mb_generate_buddy", "ext4_mb_generate_from_pa", "ext4_mb_good_group", "ext4_mb_good_group_nolock", "ext4_mb_init_cache",
                "ext4_mb_initialize_context", "ext4_mb_load_buddy_gfp", "ext4_mb_mark_diskspace_used", "ext4_mb_new_blocks", "ext4_mb_pa_put_free",
                "ext4_mb_prefetch", "ext4_mb_prefetch_fini", "ext4_mb_regular_allocator", "ext4_mb_simple_scan_group", "ext4_mb_unload_buddy",
                "ext4_mb_use_best_found", "ext4_meta_trans_blocks", "ext4_orphan_add", "ext4_orphan_del", "ext4_read_block_bitmap",
                "ext4_read_block_bitmap_nowait", "ext4_sb_block_valid", "ext4_set_iomap", "ext4_superblock_csum_set", "ext4_validate_block_bitmap",
                "ext4_wait_block_bitmap", "ext4_ext_determine_insert_hole", "__ext4_journal_get_create_access", "__read_extent_tree_block", "ext4_bg_has_super",
                "ext4_block_bitmap_csum_verify", "ext4_ext_grow_indepth", "ext4_extent_block_csum_set", "ext4_free_clusters_after_init", "ext4_get_group_number",
                "ext4_group_desc_csum_verify", "ext4_init_block_bitmap", "ext4_inode_bitmap", "ext4_mark_bitmap_end", "ext4_mb_complex_scan_group",
                "ext4_new_meta_blocks", "ext4_num_base_meta_blocks", "ext4_num_overhead_clusters", "ext4_read_bh_nowait", "ext4_valid_block_bitmap",
                "ext4_mb_check_limits", "ext4_mb_mark_pa_deleted", "ext4_mb_new_group_pa", "ext4_mb_new_inode_pa", "ext4_mb_use_inode_pa",
                "ext4_block_write_begin", "ext4_buffered_write_iter", "ext4_da_get_block_prep", "ext4_da_reserve_space", "ext4_da_write_begin",
                "ext4_da_write_end", "ext4_es_insert_delayed_block", "ext4_get_reserved_space", "ext4_nonda_switch", "ext4_mb_generate_buddy",
                "ext4_mb_generate_from_pa", "ext4_mb_init_cache", "ext4_ext_determine_insert_hole", "ext4_bg_has_super", "ext4_block_bitmap_csum_verify",
                "ext4_free_clusters_after_init", "ext4_group_desc_csum_verify", "ext4_init_block_bitmap", "ext4_inode_bitmap", "ext4_mark_bitmap_end",
                "ext4_num_base_meta_blocks", "ext4_num_overhead_clusters", "ext4_read_bh_nowait", "ext4_valid_block_bitmap","ext4_file_read_iter", 
    },
    "page_cache": {"__alloc_pages", "__filemap_add_folio", "__filemap_get_folio", "__find_get_block", "__free_pages", "__jbd2_journal_file_buffer", "__mod_lruvec_page_state", "__mod_node_page_state", "__mod_zone_page_state",
                   "alloc_pages", "balance_dirty_pages", "balance_dirty_pages_ratelimited", "block_write_end", "filemap_add_folio", "filemap_alloc_folio", "filemap_get_entry", "free_tail_page_prepare", "free_unref_page",
                   "free_unref_page_commit","free_unref_page_list", "free_unref_page_prepare", "get_page_from_freelist", "mark_buffer_dirty", "prep_compound_page", "__find_get_block_slow", "bvec_try_merge_page", "filemap_check_errors",
                   "internal_get_user_pages_fast", "invalidate_inode_pages2_range", "jbd2_journal_file_inode", "pagecache_get_page", "__block_commit_write", "alloc_pages_mpol", "get_pfnblock_flags_mask", "page_mapping", "__get_free_pages",
                   "__filemap_add_folio", "__free_pages", "balance_dirty_pages", "balance_dirty_pages_ratelimited", "block_write_end", "filemap_add_folio",
                   "filemap_alloc_folio", "free_tail_page_prepare", "free_unref_page_commit", "free_unref_page_prepare", "mark_buffer_dirty", "prep_compound_page", "alloc_pages_mpol",
                   "page_mapping", "__update_load_avg_blocked_se", "page_counter_cancel", "page_counter_uncharge","should_fail_alloc_page","follow_page_pte", "follow_page_mask",
                    "do_anonymous_page","make_vfsgid", "mark_page_accessed", "__get_user_pages", "make_vfsuid", "try_grab_page",      }, 
    "vm": {"mod_node_page_state", "node_page_state", "vm_normal_page"},
    "profile": {"profile_tick"},
    "file_cache": {"file_free_rcu"},
    "sched": {"__update_blocked_fair", "__update_load_avg_blocked_se", },
    "mount": {"__mnt_drop_write_file", "__mnt_want_write_file", "mnt_get_write_access_file", "mnt_put_write_access_file", "mnt_get_write_access_file", "mnt_put_write_access_file", },
    "dentry": {"__file_remove_privs",},
    "apparmor": {"aa_file_perm", "apparmor_file_permission", "security_file_permission", },
    "crypto": {"chacha_block_generic", "fscrypt_limit_io_blocks", "fscrypt_set_bio_crypt_ctx", "fscrypt_set_bio_crypt_ctx_bh", "fscrypt_set_bio_crypt_ctx_bh",},
    "memory": {"do_numa_page", "obj_cgroup_uncharge_pages", "page_counter_cancel", "page_counter_try_charge", "page_counter_uncharge", "release_pages", "lockless_pages_from_mm", "do_numa_page",},
    "bio": {"__bio_add_page", "__bio_advance", "__bio_clone", "__bio_iov_iter_get_pages", "__bio_release_pages", "__bio_split_to_limits", "__blk_bios_map_sg", "__rq_qos_done_bio", "__submit_bio", "bio_alloc_bioset", "bio_associate_blkg",
            "bio_associate_blkg_from_css", "bio_attempt_back_merge", "bio_chain", "bio_clone_blkg_association", "bio_associate_blkg", "bio_associate_blkg_from_css", "bio_attempt_back_merge", "bio_chain", "bio_clone_blkg_association",
            "bio_crypt_ctx_mergeable", "bio_crypt_rq_ctx_compatible", "bio_endio", "bio_free", "bio_integrity_prep", "bio_iov_iter_get_pages", "bio_put", "bio_split", "bio_split_rw", "bio_to_wbt_flags",
            "bio_uninit", "blk_cgroup_bio_start", "blk_integrity_merge_bio", "blk_mq_attempt_bio_merge", "blk_mq_sched_bio_merge", "blk_mq_submit_bio","dd_bio_merge", "elv_bio_merge_ok", "iomap_dio_bio_end_io", "iomap_dio_bio_iter", 
            "iomap_dio_submit_bio", "should_fail_bio", "submit_bio", "submit_bio_noacct", "submit_bio_noacct_nocheck", "guard_bio_eod", "blk_account_io_merge_bio", "__bio_advance", "__bio_clone", "__bio_release_pages",
            "__rq_qos_done_bio", "bio_chain", "bio_clone_blkg_association", "bio_chain", "bio_clone_blkg_association", "bio_endio", "bio_free", "bio_put", "bio_uninit", "elv_bio_merge_ok", "iomap_dio_bio_end_io", "guard_bio_eod",
            "bio_set_pages_dirty", },
    "block": {"kblockd_mod_delayed_work_on", "mb_find_order_for_block", "__block_commit_write", "get_pfnblock_flags_mask", "kblockd_mod_delayed_work_on","update_blocked_averages", },
    "direct-io": {"kiocb_invalidate_pages", "pin_user_pages_fast", "unpin_user_page"}
}

In [46]:
all_mapped_functions = set()
for key, value in function_mapping.items():
    all_mapped_functions.update(value)
len(all_mapped_functions)

254

In [47]:
missed_mapping = set()
for column in clean_data.columns:
    values = column.split("-")
    if len(values) > 1 and values[0] not in all_mapped_functions:
        # print(values[0])
        missed_mapping.add(values[0])

for index, col in enumerate(missed_mapping):
    if index % 10 == 9:
        print(f"\"{col}\",")
    else:
        print(f"\"{col}\", ", end="")


"main", 

In [59]:
def find_most_related_functions(category, columns, variance_target):
    ignore_columns = ["write-sys","main", "read-sys"]
    y_columns = ["BW"]
    x_columns = []
    for column in columns:
        if column not in ignore_columns:
            col = clean_data.columns[clean_data.columns.str.contains(column)]
            if len(col) > 0:
                x_columns.append(col[0])
    # print(x_columns)
    x = clean_data.loc[:, x_columns].values
    y = clean_data.loc[:, y_columns].values
    regressor = RandomForestRegressor(n_estimators=10, random_state=0, oob_score=True)
    _ = regressor.fit(x, y.ravel())
    f_i = list(zip(x_columns,regressor.feature_importances_))
    f_i.sort(reverse=True, key = lambda x : x[1])
    variances = [x[1] for x in f_i]
    required_values = (np.cumsum(variances)<=variance_target).argmin()
    most_f_i = f_i[:required_values]
    most_important_features = [x[0] for x in most_f_i]
    # print(category, np.sum(variances[:required_values]), required_values, most_important_features)
    return np.sum(variances[:required_values]), required_values, most_important_features

In [62]:
variance_target = 0.99
min_variance_target = 0
for category, columns in function_mapping.items():
    explained_variance, number_of_features, features = find_most_related_functions(category, columns, variance_target)
    #if number_of_features > 0 and min_variance_target <= explained_variance:
    print(f"{number_of_features} features are needed to explain {explained_variance*100.0:.2f}% variance in {category} group with {features}")


4 features are needed to explain 93.82% variance in vfs group with ['vfs_read-vfs', 'vfs_write-vfs', 'file_modified_flags-kernel', 'rw_verify_area-vfs']
34 features are needed to explain 98.96% variance in ext4 group with ['ext4_file_read_iter-kernel', 'ext4_dio_write_iter-kernel', 'ext4_dio_alignment-kernel', 'ext4_iomap_begin-kernel', 'ext4_iomap_end-kernel', 'ext4_set_iomap-kernel', 'ext4_mb_use_inode_pa-kernel', 'ext4_sb_block_valid-kernel', 'ext4_get_group_desc-kernel', 'ext4_file_write_iter-kernel', 'ext4_meta_trans_blocks-kernel', 'ext4_inode_block_valid-kernel', 'ext4_fill_raw_inode-kernel', 'ext4_generic_write_checks-kernel', '__ext4_handle_dirty_metadata-kernel', 'ext4_find_extent-kernel', 'ext4_ext_get_access-kernel', 'ext4_map_blocks-kernel', 'ext4_free_group_clusters-kernel', 'ext4_ext_map_blocks-kernel', 'ext4_free_group_clusters_set-kernel', 'ext4_block_bitmap_csum_set-kernel', 'ext4_get_group_info-kernel', 'ext4_inode_csum_set-kernel', 'ext4_es_lookup_extent-kernel', '_

## make the number of relevance features dynamic.

1. Add up the importance score to reach 95%.
2. Add Transfer size
3. Split features into layers and do this analysis per layer.
4. Correlation
   1. correlation matrix.
   2. PCA
   3. Lasso Regression (L1)
   4. Auto regression
5. SHAPLEY value (feature importance)
   1. Tree SHAP
6. How portable are the interfaces (do not overfit)

1. Tanzima for better models


## models
- sequential training: gradient boost
- 