In [202]:
import os
from pathlib import Path

import pandas as pd
from rich.console import Console
from rich.table import Table

# Execute this notebook headlessly using something like:
#
# EXP_FOLDER=~/test_builds/coreutils.exp/ jupyter nbconvert --to html --execute characterize_dataset.ipynb --no-input
#

# Fallback experiment folder, used when EXP_FOLDER is not set in the environment
# (e.g. when running the notebook interactively).
MANUAL_EXP_FOLDER = Path.home() / 'test_builds' / 'astera3.exp'

# An experiment selected through the EXP_FOLDER environment variable takes precedence.
EXP_FOLDER = Path(os.environ.get('EXP_FOLDER', MANUAL_EXP_FOLDER))

In [203]:
#!head ~/test_builds/astera3.exp/rundata/run1/locals.csv

# Short experiment name: the folder name without its final suffix
# (e.g. 'astera3' for 'astera3.exp').
EXP_NAME = EXP_FOLDER.stem

# Every run lives in its own sub-folder of rundata/. Only count real run folders:
# stray files and hidden directories (e.g. .ipynb_checkpoints) are not runs.
run_dirs = sorted(
    entry for entry in (EXP_FOLDER / 'rundata').iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)
num_runs = len(run_dirs)

# This notebook assumes exactly one run (run1) - verify that up front so a bad
# experiment folder fails here rather than with a confusing error at CSV load time.
if num_runs > 1:
    raise RuntimeError(f'More than 1 run - {num_runs} found')
if num_runs == 0:
    raise RuntimeError(f'No runs found in {EXP_FOLDER / "rundata"}')

# Per-run CSV tables read by the cells below
run_folder = EXP_FOLDER / 'rundata' / 'run1'
binaries_csv = run_folder / 'binaries.csv'
locals_csv = run_folder / 'locals.csv'
funcs_csv = run_folder / 'functions.csv'
params_csv = run_folder / 'function_params.csv'

In [204]:
# Load the four per-run CSV tables (binaries, locals, functions, function params)
binaries_df, locals_df, funcs_df, params_df = map(
    pd.read_csv, (binaries_csv, locals_csv, funcs_csv, params_csv)
)

# Let pandas choose the best nullable dtype for the return-type flag
params_df['IsReturnType'] = params_df['IsReturnType'].convert_dtypes(convert_boolean=True)

# Example: joining any table that carries a BinaryId with the binary names
#   dd = pd.DataFrame({'BinaryId': [0,0,0,3,4,0,1,0,1], 'Number': list(range(9))})
#   mm = dd.merge(binaries_df, on='BinaryId', how='left')

# Dataset Composition
What is the basic makeup of this dataset in terms of its general size (binaries, functions, variables) and the number and variety of data types?

In [212]:
def _pct(part, whole):
    """Format part/whole as a percentage with one decimal place ('-' when whole is 0)."""
    return f'{part / whole * 100:.1f}%' if whole else '-'


### overall size
num_binaries = len(binaries_df.BinaryId.unique())

### exes/shared objects
# A binary is treated as a shared object when its name ends in '.so'. The isinstance
# guard keeps a missing (NaN) name from raising; such a row counts as not-a-shared-object.
# NOTE: this adds a helper column to binaries_df in place.
binaries_df['IsSharedObject'] = binaries_df.Name.apply(
    lambda name: isinstance(name, str) and name.endswith('.so')
)
bins_by_type = (
    binaries_df.groupby('IsSharedObject')[['Name']].count()
    .rename(columns={'Name': 'IsSO'})
)
sobjs = bins_by_type.IsSO[bins_by_type.index == True]
exes = bins_by_type.IsSO[bins_by_type.index == False]
# .iloc[0] is explicitly positional; a bare [0] on this boolean-labelled Series
# depends on the deprecated "integer key falls back to position" behaviour.
num_exes = 0 if exes.empty else int(exes.iloc[0])
num_sharedobjs = 0 if sobjs.empty else int(sobjs.iloc[0])

### duplicate funcs
# Number of rows per DWARF function name (the same name may appear in several binaries)
func_names = (
    funcs_df.groupby('FunctionName_DWARF')[['BinaryId']].count()
    .rename(columns={'BinaryId': 'Count'})
)

num_funcs = len(funcs_df)
# NOTE(review): unique() keeps NaN, so if some functions have no DWARF name they
# contribute one extra "unique" entry here - confirm this is intended.
num_unique_funcs = len(funcs_df.FunctionName_DWARF.unique())
# Label lookup with a default: 0 when no binary in the dataset defines main()
num_main_funcs = int(func_names.Count.get('main', 0))
# subtract 1 because the set of unique functions already counts 1 main function...
# so we just add the duplicates (never negative when there is no main at all)
dup_main_funcs = max(num_main_funcs - 1, 0)
unique_plus_main_funcs = num_unique_funcs + dup_main_funcs

### summary table
table = Table(title=f"{EXP_NAME.capitalize()} Overview")

table.add_column("Metric", justify="right", style="dodger_blue1", no_wrap=True)
table.add_column("Value")
table.add_column("%", justify="right")

# rich markup: to close a style use [/], e.g: '[bold] xyz [/]'
func_color = 'bright_magenta'
table.add_row('# Binaries', f'{num_binaries:,}', '-')
table.add_row('# Exes', f'{num_exes:,}', _pct(num_exes, num_binaries))
table.add_row('# Shared objects', f'{num_sharedobjs:,}', _pct(num_sharedobjs, num_binaries))
table.add_row(f'[{func_color}]# Functions', f'{num_funcs:,}')
table.add_row(f'[{func_color}]# Unique Functions', f'{num_unique_funcs:,}',
              f'[green4]{_pct(num_unique_funcs, num_funcs)}')
table.add_row(f'[{func_color}]# main() Functions', f'{num_main_funcs:,}',
              _pct(num_main_funcs, num_funcs))
table.add_row(f'[{func_color}]# Unique + main() Functions (YIELD)',
              f'[{func_color}]{unique_plus_main_funcs:,}',
              f'[{func_color}]{_pct(unique_plus_main_funcs, num_funcs)}')

console = Console()
console.print(table)

## Stripped functions sanity check
Group the names of all stripped functions (ignoring NaN entries) that do **NOT** start with `'FUN_'`. This should be a short list containing no *"real"* function names, only runtime helper functions such as `_DT_FINI`.

In [None]:
# Functions that have a (non-NaN) name in the stripped binary
missing_strip_name = funcs_df.FunctionName_Strip.isna()
strip_funcs = funcs_df.loc[~missing_strip_name]

# Flag names carrying the 'FUN_' prefix (an empty name is never flagged),
# then count the remaining rows per stripped name
has_fun_prefix = strip_funcs.FunctionName_Strip.map(
    lambda name: name.startswith('FUN_') if name else False
)
strip_funcs.loc[~has_fun_prefix].groupby('FunctionName_Strip').count()

Unnamed: 0_level_0,FunctionStart,FunctionName_Debug,FunctionName_DWARF,BinaryId
FunctionName_Strip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
_DT_FINI,117,117,0,117
_DT_INIT,117,117,0,117
_FINI_0,117,117,0,117
_INIT_1,1,1,1,1
_obstack_allocated_p,6,6,6,6
_obstack_begin,6,6,6,6
_obstack_begin_1,6,6,6,6
_obstack_free,6,6,6,6
_obstack_memory_used,6,6,6,6
_obstack_newchunk,6,6,6,6


# Binaries & Functions
How many binaries and functions do we have?
Do we need to filter out any duplicate functions?

In [None]:
# Distinct binaries that appear in the locals table.
# This rebinds num_binaries, which was derived from binaries_df earlier in the notebook.
binary_ids_with_locals = locals_df['BinaryId'].unique()
num_binaries = len(binary_ids_with_locals)

# NOTE: duplicates can't be determined as easily from this dataset... with a table
# of functions (1 row per function) it is easy to do a groupby and count any
# duplicates across binaries

There are 117 binaries in this dataset (that have local variables)


In [None]:
# Split the locals table by which name column is populated:
# DWARF ("true"), stripped, and debug
true_vars = locals_df.dropna(subset=['Name_DWARF'])
strip_vars = locals_df.dropna(subset=['Name_Strip'])
debug_vars = locals_df.dropna(subset=['Name_Debug'])

# Report the size of each subset
for label, subset in (('true', true_vars), ('debug', debug_vars), ('strip', strip_vars)):
    print(f'# {label} locals = {len(subset):,}')

# true locals = 30,861
# debug locals = 162,214
# strip locals = 116,101


In [None]:
# How many of the true (DWARF) locals also have a name in the stripped output?
len(true_vars.dropna(subset=['Name_Strip']))

14857

## Function Prototypes

In [None]:
# Row-validity masks: a row has a stripped / DWARF entry when the matching
# TypeCategory column is populated. Later cells reuse strip_is_valid / dwarf_is_valid.
strip_is_na = params_df.TypeCategory_Strip.isna()
dwarf_is_na = params_df.TypeCategory_DWARF.isna()
strip_is_valid = ~strip_is_na
dwarf_is_valid = ~dwarf_is_na

# Stripped type matches the DWARF type. Only rows with a valid DWARF entry are
# considered correct (the original author saw extra hits without this filter).
strip_correct = params_df[dwarf_is_valid & (params_df.Type_DWARF == params_df.Type_Strip)]
# Stripped entry exists but its type differs from the DWARF type
strip_fail = params_df[strip_is_valid & (params_df.Type_DWARF != params_df.Type_Strip)]

# Sanity check: correct + failed is expected to equal the number of stripped entries
print(len(strip_fail) + len(strip_correct))
total_stripvars = int(strip_is_valid.sum())

# Guard the division so a dataset without stripped parameters does not crash the cell
acc_pcnt = len(strip_correct) / total_stripvars * 100 if total_stripvars else float('nan')
print(f'Ghidra stripped function parameter recovery accuracy = {acc_pcnt:.2f}%')

# but how much was because the variables were wrong?
# (kept as named values for inspection; they are not printed)
num_strip_only = int((strip_is_valid & dwarf_is_na).sum())
num_dwarf_only = int((strip_is_na & dwarf_is_valid).sum())

# Overall counts for the summary below
num_vars = len(params_df)
num_stripvars = total_stripvars
num_dwarfvars = int(dwarf_is_valid.sum())
num_true_stripvars = int((strip_is_valid & dwarf_is_valid).sum())

print(f'There are {num_vars:,} parameters (and return types)')
print(f'{num_stripvars:,} of these are parameters from the stripped binary')
print(f'{num_dwarfvars:,} of these are (true) parameters from DWARF debug info')
print(f'{num_true_stripvars:,} of these stripped/DWARF variables intersect')

55556
Ghidra stripped function parameter recovery accuracy = 11.63%
There are 91,665 parameters (and return types)
55,556 of these are parameters from the stripped binary
51,774 of these are (true) parameters from DWARF debug info
16,538 of these stripped/DWARF variables intersect


In [None]:
# Non-null counts per column for rows with a stripped entry,
# split by the IsReturnType flag
strip_rows = params_df.loc[strip_is_valid]
strip_rows.groupby('IsReturnType').count()

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,37666,35905,35905,37666,36950,37666,35905,37666,37666,37666,145,145,145,145,37666
True,17890,0,17824,0,0,0,17824,0,17890,17890,0,16393,16393,16393,17890


In [None]:
# Non-null counts per column for rows with a DWARF entry,
# split by the IsReturnType flag
dwarf_rows = params_df.loc[dwarf_is_valid]
dwarf_rows.groupby('IsReturnType').count()

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,169,168,168,169,0,169,168,145,145,145,169,169,169,169,169
True,16394,0,16352,0,0,0,16352,0,16393,16393,0,16394,16394,16394,16394


In [None]:
# Rows that have no DWARF type information.
# (A preceding `params_df[params_df.IsReturnType]` expression was dropped: only the
# last expression of a cell is displayed, so its result was computed and discarded.)
params_df[params_df.TypeCategory_DWARF.isna()]

Unnamed: 0,FunctionStart,Name_Debug,IsReturnType,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
0,1057425,__bsx,False,uint32_t,reg,edi,0.0,BUILTIN,param_1,uint32_t,BUILTIN,,,,,0
1,1065600,param_1,False,EVP_PKEY_CTX*,reg,edi,0.0,POINTER,param_1,uint32_t,BUILTIN,,,,,0
2,1065600,param_2,False,uint64_t,reg,esi,0.0,BUILTIN,param_2,uint64_t,BUILTIN,,,,,0
3,1065600,param_3,False,uint64_t,reg,edx,0.0,BUILTIN,param_3,uint64_t,BUILTIN,,,,,0
4,1065720,ctx,False,EVP_PKEY_CTX*,reg,edi,0.0,POINTER,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91548,1092644,,True,int32_t,,,,BUILTIN,,void,BUILTIN,,,,,116
91549,1069472,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,116
91554,1092624,,True,int32_t,,,,BUILTIN,,void,BUILTIN,,,,,116
91580,1069520,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,116
