In [82]:
import os
import pandas as pd
from pathlib import Path
from rich.console import Console
from rich.table import Table

# Execute this notebook headlessly using something like:
#
# EXP_FOLDER=~/test_builds/coreutils.exp/ jupyter nbconvert --to html --execute characterize_dataset.ipynb --no-input
#

# Fallback experiment folder, used only when the EXP_FOLDER env var is unset/empty.
# The commented-out alternatives are kept for quick manual switching.
# MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'coreutils.exp'
# MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'astera4.exp'
# MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'astera_Og.exp'
# MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'astera_O1.exp'
MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'astera3.exp'

# take the env-var-specified experiment if present
# - an empty value falls back to the manual folder instead of becoming Path('.')
# - expanduser() resolves a leading '~' that reached us without shell expansion
_env_exp_folder = os.environ.get('EXP_FOLDER')
EXP_FOLDER = Path(_env_exp_folder).expanduser() if _env_exp_folder else MANUAL_EXP_FOLDER

In [42]:
# list the experiment folders available under ~/test_builds (path is hard-coded, independent of EXP_FOLDER)
!ls ~/test_builds

astera2.exp	       c.exp		       ffmpeg.old.exp
astera3.exp	       copy.exp		       funcproto-eval.exp
astera4.exp	       coreutils.exp	       SECOND_source_ast.exp
astera.exp	       cpp_test.exp	       source_ast.exp
astera_instrument.exp  d2		       systemv.exp
astera_O1.exp	       demo.exp		       test.exp
astera_Og.exp	       docker_test.exp	       timing.exp
astera.old.exp	       _dwarflines_astera.exp
basic-dataset.exp      ffmpeg.exp


In [43]:
# list the output files of one binary folder inside run1 (path is hard-coded to astera3.exp, independent of EXP_FOLDER)
!ls ~/test_builds/astera3.exp/rundata/run1/0.fighter

0.fighter	 fighter.debug		    ghidra_ast.debug.json
0.fighter.debug  function_params.csv	    ghidra_ast.json
ast_dumps	 function_params.stats.csv  locals.csv
fighter		 functions.csv		    locals.stats.csv


In [44]:
#!head ~/test_builds/astera3.exp/rundata/run1/locals.csv

EXP_NAME = EXP_FOLDER.stem
print(f'Experiment: {EXP_NAME}')

# count only run *directories* so stray files in rundata/ are not mistaken for runs
run_dirs = [entry for entry in (EXP_FOLDER/'rundata').iterdir() if entry.is_dir()]
num_runs = len(run_dirs)

# assume 1 run for now, verify this
if num_runs > 1:
    raise Exception(f'More than 1 run - {num_runs} found')

# everything below reads from rundata/run1; fail here with a clear message
# instead of a FileNotFoundError from read_csv further down
run_folder = EXP_FOLDER/'rundata'/'run1'
if not run_folder.is_dir():
    raise Exception(f'Run folder not found: {run_folder}')

# per-run CSV inputs consumed by the rest of the notebook
binaries_csv = run_folder/'binaries.csv'
locals_csv = run_folder/'locals.csv'
funcs_csv = run_folder/'functions.csv'
params_csv = run_folder/'function_params.csv'

Experiment: astera3


In [45]:
# TODO:
# 1. Rename IsReturnType_Debug/Strip to just IsReturnType (consolidate or drop one)
# 2. LEAVE duplicate functions for now...
# 3. Consider labeling all other stripped binary vars as <OTHER> (like DIRTY <Component>)
#    to see if we can reliably predict which variables don't seem to be "true source vars"

In [49]:
# read in dataframes
binaries_df = pd.read_csv(binaries_csv)
locals_df = pd.read_csv(locals_csv)
funcs_df = pd.read_csv(funcs_csv)
params_df = pd.read_csv(params_csv)

# sanity check: do the debug and stripped builds agree on which rows are return types?
# (previously a bare expression in the middle of the cell, so its result was never shown)
rtypes_agree = (params_df.IsReturnType_Debug == params_df.IsReturnType_Strip).all()
print(f'IsReturnType_Debug == IsReturnType_Strip for every row: {rtypes_agree}')

# last expression -> rich (truncated) display of the params table
params_df

Unnamed: 0,FunctionStart,Name_Debug,Signature,IsReturnType_Debug,Type_Debug,LocType_Debug,LocRegName_Debug,LocOffset_Debug,TypeCategory_Debug,Name_Strip,IsReturnType_Strip,Type_Strip,LocType_Strip,LocRegName_Strip,LocOffset_Strip,TypeCategory_Strip,BinaryId
0,1322409,r,90,False,float*,reg,rdi,0.0,POINTER,param_1,False,int64_t,reg,rdi,0.0,BUILTIN,0
1,1322409,a,42,False,float*,reg,rsi,0.0,POINTER,param_2,False,int64_t,reg,rsi,0.0,BUILTIN,0
2,1322409,b,66,False,float*,reg,rdx,0.0,POINTER,param_3,False,int64_t,reg,rdx,0.0,BUILTIN,0
3,1322524,r,90,False,float*,reg,rdi,0.0,POINTER,param_1,False,int64_t,reg,rdi,0.0,BUILTIN,0
4,1322524,a,42,False,float*,reg,rsi,0.0,POINTER,param_2,False,int64_t,reg,rsi,0.0,BUILTIN,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23018,1072670,,0,True,uint8_t,,,,BUILTIN,,True,int32_t,,,,BUILTIN,7
23019,1073664,,0,True,void,,,,BUILTIN,,True,void,,,,BUILTIN,7
23020,1073776,,0,True,void,,,,BUILTIN,,True,void,,,,BUILTIN,7
23021,1073784,,0,True,int32_t,,,,BUILTIN,,True,void,,,,BUILTIN,7


In [53]:

# params_df['IsReturnType'] = params_df.IsReturnType.convert_dtypes(convert_boolean=True)
# params_df.IsReturnType_Debug

# example of how to join with binary name based on ID
# dd = pd.DataFrame({'BinaryId': [0,0,0,3,4,0,1,0,1], 'Number': list(range(9))})
# mm = dd.merge(binaries_df, on='BinaryId', how='left')

# Dataset Composition
What is the basic makeup of this dataset in terms of its general size (binaries, functions, variables) and the number and variety of data types?

## Quick totals

In [89]:
num_binaries = len(binaries_df.BinaryId.unique())

### exes/shared objects
# a binary counts as a shared object when its Name ends with '.so';
# na=False treats a missing Name as "not a shared object" instead of raising
binaries_df['IsSharedObject'] = binaries_df.Name.str.endswith('.so', na=False)

# one row per IsSharedObject value (False/True) holding the number of binaries of that kind
bins_by_type = binaries_df.groupby('IsSharedObject').count()[['Name']].rename(columns={'Name': 'IsSO'})
sobjs = bins_by_type.IsSO[bins_by_type.IsSO.index==True]
exes = bins_by_type.IsSO[bins_by_type.IsSO.index==False]

# .iloc[0] is explicit positional access; a plain [0] on this bool-indexed Series
# depends on pandas' deprecated integer-key positional fallback
num_exes = 0 if exes.empty else exes.iloc[0]
num_sharedobjs = 0 if sobjs.empty else sobjs.iloc[0]

In [227]:
def _pct(part, whole):
    """Return part/whole formatted as a percentage (e.g. '12.3%'), or '-' when whole is 0."""
    return f'{part/whole*100:.1f}%' if whole else '-'


table = Table(title=f"{EXP_NAME.capitalize()} Quick Totals")

table.add_column("Metric", justify="right", style="dodger_blue1", no_wrap=True)
table.add_column("Value", justify='right')# style="green")
table.add_column("%", justify="right")#, style="green3")

num_funcs = len(funcs_df)
num_unique_funcs = len(funcs_df.FunctionName_DWARF.unique())

# params_df holds both parameters and return types; IsReturnType_Debug tells them apart
num_params_and_rtypes = len(params_df)
num_rtypes = len(params_df[params_df.IsReturnType_Debug])
num_params = len(params_df[~params_df.IsReturnType_Debug])

# to close a style use [/], e.g: '[bold] xyz [/]'
data_style = 'bold light_green'
data_style2 = 'pale_green3'
func_color = 'bright_magenta'
table.add_row(f'[{data_style}]# Locals', f'[bold]{len(locals_df):,}', '-')
table.add_row(f'[{data_style}]# Params + return types', f'[bold]{num_params_and_rtypes:,}', '-')
table.add_row(f'[{data_style}]# Globals', f'[bold]{0:,}', '-not implemented-')
# percentages go through _pct so an empty dataset renders '-' instead of raising ZeroDivisionError
table.add_row(f'[{data_style2}]# Params', f'{num_params:,}', _pct(num_params, num_params_and_rtypes))
table.add_row(f'[{data_style2}]# Return types', f'{num_rtypes:,}', _pct(num_rtypes, num_params_and_rtypes))
table.add_row('# Binaries', f'{num_binaries:,}', '-')
table.add_row('# Exes', f'{num_exes:,}', _pct(num_exes, num_binaries))
table.add_row('# Shared objects', f'{num_sharedobjs:,}', _pct(num_sharedobjs, num_binaries))
table.add_row(f'[{func_color}]# Functions', f'{num_funcs:,}')
table.add_row(f'[{func_color}]# Unique Functions', f'{num_unique_funcs:,}', _pct(num_unique_funcs, num_funcs))

console = Console()
console.print(table)

# Number of params per function
# per_func_param_stats = params_df.loc[~params_df.IsReturnType_Debug,:].groupby(['BinaryId','FunctionStart']).count().Signature.describe()
# per_func_param_stats

# return-type rows joined onto every function that has a debug-build name;
# how='right' keeps all rows of valid_debug_funcs even when no return-type row matches
rt_df = params_df.loc[params_df.IsReturnType_Debug,:]
valid_debug_funcs = funcs_df.loc[~funcs_df.FunctionName_Debug.isna(),:]
# df.groupby('FunctionStart').count().sort_values('Signature')
# df[df.FunctionStart==1897569]
df = rt_df.merge(valid_debug_funcs, on=['BinaryId','FunctionStart'], how='right')

# TODO: calculate % yield from the .stats.csv files

In [135]:
# show every funcs_df row whose debug-build function name is c_ray_vs_circle
funcs_df.loc[funcs_df.FunctionName_Debug.eq('c_ray_vs_circle')]

Unnamed: 0,FunctionStart,FunctionName_Debug,FunctionName_Strip,FunctionName_DWARF,BinaryId
308,1477527,c_ray_vs_circle,,c_ray_vs_circle,0
6708,1322975,c_ray_vs_circle,,c_ray_vs_circle,6


In [149]:
# number of funcs_df rows with a missing FunctionName_Debug
int(funcs_df.FunctionName_Debug.isna().sum())

52

In [None]:
# OLD METRICS HERE - in case we want to bring it back
# ----------------
# num_main_funcs = func_names.Count[func_names.Count.index=='main'][0]
# # subtract 1 because the set of unique functions already counts 1 main function...
# # so we just add the duplicates
# dup_main_funcs = num_main_funcs - 1
# unique_plus_main_funcs = num_unique_funcs + dup_main_funcs

# table.add_row(f'[{func_color}]# main() Functions', f'{num_main_funcs:,}', f'{num_main_funcs/num_funcs*100:.1f}%')
# table.add_row(f'[{func_color}]# Unique + main() Functions (YIELD)', f'[{func_color}]{unique_plus_main_funcs:,}',
#               f'[{func_color}]{unique_plus_main_funcs/num_funcs*100:.1f}%')

## Stripped functions sanity check
This should be a short list with no *"real"* function names (just runtime helper functions like `_DT_FINI`)

I'm grouping all stripped function names that do **NOT** start with `'FUN_'` (for all non-NaN stripped functions).

In [None]:
# keep only the rows that actually have a stripped-binary function name
strip_funcs = funcs_df.loc[funcs_df.FunctionName_Strip.notna()]
# True where the stripped name starts with 'FUN_' (an empty name counts as False)
has_fun_prefix = strip_funcs.FunctionName_Strip.apply(lambda name: bool(name) and name.startswith('FUN_'))
# group the remaining (non-'FUN_') names and count the rows behind each one
strip_funcs.loc[~has_fun_prefix].groupby('FunctionName_Strip').count()

# Binaries & Functions
How many binaries and functions do we have?
Do we need to filter out any duplicate functions?

In [None]:
# distinct BinaryId values that appear in locals_df
# (this rebinds num_binaries, which an earlier cell computed from binaries_df)
num_binaries = len(pd.unique(locals_df.BinaryId))


# NOTE: duplicates are hard to determine from the locals table alone. With a
# table of functions (1 row per function) it is easy to groupby and count
# any duplicates across binaries.

In [None]:
# split locals_df by which source supplies a (non-NaN) variable name
true_vars = locals_df.loc[locals_df.Name_DWARF.notna()]
strip_vars = locals_df.loc[locals_df.Name_Strip.notna()]
debug_vars = locals_df.loc[locals_df.Name_Debug.notna()]

# report the size of each subset (true, debug, strip -- in that order)
for label, subset in (('true', true_vars), ('debug', debug_vars), ('strip', strip_vars)):
    print(f'# {label} locals = {len(subset):,}')

In [None]:
# true (DWARF) locals whose type category is BUILTIN
true_builtins = true_vars.loc[true_vars.TypeCategory_DWARF.eq('BUILTIN')]
has_strip_category = true_builtins.TypeCategory_Strip.notna()
# true_builtins[true_builtins.TypeCategory_Strip=='BUILTIN']

# scratch expressions: evaluated but not displayed (only the last line of a cell is shown)
len(true_builtins.loc[~has_strip_category])
len(true_builtins)

# ...of those, the ones that also have a stripped-binary type category
truebuiltins_with_stripvar = true_builtins.loc[has_strip_category]
num_truebuiltins_with_stripvar = len(truebuiltins_with_stripvar)
# true_builtins.groupby('TypeCategory_Strip').count().FunctionStart/num_truebuiltins_with_stripvar*100

# ...and of those, the ones whose stripped type category is BUILTIN as well
tb_with_svbuiltin = truebuiltins_with_stripvar.loc[truebuiltins_with_stripvar.TypeCategory_Strip.eq('BUILTIN')]

# builtin-vs-builtin rows where the stripped type differs from the DWARF type
types_differ = tb_with_svbuiltin.Type_Strip.ne(tb_with_svbuiltin.Type_DWARF)
mismatches = tb_with_svbuiltin.loc[types_differ, ['Type_Strip','Type_DWARF']]

# more scratch expressions (not displayed)
funcs_df[funcs_df.FunctionStart==1165544]
len(tb_with_svbuiltin.loc[~types_differ])/len(tb_with_svbuiltin)*100
len(tb_with_svbuiltin)

# displayed result: builtin rows whose DWARF type is UNMAPPED_FLOAT_16
tb_with_svbuiltin.loc[tb_with_svbuiltin.Type_DWARF.eq('UNMAPPED_FLOAT_16')]

## Function Prototypes

The initial questions I have are not even about the data types yet, just basic parameter recovery:

1. How common is it for Ghidra to **miss parameters**?
2. How common is it for Ghidra to recover **extra parameters**?

Breaking that down...

1. For how many functions did Ghidra recover the correct number of parameters?
    - *Whether or not they were at the correct location?*
2. For how many functions did Ghidra fail to recover one or more parameters?
3. How many parameters did Ghidra fail to recover on average?
    - Overall, including 0's for functions where Ghidra didn't miss any parameters?
    - Only across the set of functions where Ghidra missed 1 or more parameters?
4. For how many functions did Ghidra recover extra parameters?

In [None]:
# validate that IsReturnType is never N/A
# params_df may carry a single IsReturnType column or per-build
# IsReturnType_Debug / IsReturnType_Strip columns; check whichever are present
# rather than failing with an AttributeError on a column that does not exist
rtype_cols = [col for col in params_df.columns
              if col == 'IsReturnType' or str(col).startswith('IsReturnType_')]
if not rtype_cols:
    raise Exception('FAILED VALIDATION: params_df has no IsReturnType column')
if params_df[rtype_cols].isna().any().any():
    raise Exception('FAILED VALIDATION: params_df has IsReturnType entries that are N/A')

In [None]:
# NOTE(review): this cell reads TypeCategory_DWARF and IsReturnType from params_df, while the
# params_df preview earlier in this notebook shows only *_Debug / *_Strip columns -- confirm
# the cell matches the current function_params.csv schema before relying on it.

# scratch expressions (not displayed)
len(params_df)
len(params_df[params_df.TypeCategory_DWARF.notna()])
# params_df[params_df.Type_DWARF==params_df.Type_Strip]

# filter out cases where debug build recovered a fake variable (e.g. there is no true variable (DWARF) or stripped variable)
no_dwarf = params_df.TypeCategory_DWARF.isna()
no_strip = params_df.TypeCategory_Strip.isna()
only_debug = params_df[no_dwarf & no_strip]
num_all = len(params_df)
num_only_debug = len(only_debug)
print(f'# total parameters: {num_all:,}')
print(f'{num_only_debug:,} parameters are only in the debug build (not DWARF or stripped)')

# each entry in this DF is either a true param, a stripped param, or both
strip_params = params_df.loc[params_df.index.difference(only_debug.index)]
num_remaining = len(strip_params)
print(f'{num_remaining:,} params remaining after removing debug-only params')
print(f'Sanity check (should be 0) -> {num_all:,} - {num_only_debug:,} - {num_remaining:,} = {num_all-num_only_debug-num_remaining:,}')

# remove return types?
is_rtype = strip_params.IsReturnType
len(strip_params[is_rtype])
strip_params.loc[~is_rtype]
# strip_params.groupby('IsReturnType').count().FunctionStart.sum()
is_rtype.isna().any()
# displayed result: rows whose IsReturnType flag is missing
strip_params[is_rtype.isna()]

In [None]:
# display the binaries table for reference while inspecting BinaryId values in the cells below
binaries_df

In [None]:
# FunctionStart value under inspection; print it in hex
func_start = 1322409
print(f'{func_start:x}')

# BinaryId 0 params that have no stripped type, counted per function (evaluated, not displayed)
missing_strip_bin0 = strip_params[strip_params.Type_Strip.isna() & strip_params.BinaryId.eq(0)]
missing_strip_bin0.groupby('FunctionStart').count()

# displayed result: all strip_params rows for the function under inspection
strip_params[strip_params.FunctionStart == func_start]

In [None]:
# funcs_df rows (across all binaries) whose FunctionStart is 1322409
funcs_df.loc[funcs_df.FunctionStart.eq(1322409)]

In [None]:

# select the rows for FunctionStart 0x18c450, then hide the debug columns to make the view more readable
strip_params.loc[strip_params.FunctionStart == 0x18c450].drop(
    columns=['Name_Debug', 'Type_Debug', 'TypeCategory_Debug']
)

In [None]:
# NOTE(review): LocType / LocOffset / TypeCategory_DWARF are read from params_df here, but the
# params_df preview earlier in this notebook shows LocType_Debug / LocType_Strip style columns --
# confirm against the current CSV schema.

# scratch lookup (evaluated, not displayed)
funcs_df[funcs_df.FunctionStart==1577959]
#.sort_values('LocOffset')

# stack-located params that have a DWARF type category
on_stack = params_df.LocType.eq('stack')
dwarf_stack_params = params_df[on_stack & params_df.TypeCategory_DWARF.notna()]
# ...restricted to negative stack offsets
df = dwarf_stack_params[dwarf_stack_params.LocOffset.lt(0)]

# per-function row counts for binary 0, sorted by the LocType count (both evaluated, not displayed)
neg_offset_bin0 = df[df.BinaryId==0]
per_func_counts = neg_offset_bin0.groupby('FunctionStart').count().sort_values('LocType')
per_func_counts.reset_index().FunctionStart.apply(hex)
per_func_counts

# fraction of binary 0's params (not displayed) and of its DWARF stack params (displayed)
len(neg_offset_bin0)/len(params_df[params_df.BinaryId==0])
len(neg_offset_bin0)/len(dwarf_stack_params[dwarf_stack_params.BinaryId==0])


In [None]:

# NOTE(review): this cell reads TypeCategory_DWARF / Type_DWARF from params_df, which do not appear
# in the params_df preview earlier in this notebook -- confirm against the current CSV schema.

# validity masks, reused by the cells below
num_vars = len(params_df)
dwarf_is_na = params_df.TypeCategory_DWARF.isna()
dwarf_is_valid = ~dwarf_is_na
types_match = params_df.Type_DWARF == params_df.Type_Strip
strip_is_na = params_df.TypeCategory_Strip.isna()
strip_is_valid = ~strip_is_na

# if we don't ensure TypeCategory_DWARF is valid, we get several more hits because they are both
# NaN (due to Debug being the only valid column)
strip_correct = params_df[dwarf_is_valid & types_match]
strip_fail = params_df[strip_is_valid & ~types_match]

# scratch expressions (not displayed)
len(strip_correct)
strip_correct.groupby('TypeCategory_DWARF').count().FunctionStart

print(len(strip_fail) + len(strip_correct))
total_stripvars = len(params_df[strip_is_valid])

# share of stripped params whose type string equals the DWARF type string
acc_pcnt = len(strip_correct)/total_stripvars*100
print(f'Ghidra stripped function parameter recovery accuracy = {acc_pcnt:.2f}%')

# but how much was because the variables were wrong?
len(params_df[strip_is_valid & dwarf_is_na])
len(params_df[strip_is_na & dwarf_is_valid])
# len(params_df[(~params_df.TypeCategory_Debug.isna()) & (params_df.TypeCategory_DWARF.isna())])

from rich.console import Console
console = Console()

# summary counts
num_stripvars = total_stripvars
num_dwarfvars = len(params_df[dwarf_is_valid])
num_true_stripvars = len(params_df[strip_is_valid & dwarf_is_valid])

print(f'There are {num_vars:,} parameters (and return types)')
print(f'{num_stripvars:,} of these are parameters from the stripped binary')
print(f'{num_dwarfvars:,} of these are (true) parameters from DWARF debug info')
print(f'{num_true_stripvars:,} of these stripped/DWARF variables intersect')

In [None]:
# per-column non-null counts of the stripped-valid rows, split by the IsReturnType flag
params_df.loc[strip_is_valid].groupby('IsReturnType').count()

In [None]:
# per-column non-null counts of the DWARF-valid rows, split by the IsReturnType flag
params_df.loc[dwarf_is_valid].groupby('IsReturnType').count()

In [None]:
# return-type rows (evaluated, not displayed)
params_df.loc[params_df.IsReturnType]
# displayed result: rows that have no DWARF type category
params_df.loc[params_df.TypeCategory_DWARF.isna()]