In [18]:
import os
import pandas as pd
from pathlib import Path

# Execute this notebook headlessly using something like:
#
# EXP_FOLDER=~/test_builds/coreutils.exp/ jupyter nbconvert --to html --execute characterize_dataset.ipynb --no-input
#

# Fallback experiment folder, used only when EXP_FOLDER is not set in the environment.
# MANUAL_EXP_FOLDER = Path.home()/'test_builds'/'coreutils.exp'
MANUAL_EXP_FOLDER = Path.home().joinpath('test_builds', 'astera3.exp')

# an experiment named through the EXP_FOLDER environment variable wins over the manual default
EXP_FOLDER = Path(os.environ.get('EXP_FOLDER', MANUAL_EXP_FOLDER))

In [19]:
# List the entries under ~/test_builds (shell command; its output is only displayed, not captured)
!ls ~/test_builds

astera2.exp	   c.exp	  docker_test.exp	  SECOND_source_ast.exp
astera3.exp	   copy.exp	  _dwarflines_astera.exp  source_ast.exp
astera.exp	   coreutils.exp  ffmpeg.exp		  test.exp
astera.old.exp	   d2		  ffmpeg.old.exp	  timing.exp
basic-dataset.exp  demo.exp	  funcproto-eval.exp


In [20]:
# The experiment name is the folder name without its extension
EXP_NAME = EXP_FOLDER.stem
print(f'Experiment: {EXP_NAME}')

# Every entry under <experiment>/rundata is counted as one run
rundata_folder = EXP_FOLDER/'rundata'
num_runs = len([entry for entry in rundata_folder.iterdir()])

# the rest of the notebook assumes a single run, so bail out if there are more
if num_runs > 1:
    raise Exception(f'More than 1 run - {num_runs} found')

# CSV files read from the (single) run folder
run_folder = rundata_folder/'run1'
binaries_csv, locals_csv, funcs_csv, params_csv = (
    run_folder/csv_name
    for csv_name in ('binaries.csv', 'locals.csv', 'functions.csv', 'function_params.csv')
)

Experiment: astera3


In [21]:
# Load each CSV of the run into its own dataframe
binaries_df, locals_df, funcs_df, params_df = (
    pd.read_csv(csv_path)
    for csv_path in (binaries_csv, locals_csv, funcs_csv, params_csv)
)

# let pandas infer a better dtype for IsReturnType (boolean conversion explicitly enabled)
params_df['IsReturnType'] = params_df['IsReturnType'].convert_dtypes(convert_boolean=True)

# example of how to join with binary name based on ID
# dd = pd.DataFrame({'BinaryId': [0,0,0,3,4,0,1,0,1], 'Number': list(range(9))})
# mm = dd.merge(binaries_df, on='BinaryId', how='left')

# Dataset Composition
What is the basic makeup of this dataset in terms of its general size (binaries, functions, variables) and the number and variety of data types?

In [22]:
from rich.console import Console
from rich.table import Table


def _pcnt(part, whole):
    """Format part/whole as a percentage with one decimal place ('-' when whole is 0)."""
    return f'{part/whole*100:.1f}%' if whole else '-'


num_binaries = len(binaries_df.BinaryId.unique())

### exes/shared objects
# a binary is treated as a shared object when its name ends with '.so'
binaries_df['IsSharedObject'] = binaries_df.Name.apply(lambda x: x.endswith('.so'))
bins_by_type = binaries_df.groupby('IsSharedObject').count()[['Name']].rename(columns={'Name': 'IsSO'})
sobjs = bins_by_type.IsSO[bins_by_type.IsSO.index==True]
exes = bins_by_type.IsSO[bins_by_type.IsSO.index==False]
# These Series are indexed by True/False labels, so take the single remaining
# element by position with .iloc (integer keys via [] rely on a deprecated fallback).
num_exes = 0 if exes.empty else exes.iloc[0]
num_sharedobjs = 0 if sobjs.empty else sobjs.iloc[0]

### duplicate funcs
# number of rows per DWARF function name (a Count > 1 means the name occurs more than once)
func_names = funcs_df.groupby('FunctionName_DWARF').count()[['BinaryId']].rename(columns={'BinaryId': 'Count'})

num_funcs = len(funcs_df)
num_unique_funcs = len(funcs_df.FunctionName_DWARF.unique())
# label lookup with a default, so a dataset without any main() yields 0 instead of raising
num_main_funcs = func_names.Count.get('main', 0)
# subtract 1 because the set of unique functions already counts 1 main function...
# so we just add the duplicates (never negative when there is no main() at all)
dup_main_funcs = max(num_main_funcs - 1, 0)
unique_plus_main_funcs = num_unique_funcs + dup_main_funcs

table = Table(title=f"{EXP_NAME.capitalize()} Overview")

table.add_column("Metric", justify="right", style="dodger_blue1", no_wrap=True)
table.add_column("Value")# style="green")
table.add_column("%", justify="right")#, style="green3")

# to close a style use [/], e.g: '[bold] xyz [/]'
func_color = 'bright_magenta'
table.add_row('# Binaries', f'{num_binaries:,}', '-')
table.add_row('# Exes', f'{num_exes:,}', _pcnt(num_exes, num_binaries))
table.add_row('# Shared objects', f'{num_sharedobjs:,}', _pcnt(num_sharedobjs, num_binaries))
table.add_row(f'[{func_color}]# Functions', f'{num_funcs:,}')
table.add_row(f'[{func_color}]# Unique Functions', f'{num_unique_funcs:,}',
              f'[green4]{_pcnt(num_unique_funcs, num_funcs)}')
table.add_row(f'[{func_color}]# main() Functions', f'{num_main_funcs:,}', _pcnt(num_main_funcs, num_funcs))
table.add_row(f'[{func_color}]# Unique + main() Functions (YIELD)', f'[{func_color}]{unique_plus_main_funcs:,}',
              f'[{func_color}]{_pcnt(unique_plus_main_funcs, num_funcs)}')

console = Console()
console.print(table)

## Stripped functions sanity check
This should be a short list with no *"real"* function names (just runtime helper functions like `_DT_FINI`)

I'm grouping all stripped function names that do **NOT** start with `'FUN_'` (for all non-NaN stripped functions).

In [23]:
# keep only the functions that have a (non-NaN) stripped name
strip_funcs = funcs_df[funcs_df['FunctionName_Strip'].notna()]

# flag stripped names carrying the 'FUN_' prefix (an empty name never counts as prefixed)
has_fun_prefix = strip_funcs['FunctionName_Strip'].apply(
    lambda name: bool(name) and name.startswith('FUN_')
)

# everything else is grouped by name and counted per column
strip_funcs[~has_fun_prefix].groupby('FunctionName_Strip').count()

Unnamed: 0_level_0,FunctionStart,FunctionName_Debug,FunctionName_DWARF,BinaryId
FunctionName_Strip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
_DT_FINI,8,8,0,8
_DT_INIT,8,8,0,8
_FINI_0,8,8,0,8
entry,8,8,0,8


# Binaries & Functions
How many binaries and functions do we have?
Do we need to filter out any duplicate functions?

In [24]:
# number of distinct BinaryId values that appear in the locals table
num_binaries = locals_df['BinaryId'].nunique(dropna=False)


# NOTE: we can't determine duplicates as easily from this dataset...if we have a table
# of functions (1 row per function) then it is easy to do a groupby and count any duplicates
# across binaries

In [25]:
# Split the locals by which name column is populated (a row may fall in several subsets)
true_vars = locals_df[locals_df['Name_DWARF'].notna()]
strip_vars = locals_df[locals_df['Name_Strip'].notna()]
debug_vars = locals_df[locals_df['Name_Debug'].notna()]

# report the size of each subset, one line per subset
locals_summary = {'true': true_vars, 'debug': debug_vars, 'strip': strip_vars}
print('\n'.join(f'# {kind} locals = {len(subset):,}' for kind, subset in locals_summary.items()))

# true locals = 21,789
# debug locals = 86,827
# strip locals = 63,564


In [26]:
# true (DWARF-named) locals whose DWARF type category is BUILTIN
true_builtins = true_vars[true_vars.TypeCategory_DWARF=='BUILTIN']

# ...restricted to the rows that also carry a stripped type category
truebuiltins_with_stripvar = true_builtins[~true_builtins.TypeCategory_Strip.isna()]
num_truebuiltins_with_stripvar = len(truebuiltins_with_stripvar)

# ...restricted further to rows where the stripped type category is BUILTIN as well
tb_with_svbuiltin = truebuiltins_with_stripvar[truebuiltins_with_stripvar.TypeCategory_Strip=='BUILTIN']

# builtin/builtin rows whose stripped and DWARF type names differ
mismatches = tb_with_svbuiltin[tb_with_svbuiltin.Type_Strip!=tb_with_svbuiltin.Type_DWARF][['Type_Strip','Type_DWARF']]

# displayed result: builtin/builtin rows whose DWARF type is UNMAPPED_FLOAT_16
tb_with_svbuiltin[tb_with_svbuiltin.Type_DWARF=='UNMAPPED_FLOAT_16']

Unnamed: 0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,Type_DWARF,FunctionName,TypeCategory_DWARF,TrueDebugVar,TrueStripVar,Size_DWARF,Size_Debug,Size_Strip,BinaryId


## Function Prototypes

The initial questions I have are not even about the data types yet, just basic parameter recovery:

1. How common is it for Ghidra to **miss parameters**?
2. How common is it for Ghidra to recover **extra parameters**?

Breaking that down...

1. For how many functions did Ghidra recover the correct number of parameters?
    - *Whether or not they were at the correct location?*
2. For how many functions did Ghidra fail to recover one or more parameters?
3. How many parameters did Ghidra fail to recover on average?
    - Overall, including 0's for functions where Ghidra didn't miss any parameters?
    - Only across the set of functions where Ghidra missed 1 or more parameters?
4. For how many functions did Ghidra recover extra parameters?

In [96]:
# IsReturnType must be populated on every row of params_df; abort if any entry is missing
missing_return_flags = params_df['IsReturnType'].isna()
if missing_return_flags.any():
    raise Exception('FAILED VALIDATION: params_df has IsReturnType entries that are N/A')

Exception: FAILED VALIDATION: params_df has IsReturnType entries that are N/A

In [90]:
# filter out cases where debug build recovered a fake variable (e.g. there is no true variable (DWARF) or stripped variable)
only_debug = params_df[(params_df.TypeCategory_DWARF.isna()) & (params_df.TypeCategory_Strip.isna())]
print(f'# total parameters: {len(params_df):,}')
print(f'{len(only_debug):,} parameters are only in the debug build (not DWARF or stripped)')

# each entry in this DF is either a true param, a stripped param, or both
strip_params = params_df.loc[params_df.index.difference(only_debug.index)]
print(f'{len(strip_params):,} params remaining after removing debug-only params')
print(f'Sanity check (should be 0) -> {len(params_df):,} - {len(only_debug):,} - {len(strip_params):,} = {len(params_df)-len(only_debug)-len(strip_params):,}')

# displayed result: the remaining rows whose IsReturnType is missing
strip_params[strip_params.IsReturnType.isna()]

# total parameters: 50,856
532 parameters are only in the debug build (not DWARF or stripped)
50,324 params remaining after removing debug-only params
Sanity check (should be 0) -> 50,856 - 532 - 50,324 = 0


Unnamed: 0,FunctionStart,Name_Debug,IsReturnType,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
4813,1366263,,,,stack,,-16.0,,,,,delta,False,double,BUILTIN,0
4814,1365335,,,,stack,,-80.0,,,,,delta,False,double,BUILTIN,0
4815,1364234,,,,stack,,-16.0,,,,,center,False,float*,POINTER,0
4816,1364234,,,,stack,,-24.0,,,,,size,False,float*,POINTER,0
4817,1364234,,,,stack,,-32.0,,,,,color,False,float*,POINTER,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50793,1071996,,,,stack,,-12.0,,,,,timestamp,False,uint8_t,BUILTIN,7
50794,1071963,,,,stack,,-16.0,,,,,fp,False,int8_t*,POINTER,7
50795,1071903,,,,stack,,-12.0,,,,,log,False,uint8_t,BUILTIN,7
50796,1071903,,,,stack,,-24.0,,,,,fp,False,int8_t*,POINTER,7


In [79]:
# FunctionStart value of one function picked for manual inspection
EXAMPLE_FUNCTION_START = 1071903

# displayed result: every remaining parameter/return-type row recorded for that function
strip_params[strip_params.FunctionStart==EXAMPLE_FUNCTION_START]

Unnamed: 0,FunctionStart,Name_Debug,IsReturnType,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
50651,1071903,log,False,uint8_t,reg,edi,0.0,BUILTIN,param_1,int8_t,BUILTIN,,,,,7
50652,1071903,fp,False,int8_t*,reg,esi,0.0,POINTER,param_2,int64_t,BUILTIN,,,,,7
50795,1071903,,,,stack,,-12.0,,,,,log,False,uint8_t,BUILTIN,7
50796,1071903,,,,stack,,-24.0,,,,,fp,False,int8_t*,POINTER,7
50832,1071903,,True,void,,,,BUILTIN,,void,BUILTIN,,True,void,BUILTIN,7


In [28]:

# Presence masks over params_df: does a row carry a stripped entry and/or a DWARF entry?
strip_is_na = params_df.TypeCategory_Strip.isna()
dwarf_is_na = params_df.TypeCategory_DWARF.isna()
strip_is_valid = ~strip_is_na
dwarf_is_valid = ~dwarf_is_na

# if we don't ensure TypeCategory_DWARF is valid, we get several more hits because they are both
# NaN (due to Debug being the only valid column)
strip_correct = params_df[dwarf_is_valid & (params_df.Type_DWARF==params_df.Type_Strip)]
strip_fail = params_df[strip_is_valid & (params_df.Type_DWARF!=params_df.Type_Strip)]

num_vars = len(params_df)
num_stripvars = int(strip_is_valid.sum())
num_dwarfvars = int(dwarf_is_valid.sum())
num_true_stripvars = int((strip_is_valid & dwarf_is_valid).sum())
# same quantity as num_stripvars; both names are kept for any cell that refers to either
total_stripvars = num_stripvars

# correct + failed rows; expected to add up to the number of stripped rows
print(len(strip_fail) + len(strip_correct))

# share of stripped rows whose type matches the DWARF type (0 when there are no stripped rows)
acc_pcnt = len(strip_correct)/total_stripvars*100 if total_stripvars else 0.0
print(f'Ghidra stripped function parameter recovery accuracy = {acc_pcnt:.2f}%')

print(f'There are {num_vars:,} parameters (and return types)')
print(f'{num_stripvars:,} of these are parameters from the stripped binary')
print(f'{num_dwarfvars:,} of these are (true) parameters from DWARF debug info')
print(f'{num_true_stripvars:,} of these stripped/DWARF variables intersect')

29639
Ghidra stripped function parameter recovery accuracy = 15.94%
There are 50,856 parameters (and return types)
29,639 of these are parameters from the stripped binary
29,207 of these are (true) parameters from DWARF debug info
8,522 of these stripped/DWARF variables intersect


In [29]:
# non-null counts per column, split by IsReturnType, for rows that have a stripped entry
params_df.loc[strip_is_valid].groupby(by='IsReturnType').count()

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,21380,20478,20478,21380,20914,21380,20478,21380,21380,21380,327,327,327,327,21380
True,8259,0,8208,0,0,0,8208,0,8259,8259,0,8195,8195,8195,8259


In [30]:
# non-null counts per column, split by IsReturnType, for rows that have a DWARF entry
params_df.loc[dwarf_is_valid].groupby(by='IsReturnType').count()

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,430,425,425,430,0,430,425,327,327,327,430,430,430,430,430
True,8195,0,8144,0,0,0,8144,0,8195,8195,0,8195,8195,8195,8195


In [31]:
# displayed result: rows of params_df that have no DWARF type category
params_df[params_df.TypeCategory_DWARF.isna()]

Unnamed: 0,FunctionStart,Name_Debug,IsReturnType,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
0,2009856,param_1,False,EVP_PKEY_CTX*,reg,edi,0.0,POINTER,param_1,uint32_t,BUILTIN,,,,,0
1,2009856,param_2,False,uint64_t,reg,esi,0.0,BUILTIN,param_2,uint64_t,BUILTIN,,,,,0
2,2009856,param_3,False,uint64_t,reg,edx,0.0,BUILTIN,param_3,uint64_t,BUILTIN,,,,,0
3,1369447,write,False,pak_write_t*,reg,edi,0.0,POINTER,param_1,int64_t,BUILTIN,,,,,0
4,1369447,name,False,int8_t*,reg,esi,0.0,POINTER,param_2,int8_t*,POINTER,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50803,1073812,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
50804,1073784,,True,int32_t,,,,BUILTIN,,void,BUILTIN,,,,,7
50806,1060624,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
50836,1060672,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
