In [10]:
from pathlib import Path
import os

# Fallback experiment folder, used when the EXP_FOLDER environment variable is not set.
MANUAL_EXP_FOLDER = Path.home() / 'test_builds' / 'astera3.exp'

# An EXP_FOLDER environment variable, when present, takes precedence over the fallback.
EXP_FOLDER = Path(os.environ.get('EXP_FOLDER', MANUAL_EXP_FOLDER))
EXP_FOLDER

True

In [129]:
from pathlib import Path
import pandas as pd

#!head ~/test_builds/astera3.exp/rundata/run1/locals.csv

EXP_NAME = 'astera3.exp'
# NOTE(review): this rebinds EXP_FOLDER to a fixed location under the home directory,
# replacing any value assigned earlier in the notebook (including one taken from the
# EXP_FOLDER environment variable) - confirm that this override is intended.
EXP_FOLDER = Path.home()/'test_builds'/EXP_NAME

rundata_folder = EXP_FOLDER/'rundata'

# Count only directories as runs, so a stray file inside rundata/ is not mistaken
# for an additional run.
num_runs = sum(1 for entry in rundata_folder.iterdir() if entry.is_dir())

# assume 1 run for now, verify this
if num_runs > 1:
    raise Exception(f'More than 1 run - {num_runs} found')

# Paths of the CSV exports expected inside the (single) run folder.
run_folder = rundata_folder/'run1'
binaries_csv = run_folder/'binaries.csv'
locals_csv = run_folder/'locals.csv'
funcs_csv = run_folder/'functions.csv'
params_csv = run_folder/'function_params.csv'

# Show the plain files sitting directly inside the run folder.
[x for x in run_folder.iterdir() if not x.is_dir()]

[PosixPath('/home/cls0027/test_builds/astera3.exp/rundata/run1/locals.csv'),
 PosixPath('/home/cls0027/test_builds/astera3.exp/rundata/run1/functions.csv'),
 PosixPath('/home/cls0027/test_builds/astera3.exp/rundata/run1/flat_layout.csv'),
 PosixPath('/home/cls0027/test_builds/astera3.exp/rundata/run1/function_params.csv'),
 PosixPath('/home/cls0027/test_builds/astera3.exp/rundata/run1/binaries.csv')]

In [130]:
# Load the per-run CSV exports into DataFrames.
binaries_df = pd.read_csv(binaries_csv)
locals_df = pd.read_csv(locals_csv)

# Read the parameter table and let pandas pick the best nullable dtype for the
# IsReturnType column (boolean conversion enabled).
params_df = pd.read_csv(params_csv)
params_df = params_df.assign(
    IsReturnType=params_df['IsReturnType'].convert_dtypes(convert_boolean=True)
)

# example of how to join with binary name based on ID
# dd = pd.DataFrame({'BinaryId': [0,0,0,3,4,0,1,0,1], 'Number': list(range(9))})
# mm = dd.merge(binaries_df, on='BinaryId', how='left')

# Binaries & Functions
How many binaries and functions do we have?
Do we need to filter out any duplicate functions?

In [131]:
# NOTE: duplicates can't be determined as easily from this dataset. With a table of
# functions (1 row per function) it would be easy to do a groupby and count any
# duplicates across binaries.

# Number of distinct BinaryId values in the locals table
# (dropna=False keeps the count identical to len(unique())).
num_binaries = locals_df['BinaryId'].nunique(dropna=False)
print(f'There are {num_binaries} binaries in this dataset (that have local variables)')

There are 8 binaries in this dataset (that have local variables)


In [132]:
# Split the locals table by which name column is populated on each row.
true_vars = locals_df.dropna(subset=['Name_DWARF'])
strip_vars = locals_df.dropna(subset=['Name_Strip'])
debug_vars = locals_df.dropna(subset=['Name_Debug'])

# Report the size of each subset with thousands separators.
print(f'# true locals = {len(true_vars):,}')
print(f'# debug locals = {len(debug_vars):,}')
print(f'# strip locals = {len(strip_vars):,}')

# true locals = 21,789
# debug locals = 86,827
# strip locals = 63,564


In [133]:
# Number of rows in true_vars that also have a non-null Name_Strip.
len(true_vars.dropna(subset=['Name_Strip']))

11383

## Function Prototypes

In [134]:
# Validity masks over the parameter table, defined once and reused below: a row is
# treated as present in the stripped / DWARF view when the corresponding
# TypeCategory_* column is non-null.
strip_is_na = params_df.TypeCategory_Strip.isna()
dwarf_is_na = params_df.TypeCategory_DWARF.isna()
strip_is_valid = ~strip_is_na
dwarf_is_valid = ~dwarf_is_na

# params_df[params_df.Type_DWARF==params_df.Type_Strip]

# Rows whose stripped type string exactly matches the DWARF type string.
# Original author's note: without requiring TypeCategory_DWARF to be valid, several
# more hits appeared because both types are NaN (Debug being the only valid column).
# NOTE(review): pandas normally evaluates NaN == NaN as False - confirm whether the
# guard is still needed; it is kept here unchanged.
strip_correct = params_df[dwarf_is_valid & (params_df.Type_DWARF == params_df.Type_Strip)]

# Rows with a stripped entry whose type differs from the DWARF type.
# NOTE(review): NaN != NaN evaluates to True in pandas, so rows that have a stripped
# type but no DWARF type also land here (and in the accuracy denominator below).
strip_fail = params_df[strip_is_valid & (params_df.Type_DWARF != params_df.Type_Strip)]

# Per-category breakdown of the correct recoveries (not displayed; kept for inspection).
strip_correct_by_category = strip_correct.groupby('TypeCategory_DWARF').FunctionStart.count()

# Sanity check: correct + fail is expected to equal the number of stripped rows.
print(len(strip_fail) + len(strip_correct))
total_stripvars = int(strip_is_valid.sum())

# Guard the division so an empty parameter table yields NaN instead of raising.
acc_pcnt = len(strip_correct) / total_stripvars * 100 if total_stripvars else float('nan')
print(f'Ghidra stripped function parameter recovery accuracy = {acc_pcnt:.2f}%')

# but how much was because the variables were wrong?
# Rows present on only one side (not displayed; kept for inspection).
num_strip_only = int((strip_is_valid & dwarf_is_na).sum())
num_dwarf_only = int((strip_is_na & dwarf_is_valid).sum())
# len(params_df[(~params_df.TypeCategory_Debug.isna()) & (params_df.TypeCategory_DWARF.isna())])

# NOTE: `console` is not used within this cell.
from rich.console import Console
console = Console()

# Summary counts for the whole parameter table.
num_vars = len(params_df)
num_stripvars = total_stripvars
num_dwarfvars = int(dwarf_is_valid.sum())
num_true_stripvars = int((strip_is_valid & dwarf_is_valid).sum())

print(f'There are {num_vars:,} parameters (and return types)')
print(f'{num_stripvars:,} of these are parameters from the stripped binary')
print(f'{num_dwarfvars:,} of these are (true) parameters from DWARF debug info')
print(f'{num_true_stripvars:,} of these stripped/DWARF variables intersect')

29639
Ghidra stripped function parameter recovery accuracy = 15.94%
There are 50,856 parameters (and return types)
29,639 of these are parameters from the stripped binary
29,207 of these are (true) parameters from DWARF debug info
8,522 of these stripped/DWARF variables intersect


In [135]:
# Non-null counts per column for rows with a stripped entry, grouped by IsReturnType.
(
    params_df
    .loc[strip_is_valid]
    .groupby('IsReturnType')
    .count()
)

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,21380,20478,20478,21380,20914,21380,20478,21380,21380,21380,327,327,327,327,21380
True,8259,0,8208,0,0,0,8208,0,8259,8259,0,8195,8195,8195,8259


In [136]:
# Non-null counts per column for rows with a DWARF entry, grouped by IsReturnType.
(
    params_df
    .loc[dwarf_is_valid]
    .groupby('IsReturnType')
    .count()
)

Unnamed: 0_level_0,FunctionStart,Name_Debug,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
IsReturnType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
False,430,425,425,430,0,430,425,327,327,327,430,430,430,430,430
True,8195,0,8144,0,0,0,8144,0,8195,8195,0,8195,8195,8195,8195


In [137]:
# NOTE: only the final expression of a cell is rendered, so this return-type
# selection is evaluated but its result is not displayed.
params_df.loc[params_df.IsReturnType]
# Rows whose TypeCategory_DWARF is null.
params_df.loc[params_df.TypeCategory_DWARF.isna()]

Unnamed: 0,FunctionStart,Name_Debug,IsReturnType,Type_Debug,LocType,LocRegName,LocOffset,TypeCategory_Debug,Name_Strip,Type_Strip,TypeCategory_Strip,Name_DWARF,IsReturnType_DWARF,Type_DWARF,TypeCategory_DWARF,BinaryId
0,2009856,param_1,False,EVP_PKEY_CTX*,reg,edi,0.0,POINTER,param_1,uint32_t,BUILTIN,,,,,0
1,2009856,param_2,False,uint64_t,reg,esi,0.0,BUILTIN,param_2,uint64_t,BUILTIN,,,,,0
2,2009856,param_3,False,uint64_t,reg,edx,0.0,BUILTIN,param_3,uint64_t,BUILTIN,,,,,0
3,1369447,write,False,pak_write_t*,reg,edi,0.0,POINTER,param_1,int64_t,BUILTIN,,,,,0
4,1369447,name,False,int8_t*,reg,esi,0.0,POINTER,param_2,int8_t*,POINTER,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50803,1073812,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
50804,1073784,,True,int32_t,,,,BUILTIN,,void,BUILTIN,,,,,7
50806,1060624,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
50836,1060672,,True,void,,,,BUILTIN,,void,BUILTIN,,,,,7
