In [1]:
import setup # resolve path to 'src'
import numpy as np
import pandas as pd

from typing import Optional
from build_parse import *
from metrics import *

# Show every row when a DataFrame is displayed (None disables the row limit).
pd.set_option('display.max_rows', None)

In [2]:
# One Program object per coreutils program name, plus the matching list of names.
progs = [CoreutilsProgram(name) for name in COREUTILS_PROG_NAMES]
prognames = [p.get_name() for p in progs]

# Names of the programs singled out for closer analysis.
prognames_analyze = [
    "stat", "nohup", "pinky", "csplit", "ginstall", "fmt", "df", "join", "expr",
    "seq", "unexpand", "tsort", "tee", "base64", "sum", "cksum", "wc",
]

# Resolve each selected name to the first Program carrying that name;
# names without a match are skipped.
progs_analyze = []
for progname in prognames_analyze:
    found = next((candidate for candidate in progs if candidate.get_name() == progname), None)
    if found is not None:
        progs_analyze.append(found)

def prog_from_progname(progname: str) -> Program:
    """Return the first Program in `progs` whose name equals `progname`.

    Raises:
        ValueError: if no program in `progs` has that name. Failing here gives
            a clear message instead of handing None to the caller, which the
            `-> Program` annotation does not allow.
    """
    for prog in progs:
        if progname == prog.get_name():
            return prog
    raise ValueError("unknown program name: {!r}".format(progname))

# Build configurations under test. All use optimization level 0 and differ only
# in the debug / strip flags.
debug_opts = BuildOptions(optimization=0, debug=True, strip=False)
standard_opts = BuildOptions(optimization=0, debug=False, strip=False)
strip_opts = BuildOptions(optimization=0, debug=False, strip=True)

# Parallel tuples: the i-th key labels the i-th option set.
opts_sets = (strip_opts, standard_opts, debug_opts)
opts_sets_keys = ("strip", "standard", "debug")

# Parser callables, looked up by name.
dwarf_parser = get_parser("dwarf")
ghidra_parser = get_parser("ghidra")

# Compare levels and the column label used for each level.
varnode_compare_levels = list(VarnodeCompareLevel.range())
varnode_compare_level_labels = [
    f"Varnodes matched @ level={VarnodeCompareLevel.to_string(level)}"
    for level in VarnodeCompareLevel.range()
]

# Metatypes: primitives first, then complex types.
primitive_metatypes = [MetaType.INT, MetaType.FLOAT, MetaType.POINTER]
complex_metatypes = [MetaType.ARRAY, MetaType.STRUCT, MetaType.UNION]
metatypes = [*primitive_metatypes, *complex_metatypes]
metatype_labels = [MetaType.repr(mt) for mt in metatypes]

# Every program must report a valid build for every build-option variation.
for prog in progs:
    for opts in (debug_opts, standard_opts, strip_opts):
        assert prog.valid_build(opts)

In [3]:

# Filename scheme for cached (pickled) ProgramInfo objects.
def mangle_proginfo_save_name(parsername: str, prog: Program, opts: BuildOptions) -> str:
    """Return "<binary name>.<parsername>.pickle" for `prog` built with `opts`."""
    binary_name = prog.get_binary_name(opts)
    return f"{binary_name}.{parsername}.pickle"

def get_proginfo_save_path(parsername: str, prog: Program, opts: BuildOptions) -> Path:
    """Return the path under PICKLE_CACHE_DIR where the parsed ProgramInfo is cached."""
    filename = mangle_proginfo_save_name(parsername, prog, opts)
    return PICKLE_CACHE_DIR.joinpath(filename)

def save_proginfo(proginfo: ProgramInfo, parsername: str, prog: Program, opts: BuildOptions):
    """Pickle `proginfo` to its cache path for (parsername, prog, opts)."""
    destination = get_proginfo_save_path(parsername, prog, opts)
    save_pickle(proginfo, destination)

def load_proginfo(parsername: str, prog: Program, opts: BuildOptions) -> ProgramInfo:
    """Load the cached ProgramInfo pickle for (parsername, prog, opts)."""
    source = get_proginfo_save_path(parsername, prog, opts)
    return load_pickle(source)

# Filename scheme for cached UnoptimizedProgramInfoCompare2 objects.
def mangle_cmp_save_name(prog: Program, opts: BuildOptions) -> str:
    """Return "<binary name>.cmp.pickle" for `prog` built with `opts`."""
    binary_name = prog.get_binary_name(opts)
    return f"{binary_name}.cmp.pickle"

def get_cmp_save_path(prog: Program, opts: BuildOptions) -> Path:
    """Return the path under PICKLE_CACHE_DIR where the comparison object is cached."""
    filename = mangle_cmp_save_name(prog, opts)
    return PICKLE_CACHE_DIR.joinpath(filename)

def save_cmp(cmp: UnoptimizedProgramInfoCompare2, prog: Program, opts: BuildOptions):
    """Pickle the comparison object `cmp` to its cache path for (prog, opts)."""
    destination = get_cmp_save_path(prog, opts)
    save_pickle(cmp, destination)

def load_cmp(prog: Program, opts: BuildOptions) -> UnoptimizedProgramInfoCompare2:
    """Load the cached comparison object for (prog, opts)."""
    source = get_cmp_save_path(prog, opts)
    return load_pickle(source)

In [4]:
# Parsing plan for this cell:
#   DWARF:  parse only the debug build of each program
#   Ghidra: parse every build-option variation of each program
# Results are pickled into the cache directory using the 'mangle' naming scheme above.

reparse = False # re-parse a program even when a cached result already exists?
skip_parsing = True # skip the parsing loop entirely? set to True once everything is parsed & cached

class ParseException(Exception):
    """Exception subclass declared for parse errors (no visible cell raises it)."""

def parse(parser: Callable, prog: Program, opts: BuildOptions) -> Optional[ProgramInfo]:
    """Run `parser` on the binary of `prog` built with `opts`.

    Returns:
        Whatever `parser` returns for the binary path, or None if the parser
        raised an exception (callers record None as a failed parse).
    """
    try:
        return parser(prog.get_binary_path(opts))
    except Exception:
        # Catch Exception rather than using a bare except, so that
        # KeyboardInterrupt / SystemExit can still stop a long parsing run.
        return None

# (parser name, program name, build options) for every parse that returned None
failed = []
if not skip_parsing:
    # Iterate over `progs`: it is the list the later cells check and load from.
    for prog in progs:
        # DWARF is parsed from the debug build only
        dwarf_debug_savepath = get_proginfo_save_path("dwarf", prog, debug_opts)
        if reparse or not dwarf_debug_savepath.exists():
            dwarf_debug = parse(dwarf_parser, prog, debug_opts)
            if dwarf_debug is None:
                failed.append(("dwarf", prog.get_name(), debug_opts))
            else:
                save_pickle(dwarf_debug, dwarf_debug_savepath)

        # Ghidra is parsed for every build-option variation
        for opts in opts_sets:
            ghidra_parse_savepath = get_proginfo_save_path("ghidra", prog, opts)
            # Test the Ghidra cache file itself: an existing DWARF pickle must not
            # cause a missing Ghidra result to be skipped.
            if reparse or not ghidra_parse_savepath.exists():
                ghidra_parse = parse(ghidra_parser, prog, opts)
                if ghidra_parse is None:
                    failed.append(("ghidra", prog.get_name(), opts))
                else:
                    save_pickle(ghidra_parse, ghidra_parse_savepath)

In [5]:
# Report the parses that failed in the previous cell.
print(failed)

# Every (program, build options) pair must have a cached Ghidra parse on disk.
for prog in progs:
    for opts in opts_sets:
        ghidra_pickle_path = get_proginfo_save_path("ghidra", prog, opts)
        assert ghidra_pickle_path.exists()

[]


In [6]:
# Hand-picked program names and their Program objects.
# NOTE(review): presumably programs whose cached results needed regenerating —
# no visible cell uses these; confirm before removing.
fix_prognames = [
    'cksum', 'csplit', 'df', 'dir', 'du', 'expr', 'ln', 'ls', 'nl', 'ptx',
    'readlink', 'realpath', 'tac', 'vdir', 'wc',
]
fix_progs = [CoreutilsProgram(name) for name in fix_prognames]

In [7]:
# For each program & build options combination, compute & store comparison object

recompare = False # recompute a comparison even when a cached one exists?
skip_comparisons = True # skip this loop entirely? set to True once comparisons are cached

# (program name, build options) for every comparison that raised
failed = []
if not skip_comparisons:
    for prog in progs:
        # load the DWARF ground-truth ProgramInfo
        dwarf_proginfo = load_pickle(get_proginfo_save_path("dwarf", prog, debug_opts))
        assert(dwarf_proginfo is not None)

        # for each set of compilation options, load the Ghidra decompiler ProgramInfo
        # then compute & store the comparison object
        for opts in opts_sets:
            cmp_save_path = get_cmp_save_path(prog, opts)
            if recompare or not cmp_save_path.exists():
                ghidra_proginfo = load_pickle(get_proginfo_save_path("ghidra", prog, opts))
                assert(ghidra_proginfo is not None)
                try:
                    cmp = compare2(dwarf_proginfo, ghidra_proginfo)
                    save_pickle(cmp, cmp_save_path)
                except Exception:
                    # Catch Exception rather than using a bare except, so that
                    # KeyboardInterrupt / SystemExit can still abort the run.
                    failed.append((prog.get_name(), opts))

In [8]:
# Report the comparisons that failed in the previous cell.
print(failed)

# Every (program, build options) pair must have a cached comparison on disk.
for prog in progs:
    for opts in opts_sets:
        cmp_pickle_path = get_cmp_save_path(prog, opts)
        assert cmp_pickle_path.exists()

[]


In [9]:
# Helpers for naming, saving and loading the per-build-options tables

def mangle_table_save_name(
    tablename: str,
    opts: BuildOptions
) -> str:
    """Return the CSV filename for `tablename`: the name, then `suffix(opts)`, then ".csv"."""
    opts_suffix = suffix(opts)
    return f"{tablename}{opts_suffix}.csv"

def build_options_display_suffix(opts: BuildOptions) -> str:
    """Return a readable "(optimization=..., stripped=..., debug=...)" label for `opts`."""
    return f"(optimization={opts.optimization}, stripped={opts.strip}, debug={opts.debug})"

def mangle_table_display_name(
    tablename: str,
    opts: BuildOptions
) -> str:
    """Return `tablename` followed by a space and the build-options display suffix."""
    opts_label = build_options_display_suffix(opts)
    return f"{tablename} {opts_label}"

def get_table_save_path(
    tablename: str,
    opts: BuildOptions
) -> Path:
    """Return the path under DATA_DIR of the CSV for `tablename` and `opts`."""
    filename = mangle_table_save_name(tablename, opts)
    return DATA_DIR.joinpath(filename)

def get_table_save_path_generic(
    tablename: str
) -> Path:
    """Return the path under DATA_DIR of "<tablename>.csv" (no build-options suffix)."""
    filename = tablename + ".csv"
    return DATA_DIR.joinpath(filename)

def load_table(
    tablename: str
) -> pd.DataFrame:
    """Read "<tablename>.csv" from DATA_DIR, using its first column as the index."""
    csv_path = DATA_DIR.joinpath(tablename + ".csv")
    return pd.read_csv(csv_path, index_col=0)

def load_table_opts(
    tablename: str,
    opts: BuildOptions
) -> pd.DataFrame:
    """Read the CSV for `tablename` and `opts`, using its first column as the index."""
    csv_path = get_table_save_path(tablename, opts)
    return pd.read_csv(csv_path, index_col=0)

def load_table_opts_filter_analyzed(tablename: str, opts: BuildOptions) -> pd.DataFrame:
    """Load the table for `opts`, keeping only rows whose index label is in `prognames_analyze`."""
    full_table = load_table_opts(tablename, opts)
    return full_table.filter(prognames_analyze, axis=0)

def get_table_from_group(
    grp: MetricsGroup,
    opts: BuildOptions
)-> pd.DataFrame:
    """Load the saved table named after the metrics group `grp` for build options `opts`."""
    group_name = grp.get_name()
    return load_table_opts(group_name, opts)

In [10]:
# All metrics groups, addressed by position below.
# NOTE(review): the positional indices assume the ordering returned by
# make_metrics() — confirm against its definition if groups are added or reordered.
metrics_groups = make_metrics()

bytes_group = metrics_groups[0]
functions_group = metrics_groups[1]
varnodes_group = metrics_groups[2]
# positions 3..8 are read per metatype by varnodes_group_metatype
decomposed_varnodes_group = metrics_groups[9]
# positions 10..12 are read per primitive metatype by decomposed_varnodes_group_metatype
array_comparisons_group = metrics_groups[13]

def varnodes_group_metatype(metatype: int) -> MetricsGroup:
    """Return the varnodes metrics group for `metatype`.

    The metatypes (primitive, then complex) map in order to positions 3, 4, ...
    of `metrics_groups`. An unknown metatype raises KeyError.
    """
    position_of = {}
    for position, meta in enumerate(primitive_metatypes + complex_metatypes, 3):
        position_of[meta] = position
    return metrics_groups[position_of[metatype]]

# Varnodes metrics groups for every metatype: primitives first, then complex.
varnodes_groups_metatypes = list(map(varnodes_group_metatype, primitive_metatypes + complex_metatypes))

def decomposed_varnodes_group_metatype(metatype: int) -> MetricsGroup:
    """Return the decomposed-varnodes metrics group for a primitive `metatype`.

    The primitive metatypes map in order to positions 10, 11, ... of
    `metrics_groups`. A metatype not in `primitive_metatypes` raises KeyError.
    """
    position_of = {}
    for position, meta in enumerate(primitive_metatypes, 10):
        position_of[meta] = position
    return metrics_groups[position_of[metatype]]

# Decomposed-varnodes metrics groups, one per primitive metatype.
decomposed_varnodes_groups_metatypes = list(map(decomposed_varnodes_group_metatype, primitive_metatypes))

# Each list starts with the overall group, followed by its per-metatype groups.
high_varnodes_groups = [varnodes_group, *varnodes_groups_metatypes]
decomposed_varnodes_groups = [decomposed_varnodes_group, *decomposed_varnodes_groups_metatypes]

In [11]:
recompute = False  # recompute a table even when its CSV already exists?
skip_compute_metrics = True  # skip this cell's work entirely?

if not skip_compute_metrics:
    for opts in opts_sets:
        # comparison objects for all programs under this build configuration
        cmps = [load_cmp(program, opts) for program in progs]
        for grp in metrics_groups:
            save_path = get_table_save_path(grp.get_name(), opts)
            # the display name is printed for every group, computed or not
            print(mangle_table_display_name(grp.get_display_name(), opts))
            if not recompute and save_path.exists():
                continue
            metrics_table = compute_comparisons_metrics_dataframe(prognames, cmps, grp.get_metrics())
            metrics_table.to_csv(save_path)


In [12]:
skip_fix_varnode_metrics = False

# Add (or overwrite) the "Varnodes fraction partially recovered" and
# "Varnodes fraction exactly recovered" columns in every varnodes table,
# then write each table back to its CSV.
if not skip_fix_varnode_metrics:
    for grp in high_varnodes_groups + decomposed_varnodes_groups:
        for opts in opts_sets:
            df = get_table_from_group(grp, opts)
            # column 0 is the denominator for both fractions
            denominator = df.iloc[:, 0]
            partially_recovered = df.iloc[:, 2:6].sum(axis=1)
            df["Varnodes fraction partially recovered"] = partially_recovered / denominator
            df["Varnodes fraction exactly recovered"] = df.iloc[:, 5] / denominator
            df.to_csv(get_table_save_path(grp.get_name(), opts))

def get_varnode_group_average_stats(grp: MetricsGroup, opts: Optional[BuildOptions] = None) -> pd.Series:
    """Return the column-wise mean of columns 6 onward of the table for `grp`.

    Args:
        grp: metrics group whose saved table is loaded.
        opts: build options selecting which table to load. The build options
            are now an explicit argument instead of being read from whatever
            the module-level loop variable `opts` last held. When omitted,
            `debug_opts` is used, which is the value that loop variable holds
            after any loop over `opts_sets` finishes.

    Returns:
        A Series with one mean per column from position 6 on.
    """
    if opts is None:
        opts = debug_opts
    df = get_table_from_group(grp, opts)
    return df.iloc[:,6:].mean(axis=0)

In [13]:
skip_generate_metatype_level_summaries = False

def _write_metatype_match_level_tables(opts, table_metatypes, group_for_metatype, name_suffix):
    """Write the raw-count and ratio match-level tables for one family of metatypes.

    For each metatype, columns 1..5 of its table are summed over all rows and
    relabeled with `varnode_compare_level_labels`; the ratio row divides those
    sums by the sum of column 0. The tables are saved as
    "metatype_match_levels" + name_suffix and
    "metatype_match_levels_ratios" + name_suffix for the given `opts`.
    """
    raw_rows = []
    ratio_rows = []
    for metatype in table_metatypes:
        df = get_table_from_group(group_for_metatype(metatype), opts)
        metatype_varnodes = df.iloc[:,0].sum()
        varnodes_by_levels = df.iloc[:,1:6].sum(axis=0)
        varnodes_by_levels.index = varnode_compare_level_labels
        raw_rows.append(varnodes_by_levels)
        ratio_rows.append(varnodes_by_levels / metatype_varnodes)

    row_labels = [ MetaType.repr(metatype) for metatype in table_metatypes ]
    # raw table is written first, then the ratios table
    for base_tablename, rows in (
        ("metatype_match_levels", raw_rows),
        ("metatype_match_levels_ratios", ratio_rows),
    ):
        table = pd.DataFrame(rows, index=row_labels, columns=varnode_compare_level_labels)
        table.to_csv(get_table_save_path(base_tablename + name_suffix, opts))

if not skip_generate_metatype_level_summaries:
    for opts in opts_sets:
        # high-level varnodes: all metatypes
        _write_metatype_match_level_tables(opts, metatypes, varnodes_group_metatype, "")
        # decomposed varnodes: primitive metatypes only
        _write_metatype_match_level_tables(
            opts, primitive_metatypes, decomposed_varnodes_group_metatype, "_decomposed"
        )

def get_metatype_match_levels_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    """Load the "metatype_match_levels" table for `opts`.

    With `primitive` set, the "_decomposed" variant is loaded instead.
    """
    name_suffix = "_decomposed" if primitive else ""
    return load_table_opts("metatype_match_levels" + name_suffix, opts)

def get_metatype_match_levels_ratios_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    """Load the "metatype_match_levels_ratios" table for `opts`.

    With `primitive` set, the "_decomposed" variant is loaded instead.
    """
    name_suffix = "_decomposed" if primitive else ""
    return load_table_opts("metatype_match_levels_ratios" + name_suffix, opts)

In [14]:
skip_generate_metatype_recovery_summaries = False

def _write_metatype_recovery_summary(opts, table_metatypes, group_for_metatype, tablename):
    """Write one recovery-summary table with a row per metatype.

    Each row holds the varnode comparison score and the partially / exactly
    recovered fractions, computed from the metatype's table for `opts`, and the
    result is saved under `tablename` for `opts`.
    """
    rows = {}
    for metatype in table_metatypes:
        df = get_table_from_group(group_for_metatype(metatype), opts)
        # get the "ground truth" varnodes for metatype
        truth = df.iloc[:,0].sum()

        # get the varnode compare score for metatype: each level's total is
        # weighted by the level value, then normalized by MATCH * truth
        level_sums = df.iloc[:,1:6].sum(axis=0)
        weights = np.array(list(VarnodeCompareLevel.range()))
        level_sums_weighted_sum = np.multiply(weights, level_sums).sum()
        score = level_sums_weighted_sum / (VarnodeCompareLevel.MATCH * truth)

        missed = df.iloc[:,1].sum()
        matched = df.iloc[:,5].sum()
        rows[MetaType.repr(metatype)] = {
            "Varnode comparison score [0,1]": score,
            "Varnodes fraction partially recovered": (truth - missed) / truth,
            "Varnodes fraction exactly recovered": matched / truth,
        }

    summary = pd.DataFrame.from_dict(rows, orient='index')
    summary.to_csv(get_table_save_path(tablename, opts))

if not skip_generate_metatype_recovery_summaries:
    for opts in opts_sets:
        # high-level varnodes: all metatypes
        _write_metatype_recovery_summary(
            opts, metatypes, varnodes_group_metatype, "metatype_recovery_summary"
        )
        # decomposed varnodes: primitive metatypes only
        _write_metatype_recovery_summary(
            opts, primitive_metatypes, decomposed_varnodes_group_metatype,
            "metatype_recovery_summary_decomposed"
        )

def get_metatype_recovery_summary_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    """Load the "metatype_recovery_summary" table for `opts`.

    With `primitive` set, the "_decomposed" variant is loaded instead.
    """
    name_suffix = "_decomposed" if primitive else ""
    return load_table_opts("metatype_recovery_summary" + name_suffix, opts)


In [15]:
skip_generate_metatype_recovery_with_levels_summaries = False

# Join the match-level counts and the recovery summary side by side
# (column-wise) for every build configuration and both table variants.
if not skip_generate_metatype_recovery_with_levels_summaries:
    for opts in opts_sets:
        for primitive in (True, False):
            levels_table = get_metatype_match_levels_table(opts, primitive=primitive)
            recovery_table = get_metatype_recovery_summary_table(opts, primitive=primitive)
            combined = pd.concat([levels_table, recovery_table], axis=1)

            tablename = "metatype_recovery_summary_with_levels" + ("_decomposed" if primitive else "")
            combined.to_csv(get_table_save_path(tablename, opts))
                

def get_metatype_recovery_summary_with_levels_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    """Load the "metatype_recovery_summary_with_levels" table for `opts`.

    With `primitive` set, the "_decomposed" variant is loaded instead.
    """
    name_suffix = "_decomposed" if primitive else ""
    return load_table_opts("metatype_recovery_summary_with_levels" + name_suffix, opts)

In [16]:
skip_generate_opts_varnodes_summary = False

# One summary table per variant (decomposed / high-level), with one row per
# build configuration: per-level totals plus the score and recovery fractions.
if not skip_generate_opts_varnodes_summary:
    # one weight per compare level, in VarnodeCompareLevel.range() order
    level_weights = np.array(list(VarnodeCompareLevel.range()))
    for primitive in (True, False):
        if primitive:
            grp, label_suffix, tablename = decomposed_varnodes_group, " (decomposed)", "opts_varnodes_summary_decomposed"
        else:
            grp, label_suffix, tablename = varnodes_group, "", "opts_varnodes_summary"

        per_opts_rows = []
        for opts in opts_sets:
            df = get_table_from_group(grp, opts)
            truth = df.iloc[:, 0].sum()
            level_sums = df.iloc[:, 1:6].sum(axis=0)
            score = np.multiply(level_weights, level_sums).sum() / (VarnodeCompareLevel.MATCH * truth)
            missed = df.iloc[:, 1].sum()
            matched = df.iloc[:, 5].sum()

            # append the derived statistics after the per-level totals
            level_sums["Varnode comparison score [0,1]" + label_suffix] = score
            level_sums["Varnodes fraction partially recovered" + label_suffix] = (truth - missed) / truth
            level_sums["Varnodes fraction exactly recovered" + label_suffix] = matched / truth
            per_opts_rows.append(level_sums)

        df = pd.DataFrame(per_opts_rows, index=opts_sets_keys)
        # the first five columns are the per-level counts; store them as integers
        for colname in df.columns[0:5]:
            df[colname] = df[colname].astype(int)
        df.to_csv(get_table_save_path_generic(tablename))

def get_opts_varnodes_summary_table(primitive: bool = False) -> pd.DataFrame:
    """Load the "opts_varnodes_summary" table (its "_decomposed" variant if `primitive`)."""
    name_suffix = "_decomposed" if primitive else ""
    return load_table("opts_varnodes_summary" + name_suffix)

In [17]:
skip_generate_opts_varnodes_summary_metatypes = False

# Stack the per-build-options "with levels" tables vertically, keyed by the
# build-options label, and save one table per variant.
if not skip_generate_opts_varnodes_summary_metatypes:
    for primitive in (True, False):
        per_opts_tables = [
            get_metatype_recovery_summary_with_levels_table(opts, primitive=primitive)
            for opts in opts_sets
        ]
        stacked = pd.concat(per_opts_tables, keys=opts_sets_keys, axis=0)
        tablename = "opts_varnodes_summary_metatypes" + ("_decomposed" if primitive else "")
        stacked.to_csv(get_table_save_path_generic(tablename))

def get_opts_varnodes_summary_metatypes_table(primitive: bool = False) -> pd.DataFrame:
    """Load "opts_varnodes_summary_metatypes" (or its "_decomposed" variant).

    The first two CSV columns are read as a two-level row index.
    """
    name_suffix = "_decomposed" if primitive else ""
    csv_path = get_table_save_path_generic("opts_varnodes_summary_metatypes" + name_suffix)
    return pd.read_csv(csv_path, index_col=[0,1])

In [18]:
skip_generate_opts_functions_summary = False

# Guard on this cell's own flag; the varnodes-summary flag belongs to another
# cell and would leave skip_generate_opts_functions_summary without effect.
if not skip_generate_opts_functions_summary:
    rows = []
    for opts in opts_sets:
        df = get_table_from_group(functions_group, opts)
        # total of the first three columns over all programs
        row = df.iloc[:,0:3].sum(axis=0)
        rows.append(row)

    # one row per build configuration
    df = pd.DataFrame(rows, index=opts_sets_keys)
    df["Functions recovery fraction"] = df["Functions found"] / df["Ground truth functions"]

    savepath = get_table_save_path_generic("opts_functions_summary")
    df.to_csv(savepath)

def get_opts_functions_summary_table() -> pd.DataFrame:
    """Load the saved "opts_functions_summary" table."""
    tablename = "opts_functions_summary"
    return load_table(tablename)

In [19]:
skip_generate_opts_bytes_summary = False

# One row per build configuration: totals of the first three bytes-table
# columns over all programs, plus the resulting recovery fraction.
if not skip_generate_opts_bytes_summary:
    per_opts_totals = []
    for opts in opts_sets:
        bytes_table = get_table_from_group(bytes_group, opts)
        per_opts_totals.append(bytes_table.iloc[:, 0:3].sum(axis=0))

    df = pd.DataFrame(per_opts_totals, index=opts_sets_keys)
    df["Bytes recovery fraction"] = df["Bytes found"] / df["Ground truth data bytes"]
    df.to_csv(get_table_save_path_generic("opts_bytes_summary"))

def get_opts_bytes_summary_table() -> pd.DataFrame:
    """Load the saved "opts_bytes_summary" table."""
    tablename = "opts_bytes_summary"
    return load_table(tablename)

In [20]:
skip_generate_opts_array_comparisons_summary = False

# One row per build configuration. The first three columns are summed over all
# programs; the remaining columns are averaged over programs, weighted by each
# program's "Array comparisons" count.
if not skip_generate_opts_array_comparisons_summary:
    per_opts_rows = []
    for opts in opts_sets:
        df = get_table_from_group(array_comparisons_group, opts)
        n_comparisons = df["Array comparisons"]
        count_totals = df.iloc[:, 0:3].sum(axis=0)
        weighted_means = df.iloc[:, 3:].mul(n_comparisons, axis=0).sum(axis=0) / n_comparisons.sum()
        per_opts_rows.append(pd.concat((count_totals, weighted_means), axis=0))

    df = pd.DataFrame(per_opts_rows, index=opts_sets_keys)
    df["Array varnodes inferred as array fraction"] = (
        df["Array varnodes inferred as array"] / df["Ground truth varnodes (metatype=ARRAY)"]
    )
    # the first three columns are counts; store them as integers
    for colname in df.columns[0:3]:
        df[colname] = df[colname].astype(int)

    df.to_csv(get_table_save_path_generic("opts_array_comparisons_summary"))

def get_opts_array_comparisons_summary_table() -> pd.DataFrame:
    """Load the saved "opts_array_comparisons_summary" table."""
    tablename = "opts_array_comparisons_summary"
    return load_table(tablename)


In [21]:
skip_generate_opts_overall_summary = False

# Combine the headline fractions of the functions, varnodes, decomposed
# varnodes and bytes summaries into a single table (one row per build config).
if not skip_generate_opts_overall_summary:
    overall_parts = (
        get_opts_functions_summary_table()["Functions recovery fraction"],
        # columns from position 5 on follow the five per-level count columns
        get_opts_varnodes_summary_table(primitive=False).iloc[:, 5:],
        get_opts_varnodes_summary_table(primitive=True).iloc[:, 5:],
        get_opts_bytes_summary_table()["Bytes recovery fraction"],
    )
    df = pd.concat(overall_parts, axis=1)
    df.to_csv(get_table_save_path_generic("opts_overall_summary"))

def get_opts_overall_summary_table() -> pd.DataFrame:
    """Load the saved "opts_overall_summary" table."""
    tablename = "opts_overall_summary"
    return load_table(tablename)

In [22]:
# Inspect the member types of the first UNION-typed varnode in the debug build of "factor".
cmp = load_cmp(prog_from_progname("factor"), debug_opts)

union_cmp_records = []
for record in select_comparable_varnode_compare_records(cmp):
    datatype = record.get_varnode().get_datatype()
    if datatype.get_metatype() == MetaType.UNION:
        union_cmp_records.append(record)

# raises IndexError when no UNION varnode was selected
union = union_cmp_records[0].get_varnode().get_datatype()
for memtype in union.membertypes:
    print(memtype)

unsigned int4
<ARRAY subtype=char dimensions=(4096,) size=4096>


In [23]:
# Functions: per-build-options function recovery summary (loaded from the saved CSV)

display(get_opts_functions_summary_table())

Unnamed: 0,Ground truth functions,Functions found,Functions missed,Functions recovery fraction
strip,18139,18139,0,1.0
standard,18139,18139,0,1.0
debug,18139,18135,4,0.999779


In [24]:
# Varnodes: overall summary first, then the per-metatype breakdown

display(
    get_opts_varnodes_summary_table(),
    get_opts_varnodes_summary_metatypes_table(),
)

Unnamed: 0,Varnodes matched @ level=NO_MATCH,Varnodes matched @ level=OVERLAP,Varnodes matched @ level=SUBSET,Varnodes matched @ level=ALIGNED,Varnodes matched @ level=MATCH,"Varnode comparison score [0,1]",Varnodes fraction partially recovered,Varnodes fraction exactly recovered
strip,1000,1662,1001,18570,12550,0.787554,0.97125,0.360808
standard,249,1450,613,19029,13442,0.815995,0.992841,0.386453
debug,23,52,24,7,34677,0.997822,0.999339,0.996953


Unnamed: 0,Unnamed: 1,Varnodes matched @ level=NO_MATCH,Varnodes matched @ level=OVERLAP,Varnodes matched @ level=SUBSET,Varnodes matched @ level=ALIGNED,Varnodes matched @ level=MATCH,"Varnode comparison score [0,1]",Varnodes fraction partially recovered,Varnodes fraction exactly recovered
strip,INT,66,48,0,12204,8681,0.84985,0.996857,0.413401
strip,FLOAT,0,56,0,113,22,0.632199,1.0,0.115183
strip,POINTER,53,4,0,5834,3513,0.838952,0.994364,0.373564
strip,ARRAY,729,597,565,19,228,0.315248,0.659027,0.106642
strip,STRUCT,152,955,432,390,106,0.419287,0.925307,0.052088
strip,UNION,0,2,4,10,0,0.625,1.0,0.0
standard,INT,23,48,0,12248,8680,0.851374,0.998905,0.413353
standard,FLOAT,0,56,0,113,22,0.632199,1.0,0.115183
standard,POINTER,44,4,0,5836,3520,0.839855,0.995321,0.374309
standard,ARRAY,181,578,352,45,982,0.625,0.915341,0.459308


In [25]:
# Decomposed varnodes: overall summary first, then the per-metatype breakdown

display(
    get_opts_varnodes_summary_table(primitive=True),
    get_opts_varnodes_summary_metatypes_table(primitive=True),
)

Unnamed: 0,Varnodes matched @ level=NO_MATCH (decomposed),Varnodes matched @ level=OVERLAP (decomposed),Varnodes matched @ level=SUBSET (decomposed),Varnodes matched @ level=ALIGNED (decomposed),Varnodes matched @ level=MATCH (decomposed),"Varnode comparison score [0,1] (decomposed)",Varnodes fraction partially recovered (decomposed),Varnodes fraction exactly recovered (decomposed)
strip,139776,31280,0,231267,131593,0.585978,0.738206,0.246468
standard,40187,56605,0,303527,133597,0.703095,0.924732,0.250221
debug,10547,128,0,5,523236,0.980064,0.980246,0.979997


Unnamed: 0,Unnamed: 1,Varnodes matched @ level=NO_MATCH,Varnodes matched @ level=OVERLAP,Varnodes matched @ level=SUBSET,Varnodes matched @ level=ALIGNED,Varnodes matched @ level=MATCH,"Varnode comparison score [0,1]",Varnodes fraction partially recovered,Varnodes fraction exactly recovered
strip,INT,132910,28812,0,217923,125159,0.585978,0.73671,0.247936
strip,FLOAT,72,73,0,103,22,0.435185,0.733333,0.081481
strip,POINTER,6725,2057,0,13208,6332,0.591492,0.762552,0.223572
standard,INT,40017,46846,0,290436,127505,0.707291,0.920728,0.252583
standard,FLOAT,0,145,0,103,22,0.501852,1.0,0.081481
standard,POINTER,132,9245,0,12955,5990,0.636166,0.995339,0.211496
debug,INT,10533,124,0,4,494143,0.978948,0.979134,0.978881
debug,FLOAT,0,0,0,0,270,1.0,1.0,1.0
debug,POINTER,14,2,0,1,28305,0.999444,0.999506,0.9994


In [26]:
# Data Bytes: per-build-options bytes recovery summary (loaded from the saved CSV)

display(get_opts_bytes_summary_table())

Unnamed: 0,Ground truth data bytes,Bytes found,Bytes missed,Bytes recovery fraction
strip,1183691,725144,458547,0.612613
standard,1183691,954105,229586,0.806042
debug,1183691,1177221,6470,0.994534


In [27]:
# Summary: functions, varnodes, decomposed varnodes and bytes fractions side by side

display(get_opts_overall_summary_table())

Unnamed: 0,Functions recovery fraction,"Varnode comparison score [0,1]",Varnodes fraction partially recovered,Varnodes fraction exactly recovered,"Varnode comparison score [0,1] (decomposed)",Varnodes fraction partially recovered (decomposed),Varnodes fraction exactly recovered (decomposed),Bytes recovery fraction
strip,1.0,0.787554,0.97125,0.360808,0.585978,0.738206,0.246468,0.612613
standard,1.0,0.815995,0.992841,0.386453,0.703095,0.924732,0.250221,0.806042
debug,0.999779,0.997822,0.999339,0.996953,0.980064,0.980246,0.979997,0.994534


In [28]:
# Array Comparisons: per-build-options array comparison summary (loaded from the saved CSV)

display(get_opts_array_comparisons_summary_table())

Unnamed: 0,Ground truth varnodes (metatype=ARRAY),Array comparisons,Array varnodes inferred as array,Array varnodes inferred as array fraction,Array length (elements) average error,Array length (elements) average error ratio,Array size (bytes) average error,Array size (bytes) average error ratio,"Array dimension match score [0,1]","Array average element type comparison score [0,1]"
strip,2138,823,774,0.362021,134.695018,2.844632,458.574727,0.911925,0.979344,0.781288
standard,2138,1579,1530,0.715622,151.155795,5.441502,239.022799,0.474756,0.974668,0.669833
debug,2138,2226,2128,0.995323,9.415993,0.109666,9.415993,0.109666,1.0,0.999551


In [29]:
# get_table_from_group(varnodes_group, debug_opts)

In [40]:
# Collect and print the records of the debug build of "ls" whose overlapped
# byte count is smaller than the size of the record's datatype.
prog = CoreutilsProgram("ls")

cmp = load_cmp(prog, debug_opts)

records = []
for record in select_comparable_varnode_compare_records(cmp):
    overlapped_bytes = record.bytes_overlapped()
    datatype_size = record.get_datatype().get_size()
    # alternative filter: record.get_compare_level() != VarnodeCompareLevel.MATCH
    if overlapped_bytes >= datatype_size:
        continue
    records.append(record)
    print(record)

<VarnodeCompareRecord varnode=<Varnode address=<STACK:-0x30> datatype=<STRUCT winsize membertypes=4 size=8>> level=1>
<VarnodeCompareRecord varnode=<Varnode address=<STACK:-0x30> datatype=unsigned int8> level=1>
<VarnodeCompareRecord varnode=<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>> level=2>
<VarnodeCompareRecord varnode=<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>> level=2>


In [41]:
# For each collected record: its varnode, the name of the enclosing function,
# its variable, and the right-hand side of every comparison (tab-indented).
for record in records:
    varnode = record.get_varnode()
    print(varnode)
    print(varnode.get_var().get_parent_function().get_name())
    print(record.get_var())
    for cmp2 in record.get_comparisons():
        print(f"\t{cmp2.get_right()}")

<Varnode address=<STACK:-0x30> datatype=<STRUCT winsize membertypes=4 size=8>>
decode_switches
<VAR ws :: <STRUCT winsize membertypes=4 size=8> @ [<AddressLiveRange addr=<STACK:-0x30> startpc=<ABSOLUTE:0x703a> endpc=<ABSOLUTE:0x83bb>>]>
	<Varnode address=<STACK:-0x30> datatype=int4>
<Varnode address=<STACK:-0x30> datatype=unsigned int8>
decode_switches
<VAR tmp :: unsigned int8 @ [<AddressLiveRange addr=<STACK:-0x30> startpc=<ABSOLUTE:0x703a> endpc=<ABSOLUTE:0x83bb>>]>
	<Varnode address=<STACK:-0x30> datatype=int4>
<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>>
print_long_format
<VAR hbuf :: <ARRAY subtype=char dimensions=(652,) size=652> @ [<AddressLiveRange addr=<STACK:-0x1100> startpc=<ABSOLUTE:0xc647> endpc=<ABSOLUTE:0xd08c>>]>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>>
print_long_format
<VAR hbuf :: <A

In [42]:
# Comparison object obtained by calling flip() on the `cmp` loaded for "ls".
# NOTE(review): presumably flip() swaps the two sides of the comparison — confirm.
flipped_cmp = cmp.flip()

def filter_cmp_record(record: VarnodeCompareRecord) -> bool:
    """Return True when the variable behind `record`'s varnode is named "hbuf"."""
    # The parent-function lookup whose result was never used has been dropped.
    var = record.get_varnode().get_var()
    return var.get_name() == "hbuf"

# Records of the flipped comparison accepted by filter_cmp_record.
flipped_records = [
    candidate
    for candidate in select_comparable_varnode_compare_records(flipped_cmp)
    if filter_cmp_record(candidate)
]

# Print each record's varnode, enclosing function name, variable, and the
# right-hand side of every comparison (tab-indented).
for record in flipped_records:
    varnode = record.get_varnode()
    print(varnode)
    print(varnode.get_var().get_parent_function().get_name())
    print(record.get_var())
    for cmp2 in record.get_comparisons():
        print(f"\t{cmp2.get_right()}")

<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
print_long_format
<VAR hbuf :: <ARRAY subtype=char dimensions=(21,) size=21> @ [<AddressLiveRange addr=<STACK:-0x1100> startpc=<ABSOLUTE:0xc647> endpc=<ABSOLUTE:0xd08c>>]>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(652,) size=652>>
	<Varnode address=<STACK:-0x1100> datatype=<ARRAY subtype=char dimensions=(21,) size=21>>
