In [1]:
import setup # resolve path to 'src'
import numpy as np
import pandas as pd

from typing import Optional
from build_parse import *
from metrics import *

pd.set_option('display.max_rows', None)

In [2]:
progs = [ CoreutilsProgram(progname) for progname in COREUTILS_PROG_NAMES ]
prognames = [ prog.get_name() for prog in progs ]

# Define the build options to test for each program
debug_opts = BuildOptions(debug=True, strip=False, optimization=0)
standard_opts = BuildOptions(debug=False, strip=False, optimization=0)
strip_opts = BuildOptions(debug=False, strip=True, optimization=0)

opts_sets = (strip_opts, standard_opts, debug_opts)
opts_sets_keys = ("strip", "standard", "debug")

# Get the parser functions
dwarf_parser = get_parser("dwarf")
ghidra_parser = get_parser("ghidra")

varnode_compare_levels = list(VarnodeCompareLevel.range())
varnode_compare_level_labels = [ "Varnodes matched @ level {}".format(VarnodeCompareLevel.to_string(level)) for level in VarnodeCompareLevel.range() ]

primitive_metatypes = [MetaType.INT, MetaType.FLOAT, MetaType.POINTER]
complex_metatypes = [MetaType.ARRAY, MetaType.STRUCT, MetaType.UNION]
metatypes = primitive_metatypes + complex_metatypes
metatype_labels = [ MetaType.repr(metatype) for metatype in metatypes ]

# ensure that each program is built according to all variations of build options
for prog in progs:
    for opts in (debug_opts, standard_opts, strip_opts):
        assert(prog.valid_build(opts))

In [3]:

# the filename format for saving parsed ProgramInfo pickle objects
def mangle_proginfo_save_name(parsername: str, prog: Program, opts: BuildOptions) -> str:
    return "{}.{}.pickle".format(prog.get_binary_name(opts), parsername)

def get_proginfo_save_path(parsername: str, prog: Program, opts: BuildOptions) -> Path:
    return PICKLE_CACHE_DIR.joinpath(mangle_proginfo_save_name(parsername, prog, opts))

def save_proginfo(proginfo: ProgramInfo, parsername: str, prog: Program, opts: BuildOptions):
    save_pickle(proginfo, get_proginfo_save_path(parsername, prog, opts))

def load_proginfo(parsername: str, prog: Program, opts: BuildOptions) -> ProgramInfo:
    return load_pickle(get_proginfo_save_path(parsername, prog, opts))

# the filename format for saving UnoptimizedProgramInfoCompare2 objects
def mangle_cmp_save_name(prog: Program, opts: BuildOptions) -> str:
    return "{}.cmp.pickle".format(prog.get_binary_name(opts))

def get_cmp_save_path(prog: Program, opts: BuildOptions) -> Path:
    return PICKLE_CACHE_DIR.joinpath(mangle_cmp_save_name(prog, opts))

def save_cmp(cmp: UnoptimizedProgramInfoCompare2, prog: Program, opts: BuildOptions):
    save_pickle(cmp, get_cmp_save_path(prog, opts))

def load_cmp(prog: Program, opts: BuildOptions) -> UnoptimizedProgramInfoCompare2:
    return load_pickle(get_cmp_save_path(prog, opts))

In [4]:
# DWARF: only parse with the debug build options
# Ghidra: parse with all variations of build options
# Cache the results in local pickle_cache directory, named based on the 'mangle' scheme

reparse = False # should we re-parse even if we already parsed and cached a program?
skip_parsing = True # should we skip the parsing? set to True if we already parsed & cached

class ParseException(Exception):
    pass

def parse(parser: Callable, prog: Program, opts: BuildOptions) -> Optional[ProgramInfo]:
    try:
        return parser(prog.get_binary_path(opts))
    except:
        return None

failed = []
if not skip_parsing:
    for prog in progs:
        dwarf_debug_savepath = get_proginfo_save_path("dwarf", prog, debug_opts)
        if reparse or not dwarf_debug_savepath.exists():
            dwarf_debug = parse(dwarf_parser, prog, debug_opts)
            if dwarf_debug is None:
                failed.append(("dwarf", prog.get_name(), debug_opts))
            else:
                save_pickle(dwarf_debug, dwarf_debug_savepath)

        for opts in opts_sets:
            ghidra_parse_savepath = get_proginfo_save_path("ghidra", prog, opts)
            if reparse or not dwarf_debug_savepath.exists():
                ghidra_parse = parse(ghidra_parser, prog, opts)
                if ghidra_parse is None:
                    failed.append(("ghidra", prog.get_name(), opts))
                else:
                    save_pickle(ghidra_parse, ghidra_parse_savepath)

In [5]:
print(failed)

for prog in progs:
    for opts in opts_sets:
        assert(get_proginfo_save_path("ghidra", prog, opts).exists())

[]


In [6]:
# For each program & build options combination, compute & store comparison object

recompare = False
skip_comparisons = True

failed = []
if not skip_comparisons:
    for prog in progs:
        # load the DWARF ground-truth ProgramInfo
        dwarf_proginfo = load_pickle(get_proginfo_save_path("dwarf", prog, debug_opts))
        assert(dwarf_proginfo is not None)

        # for each set of compilation options, load the Ghidra decompiler ProgramInfo
        # then compute & store the comparison object
        for opts in opts_sets:
            cmp_save_path = get_cmp_save_path(prog, opts)
            if recompare or not cmp_save_path.exists():
                ghidra_proginfo = load_pickle(get_proginfo_save_path("ghidra", prog, opts))
                assert(ghidra_proginfo is not None)
                try:
                    cmp = compare2(dwarf_proginfo, ghidra_proginfo)
                    save_pickle(cmp, get_cmp_save_path(prog, opts))
                except:
                    failed.append((prog.get_name(), opts))

In [7]:
print(failed)

for prog in progs:
    for opts in opts_sets:
        assert(get_cmp_save_path(prog, opts).exists())

[]


In [8]:
# For each opts, compute the tables

TABLES_DIR = DATA_DIR.joinpath("tables")

def underscores_to_dashes(s: str) -> str:
    return s.replace("_", "-")

def make_latex_label(tablename: str) -> str:
    return "table:" + underscores_to_dashes(tablename)

def mangle_table_save_name(
    tablename: str,
    opts: BuildOptions
) -> str:
    return underscores_to_dashes("{}{}.csv".format(tablename, suffix(opts)))

def mangle_latex_save_name(
    tablename: str,
    opts: BuildOptions
) -> str:
    return underscores_to_dashes("{}{}.tex".format(tablename, suffix(opts)))

def build_options_display_suffix(opts: BuildOptions) -> str:
    return "(optimization={}, stripped={}, debug={})".format(opts.optimization, opts.strip, opts.debug)

def mangle_table_display_name(
    tablename: str,
    opts: BuildOptions
) -> str:

    return "{} {}".format(tablename, build_options_display_suffix(opts))

def get_table_save_path(
    tablename: str,
    opts: BuildOptions
) -> Path:
    return DATA_DIR.joinpath(mangle_table_save_name(tablename, opts))

def get_latex_path(
    tablename: str,
    opts: BuildOptions
) -> Path:
    return TABLES_DIR.joinpath(mangle_latex_save_name(tablename, opts))

def get_table_save_path_generic(
    tablename: str
) -> Path:
    return DATA_DIR.joinpath(tablename + ".csv")

def get_latex_path_generic(
    tablename: str
) -> Path:
    return TABLES_DIR.joinpath("{}.tex".format(tablename))

def load_table(
    tablename: str,
    multiindex: bool = False
) -> pd.DataFrame:
    index_col = 0 if not multiindex else (0,1)
    return pd.read_csv(DATA_DIR.joinpath(tablename + ".csv"), index_col=index_col)

def load_table_opts(
    tablename: str,
    opts: BuildOptions
) -> pd.DataFrame:
    return pd.read_csv(get_table_save_path(tablename, opts), index_col=0)

def get_table_from_group(
    grp: MetricsGroup,
    opts: BuildOptions
)-> pd.DataFrame:
    return load_table_opts(grp.get_name(), opts)

def opts_to_caption_suffix(opts: BuildOptions) -> str:
    if opts.debug:
        return "(compilation = debug)"
    elif opts.strip:
        return "(compilation = stripped)"
    else:
        return "(compilation = standard)"

def latex_column_format_str(ncols: int, table_width_cm: float = 12) -> str:
    col_width = table_width_cm / ncols
    repeat_str = "p{{{:.1f}cm}}".format(col_width)
    return "l" + repeat_str*ncols

In [9]:
metrics_groups = make_metrics()

bytes_group = metrics_groups[0]
functions_group = metrics_groups[1]
varnodes_group = metrics_groups[2]
decomposed_varnodes_group = metrics_groups[9]
array_comparisons_group = metrics_groups[13]

def varnodes_group_metatype(metatype: int) -> MetricsGroup:
    _map = dict([ (meta, i) for i, meta in enumerate(primitive_metatypes + complex_metatypes, 3) ])
    return metrics_groups[_map[metatype]]

varnodes_groups_metatypes = [ varnodes_group_metatype(metatype) for metatype in (primitive_metatypes + complex_metatypes) ]

def decomposed_varnodes_group_metatype(metatype: int) -> MetricsGroup:
    _map = dict([ (meta, i) for i, meta in enumerate(primitive_metatypes, 10) ])
    return metrics_groups[_map[metatype]]

decomposed_varnodes_groups_metatypes = [ decomposed_varnodes_group_metatype(metatype) for metatype in primitive_metatypes ]

high_varnodes_groups = [varnodes_group] + varnodes_groups_metatypes
decomposed_varnodes_groups = [decomposed_varnodes_group] + decomposed_varnodes_groups_metatypes

def get_group_column_names(grp: MetricsGroup) -> List[str]:
    return [ metric.get_display_name() for metric in grp.get_metrics() ]

In [10]:
recompute = False
skip_compute_metrics = True

if not skip_compute_metrics:
    for opts in opts_sets:
        cmps = [ load_cmp(prog, opts) for prog in progs ]
        for grp in metrics_groups:
            savepath = get_table_save_path(grp.get_name(), opts)
            tablename = mangle_table_display_name(grp.get_display_name(), opts)
            print(tablename)
            if recompute or not savepath.exists():
                df = compute_comparisons_metrics_dataframe(prognames, cmps, grp.get_metrics())
                df.to_csv(savepath)


In [11]:
skip_fix_colnames = False

if not skip_fix_colnames:
    for grp in metrics_groups:
        for opts in opts_sets:
            df = get_table_from_group(grp, opts)
            ncols = df.shape[1]
            new_colnames = get_group_column_names(grp)
            new_ncols = len(new_colnames)
            if new_ncols < ncols:
                df = df.iloc[:,0:new_ncols]
            df.set_axis(new_colnames, axis=1, inplace=True)
            savepath = get_table_save_path(grp.get_name(), opts)
            df.to_csv(savepath)

  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)
  df.set_axis(new_colnames, axis=1, inplace=True)


In [12]:
skip_fix_varnode_metrics = False

# Add "Varnodes fraction partially recovered" & "Varnodes fraction exactly recovered" columns
# to the varnodes tables (if not already done)
if not skip_fix_varnode_metrics:
    for grp in high_varnodes_groups + decomposed_varnodes_groups:
        for opts in opts_sets:
            df = get_table_from_group(grp, opts)
            df["Varnodes fraction partially recovered"] = df.iloc[:,2:6].sum(axis=1) / df.iloc[:,0]
            df["Varnodes fraction exactly recovered"] = df.iloc[:,5] / df.iloc[:,0]
            savepath = get_table_save_path(grp.get_name(), opts)
            df.to_csv(savepath)

def get_varnode_group_average_stats(grp: MetricsGroup) -> pd.Series:
    df = get_table_from_group(grp, opts)
    return df.iloc[:,6:].mean(axis=0)

In [13]:
skip_generate_metatype_level_summaries = False

if not skip_generate_metatype_level_summaries:
    for opts in opts_sets:
        raw_seriess = []
        ratios_seriess = []
        for metatype in metatypes:
            metatype_str = MetaType.repr(metatype)
            grp = varnodes_group_metatype(metatype)
            df = get_table_from_group(grp, opts)
            metatype_varnodes = df.iloc[:,0].sum()
            varnodes_by_levels = df.iloc[:,1:6].sum(axis=0)
            varnodes_by_levels.index = varnode_compare_level_labels
            level_ratios = varnodes_by_levels / metatype_varnodes
            raw_seriess.append(varnodes_by_levels)
            ratios_seriess.append(level_ratios)

        high_raw_df = pd.DataFrame(
            raw_seriess,
            index=[ MetaType.repr(metatype) for metatype in metatypes ],
            columns=varnode_compare_level_labels
        )
        high_raw_tablename = "metatype-match-levels"
        high_raw_savepath = get_table_save_path(high_raw_tablename, opts)
        high_raw_df.to_csv(high_raw_savepath)
        
        high_ratios_df = pd.DataFrame(
            ratios_seriess,
            index=[ MetaType.repr(metatype) for metatype in metatypes ],
            columns=varnode_compare_level_labels
        )
        high_ratios_tablename = "metatype-match-levels-ratios"
        high_ratios_savepath = get_table_save_path(high_ratios_tablename, opts)
        high_ratios_df.to_csv(high_ratios_savepath)

        decomposed_raw_seriess = []
        decomposed_ratios_seriess = []
        for metatype in primitive_metatypes:
            metatype_str = MetaType.repr(metatype)
            grp = decomposed_varnodes_group_metatype(metatype)
            df = get_table_from_group(grp, opts)
            metatype_varnodes = df.iloc[:,0].sum()
            varnodes_by_levels = df.iloc[:,1:6].sum(axis=0)
            varnodes_by_levels.index = varnode_compare_level_labels
            level_ratios = varnodes_by_levels / metatype_varnodes
            decomposed_raw_seriess.append(varnodes_by_levels)
            decomposed_ratios_seriess.append(level_ratios)

        decomposed_raw_df = pd.DataFrame(
            decomposed_raw_seriess,
            index=[ MetaType.repr(metatype) for metatype in primitive_metatypes ],
            columns=varnode_compare_level_labels
        )
        decomposed_raw_tablename = "metatype-match-levels-decomposed"
        decomposed_raw_savepath = get_table_save_path(decomposed_raw_tablename, opts)
        decomposed_raw_df.to_csv(decomposed_raw_savepath)

        decomposed_ratios_df = pd.DataFrame(
            decomposed_ratios_seriess,
            index=[ MetaType.repr(metatype) for metatype in primitive_metatypes ],
            columns=varnode_compare_level_labels
        )
        decomposed_ratios_tablename = "metatype-match-levels-ratios-decomposed"
        decomposed_ratios_savepath = get_table_save_path(decomposed_ratios_tablename, opts)
        decomposed_ratios_df.to_csv(decomposed_ratios_savepath)

def get_metatype_match_levels_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    tablename = "metatype-match-levels"
    if primitive:
        tablename += "-decomposed"
    return load_table_opts(tablename, opts)

def get_metatype_match_levels_ratios_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    tablename = "metatype-match-levels-ratios"
    if primitive:
        tablename += "-decomposed"
    return load_table_opts(tablename, opts)

In [14]:
skip_generate_metatype_recovery_summaries = False

tablename = "metatype-recovery-summary"

if not skip_generate_metatype_recovery_summaries:
    for opts in opts_sets:
        high_rows = {}
        for metatype in metatypes:
            row = {}
            df = get_table_from_group(varnodes_group_metatype(metatype), opts)
            # get the "ground truth" varnodes for metatype
            truth = df.iloc[:,0].sum()

            # get the varnode compare score for metatype
            level_sums = df.iloc[:,1:6].sum(axis=0)
            weights = np.array(list(VarnodeCompareLevel.range()))
            level_sums_weighted = np.multiply(weights, level_sums)
            level_sums_weighted_sum = level_sums_weighted.sum()
            score = level_sums_weighted_sum / (VarnodeCompareLevel.MATCH * truth)
            row["Varnode comparison score [0,1]"] = score

            missed = df.iloc[:,1].sum()
            matched = df.iloc[:,5].sum()
            row["Varnodes fraction partially recovered"] = (truth - missed) / truth
            row["Varnodes fraction exactly recovered"] = matched / truth
            high_rows[MetaType.repr(metatype)] = row

        high_df = pd.DataFrame.from_dict(high_rows, orient='index')
        high_tablename = tablename
        high_savepath = get_table_save_path(high_tablename, opts)
        high_df.to_csv(high_savepath)

        decomposed_rows = {}
        for metatype in primitive_metatypes:
            row = {}
            df = get_table_from_group(decomposed_varnodes_group_metatype(metatype), opts)
            # get the "ground truth" varnodes for metatype
            truth = df.iloc[:,0].sum()

            # get the varnode compare score for metatype
            level_sums = df.iloc[:,1:6].sum(axis=0)
            weights = np.array(list(VarnodeCompareLevel.range()))
            level_sums_weighted = np.multiply(weights, level_sums)
            level_sums_weighted_sum = level_sums_weighted.sum()
            score = level_sums_weighted_sum / (VarnodeCompareLevel.MATCH * truth)
            row["Varnode comparison score [0,1]"] = score

            missed = df.iloc[:,1].sum()
            matched = df.iloc[:,5].sum()
            row["Varnodes fraction partially recovered"] = (truth - missed) / truth
            row["Varnodes fraction exactly recovered"] = matched / truth
            decomposed_rows[MetaType.repr(metatype)] = row

        decomposed_df = pd.DataFrame.from_dict(decomposed_rows, orient='index')
        decomposed_tablename = tablename + "-decomposed"
        decomposed_savepath = get_table_save_path(decomposed_tablename, opts)
        decomposed_df.to_csv(decomposed_savepath)

def get_metatype_recovery_summary_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    tname = "metatype-recovery-summary"
    if primitive:
        tname += "-decomposed"
    return load_table_opts(tname, opts)


In [15]:
skip_generate_metatype_recovery_with_levels_summaries = False

tablename = "metatype-recovery-summary-with-levels"

if not skip_generate_metatype_recovery_with_levels_summaries:
    for opts in opts_sets:
        for primitive in (True, False):
            dfs = [
                get_metatype_match_levels_table(opts, primitive=primitive),
                get_metatype_recovery_summary_table(opts, primitive=primitive)
            ]
            df = pd.concat(
                dfs,
                axis=1
            )
            tname = tablename
            caption = "Summary of varnode recovery for each metatype"
            if primitive:
                tname += "-decomposed"
                caption = "Summary of decomposed varnode recovery for each primitive metatype"
            savepath = get_table_save_path(tname, opts)
            df.to_csv(savepath)
                

def get_metatype_recovery_summary_with_levels_table(
    opts: BuildOptions,
    primitive: bool = False
) -> pd.DataFrame:
    tname = "metatype-recovery-summary-with-levels"
    if primitive:
        tname += "-decomposed"
    return load_table_opts(tname, opts)

In [16]:
skip_generate_opts_varnodes_summary = False

tablename = "opts-varnodes-summary"

if not skip_generate_opts_varnodes_summary:
    for primitive in (True, False):
        grp = decomposed_varnodes_group if primitive else varnodes_group
        _suffix = " (decomposed)" if primitive else ""
        seriess = []
        for opts in opts_sets:
            df = get_table_from_group(grp, opts)
            truth = df.iloc[:,0].sum()
            row = df.iloc[:,1:6].sum(axis=0)
            level_sums = row
            weights = np.array(list(VarnodeCompareLevel.range()))
            level_sums_weighted = np.multiply(weights, level_sums)
            level_sums_weighted_sum = level_sums_weighted.sum()
            score = level_sums_weighted_sum / (VarnodeCompareLevel.MATCH * truth)
            row["Varnode comparison score [0,1]" + _suffix] = score
            missed = df.iloc[:,1].sum()
            matched = df.iloc[:,5].sum()
            row["Varnodes fraction partially recovered" + _suffix] = (truth - missed) / truth
            row["Varnodes fraction exactly recovered" + _suffix] = matched / truth
            seriess.append(row)
        
        tname = tablename
        caption = "Summary of high-level varnode recovery by compilation case"
        if primitive:
            tablename += "-decomposed"
            caption = "Summary of decomposed varnode recovery by compilation case"
        
        savepath = get_table_save_path_generic(tname)
        
        df = pd.DataFrame(
            seriess,
            index=opts_sets_keys
        )
        for colname in df.columns[0:5]:
            df[colname] = df[colname].astype(int)
        df.to_csv(savepath)

def get_opts_varnodes_summary_table(primitive: bool = False) -> pd.DataFrame:
    tname = "opts-varnodes-summary"
    if primitive:
        tname += "-decomposed"
    df = load_table(tname)
    return df

In [17]:
skip_generate_opts_varnodes_summary_metatypes = False

tablename = "opts-varnodes-summary-metatypes"

if not skip_generate_opts_varnodes_summary_metatypes:
    for primitive in (True, False):
        dfs = [ get_metatype_recovery_summary_with_levels_table(opts, primitive=primitive) for opts in opts_sets ]
        df = pd.concat(dfs, keys=opts_sets_keys, axis=0)
        tname = tablename
        caption = "Summary of high-level varnode recovery by compilation case and metatype"
        if primitive:
            tname += "-decomposed"
            caption = "Summary of decomposed varnode recovery by compilation case and primitive metatype"
        savepath = get_table_save_path_generic(tname)
        df.to_csv(savepath)

def get_opts_varnodes_summary_metatypes_table(primitive: bool = False) -> pd.DataFrame:
    tname = "opts-varnodes-summary-metatypes"
    if primitive:
        tname += "-decomposed"
    savepath = get_table_save_path_generic(tname)
    return pd.read_csv(savepath, index_col=[0,1])

In [18]:
skip_generate_opts_functions_summary = False

tablename = "opts-functions-summary"

if not skip_generate_opts_varnodes_summary:
    rows = []
    for opts in opts_sets:
        df = get_table_from_group(functions_group, opts)
        row = df.iloc[:,0:3].sum(axis=0)
        rows.append(row)

    df = pd.DataFrame(rows, index=opts_sets_keys)
    df["Functions recovery fraction"] = df["Functions found"] / df["Ground truth functions"]

    savepath = get_table_save_path_generic(tablename)
    df.to_csv(savepath)

def get_opts_functions_summary_table() -> pd.DataFrame:
    return load_table("opts-functions-summary")

In [19]:
skip_generate_opts_bytes_summary = False

tablename = "opts-bytes-summary"

if not skip_generate_opts_bytes_summary:
    rows = []
    for opts in opts_sets:
        df = get_table_from_group(bytes_group, opts)
        row = df.iloc[:,0:3].sum(axis=0)
        rows.append(row)

    df = pd.DataFrame(rows, index=opts_sets_keys)
    df["Bytes recovery fraction"] = df["Bytes found"] / df["Ground truth data bytes"]

    savepath = get_table_save_path_generic(tablename)
    df.to_csv(savepath)

def get_opts_bytes_summary_table() -> pd.DataFrame:
    return load_table("opts-bytes-summary")

In [20]:
skip_generate_opts_array_comparisons_summary = False

tablename = "opts-array-comparisons-summary"

if not skip_generate_opts_array_comparisons_summary:
    rows = []
    for opts in opts_sets:
        df = get_table_from_group(array_comparisons_group, opts)
        comparisons_col = df["Array comparisons"]
        total_comparisons = comparisons_col.sum()
        series0 = df.iloc[:,0:3].sum(axis=0)
        series1 = df.iloc[:,3:].transform(lambda col: col * comparisons_col, axis=0).sum(axis=0) / total_comparisons
        row = pd.concat((series0, series1), axis=0)
        rows.append(row)
    
    df = pd.DataFrame(
        rows,
        index=opts_sets_keys
    )
    df["Array varnodes inferred as array fraction"] = df["Array varnodes inferred as array"] / df["Ground truth array varnodes"]
    for colname in df.columns[0:3]:
        df[colname] = df[colname].astype(int)
    
    savepath = get_table_save_path_generic(tablename)
    df.to_csv(savepath)

def get_opts_array_comparisons_summary_table() -> pd.DataFrame:
    return load_table("opts-array-comparisons-summary")


In [21]:
skip_generate_opts_overall_summary = False

tablename = "opts-overall-summary"

if not skip_generate_opts_overall_summary:
    df_functions = get_opts_functions_summary_table()
    df_varnodes = get_opts_varnodes_summary_table(primitive=False)
    df_primitive_varnodes = get_opts_varnodes_summary_table(primitive=True)
    df_bytes = get_opts_bytes_summary_table()

    functions_recovery_fraction = df_functions["Functions recovery fraction"]
    varnodes = df_varnodes.iloc[:,5:]
    varnodes_decomposed = df_primitive_varnodes.iloc[:,5:]
    bytes_recovery_fraction = df_bytes["Bytes recovery fraction"]

    df = pd.concat(
        (functions_recovery_fraction, varnodes, varnodes_decomposed, bytes_recovery_fraction),
        axis=1
    )

    savepath = get_table_save_path_generic(tablename)
    df.to_csv(savepath)

def get_opts_overall_summary_table() -> pd.DataFrame:
    return load_table("opts-overall-summary")

In [22]:
df = get_opts_varnodes_summary_metatypes_table()
len(df.columns)

8

In [29]:
skip_fix_metrics_groups_latex = False

if not skip_fix_metrics_groups_latex:
    for opts in opts_sets:
        for grp in metrics_groups:
            # load CSV file
            df = get_table_from_group(grp, opts)

            # save to LATEX file
            latex_path = get_latex_path(grp.get_name(), opts)
            df.to_latex(
                latex_path,
                header=['\\rotatebox{45}{' + "\_".join(c.split("_")) + '}' for c in df.columns],
                escape=False,
                longtable=True,
                label=make_latex_label(grp.get_name() + suffix(opts)),
                caption="{} {}".format(grp.get_display_name(), opts_to_caption_suffix(opts)),
                column_format=latex_column_format_str(df.shape[1] + 1),
                float_format="{:.3f}".format,
                na_rep="-"
            )

            # open the given file
            with latex_path.open("r") as f:
                contents: str = f.read()

            # find the '[' program name and replace with '{[}'
            contents = contents.replace("[ ", "{[} ")
            with latex_path.open("w") as f:
                f.write(contents)


  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(


In [24]:
SummaryLatexTable = namedtuple("SummaryTableLatex", [
    "tablename",
    "caption",
    "multirow",
    "float_format"
], defaults=("", "", False, "{:.3f}".format))

summary_latex_tables = [

    SummaryLatexTable(
        tablename="opts-varnodes-summary",
        caption="Summary of high-level varnode recovery by compilation case"
    ),

    SummaryLatexTable(
        tablename="opts-varnodes-summary-decomposed",
        caption="Summary of decomposed varnode recovery by compilation case"
    ),

    SummaryLatexTable(
        tablename="opts-varnodes-summary-metatypes",
        caption="Summary of high-level varnode recovery by compilation case and metatype",
        multirow=True
    ),

    SummaryLatexTable(
        tablename="opts-varnodes-summary-metatypes-decomposed",
        caption="Summary of decomposed varnode recovery by compilation case and primitive metatype",
        multirow=True
    ),

    SummaryLatexTable(
        tablename="opts-functions-summary",
        caption="Summary of function recovery by compilation case",
        float_format="{:.4f}".format
    ),

    SummaryLatexTable(
        tablename="opts-bytes-summary",
        caption="Summary of data bytes recovery by compilation case"
    ),

    SummaryLatexTable(
        tablename="opts-array-comparisons-summary",
        caption="Summary of array recovery by compilation case"
    ),

    SummaryLatexTable(
        tablename="opts-overall-summary",
        caption="Aggregated recovery summary of functions, varnodes, and data bytes by compilation case"
    )
]

In [30]:
for tbl in summary_latex_tables:
    df = load_table(tbl.tablename, multiindex=tbl.multirow)
    savepath = get_latex_path_generic(tbl.tablename)
    ncols = df.shape[1] + (1 if not tbl.multirow else 2)

    # save to .tex file
    df.to_latex(
        savepath,
        header=['\\rotatebox{45}{' + "\_".join(c.split("_")) + '}' for c in df.columns],
        escape=False,
        label=make_latex_label(tbl.tablename),
        caption=tbl.caption,
        column_format=latex_column_format_str(ncols),
        multirow=tbl.multirow,
        position="t",
        float_format=tbl.float_format,
        na_rep="-"
    )

    # if we want table to span full page, change "table" environment to "table*"
    # open the given file
    with savepath.open("r") as f:
        contents: str = f.read()

    # change table to table*
    contents = contents.replace("table", "table*")
    with savepath.open("w") as f:
        f.write(contents)

  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(
  df.to_latex(


In [None]:
# Functions

display(get_opts_functions_summary_table())

In [None]:
# Varnodes

display(get_opts_varnodes_summary_table())
display(get_opts_varnodes_summary_metatypes_table())

In [None]:
# Decomposed Varnodes

display(get_opts_varnodes_summary_table(primitive=True))
display(get_opts_varnodes_summary_metatypes_table(primitive=True))

In [None]:
# Data Bytes

display(get_opts_bytes_summary_table())

In [None]:
# Summary

display(get_opts_overall_summary_table())

In [None]:
# Array Comparisons

display(get_opts_array_comparisons_summary_table())

In [None]:
prog = CoreutilsProgram("ls")

cmp = load_cmp(prog, debug_opts)

records = []
for record in select_comparable_varnode_compare_records(cmp):
    if record.bytes_overlapped() < record.get_datatype().get_size():
    # if record.get_compare_level() != VarnodeCompareLevel.MATCH:
        records.append(record)
        print(record)

In [None]:
for record in records:
    print(record.get_varnode())
    print(record.get_varnode().get_var().get_parent_function().get_name())
    print(record.get_var())
    for cmp2 in record.get_comparisons():
        print("\t{}".format(cmp2.get_right()))

In [None]:
flipped_cmp = cmp.flip()

def filter_cmp_record(record: VarnodeCompareRecord) -> bool:
    varnode = record.get_varnode()
    var = varnode.get_var()
    fn = var.get_parent_function()
    return var.get_name() == "hbuf"

flipped_records = list(filter(filter_cmp_record, select_comparable_varnode_compare_records(flipped_cmp)))

for record in flipped_records:
    print(record.get_varnode())
    print(record.get_varnode().get_var().get_parent_function().get_name())
    print(record.get_var())
    for cmp2 in record.get_comparisons():
        print("\t{}".format(cmp2.get_right()))