In [1]:
import networkx as nx
import json
import numpy as np
import pandas as pd
from EoN import hierarchy_pos
import plotly.express as px
import plotly.graph_objects as go
import itertools

In [2]:
# Data: https://github.com/Cisco-Talos/binary_function_similarity
# GNN: https://github.com/deepmind/deepmind-research/tree/master/graph_matching_networks

**idb_path**
Path to the IDA database file (`.i64`) of the binary that the function is part of

**fva**
Function virtual address, i.e. the address at which the function starts (appears to be the same as start_ea)

**func_name**
Pulled from object dump file, just the function name

**start_ea/end_ea**
Addresses where function starts/ends

**bb_num**
Count of basic blocks in the function

**bb_list**
Semicolon-separated list of basic blocks

**hashopcodes**
Hash of the function's opcodes, judging by the column name (the exact hashing scheme has not been confirmed here)

In [42]:
# Load the table stored under sim_scores/. setup() looks rows up by position and
# reads the fva_1/fva_2 and idb_path_1/idb_path_2 columns from each row.
simDF = pd.read_csv('sim_scores/pos_testing_Dataset-1_GGSNN_OPC-200_e10.csv')

In [43]:
def get_feature_paths(idb, t='acfg_disasm', db='Dataset-1', testtrain='testing'):
    """Return the relative path of the feature JSON file belonging to an IDB.

    Parameters
    ----------
    idb : str
        '/'-separated IDB path. Only the last component is used, with its
        final four characters (e.g. ".i64") dropped.
    t : str
        Feature type. Any value containing 'acfg', or the exact value 'fss',
        is recognised.
    db : str
        Dataset directory name under ``DBs/``.
    testtrain : str
        Split name used in the directory layout.

    Returns
    -------
    str
        The feature file path, or the sentinel string 'ERR' when ``t`` is not
        a recognised feature type.
    """
    # File name of the IDB without its four-character extension.
    binary_name = idb.rsplit('/', 1)[-1][:-4]
    if 'acfg' in t:
        feature_path = f'DBs/{db}/features/{testtrain}/{t}_{db}_{testtrain}/{binary_name}_{t}.json'
    elif t == 'fss':
        feature_path = f'DBs/{db}/features/{testtrain}/{t}_{db}_{testtrain}/{binary_name}_Capstone_True_fss.json'
    else:
        feature_path = 'ERR'
    return feature_path

In [44]:
def get_full_binary(filepath):
    """Load a feature JSON file and return the value stored under its first key.

    Parameters
    ----------
    filepath : str
        Path of a JSON file whose top level is an object (dict).

    Returns
    -------
    object
        The value of the first top-level key. ``extract_network`` indexes this
        value by function address.

    Raises
    ------
    IndexError
        If the JSON object has no keys.

    Notes
    -----
    A warning is printed when the file holds more than one top-level key,
    because only the first one is returned.
    """
    # The context manager closes the handle even if JSON parsing fails.
    with open(filepath) as handle:
        contents = json.load(handle)
    top_level_keys = list(contents.keys())
    key_count = len(top_level_keys)
    if key_count > 1:
        print(f'Warning! Code contains {key_count} keys')
    return contents[top_level_keys[0]]

In [None]:
def filter_edges(function, debug=True):
    """Drop edges that reference nodes missing from the function's node list.

    Parameters
    ----------
    function : dict
        Must provide 'nodes' (iterable of hashable node ids) and 'edges'
        (iterable of edges, each an iterable of node ids). Modified in place.
    debug : bool
        When true, print every edge that is removed.

    Returns
    -------
    dict
        The same ``function`` object, with 'edges' replaced by a new list that
        holds only the edges whose endpoints all appear in 'nodes'.

    Notes
    -----
    A warning block is printed for every edge endpoint that is not in the
    node list, regardless of ``debug``.
    """
    # Set lookup keeps the membership test O(1) per endpoint.
    known_nodes = set(function['nodes'])
    kept_edges = []
    for edge in function['edges']:
        missing = [endpoint for endpoint in edge if endpoint not in known_nodes]
        for endpoint in missing:
            print()
            print("*** WARNING ****")
            print(f'{endpoint} not in node list')
            print()
        if missing:
            if debug: print(f'Removed {edge} from network')
        else:
            # Each valid edge is kept exactly once, including when no
            # problem nodes exist at all.
            kept_edges.append(edge)
    function['edges'] = kept_edges
    return function

In [51]:
def extract_network(address, binary, filter_e=False):
    """Look up one function's record in a loaded binary.

    Parameters
    ----------
    address : hashable
        Key of the function inside ``binary``.
    binary : mapping
        Mapping from function key to function record.
    filter_e : bool
        When true, the record is passed through ``filter_edges`` first.

    Returns
    -------
    object
        ``binary[address]``, or the result of ``filter_edges`` on it.
    """
    selected = binary[address]
    return filter_edges(selected) if filter_e else selected

In [46]:
def remove_components(G, nodeDF, edgeDF, debug=True):
    """Reduce a disconnected graph to a single connected component.

    When ``G`` has more than one connected component, exactly one is kept:
    the first component (in ``nx.connected_components`` order) that is a
    tree, or, when no component is a tree, the first component. ``nodeDF``
    and ``edgeDF`` are filtered to the kept nodes. A graph with at most one
    component is returned unchanged.

    Parameters
    ----------
    G : networkx graph
        Undirected graph to filter.
    nodeDF : pandas.DataFrame
        Node table indexed by node id.
    edgeDF : pandas.DataFrame
        Edge table with 'source' and 'target' columns.
    debug : bool
        When true, print a trace of the filtering decisions.

    Returns
    -------
    tuple
        ``(G, nodeDF, edgeDF)``; new objects when filtering took place,
        otherwise the inputs themselves.
    """
    components = list(nx.connected_components(G))
    if debug:
        print(f'Found {len(components)} components in the graph')
    if len(components) <= 1:
        return G, nodeDF, edgeDF

    if debug: print(f'Filtering graph of size {len(G)}...')
    trees = []
    subgraphs = []
    for i, node_list in enumerate(components):
        if debug:
            print(f'Component #{i} has {len(node_list)} nodes')
        Gc = G.subgraph(list(node_list))
        if nx.is_tree(Gc):
            if debug: print(f'...and is tree')
            trees.append(Gc)
        else:
            if debug: print(f'...and is NOT a tree')
            subgraphs.append(Gc)
        if debug: print()
    if debug:
        print(f'Found {len(trees)} tree(s) and {len(subgraphs)} non-tree subgraphs')

    # Prefer a tree component; otherwise fall back to the first component.
    kept = trees[0] if trees else subgraphs[0]
    # copy() detaches the result from the subgraph view of the original graph.
    G = kept.copy()
    if debug: print(f'Filtered graph to size {len(G)}')

    kept_nodes = list(G.nodes())
    nodeDF = nodeDF.filter(items=kept_nodes, axis=0)
    # axis must be passed by keyword: pandas 2.x no longer accepts it
    # positionally. copy() lets callers add columns without a
    # SettingWithCopyWarning.
    in_component = edgeDF[['source', 'target']].isin(kept_nodes).any(axis=1)
    edgeDF = edgeDF[in_component].copy()
    return G, nodeDF, edgeDF

In [80]:
def parse_network(network, debug=True, remove_c=False):
    """Build node/edge tables and plot coordinates for one function's graph.

    Parameters
    ----------
    network : dict
        Must provide 'nodes' (node ids), 'edges' (source/target pairs) and
        'basic_blocks' (per-node metadata dicts keyed by ``str(node)``).
    debug : bool
        When true, print progress and lookup-failure messages.
    remove_c : bool
        When true, pass the graph through ``remove_components`` before the
        layout is computed.

    Returns
    -------
    nodeDF : pandas.DataFrame
        One row per node, indexed by node id, holding the basic-block
        metadata plus 'source_count', 'target_count', 'instruction_count'
        (length of 'bb_mnems') and the layout columns 'y' and 'x'.
    edgeDF : pandas.DataFrame
        'source'/'target' plus 'x' and 'y'; each of those is a two-item list
        with the source and target coordinate (None for an unknown node).
    edges : dict
        ``{'x': [...], 'y': [...]}`` where every edge contributes its two
        coordinates followed by None.
    """
    data_cols = ['node','bb_len', 'bb_mnems', 'bb_norm', 'bb_disasm', 'b64_bytes', 'bb_heads', 'source_count', 'target_count']
    edgeDF = pd.DataFrame(network["edges"], columns=['source','target'])

    # Tally how often each node is a source/target once, rather than
    # rescanning the whole edge list for every node.
    source_counts = edgeDF['source'].value_counts().to_dict()
    target_counts = edgeDF['target'].value_counts().to_dict()

    # Collect plain records and build the frame in one go (row-by-row concat
    # is quadratic).
    records = []
    for n in network["nodes"]:
        metadata = network['basic_blocks'][str(n)].copy()
        metadata['node'] = n
        metadata['source_count'] = source_counts.get(n, 0)
        metadata['target_count'] = target_counts.get(n, 0)
        records.append(metadata)
    nodeDF = pd.DataFrame.from_records(records)
    # Expected columns first (created as NaN if absent), then any extras.
    extra_cols = [c for c in nodeDF.columns if c not in data_cols]
    nodeDF = nodeDF.reindex(columns=data_cols + extra_cols)

    nodeDF['instruction_count'] = [len(m) for m in nodeDF['bb_mnems']]
    nodeDF = nodeDF.sort_values(by='target_count').set_index('node')
    G = nx.from_pandas_edgelist(edgeDF)
    if remove_c:
        G, nodeDF, edgeDF = remove_components(G, nodeDF, edgeDF)
        # The filtered edge table may be a slice; copy before adding columns.
        edgeDF = edgeDF.copy()

    # NOTE(review): this picks a node with the fewest outgoing edges. If the
    # intent is the function's entry block, a node with no incoming edges
    # (target_count == 0) may be the better choice - confirm.
    root = nodeDF.sort_values(by='source_count').index[0]
    if debug: print(f'Root node: {root}')
    try:
        pos = hierarchy_pos(G, root=root)
        if debug: print('Hierarchical positioning worked!')
    except Exception:
        # Best-effort fallback when the hierarchical layout cannot be built.
        pos = nx.kamada_kawai_layout(G)
        if debug: print('Hierarchical positioning failed; using Kamada-Kawai layout instead')

    # Nodes absent from the layout (e.g. nodes without any edge) get (-1, -1).
    # NOTE(review): the layout's first coordinate is stored in 'y' and the
    # second in 'x'; presumably deliberate, confirm.
    nodeDF[['y','x']] = [pos.get(n, (-1, -1)) for n in nodeDF.index]
    nodeLookup = nodeDF.to_dict(orient='index')

    # Resolve both endpoints of every edge once; unknown nodes yield None so
    # the edge is still present but has no drawable coordinate.
    x_pairs = []
    y_pairs = []
    for s, t in zip(edgeDF['source'], edgeDF['target']):
        endpoints = []
        for role, node in (('source', s), ('target', t)):
            entry = nodeLookup.get(node)
            if entry is None and debug:
                print(f'Failed to find {role} {node} in node list!')
            endpoints.append(entry)
        x_pairs.append([None if e is None else e['x'] for e in endpoints])
        y_pairs.append([None if e is None else e['y'] for e in endpoints])
    edgeDF['x'] = x_pairs
    edgeDF['y'] = y_pairs

    # Flatten to one coordinate list per axis, with None after each edge.
    edges = {'x':[], 'y':[]}
    for x_pair, y_pair in zip(x_pairs, y_pairs):
        edges['x'].extend(x_pair)
        edges['x'].append(None)
        edges['y'].extend(y_pair)
        edges['y'].append(None)
    return nodeDF, edgeDF, edges

In [84]:
def make_network_fig(nodeDF, edges, meta=None):
    """Draw the graph as a plotly figure with one edge trace and one node trace.

    Parameters
    ----------
    nodeDF : pandas.DataFrame
        Node table with 'x', 'y', 'bb_mnems', 'instruction_count' and
        'target_count' columns.
    edges : dict
        ``{'x': [...], 'y': [...]}`` coordinate lists for the edge line trace.
    meta : dict, optional
        When given, its 'function', 'path' and 'arch' entries replace the
        default title text.

    Returns
    -------
    plotly.graph_objects.Figure
        Nodes are coloured by 'instruction_count'; nodes whose 'target_count'
        is 0 get a red outline, all others a black one.
    """
    # Axis decoration is switched off for both axes.
    hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
    fig = go.Figure(layout=go.Layout(
        # Nested title/font form instead of the deprecated `titlefont_size`.
        title=dict(text='Network graph made with Python', font=dict(size=16)),
        showlegend=False,
        hovermode='closest',
        margin=dict(b=20, l=5, r=5, t=40),
        xaxis=dict(hidden_axis),
        yaxis=dict(hidden_axis),
        template='plotly_white',
    ))
    fig.add_trace(
        go.Scatter(x=edges['x'], y=edges['y'], mode='lines',
                   line=dict(width=2, color='#888'),
                   hoverinfo='none'
                  )
    )
    fig.add_trace(
        go.Scatter(x=nodeDF['x'], y=nodeDF['y'], text=[str(m) for m in nodeDF['bb_mnems']], mode='markers',
                   marker=dict(
                        showscale=True,
                        colorscale='YlGnBu',
                        reversescale=False,
                        color=nodeDF['instruction_count'],
                        size=20,
                        colorbar=dict(
                            thickness=15,
                            # Nested title form instead of the deprecated `titleside`.
                            title=dict(text='Instruction Count', side='right'),
                            xanchor='left',
                        ),
                        line=dict(
                            color=['red' if c==0 else 'black' for c in nodeDF['target_count']],
                            width=4
                        ))
                ))
    if meta is not None:
        # Update only the text so the configured title font size is kept.
        fig.update_layout(title_text=f"Function: {meta['function']}<br>IDB Path: {meta['path']}<br>Arch: {meta['arch']}")
    return fig

In [85]:
def setup(i):
    """Build the two network figures for row ``i`` of ``simDF``.

    For each side of the pair (column suffixes 1 and 2) the function address
    and IDB path are read from the row, the matching feature file is loaded,
    the function's network is parsed and a figure is created.

    Parameters
    ----------
    i : int
        Positional row index into ``simDF``.

    Returns
    -------
    list
        Two figures, in the order of column suffixes 1 and 2.
    """
    pair_row = simDF.iloc[i]
    figures = []
    for side in (1, 2):
        meta = {
            'function': pair_row[f'fva_{side}'],
            'path': pair_row[f'idb_path_{side}'],
        }
        # File name of the IDB without its four-character extension.
        idb_file = meta['path'].split('/')[-1]
        meta['arch'] = idb_file[:-4]
        print(meta)
        print(f"---- Looking up function {meta['function']} ----")
        feature_file = get_feature_paths(meta['path'])
        binary = get_full_binary(feature_file)
        network = extract_network(meta['function'], binary)
        nodeDF, edgeDF, edges = parse_network(network)
        figure = make_network_fig(nodeDF, edges, meta=meta)
        figures.append(figure)
        print()
    return figures

In [86]:
# Show the figures returned by setup() (two per row) for rows 0-9 of simDF.
for i in range(0,10):
    for f in setup(i):
        f.show()

{'function': '0xa2cb80', 'path': 'IDBs/Dataset-1/z3/x64-clang-9-O1_z3.i64', 'arch': 'x64-clang-9-O1_z3'}
---- Looking up function 0xa2cb80 ----
Root node: 10669824
Hierarchical positioning failed; using spring layout instead

{'function': '0xc60c60', 'path': 'IDBs/Dataset-1/z3/x64-gcc-4.8-O1_z3.i64', 'arch': 'x64-gcc-4.8-O1_z3'}
---- Looking up function 0xc60c60 ----
Root node: 12980405
Hierarchical positioning failed; using spring layout instead



{'function': '0x9fde10', 'path': 'IDBs/Dataset-1/z3/arm64-gcc-7-O2_z3.i64', 'arch': 'arm64-gcc-7-O2_z3'}
---- Looking up function 0x9fde10 ----
Root node: 10477100
Hierarchical positioning failed; using spring layout instead

{'function': '0x99a264', 'path': 'IDBs/Dataset-1/z3/arm64-clang-9-O2_z3.i64', 'arch': 'arm64-clang-9-O2_z3'}
---- Looking up function 0x99a264 ----
Root node: 10068752
Hierarchical positioning failed; using spring layout instead



{'function': '0x80d6a0', 'path': 'IDBs/Dataset-1/z3/x64-clang-3.5-O1_z3.i64', 'arch': 'x64-clang-3.5-O1_z3'}
---- Looking up function 0x80d6a0 ----
Root node: 8443928
Hierarchical positioning failed; using spring layout instead

{'function': '0x98bf9a', 'path': 'IDBs/Dataset-1/z3/x64-gcc-4.8-O1_z3.i64', 'arch': 'x64-gcc-4.8-O1_z3'}
---- Looking up function 0x98bf9a ----
Root node: 10010855
Hierarchical positioning failed; using spring layout instead



{'function': '0xc11700', 'path': 'IDBs/Dataset-1/z3/arm32-clang-9-O0_z3.i64', 'arch': 'arm32-clang-9-O0_z3'}
---- Looking up function 0xc11700 ----
Root node: 12654636
Hierarchical positioning failed; using spring layout instead

{'function': '0xaee38c', 'path': 'IDBs/Dataset-1/z3/arm32-gcc-7-O0_z3.i64', 'arch': 'arm32-gcc-7-O0_z3'}
---- Looking up function 0xaee38c ----
Root node: 11461872
Hierarchical positioning failed; using spring layout instead



{'function': '0x28ebb0', 'path': 'IDBs/Dataset-1/nmap/arm64-clang-9-O1_nping.i64', 'arch': 'arm64-clang-9-O1_nping'}
---- Looking up function 0x28ebb0 ----
Root node: 2686400
Hierarchical positioning failed; using spring layout instead

{'function': '0x658d4', 'path': 'IDBs/Dataset-1/nmap/arm64-gcc-7-O1_nping.i64', 'arch': 'arm64-gcc-7-O1_nping'}
---- Looking up function 0x658d4 ----
Root node: 419984
Hierarchical positioning failed; using spring layout instead



{'function': '0x1202208', 'path': 'IDBs/Dataset-1/z3/mips32-clang-3.5-O2_z3.i64', 'arch': 'mips32-clang-3.5-O2_z3'}
---- Looking up function 0x1202208 ----
Root node: 18883304
Hierarchical positioning failed; using spring layout instead

{'function': '0x183119c', 'path': 'IDBs/Dataset-1/z3/mips32-gcc-7-O2_z3.i64', 'arch': 'mips32-gcc-7-O2_z3'}
---- Looking up function 0x183119c ----
Root node: 25367024
Hierarchical positioning failed; using spring layout instead



{'function': '0x13154e0', 'path': 'IDBs/Dataset-1/z3/mips64-clang-3.5-O3_z3.i64', 'arch': 'mips64-clang-3.5-O3_z3'}
---- Looking up function 0x13154e0 ----
Root node: 20010592
Hierarchical positioning failed; using spring layout instead

{'function': '0x121654a70', 'path': 'IDBs/Dataset-1/z3/mips64-gcc-9-O3_z3.i64', 'arch': 'mips64-gcc-9-O3_z3'}
---- Looking up function 0x121654a70 ----
Root node: 4855253960
Hierarchical positioning failed; using spring layout instead



{'function': '0x199aa3c', 'path': 'IDBs/Dataset-1/z3/mips32-gcc-9-O3_z3.i64', 'arch': 'mips32-gcc-9-O3_z3'}
---- Looking up function 0x199aa3c ----
Root node: 26848380
Hierarchical positioning failed; using spring layout instead

{'function': '0x1434930', 'path': 'IDBs/Dataset-1/z3/mips32-clang-9-O3_z3.i64', 'arch': 'mips32-clang-9-O3_z3'}
---- Looking up function 0x1434930 ----
Root node: 21187432
Hierarchical positioning failed; using spring layout instead



{'function': '0x1211b4', 'path': 'IDBs/Dataset-1/z3/arm32-gcc-5-O2_z3.i64', 'arch': 'arm32-gcc-5-O2_z3'}
---- Looking up function 0x1211b4 ----
Root node: 1188436
Hierarchical positioning failed; using spring layout instead

{'function': '0x24acc4', 'path': 'IDBs/Dataset-1/z3/arm32-clang-9-O2_z3.i64', 'arch': 'arm32-clang-9-O2_z3'}
---- Looking up function 0x24acc4 ----
Root node: 2404272
Hierarchical positioning failed; using spring layout instead



{'function': '0x429b34', 'path': 'IDBs/Dataset-1/z3/arm32-gcc-5-O3_z3.i64', 'arch': 'arm32-gcc-5-O3_z3'}
---- Looking up function 0x429b34 ----
Root node: 4365396
Hierarchical positioning failed; using spring layout instead

{'function': '0x4b2720', 'path': 'IDBs/Dataset-1/z3/arm32-clang-5.0-O3_z3.i64', 'arch': 'arm32-clang-5.0-O3_z3'}
---- Looking up function 0x4b2720 ----
Root node: 4925320
Hierarchical positioning failed; using spring layout instead
Failed to find target 4925388 in node list!
Failed to find target 4925388 in node list!

