In [1]:
##############################################################################
#                                                                            #
#  Code for the USENIX Security '22 paper:                                   #
#  How Machine Learning Is Solving the Binary Function Similarity Problem.   #
#                                                                            #
#  MIT License                                                               #
#                                                                            #
#  Copyright (c) 2019-2022 Cisco Talos                                       #
#                                                                            #
#  Permission is hereby granted, free of charge, to any person obtaining     #
#  a copy of this software and associated documentation files (the           #
#  "Software"), to deal in the Software without restriction, including       #
#  without limitation the rights to use, copy, modify, merge, publish,       #
#  distribute, sublicense, and/or sell copies of the Software, and to        #
#  permit persons to whom the Software is furnished to do so, subject to     #
#  the following conditions:                                                 #
#                                                                            #
#  The above copyright notice and this permission notice shall be            #
#  included in all copies or substantial portions of the Software.           #
#                                                                            #
#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,           #
#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF        #
#  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND                     #
#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE    #
#  LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION    #
#  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION     #
#  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.           #
#                                                                            #
#  Dataset-1 creation                                                        #
#                                                                            #
##############################################################################

In [2]:
import json
import numpy as np
import os
import pandas as pd
import random
import shutil

from collections import Counter
from collections import defaultdict
from itertools import chain
from itertools import compress
from tqdm import tqdm

def pdcsv(x):
    """Read the CSV at `x` into a DataFrame, using its first column as the index."""
    return pd.read_csv(x, index_col=0)

The following table summarizes the criteria used to generate positive pairs for each task:
* An `X` indicates that the variable is required to be different in each pair.
* An empty cell indicates that the variable is required to be the same in each pair.
* A `*` indicates that the variable is free: it may differ, but it is not required to.

```
|       | Architecture | Bitness | Compiler | Version | Optimization |
|-------|--------------|---------|----------|---------|--------------|
| arch  | X            |         |          |         |              |
| bit   |              | X       |          |         |              |
| comp  |              |         | X        | X       |              |
| ver   |              |         |          | X       |              |
| opt   |              |         |          |         | X            |
| XA    | X            | X       |          |         |              |
| XA+XO | X            | X       |          |         | X            |
| XC    |              |         | X        | X       | X            |
| XC+XB |              | X       | X        | X       | X            |
| XM    | *            | *       | *        | *       | *            |
```

In [4]:
# Compilation variables that describe each function.
# The boolean masks in TASKS_DICT are positional: the i-th flag of a mask
# refers to the i-th entry of this list.
CATEGORIES = [
    "project",
    "library",
    "arch",
    "bit",
    "compiler",
    "version",
    "optimizations",
]

# Task name -> mask over CATEGORIES used to select the positive pairs.
# The "XM" task has no entry: the pair-generation functions treat it as
# "any combination is valid".
TASKS_DICT = {
    # For any positive pair, the project and the library are the same.
    #   True: the variable is required to have the same value in the positive pair
    #   False: the variable is required to have different values in the positive pair.
    "arch": [
        True, True, False, True, True, True, True],
    "bit": [
        True, True, True, False, True, True, True],
    "comp": [
        True, True, True, True, False, False, True],
    "ver": [
        True, True, True, True, True, False, True],
    "opt": [
        True, True, True, True, True, True, False],
    "XA": [
        True, True, False, False, True, True, True],
    "XA+XO": [
        True, True, False, False, True, True, False],
    "XC": [
        True, True, True, True, False, False, False],
    "XC+XB": [
        True, True, True, False, False, False, False],
    # The following would be the XA+XC test
    # "XA+XC": [
    #    True, True, False, False, False, False, False]
}

# The XO test is the same as the opt one.
# (Both keys reference the same list object.)
TASKS_DICT["XO"] = TASKS_DICT["opt"]

In [5]:
# Configuration of Dataset-1.
#   "projects": the project names assigned to each split
#       (matched against the "project" column extracted from idb_path).
#   "eval": for each evaluated split, task name -> number of positive pairs
#       requested from the pair-generation functions.
#       "similarity" and "rank" are generated separately; in this notebook the
#       "rank" tasks are generated with num_negatives=100.
DATASET_ONE_DICT = {
    "projects": {
        "training": ["openssl", "clamav", "curl", "unrar"],
        "validation": ["zlib"],
        "test": ["z3", "nmap"],
    },
    "eval": {
        "validation": {
            "similarity": {"XA": 10000, "XC": 10000, "XC+XB": 10000, "XM": 10000}
        },
        "test": {
            "similarity": {
                "XA": 50000,
                "XC": 50000,
                "XC+XB": 50000,
                "XM": 50000,
                "arch": 50000,
                "bit": 50000,
                "comp": 50000,
                "opt": 50000,
                "ver": 50000,
            },
            "rank": {"XA": 200, "XC": 200, "XC+XB": 200, "XM": 200},
        },
    }
}

In [8]:
# Show the current working directory: the relative paths used in this
# notebook (OUTPUT_DIR, CSV_FLOWCHART_FP) are resolved against it.
os.getcwd()

'/home/ivan/Workspace/binary_function_similarity/DBs/Dataset-1'

In [9]:
# where to save the new dataset
OUTPUT_DIR = "../Dataset-1-new/"

# Folder layout of the new dataset, listed in creation order:
#   the root, one "pairs" folder per evaluated split,
#   and one "features" folder per split.
required_dirs = [OUTPUT_DIR]
required_dirs += [
    os.path.join(OUTPUT_DIR, "pairs", split) for split in ('validation', 'testing')
]
required_dirs += [
    os.path.join(OUTPUT_DIR, "features", split)
    for split in ('training', 'validation', 'testing')
]

# Create only the folders that are missing, and report each creation
for dir_path in required_dirs:
    if os.path.isdir(dir_path):
        continue
    os.makedirs(dir_path)
    print(f"[D] DIR created: {dir_path}")

### Create a training / validation / test split

In [10]:
# The starting point
CSV_FLOWCHART_FP = "features/flowchart_Dataset-1.csv"

# Copy the flowchart file to the new folder
# (shutil.copy returns the destination path, which the cell displays)
flowchart_copy_fp = os.path.join(OUTPUT_DIR, "features", "flowchart_Dataset-1.csv")
shutil.copy(CSV_FLOWCHART_FP, flowchart_copy_fp)

'../Dataset-1-new/features/flowchart_Dataset-1.csv'

Summary:
   * Step 0 - Read the list of functions from the output of IDA flowchart
   * Step 1 - Filter out the functions with fewer than 5 basic blocks (BBs)
   * Step 2 - Remove duplicated hashopcodes to remove duplicated functions
   * Step 3 - Extract compilation variables from idb_path
   * Step 4 - Create training, validation and test splits
   * Step 5 - Remove common function names across splits
   * Step 6 - Remove singleton functions.

In [11]:
# Step0 - Read the list of functions from the output of IDA flowchart
# (the CSV located at CSV_FLOWCHART_FP) and report its size.
df = pd.read_csv(CSV_FLOWCHART_FP)
print(f"Shape: {df.shape}")

Shape: (8664141, 8)


In [13]:
# Inspect the columns available in the flowchart CSV.
df.columns

Index(['idb_path', 'fva', 'func_name', 'start_ea', 'end_ea', 'bb_num',
       'bb_list', 'hashopcodes'],
      dtype='object')

In [14]:
# Remove the column with the list of basic-blocks
# (it is not referenced by any of the following steps of this notebook).
del df['bb_list']
print(f"Shape: {df.shape}")

Shape: (8664141, 7)


In [15]:
# Step1 - Filter out the functions with fewer than 5 basic blocks (BBs),
# i.e. keep only the rows with bb_num >= 5.
df = df[df['bb_num'] >= 5]
print(f"Shape: {df.shape}")

Shape: (8664141, 7)


In [16]:
# Step2 - Remove duplicated hashopcodes to remove duplicated functions.
# Only the first row of each hashopcodes value is kept.
# The result is re-assigned instead of using inplace=True: df is itself the
# outcome of a filter, and in-place edits of such frames can raise a
# SettingWithCopyWarning.
df = df.drop_duplicates('hashopcodes', keep='first')
print(f"Shape: {df.shape}")

Shape: (3296093, 7)


In [17]:
# Step3 - Extract compilation variables from idb_path
def _parse_idb_path(path):
    """
    Split an idb_path into its compilation variables.

    The third "/"-separated component of the path is the project; the fourth
    one has the form "<arch>-<compiler>-<version>-<optimization>_<library>.i64".

    Returns the list [project, library, arch, bit, compiler, version, optimization].
    """
    fields = path.split("/")[2:]
    project = fields[0]

    name_parts = fields[1].split("_")
    library = name_parts[1].replace(".i64", "")
    arch, comp, ver, opt = name_parts[0].split("-")

    # "86" is rewritten to "32" first, so that e.g. "x86" is reported as 32 bit
    bit = "32" if "32" in arch.replace("86", "32") else "64"
    # Strip the bitness digits from the architecture name (e.g. "arm32" -> "arm")
    arch = arch.replace("32", "").replace("64", "").replace("86", "")

    # gcc versions get a "gcc_" prefix
    if comp == "gcc":
        ver = "gcc_" + ver

    return [project, library, arch, bit, comp, ver, opt]


# Convert to NumPy Array (one row per function, one column per variable)
compilation_var = np.array([_parse_idb_path(path) for path in df['idb_path']])

# Add compilation variables to the DataFrame
new_columns = [
    'project',
    'library',
    'arch',
    'bit',
    'compiler',
    'version',
    'optimizations',
]
for position, column in enumerate(new_columns):
    df[column] = compilation_var[:, position].tolist()

print(f"Shape: {df.shape}")

Shape: (3296093, 14)


In [18]:
# Step4 - Create training, validation and test splits
# Each split collects the rows whose project belongs to the corresponding
# list in DATASET_ONE_DICT. The index is reset right away with a chained
# reset_index(drop=True) (no inplace=True), so that every split is an
# independent frame with a 0..N-1 index and not a slice of df: in-place edits
# of a slice (e.g. the drop in Step6) raise a SettingWithCopyWarning.
df_training = df[
    df['project'].isin(DATASET_ONE_DICT['projects']['training'])
].reset_index(drop=True)
print(f"Shape df_training: \t{df_training.shape}")

df_validation = df[
    df['project'].isin(DATASET_ONE_DICT['projects']['validation'])
].reset_index(drop=True)
print(f"Shape df_validation: \t{df_validation.shape}")

df_test = df[
    df['project'].isin(DATASET_ONE_DICT['projects']['test'])
].reset_index(drop=True)
print(f"Shape df_test: \t\t{df_test.shape}")

Shape df_training: 	(849874, 14)
Shape df_validation: 	(22118, 14)
Shape df_test: 		(2424101, 14)


In [19]:
# Step5 - Remove common function names across splits

# Function names of each split, as found before any removal
train_names = set(df_training['func_name'].values)
validation_names = set(df_validation['func_name'].values)
test_names = set(df_test['func_name'].values)

# Check for common function names in training and test
r1 = train_names & test_names
print(f"# function names to remove: {len(r1)} (train & test)")

# Check for common function names in training and validation
r2 = train_names & validation_names
print(f"# function names to remove: {len(r2)} (train & validation)")

# The shared names are removed from the training split
shared_with_training = df_training['func_name'].isin(r1 | r2)
df_training = df_training[~shared_with_training].reset_index(drop=True)
print(f"Shape df_training: \t{df_training.shape}")

print()

# Check for common function names in validation and test
r3 = validation_names & test_names
print(f"# function names to remove: {len(r3)} (validation & test)")

# The names shared with validation are removed from the test split
shared_with_validation = df_test['func_name'].isin(r3)
df_test = df_test[~shared_with_validation].reset_index(drop=True)
print(f"Shape df_test: \t\t{df_test.shape}")

# function names to remove: 37 (train & test)
# function names to remove: 8 (train & validation)
Shape df_training: 	(847120, 14)

# function names to remove: 5 (validation & test)
Shape df_test: 		(2423315, 14)


In [20]:
# Step6 - Remove singleton functions
def _drop_singleton_functions(df_t):
    """
    Return a new DataFrame without the singleton functions of df_t.

    A function is a singleton when its (library, func_name) couple appears
    in fewer than 2 rows. The returned frame has a fresh 0..N-1 index.
    The number of removed rows and the resulting shape are printed.

    The input frame is left untouched: dropping rows in place through a loop
    alias raises a SettingWithCopyWarning when the frame is a slice of another one.
    """
    # (library, func_name) couples that appear only once
    counts = df_t[["library", "func_name"]].value_counts()
    singleton_keys = [key for key, count in counts.items() if count < 2]

    # Index labels of the rows that belong to those couples
    groups = df_t.groupby(["library", "func_name"]).groups
    idx_list = list(chain.from_iterable(groups[key] for key in singleton_keys))
    print(f"[D] # function to remove: {len(idx_list)}")

    df_out = df_t.drop(idx_list).reset_index(drop=True)
    print(f"[D] Shape: {df_out.shape}\n")
    return df_out


df_training = _drop_singleton_functions(df_training)
df_validation = _drop_singleton_functions(df_validation)
df_test = _drop_singleton_functions(df_test)

[D] # function to remove: 2279
[D] Shape: (844841, 14)

[D] # function to remove: 28


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


[D] Shape: (22090, 14)
[D] # function to remove: 27231
[D] Shape: (2396084, 14)


In [21]:
# Report the size of each split after the filtering steps.
print(f"Shape df_training: \t{df_training.shape}")
print(f"Shape df_validation: \t{df_validation.shape}")
print(f"Shape df_test: \t\t{df_test.shape}")

Shape df_training: 	(844841, 14)
Shape df_validation: 	(22090, 14)
Shape df_test: 		(2396084, 14)


### Create positive and negative pairs for validation and test

In [22]:
def create_similarity_pairs(df_input, num_pairs, test):
    """
    Generate "num_pairs" positive function pairs by sub sampling all the
    possible function combinations. Use this function when the number
    of ((libraries, function_names)) is limited to few hundreds.

    Every positive pair is matched with exactly one negative pair that shares
    the same left function, so the two returned lists have the same length
    and are aligned position by position.

    Args:
        df_input: DataFrame of functions with the "library", "func_name" and
            CATEGORIES columns. Rows are read with .iloc using the labels
            returned by groupby/.index, so the index is assumed to be the
            default 0..N-1 range - TODO confirm for callers outside this notebook.
        num_pairs: maximum number of positive (and negative) pairs to return.
        test: name of the task. Either a key of TASKS_DICT or "XM"
            (no constraint on the compilation variables).

    Returns:
        (pos_pair_list, neg_pair_list): two lists of (left, right) tuples of
        row indexes of df_input.
    """
    # Map (libraries, function_names) to the indexes in the DB
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }

    # The sets store the pairs with sorted members, so that (a, b) and (b, a)
    # are recognized as the same pair. The lists keep the insertion order.
    pos_pair_set = set()
    neg_pair_set = set()
    pos_pair_list = list()
    neg_pair_list = list()

    # Iterate over each library/func_name pair
    for entry in tqdm(libfunc_dict.keys(), ncols=100):
        libname, fname = entry

        # Get the list of indexes associated to the ((libname, fname)) pair
        idx_libfunc = libfunc_dict[entry]
        # DataFrame for the library/func_name pair
        df_libfunc = df_input.iloc[idx_libfunc]

        # Get the list of indexes to select negative pairs:
        # any function with a different name, whatever its library
        idx_list_neg = df_input[df_input["func_name"] != fname].index

        # (<-- left) Iterate over each function for the ((libname, fname)) pair
        for idx_left_p in idx_libfunc:

            # Extract the compilation variables
            comp_data = df_input.iloc[idx_left_p][CATEGORIES].values

            # For the XM test, any combination is valid
            idx_list_pos = idx_libfunc

            if test != "XM":
                mask = TASKS_DICT[test]
                # Build the constraints dict
                #   if m is True: the variable is required to be the same in the positive pair
                fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if m}
                constraints = [(df_libfunc[k] == v) for k, v in fd.items()]
                #   if m is False: the variable is required to be different in the positive pair.
                fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if not m}
                constraints += [(df_libfunc[k] != v) for k, v in fd.items()]

                # Get the list of indexes of candidate right functions to generate positive pairs
                idx_list_pos = df_libfunc[np.logical_and.reduce(constraints)].index

            # Remove the left function from the list
            idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]

            # Iterate over each (--> right) function
            for idx_right_p in idx_list_pos:
                pos_pair = (idx_left_p, idx_right_p)

                # Check if the pos_pair is already in the list
                if tuple(sorted(pos_pair)) not in pos_pair_set:
                    pos_pair_set.add(tuple(sorted(pos_pair)))
                    pos_pair_list.append(pos_pair)

                    # Generate the corresponding negative pair
                    # NOTE(review): the loop retries until an unused negative
                    # pair is drawn; it presumably relies on idx_list_neg being
                    # much larger than the number of pairs needed - confirm.
                    is_success = False
                    while not is_success:
                        idx_right_n = random.choice(idx_list_neg)
                        neg_pair = (idx_left_p, idx_right_n)

                        # Check if the neg_pair is already in the list
                        if tuple(sorted(neg_pair)) not in neg_pair_set:
                            neg_pair_set.add(tuple(sorted(neg_pair)))
                            neg_pair_list.append(neg_pair)
                            is_success = True

    # print(
    #     f"[D] Before sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
    # )

    # Sub sample the positive and negative pairs to num_pairs.
    # The same positions are sampled from both lists, so each positive pair
    # stays matched with its negative pair.
    if len(pos_pair_list) > num_pairs:
        sampled_list = random.sample(list(range(len(pos_pair_list))), num_pairs)
        pos_pair_list = [pos_pair_list[x] for x in sampled_list]
        neg_pair_list = [neg_pair_list[x] for x in sampled_list]
        # print(
        #     f"[D] After sampling - pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}"
        # )

    return pos_pair_list, neg_pair_list

In [23]:
def create_similarity_pairs_random_version(df_input, num_pairs, test, num_negatives=1):
    """
    Randomly generate "num_pairs" positive function pairs. Use this function
    when the number of (libraries, function_names) pairs is > 1 thousand.

    For every positive pair, "num_negatives" negative pairs sharing the same
    left function are generated, so the negative list contains
    num_pairs * num_negatives entries.

    Args:
        df_input: DataFrame of functions with the "library", "func_name" and
            CATEGORIES columns. Rows are read with .iloc using the labels
            returned by groupby/.index, so the index is assumed to be the
            default 0..N-1 range - TODO confirm for callers outside this notebook.
        num_pairs: number of positive pairs to generate.
        test: name of the task. Either a key of TASKS_DICT or "XM"
            (no constraint on the compilation variables).
        num_negatives: number of negative pairs generated for each positive pair.

    Returns:
        (pos_pair_list, neg_pair_list): two lists of (left, right) tuples of
        row indexes of df_input.
    """
    # Map (libraries, function_names) to the indexes in the DB
    libfunc_dict = {
        k: list(v) for k, v in df_input.groupby(["library", "func_name"]).groups.items()
    }
    libfunc_list = list(libfunc_dict.keys())

    # The sets store the pairs with sorted members, so that (a, b) and (b, a)
    # are recognized as the same pair. The lists keep the insertion order.
    pos_pair_set = set()
    neg_pair_set = set()
    pos_pair_list = list()
    neg_pair_list = list()

    with tqdm(total=num_pairs, ncols=100) as pbar:
        # Iterate num_pairs time to create the pos/neg function pairs
        for _ in range(num_pairs):

            # Iterate until a positive function pair is generated
            # NOTE(review): there is no retry limit; the loop presumably relies
            # on df_input offering many more valid pairs than num_pairs - confirm.
            is_success_pos = False
            while not is_success_pos:

                # Randomly select a library/func_name pair
                entry = random.choice(libfunc_list)
                libname, fname = entry
                # Get the list of indexes associated to the library/func_name pair
                idx_libfunc = libfunc_dict[entry]
                # DataFrame for the library/func_name pair
                df_libfunc = df_input.iloc[idx_libfunc]

                # Randomly select a (<-- left) function
                idx_left_p = random.choice(idx_libfunc)
                # Extract the compilation variables
                comp_data = df_input.iloc[idx_left_p][CATEGORIES].values

                # For the XM test, any combination is valid
                idx_list_pos = idx_libfunc

                if test != "XM":
                    mask = TASKS_DICT[test]
                    # Build the constraints dict
                    #   if m is True: the variable is required to be the same in the positive pair
                    fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if m}
                    constraints = [(df_libfunc[k] == v) for k, v in fd.items()]
                    #   if m is False: the variable is required to be different in the positive pair.
                    fd = {c: v for m, c, v in zip(mask, CATEGORIES, comp_data) if not m}
                    constraints += [(df_libfunc[k] != v) for k, v in fd.items()]

                    # Get the list of indexes of candidate right functions to generate positive pairs
                    idx_list_pos = df_libfunc[np.logical_and.reduce(constraints)].index

                # Remove the left function from the list
                idx_list_pos = [idx for idx in idx_list_pos if idx != idx_left_p]

                # No functions are left. Retry
                if len(idx_list_pos) == 0:
                    continue

                # Randomly select a (--> right) function
                idx_right_p = random.choice(idx_list_pos)
                pos_pair = (idx_left_p, idx_right_p)
                if tuple(sorted(pos_pair)) not in pos_pair_set:
                    pos_pair_set.add(tuple(sorted(pos_pair)))
                    pos_pair_list.append(pos_pair)
                    is_success_pos = True

                    for _ in range(num_negatives):
                        # Generate the corresponding negative pair
                        is_success_neg = False
                        while not is_success_neg:
                            # Draw any row of df_input and reject it when it
                            # has the same function name as the left function
                            idx_right_n = random.randint(0, df_input.shape[0] - 1)
                            if df_input.iloc[idx_right_n]["func_name"] == fname:
                                continue
                            neg_pair = (idx_left_p, idx_right_n)

                            # Check if the neg_pair is already in the list
                            if tuple(sorted(neg_pair)) not in neg_pair_set:
                                neg_pair_set.add(tuple(sorted(neg_pair)))
                                neg_pair_list.append(neg_pair)
                                is_success_neg = True

            # Update the progress bar
            pbar.update(1)

        # print(f"[D] pos: {len(pos_pair_list)} - neg: {len(neg_pair_list)}")

    return pos_pair_list, neg_pair_list


In [24]:
def convert_dicts_into_dataframes(df_input, dataset_dict):
    """
    Convert the positive and negative index pairs into two DataFrames.

    Args:
        df_input: DataFrame with (at least) the "idb_path", "fva" and
            "func_name" columns. The members of each pair are row
            positions of this frame (they are resolved with .iloc).
        dataset_dict: {task: {"pos": [(left, right), ...],
                              "neg": [(left, right), ...]}}

    Returns:
        (df_pos, df_neg): one row per pair, with the columns idb_path_1,
        fva_1, func_name_1, idb_path_2, fva_2, func_name_2 and db_type
        (the task name). Rows follow the task order of dataset_dict and,
        within a task, the order of the pairs. When there are no pairs,
        an empty DataFrame with the same columns is returned.
    """
    info_columns = ["idb_path", "fva", "func_name"]
    pair_columns = [
        "idb_path_1",
        "fva_1",
        "func_name_1",
        "idb_path_2",
        "fva_2",
        "func_name_2",
        "db_type",
    ]

    def _pairs_to_dataframe(label):
        # Build the DataFrame for the "pos" or the "neg" pairs of every task.
        left_idx, right_idx, task_names = list(), list(), list()
        for task in dataset_dict:
            for pair in dataset_dict[task][label]:
                left_idx.append(pair[0])
                right_idx.append(pair[1])
                task_names.append(task)

        # One lookup per side instead of one per cell: selecting the rows
        # one value at a time is very slow on large frames.
        df_left = df_input.iloc[left_idx][info_columns].reset_index(drop=True)
        df_right = df_input.iloc[right_idx][info_columns].reset_index(drop=True)
        df_left.columns = [c + "_1" for c in info_columns]
        df_right.columns = [c + "_2" for c in info_columns]

        df_pairs = pd.concat([df_left, df_right], axis=1)
        df_pairs["db_type"] = task_names

        # Check/change the order of the columns
        return df_pairs[pair_columns]

    df_pos = _pairs_to_dataframe("pos")
    df_neg = _pairs_to_dataframe("neg")
    return df_pos, df_neg

In [25]:
def print_summary(dataset_dict):
    """Print, for each task, the number of positive and negative pairs."""
    line_template = "[D] \tTask: {:5} - pos: {:5} neg: {:5}"

    print("[D] Summary:")
    for task, pairs in dataset_dict.items():
        num_pos = len(pairs["pos"])
        num_neg = len(pairs["neg"])
        print(line_template.format(task, num_pos, num_neg))
    print("\n")

In [26]:
def print_free_variables(df_input, task_list, dataset_dict):
    """
    Print, for each task, how often each combination of values of the
    variables that must differ appears among the positive pairs.

    Tasks without an entry in TASKS_DICT (e.g. "XM") are skipped.
    Positive pairs are read from dataset_dict[task]["pos"] and resolved
    against df_input by row position.
    """
    separator = "-" * 100 + "\n"

    for task in task_list:
        # Skip "XM"
        if task not in TASKS_DICT:
            continue

        print(separator)
        print(f"[D] Task: {task}\n")

        # Names of the variables whose flag is False in the task mask
        free_variables = [
            name for name, is_same in zip(CATEGORIES, TASKS_DICT[task]) if not is_same
        ]

        combinations = Counter()
        for pos_pair in dataset_dict[task]["pos"]:
            # Values of the free variables for the two functions of the pair
            rows = df_input.iloc[list(pos_pair)][free_variables].values
            # Sorted, so that (a, b) and (b, a) count as the same combination
            combinations[tuple(sorted([tuple(row) for row in rows]))] += 1

        # Print the frequency of each combination
        for combination, frequency in combinations.most_common():
            print(f"\t{frequency:5}, {combination}")

        print()

In [41]:
def create_pos_neg_dataset(
    df_input, task_dict, output_dir, output_fs, rand=True, num_negatives=1
):
    """
    Generate the positive/negative pairs of every task and save them as CSV.

    Args:
        df_input: DataFrame of functions the pairs are sampled from.
        task_dict: {task name: number of positive pairs to generate}.
        output_dir: folder where the two CSV files are written.
        output_fs: format string of the CSV file name; "{}" is replaced
            with "pos" and with "neg".
        rand: if True use create_similarity_pairs_random_version,
            otherwise create_similarity_pairs.
        num_negatives: negatives per positive pair (random version only).

    Returns:
        (selected_functions, dataset_dict): the set of row indexes that appear
        in at least one pair, and {task: {"pos": [...], "neg": [...]}}.
    """
    print("[D] Creating the pos/neg function pairs...", flush=True)
    dataset_dict = defaultdict(dict)

    for task, num_pairs in task_dict.items():
        if rand:
            # Use the random version of the pair generation function
            pos_pairs, neg_pairs = create_similarity_pairs_random_version(
                df_input, num_pairs, task, num_negatives
            )
        else:
            pos_pairs, neg_pairs = create_similarity_pairs(df_input, num_pairs, task)
        dataset_dict[task] = {"pos": pos_pairs, "neg": neg_pairs}

    print_summary(dataset_dict)

    print("[D] Converting the positive/negative pairs into CSV...", flush=True)
    df_pos, df_neg = convert_dicts_into_dataframes(df_input, dataset_dict)

    pos_fp = os.path.join(output_dir, output_fs.format("pos"))
    neg_fp = os.path.join(output_dir, output_fs.format("neg"))

    df_pos.to_csv(pos_fp)
    print(f"[D] \tPos CSV: {pos_fp}")

    df_neg.to_csv(neg_fp)
    print(f"[D] \tNeg CSV: {neg_fp}")

    # For debug only
    print_free_variables(df_input, task_dict.keys(), dataset_dict)

    # Collect every row index used by a positive or a negative pair
    selected_functions = set()
    for task_pairs in dataset_dict.values():
        for pair in chain(task_pairs["pos"], task_pairs["neg"]):
            selected_functions.update(pair)

    return selected_functions, dataset_dict
    

In [42]:
# Create pairs for validation dataset
validation_tasks = DATASET_ONE_DICT["eval"]["validation"]["similarity"]
validation_pairs_dir = os.path.join(OUTPUT_DIR, "pairs", "validation")

sf_set, ds_dict = create_pos_neg_dataset(
    df_validation,
    validation_tasks,
    validation_pairs_dir,
    "{}_validation_Dataset-1.csv",
    rand=False,
    num_negatives=1,
)

# Keep only the functions that appear in at least one pair
df_validation = df_validation.iloc[list(sf_set)].reset_index(drop=True)

[D] Creating the pos/neg function pairs...


100%|█████████████████████████████████████████████████████████████| 395/395 [00:24<00:00, 16.42it/s]
100%|█████████████████████████████████████████████████████████████| 395/395 [00:24<00:00, 16.14it/s]
100%|█████████████████████████████████████████████████████████████| 395/395 [00:23<00:00, 16.90it/s]
100%|█████████████████████████████████████████████████████████████| 395/395 [00:12<00:00, 31.65it/s]

[D] Summary:
[D] 	Task: XA    - pos:  7920 neg:  7920
[D] 	Task: XC    - pos: 10000 neg: 10000
[D] 	Task: XC+XB - pos: 10000 neg: 10000
[D] 	Task: XM    - pos: 10000 neg: 10000


[D] Converting the positive/negative pairs into CSV...





[D] 	Pos CSV: ../Dataset-1-new/pairs/validation/pos_validation_Dataset-1.csv
[D] 	Neg CSV: ../Dataset-1-new/pairs/validation/neg_validation_Dataset-1.csv
----------------------------------------------------------------------------------------------------

[D] Task: XA
	 1578, (('mips', '32'), ('x', '64'))
	 1543, (('arm', '64'), ('mips', '32'))
	 1357, (('mips', '64'), ('x', '32'))
	 1234, (('arm', '64'), ('x', '32'))
	 1194, (('arm', '32'), ('mips', '64'))
	 1014, (('arm', '32'), ('x', '64'))

----------------------------------------------------------------------------------------------------

[D] Task: XC
	   80, (('clang', '7', 'O0'), ('gcc', 'gcc_4.8', 'O1'))
	   78, (('clang', '3.5', 'O0'), ('gcc', 'gcc_4.8', 'Os'))
	   78, (('clang', '5.0', 'O0'), ('gcc', 'gcc_4.8', 'O1'))
	   70, (('clang', '9', 'O0'), ('gcc', 'gcc_4.8', 'Os'))
	   68, (('clang', '3.5', 'O3'), ('gcc', 'gcc_4.8', 'O0'))
	   66, (('clang', '3.5', 'O0'), ('gcc', 'gcc_9', 'Os'))
	   65, (('clang', '5.0', 'O0'), ('gc

In [60]:
# NOTE(review): leftover debugging cell. The bare expression below is not the
# last statement of the cell, so nothing is displayed for it.
DATASET_ONE_DICT["eval"]["validation"]["similarity"]

# Print the XA positive pairs that involve the row index 1513.
for s, e in ds_dict['XA']['pos']: 
    if s == 1513 or e == 1513:
        print(s, e)

333 1513
1513 5087


In [61]:
# Create pairs for test dataset
# NOTE(review): create_pos_neg_dataset returns a
# (selected_functions, dataset_dict) tuple, so sf_set_1 holds the whole
# tuple and not only the set of selected functions.
sf_set_1 = create_pos_neg_dataset(
    df_test,
    DATASET_ONE_DICT["eval"]["test"]["similarity"],
    os.path.join(OUTPUT_DIR, "pairs", "testing"),
    "{}_testing_Dataset-1.csv",
    rand=True,
    num_negatives=1
)

[D] Creating the pos/neg function pairs...


100%|████████████████████████████████████████████████████████| 50000/50000 [02:41<00:00, 309.04it/s]
100%|████████████████████████████████████████████████████████| 50000/50000 [03:18<00:00, 251.82it/s]
100%|████████████████████████████████████████████████████████| 50000/50000 [03:38<00:00, 228.82it/s]
100%|███████████████████████████████████████████████████████| 50000/50000 [00:44<00:00, 1133.73it/s]
100%|████████████████████████████████████████████████████████| 50000/50000 [02:28<00:00, 337.68it/s]
100%|████████████████████████████████████████████████████████| 50000/50000 [02:49<00:00, 294.95it/s]
 53%|█████████████████████████████▌                          | 26381/50000 [02:05<01:52, 210.67it/s]


KeyboardInterrupt: 

In [None]:
# Create pairs for test rank dataset
# (100 negative pairs are generated for each positive pair)
# NOTE(review): create_pos_neg_dataset returns a
# (selected_functions, dataset_dict) tuple, so sf_set_2 holds the whole
# tuple and not only the set of selected functions.
sf_set_2 = create_pos_neg_dataset(
    df_test,
    DATASET_ONE_DICT["eval"]["test"]["rank"],
    os.path.join(OUTPUT_DIR, "pairs", "testing"),
    "{}_rank_testing_Dataset-1.csv",
    rand=True,
    num_negatives=100
)

In [None]:
# Keep only the test functions that appear in at least one similarity or rank pair.
# sf_set_1 and sf_set_2 are the (selected_functions, dataset_dict) tuples
# returned by create_pos_neg_dataset: the sets to merge are their first
# elements (using "|" on the tuples themselves raises a TypeError).
selected_test_functions = sf_set_1[0] | sf_set_2[0]
df_test = df_test.iloc[list(selected_test_functions)].reset_index(drop=True)

In [None]:
print(f"Shape df_training: \t{df_training.shape}")
print(f"Shape df_validation: \t{df_validation.shape}")
print(f"Shape df_test: \t\t{df_test.shape}")

# Save the "selected functions" to a CSV.
# This will be useful to post-process the results.
csv_targets = [
    (df_validation, "validation_Dataset-1.csv"),
    (df_training, "training_Dataset-1.csv"),
    (df_test, "testing_Dataset-1.csv"),
]
for df_split, csv_name in csv_targets:
    df_split.to_csv(os.path.join(OUTPUT_DIR, csv_name))

In [None]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.
splits = [
    ("training", df_training),
    ("validation", df_validation),
    ("testing", df_test),
]

for split, df_t in splits:

    # Unique (idb_path, fva) couples of the split
    fset = {tuple(row) for row in df_t[['idb_path', 'fva']].values}
    print("{}: {} functions".format(split, len(fset)))

    # Group the function addresses by IDB; fva is parsed as a base-16 string
    selected_functions = defaultdict(list)
    for idb_path, fva in fset:
        selected_functions[idb_path].append(int(fva, 16))

    # Test: no function is lost or duplicated by the grouping
    num_grouped = sum([len(addresses) for addresses in selected_functions.values()])
    assert num_grouped == len(fset)

    # Save to file
    json_fp = os.path.join(
        OUTPUT_DIR, "features", split, "selected_{}_Dataset-1.json".format(split)
    )
    with open(json_fp, "w") as f_out:
        json.dump(selected_functions, f_out)