In [1]:
import json
import numpy as np
import os
import pandas as pd
import random
import re
from collections import defaultdict

from tqdm import tqdm

In [2]:
flowchart = pd.read_csv("features/flowchart_Dataset-Finetuning.csv")
print(flowchart.shape)

(62750, 8)


In [3]:
del flowchart['bb_list']
flowchart = flowchart[flowchart['bb_num'] >= 5]
flowchart.drop_duplicates("hashopcodes", keep="first", inplace=True)
print(flowchart.shape)

(56255, 7)


In [4]:
x86_idb_path_list = flowchart[flowchart["idb_path"].str.contains("/x86/")]["idb_path"].unique().tolist()
arm32_idb_path_list = flowchart[flowchart["idb_path"].str.contains("/arm32/")]["idb_path"].unique().tolist()
arm64_idb_path_list = flowchart[flowchart["idb_path"].str.contains("/arm64/")]["idb_path"].unique().tolist()
mips32_idb_path_list = flowchart[flowchart["idb_path"].str.contains("/mips32/")]["idb_path"].unique().tolist()

architecture_map = {
    "x86": x86_idb_path_list,
    "arm32": arm32_idb_path_list,
    "arm64": arm64_idb_path_list,
    "mips32": mips32_idb_path_list
}

In [5]:
crypto_keywords = [
    "aes", "des", "rc4", "sha", "md5", "hmac", "hash",
    "cbc", "ecb", "rsa", "dsa", "ecdsa", "curve", "poly1305", "chacha"
]
pattern = r'(' + '|'.join(crypto_keywords) + r')'

df = flowchart[
    (~flowchart["func_name"].str.contains("sub_")) &
    (flowchart["func_name"].str.contains(pattern, case=False, na=False, regex=True))
].copy()

df = df[df['func_name'].isin(df['func_name'].value_counts()[df['func_name'].value_counts() > 1].index)]

crypt_funcs_list = df["func_name"].unique().tolist()

len(df)

  (flowchart["func_name"].str.contains(pattern, case=False, na=False, regex=True))


6342

In [6]:
def detect_architecture(path):
    for arch, path_list in architecture_map.items():
        if path in path_list:
            return arch
    return "unknown"

df["architecture"] = df["idb_path"].apply(detect_architecture)

arch_counts = df["architecture"].value_counts()

print(arch_counts)

architecture
x86       1647
arm64     1645
mips32    1630
arm32     1420
Name: count, dtype: int64


In [7]:
def detect_optimizations(path):
    if "/O1/" in path:
        return "O1"
    elif "/O3/" in path:
        return "O3"
    else:
        return "unknown"

df["optimizations"] = df["idb_path"].apply(detect_optimizations)

optimizations_counts = df["optimizations"].value_counts()

print(optimizations_counts)

optimizations
O1    4153
O3    2189
Name: count, dtype: int64


In [8]:
def extract_keyword(func_name):
    for kw in crypto_keywords:
        if re.search(kw, func_name, re.IGNORECASE):
            return kw.lower()
    return "unknown"

df["category"] = df["func_name"].apply(extract_keyword)

cross_counts = df.groupby(["architecture", "category"]).size().unstack(fill_value=0)

print(cross_counts)

category      aes  cbc  chacha  curve  des  dsa  ecb  hash  hmac  md5  \
architecture                                                            
arm32         148   35      37     88  118  175    7   154    75    9   
arm64         167   34      42    111  127  181    8   183    90   14   
mips32        148   34      45    113  130  185    8   189    92   14   
x86           162   35      42    114  127  188    8   182    87   12   

category      poly1305  rc4  rsa  sha  
architecture                           
arm32               71   13  340  150  
arm64               86   15  374  213  
mips32              87   15  371  199  
x86                 91   16  374  209  


In [9]:
keyword_counts = {
    keyword: df["func_name"].str.contains(keyword, case=False).sum()
    for keyword in crypto_keywords
}
counts_df = pd.DataFrame(list(keyword_counts.items()), columns=["Keyword", "Count"])
print(counts_df.sort_values(by="Count", ascending=False))

     Keyword  Count
9        rsa   1522
6       hash    835
3        sha    779
10       dsa    753
0        aes    625
1        des    502
12     curve    432
5       hmac    374
13  poly1305    360
14    chacha    347
7        cbc    314
11     ecdsa    158
4        md5     65
8        ecb     60
2        rc4     59


In [10]:
# Shuffle by function name
unique_func_names = df["func_name"].unique()
shuffled_func_names = pd.Series(unique_func_names).sample(frac=1, random_state=459657).reset_index(drop=True)

# Calculate split sizes based on the number of unique function names
n_total_funcs = len(shuffled_func_names)
n_test_funcs = int(n_total_funcs * 0.2)
n_validation_funcs = int(n_total_funcs * 0.2)
n_train_funcs = n_total_funcs - n_validation_funcs - n_test_funcs

# Split function names
train_funcs = shuffled_func_names[:n_train_funcs]
test_funcs = shuffled_func_names[n_train_funcs:n_train_funcs + n_test_funcs]
validation_funcs = shuffled_func_names[n_train_funcs + n_test_funcs:]

# Create dataframes for each set
df_training = df[df["func_name"].isin(train_funcs)].reset_index(drop=True)
df_testing = df[df["func_name"].isin(test_funcs)].reset_index(drop=True)
df_validation = df[df["func_name"].isin(validation_funcs)].reset_index(drop=True)

# Check the result
print(f"Training size: {len(df_training)}")
print(f"Testing size: {len(df_testing)}")
print(f"Validation size: {len(df_validation)}")

Training size: 3772
Testing size: 1281
Validation size: 1289


In [11]:
selected_columns = ["idb_path", "fva", "func_name"]

def generate_pairs(fl):
    comparison_list = list()
    
    for arch, data in architecture_map.items():
        print(f"Processing {arch} architecture.")
        for source_func in tqdm(crypt_funcs_list):
            for source_path in data:
                left_row = fl[(fl["idb_path"] == source_path) & (fl["func_name"] == source_func)]
                if left_row.empty:
                    continue
                left = list(left_row[selected_columns].values[0])
    
                right_index = fl[(fl["idb_path"].str.contains(f"/{arch}/")) & (fl["func_name"].isin(crypt_funcs_list))].index
                for index in right_index:
                    right = list(fl.loc[index, selected_columns].values)
                    comparison_list.append(left + right)
    
    print("All done!!")
    print(f"Total: {len(comparison_list)}.")

    # Create a new DataFrame
    columns = [x + "_1" for x in selected_columns] + [x + "_2" for x in selected_columns]
    df = pd.DataFrame(comparison_list, columns=columns)
    
    # Add the db_type column
    df["db_type"] = ["XM"] * df.shape[0]
    
    # Sort the rows
    df.sort_values(by=["idb_path_1", "fva_1", "idb_path_2", "fva_2"], inplace=True)
    df.reset_index(inplace=True, drop=True)
    
    # Paranoid check
    df.drop_duplicates(inplace=True)
    df.reset_index(inplace=True, drop=True)

    return df
    
print("Generating Training pairs:")
pairs_training = generate_pairs(df_training)
print("Generating Testing pairs:")
pairs_testing = generate_pairs(df_testing)
print("Generating Validation pairs:")
pairs_validation = generate_pairs(df_validation)

Generating Training pairs:
Processing x86 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [02:01<00:00, 11.14it/s]


Processing arm32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [01:35<00:00, 14.19it/s]


Processing arm64 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [02:09<00:00, 10.44it/s]


Processing mips32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [02:07<00:00, 10.60it/s]


All done!!
Total: 3569430.
Generating Testing pairs:
Processing x86 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:19<00:00, 69.01it/s]


Processing arm32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:15<00:00, 87.98it/s]


Processing arm64 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:20<00:00, 67.56it/s]


Processing mips32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:19<00:00, 68.17it/s]


All done!!
Total: 412199.
Generating Validation pairs:
Processing x86 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:20<00:00, 67.22it/s]


Processing arm32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:16<00:00, 84.05it/s]


Processing arm64 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:19<00:00, 70.40it/s]


Processing mips32 architecture.


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1356/1356 [00:19<00:00, 69.39it/s]


All done!!
Total: 416743.


In [12]:
pairs_training

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
0,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,XM
1,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x102458,EC_GROUP_check_named_curve,XM
2,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x102f20,EC_get_builtin_curves,XM
3,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x102f9c,ossl_ec_curve_nid_from_params,XM
4,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x100ecc,ECDSA_size,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0x10325c,EC_GROUP_new_curve_GFp,XM
...,...,...,...,...,...,...,...
3569425,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc6302,wolfSSL_NoKeyShares,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc1012,wolfSSL_LH_strhash,XM
3569426,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc6302,wolfSSL_NoKeyShares,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc1d9b,wolfSSL_3des_iv,XM
3569427,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc6302,wolfSSL_NoKeyShares,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc1deb,wolfSSL_aes_ctr_iv,XM
3569428,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc6302,wolfSSL_NoKeyShares,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libwolfs...,0xc62d5,wolfSSL_UseKeyShare,XM


### Create positive and negative pairs

In [13]:
def create_similarity_pairs(df_input, neg_for_pos=100):
    def extract_keywords(func_name):
        func_name = str(func_name).lower()
        return set([kw for kw in crypto_keywords if kw in func_name])

    def has_common_crypto_keyword(row):
        kw1 = extract_keywords(row['func_name_1'])
        kw2 = extract_keywords(row['func_name_2'])
        return len(kw1 & kw2) > 0

    def no_common_crypto_keyword(row):
        kw1 = extract_keywords(row['func_name_1'])
        kw2 = extract_keywords(row['func_name_2'])
        return len(kw1 & kw2) == 0

    df_pos = df_input[df_input.apply(has_common_crypto_keyword, axis=1)].copy()
    df_neg = df_input[df_input.apply(no_common_crypto_keyword, axis=1)].copy()

    if len(df_pos) * neg_for_pos > len(df_neg):
        num_neg = len(df_neg)
        num_pos = len(df_neg) // neg_for_pos
        df_pos = df_pos.sample(frac=1, random_state=837465).reset_index(drop=True)[:num_pos]
    else:
        num_pos = len(df_pos)
        num_neg = len(df_pos) * neg_for_pos
        df_neg = df_neg.sample(frac=1, random_state=837465).reset_index(drop=True)[:num_neg]

    return df_pos, df_neg

# training: 61467 399628
df_pos_training, df_neg_training = create_similarity_pairs(pairs_training)
# testing: 7139 40696
df_pos_testing, df_neg_testing = create_similarity_pairs(pairs_testing)
# validation: 7006 41444
df_pos_validation, df_neg_validation = create_similarity_pairs(pairs_validation)

print("Counts of pos and neg pairs:")
print("Training", len(df_pos_training), len(df_neg_training))
print("Testing", len(df_pos_testing), len(df_neg_testing))
print("Validation", len(df_pos_validation), len(df_neg_validation))

Counts of pos and neg pairs:
Training 30766 3076656
Testing 3591 359150
Validation 3573 357386


In [14]:
df_pos_training

Unnamed: 0,idb_path_1,fva_1,func_name_1,idb_path_2,fva_2,func_name_2,db_type
0,IDBs/Dataset-Finetuning/arm64/O1/arm64-O1-libm...,0x3feb0,mbedtls_poly1305_self_test,IDBs/Dataset-Finetuning/arm64/O1/arm64-O1-libg...,0x34368,poly1305mac_setiv,XM
1,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libg...,0x5e324,dsa_verify,IDBs/Dataset-Finetuning/arm32/O1/arm32-O1-libc...,0xe87d4,dsa_sig_print,XM
2,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x23d6a8,ossl_rsa_padding_check_PKCS1_type_2_TLS,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x242304,ossl_rsa_check_pminusq_diff,XM
3,IDBs/Dataset-Finetuning/arm64/O1/arm64-O1-libc...,0x201b84,ossl_rsa_check_prime_factor,IDBs/Dataset-Finetuning/arm64/O1/arm64-O1-libw...,0x43d1c,RsaSign,XM
4,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x2bf988,rsa_decrypt,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x2309a0,ossl_rsa_todata,XM
...,...,...,...,...,...,...,...
30761,IDBs/Dataset-Finetuning/x86/O1/x86-O1-libmbedc...,0x34682,pem_des3_decrypt,IDBs/Dataset-Finetuning/x86/O3/x86-O3-libcrypt...,0x14bf20,DES_string_to_2keys,XM
30762,IDBs/Dataset-Finetuning/arm32/O3/arm32-O3-libw...,0x31b6c,wc_AesGcmDecrypt,IDBs/Dataset-Finetuning/arm32/O3/arm32-O3-libw...,0x4284c,wc_AesCmacVerify,XM
30763,IDBs/Dataset-Finetuning/mips32/O3/mips32-O3-li...,0xa5af0,wolfSSL_RSA_private_encrypt,IDBs/Dataset-Finetuning/mips32/O3/mips32-O3-li...,0x1cd750,RSA_X931_generate_key_ex,XM
30764,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x4fc60,wc_ecc_shared_secret_ex,IDBs/Dataset-Finetuning/mips32/O1/mips32-O1-li...,0x4fdac,wc_ecc_shared_secret,XM


In [15]:
print("[D] Converting the positive/negative pairs into CSV...", flush=True)

df_pos_training.to_csv("pairs/training/pos-training-Finetuning.csv")
df_pos_testing.to_csv("pairs/testing/pos-testing-Finetuning.csv")
df_pos_validation.to_csv("pairs/validation/pos-validation-Finetuning.csv")
df_neg_training.to_csv("pairs/training/neg-training-Finetuning.csv")
df_neg_testing.to_csv("pairs/testing/neg-testing-Finetuning.csv")
df_neg_validation.to_csv("pairs/validation/neg-validation-Finetuning.csv")

print("Done!!")

[D] Converting the positive/negative pairs into CSV...
Done!!


In [16]:
def filter_functions_in_pairs(df_input, pairs_df):
    pair_funcs = set(
        list(zip(pairs_df["idb_path_1"], pairs_df["func_name_1"])) +
        list(zip(pairs_df["idb_path_2"], pairs_df["func_name_2"]))
    )

    df_filtered = df_input[
        df_input.apply(lambda row: (row["idb_path"], row["func_name"]) in pair_funcs, axis=1)
    ].copy()

    return df_filtered

df_training = filter_functions_in_pairs(df_training, pd.concat([df_pos_training, df_neg_training], ignore_index=True))
df_testing = filter_functions_in_pairs(df_testing, pd.concat([df_pos_testing, df_neg_testing], ignore_index=True))
df_validation = filter_functions_in_pairs(df_validation, pd.concat([df_pos_validation, df_neg_validation], ignore_index=True))

# Check the result
print(f"Training size: {len(df_training)}")
print(f"Testing size: {len(df_testing)}")
print(f"Validation size: {len(df_validation)}")

Training size: 3772
Testing size: 1281
Validation size: 1289


In [17]:
print("[D] Converting Dataset into CSV...", flush=True)

df_training.to_csv("training_Dataset-Finetuning.csv")
df_testing.to_csv("testing_Dataset-Finetuning.csv")
df_validation.to_csv("validation_Dataset-Finetuning.csv")

print("Done!!")

[D] Converting Dataset into CSV...
Done!!


In [18]:
# Save the "selected functions" to a JSON.
# This is useful to limit the IDA analysis to some functions only.
df_list = [df_training, df_validation, df_testing]
split_list = ["training", "validation", "testing"]

for split, df_t in zip(split_list, df_list):

    fset = set([tuple(x) for x in df_t[['idb_path', 'fva']].values])
    print("{}: {} functions".format(split, len(fset)))

    selected_functions = defaultdict(list)
    for t in fset:
        selected_functions[t[0]].append(int(t[1], 16))
        
    # Test
    assert(sum([len(v) for v in selected_functions.values()]) == len(fset))

    # Save to file
    with open(os.path.join("features", split, "selected_{}_Dataset-Finetuning.json".format(split)), "w") as f_out:
        json.dump(selected_functions, f_out)

print("All done!!")

training: 3772 functions
validation: 1289 functions
testing: 1281 functions
All done!!
