In [1]:
import pandas as pd
import os
import json
from tqdm import tqdm
from utils import generate_ir_and_cpg, cpg_dot2json
import shutil
from multiprocessing import Pool
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
# llvm2cpg developed by Joern requires LLVM 11.
# NOTE(review): PATH is presumably narrowed so the system LLVM 11 binaries are the
# ones that get picked up - confirm against the build machine.
# Assign through os.environ rather than os.putenv(): the mapping assignment also
# calls putenv(), but additionally keeps os.environ in sync, so code that reads or
# copies os.environ (e.g. to build a subprocess environment) sees the new PATH.
os.environ["PATH"] = "/usr/bin:/usr/local/bin"

In [3]:
# Load the preprocessed Big-Vul frame from its pickled archive.
# NOTE: unpickling can execute arbitrary code - only load archives you produced
# yourself or otherwise trust.
big_vul_preprocessed_path = "big_vul_preprocessed.zip"
big_vul_df = pd.read_pickle(filepath_or_buffer=big_vul_preprocessed_path)

In [4]:
def _blank_name_to_nan(name):
    """Return NaN for an empty or literal 'null' function name, else the name itself."""
    if len(name) == 0 or name == 'null':
        return np.nan
    return name


# Mark unusable function names as missing, then drop those rows.
big_vul_df.loc[:, 'func_name'] = big_vul_df['func_name'].apply(_blank_name_to_nan)
big_vul_df = big_vul_df.dropna(subset=["func_name"])

In [5]:
# Build one work item per (repository, version pair):
#   (code_link, clone directory name, result directory name, {version: [function names]})
# Both versions of a pair map to the same list of function names.
generate_ir_and_cpg_args = []
for code_link in tqdm(big_vul_df["code_link"].unique().tolist(), desc='Generating compilication args per repo'):
    project_df = big_vul_df[big_vul_df.code_link == code_link]
    # e.g. "https://host/owner/repo.git" -> "owner_repo"
    repo_dir_name = "_".join(code_link.split(".git")[0].split("//")[1].split("/")[1:])
    result_dir_name = repo_dir_name
    i = 1  # numbers the clone directories of this repository, one per version pair
    for version_pair in project_df[["project_before", "project_after"]].drop_duplicates().itertuples():
        # Vectorized row selection; a row-wise DataFrame.apply(axis=1) here dominates
        # the runtime of this cell. Rows with a missing version never match.
        same_version_pair = (
            (project_df["project_before"] == version_pair.project_before)
            & (project_df["project_after"] == version_pair.project_after)
        )
        project_per_version_pair_df = project_df[same_version_pair]
        # Names containing "::" are skipped.
        function_list = [
            function
            for function in project_per_version_pair_df["func_name"].tolist()
            if "::" not in function
        ]
        if len(function_list) == 0:
            continue
        generate_ir_and_cpg_args.append((code_link, f"{repo_dir_name}_{i}", result_dir_name, {
            version_pair.project_before: function_list,
            version_pair.project_after: function_list
        }))
        i += 1

Generating compilication args per repo: 100%|██████████| 413/413 [11:11<00:00,  1.63s/it]  


In [6]:
def _first_function_list_size(work_item):
    """Number of function names listed under the first version key of a work item."""
    functions_by_version = work_item[3]
    return len(next(iter(functions_by_version.values())))


# Smallest jobs first, then persist the work list as indented JSON.
generate_ir_and_cpg_args.sort(key=_first_function_list_size)
with open("generate_ir_and_cpg_args.json", "w") as args_file:
    json.dump(generate_ir_and_cpg_args, args_file, indent=4)

In [7]:
# Reload the persisted work list; JSON turns the tuples into lists, so rebuild
# each entry as a 4-tuple.
with open("generate_ir_and_cpg_args.json", "r") as args_file:
    loaded_args = json.load(args_file)

generate_ir_and_cpg_args = []
for entry in loaded_args:
    generate_ir_and_cpg_args.append((entry[0], entry[1], entry[2], entry[3]))

In [8]:
# Projects excluded from compilation; a work item's third field is its result
# directory name.
filtered_projects = ['chromium_chromium', 'drm_drm-misc', 'chrisd1100_uncurl', 'torvalds_linux']
filtered_generate_ir_and_cpg_args = list(
    filter(lambda args: args[2] not in filtered_projects, generate_ir_and_cpg_args)
)

In [None]:
# Start from empty scratch directories: remove leftovers of a previous run, then
# recreate them.
for scratch_dir in ("error_dump", "repo_clone"):
    if os.path.exists(scratch_dir):
        shutil.rmtree(scratch_dir)
    os.mkdir(scratch_dir)

# Fan the work items out over 24 worker processes; list() drains the lazy
# iterator so every item is processed and the progress bar advances.
with Pool(processes=24) as pool:
    results_iter = pool.imap_unordered(generate_ir_and_cpg, filtered_generate_ir_and_cpg_args)
    list(tqdm(results_iter, total=len(filtered_generate_ir_and_cpg_args), desc='Compiling for LLVM IRs and generating CPGs'))

In [None]:
# Result directory names (third field of a work item) that should be compiled again.
rerun = [

]
rerun_args = [args for args in filtered_generate_ir_and_cpg_args if args[2] in rerun]

# Only touch the scratch directories when there is something to rerun. Otherwise
# this cell would delete the error dumps of the main run and start a pool for
# nothing.
if rerun_args:
    for scratch_dir in ("error_dump", "repo_clone"):
        if os.path.exists(scratch_dir):
            shutil.rmtree(scratch_dir)
        os.mkdir(scratch_dir)

    # list() drains the lazy iterator so every item is actually processed.
    with Pool(processes=4) as pool:
        list(tqdm(pool.imap_unordered(generate_ir_and_cpg, rerun_args), total=len(rerun_args)))

In [9]:
# Create the result columns up front (object dtype, all missing) so later cells
# can rely on them even if no artifact file is found. Existing columns are kept.
for artifact_column in ("llvm_ir_before", "llvm_ir_after", "cpg"):
    if artifact_column not in big_vul_df.columns:
        big_vul_df[artifact_column] = np.full(len(big_vul_df), np.nan, dtype=object)

# Attach the generated files to their rows:
#   ./llvm_ir/<repo>/<version>/<func_name>.ll  -> llvm_ir_before / llvm_ir_after
#   ./cpg/<repo>/<project_before>/<func_name>.json -> cpg
for code_link in tqdm(big_vul_df["code_link"].unique().tolist(), desc='Adding generated LLVM IRs and CPGs to the dataset'):
    project_df = big_vul_df[big_vul_df.code_link == code_link]
    # e.g. "https://host/owner/repo.git" -> "owner_repo"
    repo_name = "_".join(code_link.split(".git")[0].split("//")[1].split("/")[1:])

    # Nothing was generated for this repository.
    if not (os.path.exists(f"./llvm_ir/{repo_name}") or os.path.exists(f"./cpg/{repo_name}")):
        continue

    for version_pair in project_df[["project_before", "project_after"]].drop_duplicates().itertuples():
        # Vectorized row selection instead of a row-wise DataFrame.apply(axis=1).
        # Rows with a missing version never match.
        same_version_pair = (
            (project_df["project_before"] == version_pair.project_before)
            & (project_df["project_after"] == version_pair.project_after)
        )
        project_per_version_pair_df = project_df[same_version_pair]

        # `row_id` rather than `id`, so the builtin id() is not shadowed notebook-wide.
        for row_id in project_per_version_pair_df.index.tolist():
            func_name = big_vul_df.loc[row_id, 'func_name']
            llvm_ir_before_path = f"./llvm_ir/{repo_name}/{version_pair.project_before}/{func_name}.ll"
            llvm_ir_after_path = f"./llvm_ir/{repo_name}/{version_pair.project_after}/{func_name}.ll"
            # Only the CPG of the "before" version is loaded.
            cpg_path = f"./cpg/{repo_name}/{version_pair.project_before}/{func_name}.json"

            if os.path.exists(llvm_ir_before_path):
                with open(llvm_ir_before_path, "r") as llvm_ir_file:
                    big_vul_df.loc[row_id, "llvm_ir_before"] = llvm_ir_file.read()

            if os.path.exists(llvm_ir_after_path):
                with open(llvm_ir_after_path, "r") as llvm_ir_file:
                    big_vul_df.loc[row_id, "llvm_ir_after"] = llvm_ir_file.read()

            if os.path.exists(cpg_path):
                with open(cpg_path, "r") as cpg_file:
                    cpg_list = json.load(cpg_file)
                # The file holds a list of candidates; keep the longest one (for a
                # single entry that is the entry itself). An empty list leaves the
                # cell missing.
                if len(cpg_list) > 0:
                    big_vul_df.loc[row_id, "cpg"] = max(cpg_list, key=len)

Adding generated LLVM IRs and CPGs to the dataset: 100%|██████████| 413/413 [04:56<00:00,  1.39it/s]


In [10]:
# Keep rows that have both a CPG and a "before" IR, and drop rows labelled
# vulnerable whose IR is identical before and after.
has_cpg = big_vul_df.cpg.notna()
has_llvm_ir_before = big_vul_df.llvm_ir_before.notna()
vul_but_ir_unchanged = (
    (big_vul_df['llvm_ir_before'] == big_vul_df['llvm_ir_after'])
    & (big_vul_df['vul'] == 1)
)
# .copy() makes cpg_df an independent frame, so the column assignment below does
# not trigger SettingWithCopyWarning.
cpg_df = big_vul_df[has_cpg & has_llvm_ir_before & ~vul_but_ir_unchanged].copy()
cpg_df['cpg'] = cpg_df['cpg'].apply(lambda x: cpg_dot2json(x, json_format=True))
cpg_df = cpg_df.reset_index(drop=True)
cpg_df.index.name = 'id'

In [11]:
# Class balance of the filtered dataset (rows per `vul` label).
cpg_df.vul.value_counts()

vul
0    23709
1     1598
Name: count, dtype: int64

In [12]:
# Keep only the columns that make up the published dataset, then persist it.
dataset_columns = ['old_id', 'llvm_ir_before', 'llvm_ir_after', 'cpg', 'vul']
cpg_df = cpg_df.loc[:, dataset_columns]
cpg_df.to_pickle('big_vul_ir_cpg.zip')

In [13]:
# Split into roughly 60% train / 20% validation / 20% test of the full dataset.
# Only the frame itself is passed to train_test_split: the label arrays were
# discarded anyway, and assigning them to `_` shadows IPython's last-output variable.
n_samples = cpg_df.shape[0]
train_df, test_df = train_test_split(cpg_df, train_size=int(0.8 * n_samples), random_state=0)
train_df, val_df = train_test_split(train_df, train_size=int(0.6 * n_samples), random_state=0)

# Non-vulnerable training rows: keep the 75% with the shortest `cpg` value.
train_df_non_vul = train_df[train_df['vul'] == 0]
n_non_vul_kept = int(train_df_non_vul.shape[0] * 0.75)
train_df_non_vul = train_df_non_vul.sort_values(by='cpg', key=lambda x: x.str.len(), ignore_index=True).iloc[:n_non_vul_kept]

# Vulnerable training rows: repeat them a whole number of times so the two
# classes end up roughly the same size.
train_df_vul = train_df[train_df['vul'] == 1]
if train_df_vul.empty:
    raise ValueError("The training split contains no vulnerable samples; cannot oversample.")
# At least one copy, so pd.concat never receives an empty list when the
# vulnerable class is already the larger one.
multiplier = max(1, train_df_non_vul.shape[0] // train_df_vul.shape[0])
train_df_vul_oversampled = pd.concat([train_df_vul] * multiplier, ignore_index=True)

# Merge both classes and shuffle deterministically.
train_df = pd.concat([train_df_non_vul, train_df_vul_oversampled], ignore_index=True).sample(frac=1, random_state=0, ignore_index=True)

In [14]:
# Give every split a fresh 0..n-1 index named 'id'.
# reset_index(drop=True) discards the old index directly, so it does not depend
# on that index being named 'id' (reset_index().drop(columns='id') does).
train_df.index.name = 'id'
test_df = test_df.reset_index(drop=True)
test_df.index.name = 'id'
val_df = val_df.reset_index(drop=True)
val_df.index.name = 'id'

In [15]:
# Make sure the dataset directories exist.
for dataset_dir in ('cpg_dataset/raw', 'cpg_dataset/processed'):
    os.makedirs(dataset_dir, exist_ok=True)

# Persist each split under cpg_dataset/raw.
for split_df, pickle_path in (
    (train_df, 'cpg_dataset/raw/big_vul_ir_cpg_train.zip'),
    (test_df, 'cpg_dataset/raw/big_vul_ir_cpg_test.zip'),
    (val_df, 'cpg_dataset/raw/big_vul_ir_cpg_val.zip'),
):
    split_df.to_pickle(pickle_path)