In [1]:
import pandas as pd
import gdown
import zipfile
import os
from utils import *
import numpy as np
from tqdm import tqdm

Run the cell below to download and unzip the Big-Vul dataset.

In [2]:
# Fetch the Big-Vul archive from Google Drive, unpack it into the current
# working directory, then delete the archive so only its contents remain.
bigvul_url = "https://drive.google.com/uc?id=1-0VhnHBp9IGh90s2wCNjeCMuy70HPl8X"
bigvul_zip_path = "MSR_data_cleaned.zip"

gdown.download(bigvul_url, bigvul_zip_path, quiet=False)

with zipfile.ZipFile(bigvul_zip_path, mode="r") as bigvul_zip:
    bigvul_zip.extractall(path="./")

# The archive is no longer needed once it has been extracted.
os.remove(bigvul_zip_path)

Downloading...
From (uriginal): https://drive.google.com/uc?id=1-0VhnHBp9IGh90s2wCNjeCMuy70HPl8X
From (redirected): https://drive.google.com/uc?id=1-0VhnHBp9IGh90s2wCNjeCMuy70HPl8X&confirm=t&uuid=069502d4-4c23-4901-bbea-f6d328308166
To: /userhome/30/nhpham/VulGNN-KLEE/dataset/MSR_data_cleaned.zip
100%|██████████| 1.54G/1.54G [00:34<00:00, 44.7MB/s]


In [3]:
bigvul_path = "MSR_data_cleaned.csv"

# Load the raw CSV and normalise it in one pass:
#   1. give the three irregularly named columns explicit snake_case names,
#   2. lower-case every remaining header and turn spaces into underscores,
#   3. remove the columns listed in the final drop() call.
bigvul_df = (
    pd.read_csv(bigvul_path, low_memory=False)
    .rename(columns={
        'Unnamed: 0': 'old_id',
        'codeLink': 'code_link',
        'parentID': 'parent_id',
    })
    .rename(columns=lambda column: column.lower().replace(' ', '_'))
    .drop(columns=[
        "access_gained", "attack_origin",
        "authentication_required", "availability",
        "complexity", "confidentiality",
        "integrity", "known_exploits",
        "publish_date", "summary", "update_date",
        "add_lines", "commit_message", "del_lines",
        "files_changed", "lines_after", "lines_before",
        "parent_id", "patch", "vul_func_with_fix",
    ])
)

# Column layout used later when the final frame is assembled; "vul" is kept last.
columns_order = [
    "old_id",
    "cve_id", "cve_page", "cwe_id",
    "score", "vulnerability_classification",
    "project", "code_link", "lang",
    "project_before", "project_after",
    "file_name", "func_name",
    "func_before", "func_after",
    "vul",
]

In [4]:
# Repository hosts for which the refinement loop passes the existing
# project_before / project_after values through parse_commit_hash; rows on any
# other host fall back to commit_id and commit_id + "^".
before_fix_commit_parsable_repo_hosts = [
    "github.com", "android.googlesource.com",
    "cgit.freedesktop.org", "git.savannah.gnu.org",
    "git.launchpad.net", "anongit.mindrot.org",
    "cgit.kde.org", "git.busybox.net",
    "git.pengutronix.de", "git.enlightenment.org",
    "git.netfilter.org", "git.savannah.nongnu.org",
    "git.musl-libc.org", "git.libssh.org",
]

# Work on a filtered frame so bigvul_df stays untouched. Rows whose code_link is
# the u-boot "commits/master" page are excluded, and the index is renumbered
# 0..n-1 (the refinement loop addresses rows by that position).
bigvul_pp_df = bigvul_df.loc[
    bigvul_df["code_link"] != "https://github.com/u-boot/u-boot/commits/master"
].reset_index(drop=True)

In [5]:
# Hand-pinned commit hashes for specific code links. These take precedence over
# the generic rules applied inside the loop below.
# Maps the raw `code_link` to (project_after, project_before). A project_before
# of None means "keep the value the generic rules produced" (the two curl links
# only pin project_after).
manual_commit_overrides = {
    "https://github.com/curl/curl/commit/curl-7_50_2~32":
        ("7700fcba64bf5806de28f6c1c7da3b4f0b38567d", None),
    "https://github.com/curl/curl/commit/curl-7_51_0-162-g3ab3c16":
        ("3ab3c16db6a5674f53cf23d56512a405fde0b2c9", None),
    "https://git.exim.org/exim.git/commitdiff/7685ce68148a083d7759e78d01aa5198fc099c44":
        ("7685ce68148a083d7759e78d01aa5198fc099c44", "a2204cac393bb160ae7f253b9bb5280fc35ca3a3"),
    "https://git.exim.org/exim.git/commitdiff/5b7a7c051c9ab9ee7c924a611f90ef2be03e0ad0":
        ("5b7a7c051c9ab9ee7c924a611f90ef2be03e0ad0", "69aca2feaca1ebbc55c6f1adaee4738dc328ae90"),
    "https://git.savannah.gnu.org/gitweb/?p=gnash.git;a=commitdiff;h=fa481c116e65ccf9137c7ddc8abc3cf05dc12f55":
        ("fa481c116e65ccf9137c7ddc8abc3cf05dc12f55", "3dea0709b06a82ad8085d04daf86c9beff93d742"),
    "https://git.savannah.gnu.org/gitweb/?p=gnulib.git;a=commit;h=94e01571507835ff59dd8ce2a0b56a4b566965a4":
        ("94e01571507835ff59dd8ce2a0b56a4b566965a4", "cff48ff751ca1ef8136695213a1a9df4f08dba38"),
    "https://git.savannah.gnu.org/gitweb/?p=gnutls.git;a=commitdiff;h=190cef6eed37d0e73a73c1e205eb31d45ab60a3c":
        ("190cef6eed37d0e73a73c1e205eb31d45ab60a3c", "c6b1847aff211db23cb1f12e8c55ce6055943750"),
    "https://git.savannah.gnu.org/gitweb/?p=gnutls.git;a=commit;h=b495740f2ff66550ca9395b3fda3ea32c3acb185":
        ("b495740f2ff66550ca9395b3fda3ea32c3acb185", "60c62e045245f07798f61ef7f39e8ae84fd9c8e0"),
    "https://git.savannah.gnu.org/gitweb/?p=gnutls.git;a=commit;h=422214868061370aeeb0ac9cd0f021a5c350a57d":
        ("422214868061370aeeb0ac9cd0f021a5c350a57d", "cfea38b5482c21fe6ddffaddc59a0040f80bd578"),
    "https://git.savannah.gnu.org/gitweb/?p=gnutls.git;a=commitdiff;h=bc8102405fda11ea00ca3b42acc4f4bce9d6e97b":
        ("bc8102405fda11ea00ca3b42acc4f4bce9d6e97b", "c50290f4096cf4fcac9ff3bfc47bf4394e6adf04"),
    "https://git.savannah.gnu.org/gitweb/?p=gnutls.git;a=commitdiff;h=e82ef4545e9e98cbcb032f55d7c750b81e3a0450":
        ("e82ef4545e9e98cbcb032f55d7c750b81e3a0450", "8fe80d662c3320156f5731a672016d7a1b9dba1b"),
    "https://git.savannah.gnu.org/gitweb/?p=libtasn1.git;a=commit;h=4d4f992826a4962790ecd0cce6fbba4a415ce149":
        ("4d4f992826a4962790ecd0cce6fbba4a415ce149", "77068c35a32cc31ba6b3af257921ca90696c7945"),
    "https://git.savannah.gnu.org/gitweb/?p=libtasn1.git;a=commit;h=a6e0a0b58f5cdaf4e9beca5bce69c09808cbb625":
        ("a6e0a0b58f5cdaf4e9beca5bce69c09808cbb625", "9ec2c56de62085a88eea152941e9b32e3810e5d0"),
    "https://git.savannah.gnu.org/gitweb/?p=libtasn1.git;a=commit;h=f435825c0f527a8e52e6ffbc3ad0bc60531d537e":
        ("f435825c0f527a8e52e6ffbc3ad0bc60531d537e", "d3ca1b00bd920191f1e15a530a45c19bc3ebd0ef"),
    "https://git.savannah.gnu.org/gitweb/?p=quagga.git;a=commitdiff;h=8794e8d229dc9fe29ea31424883433d4880ef408":
        ("8794e8d229dc9fe29ea31424883433d4880ef408", "7621f336e2f346edee43227f0b1ef93fe769720b"),
    "https://git.savannah.gnu.org/gitweb/?p=quagga.git;a=commitdiff;h=3f872fe60463a931c5c766dbf8c36870c0023e88":
        ("3f872fe60463a931c5c766dbf8c36870c0023e88", "747d6e783b60d67e6f83d3681fe523b2ecb01ed3"),
    "https://git.savannah.gnu.org/gitweb/?p=weechat.git;a=commit;h=c265cad1c95b84abfd4e8d861f25926ef13b5d91":
        ("c265cad1c95b84abfd4e8d861f25926ef13b5d91", "bf2f7d33ef7c906142a839ad97bb81d8f13b0c6f"),
    "https://git.savannah.gnu.org/gitweb/?p=weechat.git;a=commitdiff_plain;h=efb795c74fe954b9544074aafcebb1be4452b03a":
        ("efb795c74fe954b9544074aafcebb1be4452b03a", "c1389f8fe19068790d29e39c3f94b71b8c33ea03"),
}

# Rewrite code_link, project, project_before and project_after row by row.
# Each row's values are computed in locals and written back once via the scalar
# accessor `.at`, instead of re-reading the same cell through `.loc` for every
# comparison. Rows are addressed by label, so the index must be 0..n-1.
for i in tqdm(range(bigvul_pp_df.shape[0]), desc='Refining code_link, commit_id, project_before, and project_after'):
    raw_code_link = bigvul_pp_df.at[i, "code_link"]
    repo_host = raw_code_link.split("//")[1].split("/")[0]

    if repo_host not in before_fix_commit_parsable_repo_hosts:
        # Fall back to the fixing commit and its first parent ("<hash>^").
        fix_commit = bigvul_pp_df.at[i, "commit_id"]
        project_after = fix_commit
        project_before = fix_commit + "^"
    else:
        project_after = parse_commit_hash(bigvul_pp_df.at[i, "project_after"])
        project_before = parse_commit_hash(bigvul_pp_df.at[i, "project_before"])

    # Manual pins win over whatever the generic rules above produced.
    override = manual_commit_overrides.get(raw_code_link)
    if override is not None:
        pinned_after, pinned_before = override
        project_after = pinned_after
        if pinned_before is not None:
            project_before = pinned_before

    # The repository link is derived from the *original* code_link; the project
    # name is then the path part of that link with the host and any ".git"
    # suffix stripped.
    repo_link = parse_repo_link(raw_code_link)

    bigvul_pp_df.at[i, "project_after"] = project_after
    bigvul_pp_df.at[i, "project_before"] = project_before
    bigvul_pp_df.at[i, "code_link"] = repo_link
    bigvul_pp_df.at[i, "project"] = "/".join(repo_link.split("//")[1].split(".git")[0].split("/")[1:])

# commit_id has been folded into project_before / project_after where needed.
bigvul_pp_df = bigvul_pp_df.drop(labels=['commit_id'], axis=1)

Refining code_link, commit_id, project_before, and project_after: 100%|██████████| 188491/188491 [20:24<00:00, 153.90it/s]


In [6]:
# Extract the function name from both versions of each function body.
bigvul_pp_df['func_name_before'] = bigvul_pp_df['func_before'].apply(find_function_name)
bigvul_pp_df['func_name_after'] = bigvul_pp_df['func_after'].apply(find_function_name)

# Only when every row yields the same name before and after the fix are the two
# columns collapsed into a single "func_name" column.
# NOTE(review): if any pair differs, both columns are kept and no "func_name"
# column is created here; later cells that reference "func_name" presumably
# rely on this branch having run. Confirm.
if not any(
    name_before != name_after
    for name_before, name_after in zip(
        bigvul_pp_df['func_name_before'], bigvul_pp_df['func_name_after']
    )
):
    bigvul_pp_df = (
        bigvul_pp_df
        .drop(columns=["func_name_after"])
        .rename(columns={"func_name_before": "func_name"})
    )

In [7]:
# A row is identified by its repository link, the before/after commits and the
# function name. Sort on that key, then keep one row per distinct key
# (drop_duplicates keeps the first occurrence by default).
dedup_key_columns = ["code_link", "project_before", "project_after", "func_name"]

bigvul_pp_df = (
    bigvul_pp_df
    .sort_values(by=dedup_key_columns, ignore_index=True)
    .drop_duplicates(subset=dedup_key_columns, ignore_index=True)
)

In [8]:
# Display how many rows carry each value of the `vul` label.
bigvul_pp_df['vul'].value_counts()

0    171586
1     10318
Name: vul, dtype: int64

In [9]:
# Add three all-NaN placeholder columns (nothing in this notebook fills them).
bigvul_pp_df = bigvul_pp_df.assign(
    llvm_ir_before=np.nan,
    llvm_ir_after=np.nan,
    cpg=np.nan,
)

# Reorder the columns, then one-hot encode the label: get_dummies replaces
# "vul" with integer columns "vul_0"/"vul_1", which are renamed to
# "non_vul"/"vul". The index is labelled "id".
bigvul_pp_df = (
    pd.get_dummies(
        bigvul_pp_df[columns_order[:-1] + ["llvm_ir_before", "llvm_ir_after", "cpg", "vul"]],
        columns=["vul"],
        dtype=int,
    )
    .rename(columns={"vul_0": "non_vul", "vul_1": "vul"})
    .rename_axis(index="id")
)

# Persist the final frame as a pickle.
# NOTE(review): the ".zip" suffix presumably makes pandas write a
# zip-compressed pickle via its default compression inference. Confirm.
bigvul_pp_df.to_pickle("big_vul_preprocessed.zip")

In [10]:
# Delete the file at bigvul_path. Raises if that file no longer exists,
# e.g. when this cell is run a second time.
os.remove(bigvul_path)