## NVD extraction

In [None]:
#@title download
base_url = 'https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-%d.json.zip'
last_year = 2023
for year in range(2002, last_year+1):
  get_url = base_url % year
  !wget $get_url

In [None]:
#@title unzip
base_file = 'nvdcve-1.1-%d.json.zip'
last_year = 2023
for year in range(2002, last_year+1):
  zip_file = base_file % year
  !unzip $zip_file -d '/content/nvdcves'

In [None]:
#@title project parsers
import re
class Parser():
  """Pulls a commit hash out of a reference URL for one project.

  Attributes:
    github: the repository URL passed to the constructor, stored unchanged.
    regex_list: list of (compiled pattern, group number) tuples, kept in the
      order they were supplied.
  """

  def __init__(self, github, regex_with_commit_group_number_list):
    """Store the repository URL and pre-compile every URL pattern.

    Args:
      github: repository URL associated with this parser.
      regex_with_commit_group_number_list: iterable of entries whose first
        item is a regex source string and whose second item is the number of
        the capture group that holds the commit hash.
    """
    self.github = github
    compiled_patterns = []
    for entry in regex_with_commit_group_number_list:
      compiled_patterns.append((re.compile(entry[0]), entry[1]))
    self.regex_list = compiled_patterns

  def checkUrl(self, url):
    """Return the commit hash captured by the first pattern matching `url`.

    Patterns are tried in order and are anchored at the start of the string
    (re.match semantics). Returns None when no pattern matches.
    """
    for pattern, group_number in self.regex_list:
      found = pattern.match(url)
      if found is not None:
        return found.group(group_number)
    return None

# Registry of per-project URL parsers, keyed by project name. Each Parser gets
# the project's GitHub repository URL and a list of (regex, group number)
# pairs; the numbered group captures a 40- or 8-character lowercase hex hash.
All_Parsers = {
  # Commit links on github.com/torvalds/linux or git.kernel.org; hash is group 4.
  'linux_kernel': Parser(
    "https://github.com/torvalds/linux",
    [
      (r".*?(github\.com/torvalds/linux|git\.kernel\.org).*?(commit)+.*?(id\=|h\=|/)+([a-f0-9]{40}|[a-f0-9]{8})", 4),
    ],
  ),

  # Commit links on git.openssl.org using id= or h= parameters; hash is group 4.
  'openssl': Parser(
    "https://github.com/openssl/openssl",
    [
      (r".*?(git\.openssl\.org).*?(commit).*?(id\=|h\=)+([a-f0-9]{40}|[a-f0-9]{8})", 4),
    ],
  ),

  # Commit links on github.com or code.wireshark.org; hash is group 3.
  'wireshark': Parser(
    "https://github.com/wireshark/wireshark",
    [
      (r".*?(github\.com|code\.wireshark\.org).*?(wireshark/commit/|a=commit;h\=)([a-f0-9]{40}|[a-f0-9]{8})", 3),
    ],
  ),
}



In [None]:
#@title read and extract
import json
# Scan every yearly NVD feed and collect, per project, the (commit id, CVE id)
# pairs found in references tagged 'Patch'.
base_json = '/content/nvdcves/nvdcve-1.1-%d.json'
expected_len = 40 #len('01ca667133d019edc9f0a1f70a272447c84ec41f')  (not referenced in this cell)
last_year = 2023


def _first_patch_commit(cve):
  """Return (project, commit_id) for the first recognised patch link of `cve`.

  References are visited in order; only those tagged 'Patch' are considered,
  and for each one the parsers in All_Parsers are tried in dict order. The
  search stops at the first hit, so at most one commit is reported per CVE.
  Returns None when no tagged reference matches any parser.
  """
  for ref in cve['cve']['references']['reference_data']:
    if 'Patch' not in ref['tags']:
      continue
    for project, parser in All_Parsers.items():
      commit_id = parser.checkUrl(ref['url'])
      if commit_id:
        return project, commit_id
  return None


all_commits = {project: [] for project in All_Parsers}
for year in range(2002, last_year+1):
  print(year)  # progress indicator
  # JSON is UTF-8 by specification; do not depend on the platform default.
  with open(base_json % year, 'r', encoding='utf-8') as f:
    data = json.load(f)
  for cve in data['CVE_Items']:
    hit = _first_patch_commit(cve)
    if hit is None:
      continue
    project, commit_id = hit
    all_commits[project].append((commit_id, cve['cve']['CVE_data_meta']['ID'])) # keep the commit id


for project in all_commits:
  print(project, len(all_commits[project])) # print number of commits per projects, before removing duplicates

print('Removing duplicates...')
# Remove duplicate (commit id, CVE id) pairs. Sorting makes the result
# independent of set iteration order, so the saved dataset is reproducible
# from run to run.
for project in all_commits:
  all_commits[project] = sorted(set(all_commits[project]))

for project in all_commits:
  print(project, len(all_commits[project])) # print number of commits per projects, after removing duplicates


In [None]:
#@title save
# Write the per-project (commit id, CVE id) pairs to disk as JSON indented by
# four spaces; the file is created relative to the current working directory.
serialized = json.dumps(all_commits, indent=4)
with open('nvdcve_commits_dataset.json', 'w') as out_file:
  out_file.write(serialized)

## Conversion for V-SZZ + only keep linux-kernel

In [None]:
#@title convert and split dataset
import numpy as np
import json

# Convert the linux_kernel part of the extracted dataset into four roughly
# equal input chunks and write them to OUTPUT_FILE.
OUTPUT_FILE = 'inputs.json'

# Presumably the file produced by the save step when the working directory is
# /content — verify the path if the notebook runs elsewhere.
with open('/content/nvdcve_commits_dataset.json', 'r') as f:
  data = json.load(f)

# Each entry is a [commit id, CVE id] pair; only the commit id is kept. The
# same commit can be paired with several CVE ids, so repeats are dropped here
# (dict.fromkeys keeps the first occurrence and preserves order).
linux_data = list(dict.fromkeys(cve[0] for cve in data['linux_kernel']))

# One record per chunk: [project name, repository slug, commit ids, empty list].
# NOTE(review): record layout and the chunk count of 4 are presumably what
# V-SZZ expects — confirm against its input format.
outputs = [
  ['linux_kernel', 'torvalds/linux', split_data.tolist(), []]
  for split_data in np.array_split(linux_data, 4)
]
with open(OUTPUT_FILE, 'w') as out:
  json.dump(outputs, out, indent=4)

print('saved to', OUTPUT_FILE)
