# Purpose

* 1) Download all CVEs as zips, extract, store in ../../data/raw/ and delete zip
* 2) Extract relevant data from each CVE and store in a dataframe
* 3) Save dataframe as JSON object for processing later

In [7]:
import gzip
import json
import os
import shutil
import urllib
import urllib.request
from datetime import datetime

import pandas as pd

## 1) Download CVE and save in ../../data/raw

NOTE: Pause Dropbox syncing before running this; otherwise Dropbox will not let `os.remove` succeed.

These files are already stored, extracted, in the Git repo. Only run this if you need fresh files or did not get them from Git.

In [4]:
# Download every yearly NVD CVE feed (2002-2018), decompress it into
# ../../data/raw/ and delete the downloaded .gz archive afterwards.
raw_dir = '../../data/raw/'
os.makedirs(raw_dir, exist_ok=True)  # the directory may be missing on a fresh checkout

for year in range(2002, 2018+1):
    file_name = 'nvdcve-1.0-{0}.json'.format(year)
    print('Downloading:', file_name)
    remote_file = 'https://nvd.nist.gov/feeds/json/cve/1.0/' + file_name + '.gz'
    local_gzip_file = raw_dir + file_name + '.gz'
    local_file = raw_dir + file_name
    try:
        urllib.request.urlretrieve(remote_file, local_gzip_file)
        # Stream the decompressed bytes straight to disk instead of loading
        # the whole feed into memory.
        with gzip.open(local_gzip_file, 'rb') as f_gzip, open(local_file, 'wb') as f_raw:
            shutil.copyfileobj(f_gzip, f_raw)
    finally:
        # Remove the archive even when the download or extraction failed
        # part-way, so no stray .gz files are left behind in the data directory.
        if os.path.exists(local_gzip_file):
            os.remove(local_gzip_file)

Downloading: nvdcve-1.0-2002.json
Downloading: nvdcve-1.0-2003.json
Downloading: nvdcve-1.0-2004.json
Downloading: nvdcve-1.0-2005.json
Downloading: nvdcve-1.0-2006.json
Downloading: nvdcve-1.0-2007.json
Downloading: nvdcve-1.0-2008.json
Downloading: nvdcve-1.0-2009.json
Downloading: nvdcve-1.0-2010.json
Downloading: nvdcve-1.0-2011.json
Downloading: nvdcve-1.0-2012.json
Downloading: nvdcve-1.0-2013.json
Downloading: nvdcve-1.0-2014.json
Downloading: nvdcve-1.0-2015.json
Downloading: nvdcve-1.0-2016.json
Downloading: nvdcve-1.0-2017.json
Downloading: nvdcve-1.0-2018.json


## 2) Extract the important bits from each CVE

notes
* CVSS V3 didn't take off until 2015
* Not all entries have V3, and some entries have neither V2 nor V3 (those are skipped)
* Will use V3 when it exists, and will use V2 when not, requires some mapping

In [44]:
# Build one flat record per CVE from the yearly NVD feeds and gather
# statistics on which CVSS versions each entry carries.
#
# Each mapping goes from the output field name to the key holding that value
# in the corresponding CVSS block. The key order here is the order in which
# the fields are added to every record.
V3_FIELD_MAP = {
    'access': 'attackVector',
    'complexity': 'attackComplexity',
    'authentication': 'privilegesRequired',
    'confidentiality': 'confidentialityImpact',
    'integrity': 'integrityImpact',
    'availability': 'availabilityImpact',
}
V2_FIELD_MAP = {
    'access': 'accessVector',
    'complexity': 'accessComplexity',
    'authentication': 'authentication',
    'confidentiality': 'confidentialityImpact',
    'integrity': 'integrityImpact',
    'availability': 'availabilityImpact',
}

cves = []
num_v3 = 0
num_v2 = 0
num_v3_only = 0
num_v2_only = 0
neither = 0
for year in range(2002, 2018+1):
    print("\rProcessing {0}...".format(year), end='')
    # Parse directly from the file handle and close it before processing.
    with open('../../data/raw/nvdcve-1.0-{0}.json'.format(year),  'r', encoding='utf-8') as fin:
        json_data = json.load(fin)

    for cve_data in json_data['CVE_Items']:
        impact = cve_data['impact']
        # Evaluate each membership test once and reuse the result below.
        has_v3 = 'baseMetricV3' in impact
        has_v2 = 'baseMetricV2' in impact

        # do some statistical gathering (skipped entries are counted too)
        if has_v3:
            num_v3 += 1
            if not has_v2:
                num_v3_only += 1
        if has_v2:
            num_v2 += 1
            if not has_v3:
                num_v2_only += 1
        if not has_v3 and not has_v2:
            neither += 1

        # Prefer V3 when it exists, fall back to V2, and skip entries that
        # have neither because there is nothing to extract from them.
        if has_v3:
            metrics = impact['baseMetricV3']['cvssV3']
            field_map = V3_FIELD_MAP
        elif has_v2:
            metrics = impact['baseMetricV2']['cvssV2']
            field_map = V2_FIELD_MAP
        else:
            continue

        cve = {
            'id': cve_data['cve']['CVE_data_meta']['ID'],
            # Only the date part of the timestamp (before the 'T') is kept.
            'date': datetime.strptime(cve_data['publishedDate'].split('T')[0], '%Y-%m-%d'),
            'v2': int(has_v2),
            'v3': int(has_v3),
        }
        for field, source_key in field_map.items():
            cve[field] = metrics[source_key]

        cves.append(cve)

print()
print("num V3", num_v3)
print("num V3 only", num_v3_only)
print("num V2", num_v2)
print("num V2 only", num_v2_only)
print("neither", neither)
print("total cves", len(cves))

Processing 2018...
num V3 29392
num V3 only 0
num V2 102947
num V2 only 73555
neither 8573
total 102947


In [45]:
# Serialise the whole list before opening the output file, so a failure
# during serialisation cannot leave a truncated cves.json behind.
# default=str converts values json cannot encode natively (the datetime
# stored under 'date') to their string form.
json_output = json.dumps(cves, default=str)

with open('../../data/processed/cves.json', mode='w') as fout:
    fout.write(json_output)

num_written = len(cves)
print("wrote {0} cves".format(num_written))

wrote 102947 cves
