# EPSS / NVD Data Merge

In [None]:
%%capture
# Start from an empty working directory named jsondata.
!rm -rf jsondata
!mkdir -p jsondata
# Switch the notebook's working directory so the relative file names used by
# the later cells (the NVD feed glob and the EPSS CSV) resolve inside jsondata.
%cd jsondata
!rm -rf *
# Fetch the yearly NVD CVE 1.1 feeds for 2002 through 2022.
# NOTE(review): {2002..2022} relies on the shell's brace-range expansion (a
# bash feature) - confirm the shell behind `!` supports it.
!wget https://nvd.nist.gov/feeds/json/cve/1.1/nvdcve-1.1-{2002..2022}.json.zip 
# Extract every downloaded archive; -o overwrites existing files without prompting.
!unzip -o "*.zip" 
# Fetch the current EPSS scores and decompress them to epss_scores-current.csv.
!wget https://epss.cyentia.com/epss_scores-current.csv.gz
!gzip -d epss_scores-current.csv.gz  

### Import Python Libraries

In [None]:
import glob
import logging
import json
import numpy as np
import pandas as pd
import warnings


# Keep notebook output quiet: mute matplotlib's font-manager logger and
# hide every Python warning.
font_manager_log = logging.getLogger('matplotlib.font_manager')
font_manager_log.disabled = True
warnings.filterwarnings(action="ignore")


## NVD Data

In [None]:
# Placeholder stored when an NVD entry lacks a field.
MISSING = 'Missing_Data'

# Columns of the `nvd` frame, in output order. Passing them explicitly to the
# DataFrame constructor keeps the schema stable even when no feed file is found.
NVD_COLUMNS = [
    'CVE',
    'Published',
    'BaseScore',
    'BaseSeverity',
    'CWE',
    'Scope',
    'AttackVector',
    'AttackComplexity',
    'PrivilegesRequired',
    'UserInteraction',
    'Description',
]


def nested_get(container, path, default):
    """Follow ``path`` (a sequence of keys/indices) through nested dicts/lists.

    Returns the value found at the end of the path, or ``default`` when any
    step is missing (absent key, index out of range, or a node that cannot be
    subscripted).
    """
    node = container
    try:
        for step in path:
            node = node[step]
    except (KeyError, IndexError, TypeError):
        return default
    return node


def nvd_entry_to_row(entry):
    """Flatten one item of a feed's ``CVE_Items`` list into a row dict.

    Every optional field falls back to ``MISSING``, except ``BaseScore``
    (``'0.0'``) and ``Description`` (``''``).

    Raises:
        KeyError: if the entry has no ``cve.CVE_data_meta.ID``; an entry
            without an identifier cannot be joined to EPSS, so it is not
            silently defaulted.
    """
    cve_block = entry['cve']
    cvss_v3 = nested_get(entry, ('impact', 'baseMetricV3', 'cvssV3'), {})

    def metric(name, default=MISSING):
        return nested_get(cvss_v3, (name,), default)

    return {
        'CVE': cve_block['CVE_data_meta']['ID'],
        'Published': nested_get(entry, ('publishedDate',), MISSING),
        # NOTE(review): the fallback is the *string* '0.0' while scores read
        # from the feed are stored as-is, so this column may mix types. Kept
        # unchanged so existing CSV/JSON outputs do not change.
        'BaseScore': metric('baseScore', '0.0'),
        'BaseSeverity': metric('baseSeverity'),
        'CWE': nested_get(
            cve_block,
            ('problemtype', 'problemtype_data', 0, 'description', 0, 'value'),
            MISSING,
        ),
        'Scope': metric('scope'),
        'AttackVector': metric('attackVector'),
        'AttackComplexity': metric('attackComplexity'),
        'PrivilegesRequired': metric('privilegesRequired'),
        'UserInteraction': metric('userInteraction'),
        'Description': nested_get(
            cve_block, ('description', 'description_data', 0, 'value'), ''
        ),
    }


def load_nvd_rows(pattern='nvdcve-1.1-*.json'):
    """Parse every feed file matching ``pattern`` and return all row dicts.

    Files are visited in sorted name order so the row order is reproducible
    regardless of the order in which the filesystem lists them.
    """
    rows = []
    for filename in sorted(glob.glob(pattern)):
        with open(filename, 'r', encoding='utf-8') as f:
            feed = json.load(f)
        rows.extend(nvd_entry_to_row(item) for item in feed['CVE_Items'])
    return rows


row_accumulator = load_nvd_rows()

# Build the frame once, after all files are read (it used to be rebuilt per
# file, and was left undefined when no file matched).
nvd = pd.DataFrame(row_accumulator, columns=NVD_COLUMNS)

# Reduce the publication timestamp to a plain calendar date.
nvd['Published'] = pd.to_datetime(nvd['Published']).apply(lambda x: x.date())


In [None]:
## EPSS Data

In [None]:
# Load the EPSS scores CSV produced by the download cell.
# skiprows=1 discards the file's first line before the header row is parsed.
# NOTE(review): presumably that line is metadata at the top of the EPSS
# export rather than data - confirm against a current download.
epss = pd.read_csv('epss_scores-current.csv', skiprows=1)


In [None]:
## Combine & Clean Data

In [None]:
# Inner-join the NVD rows with the EPSS rows on the CVE identifier
# (`CVE` in nvd, `cve` in epss); CVEs missing from either side are dropped.
epss_nvd = nvd.merge(epss, how='inner', left_on='CVE', right_on='cve')

In [None]:
# Display the merged frame's column names as a plain list.
epss_nvd.columns.tolist()

In [None]:
# Keep only the report columns, in output order, then publish the two score
# columns under their report names.
report_columns = [
    'CVE',
    'epss',
    'BaseScore',
    'BaseSeverity',
    'CWE',
    'Scope',
    'AttackVector',
    'AttackComplexity',
    'PrivilegesRequired',
    'UserInteraction',
    'Description',
    'Published',
]
column_renames = {'epss': 'EPSS', 'BaseScore': 'CVSS_V3'}
epss_nvd = epss_nvd[report_columns].rename(columns=column_renames)

In [None]:
## Save Data 

In [None]:
# Serialise the merged frame to a list of records and echo it, pretty-printed.
result = epss_nvd.to_json(orient="records")
parsed = json.loads(result)
pretty_ascii = json.dumps(parsed, indent=4)
print(pretty_ascii)

# Persist the same records as UTF-8 JSON (non-ASCII characters kept as-is).
with open('epss_enriched.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(parsed, ensure_ascii=False, indent=4))

# Also write a flat CSV copy, without the index column.
epss_nvd.to_csv("epss_enriched.csv", index=False)