# NVD CPE Data
---

In [1]:
import json
import pandas as pd
import glob
import re
import numpy as np 
import matplotlib.pyplot as plt
from datetime import date

In [2]:
# Columns of the flattened CVE -> CPE table, in display order.
# Suffix "I" = inclusive bound, "E" = exclusive bound.
CPE_COLUMNS = ['CVE', 'Published', 'CPE', 'StartI', 'StartE', 'EndI', 'EndE']


def extract_cpe_rows(entries):
    """Flatten CVE records into one dict per vulnerable CPE match.

    Parameters
    ----------
    entries : iterable of dict
        Records shaped like ``{'cve': {'id': ..., 'published': ...,
        'configurations': [{'nodes': [{'cpeMatch': [...]}]}]}}``.
        ``configurations`` and ``cpeMatch`` are optional; records or nodes
        without them contribute no rows.

    Returns
    -------
    list of dict
        One row per ``cpeMatch`` item whose ``vulnerable`` flag is truthy,
        keyed by ``CPE_COLUMNS``. Absent version bounds are stored as the
        string ``'None'`` and an absent publish date as ``'Missing_Data'``.

    Raises
    ------
    KeyError
        If a record lacks ``cve``, a configuration lacks ``nodes``, or a
        match lacks ``vulnerable`` / ``criteria`` (or ``id`` on the CVE).
    """
    rows = []
    for entry in entries:
        cve_item = entry['cve']
        for config in cve_item.get('configurations', []):
            for node in config['nodes']:
                for cpe in node.get('cpeMatch', []):
                    if not cpe['vulnerable']:
                        continue
                    rows.append({
                        'CVE': cve_item['id'],
                        'Published': cve_item.get('published', 'Missing_Data'),
                        'CPE': cpe['criteria'],
                        # Each bound goes under the label matching its
                        # inclusivity (previously these were swapped).
                        'StartI': cpe.get('versionStartIncluding', 'None'),
                        'StartE': cpe.get('versionStartExcluding', 'None'),
                        'EndI': cpe.get('versionEndIncluding', 'None'),
                        'EndE': cpe.get('versionEndExcluding', 'None'),
                    })
    return rows


row_accumulator = []

for filename in glob.glob('../../Data/NVD/nvd.jsonl'):
    with open(filename, 'r', encoding='utf-8') as f:
        # NOTE(review): json.load parses the whole file as ONE JSON document
        # (expected here to be a list of CVE records). If the file is ever
        # true line-delimited JSON this call will raise -- confirm the format.
        nvd_data = json.load(f)
    row_accumulator.extend(extract_cpe_rows(nvd_data))

if not row_accumulator:
    # Make a missing/empty data file visible instead of failing later with
    # an opaque KeyError on the 'Published' column.
    print("No vulnerable CPE rows were loaded from '../../Data/NVD/nvd.jsonl'.")

# Explicit columns keep the schema stable even when no rows were loaded.
nvd = pd.DataFrame(row_accumulator, columns=CPE_COLUMNS)

# Unparseable dates (including the 'Missing_Data' placeholder) become NaT.
nvd['Published'] = pd.to_datetime(nvd['Published'], errors='coerce')

# Despite its name, this mask keeps everything published strictly between
# 2000-01-01 and 2026-01-01; NaT compares False and is therefore dropped.
thisyear = (nvd['Published'] > '2000-01-01') & (nvd['Published'] < '2026-01-01')
nvd = nvd.loc[thisyear]
nvd = nvd.sort_values(by=['Published'])

## CPE Data

### CVEs With Most CPEs

In [4]:
# Tally how many CPE rows each CVE contributes (most frequent first)
cve_counts = (
    nvd['CVE']
    .value_counts()
    .rename_axis('CVE')
    .reset_index(name='Count')
)

# Show the 20 CVEs with the most CPE rows
cve_counts.head(20)

Unnamed: 0,CVE,Count
0,CVE-2016-1409,4891
1,CVE-2017-6770,3788
2,CVE-2016-6380,3334
3,CVE-2021-3942,2942
4,CVE-2024-20433,2434
5,CVE-2008-4609,2046
6,CVE-2020-3201,1917
7,CVE-2019-1761,1762
8,CVE-2021-34705,1721
9,CVE-2009-5040,1550


### Most Common CPEs

In [5]:
# Tally how many rows reference each CPE string (most frequent first)
cpe_tally = nvd['CPE'].value_counts()
cpe_counts = cpe_tally.rename_axis('CPE').reset_index(name='Count')

# Show the 20 most frequently listed CPEs
cpe_counts.head(20)

Unnamed: 0,CPE,Count
0,cpe:2.3:o:linux:linux_kernel:*:*:*:*:*:*:*:*,19044
1,cpe:2.3:o:debian:debian_linux:9.0:*:*:*:*:*:*:*,3999
2,cpe:2.3:o:apple:iphone_os:*:*:*:*:*:*:*:*,3729
3,cpe:2.3:a:google:chrome:*:*:*:*:*:*:*:*,3606
4,cpe:2.3:o:debian:debian_linux:8.0:*:*:*:*:*:*:*,3481
5,cpe:2.3:o:debian:debian_linux:10.0:*:*:*:*:*:*:*,3222
6,cpe:2.3:o:microsoft:windows_server_2012:r2:*:*...,3091
7,cpe:2.3:o:microsoft:windows_server_2012:-:*:*:...,2949
8,cpe:2.3:o:apple:macos:*:*:*:*:*:*:*:*,2845
9,cpe:2.3:o:microsoft:windows_server_2019:-:*:*:...,2740


### Number of CPEs

In [None]:
# Distinct CPE strings and distinct CVE ids present in the table
unique_cpes, total_cves = (nvd[column].nunique() for column in ('CPE', 'CVE'))

# Summary line; the ':,' format spec adds thousands separators
sentence = f"There are {unique_cpes:,} unique CPEs across {total_cves:,} total CVEs."

# Emit the summary
print(sentence)

There are 349,365 unique CPEs across 245,407 total CVEs.
