# CVE Data Frame

In [5]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import re

## Load and Parse CVE Data

In [None]:
def _iter_cve_json_paths(base_dir):
    """Yield the path of every ``.json`` file under the year folders of *base_dir*.

    Only immediate children of *base_dir* whose names are all digits and that
    are directories are descended into; each one is walked recursively.
    Raises whatever ``os.listdir`` raises if *base_dir* cannot be listed.
    """
    for year_dir in os.listdir(base_dir):
        year_path = os.path.join(base_dir, year_dir)
        if not (year_dir.isdigit() and os.path.isdir(year_path)):
            continue
        for root, _dirs, files in os.walk(year_path):
            for filename in files:
                if filename.endswith(".json"):
                    yield os.path.join(root, filename)


def _first_base_score(metrics, key):
    """Return the ``baseScore`` of the first metric entry carrying *key*, else None."""
    return next(
        (metric[key]["baseScore"] for metric in metrics if key in metric),
        None,
    )


def _first_cwe_id(problem_types):
    """Return the first ``cweId`` found in *problem_types*, else None."""
    for problem_type in problem_types:
        for desc in problem_type.get("descriptions", []):
            if "cweId" in desc:
                return desc["cweId"]
    return None


def _cve_record_to_row(cve_data):
    """Flatten one parsed CVE JSON record into a dict keyed by output column name.

    Metadata comes from ``cveMetadata``; everything else is read from
    ``containers.cna``. Missing scalar fields become None and missing
    collections become empty lists.
    """
    meta = cve_data.get("cveMetadata", {})
    cna = cve_data.get("containers", {}).get("cna", {})
    metrics = cna.get("metrics", [])

    # First English description, if any.
    desc_en = next(
        (d.get("value") for d in cna.get("descriptions", []) if d.get("lang") == "en"),
        None,
    )

    # One "<vendor> <product> <version>" string per listed version.
    affected_products = [
        f"{aff.get('vendor', 'n/a')} {aff.get('product', 'n/a')} {ver.get('version', 'n/a')}"
        for aff in cna.get("affected", [])
        for ver in aff.get("versions", [])
    ]

    # Fall back to 'unknown' for a missing credit value, as the impact and
    # reference collections below do, so one incomplete credit entry does not
    # discard the whole record.
    credits = [
        f"{credit.get('value', 'unknown')} ({credit.get('type', 'unknown')})"
        for credit in cna.get("credits", [])
    ]

    impacts = [impact.get("capecId", "unknown") for impact in cna.get("impacts", [])]
    references = [ref.get("url", "unknown") for ref in cna.get("references", [])]

    return {
        "CVE ID": meta.get("cveId", None),
        "State": meta.get("state", None),
        "Assigner Org": meta.get("assignerShortName", None),
        "Date Reserved": meta.get("dateReserved", None),
        "Date Published": meta.get("datePublished", None),
        "Date Updated": meta.get("dateUpdated", None),
        "CVE Description": desc_en,
        # Only the first score of each CVSS version is kept.
        "CVSS Score (v3.1)": _first_base_score(metrics, "cvssV3_1"),
        "CVSS Score (v4.0)": _first_base_score(metrics, "cvssV4_0"),
        "CVSS Score (v2.0)": _first_base_score(metrics, "cvssV2_0"),
        # Only the first CWE is kept.
        "CWE ID": _first_cwe_id(cna.get("problemTypes", [])),
        "CNA Short Name": cna.get("providerMetadata", {}).get("shortName", None),
        "Affected Products": affected_products,
        "Credits": credits,
        "Impacts": impacts,
        "References": references,
    }


all_rows = []
base_dir = "../Data/CVE/cves"

# Load every CVE record found under the year directories, one row per file.
for filepath in _iter_cve_json_paths(base_dir):
    try:
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale default, so non-ASCII text loads the same way everywhere.
        with open(filepath, "r", encoding="utf-8") as file:
            cve_data = json.load(file)
        all_rows.append(_cve_record_to_row(cve_data))
    except Exception as e:
        # Best-effort batch load: report the offending file and keep going.
        print(f"Error processing file {filepath}: {e}")

cve_df = pd.DataFrame(all_rows)
cve_df

Unnamed: 0,CVE ID,State,Assigner Org,Date Reserved,Date Published,Date Updated,CVE Description,CVSS Score (v3.1),CVSS Score (v4.0),CVSS Score (v2.0),CWE ID,CNA Short Name,Affected Products,Credits,Impacts,References
0,CVE-2025-2324,PUBLISHED,ProgressSoftware,2025-03-14T17:30:06.106Z,2025-03-19T15:23:03.486Z,2025-03-19T20:17:04.235Z,Improper Privilege Management vulnerability fo...,5.9,,,CWE-269,ProgressSoftware,"[Progress MOVEit Transfer 2023.1.0, Progress M...",[],[CAPEC-233],[https://community.progress.com/s/article/MOVE...
1,CVE-2025-2148,PUBLISHED,VulDB,2025-03-10T06:12:36.829Z,2025-03-10T12:00:07.912Z,2025-03-10T14:10:36.958Z,A vulnerability was found in PyTorch 2.6.0+cu1...,5.0,2.3,5.1,CWE-119,VulDB,[n/a PyTorch 2.6.0+cu124],[Default436352 (VulDB User) (reporter)],[],"[https://vuldb.com/?id.299059, https://vuldb.c..."
2,CVE-2025-2689,PUBLISHED,VulDB,2025-03-23T09:36:26.587Z,2025-03-24T07:00:07.140Z,2025-03-24T12:17:13.656Z,"A vulnerability, which was classified as criti...",6.3,5.3,6.5,CWE-502,VulDB,"[yiisoft Yii2 2.0.0, yiisoft Yii2 2.0.1, yiiso...",[gaorenyusi (VulDB User) (reporter)],[],"[https://vuldb.com/?id.300710, https://vuldb.c..."
3,CVE-2025-2373,PUBLISHED,VulDB,2025-03-16T13:18:55.057Z,2025-03-17T10:31:05.429Z,2025-03-17T12:07:44.253Z,A vulnerability classified as critical was fou...,6.3,5.3,6.5,CWE-89,VulDB,[PHPGurukul Human Metapneumovirus Testing Mana...,[WenGui (VulDB User) (reporter)],[],"[https://vuldb.com/?id.299872, https://vuldb.c..."
4,CVE-2025-2723,PUBLISHED,VulDB,2025-03-24T12:46:32.307Z,2025-03-25T01:00:06.666Z,2025-03-25T13:26:03.279Z,A vulnerability was found in GNOME libgsf up t...,5.3,4.8,4.3,CWE-122,VulDB,"[GNOME libgsf 1.14.0, GNOME libgsf 1.14.1, GNO...",[ninpwn (VulDB User) (reporter)],[],"[https://vuldb.com/?id.300743, https://vuldb.c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7033,CVE-2025-28938,PUBLISHED,Patchstack,2025-03-11T08:10:05.094Z,2025-03-11T21:01:16.712Z,2025-03-12T13:48:12.223Z,Missing Authorization vulnerability in Bjoern ...,4.3,,,CWE-862,Patchstack,[Bjoern WP Performance Pack n/a],[Trương Hữu Phúc (truonghuuphuc) (Patchstack A...,[CAPEC-180],[https://patchstack.com/database/wordpress/plu...
7034,CVE-2025-28892,PUBLISHED,Patchstack,2025-03-11T08:09:09.176Z,2025-03-11T21:00:48.202Z,2025-03-12T13:51:13.441Z,Cross-Site Request Forgery (CSRF) vulnerabilit...,7.1,,,CWE-352,Patchstack,[a2rocklobster FTP Sync n/a],[Abdi Pranata (Patchstack Alliance) (finder)],[CAPEC-592],[https://patchstack.com/database/wordpress/plu...
7035,CVE-2025-28914,PUBLISHED,Patchstack,2025-03-11T08:09:27.025Z,2025-03-11T21:00:59.669Z,2025-03-12T13:49:57.977Z,Improper Neutralization of Input During Web Pa...,5.9,,,CWE-79,Patchstack,[Ajay Sharma wordpress login form to anywhere ...,[Nabil Irawan (Patchstack Alliance) (finder)],[CAPEC-592],[https://patchstack.com/database/wordpress/plu...
7036,CVE-2025-28943,PUBLISHED,Patchstack,2025-03-11T08:10:05.094Z,2025-03-11T21:01:18.833Z,2025-03-12T14:17:57.505Z,Improper Neutralization of Input During Web Pa...,5.9,,,CWE-79,Patchstack,[mylo2h2s DP ALTerminator - Missing ALT manage...,[Nabil Irawan (Patchstack Alliance) (finder)],[CAPEC-592],[https://patchstack.com/database/wordpress/plu...
