In [46]:
whitey = whitelist['cpeName']
whitey = pd.DataFrame(whitey)
whitey.to_csv('../data/whitelist1list.csv')

In [52]:
import os, time, json, requests, pandas as pd
from pathlib import Path

# ── config ─────────────────────────────────────────────────────────────
API_URL      = "https://services.nvd.nist.gov/rest/json/cves/2.0"
API_KEY      = os.getenv("NVD_API_KEY") or "ea5501a5-24fe-4720-80e3-2abed401d92f"
whitelist    = pd.read_csv('../data/whitelist1list.csv')
cpe_uri      = whitelist['cpeName'].tolist()
OUTFILE      = Path("../data/cves1.parquet")
RATE_SECS    = 1.0
PER_PAGE     = 2000
PROGRESS_EVERY = 25                # how often to print a status line
start_date = "2023-01-01"
end_date   = "2024-12-31"
# ───────────────────────────────────────────────────────────────────────

In [56]:
def fetch_cves_for_cpe(cpe_uri, start_date, end_date):
    cpe_query = cpe_uri

    all_items, start = [], 0
    headers = {"apiKey": API_KEY}

    # Define the publication date window (RFC3339 format)
    start_date = "2023-01-01"
    end_date   = "2024-12-31"

    while True:
        params = {
            "cpeName":        cpe_query,
            "resultsPerPage": PER_PAGE,
            "startIndex":     start,
            "pubStartDate": f"{start_date}T00:00:00.000Z",
            "pubEndDate": f"{end_date}T23:59:59.999Z",
        }
        r = requests.get(API_URL, headers=headers, params=params, timeout=30)
        if r.status_code != 200:
            print(f"⚠️  {cpe_query[:70]} → {r.status_code}")
            break

        data  = r.json()
        items = data.get("vulnerabilities", [])
        if not items:
            break

        all_items.extend(items)
        start += PER_PAGE
        if start >= data.get("totalResults", 0):
            break

        time.sleep(RATE_SECS)

    return all_items

def flatten(v: dict, cpe_uri: str) -> dict:
    cve      = v["cve"]
    metrics  = cve.get("metrics", {})
    cvss31   = metrics.get("cvssMetricV31", [{}])[0].get("cvssData", {})
    cvss30   = metrics.get("cvssMetricV30", [{}])[0].get("cvssData", {})
    cvss     = cvss31 or cvss30
    descr    = next((d["value"] for d in cve.get("descriptions", []) if d["lang"] == "en"), "")
    cwes     = [
        d["value"] for w in cve.get("weaknesses", [])
        for d in w.get("description", []) if d.get("lang") == "en"
    ]
    refs     = "|".join(r["url"] for r in cve.get("references", [])[:10])

    return {
        "cve_id":            cve["id"],
        "cpe_name":          cpe_uri,
        "published":         cve.get("published"),
        "last_modified":     cve.get("lastModified"),

        "cvss_base":         cvss.get("baseScore"),
        "severity":          cvss.get("baseSeverity"),
        "attack_vector":     cvss.get("attackVector"),
        "attack_complexity": cvss.get("attackComplexity"),
        "priv_required":     cvss.get("privilegesRequired"),
        "user_interact":     cvss.get("userInteraction"),

        "cwes":              ";".join(cwes) if cwes else None,
        "description":       descr[:1000],
        "references":        refs,
        "full_json":         v,
    }

# ── 1. load whitelist ─────────────────────────────────────────────────
cpe_list = (
    pd.read_csv(WHITELIST, dtype=str)["cpeName"]
      .dropna()
      .unique()
)
print(f"📋  {len(cpe_list):,} unique CPEs to query")

# ── 2. query API ───────────────────────────────────────────────────────


for vuln in fetch_cves_for_cpe(cpe_list, start_date, end_date):
    rows.append(flatten(vuln, cpe))

print("✔️  API queries finished")

📋  932 unique CPEs to query
⚠️  ['cpe:2.3:a:adobe:acrobat:*:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:1:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.0.1:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.0.2:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.0.3:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.0:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.10:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.11:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.12:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.13:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.14:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.15:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.16:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.1:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.2:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.3:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.4:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.5:*:*:*:*:*:*:*'
 'cpe:2.3:a:adobe:acrobat:10.1.6:*:*:*:*:*:*:*'
 'cpe:2.3:a:ado

In [None]:
(cpe_uri: str) -> list[dict]