In [130]:
# ingest_all_cves.ipynb  –  pull every CVE for every CPE in the whitelist
import os, time, requests, pandas as pd
from pathlib import Path

# ── config ─────────────────────────────────────────────────────────────
# NVD CVE API 2.0 endpoint; queried once per page of results.
api_url      = "https://services.nvd.nist.gov/rest/json/cves/2.0"
# The key comes from the environment only: a key written into a notebook is
# leaked to everyone who can read the file. Any key that was ever hardcoded
# here should be treated as compromised and regenerated.
api_key      = os.getenv("NVD_api_key") or None
# CSV of CPEs to query; it is read through its "cpeName" column.
whitelist    = Path("../data/cpe_whitelist1.csv")
# Seconds to pause between API requests. NVD allows far fewer requests per
# time window without a key, so slow down when running anonymously.
rate_secs    = 1.0 if api_key else 6.0
per_page     = 2000                # results requested per API page
progress_every = 25                # how often to print a status line
if api_key is None:
    print("NVD_api_key is not set; querying anonymously at a reduced request rate")
# ───────────────────────────────────────────────────────────────────────

def fetch_cves_for_cpe(cpe_uri: str) -> list[dict]:
    """Return every entry of the API's ``vulnerabilities`` list for one CPE.

    The CPE is sent as the ``cpeName`` query parameter and the result set is
    walked page by page (``per_page`` entries per request) until the next
    ``startIndex`` reaches ``totalResults`` or a page comes back empty.

    Args:
        cpe_uri: Colon-separated CPE string. When its sixth field is ``*``
            only the first six fields are sent as the query value.

    Returns:
        The raw ``vulnerabilities`` entries of all pages fetched, in order.
        ``[]`` when ``cpe_uri`` has fewer than six colon-separated fields.
        On a non-200 response a warning is printed and whatever was collected
        up to that point is returned.
    """
    fields = cpe_uri.split(":")
    if len(fields) < 6:
        return []
    # NOTE(review): a wildcard version is queried as a 6-field prefix —
    # confirm the API accepts that shortened form for ``cpeName``.
    cpe_query = ":".join(fields[:6]) if fields[5] == "*" else cpe_uri

    # Send the key header only when a key is actually configured.
    headers = {"apiKey": api_key} if api_key else {}
    all_items, start = [], 0

    while True:
        params = {
            "cpeName":        cpe_query,
            "resultsPerPage": per_page,
            "startIndex":     start,
        }
        r = requests.get(api_url, headers=headers, params=params, timeout=30)
        # Pause after *every* request, not only between pages: most CPEs fit
        # on one page, so a between-pages sleep alone lets consecutive calls
        # hit the API back-to-back and trip its rate limit.
        time.sleep(rate_secs)
        if r.status_code != 200:
            print(f"⚠️ {cpe_query[:70]} → {r.status_code}")
            break

        data   = r.json()
        items  = data.get("vulnerabilities", [])
        all_items.extend(items)

        start += per_page
        if start >= data.get("totalResults", 0) or not items:
            break
    return all_items

def flatten(v: dict, cpe_uri: str) -> dict:
    """Turn one ``vulnerabilities`` entry into a flat row for the DataFrame.

    Args:
        v: One entry of the API's ``vulnerabilities`` list; must contain a
            ``"cve"`` dict with at least an ``"id"``.
        cpe_uri: The CPE string the entry was fetched for; stored unchanged
            in the ``cpeName`` field.

    Returns:
        A dict with the keys ``cveID``, ``cpeName``, ``published``,
        ``last_modified``, ``cwes`` (``;``-joined English weakness values, or
        ``None`` when there are none), ``description`` (first English
        description, cut to 1000 characters, ``""`` when absent),
        ``references`` (``|``-joined URLs of the first 10 references),
        ``tags`` (``", "``-joined tags of those same references) and
        ``full_json`` (the untouched input ``v``).

    Raises:
        KeyError: if ``v`` has no ``"cve"`` key or the CVE has no ``"id"``.
    """
    cve = v["cve"]

    # CVSS data is deliberately not extracted here: the row never carried it,
    # and indexing ``[0]`` into an empty metric list used to raise IndexError.
    # The scores remain available through ``full_json``.

    # ``.get("lang")`` (as for weaknesses below) so an entry without a
    # language tag is skipped instead of raising KeyError.
    descr = next(
        (d["value"] for d in cve.get("descriptions", []) if d.get("lang") == "en"),
        "",
    )
    cwes = [
        d["value"]
        for w in cve.get("weaknesses", [])
        for d in w.get("description", [])
        if d.get("lang") == "en"
    ]

    # Only the first 10 references feed both the URL and the tag columns.
    top_refs = cve.get("references", [])[:10]
    refs = "|".join(r["url"] for r in top_refs)
    tags = ", ".join(tag for r in top_refs for tag in r.get("tags", []))

    return {
        "cveID":         cve["id"],
        "cpeName":       cpe_uri,
        "published":     cve.get("published"),
        "last_modified": cve.get("lastModified"),
        "cwes":          ";".join(cwes) if cwes else None,
        "description":   descr[:1000],
        "references":    refs,
        "tags":          tags,
        "full_json":     v,
    }

# ── 1. load whitelist ─────────────────────────────────────────────────
# Every column is read as text; only the non-null, distinct "cpeName" values
# are kept, in order of first appearance.
cpe_list = pd.unique(
    pd.read_csv(whitelist, dtype=str)["cpeName"].dropna()
)
print(f"📋  {len(cpe_list):,} unique CPEs to query")

# ── 2. query API ───────────────────────────────────────────────────────
# One flat row per (CVE, CPE) pair returned by the API.
rows = []
for idx, cpe in enumerate(cpe_list, start=1):
    # Status line for the first CPE and then for every `progress_every`-th one.
    if idx == 1 or idx % progress_every == 0:
        print(f"  → {idx}/{len(cpe_list)}   {cpe[:70]}…")

    rows.extend(flatten(vuln, cpe) for vuln in fetch_cves_for_cpe(cpe))

print("✔️  API queries finished")

# ── 3. build DataFrame & de-dup ────────────────────────────────────────
# When nothing was collected, pd.DataFrame([]) has no columns at all and the
# de-dup on ["cveID", "cpeName"] raises KeyError. Give the empty frame the
# same columns that flatten() emits so this cell (and later column lookups)
# still work on an empty result.
row_columns = [
    "cveID", "cpeName", "published", "last_modified",
    "cwes", "description", "references", "tags", "full_json",
]
df = (
    (pd.DataFrame(rows) if rows else pd.DataFrame(columns=row_columns))
      .drop_duplicates(subset=["cveID", "cpeName"])   # one row per CVE–CPE pair
      .reset_index(drop=True)
)
print(f"🗂  {df.shape[0]:,} CVE–CPE rows collected")

📋  1 unique CPEs to query
  → 1/1   cpe:2.3:a:alteryx:alteryx_server:2022.1.1.42590:*:*:*:*:*:*:*…
✔️  API queries finished
🗂  1 CVE–CPE rows collected


In [131]:
# Preview only the first rows: the frame can hold one row per CVE–CPE pair and
# each row carries the full nested API record in `full_json`.
df.head()

Unnamed: 0,cveID,cpeName,published,last_modified,cwes,description,references,tags,full_json
0,CVE-2023-26961,cpe:2.3:a:alteryx:alteryx_server:2022.1.1.4259...,2023-08-08T20:15:10.080,2024-11-21T07:52:07.460,CWE-79,Alteryx Server 2022.1.1.42590 does not employ ...,http://alteryx.com|https://gist.github.com/Dyl...,"Vendor Advisory, Exploit, Third Party Advisory...","{'cve': {'id': 'CVE-2023-26961', 'sourceIdenti..."


In [83]:
from pandas import json_normalize

# Not every CVE carries a CVSS v3.1 metric, and json_normalize raises KeyError
# when `record_path` is missing from any element, so keep only the records
# that actually have a non-empty cvssMetricV31 list.
records_with_v31 = [
    rec for rec in df['full_json']
    if rec.get('cve', {}).get('metrics', {}).get('cvssMetricV31')
]

# One row per CVSS v3.1 metric entry, tagged with the CVE it belongs to.
cvssMetricV31 = (
    json_normalize(
        records_with_v31,
        record_path=['cve', 'metrics', 'cvssMetricV31'],  # cve → metrics → cvssMetricV31 list
        meta=[['cve', 'id']],                             # keep the CVE ID
    )
    # errors='ignore': the columns do not exist when no record had a v3.1 metric
    .drop(columns=['source', 'type'], errors='ignore')
)
cvssMetricV31

Unnamed: 0,exploitabilityScore,impactScore,cvssData.version,cvssData.vectorString,cvssData.baseScore,cvssData.baseSeverity,cvssData.attackVector,cvssData.attackComplexity,cvssData.privilegesRequired,cvssData.userInteraction,cvssData.scope,cvssData.confidentialityImpact,cvssData.integrityImpact,cvssData.availabilityImpact,cve.id
0,1.7,2.7,3.1,CVSS:3.1/AV:N/AC:L/PR:H/UI:R/S:C/C:L/I:L/A:N,4.8,MEDIUM,NETWORK,LOW,HIGH,REQUIRED,CHANGED,LOW,LOW,NONE,CVE-2023-26961


In [117]:
# One row per cpeMatch entry found under cve → configurations → nodes,
# with the owning CVE ID carried along as `cve.id`.
match_path = ['cve', 'configurations', 'nodes', 'cpeMatch']
cpeMatch = pd.json_normalize(
    df['full_json'],
    record_path=match_path,
    meta=[['cve', 'id']],
)

# Show the criteria strings untruncated; the default column width cuts them off.
with pd.option_context('display.max_colwidth', None):
    display(cpeMatch['criteria'])

cpeMatch

0    cpe:2.3:a:alteryx:alteryx_server:2022.1.1.42590:*:*:*:*:*:*:*
Name: criteria, dtype: object

Unnamed: 0,vulnerable,criteria,matchCriteriaId,cve.id
0,True,cpe:2.3:a:alteryx:alteryx_server:2022.1.1.4259...,7E2E8988-A4F4-4C56-9274-0A7F2DA1BD51,CVE-2023-26961
