In [2]:
import os, time, json, requests, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm   # progress bar

In [4]:
# ── config ─────────────────────────────────────────────────────────────
API_URL      = "https://services.nvd.nist.gov/rest/json/cves/2.0"
API_KEY      = os.getenv("NVD_API_KEY") or "ea5501a5-24fe-4720-80e3-2abed401d92f"
CPE_URI      = "cpe:2.3:a:adobe:acrobat:-:*:*:*:*:*:*:*"
WHITELIST    = Path("../data/whitelist1.parquet")   # your cleaned file
OUTFILE      = Path("../data/smb_cves.parquet")
RATE_SECS    = 1.2                    # stay below 5-sec soft limit
PER_PAGE     = 2000                   # max allowed by v2.0
# ───────────────────────────────────────────────────────────────────────

headers   = {"apiKey": API_KEY}
start     = 0
all_items = []

while True:
    params = {
        "cpeName": CPE_URI,         # exact-match search
        "resultsPerPage": PER_PAGE,
        "startIndex":   start
    }
    r = requests.get(API_URL, headers=headers, params=params, timeout=20)
    r.raise_for_status()            # raise if HTTP error (4xx / 5xx)

    data      = r.json()
    vulns     = data.get("vulnerabilities", [])
    all_items.extend(vulns)

    # stop when we’ve read the last page
    start += PER_PAGE
    if start >= data.get("totalResults", 0) or not vulns:
        break
    time.sleep(RATE_SECS)

print(f"✅ Retrieved {len(all_items)} CVEs for Adobe Acrobat")

# ── flatten each CVE into a one-row dict ───────────────────────────────
def extract_fields(vuln: dict) -> dict:
    cve      = vuln["cve"]
    metrics  = cve.get("metrics", {})
    refs     = cve.get("references", [])

    # Prefer CVSS v3.1, fall back to v3.0
    cvss31   = metrics.get("cvssMetricV31", [{}])[0].get("cvssData", {})
    cvss30   = metrics.get("cvssMetricV30", [{}])[0].get("cvssData", {})

    cvss     = cvss31 or cvss30                          # whichever exists

    # Grab first English description
    descr    = next(
        (d["value"] for d in cve.get("descriptions", []) if d["lang"] == "en"),
        ""
    )

    # Collect CWE IDs (could be many)
    cwes = [
        w["value"]
        for w in cve.get("weaknesses", [])
        for wmeta in w.get("description", [])
        for w in [wmeta]
        if w.get("lang") == "en"
    ]

    return {
        "cve_id":        cve["id"],
        "published":     cve.get("published"),
        "last_modified": cve.get("lastModified"),

        "cvss_base":     cvss.get("baseScore"),
        "severity":      cvss.get("baseSeverity"),
        "attack_vector": cvss.get("attackVector"),
        "attack_complexity": cvss.get("attackComplexity"),
        "priv_required": cvss.get("privilegesRequired"),
        "user_interact": cvss.get("userInteraction"),

        "cwes":          ";".join(cwes) if cwes else None,
        "description":   descr[:500],           # trim super-long text

        # keep a pipe-separated list of reference URLs
        "references":    "|".join(r["url"] for r in refs[:10]),

        "full_json":     vuln                   # raw blob for later GPT work
    }

# Build DataFrame
records = [extract_fields(v) for v in all_items]
df      = pd.DataFrame(records)

# peek
df.head()

✅ Retrieved 412 CVEs for Adobe Acrobat


Unnamed: 0,cve_id,published,last_modified,cvss_base,severity,attack_vector,attack_complexity,priv_required,user_interact,cwes,description,references,full_json
0,CVE-2006-5857,2006-12-31T05:00:00.000,2025-04-09T00:30:58.490,,,,,,,CWE-399,Adobe Reader and Acrobat 7.0.8 and earlier all...,http://archives.neohapsis.com/archives/fulldis...,"{'cve': {'id': 'CVE-2006-5857', 'sourceIdentif..."
1,CVE-2008-0655,2008-02-07T21:00:00.000,2025-04-09T00:30:58.490,9.8,CRITICAL,NETWORK,LOW,NONE,NONE,NVD-CWE-noinfo;CWE-200,Multiple unspecified vulnerabilities in Adobe ...,http://blogs.adobe.com/acroread/2008/02/adobe_...,"{'cve': {'id': 'CVE-2008-0655', 'sourceIdentif..."
2,CVE-2007-5659,2008-02-12T19:00:00.000,2025-04-09T00:30:58.490,7.8,HIGH,LOCAL,LOW,NONE,REQUIRED,CWE-120;CWE-120,Multiple buffer overflows in Adobe Reader and ...,http://labs.idefense.com/intelligence/vulnerab...,"{'cve': {'id': 'CVE-2007-5659', 'sourceIdentif..."
3,CVE-2007-5663,2008-02-12T19:00:00.000,2025-04-09T00:30:58.490,,,,,,,CWE-94,Adobe Reader and Acrobat 8.1.1 and earlier all...,http://labs.idefense.com/intelligence/vulnerab...,"{'cve': {'id': 'CVE-2007-5663', 'sourceIdentif..."
4,CVE-2007-5666,2008-02-12T19:00:00.000,2025-04-09T00:30:58.490,,,,,,,CWE-94,Untrusted search path vulnerability in Adobe R...,http://labs.idefense.com/intelligence/vulnerab...,"{'cve': {'id': 'CVE-2007-5666', 'sourceIdentif..."


In [None]:
# ingest_all_cves.ipynb  –  pull every CVE for every CPE in the whitelist
import os, time, requests, pandas as pd
from pathlib import Path

# ── config ─────────────────────────────────────────────────────────────
API_URL      = "https://services.nvd.nist.gov/rest/json/cves/2.0"
API_KEY      = os.getenv("NVD_API_KEY") or "ea5501a5-24fe-4720-80e3-2abed401d92f"
WHITELIST    = Path("../data/whitelist1.csv")
OUTFILE      = Path("../data/cves1.parquet")
RATE_SECS    = 4.5
PER_PAGE     = 2000
PROGRESS_EVERY = 25                # how often to print a status line
# ───────────────────────────────────────────────────────────────────────

def fetch_cves_for_cpe(cpe_uri: str) -> list[dict]:
    parts = cpe_uri.split(":")
    if len(parts) < 6:
        return []
    cpe_query = ":".join(parts[:6]) if parts[5] == "*" else cpe_uri

    all_items, start = [], 0
    headers = {"apiKey": API_KEY}

    while True:
        params = {
            "cpeName":        cpe_query,
            "resultsPerPage": PER_PAGE,
            "startIndex":     start,
        }
        r = requests.get(API_URL, headers=headers, params=params, timeout=30)
        if r.status_code != 200:
            print(f"⚠️ {cpe_query[:70]} → {r.status_code}")
            break

        data   = r.json()
        items  = data.get("vulnerabilities", [])
        all_items.extend(items)

        start += PER_PAGE
        if start >= data.get("totalResults", 0) or not items:
            break
        time.sleep(RATE_SECS)
    return all_items

def flatten(v: dict, cpe_uri: str) -> dict:
    cve      = v["cve"]
    metrics  = cve.get("metrics", {})
    cvss31   = metrics.get("cvssMetricV31", [{}])[0].get("cvssData", {})
    cvss30   = metrics.get("cvssMetricV30", [{}])[0].get("cvssData", {})
    cvss     = cvss31 or cvss30
    descr    = next((d["value"] for d in cve.get("descriptions", []) if d["lang"] == "en"), "")
    cwes     = [
        d["value"] for w in cve.get("weaknesses", [])
        for d in w.get("description", []) if d.get("lang") == "en"
    ]
    refs     = "|".join(r["url"] for r in cve.get("references", [])[:10])

    return {
        "cve_id":            cve["id"],
        "cpe_name":          cpe_uri,
        "published":         cve.get("published"),
        "last_modified":     cve.get("lastModified"),

        "cvss_base":         cvss.get("baseScore"),
        "severity":          cvss.get("baseSeverity"),
        "attack_vector":     cvss.get("attackVector"),
        "attack_complexity": cvss.get("attackComplexity"),
        "priv_required":     cvss.get("privilegesRequired"),
        "user_interact":     cvss.get("userInteraction"),

        "cwes":              ";".join(cwes) if cwes else None,
        "description":       descr[:1000],
        "references":        refs,
        "full_json":         v,
    }

# ── 1. load whitelist ─────────────────────────────────────────────────
cpe_list = (
    pd.read_csv(WHITELIST, dtype=str)["cpeName"]
      .dropna()
      .unique()
)
print(f"📋  {len(cpe_list):,} unique CPEs to query")

# ── 2. query API ───────────────────────────────────────────────────────
rows = []
for idx, cpe in enumerate(cpe_list, start=1):
    if idx % PROGRESS_EVERY == 0 or idx == 1:
        print(f"  → {idx}/{len(cpe_list)}   {cpe[:70]}…")

    for vuln in fetch_cves_for_cpe(cpe):
        rows.append(flatten(vuln, cpe))

print("✔️  API queries finished")

# ── 3. build DataFrame & de-dup ────────────────────────────────────────
df = (
    pd.DataFrame(rows)
      .drop_duplicates(subset=["cve_id", "cpe_name"])
      .reset_index(drop=True)
)
print(f"🗂  {df.shape[0]:,} CVE–CPE rows collected")

In [None]:
# ── 4. save ────────────────────────────────────────────────────────────
df.to_parquet(OUTFILE, index=False)
print(f"✅  Saved to {OUTFILE.resolve()}")