In [1]:
import pandas as pd
from datasets import load_dataset
import os

# --- Configuration ---
# The Hugging Face dataset identifier
HF_DATASET_ID = "Bouquets/Cybersecurity-LLM-CVE"
# Name of the file to save the data to locally (recommended)
OUTPUT_CSV_FILE = "cve_data_raw.csv"

def get_cve_data(dataset_id: str, output_file: str) -> pd.DataFrame:
    """
    Acquire the CVE dataset and return it as a Pandas DataFrame.

    If ``output_file`` already exists and contains rows, it is read and
    returned without contacting Hugging Face. Otherwise the 'train' split of
    ``dataset_id`` is downloaded, converted to a DataFrame and written to
    ``output_file`` as a CSV checkpoint so later runs can skip the download.
    Delete ``output_file`` to force a fresh download.

    Args:
        dataset_id: Hugging Face dataset identifier passed to ``load_dataset``.
        output_file: Path of the local CSV checkpoint to read from / write to.

    Returns:
        The loaded DataFrame, or an empty DataFrame if the download fails.
        A failure to write the checkpoint is reported but does not discard
        the data that was already downloaded.
    """
    # Reuse the local checkpoint when one is available.
    if os.path.isfile(output_file):
        try:
            # keep_default_na=False keeps empty cells as "" instead of NaN, so
            # blank text columns survive the CSV round trip unchanged.
            # NOTE(review): dtypes are re-inferred by read_csv; confirm this is
            # acceptable if the dataset ever carries non-text columns.
            cached_df = pd.read_csv(output_file, keep_default_na=False)
        except Exception as e:
            # An unreadable/corrupt checkpoint is not fatal: fall back to download.
            print(f"Could not read local file {output_file}: {e}")
        else:
            if not cached_df.empty:
                print(f"--- 1. Loading Data from local file: {output_file} ---")
                print(f"Successfully loaded {len(cached_df)} CVE records.")
                return cached_df

    print(f"--- 1. Acquiring Data from Hugging Face: {dataset_id} ---")

    try:
        # Load the 'train' split of the dataset
        dataset = load_dataset(dataset_id, split='train')

        # Convert the Hugging Face Dataset object to a Pandas DataFrame
        cve_df = dataset.to_pandas()
    except Exception as e:
        print(f"An error occurred during data loading: {e}")
        # In a real project, you'd add more sophisticated error handling
        return pd.DataFrame()  # Return empty DataFrame on failure

    print(f"Successfully loaded {len(cve_df)} CVE records.")

    # Save the DataFrame locally as a checkpoint. This is handled separately
    # from the download so that a write error cannot throw away loaded data.
    try:
        cve_df.to_csv(output_file, index=False)
    except OSError as e:
        print(f"Warning: could not save data to local file {output_file}: {e}")
    else:
        print(f"Data saved to local file: {output_file}")

    return cve_df

# --- Run the acquisition step ---
cve_dataframe = get_cve_data(dataset_id=HF_DATASET_ID, output_file=OUTPUT_CSV_FILE)

# get_cve_data hands back an empty frame on failure, so only preview real data.
if cve_dataframe.size > 0:
    print("\n--- Sample of Loaded Data ---")
    # Show the first three rows as a left-aligned markdown table.
    print(
        cve_dataframe.iloc[:3].to_markdown(
            index=False,
            numalign="left",
            stralign="left",
        )
    )
    print(f"\nTotal columns available: {list(cve_dataframe.columns)}")

  from .autonotebook import tqdm as notebook_tqdm


--- 1. Acquiring Data from Hugging Face: Bouquets/Cybersecurity-LLM-CVE ---


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but t

Successfully loaded 124732 CVE records.
Data saved to local file: cve_data_raw.csv

--- Sample of Loaded Data ---
| instruction                                              | inputs   | outputs                                                                                                                                                                                                                        |
|:---------------------------------------------------------|:---------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
| Please provide detailed information about CVE-2020-13909 |          | CVE:CVE-2020-13909                                                                                                                                                                                                          