In [1]:
# --- Notebook Cell: Stage 2 - Build FAISS Database (Corrected Import Order) ---

import pandas as pd
import numpy as np
import json
import os
import pathlib
import requests
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
# DO NOT import faiss here yet
import gc

# --- Configuration ---
print("--- Stage 2: FAISS Database Build (Saving to /kaggle/working/) ---")
# ... (rest of configuration section: MITRE_URL, CVE_CSV_PATH, OUTPUT_DIR etc.) ...
MITRE_URL = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json"
MITRE_JSON_PATH = "/kaggle/working/mitre_enterprise_attack.json"
CVE_CSV_PATH = "/kaggle/input/vulnerability-management-datasets/cve_cisa_epss_enriched_dataset.csv"
OUTPUT_DIR = pathlib.Path("/kaggle/working/")
FAISS_INDEX_PATH = OUTPUT_DIR / "faiss_mitre_cve_index.bin"
ID_MAP_PATH = OUTPUT_DIR / "faiss_mitre_cve_id_map.json"
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

# --- 1. Install Libraries ---
print("Installing libraries...")
!pip install -q sentence-transformers faiss-cpu requests
print("Libraries installed.")

# --- IMPORT FAISS **AFTER** INSTALLATION ---
try:
    import faiss
    print("Successfully imported faiss.")
except ImportError as e:
    print(f"❗️ ERROR: Failed to import faiss even after installation: {e}")
    print("❗️ Please check the pip install output above for errors.")
    raise # Stop execution if import fails

# --- 2. Download MITRE Data ---
# ... (rest of the code: download MITRE, process MITRE, process CVE, load model, encode, build index, save) ...
# ... (The rest of the code from message_idx: 77 follows here) ...

2025-04-30 02:03:30.659471: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745978610.855702      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745978610.911163      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


--- Stage 2: FAISS Database Build (Saving to /kaggle/working/) ---
Installing libraries...
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m47.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.5/207.5 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2

In [2]:
# --- Installation Cell (Run First) ---
print("Installing libraries...")
!pip install -q sentence-transformers faiss-cpu requests
print("Libraries installed.")

Installing libraries...
Libraries installed.


In [3]:
# --- Kaggle Notebook Cell: Stage 2 - Build FAISS Database (Final Version) ---

import pandas as pd
import numpy as np
import json
import os
import pathlib
import requests # To download MITRE JSON
from sentence_transformers import SentenceTransformer # For embeddings
from tqdm.notebook import tqdm # Progress bars
import faiss # FAISS library for indexing (Import after installation cell)
import gc # Garbage collector

# --- Configuration ---
print("--- Stage 2: FAISS Database Build (Saving to /kaggle/working/) ---")
# Official MITRE ATT&CK Enterprise JSON URL
MITRE_URL = "https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json"
# Path to download the MITRE JSON within Kaggle's working directory
MITRE_JSON_PATH = "/kaggle/working/mitre_enterprise_attack.json"
# Path to the added Kaggle dataset for CVEs
CVE_CSV_PATH = "/kaggle/input/vulnerability-management-datasets/cve_cisa_epss_enriched_dataset.csv"
# Output paths in Kaggle's working directory
OUTPUT_DIR = pathlib.Path("/kaggle/working/")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True) # Ensure output dir exists
FAISS_INDEX_PATH = OUTPUT_DIR / "faiss_mitre_cve_index.bin"
ID_MAP_PATH = OUTPUT_DIR / "faiss_mitre_cve_id_map.json"
# Embedding model (ensure GPU is enabled in notebook settings for speed)
EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
# --- End Configuration ---

# --- Verify faiss import ---
try:
    import faiss
    print("Successfully imported faiss.")
except ImportError as e:
    print(f"❗️ ERROR: Failed to import faiss: {e}")
    print("❗️ Please ensure the installation cell was run successfully.")
    raise # Stop execution if import fails

# --- Download MITRE Data ---
print(f"\nDownloading MITRE ATT&CK data from {MITRE_URL}...")
try:
    response = requests.get(MITRE_URL)
    response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
    with open(MITRE_JSON_PATH, 'wb') as f:
        f.write(response.content)
    print(f"Successfully downloaded to {MITRE_JSON_PATH}")
except requests.exceptions.RequestException as e:
    print(f"❗️ ERROR downloading MITRE data: {e}")
    raise
except Exception as e:
    print(f"❗️ An unexpected error occurred during MITRE download: {e}")
    raise

# --- Load & Prepare Text Data ---
texts_for_embedding = []
ids_for_embedding = [] # Will store 'MITRE_T:Txxxx', 'MITRE_Tac:TAxxxx', 'MITRE_M:Mxxxx', 'CVE:CVE-yyyy-nnnn'

# Process MITRE JSON
print("\nProcessing MITRE ATT&CK data...")
try:
    with open(MITRE_JSON_PATH, 'r', encoding='utf-8') as f:
        mitre_data = json.load(f)

    mitre_ids_processed = set() # To avoid duplicates
    for obj in tqdm(mitre_data.get("objects", []), desc="Parsing MITRE"):
        obj_type = obj.get("type")
        mitre_id = None
        if obj_type in ["attack-pattern", "x-mitre-tactic", "course-of-action"]:
            ext_refs = obj.get("external_references", [])
            mitre_ref = next((ref for ref in ext_refs if ref.get("source_name") in ["mitre-attack", "mitre-ics-attack", "mitre-mobile-attack"]), None)
            if mitre_ref: mitre_id = mitre_ref.get("external_id")
            name = obj.get("name")
            description = str(obj.get("description", "")).strip()

            if mitre_id and name and description and mitre_id not in mitre_ids_processed:
                full_text = f"{name}: {description}"
                full_text = " ".join(full_text.split())
                texts_for_embedding.append(full_text)
                if obj_type == "attack-pattern": id_prefix = "MITRE_T"
                elif obj_type == "x-mitre-tactic": id_prefix = "MITRE_Tac"
                elif obj_type == "course-of-action": id_prefix = "MITRE_M"
                else: id_prefix = "MITRE_Other"
                ids_for_embedding.append(f"{id_prefix}:{mitre_id}")
                mitre_ids_processed.add(mitre_id)

    print(f"Processed {len(mitre_ids_processed)} unique MITRE techniques/tactics/mitigations.")
    del mitre_data; gc.collect()
except FileNotFoundError: print(f"❗️ ERROR: MITRE JSON file not found at {MITRE_JSON_PATH}.")
except Exception as e: print(f"❗️ ERROR processing MITRE JSON: {e}")

# Process CVE CSV from Kaggle Dataset (with dynamic column finding)
print("\nProcessing CVE data...")
try:
    if not os.path.exists(CVE_CSV_PATH):
         print(f"❗️ ERROR: CVE file not found at {CVE_CSV_PATH}.")
         print("❗️ Please ensure you have added the 'vulnerability-management-datasets' dataset.")
    else:
        print(f"Reading header from {CVE_CSV_PATH} to find columns...")
        header_df = pd.read_csv(CVE_CSV_PATH, nrows=0, low_memory=False)
        actual_columns = [col.strip() for col in header_df.columns]
        print(f"--- DEBUG: Found columns in CVE CSV: {actual_columns}")
        cve_id_col = next((col for col in actual_columns if 'cve_id' in col.lower()), None)
        desc_col = next((col for col in actual_columns if 'description' in col.lower()), None)

        if not cve_id_col or not desc_col:
            print(f"❗️ ERROR: Could not find required CVE columns ('cve_id', 'description') in header: {actual_columns}")
        else:
            print(f"--- DEBUG: Using columns: ID='{cve_id_col}', Description='{desc_col}'")
            print(f"Reading full CVE data using columns: ['{cve_id_col}', '{desc_col}']...")
            cve_df = pd.read_csv(
                CVE_CSV_PATH, usecols=[cve_id_col, desc_col],
                low_memory=False, dtype={desc_col: str}
            )
            # Ensure consistent column names after reading with usecols
            cve_df.columns = ['cve_id', 'description'] # Use standard names internally

            cve_df.dropna(subset=['cve_id', 'description'], inplace=True)
            cve_df['description'] = cve_df['description'].astype(str)
            cve_df = cve_df.drop_duplicates(subset=['cve_id'])

            initial_len = len(texts_for_embedding)
            for _, row in tqdm(cve_df.iterrows(), total=len(cve_df), desc="Parsing CVEs"):
                cve_id = row['cve_id']
                description = row['description'].strip()
                if cve_id and description:
                    full_text = " ".join(description.split())
                    texts_for_embedding.append(full_text)
                    ids_for_embedding.append(f"CVE:{cve_id}")
            print(f"Processed {len(texts_for_embedding) - initial_len} unique CVEs.")
            del cve_df, header_df; gc.collect()

except FileNotFoundError: print(f"Skipping CVE processing - file not found at {CVE_CSV_PATH}")
except ValueError as e: print(f"Skipping CVE processing - Value Error: {e}")
except Exception as e: print(f"❗️ An unexpected error occurred processing CVE CSV: {e}")

if not texts_for_embedding:
    raise ValueError("No text data loaded from MITRE or CVE sources. Cannot build index.")

print(f"\nTotal text entries prepared for embedding: {len(texts_for_embedding)}")
if texts_for_embedding:
    print(f"Example IDs: {ids_for_embedding[:3]} ... {ids_for_embedding[-3:]}")
    print(f"Example Texts: {[t[:70]+'...' for t in texts_for_embedding[:2]]}")


# --- 4. Load Embedding Model ---
print(f"\nLoading embedding model: {EMBEDDING_MODEL} (will use GPU if available)...")
model = SentenceTransformer(EMBEDDING_MODEL)
print("Embedding model loaded.")

# --- 5. Generate Embeddings ---
print("\nGenerating embeddings (using GPU if enabled - this may take time)...")
embeddings = model.encode(texts_for_embedding, show_progress_bar=True, batch_size=256) # Increased batch size for GPU
print(f"Generated {embeddings.shape[0]} embeddings with dimension {embeddings.shape[1]}.")

# --- 6. Build FAISS Index ---
dimension = embeddings.shape[1]
print(f"\nBuilding FAISS index (Dimension: {dimension})...")
index_flat = faiss.IndexFlatL2(dimension)
index = faiss.IndexIDMap(index_flat)
faiss_ids = np.arange(len(texts_for_embedding)).astype('int64')
index.add_with_ids(embeddings.astype('float32'), faiss_ids)
print(f"FAISS index built. Total entries: {index.ntotal}")

# --- 7. Save Index & ID Map ---
print(f"\nSaving FAISS index to: {FAISS_INDEX_PATH}")
try:
    faiss.write_index(index, str(FAISS_INDEX_PATH))
except Exception as e: print(f"❗️ ERROR saving FAISS index: {e}")

id_map_dict = {int(i): original_id for i, original_id in enumerate(ids_for_embedding)}
print(f"Saving ID map (FAISS index to MITRE/CVE ID) to: {ID_MAP_PATH}")
try:
    with open(ID_MAP_PATH, 'w', encoding='utf-8') as f_map:
        json.dump(id_map_dict, f_map)
except Exception as e: print(f"❗️ ERROR saving ID map JSON: {e}")

print("\n--- FAISS Database Creation Complete ---")
print(f"Outputs generated in {OUTPUT_DIR}:")
!ls -lh {str(OUTPUT_DIR / 'faiss_*')}
print("\nRECOMMENDATION: Use 'Save Version' -> 'Save & Run All (Commit)'")
print("Then, create a new Kaggle Dataset from these output files for reuse.")

# --- Cleanup ---
del texts_for_embedding, ids_for_embedding, embeddings, index, id_map_dict, model
gc.collect()

--- Stage 2: FAISS Database Build (Saving to /kaggle/working/) ---
Successfully imported faiss.

Downloading MITRE ATT&CK data from https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json...
Successfully downloaded to /kaggle/working/mitre_enterprise_attack.json

Processing MITRE ATT&CK data...


Parsing MITRE:   0%|          | 0/22651 [00:00<?, ?it/s]

Processed 881 unique MITRE techniques/tactics/mitigations.

Processing CVE data...
Reading header from /kaggle/input/vulnerability-management-datasets/cve_cisa_epss_enriched_dataset.csv to find columns...
--- DEBUG: Found columns in CVE CSV: ['cve_id', 'base_severity', 'base_score', 'exploitability_score', 'impact_score', 'epss_score', 'epss_perc', 'cisa_kev', 'attack_vector', 'attack_complexity', 'privileges_required', 'user_interaction', 'scope', 'confidentiality_impact', 'integrity_impact', 'availability_impact', 'published_date']
❗️ ERROR: Could not find required CVE columns ('cve_id', 'description') in header: ['cve_id', 'base_severity', 'base_score', 'exploitability_score', 'impact_score', 'epss_score', 'epss_perc', 'cisa_kev', 'attack_vector', 'attack_complexity', 'privileges_required', 'user_interaction', 'scope', 'confidentiality_impact', 'integrity_impact', 'availability_impact', 'published_date']

Total text entries prepared for embedding: 881
Example IDs: ['MITRE_M:T1174', 

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding model loaded.

Generating embeddings (using GPU if enabled - this may take time)...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Generated 881 embeddings with dimension 384.

Building FAISS index (Dimension: 384)...
FAISS index built. Total entries: 881

Saving FAISS index to: /kaggle/working/faiss_mitre_cve_index.bin
Saving ID map (FAISS index to MITRE/CVE ID) to: /kaggle/working/faiss_mitre_cve_id_map.json

--- FAISS Database Creation Complete ---
Outputs generated in /kaggle/working:
-rw-r--r-- 1 root root  23K Apr 30 02:05 /kaggle/working/faiss_mitre_cve_id_map.json
-rw-r--r-- 1 root root 1.3M Apr 30 02:05 /kaggle/working/faiss_mitre_cve_index.bin

RECOMMENDATION: Use 'Save Version' -> 'Save & Run All (Commit)'
Then, create a new Kaggle Dataset from these output files for reuse.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2353