In [3]:
import hashlib
import os

from pymisp import PyMISP
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

In [None]:
# Constants
# Connection settings are taken from the environment when the variables are
# set, so real credentials never have to be saved inside the notebook. The
# placeholder strings are only fallbacks and can still be edited directly.
MISP_URL = os.environ.get("MISP_URL", "https://your-misp-instance.com")  # MISP instance URL
MISP_KEY = os.environ.get("MISP_KEY", "your-API-key-here")  # MISP API key
# Number of documents handed to the vector store per add_documents() call.
# NOTE(review): 5461 presumably matches Chroma's per-call batch limit — confirm.
BATCH_SIZE = 5461
# Model name passed to HuggingFaceEmbeddings.
EMBEDDING_MODEL_PATH = "all-MiniLM-L6-v2"

def get_payloads_from_misp(event_id):
    """Return the unique, non-empty attribute values of a MISP event.

    Args:
        event_id: Identifier of the MISP event to fetch.

    Returns:
        A list of whitespace-stripped attribute values with duplicates
        removed, in the order they first appear in the event. The list is
        empty when the response has no ``Event``/``Attribute`` entries.
    """
    # NOTE(review): the third positional argument is PyMISP's `ssl` flag, so
    # False turns TLS certificate verification off — confirm this is intended
    # for the target instance.
    misp = PyMISP(MISP_URL, MISP_KEY, False)
    event = misp.get_event(event_id)

    attributes = event.get("Event", {}).get("Attribute", [])

    # A dict is used as an ordered set: a plain set of strings iterates in a
    # different order on every interpreter start (hash randomisation), which
    # made the payload order — and everything derived from it — unreproducible.
    unique_payloads = {}
    for attr in attributes:
        value = attr.get("value")
        if not isinstance(value, str):
            # Attribute without a usable textual value; nothing to index.
            continue
        value = value.strip()
        if value:
            unique_payloads[value] = None
    return list(unique_payloads)

def build_vector_db(event_id, vector_db_path, collection_name):
    """Embed the payloads of a MISP event into a persistent Chroma store.

    The payloads are fetched with ``get_payloads_from_misp``, wrapped in
    ``Document`` objects tagged with ``{"type": collection_name}``, inserted
    in chunks of ``BATCH_SIZE`` and persisted. Afterwards a small retrieval
    smoke test prints five stored payloads and the first ten components of
    their embedding vectors.

    Args:
        event_id: Identifier of the MISP event supplying the payloads.
        vector_db_path: Directory used as the Chroma ``persist_directory``.
        collection_name: Label stored in each document's ``type`` metadata
            and shown (upper-cased) in the summary message. It is not used
            as the Chroma collection name.

    Returns:
        None. Returns early, after printing a notice, when the event yields
        no payloads.
    """
    # Load payloads
    payloads = get_payloads_from_misp(event_id)
    if not payloads:
        print(f"No payloads found in MISP Event {event_id}")
        return

    # Init embedding and vector store
    embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)
    vector_db = Chroma(persist_directory=vector_db_path, embedding_function=embedding)

    # Convert to Documents and insert
    documents = [Document(page_content=p, metadata={"type": collection_name}) for p in payloads]

    # Deterministic, content-derived ids: without explicit ids every run
    # stores the same payloads again under fresh random ids, so re-executing
    # the cell against an existing directory duplicated the whole collection.
    # The type label is part of the hash so identical payloads filed under
    # different labels do not overwrite each other.
    ids = [
        hashlib.sha256(
            f"{collection_name}\x00{p}".encode("utf-8", errors="backslashreplace")
        ).hexdigest()
        for p in payloads
    ]

    for start in range(0, len(documents), BATCH_SIZE):
        stop = start + BATCH_SIZE
        vector_db.add_documents(documents[start:stop], ids=ids[start:stop])
    # NOTE(review): the recorded cell output echoes this line the way a
    # deprecation warning would — newer Chroma releases may persist
    # automatically; confirm before removing the call.
    vector_db.persist()

    print(f"Saved {len(documents)} {collection_name.upper()} payloads to vector DB at: {vector_db_path}")

    # Test retrieval
    docs = vector_db.similarity_search("", k=5)
    print("\n=== Payloads Retrieved ===")
    for i, doc in enumerate(docs, 1):
        print(f"{i}. {doc.page_content}")

    # The vectors are recomputed from the retrieved texts rather than read
    # back from the store.
    print("\n=== Corresponding Vectors ===")
    vectors = embedding.embed_documents([doc.page_content for doc in docs])
    for i, vec in enumerate(vectors, 1):
        print(f"{i}. {vec[:10]}")

In [None]:
# XSS: index the payloads of MISP event 3699
build_vector_db(3699, "RAG-Agent/vectorDB/vectorize_xss_26k_MISP", "xss")

  embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_PATH)
  vector_db = Chroma(persist_directory=vector_db_path, embedding_function=embedding)


Saved 26000 XSS payloads to vector DB at: /content/drive/MyDrive/NCKH/FINAL/RAG-Agent/vectorDB/vectorize_xss_26k_MISP

=== Payloads Retrieved ===
1. <bdo%0dcontenteditable%0aondblclick%09=%0aalert(XSS)%0dx>xss
2. <font%0dcontenteditable%0aondblclick%0d=%0dalert(XSS)%0dx//xss
3. <i%0acontenteditable%0donmousemove+=%09alert(XSS)%0dx>xss
4. <br%0dcontenteditable%0donmouseover+=%0dalert(XSS)%0dx>xss
5. <summary%0acontenteditable%0donmousemove%09=%0dalert(XSS)%0dx>xss

=== Corresponding Vectors ===
1. [-0.029809869825839996, -0.03533768653869629, -0.033103227615356445, 0.02542710490524769, 0.06602891534566879, 0.016763193532824516, 0.11629865318536758, -0.006964342202991247, -0.021239472553133965, -0.009740371257066727]
2. [-0.04136395826935768, 0.018394744023680687, -0.0914875715970993, 0.04152258485555649, 0.04893212020397186, -0.025130780413746834, 0.051521606743335724, -0.02738058753311634, -0.04406151920557022, -0.02599467895925045]
3. [-0.0020100607071071863, 0.02318304218351841, -0.0

  vector_db.persist()


In [None]:
# SQLI: index the payloads of MISP event 3706
build_vector_db(3706, "RAG-Agent/vectorDB/vectorize_sqli_1k5_MISP", "sqli")



Saved 1500 SQLI payloads to vector DB at: /content/drive/MyDrive/NCKH/FINAL/RAG-Agent/vectorDB/vectorize_sqli_1k5_MISP

=== Payloads Retrieved ===
1. ' OR 1 -- -
2. 1'--/*--*/-
3. /*!%55NiOn*/ /*!%53eLEct*/
4. hi' or 'x'='x';
5. username: admin'/*

=== Corresponding Vectors ===
1. [-0.05133058875799179, -0.022964373230934143, 0.04451633617281914, 0.0011554447701200843, -0.03180110082030296, 0.01823512651026249, 0.10281830281019211, -0.030896879732608795, 0.010041853412985802, -0.019503042101860046]
2. [-0.07813622802495956, 0.027137162163853645, 0.026169247925281525, -0.03784995898604393, -0.0881015881896019, -0.037472065538167953, 0.12266656011343002, 0.0005107754259370267, 0.015873447060585022, -0.02869364805519581]
3. [-0.02551090344786644, 0.06993597000837326, 0.04216225817799568, 0.019270699471235275, -0.06647668778896332, 0.027577340602874756, 0.11173047870397568, -0.046064190566539764, 0.01705021969974041, -0.010362362489104271]
4. [-0.053035859018564224, 0.06330820173025131, 0.