# CWE RAG QA Demo
This notebook demonstrates a custom RAG pipeline using LangChain and Ollama over a cleaned CWE dataset.

In [3]:

import pandas as pd
import os

# Load the original CSV file
csv_path = "../data/CWE_HW_List.csv"
df = pd.read_csv(csv_path)

# Keep only relevant columns
columns_to_keep = ["Description", "Extended Description", "Detection Methods", "Potential Mitigations"]
df_cleaned = df[columns_to_keep].dropna(how="all").fillna("N/A").iloc[:50]


# Save cleaned version (optional)
cleaned_path = "../data/CWE_HW_Cleaned.csv"
df_cleaned.to_csv(cleaned_path, index=False)

df_cleaned.head()


Unnamed: 0,Description,Extended Description,Detection Methods,Potential Mitigations
203,"Discrepancies can take many forms, and variati...",::NATURE:ChildOf:CWE ID:200:VIEW ID:1000:ORDIN...,::PHASE:Architecture and Design:STRATEGY:Separ...,::REFERENCE:CVE-2020-8695:DESCRIPTION:Observab...
226,"When resources are released, they can be made ...",::NATURE:ChildOf:CWE ID:459:VIEW ID:1000:ORDIN...,::PHASE:Architecture and Design Implementation...,::REFERENCE:CVE-2019-3733:DESCRIPTION:Cryptogr...
276,,::NATURE:ChildOf:CWE ID:732:VIEW ID:1000:ORDIN...,::PHASE:Architecture and Design Operation:DESC...,::REFERENCE:CVE-2005-1941:DESCRIPTION:Executab...
319,,::NATURE:ChildOf:CWE ID:311:VIEW ID:1000:ORDIN...,::PHASE:Architecture and Design:DESCRIPTION:Be...,::REFERENCE:CVE-2022-29519:DESCRIPTION:Program...
325,,::NATURE:ChildOf:CWE ID:573:VIEW ID:1000:ORDIN...,,::REFERENCE:CVE-2001-1585:DESCRIPTION:Missing ...


In [4]:

output_dir = "../data/cwe_md_chunks"
os.makedirs(output_dir, exist_ok=True)

for idx, row in df_cleaned.iterrows():
    md_content = "\n".join([f"**{col}**: {row[col]}" for col in df_cleaned.columns])
    with open(os.path.join(output_dir, f"cwe_{idx + 1}.md"), "w", encoding="utf-8") as f:
        f.write(md_content)

print(f" {len(df_cleaned)} markdown files created in: {output_dir}")


 50 markdown files created in: ../data/cwe_md_chunks


In [None]:
import os
os.environ["CHROMA_TELEMETRY_ENABLED"] = "false"

import logging
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma

# Set logging
logging.basicConfig(level=logging.INFO)

# Load .env (optional)
from dotenv import load_dotenv
load_dotenv("../.env")

# Load markdown files
loader = DirectoryLoader(output_dir, glob="*.md", loader_cls=TextLoader)
docs = loader.load()

# Split text
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = splitter.split_documents(docs)

# Embed and save in Chroma
persist_path = "../data/chroma_md_db"
embedding = OllamaEmbeddings(model="mistral")

vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory=persist_path
)
vectorstore.persist()
print(f" {len(splits)} chunks embedded and stored.")


In [None]:

from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Prompt template
prompt_template = '''
You are a helpful AI assistant that answers questions based on the provided CWE documentation.
Use only the context provided to answer the question. If you don't know the answer, say so.

Context: {context}

Question: {question}

Answer:
'''

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

# Initialize LLM and chain
llm = Ollama(model="mistral", streaming=True)

retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=True,
    chain_type_kwargs={"prompt": PROMPT}
)


In [None]:

question = "What are the common detection methods for insecure permissions?"
response = qa_chain({"query": question})

print("Answer:", response["result"])
print("\nSources:")
for i, doc in enumerate(response["source_documents"], 1):
    print(f"Source {i}:", doc.page_content[:300], "\n")
