## Data Preprocessing 

In [13]:
import pandas as pd

# Load the original CSV
csv_path = "/Users/abhi/pdf_qa_project/data/CWE_HW_List.csv"
df = pd.read_csv(csv_path)

# Select important columns
columns_to_keep = [
    "Description", 
    "Extended Description", 
    "Detection Methods", 
    "Potential Mitigations"
]

# Keep only selected columns and clean up
df_cleaned = df[columns_to_keep].dropna(how="all").fillna("N/A")

# Save cleaned CSV
cleaned_csv_path = "/Users/abhi/pdf_qa_project/data/CWE_HW_Cleaned.csv"
df_cleaned.to_csv(cleaned_csv_path, index=False)

cleaned_csv_path


'/Users/abhi/pdf_qa_project/data/CWE_HW_Cleaned.csv'

## Convert Cleaned CSV to Markdown Files

In [15]:
# Path to the cleaned CSV
csv_path = "/Users/abhi/pdf_qa_project/data/CWE_HW_Cleaned.csv"
df = pd.read_csv(csv_path)

# Create output directory for Markdown files
output_dir = "/Users/abhi/pdf_qa_project/data/cwe_md_chunks"
os.makedirs(output_dir, exist_ok=True)

# Loop through each row and write to .md file
for idx, row in df.iterrows():
    md_content = "\n".join([
        f"**{col}**: {row[col]}" for col in df.columns
    ])
    filename = os.path.join(output_dir, f"cwe_{idx + 1}.md")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(md_content)

print(f" {len(df)} markdown files created in: {output_dir}")

 110 markdown files created in: /Users/abhi/pdf_qa_project/data/cwe_md_chunks


In [None]:
# install package
pip install langchain-community langchain-ollama langchain langsmith chromadb pypdf tqdm python-dotenv

In [16]:
import os
import logging
from tqdm import tqdm
from dotenv import load_dotenv

from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langsmith import traceable

# Load environment variables
load_dotenv(dotenv_path=os.path.join(os.path.dirname(__file__), "..", ".env"), override=True)

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

@traceable(run_type="llm", metadata={"ls_provider": "ollama", "model": "mistral"})
def create_qa_agent(md_dir, model_name="mistral"):
    persist_directory = os.path.join(os.path.dirname(__file__), "..", "data", "chroma_md_db")

    if os.path.exists(persist_directory):
        logging.info("Loading existing Chroma store...")
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=OllamaEmbeddings(model=model_name))
    else:
        logging.info("Creating new Chroma store from Markdown files...")
        loader = DirectoryLoader(md_dir, glob="*.md", loader_cls=TextLoader)
        docs = loader.load()
        logging.info(f"Loaded {len(docs)} markdown files.")

        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)
        splits = text_splitter.split_documents(docs)
        logging.info(f"Split the documents into {len(splits)} chunks.")

        embeddings = OllamaEmbeddings(model=model_name)
        vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embeddings)

        for chunk in tqdm(splits, desc="Embedding markdown chunks"):
            vectorstore.add_documents([chunk], embedding=embeddings)

        logging.info(f"Stored {len(splits)} chunks in the vectorstore.")

    prompt_template = """
    You are a helpful AI assistant that answers questions based on the provided Markdown documentation.
    Use only the context provided to answer the question. If you don't know the answer or
    can't find it in the context, say so.

    Context: {context}

    Question: {question}

    Answer:"""

    PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

    llm = Ollama(model=model_name, streaming=True)

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=vectorstore.as_retriever(search_kwargs={"k": 10}),
        return_source_documents=True,
        chain_type_kwargs={"prompt": PROMPT}
    )

    return qa_chain

@traceable(run_type="chain")
def ask_question(qa_chain, question):
    try:
        response = qa_chain({"query": question})
        return {
            "answer": response["result"],
            "sources": [doc.page_content for doc in response["source_documents"]]
        }
    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        return {
            "error": f"An error occurred: {str(e)}",
            "answer": None,
            "sources": None
        }

def main():
    md_dir = os.path.join(os.path.dirname(__file__), "..", "data", "cwe_md_chunks")

    if not os.path.exists(md_dir):
        logging.error(f"The directory {md_dir} does not exist.")
        return

    qa_agent = create_qa_agent(md_dir)

    while True:
        question = input("\nEnter your question (or type 'exit' to quit): ")
        if question.lower() == 'exit':
            break
        result = ask_question(qa_agent, question)
        if result.get("error"):
            logging.error(result['error'])
        else:
            print(f"\nAnswer: {result['answer']}")
            print("Sources used:")
            for i, source in enumerate(result['sources'], 1):
                print(f"Source {i}: {source[:200]}...\n")

if __name__ == "__main__":
    main()


NameError: name '__file__' is not defined