## RAG Pipeline Notebook

This notebook leverages the following
* Doc parsing using Langchain parser
* Embedding using the service from VCF 
* LLM using the service from VCF
* Primitive chatbot interface using gradio

In [61]:
# Install the extra packages this notebook needs (%pip targets the running kernel's environment).
# NOTE(review): versions are unpinned; the recorded output shows pytesseract 0.3.13 being installed.
# %pip install --upgrade --quiet  vllm -q
%pip install atlassian-python-api pytesseract Pillow

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Note: you may need to restart the kernel to use updated packages.


In [None]:
# imports

import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [59]:
# imports for langchain, plotly and Chroma

from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.llms import VLLM
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import SpacyTextSplitter
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.document_loaders import PlaywrightURLLoader
import asyncio
from playwright.async_api import async_playwright
from langchain_community.document_loaders import ConfluenceLoader

In [None]:
# Notebook-wide configuration.
# Price is a factor for our company, so we're going to use a low cost model.

# MODEL = "gpt-4o-mini"
# Model name handed to ChatOpenAI when the LLM client is created further down.
MODEL = "meta-llama/Meta-Llama-3-70B-Instruct"
# Directory passed to Chroma as persist_directory for the vector store.
db_name = "vector_db"

In [56]:
# Load environment variables from a file called .env, then publish the
# credentials under the environment variable names used later in the notebook.

load_dotenv()

PLACEHOLDER_KEY = 'your-key-if-not-using-env'

# (target variable, source variable). The VMware LLM key is re-exported as
# OPENAI_API_KEY; the Confluence key keeps its own name.
for target_var, source_var in (
    ('OPENAI_API_KEY', 'VMWARE_LLM_API_KEY'),
    ('CONFLUENCE_API_KEY', 'CONFLUENCE_API_KEY'),
):
    value = os.getenv(source_var)
    if value is None:
        # Same placeholder fallback as before, but no longer silent: a missing
        # key would otherwise only surface later as an opaque auth failure.
        print(f"WARNING: {source_var} is not set; {target_var} falls back to a placeholder value")
        value = PLACEHOLDER_KEY
    os.environ[target_var] = value

## Load the PDF docs from a directory

In [None]:
# Load docs from a directory. Expect everything to be PDFs
def load_dir(dirpath):
    """Load every entry directly under ``dirpath`` with ``PyPDFLoader``.

    Entries are visited in sorted path order so that positional lookups on the
    result (and on anything derived from it) are stable from run to run.

    Args:
        dirpath: Directory holding the documents; every entry is expected to
            be a PDF.

    Returns:
        A flat list of whatever each loader's ``load()`` returned, in file order.
        Empty when nothing matches.
    """
    data = []
    # glob.glob returns matches in arbitrary order; sort for reproducibility.
    for fname in sorted(glob.glob(os.path.join(dirpath, "*"))):
        loader = PyPDFLoader(file_path=fname)
        data.extend(loader.load())
    return data

# Flatten loaded documents into a single string: page contents only,
# metadata is dropped
def format_docs(docs):
    """Join each document's ``page_content`` with a ``===`` line in between."""
    separator = "\n===\n"
    contents = [doc.page_content for doc in docs]
    return separator.join(contents)

# Load data
# Everything under ./security_docs is read through load_dir; the printed number
# is the length of the resulting list.
data = load_dir("security_docs")
print(len(data))

In [None]:
# Peek at one loaded item to sanity-check the parse. Guarded so that a small
# corpus does not abort the notebook with an IndexError.
SAMPLE_INDEX = 12
if len(data) > SAMPLE_INDEX:
    print(data[SAMPLE_INDEX])
else:
    print(f"Only {len(data)} item(s) loaded; nothing at index {SAMPLE_INDEX}")

In [None]:
# Split data using RecursiveCharacterTextSplitter
# NOTE(review): the next cell rebinds text_splitter, so when the notebook is run
# top to bottom this splitter is replaced before it is ever used.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)

In [None]:
# Split data using SpacyTextSplitter
# NOTE(review): this rebinds text_splitter, replacing the
# RecursiveCharacterTextSplitter created in the previous cell. Run only the
# cell for the splitter you actually want.
text_splitter = SpacyTextSplitter()

In [None]:
# Split the loaded documents with whichever text_splitter is currently bound
chunks = text_splitter.split_documents(data)

print(f"Total number of chunks: {len(chunks)}")
# print(f"Document types found: {set(doc.metadata['doc_type'] for doc in data)}")

In [None]:
# Show three consecutive chunks (99, 100, 101) to eyeball the split boundaries.
# A slice is used instead of direct indexing so a corpus with fewer chunks
# prints whatever exists in that window rather than raising IndexError.
print("\n==\n".join(str(chunk) for chunk in chunks[99:102]))

In [None]:
# List the source recorded in each chunk's metadata, one per line
print(*(chunk.metadata['source'] for chunk in chunks), sep="\n")

## Load the confluence page

In [83]:
"""https://vmw-confluence.broadcom.net/pages/viewpage.action?pageId=1481319163"""

# Load a single Confluence page, selected by page id, with attachments included.
# The token comes from the CONFLUENCE_API_KEY environment variable.
# NOTE(review): `docs` is rebound by the web-page loader cell further down, and
# `chunks` is built from `data` only, so as written this content does not reach
# the vector store.
loader = ConfluenceLoader(
    url="https://vmw-confluence.broadcom.net", token=os.environ['CONFLUENCE_API_KEY'],
    # space_key="VELOENG", 
    page_ids=["1481319163"],
    include_attachments=True, 
    limit=50,
    keep_markdown_format=True
)
docs = loader.load()

In [None]:
# print(docs[0])
# Show only the text of the first loaded document
print(docs[0].page_content)

## Load the web page recursively

Try to parse the VMware External KB

In [94]:
import re
from bs4 import BeautifulSoup

def bs4_extractor(html: str) -> str:
    """Return the text of ``html`` with runs of blank lines collapsed.

    The markup is parsed with BeautifulSoup's ``lxml`` parser. Any run of two
    or more newlines in the extracted text becomes exactly one blank line, and
    leading/trailing whitespace is stripped.
    """
    page_text = BeautifulSoup(html, "lxml").text
    collapsed = re.sub(r"\n\n+", "\n\n", page_text)
    return collapsed.strip()
    
# Broadcom support search URL; its query string filters on knowledge articles
# for the VeloCloud SD-WAN product names
kb_search_url = "https://support.broadcom.com/web/ecx/search?searchString=velocloud&activeType=all&from=0&sortby=_score&orderBy=desc&pageNo=1&aggregations=%5B%7B%22type%22%3A%22_type%22%2C%22filter%22%3A%5B%22knowledge_articles_doc%22%5D%7D%2C%7B%22type%22%3A%22productname%22%2C%22filter%22%3A%5B%22VMware+VeloCloud+SD-WAN%22%2C%22VMware+SD-WAN+by+VeloCloud%22%5D%7D%5D&uid=d042dbba-f8c4-11ea-beba-0242ac12000b&resultsPerPage=10&exactPhrase=SD-WAN&withOneOrMore=&withoutTheWords=&pageSize=10&language=en&state=15&suCaseCreate=false"

# Only the extractor is overridden; every other RecursiveUrlLoader option
# (max_depth, use_async, timeout, prevent_outside, ...) keeps its default.
loader = RecursiveUrlLoader(kb_search_url, extractor=bs4_extractor)
docs = loader.load()

In [None]:
# Number of documents the loader returned, followed by the first one
print(len(docs))
print(docs[0])

## A sidenote on Embeddings, and "Auto-Encoding LLMs"

We will be mapping each chunk of text into a Vector that represents the meaning of the text, known as an embedding.

OpenAI offers a model to do this, which we will use by calling their API with some LangChain code.

This model is an example of an "Auto-Encoding LLM" which generates an output given a complete input.
It's different to all the other LLMs we've discussed today, which are known as "Auto-Regressive LLMs", and generate future tokens based only on past context.

Another example of an Auto-Encoding LLM is BERT from Google. In addition to embedding, Auto-Encoding LLMs are often used for classification.

### Sidenote

In week 8 we will return to RAG and vector embeddings, and we will use an open-source vector encoder so that the data never leaves our computer - that's an important consideration when building enterprise systems and the data needs to remain internal.

In [None]:
# Put the chunks of data into a Vector Store that associates a Vector Embedding with each chunk
# Chroma is a popular open source Vector Database based on SQLite

# The OpenAIEmbeddings client is pointed at the base_url below and asked for
# the BAAI/bge-en-icl model.
embeddings = OpenAIEmbeddings(
    model="BAAI/bge-en-icl", 
    base_url="https://llm.ai.broadcom.net/api/v1", 
    tiktoken_enabled=False,
    model_kwargs={"encoding_format": "float"}
)

# Delete if already exists
# (only attempted when the db_name path is present on disk)

if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vectorstore
# Built from `chunks` and persisted under db_name.

vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

In [None]:
# Let's investigate the vectors

# NOTE: _collection is an underscore-prefixed attribute of the vector store
# wrapper, so this reaches past its public interface.
collection = vectorstore._collection
count = collection.count()

# Fetch one stored embedding and use its length as the vector dimensionality
sample_embedding = collection.get(limit=1, include=["embeddings"])["embeddings"][0]
dimensions = len(sample_embedding)
print(f"There are {count:,} vectors with {dimensions:,} dimensions in the vector store")

## Visualizing the Vector Store

Let's take a minute to look at the documents and their embedding vectors to see what's going on.

In [None]:
# Prework (with thanks to Jon R for identifying and fixing a bug in this!)

# Pull every stored vector together with its text and metadata
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
documents = result['documents']
metadatas = result['metadatas']

# Points are grouped (and coloured) by the source recorded in each chunk's metadata
doc_types = [metadata['source'] for metadata in metadatas]

# Categorical palette; it wraps around if there are more sources than colours
color_code = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#17becf']

# Rank the distinct sources once, in sorted order, so each source gets the same
# colour on every run. Set iteration order is not stable across kernels, and
# rebuilding list(set(...)) per element was needlessly quadratic.
source_rank = {source: rank for rank, source in enumerate(sorted(set(doc_types)))}
colors = [color_code[source_rank[t] % len(color_code)] for t in doc_types]

In [None]:
# We humans find it easier to visualize things in 2D!
# Reduce the dimensionality of the vectors to 2D using t-SNE
# (t-distributed stochastic neighbor embedding)

tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot; hovering a point shows its source and the first
# 100 characters of its text
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    # `scene` only configures 3D axes; a 2D scatter takes its axis titles
    # directly on the layout.
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [None]:
# Let's try 3D!

tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 3D scatter plot; hovering a point shows its source and the first
# 100 characters of its text
fig = go.Figure(data=[go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

fig.update_layout(
    # The vectors come from the Chroma store built above, not FAISS
    title='3D Chroma Vector Store Visualization',
    scene=dict(xaxis_title='x', yaxis_title='y', zaxis_title='z'),
    width=900,
    height=700,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

## Use LangChain to bring it all together

In [None]:
# Create the chat model client: ChatOpenAI pointed at the base_url below and
# asked for MODEL.
# llm = ChatOpenAI(temperature=0.7, model_name=MODEL)
llm = ChatOpenAI(
    temperature=0.7, 
    model=MODEL, 
    base_url="https://llm.ai.broadcom.net/api/v1"
)

# set up the conversation memory for the chat
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# the retriever is an abstraction over the VectorStore that will be used during RAG
# (k=25 is passed through as a search keyword argument)
retriever = vectorstore.as_retriever(search_kwargs={"k": 25})

### Option 1 - Use the ConversationalRetrievalChain

In [None]:
# Putting it together: set up the conversation chain with the configured LLM
# (see MODEL), the retriever and the conversation memory.
# StdOutCallbackHandler is not imported by any earlier cell, so bring it into
# scope here; without this import the cell raises NameError.
from langchain_core.callbacks import StdOutCallbackHandler

conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory, callbacks=[StdOutCallbackHandler()])

In [None]:
def chat(question, history):
    """Chat callback: answer ``question`` with the conversational RAG chain.

    ``history`` is accepted but not used; the chain is invoked with the
    question only and its ``"answer"`` entry is returned.
    """
    payload = {"question": question}
    return conversation_chain.invoke(payload)["answer"]

In [None]:
# Serve the chat function through Gradio's ChatInterface.
# NOTE(review): `view` receives whatever launch() returns, not the
# ChatInterface object itself - confirm that is intended if the interface
# needs to be closed later.
view = gr.ChatInterface(chat).launch()

### Option 2 - Use the langchain LCEL (|) style

In [None]:
# Prompt template for the LCEL chain built in the next cell: {context} receives
# the formatted retrieved text and {question} the question being asked.
RAG_TEMPLATE = """
You are a professional technical writer. Use the following pieces of retrieved context to answer the question at the end. Be as verbose and comprehensive as possible.

<context>
{context}
</context>

Answer the following question:

{question}"""

# The question that the next cell sends through the chain.
QUESTION = "Write a industry grade technical white paper on Security in VeloCloud which will cover the following topics : 1. Security components on the edge, in the cloud, on SODA 2. What threats do we protect against and how 3. What special constructs do we have to reduce the attack surface 4. How does our security telemetry work 5. What are examples of Velocloud deployments that have been shown to be secure?"

In [None]:
# StdOutCallbackHandler is used below for the trace handler but was never
# imported, which made this cell fail with NameError.
from langchain_core.callbacks import StdOutCallbackHandler
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain.globals import set_verbose

# Turn on LangChain's global verbose flag for the runs below
set_verbose(True)

def format_docs(docs):
    """Concatenate each document's ``page_content``, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Build the prompt from the RAG template
rag_prompt = ChatPromptTemplate.from_template(template=RAG_TEMPLATE)

# Callback handler passed in the run config so the trace is visible
handler = StdOutCallbackHandler()
config = dict(callbacks=[handler])

# Assemble the chain step by step: retrieve and format the context while the
# question passes through unchanged, fill the prompt, call the LLM, then parse
# the reply with StrOutputParser.
chain_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
qa_chain = chain_inputs | rag_prompt | llm | StrOutputParser()

# Run the chain on QUESTION and show the answer
response = qa_chain.invoke(QUESTION, config=config)
print(response)

### Option 3 - Use LCEL and with debug info

In [None]:
import gradio as gr
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain.vectorstores.chroma import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader
from langchain.callbacks import get_openai_callback
import logging
import os

# Set up logging
# Level is INFO; records are formatted as "<time> - <level name> - <message>".
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
# Logger named after the current module
logger = logging.getLogger(__name__)

# Create prompt template
# {context} and {question} are the two placeholders filled in before the LLM call.
template = """You are a professional technical writer. Use the following pieces of retrieved context to answer the question at the end. Be as verbose and comprehensive as possible:

Context: {context}

Question: {question}

Answer the question based only on the context provided. If you cannot find the answer in the context, say "I cannot answer this question based on the provided context."

Answer:"""

# Chat prompt built from the template above; used by rag_chain and debug_chain below
prompt = ChatPromptTemplate.from_template(template)

# Define the RAG chain using LCEL.
# The retriever yields Document objects. Their page_content is joined before
# the prompt is filled so that {context} holds plain text - the same
# "\n\n"-joined text that debug_chain logs - rather than the repr of a list of
# Documents.
rag_chain = (
    RunnableParallel(
        {
            "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
            "question": RunnablePassthrough(),
        }
    )
    | prompt
    | llm
    | StrOutputParser()
)

class QueryDebugger:
    """Holds the debug details captured for the most recent query.

    Fields: ``retrieved_docs`` (list), ``final_prompt``, ``token_usage`` and
    ``cost`` (each ``None`` until set).
    """

    def __init__(self):
        # A new instance starts in the same blank state that clear() produces
        self.clear()

    def clear(self):
        """Reset every captured field to its empty value."""
        self.retrieved_docs = []
        self.final_prompt = self.token_usage = self.cost = None

# Module-level instance shared by the functions below
debugger = QueryDebugger()

# Modified RAG chain with debugging
def debug_chain(question):
    """Answer ``question`` while recording retrieval, prompt and token details.

    Documents are retrieved once. The same retrieved text is used for the
    logged prompt and for the LLM call, so the debug output describes exactly
    what was sent.

    Args:
        question: The user's question.

    Returns:
        A ``(response, debugger)`` tuple: the parsed LLM answer and the
        module-level ``QueryDebugger`` filled in for this call.
    """
    # Clear previous debug info
    debugger.clear()

    # Get retrieved documents
    retriever_output = retriever.invoke(question)
    debugger.retrieved_docs = retriever_output

    # Log retrieved documents (first 200 characters of each)
    logger.info("Retrieved Documents:")
    for i, doc in enumerate(retriever_output, 1):
        logger.info(f"\nDocument {i}:")
        logger.info(f"Content: {doc.page_content[:200]}...")
        logger.info(f"Source: {doc.metadata.get('source', 'N/A')}")

    # Construct and log the prompt
    context = "\n\n".join(doc.page_content for doc in retriever_output)
    prompt_args = {"context": context, "question": question}
    final_prompt = prompt.format(**prompt_args)
    debugger.final_prompt = final_prompt
    logger.info("\nFinal Prompt:")
    logger.info(final_prompt)

    # Generate from the documents retrieved above. Invoking rag_chain here
    # would run the retriever a second time and build its own context, so the
    # logged prompt would not be the one the model saw.
    generation_chain = prompt | llm | StrOutputParser()

    # Execute LLM call with token tracking
    with get_openai_callback() as cb:
        response = generation_chain.invoke(prompt_args)
        debugger.token_usage = {
            "prompt_tokens": cb.prompt_tokens,
            "completion_tokens": cb.completion_tokens,
            "total_tokens": cb.total_tokens
        }
        debugger.cost = cb.total_cost

    # Log token usage and cost
    logger.info("\nToken Usage:")
    logger.info(f"Prompt tokens: {debugger.token_usage['prompt_tokens']}")
    logger.info(f"Completion tokens: {debugger.token_usage['completion_tokens']}")
    logger.info(f"Total tokens: {debugger.token_usage['total_tokens']}")
    logger.info(f"Total cost: ${debugger.cost:.4f}")

    return response, debugger

# Gradio interface function
def process_query(question):
    """Process the user query through the RAG pipeline

    Args:
        question: The question typed into the interface.

    Returns:
        A plain-text report containing the retrieved documents (first 200
        characters each), the final prompt, token usage, estimated cost and
        the answer. If anything raises, the error is logged and an
        "An error occurred: ..." message is returned instead.
    """
    try:
        response, debug_info = debug_chain(question)

        # These pieces are built outside the f-string: a backslash inside an
        # f-string replacement field is a SyntaxError before Python 3.12.
        rule = '-' * 50
        docs_section = "".join(
            f"Document {i}:\n{doc.page_content[:200]}...\n\n"
            for i, doc in enumerate(debug_info.retrieved_docs, 1)
        )

        # Format debug information for display
        debug_output = f"""
=== Debug Information ===

Retrieved Documents:
{rule}
{docs_section}

Final Prompt:
{rule}
{debug_info.final_prompt}

Token Usage:
{rule}
Prompt tokens: {debug_info.token_usage['prompt_tokens']}
Completion tokens: {debug_info.token_usage['completion_tokens']}
Total tokens: {debug_info.token_usage['total_tokens']}
Estimated cost: ${debug_info.cost:.4f}

Answer:
{rule}
{response}
"""
        return debug_output

    except Exception as e:
        # Surface the failure in the UI instead of crashing the Gradio callback
        logger.error(f"Error processing query: {str(e)}")
        return f"An error occurred: {str(e)}"

# Create Gradio interface
# One text box in (the question), one text box out (the report string returned
# by process_query).
iface = gr.Interface(
    fn=process_query,
    inputs=gr.Textbox(
        lines=2,
        placeholder="Enter your question here...",
        label="Question"
    ),
    outputs=gr.Textbox(
        lines=20,
        label="Answer with Debug Information"
    ),
    title="RAG Question-Answering System with Debug Information",
    description="Ask questions about your documents using RAG (Retrieval-Augmented Generation)",
)

# Launch the interface
iface.launch()