## **RAG with Memory**

Here I'll build a pipeline that extracts the content of a given URL and lets us "chat" with that data. I'll also add conversational memory, so follow-up questions can refer back to earlier turns.

In [20]:
import bs4
from langchain import hub
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import WebBaseLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.vectorstores import Chroma
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.schema.messages import AIMessage, HumanMessage


In [4]:
# Page that is loaded, chunked and indexed for retrieval.
URL = "https://www.databricks.com/glossary/llmops"
# NOTE(review): not referenced anywhere else in the visible code.
INITIAL_QUERY = "What is LLMOPS?"

# Load the web page at URL into document objects.
loader = WebBaseLoader(web_paths=(URL,),)
docs = loader.load()

# Split the loaded documents into overlapping chunks
# (sizes are presumably measured in characters; confirm against the splitter docs).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Embed the chunks into a Chroma vector store and expose it as a retriever.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

# Chat model shared by the question-rewriting chain and the answering chain.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

In [5]:
# System instruction for the question-rewriting step: turn a follow-up
# question into a standalone one without answering it.
# The value begins and ends with a newline and is otherwise a single line.
condense_q_system_prompt = (
    "\n"
    "Given a chat history and the latest user question "
    "which might reference the chat history, formulate a standalone question "
    "which can be understood without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
    "\n"
)

# Prompt for the question-rewriting step: the system instruction, then the
# prior messages injected under "chat_history", then the latest user question.
condense_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", condense_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

# Rewriting chain: prompt -> chat model -> output parsed to a plain string.
condense_q_chain = condense_q_prompt | llm | StrOutputParser()

# System instruction for the answering step. The trailing backslashes join the
# instruction sentences into one line. The last sentence deliberately has NO
# trailing backslash: a backslash there would swallow the newline and glue
# "{context}" directly under the instructions instead of leaving a blank line
# between them.
qa_system_prompt = """
You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.

{context}
"""

# Prompt for the answering step: the system instruction (which carries the
# "{context}" slot), then the prior messages injected under "chat_history",
# then the user's question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def condense_question(input: dict):
    """Choose how the retrieval query is produced for the current turn.

    Args:
        input: Mapping with a "question" entry and, optionally, a
            "chat_history" entry.

    Returns:
        The raw ``input["question"]`` when there is no (or an empty) chat
        history; otherwise the ``condense_q_chain`` runnable itself.
        Presumably LangChain then runs that returned chain on the same
        input to obtain the standalone question -- confirm against the
        RunnableLambda documentation.
    """
    # First turn (nothing to resolve against): use the question verbatim.
    if not input.get("chat_history"):
        return input["question"]
    return condense_q_chain


def format_docs(docs):
    """Join the ``page_content`` of every document, separated by a blank line.

    Returns an empty string when ``docs`` yields nothing.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)


# Full RAG pipeline. The first step adds a "context" entry to the incoming
# dict: pick the retrieval query via condense_question, fetch matching chunks
# from the retriever, and flatten them with format_docs. The enriched dict then
# fills qa_prompt and goes to the chat model. There is no output parser at the
# end, so the result is the model's message object rather than a plain string.
rag_chain = (
    RunnablePassthrough.assign(context=condense_question | retriever | format_docs)
    | qa_prompt
    | llm
)

In [14]:
class Conversation:
    """Stateful wrapper that feeds a chain its own running chat history."""

    def __init__(self, chain) -> None:
        """Store ``chain`` (anything exposing ``invoke``) and start with no history."""
        self.chat_history = []
        self.chain = chain

    def add(self, question):
        """Ask ``question``, record the exchange, and return the chain's reply.

        The chain is invoked with the question and the history accumulated so
        far. Only after it returns are the user's message and the reply
        appended, so a failing call leaves the history untouched.
        """
        payload = {"question": question, "chat_history": self.chat_history}
        reply = self.chain.invoke(payload)
        self.chat_history.append(HumanMessage(content=question))
        self.chat_history.append(reply)
        return reply

In [15]:
# Start a conversation (with empty history) backed by the RAG chain.
c = Conversation(rag_chain)

In [16]:
# First turn: history is empty, so the question is used as-is for retrieval.
c.add("what is LLMOps?")

AIMessage(content='LLMOps refers to the operationalization of Large Language Models (LLMs). It encompasses the entire lifecycle of developing, deploying, and maintaining LLMs, including data ingestion, data preparation, prompt engineering, model fine-tuning, model deployment, model monitoring, and more. LLMOps aims to ensure efficiency, scalability, and risk reduction in the development and deployment of LLMs.')

In [18]:
# Follow-up whose "it" only makes sense given the previous turn.
c.add("Okay, and what does it stand for?")

AIMessage(content='LLMOps stands for Large Language Model Operations.')

In [21]:
# Question about the conversation itself, answerable only from the chat history.
c.add("What was my first question?")

AIMessage(content='Your first question was "What is LLMOps?"')