# RAG chat over web source - multiple pages
- adapted from https://python.langchain.com/docs/use_cases/question_answering/ and https://blog.langchain.dev/building-chat-langchain-2/


In [1]:
import os
import openai
import bs4
from langchain import hub
from langchain.chat_models import AzureChatOpenAI
from langchain.embeddings import AzureOpenAIEmbeddings
from langchain.schema import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.prompts import (ChatPromptTemplate, MessagesPlaceholder,
                               PromptTemplate)
from langchain.schema.runnable import (Runnable, RunnableBranch,
                                       RunnableLambda, RunnableMap)
from langchain.schema.runnable  import RunnablePassthrough
from operator import itemgetter
from ssl_workaround import no_ssl_verification
from dotenv import load_dotenv, find_dotenv

# 1. Init

In [2]:
def init():
    """Read the local .env file and copy the OpenAI settings onto the `openai` module.

    The four connection settings are taken from the environment and assigned to
    the matching `openai` module attributes; the LLM and embedding deployment
    names are then printed as a confirmation.

    Raises:
        KeyError: if any of the environment variables read below is not set.
    """
    load_dotenv(find_dotenv())  # read local .env file

    # (openai attribute, environment variable that supplies its value)
    settings = (
        ("api_key", "OPENAI_API_KEY"),
        ("api_base", "OPENAI_API_BASE"),
        ("api_type", "OPENAI_API_TYPE"),
        ("api_version", "OPENAI_API_VERSION"),
    )
    for attribute, variable in settings:
        setattr(openai, attribute, os.environ[variable])

    print(f'Openai secrets loaded, models: {os.environ["OPENAI_DEPLOYMENT_ID_LLM"]}, {os.environ["OPENAI_DEPLOYMENT_ID_EMBED"]}')
init()

Openai secrets loaded, models: gpt-35-turbo, text-embedding-ada-002


## 2. Load content from Web

In [3]:
from bs4 import BeautifulSoup, SoupStrainer
import re

def simple_extractor(html: str) -> str:
    """Extract the text of the page's main article from raw HTML.

    Only `<article role="main">` elements are parsed (via a SoupStrainer);
    the rest of the markup is ignored.

    Args:
        html: raw HTML of one page.

    Returns:
        The article text with every run of two or more newlines collapsed to
        exactly two, and leading/trailing whitespace stripped.
    """
    main_article_only = SoupStrainer('article', role='main')
    parsed = BeautifulSoup(html, "lxml", parse_only=main_article_only)
    collapsed = re.sub(r"\n\n+", "\n\n", parsed.text)
    return collapsed.strip()

In [4]:
from langchain.document_loaders import RecursiveUrlLoader, SitemapLoader
from langchain.utils.html import (PREFIXES_TO_IGNORE_REGEX,
                                  SUFFIXES_TO_IGNORE_REGEX)

# Crawl the IBM Generative AI SDK documentation starting from its root URL and
# load the pages; `api_ref` is the list returned by `.load()`. Page text is
# produced by `simple_extractor`, so only the main article of each page is kept.
# NOTE(review): the recorded output of this cell reports an SSL certificate
# error for rst_source/genai.credentials.html, and this call is not wrapped in
# `no_ssl_verification()` the way later chain invocations are — confirm whether
# that page is missing from the loaded documents.
api_ref = RecursiveUrlLoader(
    "https://ibm.github.io/ibm-generative-ai/",
    max_depth=8,
    extractor=simple_extractor,
    prevent_outside=True,
    use_async=False,
    timeout=600,
    check_response_status=True,
    # URLs excluded from the crawl: an external product page and what look like
    # the generated index / module-index / search pages of the docs site.
    exclude_dirs=(
        "https://www.ibm.com/products/watsonx-ai",
        "https://ibm.github.io/ibm-generative-ai/genindex.html",
        "https://ibm.github.io/ibm-generative-ai/py-modindex.html",
        "https://ibm.github.io/ibm-generative-ai/search.html",
    ),
    # drop trailing / to avoid duplicate pages.
    link_regex=(
        f"href=[\"']{PREFIXES_TO_IGNORE_REGEX}((?:{SUFFIXES_TO_IGNORE_REGEX}.)*?)"
        r"(?:[\#'\"]|\/[\#'\"])"
    ),
).load()

  soup = BeautifulSoup(
Unable to load from https://ibm.github.io/ibm-generative-ai/rst_source/genai.credentials.html. Received error HTTPSConnectionPool(host='ibm.github.io', port=443): Max retries exceeded with url: /ibm-generative-ai/rst_source/genai.credentials.html (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1006)'))) of type SSLError


In [5]:
# Sanity-check the crawl: number of loaded documents, character length of the
# first document, and the first 500 characters of the SECOND document
# (index 1 — not the document whose length is printed on the line above).
print(f'Nuber of docs loaded: {len(api_ref)}')
print(f'First doc lenght: {len(api_ref[0].page_content)}')
print(f'Sample: {api_ref[1].page_content[:500]}')

Nuber of docs loaded: 33
First doc lenght: 1273
Sample: Token#

class genai.schemas.token_params.TokenParams(*, return_tokens: bool | None = None)#
Bases: BaseModel

model_config: ClassVar[ConfigDict] = {'extra': 'forbid', 'str_strip_whitespace': True}#
Configuration for the model, should be a dictionary conforming to [ConfigDict][pydantic.config.ConfigDict].

model_fields: ClassVar[dict[str, FieldInfo]] = {'return_tokens': FieldInfo(annotation=Union[bool, NoneType], required=False, description='Return tokens with the response. Defaults to false.')}#


# 3. Split
- LLMs struggle to find information in a very long context, so we split the pages into reasonably sized documents
- it's good to have some overlap between the split documents so that context is not lost at the beginning / end of a chunk

In [6]:
# Split every loaded page into chunks of at most 2000 with an overlap of 200
# between neighbouring chunks (units are presumably characters, the splitter's
# default length measure — TODO confirm). `add_start_index=True` records each
# chunk's offset in its source page as `start_index` in the chunk metadata,
# which is visible in the metadata samples printed in the next cell.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(api_ref)

In [7]:
# Sanity-check the split: number of chunks, length of the first chunk, and the
# metadata of the first three chunks (requires at least three chunks).
print(f'Number of documents created: {len(all_splits)}')
print(f'First document length: {len(all_splits[0].page_content)}')
print(f'Metadata sample: {all_splits[0].metadata}')
print(f'Metadata sample: {all_splits[1].metadata}')
print(f'Metadata sample: {all_splits[2].metadata}')

Number of documents created: 127
First document length: 1273
Metadata sample: {'source': 'https://ibm.github.io/ibm-generative-ai/', 'title': 'IBM Generative AI Python SDK (Tech Preview)', 'language': 'en', 'start_index': 0}
Metadata sample: {'source': 'https://ibm.github.io/ibm-generative-ai/rst_source/genai.schemas.token_params.html', 'title': 'Token - IBM Generative AI Python SDK (Tech Preview)', 'language': 'en', 'start_index': 0}
Metadata sample: {'source': 'https://ibm.github.io/ibm-generative-ai/rst_source/genai.prompt.quickstart.html', 'title': 'Quick Start - IBM Generative AI Python SDK (Tech Preview)', 'language': 'en', 'start_index': 0}


In [8]:
# TODO - optimize with https://python.langchain.com/docs/use_cases/question_answering/#go-deeper-1

# 4. Vector store
- embed all documents and store them in a vector store

In [9]:
# Embedding client bound to the Azure deployment named by the
# OPENAI_DEPLOYMENT_ID_EMBED environment variable.
embedding=AzureOpenAIEmbeddings(azure_deployment=os.environ['OPENAI_DEPLOYMENT_ID_EMBED'])

In [10]:
# Embed every chunk and index it in a Chroma collection. The collection is
# deleted again by the clean-up cell at the end of the notebook.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=embedding)

# 5. Retrieve

In [11]:
# Retriever over the vector store using the "mmr" search type, returning
# k=6 documents per query.
retriever = vectorstore.as_retriever(search_type="mmr", search_kwargs={"k": 6})

In [61]:
# Retriever test: fetch documents for a sample question, then print how many
# came back, the source URL of each, and the full text of the first one.
# NOTE(review): in the recorded output none of the six sources is the
# genai.credentials page (the page the crawl reported an SSL error for), and
# the first hit is only a module index — retrieval quality for this question
# looks limited by the failed page load; confirm.
retrieved_docs = retriever.get_relevant_documents(
    "What are the parameters of Credentials?"
)
print(f'---Number of documents retrieved: {len(retrieved_docs)}')
print(f'---Retrieved docs source: ')
for doc in retrieved_docs:
    print(f'------Source: {doc.metadata["source"]}')
print(f'---Doc 1: {retrieved_docs[0].page_content}')

---Number of documents retrieved: 6
---Retrieved docs source: 
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/modules.html
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/genai.schemas.descriptions.html
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/examples/examples.html
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/examples/examples.user.html
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/genai.schemas.generate_params.html
------Source: https://ibm.github.io/ibm-generative-ai/rst_source/genai.prompt.quickstart.annexe.html
---Doc 1: GENAI#

Credentials
Credentials

Metadata
Metadata

Model
Model


# 6. Generate

In [62]:
# System prompt for the answering step. `{context}` is the only placeholder and
# receives the formatted retrieved documents; `{{number}}` is an escaped brace
# pair, so the rendered prompt shows the citation pattern as `[${number}]`.
# The retrieved documents are wrapped in a properly closed <context>...</context>
# block (the closing tag was previously the malformed, self-closing `<context/>`).
RESPONSE_TEMPLATE = """\
You are an expert programmer and problem-solver, tasked with answering any question \
about IBM generative AI Python SDK.

Generate a comprehensive and informative answer of 80 words or less for the \
given question based solely on the provided search results (URL and content). You must \
only use information from the provided search results. Use an unbiased and \
journalistic tone. Combine search results together into a coherent answer. Do not \
repeat text. Cite search results using [${{number}}] notation. Only cite the most \
relevant results that answer the question accurately. Place these citations at the end \
of the sentence or paragraph that reference them - do not put them all at the end. If \
different results refer to different entities within the same name, write separate \
answers for each entity.

You should use bullet points in your answer for readability. Put citations where they apply
rather than putting them all at the end.

If there is nothing in the context relevant to the question at hand, just say "Hmm, \
I'm not sure." Don't try to make up an answer.

Anything between the following `context`  html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. 

<context>
    {context} 
</context>

REMEMBER: If there is no relevant information within the context, just say "Hmm, I'm \
not sure." Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\
"""

# Prompt text for the condense step: given the chat history and a follow-up
# question, ask the model for a standalone version of the question.
# Placeholders: {chat_history} and {question}.
REPHRASE_TEMPLATE = """\
Given the following conversation and a follow up question, rephrase the follow up \
question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone Question:"""

In [63]:
# Chat model bound to the Azure deployment named by the OPENAI_DEPLOYMENT_ID_LLM
# environment variable; shared by every chain in this notebook.
llm = AzureChatOpenAI(azure_deployment=os.environ['OPENAI_DEPLOYMENT_ID_LLM'])

## handling new questions given the chat history

In [64]:
CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(REPHRASE_TEMPLATE)

In [65]:
# Condense step: fill REPHRASE_TEMPLATE with the chat history and follow-up
# question, call the LLM, and return its reply as a plain string.
# NOTE(review): CONDENSE_QUESTION_PROMPT from the previous cell is built from
# the same template but is not used here — one of the two could be dropped.
condense_question_chain = (
    PromptTemplate.from_template(REPHRASE_TEMPLATE)
    | llm
    | StrOutputParser()
)

# Pipe the standalone question into the retriever, so the result of this chain
# is the list of retrieved documents.
retriever_chain = condense_question_chain | retriever

In [19]:
# Test the condense step: given the history below, the follow-up
# "What are their types?" should come back as a standalone question that names
# the parameters explicitly (see the recorded output).
from langchain_core.messages import AIMessage, HumanMessage

condense_question_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What parameres of Credentials?"),
            AIMessage(content="The parameters of Credentials are api_key and api_endpoint."),
        ],
        "question": "What are their types?",
    }
)

'What are the types of api_key and api_endpoint parameters in Credentials?'

In [20]:
# Test the retriever chain: same input as the condense test, but the rewritten
# question is passed on to the retriever, so the result is a list of documents.
from langchain_core.messages import AIMessage, HumanMessage

retriever_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What parameres of Credentials?"),
            AIMessage(content="The parameters of Credentials are api_key and api_endpoint."),
        ],
        "question": "What are their types?",
    }
)

[Document(page_content='GENAI#\n\nCredentials\nCredentials\n\nMetadata\nMetadata\n\nModel\nModel', metadata={'language': 'en', 'source': 'https://ibm.github.io/ibm-generative-ai/rst_source/modules.html', 'start_index': 0, 'title': 'GENAI - IBM Generative AI Python SDK (Tech Preview)'}),
 Document(page_content='Schemas#\n\nSubmodules#\n\nDescriptions\nDescriptions\nFilesAPIDescriptions\nTunesAPIDescriptions\n\nGenerate\nChatOptions\nGenerateParams\nHAPOptions\nImplicitHateOptions\nLengthPenalty\nModerationTypeOptions\nModerationsOptions\nReturn\nReturnOptions\nStigmaOptions\n\nChat\nAIMessage\nBaseMessage\nChatRole\nHumanMessage\nSystemMessage\n\nHistory\nHistoryParams\n\nResponses\nChatResponse\nChatStreamResponse\nErrorExtensionState\nErrorExtensionStateParam\nErrorExtensions\nErrorResponse\nFileFormatResult\nFileInfoResult\nFilesListResponse\nGenAiResponseModel\nGenerateLimits\nGenerateResponse\nGenerateResult\nGenerateStreamResponse\nGenerateStreamResult\nGeneratedToken\nHAPResult\n

## final chain

In [66]:
from typing import Dict, List, Optional, Sequence
from langchain.schema import Document

def format_docs(docs: Sequence[Document]) -> str:
    """Render documents as numbered `<doc>` elements, one per line.

    Args:
        docs: documents whose `page_content` is to be included.

    Returns:
        A string with one `<doc id='i'>...</doc>` element per document, where
        `i` is the document's zero-based position in `docs`, joined by single
        newlines. An empty sequence yields an empty string.
    """
    return "\n".join(
        f"<doc id='{position}'>{document.page_content}</doc>"
        for position, document in enumerate(docs)
    )

In [35]:
# Build the three inputs the answer prompt needs from the raw chain input:
#   context      - retrieved documents (via retriever_chain) rendered by format_docs
#   question     - the user's question, passed through unchanged
#   chat_history - the prior messages, passed through unchanged
_context = RunnableMap(
    {
        "context": retriever_chain | format_docs,
        "question": itemgetter("question"),
        "chat_history": itemgetter("chat_history"),
    }
)
# Answer prompt: system instructions (with the retrieved context embedded),
# followed by the chat history messages and finally the user's question.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", RESPONSE_TEMPLATE),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

# Prompt -> LLM -> plain string answer.
response_synthesizer = (prompt | llm | StrOutputParser())



In [68]:
c = _context.invoke(
    {
        "chat_history": [
            HumanMessage(content="What parameres of Credentials?"),
            AIMessage(content="The parameters of Credentials are api_key and api_endpoint."),
        ],
        "question": "What are their types?",
    }
)

In [69]:
c

{'context': '<doc id=\'0\'>GENAI#\n\nCredentials\nCredentials\n\nMetadata\nMetadata\n\nModel\nModel</doc>\n<doc id=\'1\'>Schemas#\n\nSubmodules#\n\nDescriptions\nDescriptions\nFilesAPIDescriptions\nTunesAPIDescriptions\n\nGenerate\nChatOptions\nGenerateParams\nHAPOptions\nImplicitHateOptions\nLengthPenalty\nModerationTypeOptions\nModerationsOptions\nReturn\nReturnOptions\nStigmaOptions\n\nChat\nAIMessage\nBaseMessage\nChatRole\nHumanMessage\nSystemMessage\n\nHistory\nHistoryParams\n\nResponses\nChatResponse\nChatStreamResponse\nErrorExtensionState\nErrorExtensionStateParam\nErrorExtensions\nErrorResponse\nFileFormatResult\nFileInfoResult\nFilesListResponse\nGenAiResponseModel\nGenerateLimits\nGenerateResponse\nGenerateResult\nGenerateStreamResponse\nGenerateStreamResult\nGeneratedToken\nHAPResult\nHistoryResponse\nHistoryResult\nHistoryResultRequest\nImplicitHateResult\nModelCard\nModelList\nModerationResult\nModerationTypeResult\nStigmaResult\nStopReasonEnum\nTermsOfUse\nTermsOfUseRes

In [71]:
response_synthesizer.invoke(c)

'The `api_key` parameter is a string that is used to authenticate with the IBM generative AI Python SDK, while the `api_endpoint` parameter is a string that specifies the URL of the API endpoint.'

## add a wait step to work around the API rate limit

In [72]:
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)
import time

def wait(prompt: dict, seconds: float = 8) -> dict:
    """Pause, then hand the input back unchanged.

    Used as a pass-through chain step that spaces out consecutive LLM calls so
    they stay under the Azure OpenAI call-rate limit.

    Args:
        prompt: the value flowing through the chain (a dict of prompt inputs
            when used in `chain_wait`); it is neither read nor modified.
        seconds: how long to sleep. Defaults to 8, the delay this notebook has
            always used.

    Returns:
        The very same `prompt` object that was passed in.
    """
    time.sleep(seconds)
    return prompt

# Pause first (see `wait`), then pass the three prompt inputs through unchanged.
# Nothing here is produced by the LLM: each key is simply re-selected from the
# incoming dict.
chain_wait = RunnableLambda(wait) | {
    'context': itemgetter('context'), # formatted retrieved documents, unchanged
    'question': itemgetter('question'), # the user's question, unchanged
    'chat_history': itemgetter('chat_history'), # prior chat messages, unchanged
}

chain_wait.invoke(c) # test: should return the same three values that are in `c`

{'context': '<doc id=\'0\'>GENAI#\n\nCredentials\nCredentials\n\nMetadata\nMetadata\n\nModel\nModel</doc>\n<doc id=\'1\'>Schemas#\n\nSubmodules#\n\nDescriptions\nDescriptions\nFilesAPIDescriptions\nTunesAPIDescriptions\n\nGenerate\nChatOptions\nGenerateParams\nHAPOptions\nImplicitHateOptions\nLengthPenalty\nModerationTypeOptions\nModerationsOptions\nReturn\nReturnOptions\nStigmaOptions\n\nChat\nAIMessage\nBaseMessage\nChatRole\nHumanMessage\nSystemMessage\n\nHistory\nHistoryParams\n\nResponses\nChatResponse\nChatStreamResponse\nErrorExtensionState\nErrorExtensionStateParam\nErrorExtensions\nErrorResponse\nFileFormatResult\nFileInfoResult\nFilesListResponse\nGenAiResponseModel\nGenerateLimits\nGenerateResponse\nGenerateResult\nGenerateStreamResponse\nGenerateStreamResult\nGeneratedToken\nHAPResult\nHistoryResponse\nHistoryResult\nHistoryResultRequest\nImplicitHateResult\nModelCard\nModelList\nModerationResult\nModerationTypeResult\nStigmaResult\nStopReasonEnum\nTermsOfUse\nTermsOfUseRes

In [73]:
# Full pipeline: build the prompt inputs (condense question + retrieve), pause,
# then generate the answer. The pause sits between the two LLM calls because
# back-to-back calls can exceed the Azure OpenAI S0 call-rate limit; in the
# recorded run below one RateLimitError retry was still logged before the
# answer was returned.
answer_chain = _context | chain_wait | response_synthesizer

In [74]:
answer_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What parameres of Credentials?"),
            AIMessage(content="The parameters of Credentials are api_key and api_endpoint."),
        ],
        "question": "What are their types?",
    }
)

Retrying langchain_community.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 2 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


'The type of api_key is string, and the type of api_endpoint is string.'

## complex above

## simple below

In [16]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

# System instructions for the condense step: rewrite the latest user question
# as a standalone question, without answering it.
# NOTE(review): in the recorded test output below, the model answered the
# question instead of reformulating it — the prompt may need tightening.
condense_q_system_prompt = """Given a chat history and the latest user question \
which might reference the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
# Chat prompt: system instructions, then the history messages, then the question.
condense_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", condense_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
# Prompt -> LLM -> plain string (the standalone question).
condense_q_chain = condense_q_prompt | llm | StrOutputParser()

In [17]:
# test chat chain
from langchain_core.messages import AIMessage, HumanMessage

condense_q_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        "question": "What is meant by large",
    }
)

'The term "large" in large language model refers to the size of the neural network used to process and generate language.'

In [27]:
# QA chain
import time
# System prompt for the answering step; `{context}` receives the formatted
# retrieved documents. The trailing backslash after "concise." swallows that
# line's newline, so the rendered text has a single newline (not a blank line)
# between the instructions and the context.
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
# Chat prompt: system instructions + context, then the history, then the question.
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)

def format_docs(docs):
    """Concatenate the `page_content` of the given documents, separated by blank lines.

    Returns an empty string when `docs` is empty.
    """
    page_texts = [document.page_content for document in docs]
    return "\n\n".join(page_texts)

def condense_question(input: dict):
    """Return the question that should be sent to the retriever.

    Args:
        input: chain input with a "question" key and an optional
            "chat_history" key (list of prior messages).

    Returns:
        The original question when there is no chat history; otherwise the
        standalone question produced by `condense_q_chain` from the caller's
        own history and question.

    Raises:
        KeyError: if "question" is missing from `input`.
    """
    history = input.get("chat_history")
    if not history:
        # First turn: there is nothing to resolve, use the question verbatim.
        return input["question"]

    # Forward the caller's actual history and question. (This used to send a
    # hard-coded sample conversation, so every follow-up was condensed into the
    # same unrelated question and retrieval ran on the wrong query.)
    condensed_q = condense_q_chain.invoke(
        {"chat_history": history, "question": input["question"]}
    )
    # Pause before the answering LLM call that follows in rag_chain, to stay
    # under the Azure OpenAI call-rate limit.
    time.sleep(5)
    return condensed_q


# Add a `context` key to the input dict (question to retrieve with -> retriever
# -> formatted text), then fill the QA prompt and call the model. No output
# parser is attached: the next cell appends the chain's result (`ai_msg`)
# directly to `chat_history` as a message.
rag_chain = (
    RunnablePassthrough.assign(context=condense_question | retriever | format_docs)
    | qa_prompt
    | llm
)

In [28]:
chat_history = []

question = "What is Task Decomposition?"
with no_ssl_verification():
    ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])

In [29]:
chat_history

[HumanMessage(content='What is Task Decomposition?'),
 AIMessage(content='Task Decomposition is the process of breaking down a complex task into smaller, more manageable sub-tasks or components. This allows for easier delegation of tasks, planning, and coordination of efforts. By breaking down a task into smaller pieces, it becomes more straightforward to identify dependencies and possible areas for optimization.')]

In [30]:
second_question = "What are common ways of doing it?"

with no_ssl_verification():
    rag_chain.invoke({"question": second_question, "chat_history": chat_history})

Retrying langchain_community.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 8 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 4 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.

# OLD below

In [56]:
#get prompt template
prompt = hub.pull("rlm/rag-prompt")

In [26]:
# test prompt
print(
    prompt.invoke(
        {"context": "filler context", "question": "filler question"}
    ).to_string()
)

Human: You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


## create chain
- LangChain Expression Language (LCEL)

In [57]:
def format_docs(docs):
    """Join the text of all documents into one string, with a blank line between them."""
    texts = []
    for doc in docs:
        texts.append(doc.page_content)
    return "\n\n".join(texts)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [58]:
with no_ssl_verification():
    for chunk in rag_chain.stream("What are the parameters of Credentials?"):
        print(chunk, end="", flush=True)

The parameters of Credentials are api_key and api_endpoint.

In [19]:
# with no_ssl_verification():
#     rag_chain.invoke("What is Task Decomposition?")

## customize the prompt

In [21]:
from langchain.prompts import PromptTemplate

template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "Cheers!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
rag_prompt_custom = PromptTemplate.from_template(template)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)

with no_ssl_verification():
    rag_chain.invoke("What is Task Decomposition?")

# 7. Add sources

In [22]:
from operator import itemgetter

from langchain_core.runnables import RunnableParallel

rag_chain_from_docs = (
    {
        "context": lambda input: format_docs(input["documents"]),
        "question": itemgetter("question"),
    }
    | rag_prompt_custom
    | llm
    | StrOutputParser()
)
rag_chain_with_source = RunnableParallel(
    {"documents": retriever, "question": RunnablePassthrough()}
) | {
    "documents": lambda input: [doc.metadata for doc in input["documents"]],
    "answer": rag_chain_from_docs,
}

rag_chain_with_source.invoke("What is Task Decomposition")

Retrying langchain_community.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


{'documents': [{'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 1585},
  {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 2192},
  {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 17804},
  {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 17414},
  {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 29630},
  {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/',
   'start_index': 19373}],
 'answer': 'Task decomposition is the process of breaking down a complex task into smaller, more manageable steps. This can be done through techniques such as Chain of Thought or Tree of Thoughts, and can be facilitated by LLM, task-specific instructions, or human inputs. Cheers!'}

# 8. Add Memory
- for stateful apps
- this can be done by reformulating the latest question given the chat history
- add a call to the LLM to do the reformulation

In [24]:
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder

condense_q_system_prompt = """Given a chat history and the latest user question \
which might reference the chat history, formulate a standalone question \
which can be understood without the chat history. Do NOT answer the question, \
just reformulate it if needed and otherwise return it as is."""
condense_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", condense_q_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)
condense_q_chain = condense_q_prompt | llm | StrOutputParser()

In [25]:
condense_q_chain

ChatPromptTemplate(input_variables=['chat_history', 'question'], input_types={'chat_history': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], template='Given a chat history and the latest user question which might reference the chat history, formulate a standalone question which can be understood without the chat history. Do NOT answer the question, just reformulate it if needed and otherwise return it as is.')), MessagesPlaceholder(variable_name='chat_history'), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question'], template='{question}'))])
| AzureChatOpenAI(client=<class 'openai.api_resources.chat_completion.ChatCompletion'>, openai_api_key='2

In [26]:
# test chat chain
from langchain_core.messages import AIMessage, HumanMessage

condense_q_chain.invoke(
    {
        "chat_history": [
            HumanMessage(content="What does LLM stand for?"),
            AIMessage(content="Large language model"),
        ],
        "question": "What is meant by large",
    }
)

'The term "large" in reference to language models generally means that the model has been trained on a very large corpus of text data, often on the scale of billions or trillions of words.'

In [31]:
# QA chain
qa_system_prompt = """You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Use three sentences maximum and keep the answer concise.\

{context}"""
qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", qa_system_prompt),
        MessagesPlaceholder(variable_name="chat_history"),
        ("human", "{question}"),
    ]
)


def condense_question(input: dict):
    """Choose how the retrieval question is obtained for this turn.

    Returns the `condense_q_chain` runnable itself (not its result) when the
    input carries a non-empty "chat_history"; otherwise returns the raw
    "question" string. Presumably LangChain runs a runnable returned from a
    function step on the same input — confirm against the LCEL documentation.
    """
    has_history = bool(input.get("chat_history"))
    return condense_q_chain if has_history else input["question"]


rag_chain = (
    RunnablePassthrough.assign(context=condense_question | retriever | format_docs)
    | qa_prompt
    | llm
)

In [32]:
chat_history = []

question = "What is Task Decomposition?"
with no_ssl_verification():
    ai_msg = rag_chain.invoke({"question": question, "chat_history": chat_history})
chat_history.extend([HumanMessage(content=question), ai_msg])


In [34]:
second_question = "What are common ways of doing it?"

with no_ssl_verification():
    rag_chain.invoke({"question": second_question, "chat_history": chat_history})

chat_history


Retrying langchain_community.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-09-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 5 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.

This approach runs into the Azure OpenAI call rate limit.

# Approach 2 - Using conversational chain

In [None]:
# !pip install tiktoken chromadb

In [None]:
# from langchain.document_loaders import WebBaseLoader

# loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
# data = loader.load()

In [None]:
# from langchain.text_splitter import RecursiveCharacterTextSplitter

# text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
# all_splits = text_splitter.split_documents(data)

# from langchain.embeddings import OpenAIEmbeddings
# from langchain.vectorstores import Chroma

# vectorstore = Chroma.from_documents(documents=all_splits, embedding=OpenAIEmbeddings())

In [None]:
from langchain.memory import ConversationSummaryMemory

memory = ConversationSummaryMemory(
    llm=llm, memory_key="chat_history", return_messages=True
)

In [None]:
from langchain.chains import ConversationalRetrievalChain
# from langchain.chat_models import ChatOpenAI

# llm = ChatOpenAI()
# retriever = vectorstore.as_retriever()
qa = ConversationalRetrievalChain.from_llm(llm, retriever=retriever, memory=memory)

In [None]:
qa("How do agents use Task decomposition?")

Retrying langchain_community.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-07-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 10 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit..


RateLimitError: Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2023-07-01-preview have exceeded call rate limit of your current OpenAI S0 pricing tier. Please retry after 6 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.

This approach runs into the Azure OpenAI call rate limit.

In [None]:
qa("What are the various ways to implement memory to support it?")

# Clean up

In [None]:
# cleanup
vectorstore.delete_collection()