# RAG with Agents using local model

In [23]:
import os
import bs4
from langchain.embeddings import GPT4AllEmbeddings
from langchain.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import GPT4All
from langchain.prompts import PromptTemplate
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
# from ssl_workaround import no_ssl_verification

# 1. Init

# 2. Load from Web

In [4]:
# loads URL content into a document - one document per URL
# only extracts the content of the post-content, post-title, and post-header classes
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs={
        "parse_only": bs4.SoupStrainer(
            class_=("post-content", "post-title", "post-header")
        )
    },
)
docs = loader.load()

In [5]:
# test doc loading
print(f'Nuber of docs loaded: {len(docs)}')
print(f'First doc lenght: {len(docs[0].page_content)}')
print(f'Sample: {docs[0].page_content[:500]}')

Nuber of docs loaded: 1
First doc lenght: 42824
Sample: 

      LLM Powered Autonomous Agents
    
Date: June 23, 2023  |  Estimated Reading Time: 31 min  |  Author: Lilian Weng


Building agents with LLM (large language model) as its core controller is a cool concept. Several proof-of-concepts demos, such as AutoGPT, GPT-Engineer and BabyAGI, serve as inspiring examples. The potentiality of LLM extends beyond generating well-written copies, stories, essays and programs; it can be framed as a powerful general problem solver.
Agent System Overview#
In


# 3. Split
- LLM struggle to find info in very long context, we should split it in reasonable long documents
- its good to have overlap between spit documents in order not to loose context from ending / begginging chunk parts

In [6]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
all_splits = text_splitter.split_documents(docs)

In [7]:
# test splitter
print(f'Number of documents created: {len(all_splits)}')
print(f'First document length: {len(all_splits[0].page_content)}')
print(f'Metadata sample: {all_splits[0].metadata}')

Number of documents created: 66
First document length: 969
Metadata sample: {'source': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'start_index': 8}


# 4. Vector Store
using local LLM model for embedding

In [8]:
# local model for embeddings
gpt4all_embd = GPT4AllEmbeddings()

In [9]:
# test embedding model
text = "This is a test document."
query_result = gpt4all_embd.embed_query(text)
print(f'embed slen: {len(query_result)}')
print(query_result[:20])

embed slen: 384
[-0.04898764193058014, 0.12305509299039841, -0.04326639696955681, 0.0587812215089798, 0.009365170262753963, -0.0384075902402401, -0.07787521928548813, 0.0464746430516243, -0.024038562551140785, 0.05405070260167122, 0.09310438483953476, 0.019093262031674385, 0.006022939924150705, -0.0064134784042835236, -0.06747597455978394, -0.016789510846138, -0.021481089293956757, -0.045280035585165024, 0.0024571875110268593, 0.10313370078802109]


In [14]:
persist_directory = './data/test_vectorstore_llm_localemb/'

In [10]:
# # OPTION 1: create vectorstore from scratch - needed first time
# vectorstore = Chroma.from_documents(
#                                 documents=all_splits, 
#                                 embedding=gpt4all_embd, 
#                                 persist_directory=persist_directory)

In [15]:
# OPTION 2: load vectorstore from disk - faster
vectorstore = Chroma(
            persist_directory=persist_directory, 
            embedding_function=gpt4all_embd)

# Retriever tool

In [16]:
retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [17]:
# retriever test
retrieved_docs = retriever.get_relevant_documents(
    "What are the approaches to Task Decomposition?"
)
print(f'---Number of documents retrieved: {len(retrieved_docs)}')
print(f'---Doc 0 length: {len(retrieved_docs[0].page_content)}')
print(f'---Doc 0: {retrieved_docs[0].page_content}')
print(f'---Doc 1 length: {len(retrieved_docs[1].page_content)}')
print(f'---Doc 1: {retrieved_docs[1].page_content}')

---Number of documents retrieved: 3
---Doc 0 length: 644
---Doc 0: Tree of Thoughts (Yao et al. 2023) extends CoT by exploring multiple reasoning possibilities at each step. It first decomposes the problem into multiple thought steps and generates multiple thoughts per step, creating a tree structure. The search process can be BFS (breadth-first search) or DFS (depth-first search) with each state evaluated by a classifier (via a prompt) or majority vote.
Task decomposition can be done (1) by LLM with simple prompting like "Steps for XYZ.\n1.", "What are the subgoals for achieving XYZ?", (2) by using task-specific instructions; e.g. "Write a story outline." for writing a novel, or (3) with human inputs.
---Doc 1 length: 606
---Doc 1: Fig. 1. Overview of a LLM-powered autonomous agent system.
Component One: Planning#
A complicated task usually involves many steps. An agent needs to know what they are and plan ahead.
Task Decomposition#
Chain of thought (CoT; Wei et al. 2022) has become a

# Create Local LLM instance

In [20]:
template = """Question: {question}

Answer: Let's think step by step."""

prompt = PromptTemplate(template=template, input_variables=["question"])

In [21]:
llm = GPT4All(
    model="../local_models/gpt4all/mistral-7b-openorca.Q4_0.gguf",
    max_tokens=2048,
)

In [24]:
# conversational memory
conversational_memory = ConversationBufferWindowMemory(
    memory_key='chat_history',
    k=5,
    return_messages=True
)
# retrieval qa chain
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    verbose=True
)

In [25]:
# retriever chat test
qa.run("How can the reflextion framework be used in LLM apps?")

  warn_deprecated(


" The reflection framework can be used in LLM applications by following these steps: 1) Identifying inefficient or hallucinatory trajectories, 2) Creating self-reflection through two-shot examples and adding them into the agent's working memory for querying LLMs, 3) Using tool APIs provided by other developers (like Plugins) or self-defined functions to enhance the capabilities of the LLM. For example, HuggingGPT uses ChatGPT as a task planner to select models available in the HuggingFace platform and summarize execution results based on these selections."

# Create Agent

In [33]:
from langchain.agents import Tool

tools = [
    Tool(
        name='Knowledge Base',
        func=qa.run,
        description=(
            'use this tool when answering questions about LLM, LLM apps and the LLM frameworks'
        )
    )
]

agent types: \
https://api.python.langchain.com/en/latest/agents/langchain.agents.agent_types.AgentType.html#langchain.agents.agent_types.AgentType

In [34]:
from langchain.agents import initialize_agent

agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory,
    handle_parsing_errors=True
)

In [36]:
# existing prompt
print(agent.agent.llm_chain.prompt.messages[0].prompt.template)

Assistant is a large language model trained by OpenAI.

Assistant is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Assistant is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. It is able to process and understand large amounts of text, and can use this knowledge to provide accurate and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics.

Overall, Assistant is a powerful system that can help with a wide range of task

In [37]:
sys_msg = """Assistant is a large language model.

Assistant is designed to be able to assist with a wide range of tasks related to LLMs, machine learning and programming , \
from answering simple questions to providing in-depth explanations, example code and links to documentation. \
As a language model, Assistant is able to generate human-like text based on the input it receives, \
allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand.
Assistant can use code spippets and URL links.

Assistant is constantly learning and improving, and its capabilities are constantly evolving. \
It is able to process and understand large amounts of text, and can use this knowledge to provide accurate \
and informative responses to a wide range of questions. Additionally, Assistant is able to generate its own text based on the input \
it receives, allowing it to engage in discussions and provide explanations and descriptions on a wide range of topics related to LLMs, machine learning and programming.

If the question is related to LLMs, machine learning and programming Assistant has to first use the Knowledge Base. \

If the question is not related to LLMs, machine learning and programming, Assistant has to respond: \
I'm sorry this is not my area of expertise.

"""

In [None]:
# If no relevant answer is found in the Knowlege Base, Assistant can use Web Search tool to find an answer on the internet. \

In [38]:
new_prompt = agent.agent.create_prompt(
    system_message=sys_msg,
    tools=tools
)

agent.agent.llm_chain.prompt = new_prompt

In [39]:
agent.run("How can the reflextion framework be used in LLM apps?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "The reflection framework can be used in LLM apps by incorporating it into the model's training process to improve its performance over time."
}[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "The reflection framework can be used in LLM apps by incorporating it into the model's training process to improve its performance over time."
}[0m
Observation: Invalid or incomplete response
Thought:

ValueError: variable agent_scratchpad should be a list of base messages, got Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "The reflection framework can be used in LLM apps by incorporating it into the model's training process to improve its performance over time."
}
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "The reflection framework can be used in LLM apps by incorporating it into the model's training process to improve its performance over time."
}
Observation: Invalid or incomplete response
Thought:

I now need to return a final answer based on the previous steps:

In [42]:
agent("what is 3 * 7?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:

ValueError: variable agent_scratchpad should be a list of base messages, got Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:

I now need to return a final answer based on the previous steps:

## adding a search

In [27]:
# %pip install --upgrade --quiet  duckduckgo-search

Note: you may need to restart the kernel to use updated packages.


basic search

In [43]:
from langchain.tools import DuckDuckGoSearchRun

In [44]:
search = DuckDuckGoSearchRun()

In [45]:
search.run("What are LLMs?")

"In the simplest of terms, LLMs are next-word prediction engines. Along with OpenAI's GPT-3 and 4 LLM, popular LLMs include open models such as Google's LaMDA and PaLM LLM (the basis for Bard),... A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs are artificial neural networks typically built with a transformer-based architecture. Define language models and large language models (LLMs). Define key LLM concepts, including Transformers and self-attention. Describe the costs and benefits of LLMs, along with common use... LLMs are a subset of generative AI that primarily use language as opposed to other more diverse representations seen through generative AI. But it's worth noting that these distinctions are becoming increasingly blurred as m

with links

In [46]:
from langchain.tools import DuckDuckGoSearchResults

In [47]:
search = DuckDuckGoSearchResults()

In [48]:
search.run("What are LLMs?")

"[snippet: In the simplest of terms, LLMs are next-word prediction engines. Along with OpenAI's GPT-3 and 4 LLM, popular LLMs include open models such as Google's LaMDA and PaLM LLM (the basis for Bard),..., title: What are LLMs, and how are they used in generative AI?, link: https://www.computerworld.com/article/3697649/what-are-large-language-models-and-how-are-they-used-in-generative-ai.html], [snippet: A large language model (LLM) is a language model notable for its ability to achieve general-purpose language generation. LLMs acquire these abilities by learning statistical relationships from text documents during a computationally intensive self-supervised and semi-supervised training process. LLMs are artificial neural networks typically built with a transformer-based architecture., title: Large language model - Wikipedia, link: https://en.wikipedia.org/wiki/Large_language_model], [snippet: Define language models and large language models (LLMs). Define key LLM concepts, including

wrapper with more control

In [49]:
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper

wrapper = DuckDuckGoSearchAPIWrapper(time="y", max_results=3)

In [50]:
search = DuckDuckGoSearchResults(api_wrapper=wrapper)
search.run("What is the best programming language for ML?")

"[snippet: Unambiguously, Python is the best language for machine learning. It's an interpreted language that is easy to read and understand, as its syntax is relatively similar to spoken languages. Because of this, it is also one of the easiest languages to learn., title: The Best Programming Languages for Machine Learning, link: https://tripleten.com/blog/posts/the-best-programming-languages-for-machine-learning], [snippet: Lower-level languages (like R, C++, or Java) offer greater speed but are harder to learn. Higher-level languages (like JavaScript and Python) are easier to use but slower to execute. Python is a key language for machine learning and data analytics. For speed-to-competence and breadth of application, it's probably the best one for beginners., title: What's the Best Language for Machine Learning? - CareerFoundry, link: https://careerfoundry.com/en/blog/data-analytics/best-machine-learning-languages/], [snippet: Contents 1. Best programming languages for machine lear

# Agent with search

In [51]:
tools = [
    # Tool(
    #     name='Knowledge Base',
    #     func=qa.run,
    #     description=(
    #         'use this tool when answering questions about LLM, LLM apps and the LLM frameworks'
    #     )
    # ),
    Tool(
        name='Web Search',
        func=search.run,
        description=(
            'use this tool when answering questions about LLM, LLM apps and the LLM frameworks'
        )
    )
]

In [52]:
agent = initialize_agent(
    agent='chat-conversational-react-description',
    tools=tools,
    llm=llm,
    verbose=True,
    max_iterations=3,
    early_stopping_method='generate',
    memory=conversational_memory,
    handle_parsing_errors=True
)

In [53]:
agent.run("What are LLMs?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "LLMs are large language models that can generate human-like text based on the input they receive."
}[0m
Observation: Invalid or incomplete response
Thought:[32;1m[1;3mCould not parse LLM output: [0m
Observation: Invalid or incomplete response
Thought:

ValueError: variable agent_scratchpad should be a list of base messages, got Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
AI: {
    "action": "Final Answer",
    "action_input": "LLMs are large language models that can generate human-like text based on the input they receive."
}
Observation: Invalid or incomplete response
Thought:Could not parse LLM output: 
Observation: Invalid or incomplete response
Thought:

I now need to return a final answer based on the previous steps: