# WebResearchRetriever

Given a query, this retriever will: 

* Formulate a set of related Google searches
* Search for each 
* Load all the resulting URLs
* Then embed and perform similarity search with the query on the consolidated page content

In [None]:
# Dependency cell. Every install command below is commented out, so running
# this cell only prints the two status messages; uncomment the lines you need
# to actually install the packages into the notebook kernel.
print("Checking dependencies...")
# %pip install --upgrade pip --quiet
# %pip install langchain --quiet
# %pip install python-dotenv --quiet
# %pip install openai --quiet
# %pip install beautifulsoup4 --quiet
# %pip install chromadb --quiet
# %pip install google-api-python-client --quiet
# %pip install html2text --quiet
# %pip install tiktoken --quiet
# %pip install rich --quiet
print("Done!")


from dotenv import load_dotenv
# Populate the process environment before any client is created
# (presumably from a local .env file holding the API keys -- confirm).
load_dotenv()

# Editor setting kept for reference; it is not executed.
# notebook.output.wordWrap = True

In [None]:
from langchain.retrievers.web_research import WebResearchRetriever

### Simple usage

Specify the LLM to use for Google search query generation.

In [None]:
import os


from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models.openai import ChatOpenAI
from langchain.utilities import GoogleSearchAPIWrapper

# Load environment variables first so the clients constructed below can pick
# up their credentials even when this cell is run on its own.
load_dotenv()

# Search credentials: fail fast with a readable message. Assigning
# `os.environ[name] = os.getenv(name)` raises an opaque TypeError when the
# variable is unset and changes nothing when it is set.
missing_vars = [
    name for name in ("GOOGLE_CSE_ID", "GOOGLE_API_KEY") if not os.getenv(name)
]
if missing_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_vars)
    )

# Vectorstore
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./chroma_db_oai",
)

# LLM
llm = ChatOpenAI(temperature=0)

# Search
search = GoogleSearchAPIWrapper()

In [None]:
# Build the retriever from the LLM, the search wrapper and the vectorstore
# created in the previous cell.
web_research_retriever = WebResearchRetriever.from_llm(
    llm=llm,
    search=search,
    vectorstore=vectorstore,
)

#### Run with citations

We can use `RetrievalQAWithSourcesChain` to retrieve docs and provide citations.

In [18]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chains.question_answering import load_qa_chain

# Question to research on the web.
user_input = "What is FreeMoCap?"

# NOTE: the RetrievalQAWithSourcesChain variant is left disabled in this cell:
# qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=web_research_retriever)

# Answer the question with a "stuff" QA chain run over the retrieved documents.
chain = load_qa_chain(llm, chain_type="stuff")
docs = web_research_retriever.get_relevant_documents(user_input)
output = chain(
    {"input_documents": docs, "question": user_input},
    return_only_outputs=True,
)
output["output_text"]



INFO:langchain.retrievers.web_research:Generating questions for Google Search ...


INFO:langchain.retrievers.web_research:Questions for Google Search (raw): {'question': 'What is FreeMoCap?', 'text': LineList(lines=['1. What does FreeMoCap stand for?\n', '2. How does FreeMoCap work?\n', '3. Can you explain the concept of FreeMoCap?'])}
INFO:langchain.retrievers.web_research:Questions for Google Search: ['1. What does FreeMoCap stand for?\n', '2. How does FreeMoCap work?\n', '3. Can you explain the concept of FreeMoCap?']
INFO:langchain.retrievers.web_research:Searching for relevant urls...
INFO:langchain.retrievers.web_research:Searching for relevant urls...
INFO:langchain.retrievers.web_research:Search results: [{'title': 'Home | The FreeMoCap Project', 'link': 'https://freemocap.org/', 'snippet': 'The Free Motion Capture Project (FreeMoCap) aims to provide research-grade markerless motion capture software to everyone for free.'}]
INFO:langchain.retrievers.web_research:Searching for relevant urls...
INFO:langchain.retrievers.web_research:Search results: [{'title': '

'FreeMoCap is a free and open-source motion capture system and platform. It is designed to be hardware and software agnostic, meaning it can work with different types of motion capture devices and software. FreeMoCap aims to provide a minimal-cost solution for motion capture, making it accessible for scientific research, education, and training purposes. It offers a graphical user interface (GUI) and supports decentralized scientific research.'

In [None]:
# Build the citation-aware chain in this cell: no earlier cell defines
# `qa_chain` (the only prior construction is commented out), so calling it
# here would raise NameError when the notebook is run top to bottom.
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, retriever=web_research_retriever
)
result = qa_chain({"question": user_input})
result

#### Run with logging

Here, we use the `get_relevant_documents` method to return docs.

In [None]:
# Run
import logging

from langchain.chains.question_answering import load_qa_chain

# Turn on INFO-level messages for the web research retriever's logger.
logging.basicConfig()
retriever_logger = logging.getLogger("langchain.retrievers.web_research")
retriever_logger.setLevel(logging.INFO)

# Retrieve documents for the question.
user_input = "Who is Yocheved Lifshitz?"
docs = web_research_retriever.get_relevant_documents(user_input)

# Answer the question from the retrieved documents with a "stuff" QA chain.
chain = load_qa_chain(llm, chain_type="stuff")
output = chain(
    {"input_documents": docs, "question": user_input},
    return_only_outputs=True,
)
output["output_text"]


#### Generate answer using retrieved docs

We can use `load_qa_chain` for QA using the retrieved docs.

### More flexibility

Pass an LLM chain with custom prompt and output parsing.

In [None]:
import os
import re
from typing import List
from langchain.chains import LLMChain
from pydantic import BaseModel, Field
from langchain.prompts import PromptTemplate
from langchain.output_parsers.pydantic import PydanticOutputParser

# Prompt asking the LLM for five numbered Google search queries derived from
# the user's question; `{question}` is the only template variable.
search_prompt = PromptTemplate(
    template="""You are an assistant tasked with improving Google search 
    results. Generate FIVE Google search queries that are similar to
    this question. The output should be a numbered list of questions and each
    should have a question mark at the end: {question}""",
    input_variables=["question"],
)

class LineList(BaseModel):
    """Container model holding the parsed questions as a list of strings."""

    # Required field: one entry per question line.
    lines: List[str] = Field(..., description="Questions")

class QuestionListOutputParser(PydanticOutputParser):
    """Output parser for a list of numbered questions."""

    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        """Extract every numbered item ("1. ...") from the LLM output.

        Args:
            text: Raw LLM completion expected to contain a numbered list.

        Returns:
            A ``LineList`` whose ``lines`` holds each matched item, including
            its trailing newline when one is present.
        """
        # An item ends at a newline OR at the end of the text. Requiring "\n"
        # silently dropped the final question whenever the completion did not
        # end with a newline.
        lines = re.findall(r"\d+\..*?(?:\n|$)", text)
        return LineList(lines=lines)
    
# Chain that turns a user question into parsed search queries: the custom
# prompt is sent to the LLM and the completion goes through the custom parser.
llm_chain = LLMChain(
    prompt=search_prompt,
    llm=llm,
    output_parser=QuestionListOutputParser(),
)

In [None]:
# Construct the retriever directly, supplying the custom query-generation
# chain instead of letting `from_llm` build one.
web_research_retriever_llm_chain = WebResearchRetriever(
    llm_chain=llm_chain,
    search=search,
    vectorstore=vectorstore,
)

# Retrieve documents for the current `user_input`.
docs = web_research_retriever_llm_chain.get_relevant_documents(user_input)

In [None]:

# Import rich's printer under an alias: `from rich import print` would replace
# the builtin `print` for every cell executed afterwards in this kernel.
from rich import print as rich_print

# Show the text of each retrieved document.
for doc in docs:
    rich_print(doc.page_content)

### Run locally

Specify LLM and embeddings that will run locally (e.g., on your laptop).

In [None]:
# from langchain.llms import LlamaCpp
# from langchain.embeddings import GPT4AllEmbeddings
# from langchain.callbacks.manager import CallbackManager
# from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

# n_gpu_layers = 1  # Metal set to 1 is enough.
# n_batch = 512  # Should be between 1 and n_ctx, consider the amount of RAM of your Apple Silicon Chip.
# callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
# llama = LlamaCpp(
#     model_path="/Users/rlm/Desktop/Code/llama.cpp/llama-2-13b-chat.ggmlv3.q4_0.bin",
#     n_gpu_layers=n_gpu_layers,
#     n_batch=n_batch,
#     n_ctx=4096,  # Context window
#     max_tokens=1000,  # Max tokens to generate
#     f16_kv=True,  # MUST set to True, otherwise you will run into problem after a couple of calls
#     callback_manager=callback_manager,
#     verbose=True,
# )

# vectorstore_llama = Chroma(embedding_function=GPT4AllEmbeddings(),persist_directory="./chroma_db_llama")

We supplied `StreamingStdOutCallbackHandler()`, so model outputs (e.g., generated questions) are streamed. 

We also have logging on, so we see them there too.

In [None]:
from langchain.chains import RetrievalQAWithSourcesChain
# Recreate the retriever from the LLM, search wrapper and vectorstore.
web_research_retriever = WebResearchRetriever.from_llm(
    llm=llm,
    search=search,
    vectorstore=vectorstore,
)

# Ask the question through a sources-aware QA chain backed by the retriever.
user_input = "What is Task Decomposition in LLM Powered Autonomous Agents?"
qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    retriever=web_research_retriever,
)
result = qa_chain({"question": user_input})
result