# Imports and setup

In [1]:
# Reload imported modules automatically so edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Load the jupyter_black and dotenv extensions, then run %dotenv.
# NOTE(review): presumably %dotenv reads a local .env file that supplies
# GOOGLE_API_KEY and the LangChain credentials - confirm.
%load_ext jupyter_black
%load_ext dotenv
%dotenv

In [2]:
import os

# Tracing configuration for LangChain, applied to the process environment in
# one call: enable V2 tracing and set the API endpoint it reports to.
os.environ.update(
    {
        "LANGCHAIN_TRACING_V2": "true",
        "LANGCHAIN_ENDPOINT": "https://api.smith.langchain.com",
    }
)

In [3]:
import bs4
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    TextSplitter,
    SentenceTransformersTokenTextSplitter,
)
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_community.chat_models.google_palm import ChatGooglePalm
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [4]:
# Set logging for the queries
import logging

# Install the default root handler, then raise the multi-query retriever's
# logger to INFO so its messages are shown.
logging.basicConfig()
multi_query_logger = logging.getLogger("langchain.retrievers.multi_query")
multi_query_logger.setLevel(logging.INFO)

In [5]:
# The Hugging Face `tokenizers` library reads this switch from the process
# environment, so a plain Python variable has no effect on it. Export it as an
# environment variable; the module-level name is kept in case a later cell
# refers to it.
TOKENIZERS_PARALLELISM = False
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Prepare Vector Store

In [6]:
# Corpus: English-Wikipedia articles listed by slug. The slugs stay
# percent-encoded exactly as they appear in the article URLs.
_WIKIPEDIA_ARTICLE_PREFIX = "https://en.wikipedia.org/wiki/"
_ARTICLE_SLUGS = (
    "President_of_Poland",
    "Gabriel_Narutowicz",
    "Stanis%C5%82aw_Wojciechowski",
    "Ignacy_Mo%C5%9Bcicki",
    "Wojciech_Jaruzelski",
    "Lech_Wa%C5%82%C4%99sa",
    "Aleksander_Kwa%C5%9Bniewski",
    "Lech_Kaczy%C5%84ski",
    "Bronis%C5%82aw_Komorowski",
    "Andrzej_Duda",
)
polands_presidents_pages = [
    _WIKIPEDIA_ARTICLE_PREFIX + slug for slug in _ARTICLE_SLUGS
]

In [7]:
# Download every page, parsing only the element whose id is "mw-content-text"
# instead of the whole HTML document.
content_only = bs4.SoupStrainer(id="mw-content-text")
loader = WebBaseLoader(
    web_paths=polands_presidents_pages,
    bs_kwargs={"parse_only": content_only},
)
blog_docs = loader.load()

In [8]:
# Display the loaded documents.
# NOTE(review): this renders the full text of every page in the cell output;
# consider showing only a slice (e.g. the first document's leading characters).
blog_docs

[Document(page_content='Head of state of Poland\nFor a list of holders of the office, see List of presidents of Poland.\n\n\nPresident of the Republic of PolandPrezydent Rzeczypospolitej Polskiej\xa0(Polish)Presidential pennantIncumbentAndrzej Dudasince\xa06 August 2015Executive branch of the Polish GovernmentStyleMr. President(informal)His Excellency(diplomatic)StatusHead of stateCommander-in-chiefMember ofNational Security CouncilNational Development CouncilResidencePresidential PalaceWarsawAppointerPopular voteTerm lengthFive years, renewable onceConstituting instrumentConstitution of PolandPrecursorChief of StateFormation11\xa0December 1922; 101 years ago\xa0(1922-12-11)First holderGabriel NarutowiczDeputyMarshal of the SejmSalary294,000 zł annually[1]WebsiteOfficial website\nPolitics of Poland\nGovernment\nConstitution of Poland\nLaw\nHuman rights\n\nLegislature\nParliament of Poland\nCurrent Parliament\n \n\n\n\n\nSejm\nMarshal Szymon Hołownia (PL2050)\nDeputy Marshals\n\n\n\nSen

In [9]:
# Cut the loaded pages into overlapping chunks: at most 500 characters each,
# with 100 characters shared between neighbouring chunks.
chunking_options = {"chunk_size": 500, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**chunking_options)
docs = text_splitter.split_documents(blog_docs)

# How many chunks were produced
len(docs)

957

In [19]:
# Embed every chunk with the sentence-transformers GTR-T5 base model and index
# the resulting vectors in a Chroma collection.
embedding_model = HuggingFaceEmbeddings(
    # Plain string literal: there is nothing to interpolate, so no f-prefix.
    model_name="sentence-transformers/gtr-t5-base"
)
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embedding_model,
)



In [20]:
# Chat model shared by the notebook; the API key comes from the environment
# (None is passed through when GOOGLE_API_KEY is not set).
llm = ChatGooglePalm(google_api_key=os.environ.get("GOOGLE_API_KEY"))

Let's use the default prompt built into `MultiQueryRetriever` (shown here for reference only; it is not passed explicitly below):

template="""You are an AI language model assistant. Your task is \
to generate 3 different versions of the given user \
question to retrieve relevant documents from a vector  database. \
By generating multiple perspectives on the user question, \
your goal is to help the user overcome some of the limitations \
of distance-based similarity search. Provide these alternative \
questions separated by newlines. Original question: {question}"""

# MultiQueryRetriever

In [21]:
# Base retriever over the vector store: the 3 nearest chunks for each query.
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

# Wrap it so the LLM first rewrites the question into several variants and the
# base retriever is queried with those variants.
multi_qa_retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    # Optional: add include_original=True to also search with the original query.
)

In [22]:
question = "tell me about polish presidents"

# Store the hits under their own name. Rebinding `docs` here would overwrite
# the chunk list that the vector-store cell indexes, so re-running that cell
# afterwards would silently index only these few retrieved documents.
retrieved_docs = multi_qa_retriever.invoke(question)
len(retrieved_docs)

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Who have been the presidents of Poland?', '2. What are some interesting facts about Polish presidents?', '3. What are the challenges facing the current Polish president?']


4

# RAG

In [23]:
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(documents):
    """Join the text of retrieved documents into a single context string.

    Without this step the retriever's list of Document objects is inserted
    into the prompt as its Python repr (class name, escaped newlines and
    other fields) rather than as readable text.

    Args:
        documents: iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents separated by blank lines; an empty string when
        nothing was retrieved.
    """
    return "\n\n".join(document.page_content for document in documents)


# question -> (retrieved context as text, original question) -> prompt -> LLM -> str
final_rag_chain = (
    {
        "context": multi_qa_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [25]:
# Run the full RAG pipeline on the question defined above.
# NOTE(review): the recorded run of this cell ended with
# "ChatGooglePalmError: ChatResponse must have at least one candidate";
# the cause is not visible from this notebook - investigate before sharing.
final_rag_chain.invoke(question)

INFO:langchain.retrievers.multi_query:Generated queries: ['Sure, here are 3 different versions of the question "tell me about polish presidents":', '', '1. Who have been the presidents of Poland?', '2. What are some interesting facts about Polish presidents?', '3. What are the challenges facing the current Polish president?']


ChatGooglePalmError: ChatResponse must have at least one candidate.

In [None]:
# Clean-up: delete the Chroma collection created for this notebook.
# NOTE(review): presumably this prevents duplicate entries when the
# vector-store cell is re-run in the same session - confirm.
vectorstore.delete_collection()