# 1. Loading, Splitting

- Load data from PDF
- Split into chunks
    - Methods: by page, RecursiveCharacterTextSplitter, CharacterTextSplitter
    - Note: clean the text first if needed

In [1]:
#pip install PyPDF

In [9]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter

In [2]:
# Locate the Beige Book PDF relative to the current working directory.
# NOTE: `dir` shadows the built-in of the same name; the name is kept so that
# any existing reference to it keeps working.
dir = os.getcwd()
file_path = os.path.join(dir, 'fedbeigebook', 'BeigeBook_20230712.pdf')

# The file name has the form "<name>_<YYYYMMDD>.pdf". Take the date from the
# part of the stem after the last underscore instead of slicing fixed
# character offsets, so a different extension length cannot shift the result.
date = os.path.splitext(os.path.basename(file_path))[0].rsplit('_', 1)[-1]

In [3]:
# Text splitters. Both use the same sizing: chunks of at most 1000 characters
# (length_function=len) with up to 200 characters of overlap between
# neighbouring chunks.

# Recursive splitter, using its default separators.
text_splitter1 = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, length_function=len)

# Plain character splitter that breaks on newlines only.
text_splitter2 = CharacterTextSplitter(separator="\n", chunk_size=1000, chunk_overlap=200, length_function=len)

# Report-level metadata attached to the documents built from this PDF.
metadata_dic = dict(date=date, category="FED Beige Book", author="FED")

In [4]:
# Load the PDF; the loader returns one Document per page.
loader = PyPDFLoader(file_path)
pages = loader.load()

# Replace each page's metadata with the report-level metadata.
# Every page gets its own copy: assigning `metadata_dic` itself would make all
# pages share a single dict object, so editing one page's metadata would
# silently change every other page (and `metadata_dic`) as well.
for page in pages:
    page.metadata = dict(metadata_dic)

In [5]:
# Load the PDF again and split it with text_splitter2, the newline-based
# CharacterTextSplitter. Calling load_and_split() without an argument would
# use a RecursiveCharacterTextSplitter instead.
chunks = loader.load_and_split(
    text_splitter=text_splitter2,
)

In [6]:
# Re-split the report as one continuous text and attach metadata.

## Concatenate the text of every page, separated by a single space.
number_of_pages = len(pages)
full_text = ' '.join(doc.page_content for doc in pages)

## One metadata dict for the one input text; create_documents pairs them up.
metadatas = [dict(date=date, category="FED Beige Book", author="FED")]
texts = text_splitter1.create_documents(
    [full_text],
    metadatas=metadatas,
)

In [7]:
# Compare the number of per-page documents with the number of split chunks.
len(pages), len(chunks)

(32, 155)

In [8]:
# Inspect the first page to check its content and the metadata set above.
pages[0]

Document(page_content='The  Beige  Book \nSummary of Commentary on Current Economic Conditions  \nBy Federal Reserve District  For use at 2:00 PM EDT  \nWednesday  \nJuly 12, 2023  \nJune 2023  ', metadata={'date': '20230712', 'category': 'FED Beige Book', 'author': 'FED'})

# 2. Storage: Vector DB

In [22]:
# pip install faiss-cpu

# upgrade langchain
# pip install --upgrade langchain
# pip install langchain==0.0.240

In [1]:
from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

In [4]:
# Embedding model used to build the FAISS index and to load it again.
# NOTE(review): presumably picks up the OpenAI API key from the environment — confirm.
embeddings = OpenAIEmbeddings()

In [2]:
# Embed the per-page documents (`pages`, not the smaller chunks) and build a
# FAISS index from them.
db = FAISS.from_documents(documents=pages, embedding=embeddings)

# Persist the index to the local folder "faiss_beigebook".
db.save_local(folder_path="faiss_beigebook")

In [5]:
# Reload the saved index, using the same embedding model it was built with.
# NOTE(review): the saved docstore appears to be a pickle file — only load
# index folders you created yourself; confirm against the FAISS store docs.
db = FAISS.load_local(
    folder_path="faiss_beigebook",
    embeddings=embeddings,
)

In [6]:
# Peek at the stored documents (a mapping of id -> Document).
# NOTE: `_dict` is a private attribute of the docstore, and this displays
# every stored document, so the output is long.
db.docstore._dict

{'2e051a97-c277-4cc4-a44a-c43f0e219395': Document(page_content='The  Beige  Book \nSummary of Commentary on Current Economic Conditions  \nBy Federal Reserve District  For use at 2:00 PM EDT  \nWednesday  \nJuly 12, 2023  \nJune 2023  ', metadata={'date': '20230712', 'category': 'FED Beige Book', 'author': 'FED'}),
 '12cd83b2-2e6d-48d8-bdfc-f61033efba66': Document(page_content='Federal Reserve Districts  \nBoston  \nNew York  \nPhiladelphia  Cleveland  Chicago  \nRichmond  \nAtlanta  St. Louis  Kansas City  \nDallas  Minneapolis  \nSan Francisco  \nThe System serves commonwealths and territories as follows: the New York Bank serves the \nCommonwealth of Puerto Rico and the U.S. Virgin  Islands; the San Francisco Bank serves \nAmerican Samoa, Guam, and the Commonwealth of the Northern Mariana Islands.  \nAlaska and Hawaii  \nare part of the  \n San Francisco District.  \nThis report was prepared at the Federal Reserve Bank of Minneapolis based on information collected \non or before Jun

# 3. Retrieval QA

In [7]:
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

## 3.1 Retrieval

Retrievers built on the vector DB:
- Vector store-backed retriever (lightweight)
- Time-weighted vector store retriever
- MultiQueryRetriever
- Self-querying retriever
- Contextual compression retriever

Others: KNN and SVM retrievers, which work on a plain list of texts

In [7]:
# Four retrievers over the same vector store, differing only in search settings.

# Default settings: the vector store retriever uses similarity search.
retriever1 = db.as_retriever()

# Maximum marginal relevance search; usable when the underlying vector store
# supports it.
retriever2 = db.as_retriever(search_type="mmr")

# Similarity search with k=5.
retriever3 = db.as_retriever(search_kwargs=dict(k=5))

# Similarity search filtered by a score threshold of 0.5.
retriever4 = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=dict(score_threshold=.5),
)

In [12]:
# Question used by the retrieval and QA examples that follow.
query = "What is the current economic condition in New York?"

In [10]:
# Fetch the documents for the query using the MMR retriever.
docs = retriever2.get_relevant_documents(query)

In [11]:
# Display the retrieved documents.
docs

[Document(page_content='B-1Federal Reserve Bank of New York  \nSummary of Economic Activity  \nEconomic activity in the Second District stabilized in recent weeks following a period of moderate weakness. Labor \nmarket conditions were strong, with ongoing modest employment gains and steady wage growth. Inflationary pressures \neased as both input and selling price increases slowed noticeably. Supply availability continued to improve, particularly \nfor manufacturers, and manufacturing activity edged slightly higher. Consumer spending grew steadily and tourism in \nNew York City remained strong. While housing markets were solid, exceptionally low inventory remained a challenge \nand there were some signs of a pullback in demand in parts of the District. Commercial real estate markets remained \nmostly unchanged, with persistently high office vacancies. Conditions in the broad finance sector continued to deterio-\nrate, though at a more subdued pace than in recent months. Regional banks 

## 3.2 Combine Docs Chain

- Stuff: 
    - Inserts a list of documents all into a prompt. 
    - Well-suited for applications where documents are small and only a few are passed in for most calls.
- Refine: 
    - Loops over the input documents and iteratively updates its answer. 
    - Well-suited for tasks that require analyzing more documents than can fit in the model's context. 
    - Not good for tasks that are difficult to accomplish iteratively. 
    - Will make far more LLM calls than, for example, the Stuff chain.
- Map reduce: 
    - First applies an LLM chain to each document individually (the Map step). If the mapped outputs are too long to combine at once, they are compressed first; this compression step is performed recursively if necessary.
    - Then passes all the new documents to a separate combine-documents chain to get a single output (the Reduce step).
- Map re-rank:
    - runs an initial prompt on each document
    - gives a score for how certain it is in its answer. The highest scoring response is returned.

## 3.3 Documents QA Chain: load_qa_chain

### Basic Method: stuff

In [15]:
# "stuff" chain: all retrieved documents are inserted into a single prompt.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
)
# run() returns only the answer text.
chain.run(input_documents=docs, question=query)

' Labor market conditions were strong, with ongoing modest employment gains and steady wage growth. Inflationary pressures eased as both input and selling price increases slowed noticeably. Supply availability continued to improve, particularly for manufacturers, and manufacturing activity edged slightly higher. Consumer spending grew steadily and tourism in New York City remained strong. While housing markets were solid, exceptionally low inventory remained a challenge and there were some signs of a pullback in demand in parts of the District.'

In [17]:
# Calling the chain directly returns a dict instead of a bare string; with
# return_only_outputs=False the inputs are included next to the answer.
chain(
    dict(input_documents=docs, question=query),
    return_only_outputs=False,
)

{'input_documents': [Document(page_content='B-1Federal Reserve Bank of New York  \nSummary of Economic Activity  \nEconomic activity in the Second District stabilized in recent weeks following a period of moderate weakness. Labor \nmarket conditions were strong, with ongoing modest employment gains and steady wage growth. Inflationary pressures \neased as both input and selling price increases slowed noticeably. Supply availability continued to improve, particularly \nfor manufacturers, and manufacturing activity edged slightly higher. Consumer spending grew steadily and tourism in \nNew York City remained strong. While housing markets were solid, exceptionally low inventory remained a challenge \nand there were some signs of a pullback in demand in parts of the District. Commercial real estate markets remained \nmostly unchanged, with persistently high office vacancies. Conditions in the broad finance sector continued to deterio-\nrate, though at a more subdued pace than in recent mon

### Methods: map_reduce, refine & map_rerank

In [None]:
# map_reduce: answer per document, then combine. return_map_steps=True asks
# the chain to also return the per-document (map) results.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="map_reduce",
    return_map_steps=True,
)
chain(dict(input_documents=docs, question=query), return_only_outputs=True)

In [None]:
# refine: build the answer iteratively over the documents.
# return_refine_steps=True asks the chain to also return the intermediate answers.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="refine",
    return_refine_steps=True,
)
chain(dict(input_documents=docs, question=query), return_only_outputs=True)

In [None]:
# map_rerank: answer per document with a score; return_intermediate_steps=True
# asks the chain to also return the per-document results. The result is stored
# in `results` rather than displayed.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="map_rerank",
    return_intermediate_steps=True,
)
results = chain(dict(input_documents=docs, question=query), return_only_outputs=True)

### Custom Prompts

In [None]:
## Stuff
# Custom prompt for the "stuff" chain. The template exposes the two
# placeholders the chain fills in: {context} (the documents) and {question}.

prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer in Chinese:"""
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

# Build the chain and run it once; every call sends a request to the LLM, so
# the chain is not rebuilt or re-invoked here.
chain = load_qa_chain(OpenAI(temperature=0), chain_type="stuff", prompt=PROMPT)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

In [None]:
## map reduce
# Two custom prompts: QUESTION_PROMPT is applied to each document (map step),
# COMBINE_PROMPT merges the per-document results (reduce step).

question_prompt_template = """Use the following portion of a long document to see if any of the text is relevant to answer the question. 
Return any relevant text translated into italian.
{context}
Question: {question}
Relevant text, if any, in Italian:"""
QUESTION_PROMPT = PromptTemplate(template=question_prompt_template, input_variables=["context", "question"])

combine_prompt_template = """Given the following extracted parts of a long document and a question, create a final answer italian. 
If you don't know the answer, just say that you don't know. Don't try to make up an answer.

QUESTION: {question}
=========
{summaries}
=========
Answer in Italian:"""
COMBINE_PROMPT = PromptTemplate(template=combine_prompt_template, input_variables=["summaries", "question"])

# Build the map_reduce chain with both prompts and run it on the retrieved documents.
chain = load_qa_chain(
    OpenAI(temperature=0),
    chain_type="map_reduce",
    return_map_steps=True,
    question_prompt=QUESTION_PROMPT,
    combine_prompt=COMBINE_PROMPT,
)
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

## 3.4 Retrieval QA

In [25]:
# Method 1: let RetrievalQA build the combine-documents chain from its type
# name, paired with an MMR retriever over the vector store.
retrieval_qa_chain = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
    retriever=db.as_retriever(search_type="mmr"),
)
retrieval_qa_chain.run(query)

' Economic activity in the Second District stabilized in recent weeks following a period of moderate weakness. Labor market conditions were strong, with ongoing modest employment gains and steady wage growth. Inflationary pressures eased as both input and selling price increases slowed noticeably. Supply availability continued to improve, particularly for manufacturers, and manufacturing activity edged slightly higher. Consumer spending grew steadily and tourism in New York City remained strong. While housing markets were solid, exceptionally low inventory remained a challenge and there were some signs of a pullback in demand in parts of the District. Commercial real estate markets remained mostly unchanged, with persistently high office vacancies. Conditions in the broad finance sector continued to deteriorate, though at a more subdued pace than in recent months. Regional banks reported ongoing declines in loan demand, tighter credit conditions, and narrowing loan spreads. Looking ahe

In [24]:
# Method 2: build the combine-documents chain with load_qa_chain (more
# flexibility) and hand it to RetrievalQA together with a retriever.
qa_chain = load_qa_chain(
    OpenAI(temperature=0),
    chain_type="stuff",
)
qa = RetrievalQA(
    combine_documents_chain=qa_chain,
    retriever=db.as_retriever(),
)
# Display the prompt template the stuff chain uses.
qa_chain.llm_chain.prompt.template

"Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"

# Use Case 1: Summarization for a single document (can be replaced by QA chain)

In [14]:
from langchain import OpenAI, PromptTemplate, LLMChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains.mapreduce import MapReduceChain
from langchain.prompts import PromptTemplate

from langchain.chains.summarize import load_summarize_chain

In [10]:
# Path of the central bank speech PDF, relative to the current working directory.
dir = os.getcwd()
file_path = f"{dir}/central_bank_speech_PDF/r230717a.pdf"

# Load the PDF.
# NOTE: this rebinds `loader` and `docs`; `docs` previously held the documents
# retrieved for the Beige Book query.
loader = PyPDFLoader(file_path)
docs = loader.load()

In [19]:
# LLM plus a map_reduce summarization chain; return_map_steps=True asks the
# chain to also return the per-document (map) results.
llm = OpenAI(temperature=0)
summarize_chain = load_summarize_chain(
    llm=llm,
    chain_type="map_reduce",
    return_map_steps=True,
)

In [22]:
# summarize_chain.run({"input_documents": docs}, return_only_outputs=True)

In [None]:
# for loop on date

# Use Case 2: Tag each chunk

# Use Case 3: Chatbot

## Basic: Conversation buffer memory

In [17]:
from langchain.memory import ConversationBufferMemory
from langchain.llms import OpenAI
from langchain.chains import ConversationChain

In [18]:
# Conversation memory, kept in a variable so the stored history can be
# inspected after talking to the chain.
memory = ConversationBufferMemory()
llm = OpenAI(temperature=0)

# Pass the `memory` object created above. Constructing a second
# ConversationBufferMemory() here would leave `memory` empty and unused.
conversation = ConversationChain(
    llm=llm,
    verbose=True,
    memory=memory,
)

In [None]:
# Send a first message to the conversation chain.
conversation.predict(input="Hi there!")

## Adding Message Memory backed by a database to an Agent

# Use Case 4: Agent