# 1. Read PDF file

In [1]:
#pip install PyPDF

In [1]:
import os

# get current dir
dir = os.getcwd()

file_path = dir + '/fedbeigebook/BeigeBook_20230712.pdf'
date = file_path[-12:-4]

In [2]:
from langchain.document_loaders import PyPDFLoader

loader = PyPDFLoader(file_path)
pages = loader.load_and_split()

In [4]:
# concat all pages
number_of_pages = len(pages)
#all_pages = [pages[i].page_content for i in range(number_of_pages)]
all_pages = [pages[i].page_content for i in range(number_of_pages)]

#join all pages into one string
full_text = ' '.join(all_pages)

In [18]:
# clean text (for meeting minutes)
import re

#remobe \n but keep \n\n
#full_text = re.sub(r'(?<!\n)\n(?!\n)', ' ', full_text)
full_text = re.sub(r'\n', ' ', full_text)
full_text = re.sub(r'\s+', ' ', full_text)


# remove underscores 
full_text = re.sub(r'_', '', full_text)
full_text = re.sub(r'—', '', full_text)
full_text = re.sub(r'- ', '', full_text)
full_text = re.sub(r'-', '', full_text)

# remove text like "Minutes of the Meeting of June 13–14, 2023 Page 3" but the date and page number is different for each pdf
full_text = re.sub(r'Minutes of the Meeting of [A-Za-z]+ [0-9]+–[0-9]+, [0-9]+ Page [0-9]+', '', full_text)

In [None]:
# show full text in notebook with markdown
from IPython.display import Markdown
Markdown(full_text)

# Optional: OpenAI Functions Metadata Tagger

In [None]:
# pip install langchain --upgrade
# pip install 'langchain[all]'
#pip install langchain==0.0.231

In [None]:
# Github has been updated but the package not yet. Waiting for updating......

In [None]:
from langchain.schema import Document
from langchain.chat_models import ChatOpenAI
from langchain.document_transformers.openai_functions import create_metadata_tagger


schema = {
    "properties": {
        "movie_title": {"type": "string"},
        "critic": {"type": "string"},
        "tone": {"type": "string", "enum": ["positive", "negative"]},
        "rating": {
            "type": "integer",
            "description": "The number of stars the critic rated the movie",
        },
    },
    "required": ["movie_title", "critic", "tone"],
}

# Must be an OpenAI model that supports functions
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613")

document_transformer = create_metadata_tagger(metadata_schema=schema, llm=llm)

# 2. Chunking

### Split by tokens
Language models have a token limit. You should not exceed the token limit. When you split your text into chunks it is therefore a good idea to count the number of tokens. There are many tokenizers. When you count tokens in your text you should use the same tokenizer as used in the language model.

### Recursively split by character
This text splitter is the recommended one for generic text. It is parameterized by a list of characters. It tries to split on them in order until the chunks are small enough. The default list is ["\n\n", "\n", " ", ""]. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

How the text is split: by list of characters
How the chunk size is measured: by number of characters

In [7]:
from langchain.text_splitter import TokenTextSplitter

text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=50)

# method 1: create string list
#texts = text_splitter.split_text(full_text)

#method2: create document list
metadatas = [{"date": date, "category": "FED Beige Book", "author": "FED"}]
texts = text_splitter.create_documents([full_text],metadatas=metadatas)

# 3. Vector store

In [9]:
# pip install faiss-cpu

from langchain.vectorstores import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings

In [10]:
embeddings = OpenAIEmbeddings()
db = FAISS.from_documents(texts, embeddings)

In [None]:
db.save_local("faiss_index")

In [6]:
embeddings = OpenAIEmbeddings()
db = FAISS.load_local("faiss_index", embeddings)

In [11]:
db.docstore._dict

{'0ac86d4c-dab9-42cc-afc6-800ca7071c08': Document(page_content='The Beige Book Summary of Commentary on Current Economic Conditions By Federal Reserve District For use at 2:00 PM EDT Wednesday July 12, 2023 June 2023 Federal Reserve Districts Boston New York Philadelphia Cleveland Chicago Richmond Atlanta St. Louis Kansas City Dallas Minneapolis San Francisco The System serves commonwealths and territories as follows: the New York Bank serves the Commonwealth of Puerto Rico and the U.S. Virgin Islands; the San Francisco Bank serves American Samoa, Guam, and the Commonwealth of the Northern Mariana Islands. Alaska and Hawaii are part of the San Francisco District. This report was prepared at the Federal Reserve Bank of Minneapolis based on information collected on or before June 30, 2023. This document summarizes comments received from contacts outside the Federal Reserve System and is not a commentary on the views of Federal Reserve officials. National Summary 1 Boston A1 First Distric

In [17]:
import json
print(json.dumps(db.docstore._dict['fc257740-b305-4e09-a83a-775d6d3e21a6'].metadata, indent=2))

{
  "date": "20230614",
  "category": "FOMC Minutes",
  "author": "FOMC"
}


# 4. Retrival QA

In [12]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

In [19]:
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=db.as_retriever())
#qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=db.as_retriever(search_kwargs={"k": 4}))

query = "Compare the outlook of Prices in each district. Sumarize briefly and list by district."
qa.run(query)

' The outlook for prices across the districts was mostly stable or lower over the next several months. Boston: Prices were stable. New York: Inflationary pressures eased noticeably. Philadelphia: Prices were likely to remain stable. Cleveland: Prices were stable. Richmond: Prices were generally increasing. Atlanta: Prices were expected to remain stable or increase moderately. Chicago: Prices rose moderately and contacts expected a similar rate of increase over the next 12 months. St. Louis: Prices rose moderately. Kansas City: Prices rose moderately and were expected to continue rising. Dallas: Prices rose moderately and contacts expected a similar rate of increase over the next 12 months. Minneapolis: Prices rose moderately and contacts expected a similar rate of increase over the next 12 months. San Francisco: Prices rose moderately and contacts expected a similar rate of increase over the next 12 months.'

In [15]:
# show answer in notebook with markdown
from IPython.display import Markdown
Markdown(qa.run(query))



Chicago
• Economic activity was little changed.
• Employment increased moderately; nonbusiness contacts saw little change in activity; consumer spending was flat; business spending and construction and real estate activity declined slightly; and manufacturing decreased modestly.
• Prices and wages rose moderately, while financial conditions tightened slightly further.
• Expectations for farm incomes in 2023 decreased some.
• Contacts generally expected a small decline in demand over the next year and many expressed concerns about the potential for a recession.

In [10]:
# Custom Prompts
prompt_template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

{context}

Question: {question}
Answer:"""


PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [11]:
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="stuff", retriever=db.as_retriever(), chain_type_kwargs=chain_type_kwargs)

query = "what's the outlook for the economy?"
qa.run(query)

' Participants generally noted that real GDP growth had been resilient in recent quarters. They assessed that the cumulative tightening of monetary policy over the past year had contributed significantly to more restrictive financial conditions and lower demand in the most interest rate sensitive sectors of the economy, especially housing and business investment. With inflation well above the Committee’s longer run 2 percent objective, participants expected that a period of below trend growth in real GDP and some softening in labor market conditions would be needed to bring aggregate supply and aggregate demand into better balance and reduce inflationary pressures sufficiently to return inflation to 2 percent over time.'