In [12]:
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path
import bs4
from langchain_core.prompts import ChatPromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough, RunnableParallel, RunnableLambda
import json

In [6]:
#### INDEXING ####

# Load Documents
# The path is relative, so it is resolved against the kernel's current working
# directory rather than against the notebook file itself.
data_path = Path("../data")
file_path = data_path / "apple-10-k.pdf"

# Fail fast with the fully resolved location: a wrong working directory is the
# most likely reason for this file to be missing, and the resolved path makes
# that obvious.
if not file_path.is_file():
    raise FileNotFoundError(f"Expected the 10-K PDF at {file_path.resolve()}")

# Hand the loader a plain string: some langchain_community releases only accept
# `str` for this argument, while `str(...)` works on every release.
loader = PyPDFLoader(str(file_path))
docs = loader.load()

In [7]:
# Split
# Chunking parameters, named so they are easy to find and tune.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
splits = text_splitter.split_documents(docs)

# Embed
# OpenAIEmbeddings is built with no arguments, i.e. library defaults.
# NOTE(review): presumably the OpenAI credentials come from the environment; confirm.
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever with the vector store's default search settings (no arguments given).
retriever = vectorstore.as_retriever()

#### RETRIEVAL and GENERATION ####

# Prompt
# `{question}` and `{context}` are the two placeholders filled in by the chain.
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
# temperature=0 is requested for this chat model.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# Post-processing
def format_docs(docs):
    """Concatenate the text of the given documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values, in iteration order, separated by a blank
        line (``"\\n\\n"``). An empty iterable yields an empty string.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain
# The mapping feeds the prompt's two placeholders from the single input string:
# "context" sends it through the retriever and then `format_docs`, while
# "question" forwards it unchanged.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = rag_inputs | prompt | llm | StrOutputParser()

# Question
# All four metrics are requested in one prompt, so one retrieval has to serve
# every item in the list.
multi_metric_question = """Extract the following financial figures for the fiscal year 2024 from the document:
1.  From the 'Consolidated Balance Sheets', find the value for 'Total Assets'.
2.  From the 'Consolidated Balance Sheets', find the value for 'Total Liabilities'.
3.  From the 'Consolidated Statements of Operations', find the value for 'Net Sales' (or Total Revenue).
4.  From the 'Consolidated Statements of Operations', find the value for 'Net Income'.
Please provide the exact values as reported."""

res_msg = rag_chain.invoke(multi_metric_question)
print(res_msg)

1. Total Assets for the fiscal year 2024: Not provided.
2. Total Liabilities for the fiscal year 2024: Not provided.
3. Net Sales for the fiscal year 2024: Not provided.
4. Net Income for the fiscal year 2024: Not provided.


In [13]:

# Prompt
# Same wording as the earlier prompt in this notebook, except that the third
# instruction line additionally asks the model to "extract only the value".
template = """
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise and extract only the value.
Question: {question} 
Context: {context} 
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

# LLM
# Re-created with the same model name and temperature as the earlier cell.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
)

# Post-processing
def format_docs(docs):
    """Build the prompt context by joining each document's ``page_content``.

    This re-defines, with identical behaviour, the ``format_docs`` helper
    declared earlier in the notebook.

    Args:
        docs: Iterable of objects that have a ``page_content`` string attribute.

    Returns:
        A single string with the page contents in order, separated by ``"\\n\\n"``.
    """
    return "\n\n".join(map(lambda doc: doc.page_content, docs))

# Base RAG chain
# Takes a single question string: "context" routes it through the retriever and
# `format_docs`, "question" forwards it unchanged; both feed the prompt.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# Define the questions for each metric
# Keys double as the field names of the final JSON output.
questions = {
    "total_assets": "What is the value for 'Total Assets' for the fiscal year 2024 from the 'Consolidated Balance Sheets'?",
    "total_liabilities": "What is the value for 'Total Liabilities' for the fiscal year 2024 from the 'Consolidated Balance Sheets'?",
    "net_sales": "What is the value for 'Net Sales' for the fiscal year 2024 from the 'Consolidated Statements of Operations'?",
    "net_income": "What is the value for 'Net Income' for the fiscal year 2024 from the 'Consolidated Statements of Operations'?"
}


def _question_selector(metric):
    """Return a runnable that picks ``metric``'s question out of the input dict.

    A factory function is used so that every lambda closes over its own
    ``metric`` value; a lambda written directly inside the comprehension below
    would late-bind the loop variable and every branch would read the last key.
    """
    return RunnableLambda(lambda inputs: inputs[metric])


# Define the parallel chains to run for each question
# One branch per entry in `questions`, generated from its keys so that adding or
# renaming a metric only requires editing the dictionary above.
map_chain = RunnableParallel(
    **{metric: _question_selector(metric) | rag_chain for metric in questions}
)

# Invoke the parallel chains with the questions dictionary
# Each branch receives the whole dictionary and selects its own question from it.
result = map_chain.invoke(questions)


# Format and print the JSON output
json_output = json.dumps(result, indent=4)
print(json_output)

{
    "total_assets": "The value for 'Total Assets' for the fiscal year 2024 from the 'Consolidated Balance Sheets' is $364,980 million.",
    "total_liabilities": "The value for 'Total Liabilities' for the fiscal year 2024 is $308,030.",
    "net_sales": "The value for 'Net Sales' for the fiscal year 2024 from the 'Consolidated Statements of Operations' is $391,035 million.",
    "net_income": "The value for 'Net Income' for the fiscal year 2024 from the 'Consolidated Statements of Operations' is $123,216 million."
}
