In [30]:
from langchain_community.document_loaders import PyPDFLoader

# Location of the source PDF, relative to the notebook's working directory.
file_path = "./palantir_q4.pdf"

# Read the PDF into a list of LangChain documents.
loader = PyPDFLoader(file_path)
docs = loader.load()

# Sanity check: report how many documents the loader produced.
page_count = len(docs)
print(page_count)

9


In [31]:
import getpass
import os

# Take the Groq API key from the environment; only when it is not already set,
# prompt for it interactively (input is not echoed). The key must never be
# written into the notebook source: a key that was committed in this cell
# previously has to be treated as leaked and revoked in the Groq console.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

from langchain_groq import ChatGroq

# Chat model used to generate answers. No key is passed explicitly, so the
# client is expected to pick it up from GROQ_API_KEY set above.
llm = ChatGroq(model="llama3-8b-8192")

In [32]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of at most 5000 characters with no
# overlap between neighbouring chunks.
# NOTE(review): chunks this large may exceed the maximum input length of the
# default HuggingFaceEmbeddings model, in which case only the start of each
# chunk would be embedded -- confirm, and consider a smaller chunk_size.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=5000,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(docs)

# Embed every chunk with the default HuggingFace embedding model and index the
# result in a Chroma vector store.
embedding_model = HuggingFaceEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Retriever view of the store (default search settings) for use in the chain.
retriever = vectorstore.as_retriever()

In [40]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System message: the analyst persona and extraction focus, a blank line, and
# then the `{context}` placeholder that is filled in when the prompt is
# rendered.
system_prompt = "\n\n".join(
    [
        "You are a financial analysis AI assistant specializing in extracting key financial metrics from corporate earnings reports, investor presentations, and financial statements. Your task is to extract and summarize only the most relevant numerical data that investors care about. Focus on revenue, profit, cash flow, margins, shareholder returns, customer growth, and balance sheet strength. ",
        "{context}",
    ]
)

# Two-message chat prompt: the system instructions above, then the user's
# question supplied through the `{input}` placeholder.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("human", "{input}")]
)


# Chain that passes the retrieved documents plus the prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

# Full RAG pipeline: retrieve documents for the input, then answer from them.
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

# Question put to the pipeline.
question = "What is the net income for 2024 and what only for the quarter? Round the numbers to the nearest million."
results = rag_chain.invoke({"input": question})

# Display the whole result mapping (the input, the retrieved context, and the
# generated answer).
results

{'input': 'What is the net income for 2024 and what only for the quarter? Round the numbers to the nearest million.',
 'context': [Document(id='58634c3e-73cf-4b25-a574-5b4e76d4d655', metadata={'creationdate': '2025-03-08T11:47:47+00:00', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36', 'moddate': '2025-03-08T11:47:47+00:00', 'page': 0, 'page_label': '1', 'producer': 'Skia/PDF m133', 'source': './palantir_q4.pdf', 'title': 'Palantir IR - News', 'total_pages': 9}, page_content='GAAP net income of $79 million, representing a 10% margin\n$165 million of net income when excluding one-time SAR-related expenses, representing a 20%\nmargin\nGAAP income from operations of $11 million, representing a 1% margin\n$142 million of income from operations when excluding one-time SAR-related expenses,\nrepresenting a 17% margin\nAdjusted income from operations of $373 million, representing a 45% margin\nRule of 40 score 

In [29]:
# Show only the generated answer from the chain result.
# NOTE(review): this cell's saved execution count (29) is lower than that of
# the cell that assigns `results` (40), so the stored output may come from an
# earlier run -- restart the kernel and run all cells to refresh it.
results["answer"]

"<think>\nOkay, so the user has provided a bunch of financial data and asked me to extract the key metrics. Let me start by reading through the data carefully. I see there are sections related to cash flow, income, and other financial statements. \n\nFirst, I notice the net cash provided by operating activities is $1,153,865 for 2024 and $712,183 for 2023. That's a significant increase, so that's definitely important for cash flow. Then, under investing activities, there are purchases and proceeds from marketable securities. The net investing activities show a use of funds, which might indicate investments in the business or marketable securities.\n\nLooking at the adjusted EBITDA, it's $1,159,649 for 2024 and $666,130 for 2023. The margin improved from 30% to 40%, which is a positive sign. The net income is also up, $462,190 from $209,825, showing growth in profitability. \n\nI also see the diluted EPS increased, which is good for shareholders. The stock-based compensation is a big pa