In [1]:
# Environment setup
from dotenv import load_dotenv
import os
import warnings
# Must run before the next cell imports faiss and the other native libraries.
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime" abort
# seen when several packages each bundle their own copy — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Blanket-suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")
# Load settings from a .env file (if any) into the process environment;
# being the last expression, its return value is what the cell displays.
load_dotenv()

True

In [2]:
import faiss

from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS

from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough

from langchain_text_splitters import MarkdownHeaderTextSplitter

from langchain_ollama import ChatOllama, OllamaEmbeddings

from docling.document_converter import DocumentConverter

In [4]:
def load_and_convert_document(file_path):
    """Convert a document with Docling and return its content as Markdown.

    Args:
        file_path: Location of the document; handed unchanged to
            ``DocumentConverter.convert``.

    Returns:
        The value of ``export_to_markdown()`` called on the converted document.
    """
    conversion = DocumentConverter().convert(file_path)
    converted_document = conversion.document
    return converted_document.export_to_markdown()

# Document to analyse. The path has no directory component, so the file is
# expected next to the notebook / in the current working directory.
source = "goog-10-q-q3-2024.pdf"
# Convert once and keep the Markdown text; the splitting cell below reads it.
markdown_content = load_and_convert_document(source)

In [6]:

# print(markdown_content)

In [7]:
# Splitting markdown content into chunks
def get_markdown_splits(markdown_content):
    """Split Markdown text into sections at ``#``, ``##`` and ``###`` headers.

    The three header levels are labelled "Header 1" to "Header 3", and the
    splitter is created with ``strip_headers=False``.

    Args:
        markdown_content: Markdown text to split.

    Returns:
        Whatever ``MarkdownHeaderTextSplitter.split_text`` returns for the text.
    """
    header_levels = (1, 2, 3)
    header_rules = [("#" * level, f"Header {level}") for level in header_levels]
    splitter = MarkdownHeaderTextSplitter(header_rules, strip_headers=False)
    sections = splitter.split_text(markdown_content)
    return sections


# Header-delimited sections of the converted document; these are what gets embedded below.
chunks = get_markdown_splits(markdown_content)

In [8]:
# Embedding and vector store setup
def setup_vector_store(chunks, model_name='nomic-embed-text', base_url="http://localhost:11434"):
    """Embed ``chunks`` with an Ollama embedding model and index them in FAISS.

    Args:
        chunks: Documents to index; passed to ``FAISS.add_documents``.
        model_name: Name of the Ollama embedding model. Defaults to the model
            this notebook was written against.
        base_url: Address of the Ollama server. Defaults to a local instance.

    Returns:
        The ``FAISS`` vector store after ``chunks`` have been added.
    """
    embeddings = OllamaEmbeddings(model=model_name, base_url=base_url)

    # The flat index needs its dimensionality up front, so embed a throwaway
    # string and use the length of the resulting vector.
    probe_vector = embeddings.embed_query("this is some text data")
    index = faiss.IndexFlatL2(len(probe_vector))

    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={}
    )
    vector_store.add_documents(documents=chunks)
    return vector_store

In [9]:
# Embed every chunk and build the FAISS store (this is the step that calls the embedding model).
vector_store = setup_vector_store(chunks)
# Setup retriever
# search_type="mmr" selects maximal-marginal-relevance search; 'k': 3 asks for three documents per query.
retriever = vector_store.as_retriever(search_type="mmr", search_kwargs={'k': 3})

In [10]:
# Sanity check: the number of indexed vectors should equal the number of chunks.
vector_store.index.ntotal, len(chunks)

(168, 168)

In [12]:

# Smoke-test the retriever on a sample question before wiring it into the chain.
docs = retriever.invoke('what is revenue for september 2024?')
# docs

In [13]:
# Formatting documents for RAG
def format_docs(docs):
    """Concatenate the ``page_content`` of each document into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The contents in input order, separated by one blank line; an empty
        string when ``docs`` is empty.
    """
    separator = "\n\n"
    page_texts = (document.page_content for document in docs)
    return separator.join(page_texts)

# Join the sample query's retrieved documents into one text block, as the chain will do for its context.
content = format_docs(docs)

In [14]:

# Inspect the exact context text produced for the sample query.
print(content)

## Revenue Backlog  
As of  September 30, 2024, we had $86.8 billion of remaining performance obligations ('revenue backlog'), primarily related to Google Cloud. Our revenue backlog represents commitments in customer contracts for future services  that  have  not  yet  been  recognized  as  revenue.  The  estimated  revenue  backlog  and  timing  of  revenue recognition  for  these  commitments  is  largely  driven  by  our  ability  to  deliver  in  accordance  with  relevant  contract terms and when our customers utilize services. We expect to recognize approximately half of the revenue backlog as revenues over the next 24 months with the remainder to be recognized thereafter. Revenue backlog includes related deferred revenue currently recorded as well as amounts that will be invoiced in future periods, and excludes contracts with an original expected term of one year or less and cancellable contracts.

## Taxes  
As  of  September  30,  2024,  we  had  short-term  income  taxes  pay

In [22]:
# Setting up the RAG chain
def create_rag_chain(retriever, model_name="deepseek-r1:1.5b", base_url="http://localhost:11434"):
    """Build the retrieval-augmented question-answering chain.

    The chain takes a question string, uses ``retriever`` piped into
    ``format_docs`` to produce the context, fills the prompt template with
    both, sends it to an Ollama chat model and parses the reply to a string.

    Args:
        retriever: Runnable retriever used to fetch context for the question.
        model_name: Name of the Ollama chat model. Defaults to the model this
            notebook was written against.
        base_url: Address of the Ollama server. Defaults to a local instance.

    Returns:
        The composed runnable chain.
    """
    prompt = """
        You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
        If you don't know the answer, just say that you don't know.
        Answer in bullet points. Make sure your answer is relevant to the question and it is answered from the context only.
        ### Question: {question} 
        
        ### Context: {context} 
        
        ### Answer:
    """
    model = ChatOllama(model=model_name, base_url=base_url)
    prompt_template = ChatPromptTemplate.from_template(prompt)

    # The incoming question is fanned out: once through the retriever to build
    # the context, and once unchanged into the prompt's {question} slot.
    chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt_template
        | model
        | StrOutputParser()
    )
    return chain

In [23]:
# Build the question-answering chain on top of the retriever configured earlier.
rag_chain = create_rag_chain(retriever)


In [24]:
# Questions for retrieval
# Keep exactly one `question` assignment uncommented; the rest are ready-made alternatives.
# question = "How much revenue is there for Google in September 2024?"
# question = "What is the net income for this quarter, and what are the key drivers contributing to its increase or decrease?"
# question = "Has the company provided guidance for the next quarter or fiscal year? If so, what are the expected revenue and profit margins?"
question = "Which technology business segment contributed the most to the company's revenue, and what was the percentage growth in that segment?"
# question = "How has the stock market reacted to this earnings report, and were there any notable comments from the CEO or CFO about future performance?"
# question = "ask your own question"

print(f"Question: {question}")
# Stream the answer so each piece is printed as soon as the chain yields it.
# NOTE(review): the recorded output shows the model's <think> reasoning is printed
# as part of the answer — strip it if only the final answer is wanted.
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)
print("\n" + "-" * 50 + "\n")

Question: Which technology business segment contributed the most to the company's revenue, and what was the percentage growth in that segment?
<think>
Okay, I'm trying to figure out which technology business segment contributed the most to the company's revenue. The question is asking not just about what segment had the highest revenue but also by how much it grew over time.

Looking at the context provided, there are two main segments mentioned: Google Services and YouTube ads. There's no mention of "Nature of Operations," so I can skip that for now since the focus is on company revenue.

Starting with Google Services, their operating income in 2023 was $23.937 million, and in 2024 it increased to $30.856 million. That's a significant jump. The change from $23.937 to $30.856 is an increase of $6.919 million. To find the percentage growth, I can calculate ((30.856 - 23.937)/23.937) * 100, which would give me the percentage increase from Google Services in 2023 to 2024.

Next, looking a