In [None]:
import os 
import shutil
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
# from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.embeddings import OpenAIEmbeddings

# Ingestion Pipeline

In [2]:
DATA_PATH = "data"

def load_documents():
    """Load PDF documents from the specified directory."""
    loader = DirectoryLoader(DATA_PATH, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
text_splitter = RecursiveCharacterTextSplitter(
	chunk_size=1000,
	chunk_overlap=500,
	length_function=len,
	add_start_index=True
)

documents = load_documents()
chunks = text_splitter.split_documents(documents)

print(f"Number of documents loaded: {len(documents)}")
print(f"Number of chunks created: {len(chunks)}")

Number of documents loaded: 85
Number of chunks created: 533


In [4]:
doc = chunks[:5]
for i in range(0, len(doc)):
    print(f"Chunk {i}:")
    print(f"Chunk text: {doc[i].page_content}...") 

Chunk 0:
Chunk text: 1 
Dear shareholders, colleagues, customers, and partners,  
We are living through a time of historic challenge and opportunity. As I write this, the world faces ongoing economic, social, 
and geopolitical volatility. At the same time, we have entered a new age of AI that will fundamentally transform productivity  
for every individual, organization, and industry on earth, and help us address some of our most pressing challenges.  
This next generation of AI will reshape every software category and every business, including our own. Forty -eight years 
after its founding, Microsoft remains a consequential company because time and time again —from PC/Server, to 
Web/Internet, to Cloud/Mobile—we have adapted to technological paradigm shifts. Today, we are doing so once again, as 
we lead this new era.  
Amid this transformation, our mission to empower every person and every organization on the planet to achieve...
Chunk 1:
Chunk text: This next generation of AI will 

In [None]:
def save_vector_store():
    """Save the vector store to the specified directory."""
    CHROMA_PATH = "chroma"

    # Clear out the database first
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)
    
    # embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
    from dotenv import load_dotenv
    load_dotenv()
    openai_api_type = os.getenv("OPENAI_API_TYPE", "open_ai")
    embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_type=openai_api_type)
    Chroma.from_documents(
        chunks, embeddings, persist_directory=CHROMA_PATH
    )
    print(f"Vector store saved to {CHROMA_PATH}")

save_vector_store()

  embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_type=openai_api_type)


Vector store saved to chroma


# Inference Pipeline

In [2]:
# embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
from dotenv import load_dotenv
load_dotenv()
openai_api_type = os.getenv("OPENAI_API_TYPE", "open_ai")
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_type=openai_api_type)
CHROMA_PATH = "chroma"

db = Chroma(
    persist_directory=CHROMA_PATH,
    embedding_function=embeddings
)

  embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_type=openai_api_type)
  db = Chroma(


In [None]:
# Search the DB
results = db.similarity_search("What is the main idea of the document?", k=3) 
if len(results) == 0:
	print(f"Unable to find matching results.")
results

[Document(metadata={'creationdate': '2025-07-06T09:33:19+08:00', 'creator': 'Microsoft® Word for Microsoft 365', 'page_label': '6', 'total_pages': 85, 'start_index': 3019, 'producer': 'Microsoft® Word for Microsoft 365', 'moddate': '2025-07-06T09:33:19+08:00', 'source': 'data\\2023_Annual_Report.pdf', 'page': 5}, page_content='meaningful choices about how their data is used.  \nHow can we protect fundamental rights?  \nIn an increasingly digital world, we have a responsibility to promote and protect people’s fundamental rights and address \nthe challenges technology creates. For us, this means upholding responsible business practices, expanding connectivity \nand accessibility, advancing fair and inclusive societies, and empowering communities.  \nIn 2023, we worked diligently to anticipate harmful uses of our technology and put guardrails on the use of technologies that \nare consequential to people’s lives or legal status, create risk of harm, or threaten human rights. We will contin

In [4]:
context = "\n\n---\n\n".join([result.page_content for result in results])
print(f"Context: {context}...")  # Print first 500 characters of the context

Context: meaningful choices about how their data is used.  
How can we protect fundamental rights?  
In an increasingly digital world, we have a responsibility to promote and protect people’s fundamental rights and address 
the challenges technology creates. For us, this means upholding responsible business practices, expanding connectivity 
and accessibility, advancing fair and inclusive societies, and empowering communities.  
In 2023, we worked diligently to anticipate harmful uses of our technology and put guardrails on the use of technologies that 
are consequential to people’s lives or legal status, create risk of harm, or threaten human rights. We will continue to assess 
the impact of our technologies, engage our stakeholders, and model and adopt responsible practices and respect for human 
rights—including across our global supply chain.  
Today, our lives are more connected than ever. Access to education, employment, healthcare, and other critical services is

---

As we purs

In [None]:
PROMPT_TEMPLATE = """
Answer the question based only on the following context:

{context}

---

Answer the question based on the above context: {query}
"""

In [6]:
from langchain.prompts import ChatPromptTemplate

In [None]:
question = "summarize the documents?"
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context, query=question)
print(prompt)

Human: 
Answer the question based only on the following context:

meaningful choices about how their data is used.  
How can we protect fundamental rights?  
In an increasingly digital world, we have a responsibility to promote and protect people’s fundamental rights and address 
the challenges technology creates. For us, this means upholding responsible business practices, expanding connectivity 
and accessibility, advancing fair and inclusive societies, and empowering communities.  
In 2023, we worked diligently to anticipate harmful uses of our technology and put guardrails on the use of technologies that 
are consequential to people’s lives or legal status, create risk of harm, or threaten human rights. We will continue to assess 
the impact of our technologies, engage our stakeholders, and model and adopt responsible practices and respect for human 
rights—including across our global supply chain.  
Today, our lives are more connected than ever. Access to education, employment, he

In [8]:
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)
response_text = model.predict(prompt)
print(f"Response: {response_text}")

  response_text = model.predict(prompt)


Response: The provided context does not mention anything about people not being able to be together. The focus of the text is on promoting and protecting fundamental rights in an increasingly digital world, addressing challenges created by technology, protecting customers' privacy, and expanding opportunities for economic growth and access to technology. The context does not provide any information relevant to why people cannot be together.
