# Overview of Embeddings-Based Retrieval

In [1]:
from pypdf import PdfReader

# Open the annual report PDF
reader = PdfReader("microsoft_annual_report_2022.pdf")

# Collect the whitespace-trimmed text of every page, skipping pages whose
# extracted text is empty after trimming
pdf_texts = []
for page in reader.pages:
    page_text = page.extract_text().strip()
    if page_text:
        pdf_texts.append(page_text)

Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 8 0 (offset 0)
Ignoring wrong pointing object 20 0 (offset 0)
Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 43 0 (offset 0)
Ignoring wrong pointing object 49 0 (offset 0)
Ignoring wrong pointing object 51 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 55 0 (offset 0)
Ignoring wrong pointing object 57 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)
Ignoring wrong pointing object 162 0 (offset 0)
Ignoring wrong pointing object 229 0 (offset 0)
Ignoring wrong pointing object 231 0 (offset 0)
Ignoring wrong pointing object 252 0 (offset 0)
Ignoring wrong pointing object 257 0 (offset 0)
Ignoring wrong pointing object 294 0 (offset 0)
Ignoring wrong pointing object 299 0 (offset 0)
Ignoring wrong pointing object 319 0 (offset 0)
Ignoring wrong pointing object 331 0 (offset 0)
Ignoring wrong pointing object 336 0 (offset 0)
Ignor

In [2]:
# Split the text into chunks using a recursive character splitter 
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

# Splitter configured with a chunk size of 1000, no overlap, and separators
# listed from coarsest (blank line) to finest (empty string)
character_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
    separators=["\n\n", "\n", ". ", " ", ""],
)

# Stitch the pages back into one document (blank line between pages) and split it
full_text = '\n\n'.join(pdf_texts)
character_split_texts = character_splitter.split_text(full_text)

# Show one sample chunk and the resulting chunk count
print(character_split_texts[10])
print(f"\nTotal chunks: {len(character_split_texts)}")

. And we’ve expanded our Technology Education and Learning Support (TEALS) program to 290 high schools in cities with large Black and African American communities—to promote more equitable access to computer science education.  Our work to help preserve, protect, and advance democracy by promoting a healthy information ecosystem and safeguarding electoral processes is as salient as ever in today’s geopolitical climate. Our AccountGuard nation-state threat notification service protects more than 4 million accounts of election officials, human rights organizations, journalists, and other organizations. Our efforts to preserve and protect journalism in the United States and Mexico have been extended globally through new partnerships with the Thomson Reuters Foundation, Report for the World, and others.  This year, we responded to six humanitarian emergencies in five countries through donations, technology, services, and employee giving

Total chunks: 445


In [3]:
# The embedding model has a maximum context window of 256 tokens,
# so every chunk is capped at 256 tokens before it is embedded
token_splitter = SentenceTransformersTokenTextSplitter(tokens_per_chunk=256, chunk_overlap=0)

# Re-split each character-based chunk by token count and flatten the pieces
# into a single list
token_split_texts = []
for chunk in character_split_texts:
    token_split_texts.extend(token_splitter.split_text(chunk))

# Show one sample chunk and the resulting chunk count
print(token_split_texts[10])
print(f"\nTotal chunks: {len(token_split_texts)}")

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

. and we ’ ve expanded our technology education and learning support ( teals ) program to 290 high schools in cities with large black and african american communities — to promote more equitable access to computer science education. our work to help preserve, protect, and advance democracy by promoting a healthy information ecosystem and safeguarding electoral processes is as salient as ever in today ’ s geopolitical climate. our accountguard nation - state threat notification service protects more than 4 million accounts of election officials, human rights organizations, journalists, and other organizations. our efforts to preserve and protect journalism in the united states and mexico have been extended globally through new partnerships with the thomson reuters foundation, report for the world, and others. this year, we responded to six humanitarian emergencies in five countries through donations, technology, services, and employee giving

Total chunks: 451


In [4]:
# Embed the text chunks with a SentenceTransformer model
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

# Build the embedding function with its default settings
embedding_function = SentenceTransformerEmbeddingFunction()

# Embed a single sample chunk (passed as a one-element list) and display the result
sample_embedding = embedding_function([token_split_texts[10]])
print(sample_embedding)

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

[array([-3.65974940e-02,  1.26036229e-02,  9.42810532e-03,  3.96896638e-02,
        1.03119679e-01,  4.24123593e-02, -4.41927975e-03, -5.10303900e-02,
        2.76814792e-02,  5.64509109e-02, -2.94472966e-02,  3.98839936e-02,
        8.26499313e-02,  4.40076739e-02, -2.43856739e-02,  6.25482341e-03,
       -2.85459645e-02, -7.32271001e-02, -8.69664624e-02, -5.18782251e-02,
       -5.88315260e-03,  6.47969311e-03,  4.41957191e-02,  4.11980152e-02,
       -2.37075314e-02,  5.25300484e-03, -1.45312923e-03, -7.67141804e-02,
       -7.23255798e-02, -2.77911574e-02,  6.20630570e-03, -1.08420521e-01,
       -2.48442800e-03,  4.52740490e-02,  1.58682279e-02,  6.70050979e-02,
        1.26789749e-01, -4.02106382e-02, -2.79351324e-02, -5.80640621e-02,
       -3.97184454e-02, -7.47611523e-02,  2.05687201e-03, -1.98378079e-02,
        4.27689590e-02, -1.70864332e-02,  5.04938932e-03,  1.64937340e-02,
       -4.72935066e-02, -5.15328944e-02,  4.39709574e-02, -5.82427494e-02,
        5.12361899e-02, 

In [5]:
# Setting up chroma with a default chroma client
chroma_client = chromadb.Client()

# get_or_create_collection (rather than create_collection) keeps this cell
# re-runnable: create_collection raises when a collection with this name
# already exists in the client.
chroma_collection = chroma_client.get_or_create_collection(
    "microsoft_annual_report_2022",
    embedding_function=embedding_function,
)

# Each chunk's position in the list serves as its (string) id
ids = [str(i) for i in range(len(token_split_texts))]

# Store the documents in the vector database. upsert inserts new ids and
# overwrites existing ones, so re-running the cell does not collide with
# ids that are already stored.
chroma_collection.upsert(ids=ids, documents=token_split_texts)
chroma_collection.count()

451

In [6]:
# Question to answer from the annual report
query = "What was the total revenue?"

# Query the chroma collection for the top 5 most relevant documents
results = chroma_collection.query(
    query_texts=[query],
    n_results=5,
)

# Only one query text was sent, so take the first entry of the documents list
retrieved_documents = results['documents'][0]

# Print every retrieved chunk followed by blank lines as a visual separator
for doc_text in retrieved_documents:
    print(doc_text)
    print('\n')

74 note 13 — unearned revenue unearned revenue by segment was as follows : ( in millions ) june 30, 2022 2021 productivity and business processes $ 24, 558 $ 22, 120 intelligent cloud 19, 371 17, 710 more personal computing 4, 479 4, 311 total $ 48, 408 $ 44, 141 changes in unearned revenue were as follows : ( in millions ) year ended june 30, 2022


productivity and business processes $ 29, 687 $ 24, 351 $ 18, 724 intelligent cloud 32, 721 26, 126 18, 324 more personal computing 20, 975 19, 439 15, 911 total $ 83, 383 $ 69, 916 $ 52, 959 no sales to an individual customer or country other than the united states accounted for more than 10 % of revenue for fiscal years 2022, 2021, or 2020. revenue, classified by the major geographic areas in which our customers were located, was as follows : ( in millions ) year ended june 30, 2022 2021 2020 united states ( a ) $ 100, 218 $ 83, 953 $ 73, 160 other countries 98, 052 84, 135 69, 855 total $ 198, 270 $ 168, 088 $ 143, 015 ( a ) includes bi

In [None]:
# Setup the OpenAI client for the RAG Operation
import os

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
from gen_ai_hub.proxy.native.openai import chat

# Name of the chat model used for the RAG completion.
# Deployed models can be retired (the service then answers with HTTP 410), so
# the name can be overridden through the LLM_MODEL environment variable or a
# .env entry (loaded just above) without editing the notebook. When the
# variable is unset or empty, the original default is used.
llm_model = os.environ.get("LLM_MODEL") or "gpt-4o-mini"


In [8]:
def rag(query, retrieved_documents, model=None):
    """Answer ``query`` with the chat model, using the retrieved chunks as context.

    Args:
        query: The user's question as a string.
        retrieved_documents: Iterable of text chunks. They are joined with
            blank lines and sent to the model as the "Information" section.
        model: Optional model name for this call. When omitted, the
            module-level ``llm_model`` is used.

    Returns:
        The message content of the first completion choice.
    """
    information = "\n\n".join(retrieved_documents)

    # Adjacent string literals are concatenated with no separator, so the
    # space between the two sentences must be written explicitly.
    system_prompt = (
        "You are a helpful expert financial research assistant. Your users are asking questions about information contained in an annual report. "
        "You will be shown the user's question, and the relevant information from the annual report. Answer the user's question using only this information."
    )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Question: {query}. \n Information: {information}"},
    ]

    response = chat.completions.create(
        model_name=llm_model if model is None else model,
        messages=messages,
    )

    return response.choices[0].message.content

In [9]:
# Generate the answer from the question and the retrieved documents
output = rag(
    query=query,
    retrieved_documents=retrieved_documents,
)

# Display the model's answer
print(output)

APIStatusError: Error code: 410 - {'error': 'Gone', 'message': 'Model has been retired. Please follow the instructions at https://help.sap.com/docs/sap-ai-core/sap-ai-core-service-guide/update-deployment to update your deployment with a new model.'}