### Part 0. Loading libraries

In [1]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature
from azure.ai.documentintelligence.models import DocumentTable

from langchain.text_splitter import MarkdownHeaderTextSplitter
import os
from dotenv import load_dotenv
import pandas as pd
import mdpd
import re

# Load settings from a local .env file into the process environment so that
# no credentials are hard-coded in the notebook.
load_dotenv()

# Indexing os.environ (instead of using .get) raises KeyError right here when
# one of the Document Intelligence settings is missing.
endpoint = os.environ["AZURE_DOC_INTELLIGENCE_ENDPOINT"]
key = os.environ["AZURE_DOC_INTELLIGENCE_KEY"]

# Client used by the analysis cell below, pinned to an explicit API version.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
    api_version="2024-02-29-preview",
)

  from pandas.core import (


In [2]:
# Resolve the sample filing against the current working directory.
path_to_sample_documents = os.path.abspath("MICROSOFT-10Q-FY2023-Q3.pdf")
filename = path_to_sample_documents

# Send the raw PDF bytes to the prebuilt layout model, asking for Markdown
# content and enabling the key-value pair feature.
with open(filename, "rb") as pdf_stream:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        analyze_request=pdf_stream,
        content_type="application/octet-stream",
        output_content_format=ContentFormat.MARKDOWN,
        features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS],
    )

# Fetch the outcome of the analysis started above.
result = poller.result()

In [3]:
sections = []  # nothing in this cell appends to it

# For every paragraph that carries a role, print the role, the page number of
# its first bounding region, and its text, one item per line.
for paragraph in result.paragraphs:
    if not paragraph.role:
        continue
    print(paragraph.role)
    print(paragraph.bounding_regions[0].page_number)
    print(paragraph.content)

# Last expression of the cell: echoes the complete paragraph list.
result.paragraphs

title
1
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549
===
sectionHeading
1
## MICROSOFT CORPORATION
title
2
MICROSOFT CORPORATION FORM 10-Q For the Quarter Ended March 31, 2023 INDEX
===
pageNumber
2
<!-- PageNumber="2" -->
title
3
PART ! Item 1
===
title
3
PART I. FINANCIAL INFORMATION ITEM 1. FINANCIAL STATEMENTS INCOME STATEMENTS
===
pageNumber
3
<!-- PageNumber="3" -->
title
4
COMPREHENSIVE INCOME STATEMENTS
===
title
5
PART I Item 1
===
pageNumber
5
<!-- PageNumber="5" -->
title
5
BALANCE SHEETS
===
title
6
PART ! Item 1
===
title
6
CASH FLOWS STATEMENTS
===
pageNumber
6
<!-- PageNumber="6" -->
title
7
PART ! Item 1
===
title
7
STOCKHOLDERS' EQUITY STATEMENTS
===
pageNumber
7
<!-- PageNumber="7" -->
sectionHeading
8
## Accounting Principles
sectionHeading
8
## Principles of Consolidation
sectionHeading
8
### Estimates and Assumptions
sectionHeading
8
## Financial Instruments
sectionHeading
8
### Investments
pageNumber
8
<!-- PageNumber="8" -->
pageHeader


[{'spans': [{'offset': 0, 'length': 75}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [1.719, 0.4106, 6.5006, 0.4106, 6.5006, 1.0748, 1.719, 1.0748]}], 'role': 'title', 'content': 'UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549\n==='},
 {'spans': [{'offset': 77, 'length': 9}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [3.4937, 1.237, 4.7411, 1.237, 4.7411, 1.4702, 3.4937, 1.4702]}], 'content': 'FORM 10-Q'},
 {'spans': [{'offset': 88, 'length': 144}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [0.2393, 1.6137, 7.0025, 1.5832, 7.0041, 1.9533, 0.241, 1.9837]}], 'content': ':selected:\nQUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023'},
 {'spans': [{'offset': 234, 'length': 2}], 'boundingRegions': [{'pageNumber': 1, 'polygon': [4.1833, 2.0634, 4.4064, 2.0634, 4.4014, 2.185, 4.1833, 2.1901]}], 'content': 'OR'},
 {'spans': [{'offset': 238, 'length': 12}], 'boundingR

In [6]:
# Titles arrive as "TITLE" followed by a line starting with "===".
# Rewrite each one to a single "=== TITLE" line so the splitter below can
# recognise it through the "===" prefix.
#
# One regex pass only touches the spans the pattern actually matched. The
# previous findall + str.replace loop replaced substrings globally, so a short
# title that happened to be the tail of a longer title line corrupted the
# longer one (which then never became a header), and it also removed every
# "===" that occurred inside the title text itself.
doc_string = re.sub(r"(.+)\n===", r"=== \1\n", result.content)

# Split the document into chunks based on markdown headers.
headers_to_split_on = [
    ("===", "Title"),
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

markdown_chunks = text_splitter.split_text(doc_string)

In [7]:
# Output column -> metadata key written by the header splitter.
metadata_columns = {
    "title": "Title",
    "header1": "Header 1",
    "header2": "Header 2",
    "header3": "Header 3",
}

# Flatten every chunk into one record. A heading level that is absent from a
# chunk's metadata becomes "". dict.get replaces the former bare `except:`
# blocks, which would also have hidden unrelated errors.
chunk_list = [
    {
        **{column: chunk.metadata.get(key, "") for column, key in metadata_columns.items()},
        "content": chunk.page_content,
    }
    for chunk in markdown_chunks
]

# Passing the columns explicitly keeps the frame's shape stable even when the
# splitter produced no chunks at all.
df = pd.DataFrame(chunk_list, columns=[*metadata_columns, "content"])

In [8]:
import tiktoken
# Module-level tokenizer handle, kept for cells that use `encoding` directly.
encoding = tiktoken.get_encoding("cl100k_base")


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base") -> int:
    """Return the number of tokens in ``string``.

    Args:
        string: Text to tokenize.
        encoding_name: Name of the tiktoken encoding to count with. Defaults
            to "cl100k_base", the value that was previously hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        The length of the token sequence produced by the chosen encoding.
    """
    return len(tiktoken.get_encoding(encoding_name).encode(string))

In [9]:
# Token count of each chunk's content, stored next to the text.
df['token_length'] = list(map(num_tokens_from_string, df['content']))

In [10]:
# Labelled heading hierarchy of each chunk, built once and reused for both columns.
# The original expressions contained a stray unary "+" in front of
# df['header1'] ("... + +df['header1']"). A unary plus is not defined for
# strings, so depending on the installed NumPy/pandas versions it either
# raises or relies on deprecated fallback behaviour; it is removed here.
heading_context = (
    " title: " + df['title'].astype(str)
    + " header1: " + df['header1'].astype(str)
    + " header2: " + df['header2'].astype(str)
    + " header3: " + df['header3'].astype(str)
)

# full_text is assigned first so the column order stays full_text, full_title.
df['full_text'] = heading_context + " content: " + df['content'].astype(str)
df['full_title'] = heading_context

In [11]:
# Show the chunk table with its token counts and the combined text columns.
df

Unnamed: 0,title,header1,header2,header3,content,token_length,full_text,full_title
0,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,,,,FORM 10-Q\n:selected:\nQUARTERLY REPORT PURSUA...,99,title: UNITED STATES SECURITIES AND EXCHANGE ...,title: UNITED STATES SECURITIES AND EXCHANGE ...
1,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,,MICROSOFT CORPORATION,,WASHINGTON (STATE OF INCORPORATION) \n91-1144...,597,title: UNITED STATES SECURITIES AND EXCHANGE ...,title: UNITED STATES SECURITIES AND EXCHANGE ...
2,MICROSOFT CORPORATION FORM 10-Q For the Quarte...,,,,| PART I. FINANCIAL INFORMATION ||| Page |\n||...,345,title: MICROSOFT CORPORATION FORM 10-Q For th...,title: MICROSOFT CORPORATION FORM 10-Q For th...
3,PART I. FINANCIAL INFORMATION ITEM 1. FINANCIA...,,,,"| (In millions, except per share amounts) (Una...",602,title: PART I. FINANCIAL INFORMATION ITEM 1. ...,title: PART I. FINANCIAL INFORMATION ITEM 1. ...
4,COMPREHENSIVE INCOME STATEMENTS,,,,| (In millions) (Unaudited) | Three Months End...,231,title: COMPREHENSIVE INCOME STATEMENTS header...,title: COMPREHENSIVE INCOME STATEMENTS header...
...,...,...,...,...,...,...,...,...
126,ITEM 2. UNREGISTERED SALES OF EQUITY SECURITIE...,,,,Following are our monthly share repurchases fo...,470,title: ITEM 2. UNREGISTERED SALES OF EQUITY S...,title: ITEM 2. UNREGISTERED SALES OF EQUITY S...
127,ITEM 6. EXHIBITS,,,,15.1 Letter regarding unaudited interim financ...,274,title: ITEM 6. EXHIBITS header1: header2: h...,title: ITEM 6. EXHIBITS header1: header2: h...
128,SIGNATURE,,,,Pursuant to the requirements of the Securities...,311,title: SIGNATURE header1: header2: header3:...,title: SIGNATURE header1: header2: header3:
129,CERTIFICATION,,,,"I, Satya Nadella, certify that: \n1\. I have ...",1256,title: CERTIFICATION header1: header2: head...,title: CERTIFICATION header1: header2: head...


In [12]:
import openai
from typing import Any, Optional, Union
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

# Load variables from .env (the first cell does this as well).
load_dotenv()

# Azure OpenAI connection settings, set as attributes on the openai module.
# os.getenv yields None for an unset variable, so a missing setting is not
# reported in this cell.
openai.api_type = "azure"
openai.api_version = "2024-05-01-preview"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")

# Name of the embedding deployment.
deployment_name_embedding = "text-embedding-ada-002"

In [15]:
def before_retry_sleep(retry_state):
    """Announce that an embeddings call is about to be retried.

    Passed as the ``before_sleep`` callback of the retry decorator on
    ``compute_embedding``.

    Args:
        retry_state: State object supplied by the retry machinery; not used.
    """
    notice = "Rate limited on the OpenAI embeddings API, sleeping before retrying..."
    print(notice)

@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def compute_embedding(text, engine="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    The call is retried on ``openai.error.RateLimitError`` with a randomized
    exponential wait (bounds 15 and 60) and gives up after 15 attempts;
    ``before_retry_sleep`` runs before each wait.

    Args:
        text: Input passed to the embeddings endpoint.
        engine: Deployment to call. Defaults to "text-embedding-3-small",
            the value that was previously hard-coded, so one-argument callers
            such as ``Series.apply(compute_embedding)`` are unaffected.
            NOTE(review): the notebook also defines ``deployment_name_embedding``
            as "text-embedding-ada-002" but never passes it here - confirm
            which deployment is intended.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.
    """
    response = openai.Embedding.create(engine=engine, input=text)
    return response["data"][0]["embedding"]


In [16]:
# Embed the heading hierarchy and the chunk body separately: one
# compute_embedding call per row and per column, headings first.
for source_column, vector_column in (
    ('full_title', 'full_title_vector'),
    ('content', 'content_vector'),
):
    df[vector_column] = df[source_column].apply(compute_embedding)

In [17]:
# Show the text columns side by side with their two embedding columns.
df[['title','header1','header2','header3','content','full_title_vector','content_vector']]

Unnamed: 0,title,header1,header2,header3,content,full_title_vector,content_vector
0,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,,,,FORM 10-Q\n:selected:\nQUARTERLY REPORT PURSUA...,"[0.03364112973213196, 0.003108411328867078, 0....","[0.05341602489352226, 0.0077919247560203075, 0..."
1,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,,MICROSOFT CORPORATION,,WASHINGTON (STATE OF INCORPORATION) \n91-1144...,"[0.024496957659721375, -0.027848653495311737, ...","[0.030771823599934578, -0.02253672480583191, 0..."
2,MICROSOFT CORPORATION FORM 10-Q For the Quarte...,,,,| PART I. FINANCIAL INFORMATION ||| Page |\n||...,"[0.05188297480344772, 0.014284387230873108, 0....","[0.021289236843585968, 0.028044601902365685, 0..."
3,PART I. FINANCIAL INFORMATION ITEM 1. FINANCIA...,,,,"| (In millions, except per share amounts) (Una...","[-0.010826809331774712, -0.0069723669439554214...","[0.0006828737678006291, 0.003162274369969964, ..."
4,COMPREHENSIVE INCOME STATEMENTS,,,,| (In millions) (Unaudited) | Three Months End...,"[0.015895847231149673, 0.012530338950455189, 0...","[0.003113292623311281, 0.00854665320366621, 0...."
...,...,...,...,...,...,...,...
126,ITEM 2. UNREGISTERED SALES OF EQUITY SECURITIE...,,,,Following are our monthly share repurchases fo...,"[0.010583214461803436, 0.057578783482313156, 0...","[0.02236097864806652, 0.009578189812600613, 0...."
127,ITEM 6. EXHIBITS,,,,15.1 Letter regarding unaudited interim financ...,"[-0.018644539639353752, 0.031691934913396835, ...","[0.005142883397638798, 0.03827199339866638, 0...."
128,SIGNATURE,,,,Pursuant to the requirements of the Securities...,"[0.020313549786806107, 0.020454615354537964, 0...","[0.05372314900159836, 0.009199251420795918, 0...."
129,CERTIFICATION,,,,"I, Satya Nadella, certify that: \n1\. I have ...","[0.009701263159513474, 0.0226792823523283, 0.0...","[0.02512427605688572, 0.00577161880210042, 0.0..."


In [20]:
def get_keywords_and_phrases(content):
    """Ask the chat deployment to extract key-value pairs from ``content``.

    Args:
        content: Text to extract from; it is appended to the user prompt.

    Returns:
        The message content of the first choice, unmodified. It is also
        printed before being returned.
    """
    user_prompt = "can you extract a the key-value pair of the following text? text:" + content
    system_prompt = "You are a bot that helps with extracting key-value pairs from text provided. Please provide the output in a json structure"

    completion = openai.ChatCompletion.create(
        engine="gpt4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.2,
        max_tokens=1000,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )

    raw_extract = completion.choices[0].message.content
    print(raw_extract)
    return raw_extract

In [27]:
# Run the key-value extraction on the chunk at row label 3 and keep the raw model reply.
text = get_keywords_and_phrases(df['content'][3])

```json
{
  "Three Months Ended": {
    "March 31, 2023": {
      "Revenue": {
        "Product": 15588,
        "Service and other": 37269,
        "Total revenue": 52857
      },
      "Cost of revenue": {
        "Product": 3941,
        "Service and other": 12187,
        "Total cost of revenue": 16128
      },
      "Gross margin": 36729,
      "Research and development": 6984,
      "Sales and marketing": 5750,
      "General and administrative": 1643,
      "Operating income": 22352,
      "Other income (expense), net": 321,
      "Income before income taxes": 22673,
      "Provision for income taxes": 4374,
      "Net income": 18299,
      "Earnings per share": {
        "Basic": 2.46,
        "Diluted": 2.45
      },
      "Weighted average shares outstanding": {
        "Basic": 7441,
        "Diluted": 7464
      }
    },
    "March 31, 2022": {
      "Revenue": {
        "Product": 17366,
        "Service and other": 31994,
        "Total revenue": 49360
      },
      "Cos

In [28]:
def answer_questions(text, question="what was microsoft revenue for the 9 months that ended in march 2023?"):
    """Ask the chat deployment a question about ``text`` and return its reply.

    Args:
        text: Context the model should answer from; appended to the prompt.
        question: Question to ask. Defaults to the nine-month revenue
            question that was previously hard-coded, so ``answer_questions(text)``
            sends exactly the same prompt as before.

    Returns:
        The message content of the first choice. It is also printed, as
        before. Earlier versions returned ``None``, which made the answer
        unusable by callers; returning it matches ``get_keywords_and_phrases``.
    """
    # Same layout as the original hard-coded prompt: "<question>: text<text>".
    query = question + ": text" + text
    messages = [
        {"role": "system", "content": "You are a bot that helps with answering questions"},
        {"role": "user", "content": query},
    ]

    response = openai.ChatCompletion.create(
        engine="gpt4o",
        messages=messages,
        temperature=0.2,
        max_tokens=1000,
        top_p=0.5,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    raw_extract = response.choices[0].message.content
    print(raw_extract)
    return raw_extract

In [29]:
# Ask the nine-month revenue question, using the extracted key-value text as context.
answer_questions(text)

For the nine months that ended on March 31, 2023, Microsoft's total revenue was $155,726 million. This revenue is broken down into:

- Product revenue: $47,846 million
- Service and other revenue: $107,880 million
