In [10]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature
from azure.ai.documentintelligence.models import DocumentTable

from langchain.text_splitter import MarkdownHeaderTextSplitter
import os
from dotenv import load_dotenv
import pandas as pd
import mdpd
import re
import json
import time
import tiktoken
import uuid

import os
from openai import AzureOpenAI


# Read settings from a .env file before any client is built.
load_dotenv(override=True)

# --- Azure AI Document Intelligence client ---------------------------------
# Both settings are mandatory: a missing variable raises KeyError right here.
AZURE_DOC_INTELLIGENCE_ENDPOINT = os.environ["AZURE_DOC_INTELLIGENCE_ENDPOINT"]
AZURE_DOC_INTELLIGENCE_KEY = os.environ["AZURE_DOC_INTELLIGENCE_KEY"]

document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_DOC_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_DOC_INTELLIGENCE_KEY),
    api_version="2024-02-29-preview",
)

# --- Azure OpenAI client ----------------------------------------------------
# These lookups yield None (rather than raising) when a variable is unset.
aoai_client = AzureOpenAI(
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
    api_key=os.environ.get("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
)

In [11]:
def OcrExtractionDI(relative_path: str, Markdown: bool = True):
    """Analyze a local document with the Document Intelligence "prebuilt-layout" model.

    Args:
        relative_path: Path to the document. It is resolved to an absolute path
            (relative to the current working directory) before being opened.
        Markdown: When truthy, ``ContentFormat.MARKDOWN`` is requested as the
            output content format; otherwise ``None`` is passed for that option.

    Returns:
        Whatever ``poller.result()`` returns for the submitted analysis.

    Raises:
        OSError: If the document cannot be opened for reading.
    """
    path_to_document = os.path.abspath(relative_path)
    output_format = ContentFormat.MARKDOWN if Markdown else None

    # The document is sent as a raw binary stream, hence the octet-stream content type.
    with open(path_to_document, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=f,
            content_type="application/octet-stream",
            output_content_format=output_format,
        )

    # Block until the long-running analysis operation has finished.
    return poller.result()

In [12]:
def MdFormatting(ocr_extraction):
    """Split OCR Markdown into header-delimited chunks and return them as a DataFrame.

    Args:
        ocr_extraction: Object whose ``content`` attribute holds the Markdown text.

    Returns:
        pandas.DataFrame with columns ``title``, ``header`` and ``content``, one
        row per chunk. ``content`` is the title, the header and the chunk text
        joined by newlines. The columns are present even when no chunk is found.
    """
    # Rewrite underlined titles ("Title\n===") as "=== Title\n" so that "===" can
    # serve as a header marker for the splitter. A single regex pass is used so
    # that one heading can never be rewritten as a substring of another.
    doc_string = re.sub(
        r".+\n===",
        lambda match: "=== " + match.group(0).replace("===", ""),
        ocr_extraction.content,
    )

    ## Split the document into chunks based on markdown headers.
    headers_to_split_on = [
        ("===", "Title"),
        ("##", "Header 1"),
    ]
    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
    markdown_chunks = text_splitter.split_text(doc_string)

    chunk_list = []
    for chunk in markdown_chunks:
        # A chunk may sit outside any title/header; fall back to an empty string.
        title = chunk.metadata.get("Title", "")
        header1 = chunk.metadata.get("Header 1", "")
        chunk_list.append(
            {
                "title": title,
                "header": header1,
                # Real newlines between the parts (previously the literal text "/n").
                "content": title + "\n" + header1 + "\n" + chunk.page_content,
            }
        )

    # Explicit columns keep the schema stable when there are no chunks at all.
    return pd.DataFrame(chunk_list, columns=["title", "header", "content"])

In [13]:
def NumberTokens(string: str) -> int:
    """Count how many tokens `string` encodes to with the tokenizer tiktoken maps to "gpt-4o"."""
    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    return len(tokenizer.encode(string))

In [14]:
def GenerateVector(text: str) -> list:
    """Request an embedding for `text` from the "text-embedding-3-small" model.

    Returns the ``embedding`` of the first item in the response's ``data``.
    """
    request_args = {"input": text, "model": "text-embedding-3-small"}
    reply = aoai_client.embeddings.create(**request_args)
    first_item = reply.data[0]
    return first_item.embedding

In [15]:
def GetSummary(content):
    """Ask the "gpt-4o-mini" chat model for a brief summary of a filing excerpt.

    Args:
        content: Text of the filing section; appended to the user prompt.

    Returns:
        The message content of the first choice in the chat completion.
    """
    system_prompt = "You are an investment adivsor that reads information from SEC filings, such as 10K and 10Q. please be concise, please only provide a brief description with no explanation or detail"
    user_prompt = "Can you extract a summary of following portion of a 10K filing? content: " + content

    # temperature=0 and seed=42 are passed through unchanged on every call.
    completion = aoai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=2000,
        seed=42,
    )
    return completion.choices[0].message.content

In [16]:
def GenerateUniqueID():
    """Return the node field (last 48 bits) of a fresh random UUID4 as a decimal string."""
    random_uuid = uuid.uuid4()
    # `.node` is the same value as the final entry of `.fields`.
    return str(random_uuid.node)

In [17]:
def SaveDFJson(DataFrame, output_dir: str = "../data/processed"):
    """Write every row of `DataFrame` to its own JSON file.

    Each row is serialized with ``Series.to_json`` into
    ``<output_dir>/json_file_<index label>.json``.

    Args:
        DataFrame: Frame whose rows are exported; the index labels name the files.
        output_dir: Target directory. Defaults to the previously hard-coded
            location and is created if it does not exist yet.
    """
    # Create the target directory up front so a fresh checkout does not fail on the first write.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    for i, row in DataFrame.iterrows():
        row.to_json(os.path.join(output_dir, "json_file_{}.json".format(i)))

In [18]:
def _announce_step(title):
    """Print the separator line and the step title, then return the start timestamp."""
    print("------------------------------------------------------------------------")
    print(title)
    return time.time()


def _report_elapsed(started_at):
    """Print the seconds elapsed since `started_at` and return the end timestamp."""
    finished_at = time.time()
    print("--> Total time: ", finished_at - started_at)
    return finished_at


# Step 1: run OCR on the raw filing.
start = _announce_step("Step 1: OCR Extraction using Document Intelligence")
ocr_extraction = OcrExtractionDI(relative_path="../data/raw/MICROSOFT-10Q-FY2023-Q3.pdf")
end = _report_elapsed(start)

# Step 2: chunk the Markdown output and record per-chunk sizes.
start = _announce_step("Step 2: Extracting Markdown Output into DataFrame")
extracted_dataframe = MdFormatting(ocr_extraction)
extracted_dataframe["char_len"] = extracted_dataframe["content"].apply(len)
extracted_dataframe["token_len"] = extracted_dataframe["content"].apply(NumberTokens)
end = _report_elapsed(start)

# Step 3: one chat-completion call per chunk.
start = _announce_step("Step 3: Create Summary of each Section")
extracted_dataframe["summary"] = extracted_dataframe["content"].apply(GetSummary)
end = _report_elapsed(start)

# Step 4: embed the titles.
start = _announce_step("Step 4: Vectorizing title")
extracted_dataframe["title_vector"] = extracted_dataframe["title"].apply(GenerateVector)
end = _report_elapsed(start)

# Step 5: embed the chunk contents.
start = _announce_step("Step 5: Vectorizing content")
extracted_dataframe["content_vector"] = extracted_dataframe["content"].apply(GenerateVector)
end = _report_elapsed(start)

# Step 6: attach an ID per row and drop the helper size columns.
start = _announce_step("Step 6: Generating Unique ID and Dropping Unnecesary Columns")
extracted_dataframe["unique_id"] = extracted_dataframe["content"].apply(lambda _content: GenerateUniqueID())
extracted_dataframe = extracted_dataframe.drop(columns=["char_len", "token_len"], errors="ignore")
end = _report_elapsed(start)

# Step 7: write one JSON file per row; the file count is printed before the timing line.
start = _announce_step("Step 7: Storing documents as json")
SaveDFJson(extracted_dataframe)
end = time.time()
print("--> Created a total of: [", len(extracted_dataframe), "] files")
print("--> Total time: ", end - start)

------------------------------------------------------------------------
Step 1: OCR Extraction using Document Intelligence
--> Total time:  26.315131187438965
------------------------------------------------------------------------
Step 2: Extracting Markdown Output into DataFrame
--> Total time:  0.2090284824371338
------------------------------------------------------------------------
Step 3: Create Summary of each Section
--> Total time:  155.95663690567017
------------------------------------------------------------------------
Step 4: Vectorizing title
--> Total time:  3.852348804473877
------------------------------------------------------------------------
Step 5: Vectorizing content
--> Total time:  7.335164785385132
------------------------------------------------------------------------
Step 6: Generating Unique ID and Dropping Unnecesary Columns
--> Total time:  0.0004830360412597656
------------------------------------------------------------------------
Step 7: Storing d