In [32]:
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature
from azure.ai.documentintelligence.models import DocumentTable
from azure.ai.documentintelligence.models import DocumentTable

from langchain.text_splitter import MarkdownHeaderTextSplitter
from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter
from prepdocslib.page import Page
from prepdocslib.parser import Parser

import os
from dotenv import load_dotenv
import pandas as pd
import mdpd
import re
import json
import time
import tiktoken
import uuid
import html
import os
from openai import AzureOpenAI


# Load settings from a local .env file; override=True asks python-dotenv to
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# --- Azure AI Document Intelligence client -----------------------------------
# Both settings are read with os.environ[...], so a missing variable raises
# KeyError right here instead of failing later inside the SDK.
AZURE_DOC_INTELLIGENCE_ENDPOINT = os.environ["AZURE_DOC_INTELLIGENCE_ENDPOINT"]
AZURE_DOC_INTELLIGENCE_KEY = os.environ["AZURE_DOC_INTELLIGENCE_KEY"]

document_intelligence_client = DocumentIntelligenceClient(
    endpoint=AZURE_DOC_INTELLIGENCE_ENDPOINT,
    credential=AzureKeyCredential(AZURE_DOC_INTELLIGENCE_KEY),
    api_version="2024-02-29-preview",
)

# --- Azure OpenAI client -----------------------------------------------------
# os.getenv returns None for an unset variable, so nothing is raised on these
# lines; the values are handed to the AzureOpenAI constructor as they are.
aoai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version="2024-07-01-preview",
)

In [33]:
def table_to_html(table):
    """Render a table object as a single-line HTML ``<table>`` string.

    Args:
        table: object exposing ``row_count`` and ``cells``; every cell exposes
            ``row_index``, ``column_index``, ``kind``, ``column_span``,
            ``row_span`` and ``content``.

    Returns:
        The HTML markup. One ``<tr>`` is emitted for each index in
        ``range(table.row_count)`` (cells whose ``row_index`` falls outside
        that range are dropped), cells are ordered by ``column_index``, header
        cells use ``<th>``, and cell content is HTML-escaped.
    """
    # Bucket cells by row while preserving their order of appearance, so the
    # stable sort below resolves column ties the same way a per-row filter would.
    cells_by_row = {}
    for cell in table.cells:
        cells_by_row.setdefault(cell.row_index, []).append(cell)

    parts = ["<table>"]
    for row_number in range(table.row_count):
        parts.append("<tr>")
        ordered_cells = list(cells_by_row.get(row_number, []))
        ordered_cells.sort(key=lambda c: c.column_index)
        for cell in ordered_cells:
            tag = "th" if cell.kind in ("columnHeader", "rowHeader") else "td"
            # Span attributes are only written when the cell really spans (> 1).
            attrs = []
            if cell.column_span is not None and cell.column_span > 1:
                attrs.append(f" colSpan={cell.column_span}")
            if cell.row_span is not None and cell.row_span > 1:
                attrs.append(f" rowSpan={cell.row_span}")
            parts.append(f"<{tag}{''.join(attrs)}>{html.escape(cell.content)}</{tag}>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)

In [34]:
def text_html_processing(OcrExtractionDIOutput):
    """Build per-page text in which every table is replaced by its HTML rendering.

    Args:
        OcrExtractionDIOutput: analysis result exposing ``content`` (the full
            document text), ``pages`` (each with ``spans`` carrying ``offset``
            and ``length``) and ``tables`` (may be None or empty; each table
            has ``bounding_regions`` and ``spans``).

    Returns:
        A list with one dict per page, keys in this order:
        ``page_num`` (1-based), ``content`` (page text followed by a single
        trailing space) and ``offset`` (sum of the lengths of the ``content``
        values of all previous pages).

    Notes:
        Only the first span of a page and the first bounding region of a table
        are consulted. A page without spans contributes just the trailing
        space instead of raising IndexError.
    """
    full_content = OcrExtractionDIOutput.content
    all_tables = OcrExtractionDIOutput.tables or []

    page_map_dict = []
    offset = 0

    for page_num, page in enumerate(OcrExtractionDIOutput.pages, start=1):
        # A table belongs to the page named by its first bounding region.
        tables_on_page = [
            table
            for table in all_tables
            if table.bounding_regions and table.bounding_regions[0].page_number == page_num
        ]

        if page.spans:
            page_offset = page.spans[0].offset
            page_length = page.spans[0].length
        else:
            page_offset, page_length = 0, 0

        # table_chars[i] is the index (within tables_on_page) of the table that
        # owns character i of the page, or -1 for ordinary text.
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                relative_start = span.offset - page_offset
                # Clamp to the page so spans that leak outside it are ignored.
                first = max(relative_start, 0)
                last = min(relative_start + span.length, page_length)
                for idx in range(first, last):
                    table_chars[idx] = table_id

        # Copy ordinary characters; emit each table's HTML once, at the
        # position of its first character, and skip the rest of its span.
        pieces = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                pieces.append(full_content[page_offset + idx])
            elif table_id not in added_tables:
                pieces.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)
        pieces.append(" ")
        page_text = "".join(pieces)

        page_map_dict.append({
            'page_num': page_num,
            'content': page_text,
            'offset': offset,
        })
        offset += len(page_text)

    return page_map_dict

In [35]:
def CombinePages(pagemap):
    """Join the ``content`` value of every page entry with a newline between pages.

    Args:
        pagemap: iterable of mappings that each provide a ``'content'`` string.

    Returns:
        The concatenated text; an empty string when ``pagemap`` is empty.
    """
    page_texts = []
    for entry in pagemap:
        page_texts.append(entry['content'])
    return "\n".join(page_texts)

In [36]:
def OcrExtractionDI(relative_path: str, Markdown: bool = True):
    """Analyze a document with the "prebuilt-layout" model of Document Intelligence.

    Args:
        relative_path: path of the document to analyze; it is made absolute
            before the file is opened.
        Markdown: when truthy, markdown output is requested and the raw
            analysis result is returned. When falsy, no output format is
            requested and the result is post-processed into a page map.

    Returns:
        The object returned by ``poller.result()`` when ``Markdown`` is truthy,
        otherwise the list produced by ``text_html_processing``.

    Raises:
        OSError: if the document cannot be opened.
    """
    path_to_document = os.path.abspath(relative_path)

    # One truthiness test drives both the requested format and the kind of
    # value returned, so the two can never disagree.
    output_format = ContentFormat.MARKDOWN if Markdown else None

    with open(path_to_document, "rb") as f:
        poller = document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",
            analyze_request=f,
            content_type="application/octet-stream",
            output_content_format=output_format,
        )
    OcrExtractionDIOutput = poller.result()

    if Markdown:
        return OcrExtractionDIOutput
    return text_html_processing(OcrExtractionDIOutput)

In [37]:
def find_page(offset, pagemap=None):
    """Return the 0-based index of the page whose offset range contains ``offset``.

    Args:
        offset: character position to look up.
        pagemap: sequence of mappings with an ``'offset'`` key, in page order.
            When omitted, the notebook-level ``ocr_extraction`` variable is
            used, which is what this function always read before.

    Returns:
        Index ``i`` such that ``pagemap[i]['offset'] <= offset <
        pagemap[i + 1]['offset']``. When no such pair exists (including
        offsets before the first page) the last index is returned, which is
        ``-1`` for an empty page map.

    NOTE(review): the offsets written by ``text_html_processing`` do not count
    the newline that ``CombinePages``/``CustomTextSplitter`` insert between
    pages -- confirm which text ``offset`` is measured against.
    """
    if pagemap is None:
        # Backward-compatible fallback to the global set by the driver cell.
        pagemap = ocr_extraction
    num_pages = len(pagemap)
    for i in range(num_pages - 1):
        if pagemap[i]['offset'] <= offset < pagemap[i + 1]['offset']:
            return i
    return num_pages - 1

def FindEndFirstTable(text):
    """Return the index just past the first ``</table>`` tag in ``text``.

    Raises:
        IndexError: if ``text`` contains no ``</table>`` tag.
    """
    closing_tag_ends = [hit.end() for hit in re.finditer(r"</table>", text)]
    # Indexing the empty list is what raises IndexError when nothing matched.
    return closing_tag_ends[0]

def FindEndLastClosedTable(text):
    """Return the index just past the last ``</table>`` tag in ``text``.

    Raises:
        IndexError: if ``text`` contains no ``</table>`` tag.
    """
    closing_tag_ends = [hit.end() for hit in re.finditer(r"</table>", text)]
    # Indexing the empty list is what raises IndexError when nothing matched.
    return closing_tag_ends[-1]

def FindFirstOpenTableOffset(text):
    """Detect text that begins inside a table opened before it started.

    Args:
        text: section text that may contain ``<table`` / ``</table>`` tags.

    Returns:
        The index at which the first ``</table>`` tag starts when that tag
        appears before any ``<table`` tag (or when there is no ``<table`` tag
        at all); otherwise ``0``. Text without a ``</table>`` tag returns
        ``0``.
    """
    first_close = text.find("</table>")
    if first_close == -1:
        # Nothing is closed here, so the text cannot start inside a table.
        return 0
    first_open = text.find("<table")
    # A closing tag with no opening tag before it is exactly the case being
    # looked for; without this check the lookup failed on it.
    if first_open == -1 or first_close < first_open:
        return first_close
    return 0

def CustomTextSplitter(pagemap, max_section_length=1500, sentence_search_limit=200, table_lookahead=5000):
    """Split the joined page text into chunks, trying not to cut HTML tables.

    The ``content`` of every page is joined with newlines and walked left to
    right. Each chunk is first sized by length: ``max_section_length``
    characters, extended by up to ``sentence_search_limit`` characters to end
    on a sentence ending, else pulled back to the last word break seen. The
    chunk end is then adjusted for tables:

    * exactly one ``<table`` with a mismatching number of ``</table>`` tags:
      the chunk ends right after the first ``</table>`` found between the
      chunk start and ``table_lookahead`` characters past the chunk end;
    * several ``<table`` with a mismatching number of ``</table>`` tags: the
      chunk ends right after the last ``</table>`` inside the chunk.

    When the needed closing tag is not found, the length-based chunk is kept
    as it is (the table is split) instead of raising IndexError.

    Args:
        pagemap: iterable of mappings that each provide a ``'content'`` string.
        max_section_length: target chunk size in characters (must be >= 1).
        sentence_search_limit: how far past the target size to look for a
            sentence ending.
        table_lookahead: how far past the chunk end to look for the closing
            tag of a table left open by the chunk.

    Returns:
        pandas.DataFrame with columns ``content`` and ``chunk_id`` (1-based,
        in document order). The columns are present even when there is no
        text to split.

    Raises:
        ValueError: if ``max_section_length`` is smaller than 1.
    """
    if max_section_length < 1:
        raise ValueError("max_section_length must be at least 1")

    STANDARD_WORD_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
    # See W3C document https://www.w3.org/TR/jlreq/#cl-01
    CJK_WORD_BREAKS = ["、","，", "；", "：","（", "）","【","】","「","」","『","』","〔","〕","〈","〉","《","》","〖","〗","〘","〙","〚","〛","〝","〞","〟","〰","–","—","‘","’","‚","‛","“","”","„","‟","‹","›",]
    STANDARD_SENTENCE_ENDINGS = [".", "!", "?"]
    # See CL05 and CL06, based on JIS X 4051:2004
    # https://www.w3.org/TR/jlreq/#cl-04
    CJK_SENTENCE_ENDINGS = ["。", "！", "？", "‼", "⁇", "⁈", "⁉"]

    # Sets give constant-time membership tests in the per-character scan.
    sentence_endings = frozenset(STANDARD_SENTENCE_ENDINGS + CJK_SENTENCE_ENDINGS)
    word_breaks = frozenset(STANDARD_WORD_BREAKS + CJK_WORD_BREAKS)
    closing_tag = "</table>"

    all_text = "\n".join(p['content'] for p in pagemap)
    length = len(all_text)

    sections = []
    start = 0
    while start < length:
        # --- 1. length-based end, nudged to a sentence or word boundary ------
        last_word = -1
        end = start + max_section_length
        if end > length:
            end = length
        else:
            # Try to find the end of the sentence
            while (
                end < length
                and (end - start - max_section_length) < sentence_search_limit
                and all_text[end] not in sentence_endings
            ):
                if all_text[end] in word_breaks:
                    last_word = end
                end += 1
            if end < length and all_text[end] not in sentence_endings and last_word > 0:
                end = last_word  # Fall back to at least keeping a whole word
        if end < length:
            end += 1  # keep the boundary character inside this chunk

        # --- 2. table-aware adjustment of the end ----------------------------
        section_text = all_text[start:end]
        opened = section_text.count("<table")
        closed = section_text.count(closing_tag)

        if opened == 1 and closed != 1:
            # Stretch (or shrink) the chunk to finish with the first closing tag.
            close_at = all_text.find(closing_tag, start, end + table_lookahead)
            if close_at != -1:
                end = close_at + len(closing_tag)
        elif opened > 1 and closed != opened:
            # Cut after the last complete table; the open one starts the next chunk.
            last_close = section_text.rfind(closing_tag)
            if last_close != -1:
                end = start + last_close + len(closing_tag)

        sections.append({"content": all_text[start:end], "chunk_id": len(sections) + 1})
        start = end  # no overlap between consecutive chunks

    return pd.DataFrame(sections, columns=["content", "chunk_id"])

In [38]:
def MdFormatting(ocr_extraction):
    """Split markdown OCR output into header-delimited chunks.

    Args:
        ocr_extraction: object whose ``content`` attribute holds the markdown
            text of the document.

    Returns:
        pandas.DataFrame with columns ``title``, ``header`` and ``content``.
        ``content`` is the title, the header and the chunk text, each on its
        own line. ``title`` / ``header`` are empty strings for chunks whose
        metadata lacks them.
    """
    doc_string = ocr_extraction.content

    # Rewrite every "<line>\n===" occurrence as "=== <line>\n" so the title
    # line starts with the "===" marker the splitter is configured with below.
    strings_to_replace = re.findall(".+\n===", doc_string)
    for string in strings_to_replace:
        doc_string = doc_string.replace(string, "=== "+string.replace("===",""))

    ## Split the document into chunks base on markdown headers.
    headers_to_split_on = [
        ("===", "Title"),
        ("##", "Header 1"),
    ]
    text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

    markdown_chunks = text_splitter.split_text(doc_string)

    chunk_list = []
    for chunk in markdown_chunks:
        # Missing metadata is expected (text before the first heading), so it
        # is handled with a default rather than a catch-all except.
        title = chunk.metadata.get('Title', "")
        header1 = chunk.metadata.get('Header 1', "")

        chunk_list.append({
            "title": title,
            "header": header1,
            # Real newlines; the previous "/n" put a literal slash-n into the text.
            "content": title + "\n" + header1 + "\n" + chunk.page_content,
        })
    return pd.DataFrame(chunk_list, columns=["title", "header", "content"])

In [39]:
def NumberTokens(string: str) -> int:
    """Count the tokens of ``string`` under the tiktoken encoding for "gpt-4o"."""
    tokenizer = tiktoken.encoding_for_model("gpt-4o")
    token_ids = tokenizer.encode(string)
    return len(token_ids)

In [40]:
def GenerateVector(text: str) -> list:
    """Embed ``text`` with the "text-embedding-3-small" model.

    Sends one embeddings request through ``aoai_client`` and returns the
    embedding of the first item in the response as a plain list.
    """
    embedding_response = aoai_client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    )
    first_item = embedding_response.data[0]
    return list(first_item.embedding)

In [41]:
def GetSummary(content):
    """Ask the "gpt-4o-mini" chat model for a brief summary of ``content``.

    Args:
        content: text of a filing section; it is appended to the user prompt.

    Returns:
        The message content of the first choice in the completion.
    """
    system_prompt = "You are an investment adivsor that reads information from SEC filings, such as 10K and 10Q. please be concise, please only provide a brief description with no explanation or detail"
    user_prompt = "Can you extract a summary of following portion of a 10K filing? content: " + content

    # temperature=0 and a fixed seed are passed to keep answers repeatable.
    completion = aoai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=2000,
        seed=42,
    )
    return completion.choices[0].message.content

In [42]:
def GetTitle(content):
    """Ask the "gpt-4o-mini" chat model for a title of ``content``.

    Args:
        content: text of a filing section; it is appended to the user prompt.

    Returns:
        The message content of the first choice in the completion.
    """
    system_prompt = "You are an investment adivsor that reads information from SEC filings, such as 10K and 10Q. please be concise and do not generate any extra language, the reader will know that is reading a title"
    user_prompt = "Can you extract the title of following portion of a 10K filing? content: " + content

    # temperature=0 and a fixed seed are passed to keep answers repeatable.
    completion = aoai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=2000,
        seed=42,
    )
    return completion.choices[0].message.content

In [43]:
def GenerateUniqueID():
    """Return the node field of a fresh random UUID as a decimal string.

    Only the last UUID field (48 bits) is kept, so the value is much shorter
    than a full UUID.
    """
    random_uuid = uuid.uuid4()
    return str(random_uuid.node)

In [44]:
def SaveDFJson(DataFrame):
    """Write every row of ``DataFrame`` to its own JSON file.

    Files are named ``json_file_<index label>.json`` inside
    ``../data/processed/`` (relative to the working directory).
    """
    path_template = "../data/processed/json_file_{}.json"
    for row_label in DataFrame.index:
        row = DataFrame.loc[row_label]
        row.to_json(path_template.format(row_label))

In [45]:
def GetReportPeriod(content):
    """Ask the "gpt-4o-mini" chat model for the reporting period of a filing.

    Args:
        content: text of a filing section; it is appended to the user prompt.

    Returns:
        The message content of the first choice in the completion. The system
        prompt requests the 'YYYY-MM-DD' format; the reply is not validated.
    """
    system_prompt = "You are an investment adivsor that reads information from SEC filings, such as 10K and 10Q. please be concise and do not generate any extra language. please generate the period in the format of 'YYYY-MM-DD'"
    user_prompt = "Can you extract the period corresponding 10K/10Q filing? content: " + content

    # temperature=0 and a fixed seed are passed to keep answers repeatable.
    completion = aoai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0,
        max_tokens=2000,
        seed=42,
    )
    return completion.choices[0].message.content

In [46]:
def ProcessMD(ocr_extraction):
    """Turn markdown OCR output into an index-ready DataFrame.

    Steps (each one prints a banner and its wall-clock duration):
    split the markdown with ``MdFormatting``, embed the titles, embed the
    contents, then number the chunks and drop the helper columns.

    Args:
        ocr_extraction: object accepted by ``MdFormatting``.

    Returns:
        pandas.DataFrame with ``title``, ``content``, ``title_vector``,
        ``content_vector`` and ``chunk_id`` (1-based position).
    """
    print("------------------------------------------------------------------------")
    print("Step 2: Extracting Markdown Output into DataFrame")
    start = time.time()
    extracted_dataframe = MdFormatting(ocr_extraction)
    extracted_dataframe['char_len'] = extracted_dataframe['content'].apply(len)
    extracted_dataframe['token_len'] = extracted_dataframe['content'].apply(NumberTokens)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 3: Vectorizing title")
    start = time.time()
    # MdFormatting leaves the title empty for chunks without a "Title" entry,
    # and the embeddings endpoint does not accept an empty input. Use the same
    # "none" placeholder ProcessPage applies to missing titles.
    extracted_dataframe['title'] = extracted_dataframe['title'].fillna("none").replace("", "none")
    extracted_dataframe['title_vector'] = extracted_dataframe['title'].apply(GenerateVector)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 4: Vectorizing content")
    start = time.time()
    extracted_dataframe['content_vector'] = extracted_dataframe['content'].apply(GenerateVector)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 5: Generating Unique ID and Dropping Unnecesary Columns")
    start = time.time()
    extracted_dataframe['chunk_id'] = range(1, len(extracted_dataframe) + 1)
    extracted_dataframe = extracted_dataframe.drop(columns = ['char_len','token_len','header'], errors = 'ignore')
    end = time.time()
    print("--> Total time: ", end-start)

    return extracted_dataframe

In [47]:
def ProcessChunk(ocr_extraction):
    """Turn a page map into an index-ready DataFrame of table-aware chunks.

    Steps (each one prints a banner and its wall-clock duration):
    chunk the text with ``CustomTextSplitter``, ask ``GetTitle`` for a title
    per chunk, embed the titles, embed the contents, then drop the helper
    columns.

    Args:
        ocr_extraction: page map accepted by ``CustomTextSplitter``.

    Returns:
        pandas.DataFrame with ``content``, ``chunk_id``, ``title``,
        ``title_vector`` and ``content_vector``.
    """
    print("------------------------------------------------------------------------")
    print("Step 2: Extracting Markdown Output into DataFrame")
    start = time.time()
    extracted_dataframe = CustomTextSplitter(ocr_extraction)
    extracted_dataframe['char_len'] = extracted_dataframe['content'].apply(len)
    extracted_dataframe['token_len'] = extracted_dataframe['content'].apply(NumberTokens)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 3: Create Title of each Section")
    start = time.time()
    extracted_dataframe['title'] = extracted_dataframe['content'].apply(GetTitle)
    # Same guard as ProcessPage: a missing title must not reach GenerateVector.
    extracted_dataframe['title'] = extracted_dataframe['title'].fillna("none")
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 4: Vectorizing title")
    start = time.time()
    extracted_dataframe['title_vector'] = extracted_dataframe['title'].apply(GenerateVector)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 5: Vectorizing content")
    start = time.time()
    extracted_dataframe['content_vector'] = extracted_dataframe['content'].apply(GenerateVector)
    end = time.time()
    print("--> Total time: ", end-start)

    print("------------------------------------------------------------------------")
    print("Step 6: Generating Unique ID and Dropping Unnecesary Columns")
    start = time.time()
    # chunk_id already comes from CustomTextSplitter; only the helpers go.
    extracted_dataframe = extracted_dataframe.drop(columns = ['char_len','token_len'], errors = 'ignore')
    end = time.time()
    print("--> Total time: ", end-start)

    return extracted_dataframe

In [48]:
def ProcessPage(ocr_extraction):
    """Turn a page map into an index-ready DataFrame with one row per page.

    Steps (each one prints a banner and its wall-clock duration):
    load the page map into a frame without its ``offset`` column, ask
    ``GetTitle`` for a title per page (missing titles become "none"), embed
    the titles, embed the contents, then copy ``page_num`` into ``chunk_id``
    and drop the helper columns.

    Args:
        ocr_extraction: list of page dicts with ``page_num``, ``content`` and
            ``offset`` keys.

    Returns:
        pandas.DataFrame with ``page_num``, ``content``, ``title``,
        ``title_vector``, ``content_vector`` and ``chunk_id``.
    """
    rule = "------------------------------------------------------------------------"

    def _open_step(label):
        # Print the banner of a step and hand back its start timestamp.
        print(rule)
        print(label)
        return time.time()

    def _close_step(began_at):
        # Report the elapsed wall-clock time of the step that began at began_at.
        finished_at = time.time()
        print("--> Total time: ", finished_at - began_at)

    began = _open_step("Step 2: Extracting Markdown Output into DataFrame")
    extracted_dataframe = pd.DataFrame(ocr_extraction).drop(columns=['offset'])
    extracted_dataframe['char_len'] = extracted_dataframe['content'].apply(len)
    extracted_dataframe['token_len'] = extracted_dataframe['content'].apply(NumberTokens)
    _close_step(began)

    began = _open_step("Step 3: Create Title of each Section")
    extracted_dataframe['title'] = extracted_dataframe['content'].apply(GetTitle)
    extracted_dataframe['title'] = extracted_dataframe['title'].fillna("none")
    _close_step(began)

    began = _open_step("Step 4: Vectorizing title")
    extracted_dataframe['title_vector'] = extracted_dataframe['title'].apply(GenerateVector)
    _close_step(began)

    began = _open_step("Step 5: Vectorizing content")
    extracted_dataframe['content_vector'] = extracted_dataframe['content'].apply(GenerateVector)
    _close_step(began)

    began = _open_step("Step 6: Generating Unique ID and Dropping Unnecesary Columns")
    extracted_dataframe['chunk_id'] = extracted_dataframe['page_num']
    extracted_dataframe = extracted_dataframe.drop(columns=['char_len', 'token_len'], errors='ignore')
    _close_step(began)

    return extracted_dataframe

In [49]:
# Driver cell: run every PDF in the raw-data folder through OCR and one of the
# three pre-processing pipelines, then store the result as a parquet file.
relative_path = "../../data/raw/"
# Keep PDFs only (other directory entries used to break the name parsing) and
# sort them so repeated runs process the files in the same order.
files = sorted(name for name in os.listdir(relative_path) if name.lower().endswith(".pdf"))

# Pipeline switches: Markdown wins; otherwise PageIndex picks page- or chunk-level rows.
Markdown = False
PageIndex = True

for Filename in files:

    # Strip just the trailing extension; the previous pattern r".pdf" matched
    # any character followed by "pdf" anywhere in the name.
    cleaned_filename = os.path.splitext(Filename)[0]

    # Expected name layout: <form>-<ticker>-<p2>-<p3>-<p4>.pdf
    name_parts = cleaned_filename.split("-")
    if len(name_parts) < 5:
        print("Skipping file with unexpected name: ", Filename)
        continue
    form_type = name_parts[0]
    ticker = name_parts[1]
    # NOTE(review): this emits <p4>-<p3>-<p2>. If file names are written as
    # MM-DD-YYYY the result is YYYY-DD-MM rather than ISO YYYY-MM-DD -- confirm
    # the naming convention before relying on filing_date.
    filing_date = name_parts[4]+"-"+name_parts[3]+"-"+name_parts[2]
    print("Working on: ", Filename)

    print("------------------------------------------------------------------------")
    print("Step 1: OCR Extraction using Document Intelligence")
    start = time.time()
    ocr_extraction = OcrExtractionDI(relative_path = os.path.join(relative_path, Filename), Markdown=Markdown)
    end = time.time()
    print("--> Total time: ", end-start)

    if Markdown:
        outout_dataframe = ProcessMD(ocr_extraction)
        outout_dataframe['preprocessing_pipeline'] = "DI_MD_MarkDownTextSplitter"
    elif PageIndex:
        outout_dataframe = ProcessPage(ocr_extraction)
        outout_dataframe['preprocessing_pipeline'] = "DI_Text_HTML_PageSplitter"
    else:
        outout_dataframe = ProcessChunk(ocr_extraction)
        outout_dataframe['preprocessing_pipeline'] = "DI_Text_HTML_CustomTextSplitter"

    outout_dataframe['filename'] = cleaned_filename
    outout_dataframe['chunk_id'] = outout_dataframe['filename'].astype(str)+"-chunk-id-"+outout_dataframe['chunk_id'].astype(str)
    # The reporting period is extracted from the first row only; .iloc makes
    # that positional instead of depending on an index label equal to 0.
    outout_dataframe['filing_period'] = GetReportPeriod(outout_dataframe['content'].iloc[0])
    outout_dataframe['filing_date'] = filing_date
    outout_dataframe['form_type'] = form_type
    outout_dataframe['ticker'] = ticker

    # The output name keeps the original file name, extension included.
    outout_dataframe.to_parquet("../../data/processed/files/"+Filename+".parquet")

Working on:  10K-AMZN-02-03-2023.pdf
------------------------------------------------------------------------
Step 1: OCR Extraction using Document Intelligence
--> Total time:  20.321908712387085
------------------------------------------------------------------------
Step 2: Extracting Markdown Output into DataFrame
--> Total time:  0.04969310760498047
------------------------------------------------------------------------
Step 3: Create Title of each Section
--> Total time:  20.79076075553894
------------------------------------------------------------------------
Step 4: Vectorizing title
--> Total time:  2.83966064453125
------------------------------------------------------------------------
Step 5: Vectorizing content
--> Total time:  5.416094541549683
------------------------------------------------------------------------
Step 6: Generating Unique ID and Dropping Unnecesary Columns
--> Total time:  0.0005152225494384766
Working on:  10K-AMZN-02-04-2022.pdf
-------------------