### Step_0: Import libraries

In [1]:
import argparse
import base64
import glob
import html
import io
import os
import re
import tempfile
import time
from typing import Any, Optional, Union
import json

from dotenv import load_dotenv

import openai
import tiktoken
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential, TokenCredential
from azure.identity import AzureDeveloperCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from azure.storage.blob import BlobServiceClient
from azure.storage.filedatalake import (
    DataLakeServiceClient,
)
from pypdf import PdfReader, PdfWriter
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

### Step_1: Parse information out of document

In [2]:
# Configure environment variables  
load_dotenv()  

endpoint = os.getenv("AZURE_DOC_INTELLIGENCE_ENDPOINT") 
key = os.getenv("AZURE_DOC_INTELLIGENCE_KEY") 

In [3]:

def get_document_text(filename):
    offset = 0
    page_map = []

    #if args.verbose:
    print(f"Extracting text from '{filename}' using Azure Document Intelligence")
    form_recognizer_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(key),
        headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"},
    )
    with open(filename, "rb") as f:
        poller = form_recognizer_client.begin_analyze_document("prebuilt-layout", document=f)
    form_recognizer_results = poller.result()

    for page_num, page in enumerate(form_recognizer_results.pages):
        tables_on_page = [
            table
            for table in (form_recognizer_results.tables or [])
            if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
        ]

        # mark all positions of the table spans in the page
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # replace all table spans with "table_id" in table_chars array
                for i in range(span.length):
                    idx = span.offset - page_offset + i
                    if idx >= 0 and idx < page_length:
                        table_chars[idx] = table_id

        # build page text by replacing characters in table spans with table html
        page_text = ""
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                page_text += form_recognizer_results.content[page_offset + idx]
            elif table_id not in added_tables:
                page_text += table_to_html(tables_on_page[table_id])
                added_tables.add(table_id)

        page_text += " "
        page_map.append((page_num, offset, page_text))
        offset += len(page_text)

    return page_map

def table_to_html(table):
    table_html = "<table>"
    rows = [
        sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
        for i in range(table.row_count)
    ]
    for row_cells in rows:
        table_html += "<tr>"
        for cell in row_cells:
            tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td"
            cell_spans = ""
            if cell.column_span > 1:
                cell_spans += f" colSpan={cell.column_span}"
            if cell.row_span > 1:
                cell_spans += f" rowSpan={cell.row_span}"
            table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
        table_html += "</tr>"
    table_html += "</table>"
    return table_html

#### Step_1_1: Run parser

In [4]:
filenames = ['MICROSOFT-10K-FY2023-Q3']

In [5]:
path_to_sample_documents = os.path.abspath(
    os.path.join(
        "raw_files/MICROSOFT-10Q-FY2023-Q3.pdf",
    )
)

In [6]:
page_map  = get_document_text(path_to_sample_documents)

Extracting text from 'c:\Users\jomedin\Documents\AI-Experimentation\RAG_Hackathon\RAG_process\raw_files\MICROSOFT-10Q-FY2023-Q3.pdf' using Azure Document Intelligence


In [7]:
text = ""
for page in page_map:
    text += page[2] + "\n"

In [8]:
new_page_map = [0,page_map[-1][1], text]

### Step_2: Create Sections
* Chunk files by using **MAX_SECTION_LENGTH, SENTENCE_SEARCH_LIMIT and SECTION_OVERLAP**
* Create embeddings **deployment_name_embedding**

In [9]:
import openai


load_dotenv()  

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  

use_vectors=True
deployment_name_embedding = "text-embedding-ada-002" 
filename = path_to_sample_documents

In [10]:
"\n".join(p[2] for p in page_map[:2])

':selected: X QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549\nFORM 10-Q\nOR\nto :unselected: TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From\nCommission File Number 001-37845\nMICROSOFT CORPORATION\nWASHINGTON (STATE OF INCORPORATION)\n91-1144442\n(I.R.S. ID)\nONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading Symbol MSFT MSFT MSFT\nName of exchange on which registered\nNASDAQ\nNASDAQ\nNASDAQ\nCommon stock, $0.00000625 par value per share 3.125% Notes due 2028 2.625% Notes due 2033\nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act o

In [11]:
MAX_SECTION_LENGTH = 8000
SENTENCE_SEARCH_LIMIT = 100
SECTION_OVERLAP = 1000

def split_text(page_map, filename):
    SENTENCE_ENDINGS = [".", "!", "?"]
    WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
    print(f"Splitting '{filename}' into sections")

    def find_page(offset):
        num_pages = len(page_map)
        for i in range(num_pages - 1):
            if offset >= page_map[i][1] and offset < page_map[i + 1][1]:
                return i
        return num_pages - 1

    all_text = "\n".join(p[2] for p in page_map)
    length = len(all_text)
    start = 0
    end = length
    while start + SECTION_OVERLAP < length:
        last_word = -1
        end = start + MAX_SECTION_LENGTH

        if end > length:
            end = length
        else:
            # Try to find the end of the sentence
            while (
                end < length
                and (end - start - MAX_SECTION_LENGTH) < SENTENCE_SEARCH_LIMIT
                and all_text[end] not in SENTENCE_ENDINGS
            ):
                if all_text[end] in WORDS_BREAKS:
                    last_word = end
                end += 1
            if end < length and all_text[end] not in SENTENCE_ENDINGS and last_word > 0:
                end = last_word  # Fall back to at least keeping a whole word
        if end < length:
            end += 1

        # Try to find the start of the sentence or at least a whole word boundary
        last_word = -1
        while (
            start > 0
            and start > end - MAX_SECTION_LENGTH - 2 * SENTENCE_SEARCH_LIMIT
            and all_text[start] not in SENTENCE_ENDINGS
        ):
            if all_text[start] in WORDS_BREAKS:
                last_word = start
            start -= 1
        if all_text[start] not in SENTENCE_ENDINGS and last_word > 0:
            start = last_word
        if start > 0:
            start += 1

        section_text = all_text[start:end]
        yield (section_text, find_page(start))

        last_table_start = section_text.rfind("<table")
        if last_table_start > 2 * SENTENCE_SEARCH_LIMIT and last_table_start > section_text.rfind("</table"):
            # If the section ends with an unclosed table, we need to start the next section with the table.
            # If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
            # If last table starts inside SECTION_OVERLAP, keep overlapping
            print(f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}")
            start = min(end - SECTION_OVERLAP, start + last_table_start)
        else:
            start = end - SECTION_OVERLAP

    if start + SECTION_OVERLAP < end:
        yield (all_text[start:end], find_page(start))

def filename_to_id(filename):
    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    filename_hash = base64.b16encode(filename.encode("utf-8")).decode("ascii")
    return f"file-{filename_ascii}-{filename_hash}"


def create_sections(
    filename, page_map, use_vectors, deployment_name_embedding: Optional[str] = None, embedding_model: Optional[str] = None, company =None, form_type =None,fiscal_year=None,quarter = None
):
    file_id = filename_to_id(filename)
    for i, (content, pagenum) in enumerate(split_text(page_map, filename)):
        section = {
            "id": f"{file_id}-page-{i}",
            "content": content,
            "category": None,
            "sourcepage": blob_name_from_file_page(filename, pagenum),
            "sourcefile": filename,
            "companyname": company,
            "formtype": form_type,
            "fiscalyear": fiscal_year,
            "quarter": quarter
        }
        if use_vectors:
            section["embedding"] = compute_embedding(content, deployment_name_embedding, embedding_model)
        yield section


def before_retry_sleep(retry_state):
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")

@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def compute_embedding(text, deployment_name_embedding, embedding_model: Optional[str] = None):
    #refresh_openai_token()
    #embedding_args = {"deployment_id": deployment_name_embedding} if args.openaihost != "openai" else {}
    return openai.Embedding.create(engine=deployment_name_embedding, input=text)["data"][0]["embedding"]

def blob_name_from_file_page(filename, page=0):
    if os.path.splitext(filename)[1].lower() == ".pdf":
        return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
    else:
        return os.path.basename(filename)

AttributeError: module 'openai' has no attribute 'error'

#### Step_2_1: Creating chunks

In [None]:
chunks =[section for section in split_text(page_map, filename)]

Splitting 'c:\Users\jomedin\Documents\AI-Experimentation\RAG_Hackathon\RAG_process\raw_files\MICROSOFT-10Q-FY2023-Q3.pdf' into sections
Section ends with unclosed table, starting next section with the table at page 0 offset 0 table start 7760
Section ends with unclosed table, starting next section with the table at page 3 offset 6974 table start 7910
Section ends with unclosed table, starting next section with the table at page 8 offset 21123 table start 7858
Section ends with unclosed table, starting next section with the table at page 9 offset 28187 table start 7435
Section ends with unclosed table, starting next section with the table at page 14 offset 42322 table start 7371
Section ends with unclosed table, starting next section with the table at page 16 offset 49415 table start 8126
Section ends with unclosed table, starting next section with the table at page 24 offset 70708 table start 6545
Section ends with unclosed table, starting next section with the table at page 26 offset 

In [None]:
chunks[0]

(':selected: X QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549\nFORM 10-Q\nOR\nto :unselected: TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From\nCommission File Number 001-37845\nMICROSOFT CORPORATION\nWASHINGTON (STATE OF INCORPORATION)\n91-1144442\n(I.R.S. ID)\nONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading Symbol MSFT MSFT MSFT\nName of exchange on which registered\nNASDAQ\nNASDAQ\nNASDAQ\nCommon stock, $0.00000625 par value per share 3.125% Notes due 2028 2.625% Notes due 2033\nIndicate by check mark whether the registrant (1) has filed all reports required to be filed by Section 13 or 15(d) of the Securities Exchange Act 

#### Step_2_2: Creating sections

In [None]:
sections = create_sections(
    os.path.basename(path_to_sample_documents),
    page_map,
    use_vectors,
    deployment_name_embedding,
)

In [None]:
sections =[section for section in sections]

Splitting 'MICROSOFT-10Q-FY2023-Q3.pdf' into sections
Section ends with unclosed table, starting next section with the table at page 0 offset 0 table start 7760
Section ends with unclosed table, starting next section with the table at page 3 offset 6974 table start 7910
Section ends with unclosed table, starting next section with the table at page 8 offset 21123 table start 7858
Section ends with unclosed table, starting next section with the table at page 9 offset 28187 table start 7435
Section ends with unclosed table, starting next section with the table at page 14 offset 42322 table start 7371
Section ends with unclosed table, starting next section with the table at page 16 offset 49415 table start 8126
Section ends with unclosed table, starting next section with the table at page 24 offset 70708 table start 6545
Section ends with unclosed table, starting next section with the table at page 26 offset 77161 table start 7422
Section ends with unclosed table, starting next section wit

#### Step_2_3: Creating keywordfields

In [None]:
import openai
from alive_progress import alive_bar

deployment_name_embedding = "text-embedding-ada-002"
filename = path_to_sample_documents

def before_retry_sleep(retry_state):
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")

@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def get_keywords_and_phrases(content):
    query = "can you extract a description of the following text? text:" + content#table_html
    messages = [{"role":"system","content":"You are an investment adivsor that reads information from SEC filings, such as 10K and 10Q. please be concise, please only provide a brief description with no explanation or detail"}, 
               {"role":"user","content":query}]

    response = openai.ChatCompletion.create(engine="gpt4t",  
                                        messages = messages, 
                                        temperature=0.5,  
                                        max_tokens=1000,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)
    raw_extract = response.choices[0].message.content
    print(raw_extract)
    #json_load = json.loads(raw_extract)
    #print(json_load)
    #categories = json_load['categories']
    key_phrases = raw_extract
    #print(key_phrases)
    
    return key_phrases

def add_kwords_kphrases(sections):
    
    enriched_sections = []
    
    with alive_bar(len(sections)) as bar:
        for section_l in sections:

            output = get_keywords_and_phrases(section_l['content'])

            section_l['description'] = output
            section_l["description_embedding"] = compute_embedding(section_l['content'], deployment_name_embedding)
 

            enriched_sections.append(section_l)
            bar()

    return enriched_sections


get_keywords_and_phrases(sections[1]['content'])

In [None]:
%%time
add_kwords_kphrases(sections[:1])

on 0: This text is a Form 10-Q quarterly report filed by Microsoft Corporation with the SEC for the quarter ended March 31, 2023. It includes financial statements such as income statements, comprehensive income statements, balance sheets, and cash flow statements for the three and nine months ended March 31, 2023, and 2022. The report details Microsoft's revenues, costs, operating income, net income, and comprehensive income, along with basic and diluted earnings per share. It also provides information on the company's assets, including cash and cash equivalents and short-term investments. Microsoft is identified as a large accelerated filer and has confirmed compliance with SEC filing requirements. The company's common stock is listed on NASDAQ with a par value of $0.00000625 per share, and as of April 20, 2023, there were 7,435,487,575 shares outstanding.
|████████████████████████████████████████| 1/1 [100%] in 13.7s (0.07/s) 
CPU times: total: 15.6 ms
Wall time: 13.7 s


[{'id': 'file-MICROSOFT-10Q-FY2023-Q3_pdf-4D4943524F534F46542D3130512D4659323032332D51332E706466-page-0',
  'content': ':selected: X QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549\nFORM 10-Q\nOR\nto :unselected: TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From\nCommission File Number 001-37845\nMICROSOFT CORPORATION\nWASHINGTON (STATE OF INCORPORATION)\n91-1144442\n(I.R.S. ID)\nONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor\nSecurities registered pursuant to Section 12(b) of the Act:\nTitle of each class\nTrading Symbol MSFT MSFT MSFT\nName of exchange on which registered\nNASDAQ\nNASDAQ\nNASDAQ\nCommon stock, $0.00000625 par value per share 3.125% Notes due 2028 2.625% Notes due 2033\nIndicate by check mark whethe

In [None]:
## GPT3516k: 1min6.6secs
## GPT4: 9min 58secs
## GPT4o: 1min 54secs

In [None]:
%%time
enriched_sections = add_kwords_kphrases(sections)

on 0: This text is a Form 10-Q quarterly report filed by Microsoft Corporation for the period ended March 31, 2023. It includes financial statements such as income statements, comprehensive income statements, balance sheets, and statements of cash flows for the three and nine months ended March 31, 2023, and 2022. The report details Microsoft's revenues, costs, operating income, net income, and comprehensive income, along with assets, liabilities, and stockholders' equity as of March 31, 2023, and June 30, 2022. Microsoft is identified as a large accelerated filer and has confirmed compliance with SEC filing requirements. The company's common stock is listed on NASDAQ under the symbol MSFT, and it has reported having 7,435,487,575 shares of common stock outstanding as of April 20, 2023.
on 1: The provided text includes financial data and statements from a company's SEC filings, covering comprehensive income, balance sheets, cash flow statements, and stockholders' equity statements. 
  

### Step_3: Putting all together

In [None]:
filenames = os.listdir("raw_files")[:-3]

#### Step_3_1: Generating all chunks

In [None]:
all_sections = []
for single_filename in filenames:
    local_path = os.path.abspath(
    os.path.join(
        "raw_files/"+str(single_filename),))
    print("----------------------------------------------------------------------------")
    print("Processing following file: ", local_path)
     
    company = single_filename.split("-")[1]
    form_type = single_filename.split("-")[0]
    #fiscal_year = single_filename.split("-")[2]
    #quarter = single_filename.split("-")[3]
    
    print("Creating filemap, doing OCR to extract text and tables using Document Intelligence")
    page_map  = get_document_text(local_path)
    
    print("Creating Sections (chunks) and embeddings")
    section_generator = create_sections(
        os.path.basename(single_filename),
        page_map,
        use_vectors,
        deployment_name_embedding,
        company=company,
        form_type = form_type,
        #fiscal_year= fiscal_year,
        #quarter= quarter)
        )
    all_sections = all_sections+[single_section for single_section in section_generator]
    print("File Processed")

----------------------------------------------------------------------------
Processing following file:  c:\Users\jomedin\Documents\AI-Experimentation\RAG_Hackathon\RAG_process\raw_files\10K-AMZN-02-03-2023.pdf
Creating filemap, doing OCR to extract text and tables using Document Intelligence
Extracting text from 'c:\Users\jomedin\Documents\AI-Experimentation\RAG_Hackathon\RAG_process\raw_files\10K-AMZN-02-03-2023.pdf' using Azure Document Intelligence
Creating Sections (chunks) and embeddings
Splitting '10K-AMZN-02-03-2023.pdf' into sections
Section ends with unclosed table, starting next section with the table at page 33 offset 141362 table start 4734
Section ends with unclosed table, starting next section with the table at page 35 offset 146043 table start 7605
Section ends with unclosed table, starting next section with the table at page 50 offset 209743 table start 8048
Section ends with unclosed table, starting next section with the table at page 59 offset 245119 table start 7775

#### Step_3_2: Enriching Sections

In [None]:
enriched_sections = add_kwords_kphrases(all_sections)

on 0: This text is the Table of Contents of a Form 10-K filing submitted to the SEC by Amazon.com, Inc. The filing includes information about the company's business, risk factors, financial statements, and other relevant details.
on 1: The company is guided by four principles: customer obsession, passion for invention, commitment to operational excellence, and long-term thinking. They serve various customer sets including consumers, sellers, developers, enterprises, content creators, advertisers, and employees. The company operates in three segments: North America, International, and Amazon Web Services (AWS). They serve consumers through online and physical stores, offering a wide selection, competitive prices, and convenience. They also manufacture and sell electronic devices and develop media content. They offer programs for sellers to grow their businesses and fulfill orders through the company. They serve developers and enterprises through AWS, providing a range of technology serv

In [None]:
enriched_sections[0]['description']

"This text is the Table of Contents of a Form 10-K filing submitted to the SEC by Amazon.com, Inc. The filing includes information about the company's business, risk factors, financial statements, and other relevant details."

In [None]:
enriched_sections[0]

{'id': 'file-10K-AMZN-02-03-2023_pdf-31304B2D414D5A4E2D30322D30332D323032332E706466-page-0',
 'content': 'Table of Contents\nUNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549\nFORM 10-K\n(Mark One) :selected: ☒ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the fiscal year ended December 31, 2022\nor :unselected: ☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\nFor the transition period from to\n.\nCommission File No. 000-22513\nAMAZON.COM, INC.\n(Exact name of registrant as specified in its charter)\nDelaware (State or other jurisdiction of incorporation or organization)\nTitle of Each Class Common Stock, par value $.01 per share\n91-1646860 (I.R.S. Employer Identification No.)\n410 Terry Avenue North Seattle, Washington 98109-5210 (206) 266-1000\n(Address and telephone number, including area code, of registrant\'s principal executive offices) Securities registered pursuant to Se

In [None]:
# Output embeddings to docVectors.json file
with open("output/enrichedVectors.json", "w") as f:
    json.dump(enriched_sections, f)