### Part 0. Loading libraries

In [1]:
import argparse
import base64
import glob
import html
import io
import os
import re
import tempfile
import time
from typing import Any, Optional, Union
import json
import pandas as pd
from dotenv import load_dotenv
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, DocumentAnalysisFeature
from azure.ai.documentintelligence.models import DocumentTable

import openai
import tiktoken
from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential, TokenCredential
from azure.identity import AzureDeveloperCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    HnswParameters,
    PrioritizedFields,
    SearchableField,
    SearchField,
    SearchFieldDataType,
    SearchIndex,
    SemanticConfiguration,
    SemanticField,
    SemanticSettings,
    SimpleField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
)
from azure.storage.blob import BlobServiceClient
from azure.storage.filedatalake import (
    DataLakeServiceClient,
)
from pypdf import PdfReader, PdfWriter
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

  from pandas.core import (


In [3]:
# Configure environment variables  
load_dotenv()  

endpoint = os.environ["AZURE_DOC_INTELLIGENCE_ENDPOINT"]
key = os.environ["AZURE_DOC_INTELLIGENCE_KEY"]

In [4]:
def filename_to_id(filename):
    filename_ascii = re.sub("[^0-9a-zA-Z_-]", "_", filename)
    filename_hash = base64.b16encode(filename.encode("utf-8")).decode("ascii")
    return f"file-{filename_ascii}-{filename_hash}"

In [5]:
def get_document_text(filename):
    offset = 0
    page_map = []
    page_map_dict =[]

    #if args.verbose:
    print(f"Extracting text from '{filename}' using Azure Document Intelligence")
    document_intelligence_client = DocumentIntelligenceClient(endpoint=endpoint, credential=AzureKeyCredential(key), api_version="2023-10-31-preview")
    
    
    with open(filename, "rb") as f:
            poller = document_intelligence_client.begin_analyze_document("prebuilt-layout", 
                                                                 analyze_request=f, content_type="application/octet-stream", 
                                                                 #output_content_format=ContentFormat.MARKDOWN,
                                                                 features=[DocumentAnalysisFeature.KEY_VALUE_PAIRS])
    form_recognizer_results = poller.result()

    for page_num, page in enumerate(form_recognizer_results.pages):
        tables_on_page = [
            table
            for table in (form_recognizer_results.tables or [])
            if table.bounding_regions and table.bounding_regions[0].page_number == page_num + 1
        ]
        #print(tables_on_page)

        # mark all positions of the table spans in the page
        page_offset = page.spans[0].offset
        page_length = page.spans[0].length
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                # replace all table spans with "table_id" in table_chars array
                for i in range(span.length):
                    idx = span.offset - page_offset + i
                    if idx >= 0 and idx < page_length:
                        table_chars[idx] = table_id

        # build page text by replacing characters in table spans with table html
        page_text = ""
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                page_text += form_recognizer_results.content[page_offset + idx]
            elif table_id not in added_tables:
                page_text += table_to_html(tables_on_page[table_id])
                added_tables.add(table_id)

        page_text += " "
        page_map.append((page_num+1, offset, page_text))

        file_id = filename_to_id(filename)

        single_page_dict = {}
        single_page_dict['page_num']= page_num+1
        single_page_dict['content'] = page_text
        single_page_dict['id'] = f"{file_id}-page-{page_num+1}"
        #single_page_dict['sourcefile'] = filename
        page_map_dict.append(single_page_dict)

        offset += len(page_text)

        # Extracting information of key value pairs
        keyvalues = []
        for keyvalue in form_recognizer_results.key_value_pairs:
            if keyvalue.value:
                if keyvalue.value.content !='$':
                    if keyvalue.value.content.lower() !='the':
                        keyval_dict = {}
                        keyval_dict['page_num'] = keyvalue.key.bounding_regions[0].page_number
                        keyval_dict['key'] = keyvalue.key.content
                        keyval_dict['value'] = keyvalue.value.content
                        key_value= keyvalue.key.content + ": " + str(keyvalue.value.content)
                        key_value = re.sub(r'(.)\1+', r'\1', key_value)
                        keyval_dict['key_value'] = key_value
            #print(keyval_dict)
            keyvalues.append(keyval_dict)
            keyvalues =[i for n, i in enumerate(keyvalues) if i not in keyvalues[n + 1:]]

    return page_map_dict,keyvalues

def table_to_html(table: DocumentTable):
    table_html = "<table>"
    rows = [
        sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index)
        for i in range(table.row_count)
    ]
    for row_cells in rows:
        table_html += "<tr>"
        for cell in row_cells:
            tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td"
            cell_spans = ""
            if cell.column_span is not None and cell.column_span > 1:
                cell_spans += f" colSpan={cell.column_span}"
            if cell.row_span is not None and cell.row_span > 1:
                cell_spans += f" rowSpan={cell.row_span}"
            table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
        table_html += "</tr>"
    table_html += "</table>"
    return table_html

In [6]:
path_to_sample_documents = os.path.abspath(
    os.path.join(
        "MICROSOFT-10Q-FY2023-Q3.pdf",
    )
)

extraction,keyvalues = get_document_text(path_to_sample_documents)

In [8]:
import numpy as np
df_corpus = pd.DataFrame(extraction)
df_keyvalues = pd.DataFrame(keyvalues)

kv_page = []
for page_number in df_keyvalues.page_num.unique():
    keyval_dict = {}
    keyval_dict['page_num'] = page_number

    filtered_df = df_keyvalues[df_keyvalues['page_num']==page_number]
    keyval_dict['key'] = filtered_df['key'].to_list()
    keyval_dict['key_value'] = filtered_df['key_value'].to_list()
    kv_page.append(keyval_dict)

kv_page_df = pd.DataFrame(kv_page)
df_merged = df_corpus.merge(kv_page_df, how='left')
df_merged['key'] = df_merged['key'].fillna({i: [] for i in df_merged.index})
df_merged['key_value'] = df_merged['key_value'].fillna({i: [] for i in df_merged.index})
df_merged['page_num'] = df_merged['page_num'].astype(str)

In [9]:
df_merged

Unnamed: 0,page_num,content,id,key,key_value
0,1,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,file-c__Users_jomedin_Documents_AI-Experimenta...,[QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15...,[QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15...
1,2,MICROSOFT CORPORATION FORM 10-Q For the Quarte...,file-c__Users_jomedin_Documents_AI-Experimenta...,[a) Income Statements for the Three and Nine M...,[a) Income Statements for the Thre and Nine Mo...
2,3,PART ! Item 1\nPART I. FINANCIAL INFORMATION I...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[]
3,4,PART ! Item 1\nCOMPREHENSIVE INCOME STATEMENTS...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[]
4,5,PART I Item 1\nBALANCE SHEETS\n(In millions) (...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[]
...,...,...,...,...,...
69,70,"Exhibit 31.1\nCERTIFICATION\nI, Satya Nadella,...",file-c__Users_jomedin_Documents_AI-Experimenta...,[Chief Executive Officer],[Chief Executive Oficer: Satya Nadela]
70,71,"Exhibit 31.2\nCERTIFICATION\nI, Amy E. Hood, c...",file-c__Users_jomedin_Documents_AI-Experimenta...,[Executive Vice President and Chief Financial ...,[Executive Vice President and Chief Financial ...
71,72,Exhibit 32.1\nCERTIFICATION PURSUANT TO SECTIO...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[]
72,73,Exhibit 32.2\nCERTIFICATION PURSUANT TO SECTIO...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[]


In [10]:
import openai
from typing import Any, Optional, Union
from tenacity import (
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_random_exponential,
)

load_dotenv()  

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  

deployment_name_embedding = "text-embedding-3-small" 

def before_retry_sleep(retry_state):
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")

@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def compute_embedding(text):
    #refresh_openai_token()
    #embedding_args = {"deployment_id": deployment_name_embedding} if args.openaihost != "openai" else {}
    return openai.Embedding.create(engine="text-embedding-3-small" , input=text)["data"][0]["embedding"]


df_merged['content_vector'] = df_merged['content'].apply(compute_embedding)

In [11]:
df_merged['content'][1]

"MICROSOFT CORPORATION FORM 10-Q For the Quarter Ended March 31, 2023 INDEX\nPage\nPART I. FINANCIAL INFORMATION\nItem 1. Financial Statements\na) Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 3\nb) Comprehensive Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 4\nc) Balance Sheets as of March 31, 2023 and June 30, 2022 5\nd) Cash Flows Statements for the Three and Nine Months Ended March 31, 2023 and 2022 6\ne) Stockholders' Equity Statements for the Three and Nine Months Ended March 31, 2023 and 2022 7\nf) Notes to Financial Statements 8\ng) Report of Independent Registered Public Accounting Firm 31\nItem 2. Management's Discussion and Analysis of Financial Condition and Results of Operations 32\nItem 3. Quantitative and Qualitative Disclosures About Market Risk 50\nItem 4. Controls and Procedures 50\nPART II. OTHER INFORMATION\n<table><tr><td>Item 1.</td><td>Legal Proceedings</td></tr><tr><td>Item 1A.</td><td>Risk Factors<

In [12]:
def get_page_summary(content):
    query = "Can you extract the key points out of the following file in a list? if you find a table, please also parse the nested tables. Please make it easy for a search engine to find this information.if you find codes and descriptions, please separate them" + content#table_html
    messages = [{"role":"system","content":"You are an agent that works with policy files"}, 
               {"role":"user","content":query}]

    response = openai.ChatCompletion.create(engine="gpt4o",  
                                        messages = messages, 
                                        temperature=0.2,  
                                        max_tokens=1000,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)
    raw_extract = response.choices[0].message.content
    #print(raw_extract)
    #json_load = json.loads(raw_extract)
    #print(json_load)
    #categories = json_load['categories']
    key_phrases = raw_extract
    #print(key_phrases)
    
    return key_phrases

In [19]:
df_merged = df_merged[df_merged['content']!=" "]

In [20]:
df_merged['summary'] = df_merged['content'].apply(get_page_summary)
df_merged['summary_vector'] = df_merged['summary'].apply(compute_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['summary'] = df_merged['content'].apply(get_page_summary)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['summary_vector'] = df_merged['summary'].apply(compute_embedding)


In [21]:
def get_title(content):
    query = "Please identify the title of the page. Only provide the title without additional text" + content#table_html
    messages = [{"role":"system","content":"You are an agent that works with policy files"}, 
               {"role":"user","content":query}]

    response = openai.ChatCompletion.create(engine="gpt4o",  
                                        messages = messages, 
                                        temperature=0.2,  
                                        max_tokens=1000,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)
    raw_extract = response.choices[0].message.content
    #print(raw_extract)
    #json_load = json.loads(raw_extract)
    #print(json_load)
    #categories = json_load['categories']
    key_phrases = raw_extract
    #print(key_phrases)
    
    return key_phrases

In [22]:
df_merged['title'] = df_merged['content'].apply(get_title)
df_merged['title_vector'] = df_merged['title'].apply(compute_embedding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['title'] = df_merged['content'].apply(get_title)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['title_vector'] = df_merged['title'].apply(compute_embedding)


In [23]:
# Output embeddings to docVectors.json file
with open("enrichedVectors.json", "w") as f:
    json.dump(df_merged.to_dict(orient = "records"), f)

In [24]:
df_merged['summary'][1]

"Here are the key points extracted from the Microsoft Corporation Form 10-Q for the Quarter Ended March 31, 2023:\n\n### PART I. FINANCIAL INFORMATION\n\n1. **Item 1. Financial Statements**\n   - a) Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 (Page 3)\n   - b) Comprehensive Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 (Page 4)\n   - c) Balance Sheets as of March 31, 2023 and June 30, 2022 (Page 5)\n   - d) Cash Flows Statements for the Three and Nine Months Ended March 31, 2023 and 2022 (Page 6)\n   - e) Stockholders' Equity Statements for the Three and Nine Months Ended March 31, 2023 and 2022 (Page 7)\n   - f) Notes to Financial Statements (Page 8)\n   - g) Report of Independent Registered Public Accounting Firm (Page 31)\n\n2. **Item 2. Management's Discussion and Analysis of Financial Condition and Results of Operations** (Page 32)\n\n3. **Item 3. Quantitative and Qualitative Disclosures About Market Risk** (Page 50

In [25]:
df_merged

Unnamed: 0,page_num,content,id,key,key_value,content_vector,summary,summary_vector,title,title_vector
0,1,UNITED STATES SECURITIES AND EXCHANGE COMMISSI...,file-c__Users_jomedin_Documents_AI-Experimenta...,[QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15...,[QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15...,"[0.03833403438329697, 0.002314687939360738, 0....",Here are the key points extracted from the pro...,"[0.04412780702114105, 0.007658707443624735, 0....",FORM 10-Q,"[0.06141217052936554, 0.06183680146932602, 0.0..."
1,2,MICROSOFT CORPORATION FORM 10-Q For the Quarte...,file-c__Users_jomedin_Documents_AI-Experimenta...,[a) Income Statements for the Three and Nine M...,[a) Income Statements for the Thre and Nine Mo...,"[0.03716615214943886, 0.04110429063439369, 0.0...",Here are the key points extracted from the Mic...,"[0.0394403412938118, 0.05236594378948212, 0.03...",MICROSOFT CORPORATION FORM 10-Q For the Quarte...,"[0.048066094517707825, 0.01976948231458664, 0...."
2,3,PART ! Item 1\nPART I. FINANCIAL INFORMATION I...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[],"[-0.0018395369406789541, -0.016537902876734734...",### Key Points from the Financial Information ...,"[-0.00813346914947033, -0.015815794467926025, ...",PART I. FINANCIAL INFORMATION ITEM 1. FINANCIA...,"[0.012208127416670322, -0.015663474798202515, ..."
3,4,PART ! Item 1\nCOMPREHENSIVE INCOME STATEMENTS...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[],"[-0.00022720432025380433, 0.011946385726332664...",Here are the key points extracted from the fil...,"[0.0053397538140416145, 0.018671542406082153, ...",COMPREHENSIVE INCOME STATEMENTS,"[0.04133220389485359, 0.01480789203196764, 0.0..."
4,5,PART I Item 1\nBALANCE SHEETS\n(In millions) (...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[],"[0.0071314494125545025, 0.030603835359215736, ...",### Key Points from the Balance Sheets (Unaudi...,"[0.023321226239204407, 0.01961182989180088, 0....",BALANCE SHEETS,"[0.03111141547560692, 0.018195541575551033, 0...."
...,...,...,...,...,...,...,...,...,...,...
68,69,"Exhibit 15.1\nApril 25, 2023 The Board of Dire...",file-c__Users_jomedin_Documents_AI-Experimenta...,[],[],"[0.052164170891046524, 0.011121505871415138, 0...",### Key Points from Exhibit 15.1\n\n1. **Date ...,"[0.04429785534739494, 0.02371601015329361, 0.0...",Exhibit 15.1,"[-0.014841333031654358, 0.03249764069914818, -..."
69,70,"Exhibit 31.1\nCERTIFICATION\nI, Satya Nadella,...",file-c__Users_jomedin_Documents_AI-Experimenta...,[Chief Executive Officer],[Chief Executive Oficer: Satya Nadela],"[0.02683945931494236, 0.00965652521699667, 0.0...",### Key Points from Exhibit 31.1 CERTIFICATION...,"[0.033934757113456726, 0.014141477644443512, 0...",Exhibit 31.1,"[-0.03350350260734558, 0.027750801295042038, 0..."
70,71,"Exhibit 31.2\nCERTIFICATION\nI, Amy E. Hood, c...",file-c__Users_jomedin_Documents_AI-Experimenta...,[Executive Vice President and Chief Financial ...,[Executive Vice President and Chief Financial ...,"[0.026798756793141365, -0.00034364653402008116...",### Key Points from Exhibit 31.2 CERTIFICATION...,"[0.028141465038061142, -0.004184776917099953, ...",Exhibit 31.2,"[-0.027961766347289085, 0.03342688828706741, 0..."
71,72,Exhibit 32.1\nCERTIFICATION PURSUANT TO SECTIO...,file-c__Users_jomedin_Documents_AI-Experimenta...,[],[],"[0.052370935678482056, 0.02611520327627659, 0....",### Key Points from Exhibit 32.1\n\n1. **Certi...,"[0.050960421562194824, 0.03261365741491318, 0....",Exhibit 32.1,"[-0.023292362689971924, 0.0391208752989769, 0...."


### Part 2. Generating test set

In [28]:
import os

os.environ["OPENAI_API_KEY"] = os.getenv("AZURE_OPENAI_API_KEY")  

In [29]:
from langchain_community.document_loaders import DirectoryLoader

In [None]:
from langchain_community.document_loaders import DirectoryLoader
loader = DirectoryLoader()
documents = loader.load()