### Import required libraries and environment variables

In [1]:
# Import required libraries  
import os  
import json  
import openai  
import pandas as pd
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

  
# Configure environment variables  
load_dotenv(override=True) # take environment variables from .env.

# Variables not used here do not need to be updated in your .env file
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
AZURE_SEARCH_ADMIN_CREDENTIAL = AzureKeyCredential(os.getenv("AZURE_SEARCH_ADMIN_KEY")) if os.getenv("AZURE_SEARCH_ADMIN_KEY") else DefaultAzureCredential()
index_name = "evaluation-index"

azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_openai_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small")
azure_openai_model_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-3-small")
azure_openai_model_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536))

# note: The chat deployment should support tool use
# To learn more, please see
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-models
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35
azure_openai_chat_deployment = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "gpt-4o-mini")
azure_openai_api_version = os.getenv("AZURE_OPENAI_API_VERSION", "2024-07-01-preview")

  from pandas.core import (


### Modifying vector

In [2]:
# Generate Document Embeddings using OpenAI Ada 002

# Read the text-sample.json
with open('enrichedVectors.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)

In [3]:
# Generate embeddings for title and content fields
for item in input_data:
    item['@search.action'] = 'upload'

# Output embeddings to docVectors.json file
with open("docVectors.json", "w") as f:
    json.dump(input_data, f)

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [4]:
for i in input_data[:2]:
    print("-----------------------------------------------")
    print(i['content'])

-----------------------------------------------
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549
FORM 10-Q :selected: QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023
OR :unselected: TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to
Commission File Number 001-37845
MICROSOFT CORPORATION
WASHINGTON (STATE OF INCORPORATION)
91-1144442 (I.R.S. ID)
ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor
<table><tr><th>Title of each class</th><th>Trading Symbol</th><th>Name of exchange on which registered</th></tr><tr><th>Common stock, $0.00000625 par value per share</th><th>MSFT</th><th>NASDAQ</th></tr><tr><td>3.125% Notes due 2028</td><td>MSFT</td><td>NASDAQ</td></tr><tr><td>2.625% Notes due 2033</td><td>MSFT</td><td>NASDAQ</td></tr></table>
Indicate by check mark whether the

In [5]:
# Upload some documents to the index
with open('docVectors.json', 'r') as file:  
    documents = json.load(file)  
search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_ENDPOINT, index_name=index_name, credential=AZURE_SEARCH_ADMIN_CREDENTIAL)
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents")

Uploaded 74 documents


## Perform a Semantic Hybrid Search for short context

### Getting Top N Documents

In [6]:
from openai import AzureOpenAI

openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  

deployment_name_embedding = "text-embedding-ada-002" 

client = AzureOpenAI(
  api_key = os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version = "2024-05-01-preview",
  azure_endpoint =os.getenv("AZURE_OPENAI_ENDPOINT") 
)

def compute_embedding(text):
    #refresh_openai_token()
    #embedding_args = {"deployment_id": deployment_name_embedding} if args.openaihost != "openai" else {}
    return client.embeddings.create(model="text-embedding-ada-002" , input=text).data[0].embedding

### Answering question from context

In [20]:
def LLM_answer(query,results_list):
    query = query + str(results_list) #table_html

    instructions= """
    You are an agent that works with SEC filings
    ## Very Important Instruction
        ### On Your Ability to Refuse Answering Out-of-Domain Questions
        - **Read the user's query, conversation history, and retrieved documents sentence by sentence carefully.**
        - Try your best to understand the user's query (prior conversation can provide more context, you can know what "it", "this", etc., actually refer to; ignore any requests about the desired format of the response), and assess the user's query based solely on provided documents and prior conversation.
        - Classify a query as 'in-domain' if, from the retrieved documents, you can find enough information possibly related to the user's intent which can help you generate a good response to the user's query. Formulate your response by specifically citing relevant sections.
        - For queries not upheld by the documents, or in case of unavailability of documents, categorize them as 'out-of-domain'.
        - You have the ability to answer general requests (**no extra factual knowledge needed**), e.g., formatting (list results in a table, compose an email, etc.), summarization, translation, math, etc. requests. Categorize general requests as 'in-domain'.
        - You don't have the ability to access real-time information, since you cannot browse the internet. Any query about real-time information (e.g., **current stock**, **today's traffic**, **current weather**), MUST be categorized as an **out-of-domain** question, even if the retrieved documents contain relevant information. You have no ability to answer any real-time query.
        - Think twice before you decide whether the user's query is really an in-domain question or not. Provide your reason if you decide the user's query is in-domain.
        - If you have decided the user's query is an in-domain question, then:
            * You **must generate citations for all the sentences** which you have used from the retrieved documents in your response.
            * You must generate the answer based on all relevant information from the retrieved documents and conversation history.
            * You cannot use your own knowledge to answer in-domain questions.
        - If you have decided the user's query is an out-of-domain question, then:
            * Your only response is "The requested information is not available in the retrieved data. Please try another query or topic."
        - For out-of-domain questions, you **must respond** with "The requested information is not available in the retrieved data. Please try another query or topic."

        ### On Your Ability to Do Greeting and General Chat
        - **If the user provides a greeting like "hello" or "how are you?" or casual chat like "how's your day going", "nice to meet you", you must answer with a greeting.
        - Be prepared to handle summarization requests, math problems, and formatting requests as a part of general chat, e.g., "solve the following math equation", "list the result in a table", "compose an email"; they are general chats. Please respond to satisfy the user's requirements.

        ### On Your Ability to Answer In-Domain Questions with Citations
        - Examine the provided JSON documents diligently, extracting information relevant to the user's inquiry. Forge a concise, clear, and direct response, embedding the extracted facts. Attribute the data to the corresponding document using the citation format [page_num]. Strive to achieve a harmonious blend of brevity, clarity, and precision, maintaining the contextual relevance and consistency of the original source. Above all, confirm that your response satisfies the user's query with accuracy, coherence, and user-friendly composition.
        - **You must generate a citation for all the document sources you have referred to at the end of each corresponding sentence in your response.**
        - **The citation mark [page_num] must be placed at the end of the corresponding sentence which cited the document.**
        - **Every claim statement you generate must have at least one citation.**
        """
    
    messages = [{"role":"system","content":instructions}, 
               {"role":"user","content":query}]
    
    print(messages)

    response = client.chat.completions.create(model="gpt4o",  
                                        messages = messages, 
                                        temperature=0.4,  
                                        max_tokens=1000,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)

    return response.choices[0].message.content

In [29]:
# Semantic Hybrid Search
#query ="what was microsofts revenue for the nine months that ended in march 2023"
query ="Can you provide a detailed, ordered list of all the risks mentioned on this file?"
vector_value = compute_embedding(query)

## Hybrid
vector = Vector(value=vector_value, k=30, fields="content_vector,summary_vector,title_vector")
results_hybrid = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["content","key_value","page_num",'title'],
    query_type="semantic", query_language="en-us", semantic_configuration_name='my-semantic-config', query_caption="extractive", query_answer="extractive",
    top=20
)
results_list_hybrid = [result for result in results_hybrid]

#Vector
vector = Vector(value=vector_value, k=30, fields="content_vector, title_vector")
results_vector = search_client.search(  
    search_text=None,  
    vectors=[vector],
    select=["content","key_value","page_num",'title'],
    #query_type="semantic", query_language="en-us", semantic_configuration_name='boosted-semantic-config', query_caption="extractive", query_answer="extractive",
    top=20
)
results_list_vector = [result for result in results_vector]

#combining results
results_vector_df = pd.DataFrame(results_list_vector)
results_hybrid_df = pd.DataFrame(results_list_hybrid)
combined_results = pd.concat([results_vector_df,results_hybrid_df], ignore_index =True).drop_duplicates(subset=['page_num']).drop(columns=['@search.highlights', '@search.captions'])

combined_results['page_num'] = combined_results['page_num'].astype(int)

combined_results = combined_results.sort_values(by=['page_num']).to_dict(orient='records')

sorted_list = sorted(combined_results, key=lambda x: (x['page_num']), reverse=False)
content_list = ["title: "+ i['title'] +"\n context: " +i['content'] for i in combined_results]

  combined_results = pd.concat([results_vector_df,results_hybrid_df], ignore_index =True).drop_duplicates(subset=['page_num']).drop(columns=['@search.highlights', '@search.captions'])


In [30]:
sorted_list

[{'content': "MICROSOFT CORPORATION FORM 10-Q For the Quarter Ended March 31, 2023 INDEX\nPage\nPART I. FINANCIAL INFORMATION\nItem 1. Financial Statements\na) Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 3\nb) Comprehensive Income Statements for the Three and Nine Months Ended March 31, 2023 and 2022 4\nc) Balance Sheets as of March 31, 2023 and June 30, 2022 5\nd) Cash Flows Statements for the Three and Nine Months Ended March 31, 2023 and 2022 6\ne) Stockholders' Equity Statements for the Three and Nine Months Ended March 31, 2023 and 2022 7\nf) Notes to Financial Statements 8\ng) Report of Independent Registered Public Accounting Firm 31\nItem 2. Management's Discussion and Analysis of Financial Condition and Results of Operations 32\nItem 3. Quantitative and Qualitative Disclosures About Market Risk 50\nItem 4. Controls and Procedures 50\nPART II. OTHER INFORMATION\n<table><tr><td>Item 1.</td><td>Legal Proceedings</td></tr><tr><td>Item 1A.</td><td>

In [31]:
answer =  LLM_answer(query,combined_results)
print(answer)

[{'role': 'system', 'content': '\n    You are an agent that works with SEC filings\n    ## Very Important Instruction\n        ### On Your Ability to Refuse Answering Out-of-Domain Questions\n        - **Read the user\'s query, conversation history, and retrieved documents sentence by sentence carefully.**\n        - Try your best to understand the user\'s query (prior conversation can provide more context, you can know what "it", "this", etc., actually refer to; ignore any requests about the desired format of the response), and assess the user\'s query based solely on provided documents and prior conversation.\n        - Classify a query as \'in-domain\' if, from the retrieved documents, you can find enough information possibly related to the user\'s intent which can help you generate a good response to the user\'s query. Formulate your response by specifically citing relevant sections.\n        - For queries not upheld by the documents, or in case of unavailability of documents, cate