### Import required libraries and environment variables

In [929]:
# Import required libraries  
import os  
import json  
import openai  
import pandas as pd
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

  
# Configure environment variables  
# Configure environment variables  
# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_AI_SEARCH_ENDPOINT") 
index_name = "single_page_index" #os.getenv("AZURE_SEARCH_INDEX_NAME") 
key = os.getenv("AZURE_AI_SEARCH_KEY") 
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") 
model: str = "text-embedding-3-small" 
credential = AzureKeyCredential(key)

### Modifying vector

In [930]:
# Generate Document Embeddings using OpenAI Ada 002

# Read the text-sample.json
with open('enrichedVectors.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)

In [931]:
# Generate embeddings for title and content fields
for item in input_data:
    item['@search.action'] = 'upload'

# Output embeddings to docVectors.json file
with open("docVectors.json", "w") as f:
    json.dump(input_data, f)

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [932]:
for i in input_data[:2]:
    print("-----------------------------------------------")
    print(i['content'])

-----------------------------------------------
UNITED STATES SECURITIES AND EXCHANGE COMMISSION Washington, D.C. 20549
FORM 10-Q :selected: QUARTERLY REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Quarterly Period Ended March 31, 2023
OR :unselected: TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 For the Transition Period From to
Commission File Number 001-37845
MICROSOFT CORPORATION
WASHINGTON (STATE OF INCORPORATION)
91-1144442 (I.R.S. ID)
ONE MICROSOFT WAY, REDMOND, WASHINGTON 98052-6399 (425) 882-8080 www.microsoft.com/investor
<table><tr><th>Title of each class</th><th>Trading Symbol</th><th>Name of exchange on which registered</th></tr><tr><th>Common stock, $0.00000625 par value per share</th><th>MSFT</th><th>NASDAQ</th></tr><tr><td>3.125% Notes due 2028</td><td>MSFT</td><td>NASDAQ</td></tr><tr><td>2.625% Notes due 2033</td><td>MSFT</td><td>NASDAQ</td></tr></table>
Indicate by check mark whether the

In [933]:
# Upload some documents to the index
with open('docVectors.json', 'r') as file:  
    documents = json.load(file)  
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)
print(f"Uploaded {len(documents)} documents")

Uploaded 74 documents


## Perform a Semantic Hybrid Search for short context

### Getting Top N Documents

In [934]:
openai.api_type = "azure"
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = "2023-07-01-preview"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  

deployment_name_embedding = "text-embedding-3-small" 

def before_retry_sleep(retry_state):
    print("Rate limited on the OpenAI embeddings API, sleeping before retrying...")

@retry(
    retry=retry_if_exception_type(openai.error.RateLimitError),
    wait=wait_random_exponential(min=15, max=60),
    stop=stop_after_attempt(15),
    before_sleep=before_retry_sleep,
)
def compute_embedding(text):
    #refresh_openai_token()
    #embedding_args = {"deployment_id": deployment_name_embedding} if args.openaihost != "openai" else {}
    return openai.Embedding.create(engine="text-embedding-3-small" , input=text)["data"][0]["embedding"]

### Answering question from context

In [935]:
def LLM_answer(query,results_list):
    query = query + str(results_list) #table_html
    messages = [{"role":"system","content":"You are an agent that works with policy documents. Sometimes tables might need to be unpacked, as values might appear to be nested. Please be concise. Please be mindful that you can have more than one coverage. Only use information for the following sources. Please be exhaustive in your answer, group your output by source if needed and do not duplicate values. If you thin that providing a table is the best answer, please provide it\n"}, 
               {"role":"user","content":query}]
    
    print(messages)

    response = openai.ChatCompletion.create(engine="gpt4-32k",  
                                        messages = messages, 
                                        temperature=0.4,  
                                        max_tokens=1000,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)

    return response.choices[0].message.content

In [942]:
# Semantic Hybrid Search
query ="what was microsofts revenue for the nine months that ended in march 2023"
vector_value = compute_embedding(query)

## Hybrid
vector = Vector(value=vector_value, k=30, fields="content_vector,summary_vector,title_vector")
results_hybrid = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["content","key_value","page_num",'title'],
    query_type="semantic", query_language="en-us", semantic_configuration_name='boosted-semantic-config', query_caption="extractive", query_answer="extractive",
    top=5
)
results_list_hybrid = [result for result in results_hybrid]

#Vector
vector = Vector(value=vector_value, k=30, fields="content_vector, title_vector")
results_vector = search_client.search(  
    search_text=None,  
    vectors=[vector],
    select=["content","key_value","page_num",'title'],
    #query_type="semantic", query_language="en-us", semantic_configuration_name='boosted-semantic-config', query_caption="extractive", query_answer="extractive",
    top=9
)
results_list_vector = [result for result in results_vector]

#combining results
results_vector_df = pd.DataFrame(results_list_vector)
results_hybrid_df = pd.DataFrame(results_list_hybrid)
combined_results = pd.concat([results_vector_df,results_hybrid_df], ignore_index =True).drop_duplicates(subset=['page_num']).drop(columns=['@search.highlights', '@search.captions'])

combined_results['page_num'] = combined_results['page_num'].astype(int)

combined_results = combined_results.sort_values(by=['page_num']).to_dict(orient='records')

sorted_list = sorted(results_list, key=lambda x: (x['@search.reranker_score']), reverse=False)
content_list = ["title: "+ i['title'] +"\n context: " +i['content'] for i in results_list]

  combined_results = pd.concat([results_vector_df,results_hybrid_df], ignore_index =True).drop_duplicates(subset=['page_num']).drop(columns=['@search.highlights', '@search.captions'])


In [943]:
answer =  LLM_answer(query,combined_results)
print(answer)

[{'role': 'system', 'content': 'You are an agent that works with policy documents. Sometimes tables might need to be unpacked, as values might appear to be nested. Please be concise. Please be mindful that you can have more than one coverage. Only use information for the following sources. Please be exhaustive in your answer, group your output by source if needed and do not duplicate values. If you thin that providing a table is the best answer, please provide it\n'}, {'role': 'user', 'content': 'what was microsofts revenue for the nine months that ended in march 2023[{\'page_num\': 2, \'key_value\': [\'a) Income Statements for the Thre and Nine Months Ended March 31, 2023 and 202: 3\', \'b) Comprehensive Income Statements for the Thre and Nine Months Ended March 31, 2023 and 202: 5\', \'d) Cash Flows Statements for the Thre and Nine Months Ended March 31, 2023 and 202: 6\', "e) Stockholders\' Equity Statements for the Thre and Nine Months Ended March 31, 2023 and 202: 7", \'f) Notes t