### Import required libraries and environment variables

In [1]:
# Import required libraries  
import os  
import json  
import openai  
import pandas as pd
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

# Function to generate embeddings for title and content fields, also used for query embeddings
# Retries with a random exponential back-off (1-20 s) and gives up after 6 attempts.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, engine="text-embedding-ada-002"):
    """Return the embedding vector for `text` from the OpenAI embeddings endpoint.

    Args:
        text: The input to embed (a document field or a search query).
        engine: Name of the embedding deployment to call. Defaults to
            "text-embedding-ada-002", the value that used to be hard-coded here.

    Returns:
        The `embedding` entry of the first item in the response's `data` list.
    """
    response = openai.Embedding.create(input=text, engine=engine)
    return response['data'][0]['embedding']
  
# Configure environment variables
load_dotenv()

# Every setting below is consumed by a later cell. Fail here, naming the missing
# variables, instead of letting a None value surface as an opaque client error.
_REQUIRED_ENV_VARS = (
    "AZURE_SEARCH_SERVICE_ENDPOINT",
    "AZURE_SEARCH_INDEX_NAME",
    "AZURE_SEARCH_ADMIN_KEY",
    "AZURE_OPENAI_API_KEY",
    "AZURE_OPENAI_ENDPOINT",
    "AZURE_OPENAI_API_VERSION",
)
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.getenv(name)]
if _missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(_missing_env_vars)
    )

# Azure Cognitive Search settings
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings, stored on the openai module so every later call picks them up
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")

credential = AzureKeyCredential(key)

### Load the enriched vectors and mark them for upload

In [2]:
# Load the documents stored in output/enrichedVectors.json
# (nothing is embedded in this cell; the file is read as-is)
enriched_path = 'output/enrichedVectors.json'
with open(enriched_path, mode='r', encoding='utf-8') as enriched_file:
    input_data = json.loads(enriched_file.read())

In [3]:
# Tag every loaded document with the '@search.action' = 'upload' indexing action
for doc in input_data:
    doc['@search.action'] = 'upload'

# Persist the tagged documents to output/docVectors.json
with open("output/docVectors.json", mode="w") as out_file:
    out_file.write(json.dumps(input_data))

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [4]:
# Preview the 'description' field of the first two documents
separator = "-----------------------------------------------"
for doc in input_data[:2]:
    print(separator)
    print(doc['description'])

-----------------------------------------------
This is a Form 10-Q SEC filing for Microsoft Corporation for the quarter ended March 31, 2023. The company is registered with the NASDAQ exchange and is classified as a large accelerated filer. As of April 20, 2023, Microsoft had 7,435,487,575 shares of common stock outstanding. The report includes financial statements detailing revenue, costs, gross margin, operating income, and net income for the period. The company's total revenue for the quarter was $52,857 million, with a net income of $18,299 million. The company's assets include cash and cash equivalents of $26,562 million and short-term investments of $77,865 million.
-----------------------------------------------
The text contains financial data from various financial statements. 

1. Income statement data: Net income of $18,299 and $16,728 for two different periods. Other comprehensive income (loss) of $813 and $(2,913) for the same periods. 

2. Balance sheet data: Total asset

In [3]:
# Upload the prepared documents to the index
with open('output/docVectors.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one indexing result per document; individual
# documents can be rejected even though the call itself returns normally,
# so count the successes instead of assuming every document made it.
failed_uploads = [item for item in result if not item.succeeded]
print(f"Uploaded {len(result) - len(failed_uploads)} documents")
if failed_uploads:
    print(f"{len(failed_uploads)} of {len(documents)} documents failed to upload:")
    for item in failed_uploads:
        print(f"  key={item.key}: {item.error_message}")

## Perform a Semantic Hybrid Search for short context

### Getting Top N Documents

In [4]:
# Semantic Hybrid Search
query = "What was Microsoft’s revenue for the nine months that ended on March 31 2023?"

# Embed the question so the vector half of the hybrid query can target the "embedding" field
query_embedding = generate_embeddings(query)
vector = Vector(value=query_embedding, k=10, fields="embedding")

# One request combining keyword text, the query vector and semantic ranking; keep the top 3 hits
results = search_client.search(
    search_text=query,
    vectors=[vector],
    select=["content", "id", "sourcefile"],
    query_type="semantic",
    query_language="en-us",
    semantic_configuration_name='boosted-semantic-config',
    query_caption="extractive",
    query_answer="extractive",
    top=3,
)

### Answering question from context

In [5]:
def _format_source(hit):
    """Render one search hit as "name: text", the layout the system prompt describes."""
    try:
        return f"{hit['sourcefile']}: {hit['content']}"
    except (KeyError, TypeError):
        # The hit does not expose the expected fields; fall back to its plain string form.
        return str(hit)


def get_filter(query, results, engine="gpt4", verbose=True):
    """Answer `query` with a chat model, using the search hits in `results` as context.

    Args:
        query: The user's question.
        results: Iterable of search hits. It is fully consumed here. Hits that
            expose `sourcefile` and `content` keys are sent to the model as
            "sourcefile: content", one per line; anything else is sent as `str(hit)`.
        engine: Name of the chat deployment to call. Defaults to "gpt4", the
            value that used to be hard-coded here.
        verbose: When True (the default, matching the previous behaviour), print
            the full message list before calling the model.

    Returns:
        The text content of the first choice in the chat completion response.
    """
    # The system prompt tells the model that each source is "a name followed by
    # colon and the actual information", so the context is built in that shape
    # rather than as the repr of the raw hit dictionaries.
    sources = "\n".join(_format_source(hit) for hit in results)

    messages = [{"role":"system","content":"\nAssistant helps the company employees answering questions from SEC filings like 10Q and 10K. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nDO NOT generate answers that DON'T use the sources below for any of the previous questions\nFor tabular information return it as an html table. Do not return markdown format.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]"},
            {"role":"user","content":"\n question \n" + str(query) + " context: " + sources}]

    if verbose:
        print(messages)

    # temperature=0 keeps the answer as deterministic as the service allows;
    # max_tokens=200 caps the reply length, in line with "Be brief" in the prompt.
    response = openai.ChatCompletion.create(engine=engine,
                                        messages=messages,
                                        temperature=0,
                                        max_tokens=200, top_p=0.5,
                                        frequency_penalty=0,
                                        presence_penalty=0,
                                        stop=None)

    return response.choices[0].message.content

# Ask the chat model to answer the short-context question from the retrieved hits
answer = get_filter(query=query, results=results)
print(answer)

[{'role': 'system', 'content': "\nAssistant helps the company employees answering questions from SEC filings like 10Q and 10K. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nDO NOT generate answers that DON'T use the sources below for any of the previous questions\nFor tabular information return it as an html table. Do not return markdown format.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]"}, {'role': 'user', 'content': '\n question \nWhat was Microsoft’s revenue for the nine months that ended on March 31 2023? context

## Perform a Semantic Hybrid Search on Long Context

### Getting Top N Documents

In [6]:
# Semantic Hybrid Search
query = "please provide all the risks factors associated with microsoft in FY2023 Q3?"

# Embed the question so the vector half of the hybrid query can target the "embedding" field
query_embedding = generate_embeddings(query)
vector = Vector(value=query_embedding, k=10, fields="embedding")

# Same hybrid + semantic request as the short-context search, but keeping the top 10 hits
results = search_client.search(
    search_text=query,
    vectors=[vector],
    select=["content", "id", "sourcefile"],
    query_type="semantic",
    query_language="en-us",
    semantic_configuration_name='boosted-semantic-config',
    query_caption="extractive",
    query_answer="extractive",
    top=10,
)

# Drain the result iterator into a list, then order the hits by their 'id' value
results_list = list(results)
sorted_list = sorted(results_list, key=lambda hit: hit['id'])

### Feeding documents to LLM

In [7]:
from langchain.docstore.document import Document

# `results` is the object returned by search_client.search(...), so comparing it
# to None never detects an empty result set (and it has already been drained
# into `results_list` / `sorted_list`). Test the materialized list instead.
if not sorted_list:
    # NOTE(review): the with-sources chain is expected to read a `source`
    # metadata entry from every document -- confirm against the LangChain
    # version in use.
    docs = [Document(page_content="No results found", metadata={"id": "", "source": "none"})]
else:
    # One LangChain Document per search hit, in the order established by sorted_list.
    docs = [
        Document(page_content=doc['content'], metadata={"id": doc['id'], "source": doc['sourcefile']})
        for doc in sorted_list
    ]

In [8]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate

In [14]:
# Chat model for the LangChain QA chain; it reuses the Azure OpenAI settings
# already stored on the `openai` module by the configuration cell.
llm_settings = dict(
    openai_api_base=openai.api_base,
    openai_api_version=openai.api_version,
    deployment_name='gpt4',
    temperature=0,
    openai_api_key=openai.api_key,
    openai_api_type="azure",
    max_tokens=2000,
)
llm = AzureChatOpenAI(**llm_settings)

In [15]:
%%time
chainType = "refine"
template = """
            Given the following extracted parts of a long document and a question, create a detailed final answer. Please return the sources only at the end.
            If you don't know the answer, just say that you don't know. Don't try to make up an answer. 
            If the answer is not contained within the text below, say \"I don't know\".

            QUESTION: {question}
            =========
            {parts of document}
            =========
            """
#qaPrompt = PromptTemplate(template=template, input_variables=["summaries", "question"])
#qaChain = load_qa_with_sources_chain(llm, chain_type=chainType, prompt=qaPrompt)
qaChain = load_qa_with_sources_chain(llm, chain_type=chainType)
answer = qaChain({"input_documents": docs, "question": query}, return_only_outputs=False)
outputAnswer = answer['output_text']
print(outputAnswer)

The updated answer, including the new context, is as follows:

1. The markets for software, devices, and cloud-based services are dynamic and highly competitive. Microsoft's competitors are developing new software and devices, and deploying competing cloud-based services for consumers and businesses.

2. The devices and form factors customers prefer evolve rapidly, and influence how users access services in the cloud, and in some cases, the user's choice of which suite of cloud-based services to use. Microsoft must continue to evolve and adapt over an extended time in pace with this changing environment.

3. Microsoft's increasing focus on cloud-based services presents execution and competitive risks. They are devoting significant resources to develop and deploy their cloud-based strategies. The Windows ecosystem must continue to evolve with this changing environment. Their success in the Internet of Things (IoT) will depend on the level of adoption of their offerings such as Azure, Az