### Import required libraries and environment variables

In [1]:
# Import required libraries  
import os  
import json  
import openai  
import pandas as pd
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  

@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
# Function to generate embeddings for title and content fields, also used for query embeddings
def generate_embeddings(text):
    response = openai.Embedding.create(
        input=text, engine="text-embedding-ada-002")
    embeddings = response['data'][0]['embedding']
    return embeddings
  
# Configure environment variables  
# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT_2") 
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME_2") 
key = os.getenv("AZURE_SEARCH_ADMIN_KEY_2") 
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") 
credential = AzureKeyCredential(key)

  from pandas.core import (


### Modifying vector

In [2]:
# Generate Document Embeddings using OpenAI Ada 002

# Read the text-sample.json
with open('output/enrichedVectors.json', 'r', encoding='utf-8') as file:
    input_data = json.load(file)

In [3]:
# Generate embeddings for title and content fields
for item in input_data:
    item['@search.action'] = 'upload'

# Output embeddings to docVectors.json file
with open("output/docVectors.json", "w") as f:
    json.dump(input_data, f)

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [4]:
for i in input_data[:2]:
    print("-----------------------------------------------")
    print(i['description'])

-----------------------------------------------
This text is the Table of Contents of a Form 10-K filing submitted to the SEC by Amazon.com, Inc. The filing includes information about the company's business, risk factors, financial statements, and other relevant details.
-----------------------------------------------
The company is guided by four principles: customer obsession, passion for invention, commitment to operational excellence, and long-term thinking. They serve various customer sets including consumers, sellers, developers, enterprises, content creators, advertisers, and employees. The company operates in three segments: North America, International, and Amazon Web Services (AWS). They serve consumers through online and physical stores, offering a wide selection, competitive prices, and convenience. They also manufacture and sell electronic devices and develop media content. They offer programs for sellers to grow their businesses and fulfill orders through the company. The

In [5]:
# Upload some documents to the index
with open('output/docVectors.json', 'r') as file:  
    documents = json.load(file)  

search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)


result = search_client.upload_documents(documents[:500])
print(f"Uploaded {len(documents)} documents")

result = search_client.upload_documents(documents[500:])
print(f"Uploaded {len(documents)} documents")

Uploaded 1009 documents
Uploaded 1009 documents


## Perform a Semantic Hybrid Search for short context

### Getting Top N Documents

In [14]:
# Semantic Hybrid Search
query ="What was Microsoft’s revenue for the nine months that ended on March 31 2023?"

vector = Vector(value=generate_embeddings(query), k=50, fields="embedding")

results = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["content","id","sourcefile"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='boosted-semantic-config', query_caption="extractive", query_answer="extractive",
    top=7
)

In [15]:
results_list = [result for result in results]

sorted_list = sorted(results_list, key=lambda x: (x['id']))
sorted_list

[{'sourcefile': '10Q-MSFT-04-25-2023.pdf',
  'id': 'file-10Q-MSFT-04-25-2023_pdf-3130512D4D5346542D30342D32352D323032332E706466-page-1',
  'content': "18,299</td><td>$ 16,728</td><td>$ 52,280</td><td>$ 55,998</td></tr><tr><td>Other comprehensive income (loss), net of tax:</td><td></td><td></td><td></td><td></td></tr><tr><td>Net change related to derivatives</td><td>(9)</td><td>6</td><td>(34)</td><td>8</td></tr><tr><td>Net change related to investments</td><td>753</td><td>(2,882)</td><td>(796)</td><td>(4,047)</td></tr><tr><td>Translation adjustments and other</td><td>69</td><td>(37)</td><td>(136)</td><td>(259)</td></tr><tr><td>Other comprehensive income (loss)</td><td>813</td><td>(2,913)</td><td>(966)</td><td>(4,298)</td></tr><tr><td>Comprehensive income</td><td>$ 19,112</td><td>$ 13,815</td><td>$ 51,314</td><td>$ 51,700</td></tr></table>\nRefer to accompanying notes.\n4 \n(In millions) (Unaudited)\nPART ! Item 1\nBALANCE SHEETS\n<table><tr><th></th><th>March 31, 2023</th><th>June 30, 2

### Answering question from context

In [None]:
def get_filter(query,results):
    results_list = [result for result in results]
    messages = [{"role":"system","content":"\nAssistant helps the company employees answering questions from SEC filings like 10Q and 10K. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nDO NOT generate answers that DON'T use the sources below for any of the previous questions\nFor tabular information return it as an html table. Do not return markdown format.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]"}, 
            {"role":"user","content":"\n question \n"+str(query) + " context: "+ str(results_list)}]
    
    print(messages)

    response = openai.ChatCompletion.create(engine="gpt4",  
                                        messages = messages, 
                                        temperature=0,  
                                        max_tokens=200,  top_p=0.5,  
                                        frequency_penalty=0,  
                                        presence_penalty=0,  
                                        stop=None)

    return response.choices[0].message.content

answer =  get_filter(query,results)
print(answer)

[{'role': 'system', 'content': "\nAssistant helps the company employees answering questions from SEC filings like 10Q and 10K. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below. If asking a clarifying question to the user would help, ask the question.\nDO NOT generate answers that DON'T use the sources below for any of the previous questions\nFor tabular information return it as an html table. Do not return markdown format.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brakets to reference the source, e.g. [info1.txt]. Don't combine sources, list each source separately, e.g. [info1.txt][info2.pdf]"}, {'role': 'user', 'content': '\n question \nWhat was Microsoft’s revenue for the nine months that ended on March 31 2023? context

## Perform a Semantic Hybrid Search on Long Context

### Getting Top N Documents

In [None]:
# Semantic Hybrid Search
query ="please provide all the risks factors associated with microsoft in FY2023 Q3?"

vector = Vector(value=generate_embeddings(query), k=10, fields="embedding")

results = search_client.search(  
    search_text=query,  
    vectors=[vector],
    select=["content","id","sourcefile"],
    query_type="semantic", query_language="en-us", semantic_configuration_name='boosted-semantic-config', query_caption="extractive", query_answer="extractive",
    top=10
)

results_list = [result for result in results]

sorted_list = sorted(results_list, key=lambda x: (x['id']))

### Feeding documents to LLM

In [None]:
from langchain.docstore.document import Document
if results == None:
    docs = [Document(page_content="No results found")]
else :
    docs = [
        Document(page_content=doc['content'], metadata={"id": doc['id'], "source": doc['sourcefile']})
        for doc in sorted_list
        ]

In [None]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate

In [None]:
llm = AzureChatOpenAI(
                openai_api_base=openai.api_base,
                openai_api_version=openai.api_version,
                deployment_name='chat16k',
                temperature=0,
                openai_api_key=openai.api_key,
                openai_api_type="azure",
                max_tokens=2000)

  warn_deprecated(


In [None]:
%%time
chainType = "refine"
template = """
            Given the following extracted parts of a long document and a question, create a detailed final answer. Please return the sources only at the end.
            If you don't know the answer, just say that you don't know. Don't try to make up an answer. 
            If the answer is not contained within the text below, say \"I don't know\".

            QUESTION: {question}
            =========
            {parts of document}
            =========
            """
#qaPrompt = PromptTemplate(template=template, input_variables=["summaries", "question"])
#qaChain = load_qa_with_sources_chain(llm, chain_type=chainType, prompt=qaPrompt)
qaChain = load_qa_with_sources_chain(llm, chain_type=chainType)
answer = qaChain({"input_documents": docs, "question": query}, return_only_outputs=False)
outputAnswer = answer['output_text']
print(outputAnswer)

  warn_deprecated(


The risks factors associated with Microsoft in FY2023 Q3, in addition to the previously mentioned factors, include:

1. Damage to brands or reputation: If Microsoft's brands or reputation are damaged, it could negatively impact its revenues, margins, or ability to attract highly qualified employees.

2. Adverse economic or market conditions: Worsening economic conditions, including inflation, recession, pandemic, or other changes, may cause lower IT spending and adversely affect Microsoft's revenue. Declining demand for PCs, servers, and other computing devices, or reduced consumer or business spending, could also impact revenue.

3. Sales channel disruption: Microsoft's product distribution system relies on an extensive partner and retail network. Economic conditions impacting partners, such as the bankruptcy of a major distributor, OEM, or retailer, could cause sales channel disruption.

4. Impaired ability to collect payments: Challenging economic conditions may impair the ability o