### Import required libraries and environment variables

In [2]:
# Import required libraries  
import os  
import json  
import openai  
import pandas as pd
from dotenv import load_dotenv  
from tenacity import retry, wait_random_exponential, stop_after_attempt  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import Vector  
from azure.search.documents.indexes.models import (  
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,  
)  
  
# Pull settings from a .env file (via python-dotenv) before reading them
load_dotenv()

# Azure Search connection settings, read from the environment
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# Azure OpenAI settings, applied to the module-level openai configuration
openai.api_type = "azure"
openai.api_key = os.environ.get("AZURE_OPENAI_API_KEY")
openai.api_base = os.environ.get("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.environ.get("AZURE_OPENAI_API_VERSION")

# Embedding model name
model: str = "text-embedding-ada-002"

# Credential object handed to the search clients (built from the admin key)
credential = AzureKeyCredential(key)

### Prepare the document vectors for upload

In [3]:
# Load the enriched documents from disk
# (presumably already carrying their embeddings, judging by the file name)

# Source file: output/enrichedVectors.json, decoded as UTF-8
with open('output/enrichedVectors.json', 'r', encoding='utf-8') as file:
    input_data = json.loads(file.read())

In [3]:
# Tag every document with the 'upload' indexing action
for item in input_data:
    item.update({'@search.action': 'upload'})

# Write the tagged documents to docVectors.json
with open("output/docVectors.json", "w") as f:
    f.write(json.dumps(input_data))

## Insert text and embeddings into vector store
Add texts and metadata from the JSON data to the vector store:

In [6]:
# Print each document's description, preceded by a separator line
separator = "-----------------------------------------------"
for i in input_data:
    print(separator)
    print(i['description'])

-----------------------------------------------
This is a Form 10-Q SEC filing for Microsoft Corporation for the quarter ended March 31, 2023. The company is registered with the NASDAQ exchange and is classified as a large accelerated filer. As of April 20, 2023, Microsoft had 7,435,487,575 shares of common stock outstanding. The report includes financial statements detailing revenue, costs, gross margin, operating income, and net income for the period. The company's total revenue for the quarter was $52,857 million, with a net income of $18,299 million. The company's assets include cash and cash equivalents of $26,562 million and short-term investments of $77,865 million.
-----------------------------------------------
The text contains financial data from various financial statements. 

1. Income statement data: Net income of $18,299 and $16,728 for two different periods. Other comprehensive income (loss) of $813 and $(2,913) for the same periods. 

2. Balance sheet data: Total asset

In [7]:
# Upload the prepared documents to the search index
with open('output/docVectors.json', 'r') as file:
    documents = json.load(file)
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
result = search_client.upload_documents(documents)

# upload_documents returns one IndexingResult per document, and a call that
# returns normally can still contain per-document failures. Report the number
# that actually succeeded instead of assuming every document was indexed.
failed_uploads = [outcome for outcome in result if not outcome.succeeded]
print(f"Uploaded {len(documents) - len(failed_uploads)} documents")
for outcome in failed_uploads:
    print(f"Failed to upload document {outcome.key}: {outcome.error_message}")

Uploaded 36 documents


## Perform a Semantic Hybrid Search

In [8]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(6))
def generate_embeddings(text, engine="text-embedding-ada-002"):
    """Return the embedding for ``text`` from the OpenAI embeddings endpoint.

    Intended for title and content fields as well as for query embeddings.

    Args:
        text: Input passed to ``openai.Embedding.create``.
        engine: Engine name to call. Defaults to "text-embedding-ada-002",
            the value that was previously hard-coded, so existing callers
            are unaffected.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    Note:
        The ``retry`` decorator re-invokes this function on failure, waiting
        with random exponential backoff (min=1, max=20) and stopping after
        6 attempts.
    """
    response = openai.Embedding.create(input=text, engine=engine)
    return response['data'][0]['embedding']

In [159]:
# Semantic hybrid search: send the query text together with a vector query
# against the "embedding" field, using query_type="semantic".
query ="please provide all the risks factors associated with microsoft in FY2023 Q3?"

# Embed the query text; k=10 is passed to the vector query
query_embedding = generate_embeddings(query)
vector = Vector(value=query_embedding, k=10, fields="embedding")

# Collect the search arguments, then issue the request
search_options = {
    "search_text": query,
    "vectors": [vector],
    "select": ["content", "id", "sourcefile"],
    "query_type": "semantic",
    "query_language": "en-us",
    "semantic_configuration_name": 'boosted-semantic-config',
    "query_caption": "extractive",
    "query_answer": "extractive",
    "top": 10,
}
results = search_client.search(**search_options)

# NOTE: the block below is disabled code kept inside a string literal. It would
# print the semantic answers and, for every result, its scores, content and
# first caption. Because the string is the last expression of the cell, the
# notebook echoes its repr as the cell output.
# NOTE(review): iterating `results` here would presumably consume the iterator
# that the following cells read -- confirm before re-enabling.
'''
semantic_answers = results.get_answers()
for answer in semantic_answers:
    print(answer)
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")
for result in results:
    print(result)
    print(f"Score: {result['@search.score']}")
    print(f"Score: {result['@search.reranker_score']}")
    print(f"Content: {result['content']}")

    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")
'''

'\nsemantic_answers = results.get_answers()\nfor answer in semantic_answers:\n    print(answer)\n    if answer.highlights:\n        print(f"Semantic Answer: {answer.highlights}")\n    else:\n        print(f"Semantic Answer: {answer.text}")\n    print(f"Semantic Answer Score: {answer.score}\n")\nfor result in results:\n    print(result)\n    print(f"Score: {result[\'@search.score\']}")\n    print(f"Score: {result[\'@search.reranker_score\']}")\n    print(f"Content: {result[\'content\']}")\n\n    captions = result["@search.captions"]\n    if captions:\n        caption = captions[0]\n        if caption.highlights:\n            print(f"Caption: {caption.highlights}\n")\n        else:\n            print(f"Caption: {caption.text}\n")\n'

In [160]:
# Materialize the search results into a list so they can be sorted and reused
results_list = list(results)

In [161]:
# Order the retrieved documents by their 'id' field
sorted_list = list(results_list)
sorted_list.sort(key=lambda doc: doc['id'])

In [147]:
from langchain.docstore.document import Document
# Wrap the retrieved documents as LangChain Documents for the QA chain.
# search_client.search() returns an iterable of results rather than None, so
# the old `results == None` test never fired and an empty result set produced
# an empty `docs` list. Test the materialized list so the fallback is reachable.
if not sorted_list:
    docs = [Document(page_content="No results found")]
else:
    docs = [
        Document(page_content=doc['content'], metadata={"id": doc['id'], "source": doc['sourcefile']})
        for doc in sorted_list
    ]

In [156]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate

In [157]:
# Chat model used to answer questions over the retrieved documents; connection
# settings are taken from the module-level openai configuration.
llm = AzureChatOpenAI(
                openai_api_base=openai.api_base,
                openai_api_version=openai.api_version,
                deployment_name='gpt4',
                temperature=0,
                openai_api_key=openai.api_key,
                openai_api_type="azure",
                max_tokens=2000)
# Embedding client. The engine name was misspelled ('text-emedding-ada-002');
# it now matches the 'text-embedding-ada-002' name used elsewhere in this notebook.
embeddings = OpenAIEmbeddings(engine='text-embedding-ada-002', chunk_size=1, openai_api_key=openai.api_key)

In [158]:
# Question passed to the QA chain in the next cell.
# NOTE: this rebinds `query`; the wording differs slightly from the query
# string used for the search request earlier in the notebook.
query ="Please provide all the risks associated with microsoft in FY2023 Q3?"

In [163]:
%%time
# Chain type handed to load_qa_with_sources_chain
chainType = "refine"

# Custom prompt text. NOTE: currently unused -- the chain below is built
# without a prompt argument. The "{parts of document}" placeholder also does
# not match the "summaries" input variable named in the disabled code.
template = """
            Given the following extracted parts of a long document and a question, create a final answer. Please return the sources only at the end.
            If you don't know the answer, just say that you don't know. Don't try to make up an answer. 
            If the answer is not contained within the text below, say \"I don't know\".

            QUESTION: {question}
            =========
            {parts of document}
            =========
            """

# Disabled custom-prompt wiring, kept for reference:
#qaPrompt = PromptTemplate(template=template, input_variables=["summaries", "question"])
#qaChain = load_qa_with_sources_chain(llm, chain_type=chainType, prompt=qaPrompt)
qaChain = load_qa_with_sources_chain(llm, chain_type=chainType)

# Run the chain over the retrieved documents and print the generated answer
chain_inputs = {"input_documents": docs, "question": query}
answer = qaChain(chain_inputs, return_only_outputs=False)
outputAnswer = answer['output_text']
print(outputAnswer)

The risk factors associated with Microsoft in FY2023 Q3 include:

1. Competitive pressures: Decreased sales volumes, price reductions, and/or increased operating costs, such as for research and development, marketing, and sales incentives, could lead to lower revenue, gross margins, and operating income.

2. Execution and competitive risks in cloud-based services: Microsoft's increasing focus on cloud-based services presents execution and competitive risks. The company's success in the Internet of Things (IoT) will depend on the level of adoption of its offerings such as Azure, Azure Stack, Azure IOT Edge, and Azure Sphere.

3. Infrastructure costs: Microsoft is incurring costs to build and maintain infrastructure to support cloud computing services. These costs will reduce the operating margins the company has previously achieved.

4. Fraudulent or abusive activities: Some users may engage in fraudulent or abusive activities through Microsoft's cloud-based services, which could impact