# Index Data
The purpose of this notebook is to upload the data to the Azure AI Search index.
This uses the push API; however, it is also viable to use an Azure AI Search indexer that points to the JSON files directly.

## Required for this step
- Azure AI Search Service

## Important
- Ensure that Semantic Ranker is enabled in the Azure AI Search service



In [1]:
# Import required libraries
import os  
import json
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  


In [6]:
# Load the configuration details for the Azure AI Search service.
# NOTE: the admin key is read from a plain JSON file here; credentials should be
# secured using a more secure method such as Azure Key Vault.
# A context manager is used so the file handle is closed deterministically.
with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Azure AI Search Config
search_service_name = config["search_service_name"]
search_service_url = "https://{}.search.windows.net/".format(search_service_name)
search_admin_key = config["search_admin_key"]
index_name = config["search_index_name"]
index_schema_file = config["search_index_schema_file"]
search_api_version = config["search_api_version"]
# Headers for direct REST calls against the service (not used by the SDK clients below)
search_headers = {
    'Content-Type': 'application/json',
    'api-key': search_admin_key
}

# One credential object is shared by both SDK clients
search_credential = AzureKeyCredential(search_admin_key)
index_client = SearchIndexClient(endpoint=search_service_url, credential=search_credential)
search_client = SearchClient(
    endpoint=search_service_url, index_name=index_name, credential=search_credential
)

print ('Search Service Name:', search_service_name)
print ('Index Name:', index_name)

# Directory holding the JSON files to upload: the "json" sub-folder of the data root
data_root_dir = config["data_root_dir"]
json_dir = os.path.join(data_root_dir, "json")
print ('JSON Dir:', json_dir)

In [3]:
# Get all files in dir
def get_files_in_dir(in_dir):
    """Return the paths of all files found under ``in_dir``, recursing into sub-directories.

    Each returned path is the walked directory joined with the file name, so
    it carries the same prefix as ``in_dir``. Order follows ``os.walk``.
    """
    collected = []
    for current_dir, _subdirs, file_names in os.walk(in_dir):
        for file_name in file_names:
            collected.append(os.path.join(current_dir, file_name))
    return collected


In [4]:
# Gather every file under the JSON directory and report how many will be uploaded
json_docs = get_files_in_dir(json_dir)
total_files = len(json_docs)
print('Total Files to upload:', total_files)

Total Files to upload: 8


In [5]:
# Push the contents of each JSON file to the search index, one file per request.
# Presumably each file holds a JSON array of documents matching the index schema
# -- verify against the step that produced these files.
for json_doc in json_docs:
    print (json_doc)
    with open(json_doc, "r", encoding="utf-8") as j_in:
        documents = json.load(j_in)

    # Nothing to send for an empty file; skip the service call entirely
    if not documents:
        print("No chunks found, skipping")
        continue

    # upload_documents returns one result per document; a successful call can
    # still contain individual documents that were rejected, so check each one.
    result = search_client.upload_documents(documents)
    failed = [item for item in result if not item.succeeded]
    if failed:
        print(f"Uploaded {len(documents) - len(failed)} of {len(documents)} chunks; {len(failed)} failed")
        for item in failed:
            print(f"  key={item.key} status={item.status_code} error={item.error_message}")
    else:
        print(f"Uploaded {len(documents)} chunks")

/aci/data/data/customers/financial-docs/json/msft-2022_Annual_Report.docx.json
Uploaded 239 chunks
/aci/data/data/customers/financial-docs/json/msft-2022_Shareholder_Letter.docx.json
Uploaded 20 chunks
/aci/data/data/customers/financial-docs/json/MSFT_FY22Q4_10K.docx.json
Uploaded 282 chunks
/aci/data/data/customers/financial-docs/json/nvda-Q3FY24-CFO-Commentary.pdf.json
Uploaded 13 chunks
/aci/data/data/customers/financial-docs/json/NVIDIA-10Q.pdf.json
Uploaded 110 chunks
/aci/data/data/customers/financial-docs/json/orcl-2q24-pressrelease-December-final.pdf.json
Uploaded 26 chunks
/aci/data/data/customers/financial-docs/json/orcl-f20d8dea-697e-464e-b775-b0e33d1db211.pdf.json
Uploaded 88 chunks
/aci/data/data/customers/financial-docs/json/orcl-Q224_Form8K_Exhibit99-1_Earnings_Release_Tables_Final.pdf.json
Uploaded 21 chunks
