### Create a search index

In [21]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
import pandas as pd
from azure.search.documents import SearchClient  

# Pull settings from the local .env file; values there take precedence over
# variables already present in the process environment (override=True).
load_dotenv(override=True)

# --- Azure AI Search -------------------------------------------------------
# Settings that this notebook does not read can be left unset in your .env file.
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
# Use key-based auth when an admin key is configured, otherwise fall back to
# DefaultAzureCredential.
if os.environ.get("AZURE_SEARCH_ADMIN_KEY"):
    AZURE_SEARCH_ADMIN_CREDENTIAL = AzureKeyCredential(os.environ.get("AZURE_SEARCH_ADMIN_KEY"))
else:
    AZURE_SEARCH_ADMIN_CREDENTIAL = DefaultAzureCredential()
# The configured index name is lower-cased and underscores become hyphens.
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"].lower().replace("_", "-")

# --- Azure OpenAI: embeddings ----------------------------------------------
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.environ.get("AZURE_OPENAI_API_KEY")
azure_openai_embedding_deployment = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small"
)
azure_openai_model_name = os.environ.get(
    "AZURE_OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-3-small"
)
azure_openai_model_dimensions = int(
    os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536)
)

# --- Azure OpenAI: chat ----------------------------------------------------
# Note: the chat deployment should support tool use. For details see
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-models
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35
azure_openai_chat_deployment = os.environ.get(
    "AZURE_OPENAI_CHATGPT_DEPLOYMENT", "gpt-4o-mini"
)
azure_openai_api_version = os.environ.get(
    "AZURE_OPENAI_API_VERSION", "2024-07-01-preview"
)

### Read data

In [22]:
# Directory holding the processed chunk files that are loaded further down.
path = "../../data/processed/files/"
# os.listdir returns entries in arbitrary order and includes hidden files
# (e.g. .DS_Store, .gitkeep). Skip dot-entries and sort so the load order,
# and therefore the row order of the combined frame, is reproducible.
filelist = sorted(
    entry for entry in os.listdir(path) if not entry.startswith(".")
)

In [23]:
# NOTE(review): `load_df` is first assigned in the following cell (In [24]).
# On a fresh kernel executed top-to-bottom this cell raises NameError, so the
# loading cell has to run before this preview.
load_df

Unnamed: 0,page_num,content,title,title_vector,content_vector,chunk_id,preprocessing_pipeline,filename,filing_period,filing_date,form_type,ticker,id
0,1,Table of Contents\nUNITED STATES SECURITIES AN...,FORM 10-K,"[0.06034832075238228, 0.05649842694401741, 0.0...","[0.05381559208035469, 0.026631174609065056, 0....",10K-AMZN-02-03-2023-chunk-id-1,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-1
1,2,"Table of Contents\nAMAZON.COM, INC. FORM 10-K ...","AMAZON.COM, INC. FORM 10-K For the Fiscal Year...","[0.036177292466163635, 0.02635868266224861, 0....","[0.022779833525419235, 0.041926249861717224, 0...",10K-AMZN-02-03-2023-chunk-id-2,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-2
2,3,"Table of Contents\nAMAZON.COM, INC.\nPART I\nI...",Item 1. Business,"[0.005116552114486694, 0.010721826925873756, 0...","[0.043223753571510315, 0.009905443526804447, 0...",10K-AMZN-02-03-2023-chunk-id-3,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-3
3,4,Table of Contents\nCompetition\nOur businesses...,Competition,"[0.03811872377991676, -0.02396334894001484, 0....","[0.04903312399983406, 0.028485119342803955, 0....",10K-AMZN-02-03-2023-chunk-id-4,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-4
4,5,Table of Contents\nAvailable Information\nOur ...,Executive Officers and Directors,"[0.0003115860163234174, -0.014203808270394802,...","[0.019433414563536644, -0.01925777457654476, 0...",10K-AMZN-02-03-2023-chunk-id-5,DI_Text_HTML_PageSplitter,10K-AMZN-02-03-2023,2022-12-31,2023-02-03,10K,AMZN,10K-AMZN-02-03-2023-chunk-id-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1991,76,"Exhibit 15.1\nOctober 25, 2022\nThe Board of D...",Exhibit 15.1,"[-0.014841333031654358, 0.03249764069914818, -...","[0.04970259964466095, 0.011187892407178879, 0....",10Q-MSFT-10-25-2022-chunk-id-76,DI_Text_HTML_PageSplitter,10Q-MSFT-10-25-2022,2022-09-30,2022-10-25,10Q,MSFT,10Q-MSFT-10-25-2022-chunk-id-76
1992,77,"Exhibit 31.1\nCERTIFICATION\nI, Satya Nadella,...",Exhibit 31.1 - Certification,"[-0.014201141893863678, 0.02938828431069851, 0...","[0.02333853580057621, 0.010006517171859741, 0....",10Q-MSFT-10-25-2022-chunk-id-77,DI_Text_HTML_PageSplitter,10Q-MSFT-10-25-2022,2022-09-30,2022-10-25,10Q,MSFT,10Q-MSFT-10-25-2022-chunk-id-77
1993,78,"Exhibit 31.2\nCERTIFICATION\nI, Amy E. Hood, c...",Exhibit 31.2 - Certification,"[-0.004663168452680111, 0.033500369638204575, ...","[0.022416625171899796, 0.00013361001037992537,...",10Q-MSFT-10-25-2022-chunk-id-78,DI_Text_HTML_PageSplitter,10Q-MSFT-10-25-2022,2022-09-30,2022-10-25,10Q,MSFT,10Q-MSFT-10-25-2022-chunk-id-78
1994,79,Exhibit 32.1\nCERTIFICATION PURSUANT TO SECTIO...,CERTIFICATION PURSUANT TO SECTION 906 OF THE S...,"[0.036175042390823364, 0.01837925612926483, -0...","[0.04927904158830643, 0.027612054720520973, 0....",10Q-MSFT-10-25-2022-chunk-id-79,DI_Text_HTML_PageSplitter,10Q-MSFT-10-25-2022,2022-09-30,2022-10-25,10Q,MSFT,10Q-MSFT-10-25-2022-chunk-id-79


In [24]:
# Read every listed file and stack the results into one frame.
# The frames are collected first and concatenated once; calling pd.concat
# inside the loop re-copies all previously loaded rows on every iteration.
chunk_frames = [pd.read_parquet(path + filename) for filename in filelist]
if not chunk_frames:
    # Fail early with a clear message instead of a KeyError on the
    # column accesses below.
    raise FileNotFoundError(f"No input files found in {path!r}")

# ignore_index=True already yields a fresh 0..n-1 index, so no extra
# reset_index is needed.
load_df = pd.concat(chunk_frames, ignore_index=True)

# Convert each vector cell to a plain Python list via its .tolist() method
# (presumably numpy arrays as returned by read_parquet - TODO confirm).
for vector_column in ('title_vector', 'content_vector'):
    load_df[vector_column] = load_df[vector_column].apply(lambda x: x.tolist())

# Expose the chunk identifier under the `id` column as well.
load_df['id'] = load_df['chunk_id']
#load_df = load_df.drop(columns=['page_num','chunk_id','preprocessing_pipeline','filing_period','filing_date'])


In [25]:
# Turn every row of the frame into a dict and tag it with the indexing
# action to perform ('upload') under the '@search.action' key.
input_data = [
    {**record, '@search.action': 'upload'}
    for record in load_df.to_dict(orient='records')
]

In [26]:
# Client bound to the target index; used for all document uploads below.
search_client = SearchClient(endpoint=AZURE_SEARCH_SERVICE_ENDPOINT, index_name=index_name, credential=AZURE_SEARCH_ADMIN_CREDENTIAL)

# Send documents in batches instead of issuing one HTTP request per document.
# Azure AI Search accepts at most 1000 documents (and roughly 16 MB) per
# indexing request; every document here carries two embedding vectors, so the
# batch is kept well below the count limit to stay under the payload limit.
UPLOAD_BATCH_SIZE = 100

failed_keys = []
for batch_start in range(0, len(input_data), UPLOAD_BATCH_SIZE):
    document_batch = input_data[batch_start:batch_start + UPLOAD_BATCH_SIZE]
    indexing_results = search_client.upload_documents(document_batch)
    # upload_documents returns one result per document; a request can succeed
    # overall while individual documents are rejected, so check each one.
    failed_keys.extend(
        result.key for result in indexing_results if not result.succeeded
    )

print(f"Uploaded {len(input_data) - len(failed_keys)} of {len(input_data)} documents to '{index_name}'")
if failed_keys:
    print(f"{len(failed_keys)} documents failed to index, e.g. {failed_keys[:5]}")