### Create a search index

In [40]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os
import pandas as pd
from azure.search.documents import SearchClient  

# Load settings from a local .env file; override=True lets values in that file
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# --- Azure AI Search -------------------------------------------------------
# Only the variables actually read in this cell need to be present in .env.
AZURE_SEARCH_SERVICE_ENDPOINT = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]

# Use key-based auth when an admin key is configured (non-empty); otherwise
# fall back to DefaultAzureCredential.
_search_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
if _search_admin_key:
    AZURE_SEARCH_ADMIN_CREDENTIAL = AzureKeyCredential(_search_admin_key)
else:
    AZURE_SEARCH_ADMIN_CREDENTIAL = DefaultAzureCredential()

# The configured index name is lower-cased and its underscores are turned
# into dashes (presumably to satisfy the service's index naming rules).
_configured_index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
index_name = _configured_index_name.lower().replace("_", "-")

# --- Azure OpenAI: embeddings ----------------------------------------------
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_key = os.getenv("AZURE_OPENAI_API_KEY")  # None when not set
azure_openai_embedding_deployment = os.getenv(
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT", "text-embedding-3-small"
)
azure_openai_model_name = os.getenv(
    "AZURE_OPENAI_EMBEDDING_MODEL_NAME", "text-embedding-3-small"
)
azure_openai_model_dimensions = int(
    os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536)
)

# --- Azure OpenAI: chat ----------------------------------------------------
# NOTE: the chat deployment should support tool use. For details see
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-4-and-gpt-4-turbo-models
# https://learn.microsoft.com/en-us/azure/ai-services/openai/concepts/models#gpt-35
azure_openai_chat_deployment = os.getenv(
    "AZURE_OPENAI_CHATGPT_DEPLOYMENT", "gpt-4o-mini"
)
azure_openai_api_version = os.getenv(
    "AZURE_OPENAI_API_VERSION", "2024-07-01-preview"
)

### Read data

In [41]:
# Directory (relative to this notebook) whose entries are read as parquet files
# by the loading cell.
path = "../../data/processed/files/"

# os.listdir returns entries in arbitrary order; sort them so the rows of the
# combined DataFrame, and the upload order, are the same on every run.
filelist = sorted(os.listdir(path))

In [75]:
# Read every file listed in `filelist` and stack them into one DataFrame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously loaded rows on each iteration and
# starts from an empty seed frame.
frames = [pd.read_parquet(os.path.join(path, filename)) for filename in filelist]
if not frames:
    # Fail here with a clear message instead of a KeyError on 'title_vector' below.
    raise ValueError(f"No files found in {path!r}; nothing to load")

# ignore_index=True already yields a fresh 0..n-1 index, so no reset_index is needed.
load_df = pd.concat(frames, ignore_index=True)

# Convert the vector columns to plain Python lists via each value's .tolist()
# (values are presumably numpy arrays as read from parquet, which would not
# serialise to JSON as-is -- TODO confirm).
for vector_column in ('title_vector', 'content_vector'):
    load_df[vector_column] = load_df[vector_column].apply(lambda x: x.tolist())

# Expose the chunk identifier under 'id' (presumably the index key field --
# verify against the index schema), then drop the columns that are not uploaded.
load_df['id'] = load_df['chunk_id']
load_df = load_df.drop(
    columns=['page_num', 'chunk_id', 'preprocessing_pipeline', 'filing_period', 'filing_date']
)


In [76]:
# Turn every DataFrame row into a plain dict and tag it with
# '@search.action' = 'upload'. No embeddings are computed here; the vectors
# are already columns of load_df.
input_data = [
    {**record, '@search.action': 'upload'}
    for record in load_df.to_dict(orient='records')
]

In [84]:
# Client bound to the target index; used for all uploads below.
search_client = SearchClient(
    endpoint=AZURE_SEARCH_SERVICE_ENDPOINT,
    index_name=index_name,
    credential=AZURE_SEARCH_ADMIN_CREDENTIAL,
)

# Upload in batches instead of issuing one HTTP request per document.
# upload_documents expects a list of documents and returns one IndexingResult
# per document.
# NOTE(review): the service limits a single indexing request (documented as
# 1000 documents / 16 MB); 100 keeps payloads with two embedding vectors per
# document modest -- tune to your vector dimensions.
UPLOAD_BATCH_SIZE = 100

failed_keys = []
for batch_start in range(0, len(input_data), UPLOAD_BATCH_SIZE):
    batch = input_data[batch_start:batch_start + UPLOAD_BATCH_SIZE]
    results = search_client.upload_documents(batch)
    # A request can succeed overall while individual documents are rejected,
    # so inspect each result rather than discarding the return value.
    failed_keys.extend(result.key for result in results if not result.succeeded)

uploaded_count = len(input_data) - len(failed_keys)
print(f"Uploaded {uploaded_count} of {len(input_data)} documents to index '{index_name}'.")
if failed_keys:
    # Report rather than raise: the earlier behaviour was best-effort, so keep
    # the cell running but make the failures visible.
    print(f"{len(failed_keys)} documents failed to upload; first keys: {failed_keys[:10]}")