## Import libs and Environment vars

In [None]:
from openai import AzureOpenAI
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient 
from utils.storage_functions import get_table_data
import requests
import json
import os
from dotenv import load_dotenv

# Pull settings from a local .env file. override=True lets values in .env
# replace variables that are already set in the process environment.
load_dotenv(override=True)

# Azure AI Search: service endpoint, target index name, query key and admin key.
SEARCH_ENDPOINT = os.environ.get("SEARCH_ENDPOINT")
INDEX_NAME = os.environ.get("INDEX_NAME")
SEARCH_KEY = os.environ.get("SEARCH_QUERY_KEY")
SEARCH_ADMIN_KEY = os.environ.get("SEARCH_ADMIN_KEY")

# Azure OpenAI: credentials, endpoint, API version and model name.
AOAI_API_KEY = os.environ.get("AOAI_API_KEY")
AOAI_ENDPOINT = os.environ.get("AOAI_ENDPOINT")
AOAI_API_VERSION = os.environ.get("AOAI_API_VERSION")
AOAI_MODEL = os.environ.get("AOAI_MODEL")

# Storage account used to build metadata_storage_path for uploaded documents.
STORAGE_ACCOUNT_NAME = os.environ.get("STORAGE_ACCOUNT_NAME")

## Install required libraries (if not already installed)

In [None]:
# pip install -r requirements.txt

## 1. Define function to call AOAI Embeddings endpoint
- Used to create vector embeddings for each row and field of the table

In [None]:
from openai import AzureOpenAI

def get_embeddings(text, model='text-embedding-3-large'):
    """Return the Azure OpenAI embedding vector for ``text``.

    The client is configured from the notebook-level settings loaded from the
    environment (AOAI_API_KEY, AOAI_ENDPOINT, AOAI_API_VERSION) instead of
    literals embedded in the source, so no credential has to be pasted into
    the notebook.

    Args:
        text: Input passed to the embeddings endpoint.
        model: Name of the embedding deployment/model to call. Defaults to
            'text-embedding-3-large'.

    Returns:
        The ``embedding`` of the first item in the response's ``data`` list.

    Raises:
        ValueError: If AOAI_API_KEY or AOAI_ENDPOINT is not set.
    """
    # Fail early with a clear message rather than sending a request with a
    # missing key or to an endpoint that was never configured.
    if not AOAI_API_KEY or not AOAI_ENDPOINT:
        raise ValueError(
            "AOAI_API_KEY and AOAI_ENDPOINT must be set (e.g. in .env) "
            "before calling get_embeddings"
        )

    openai_client = AzureOpenAI(
            api_key = AOAI_API_KEY,
            # Keep the previously hard-coded version as the fallback.
            api_version = AOAI_API_VERSION or "2024-05-01-preview",
            azure_endpoint = AOAI_ENDPOINT
        )

    embedding = openai_client.embeddings.create(
                 input = text,
                 model = model
             ).data[0].embedding

    return embedding


## 2. Create Index in Azure AI Search
- Ensure that the search service has room for a new index. The Free tier allows only 3 indexes per service; paid tiers allow more
- If an index of the same name already exists it will be updated. If the changes you are making are incompatible with the existing index definition, you will receive an error. In this case, either rename your target index, or delete the existing one

In [None]:
def createIndex(index_name):
  """Create or update an Azure AI Search index from index_definition.json.

  Loads the index definition from ``index_definition.json`` in the current
  working directory, sets its ``name`` to ``index_name``, injects AOAI_API_KEY
  into the first vectorizer's ``azureOpenAIParameters``, and PUTs the result
  to ``{SEARCH_ENDPOINT}/indexes/{index_name}``.

  Secrets (SEARCH_ADMIN_KEY, the AOAI key) are deliberately never printed:
  notebook cell output is saved with the file and is easily shared.

  Args:
    index_name: Name of the index to create or update.

  Returns:
    None. Success or failure is reported on stdout.
  """
  print("Creating index...")
  headers = {
    "Content-Type": "application/json",
    "api-key": SEARCH_ADMIN_KEY
  }

  with open("index_definition.json", "r", encoding="utf-8") as f:
    index_definition = json.load(f)

  index_definition["name"] = index_name
  # Replace whatever key is stored in the definition file with the configured one.
  index_definition['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']['apiKey'] = AOAI_API_KEY
  endpoint = f"{SEARCH_ENDPOINT}/indexes/{index_name}?api-version=2024-11-01-Preview"
  # Send the PUT request to create or update the index
  response = requests.put(endpoint, headers=headers, data=json.dumps(index_definition))

  if 200 <= response.status_code < 300:
    print("Index created or updated successfully!")
  else:
    print(f"Failed to create or update index. Status code: {response.status_code}")
    # The response body carries the service's explanation (e.g. an incompatible
    # change to an existing index); without it the failure is hard to diagnose.
    print(f"Response body: {response.text}")

# Create the index under the same name the upload step writes to (INDEX_NAME);
# fall back to the original literal when that variable is unset.
createIndex(INDEX_NAME or "csv-index")

## 3. Upload JSON records to the AI Search Index
- The loop below iterates over each row in the table, creates a vector embedding for each desired field, builds the index payload as JSON, and uploads it to the remote AI Search resource
- If using CSV with different fields, update the fields in the `doc` JSON accordingly
- metadata_storage_path points to the source of the data
- This step replaces the "indexer" job that would otherwise be created with the Azure AI Search Create Indexer REST API: https://learn.microsoft.com/en-us/rest/api/searchservice/create-indexer

In [8]:

def uploadIndexContent(batch_size=25, max_docs=None):
  """Embed every table row and upload it to the Azure AI Search index.

  Builds one document per row of the table returned by get_table_data()
  (assumed to be a pandas DataFrame following the ServiceCatalogData.csv
  schema -- TODO confirm), computing an embedding for each text field with
  get_embeddings(), and pushes the documents to INDEX_NAME directly instead
  of using the Indexer API.

  Documents are sent in batches and each document is sent exactly once.
  Rows that fail to build and documents the service rejects are reported on
  stdout; processing continues with the remaining rows.

  Args:
    batch_size: Maximum number of documents per upload request. Kept small by
      default because every document carries five embedding vectors and the
      service limits the size of a single indexing request.
    max_docs: If given, only the first ``max_docs`` rows are processed; useful
      for testing with a small number of documents.

  Returns:
    None.
  """
  credential = AzureKeyCredential(SEARCH_ADMIN_KEY)
  search_client = SearchClient(SEARCH_ENDPOINT, INDEX_NAME, credential)

  # df = pd.read_csv(file_path)
  df = get_table_data()

  print(df.head())
  print(df.columns)

  if max_docs is not None:
    df = df.head(max_docs)

  batch_size = max(1, batch_size)
  metadata_storage_path = f"https://{STORAGE_ACCOUNT_NAME}.table.core.windows.net/DSWtest"
  pending = []

  def _flush():
    # Upload the pending batch once, report per-document failures, then drop it
    # so that no document is ever re-sent with a later batch.
    if not pending:
      return
    try:
      results = search_client.upload_documents(documents=list(pending))
      for result in results:
        if not result.succeeded:
          print(f"Error uploading document {result.key}: {result.error_message}")
    except Exception as e:
      print(f"Error uploading documents {pending[0]['id']}..{pending[-1]['id']}: {e}")
    finally:
      pending.clear()

  for i, row in df.iterrows():
    try:
      doc = {
        "id": str(i),
        "category": row["Category"],
        "categoryVector": get_embeddings(row["Category"]),
        "categoryDescription": row["CategoryDescription"],
        "categoryDescriptionVector": get_embeddings(row["CategoryDescription"]),
        "DefaultTeam": row["DefaultTeam"],
        "defaultTeamVector": get_embeddings(row["DefaultTeam"]),
        "Organization": row["Organization"],
        "organizationVector": get_embeddings(row["Organization"]),
        "Service": row["Service"],
        "serviceVector": get_embeddings(row["Service"]),
        "metadata_storage_path": metadata_storage_path
      }
    except Exception as e:
      # A bad row (missing column, embedding failure) must not abort the run.
      print(f"Error building document {i}: {e}")
      continue

    pending.append(doc)
    if len(pending) >= batch_size:
      _flush()

  # Send whatever is left over from the last partial batch.
  _flush()

# Run the upload: embeds the text fields of every table row and pushes the
# resulting documents to the search index named by INDEX_NAME.
uploadIndexContent()

https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com
https://www.google.com


[<azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47950>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc45750>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47990>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47890>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc457d0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47790>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc45110>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47410>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47610>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc472d0>,
 <azure.search.documents._generated.models._models_py3.IndexingResult at 0x7f681cc47390>,
 <azure.se