# [Integrated Vectorization](https://learn.microsoft.com/en-us/azure/search/vector-search-integrated-vectorization) BDP Notebook - REST API


* Adapted for BDP from https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb
* Does not use API keys (relies on Azure Entra ID)

# Environment Variables Setup

Create a `.env` file in the current directory with the following environment variables:

## Required Variables

```env
# Azure AI Search Configuration
AZURE_SEARCH_SERVICE_ENDPOINT = "https://search-we-tst-air-01-dev-ds-1a-bdpdraft.search.windows.net"
AZURE_SEARCH_INDEX_NAME = "YOUR_INDEX_NAME"

# Azure OpenAI Configuration
AZURE_OPENAI_ENDPOINT = "https://cs-openai-we-tst-air-01-dev-ds-1a-bdpdraft.openai.azure.com/"
AZURE_OPENAI_EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
AZURE_OPENAI_EMBEDDING_DIMENSIONS = "1536"
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID = "https://cs-openai-we-tst-air-01-dev-ds-1a-bdpdraft.openai.azure.com/openai/deployments/text-embedding-ada-002"

# Azure Blob Storage Configuration
BLOB_CONTAINER_NAME = "YOUR_CONTAINER_NAME"
BLOB_CONNECTION_STRING = "ResourceId=/subscriptions/73cbde61-61fb-4d35-88e9-4aef7ab0c415/resourceGroups/rg-we-tst-air-01-dev-app-1a-bdpdraft/providers/Microsoft.Storage/storageAccounts/stpwetstair01devapp1abdp;"

# Azure Key Vault Configuration
KEYVAULT_KEY_NAME = "cmk-aisearch-we-tst-air-01-dev-ds-1a-bdpdraft"
KEYVAULT_URI = "https://kvwetstair01devds1abdpdr.vault.azure.net/"
```

## Notes

1. Replace the example values with your actual Azure resource configurations
2. Ensure the `.env` file is in the same directory as your Python notebook
3. The script uses Azure CLI credentials for authentication, so make sure you're logged in with `az login`
4. Never commit the `.env` file to version control


## Important notice

This notebook assumes that a container whose name matches `BLOB_CONTAINER_NAME` in your `.env` file (shown above as "YOUR_CONTAINER_NAME") already exists in the private storage account. If it does not exist yet, please go through the documentation:

https://amadeus.atlassian.net/wiki/spaces/DAAS/pages/2262041902/8.+Azure+Private+Storage+Accounts#%3Aquestion_mark%3A-Can-I-request-the-creation-of-ad-hoc-containers%3F

If you don't know what a "private storage account" is in BDP, see: https://amadeus.atlassian.net/wiki/x/LgXUhg


### Create an encrypted data source for the private storage account


In [None]:
from dotenv import load_dotenv
import os
import requests
import json
from azure.identity import AzureCliCredential

# Create (or update) the CMK-encrypted blob data source "<index name>-blob"
# on the Azure AI Search service, authenticating with the Azure CLI identity.
load_dotenv(override=True)

# Upper bound (seconds) for the HTTP call so a stalled connection cannot hang
# the cell forever; `requests` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 60

# Strip a trailing slash so the URL never contains "//datasources".
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"].rstrip("/")
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
datasource_name = f"{index_name}-blob"

url = f"{search_endpoint}/datasources('{datasource_name}')?api-version=2024-07-01"

payload = json.dumps(
    {
        "name": datasource_name,
        "description": "Private storage account data source",
        "type": "azureblob",  # it can be "adlsgen2" or "azureblob" for us
        "credentials": {
            "connectionString": os.environ["BLOB_CONNECTION_STRING"],
        },
        "container": {
            "name": os.environ["BLOB_CONTAINER_NAME"],
        },
        "encryptionKey": {
            "keyVaultKeyName": os.environ["KEYVAULT_KEY_NAME"],
            "keyVaultKeyVersion": "",  # can be empty string to retrieve latest version
            "keyVaultUri": os.environ["KEYVAULT_URI"],
        },
    }
)

# Entra ID bearer token for the search data plane (no API keys are used).
credential = AzureCliCredential()
token = credential.get_token("https://search.azure.com/.default").token

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}",
}

try:
    response = requests.request(
        "PUT", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    # The response body carries the service's explanation of the failure;
    # the exception message alone only contains the status line.
    if http_err.response is not None and http_err.response.text:
        print("Response body:", http_err.response.text)
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")
else:
    if 200 <= response.status_code < 300:
        print(f"Success: {response.status_code} - {response.reason}")
        try:
            json_data = response.json()
            print("Response JSON:", json.dumps(json_data, indent=4))
        except ValueError:
            # An empty body (e.g. 204 No Content) is not valid JSON.
            print("Response does not contain valid JSON data.")
    else:
        print(f"Unexpected status code: {response.status_code} - {response.reason}")

### Create an encrypted search index


In [None]:
import json
import os

import requests
from azure.identity import AzureCliCredential
from dotenv import load_dotenv

# Create (or update) the CMK-encrypted search index with a vector field,
# an HNSW vector profile, an Azure OpenAI vectorizer and a semantic config.
load_dotenv(override=True)

# Upper bound (seconds) for the HTTP call so a stalled connection cannot hang
# the cell forever; `requests` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 60

# Strip a trailing slash so the URL never contains "//indexes".
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"].rstrip("/")
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
embedding_model_name = os.environ["AZURE_OPENAI_EMBEDDING_MODEL_NAME"]
# The REST schema defines `dimensions` as an integer; environment variables
# are always strings, so convert explicitly instead of sending "1536".
embedding_dimensions = int(os.environ["AZURE_OPENAI_EMBEDDING_DIMENSIONS"])

url = f"{search_endpoint}/indexes('{index_name}')?api-version=2024-07-01"

payload = json.dumps(
    {
        "name": index_name,
        "fields": [
            {
                "name": "parent_id",
                "type": "Edm.String",
                "sortable": True,
                "filterable": True,
                "facetable": True,
            },
            {
                "name": "title",
                "type": "Edm.String",
            },
            {
                "name": "chunk_id",
                "type": "Edm.String",
                "key": True,
                "sortable": True,
                "filterable": True,
                "facetable": True,
                "analyzer": "keyword",
            },
            {
                "name": "chunk",
                "type": "Edm.String",
                "sortable": False,
                "filterable": False,
                "facetable": False,
            },
            {
                "name": "vector",
                "type": "Collection(Edm.Single)",
                "dimensions": embedding_dimensions,
                "vectorSearchProfile": "myHnswProfile",
            },
        ],
        "vectorSearch": {
            "algorithms": [
                {
                    "name": "myHnsw",
                    "kind": "hnsw",
                },
            ],
            "profiles": [
                {
                    "name": "myHnswProfile",
                    "algorithm": "myHnsw",
                    "vectorizer": "myOpenAI",
                }
            ],
            "vectorizers": [
                {
                    "name": "myOpenAI",
                    "kind": "azureOpenAI",
                    "azureOpenAIParameters": {
                        "apiKey": "",  # to use a system managed identity, leave apiKey and authIdentity blank
                        # https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-azure-openai-embedding#skill-parameters
                        "resourceUri": os.environ["AZURE_OPENAI_ENDPOINT"],
                        "modelName": embedding_model_name,
                        # NOTE(review): the model name is reused as the deployment
                        # name, which presumably only works when the deployment is
                        # named after the model - confirm for your resource.
                        "deploymentId": embedding_model_name,
                    },
                },
            ],
        },
        "semantic": {
            "configurations": [
                {
                    "name": "my-semantic-config",
                    "prioritizedFields": {
                        "prioritizedContentFields": [
                            {
                                "fieldName": "chunk",
                            },
                        ],
                    },
                },
            ]
        },
        "encryptionKey": {
            "keyVaultKeyName": os.environ["KEYVAULT_KEY_NAME"],
            "keyVaultKeyVersion": "",  # can be empty string to retrieve latest version
            "keyVaultUri": os.environ["KEYVAULT_URI"],
        },
    },
)

# Entra ID bearer token for the search data plane (no API keys are used).
credential = AzureCliCredential()
token = credential.get_token("https://search.azure.com/.default").token

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}",
}

try:
    response = requests.request(
        "PUT", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    # The response body carries the service's explanation of the failure;
    # the exception message alone only contains the status line.
    if http_err.response is not None and http_err.response.text:
        print("Response body:", http_err.response.text)
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")
else:
    if 200 <= response.status_code < 300:
        print(f"Success: {response.status_code} - {response.reason}")
        try:
            json_data = response.json()
            print("Response JSON:", json.dumps(json_data, indent=4))
        except ValueError:
            # An empty body (e.g. 204 No Content) is not valid JSON.
            print("Response does not contain valid JSON data.")
    else:
        print(f"Unexpected status code: {response.status_code} - {response.reason}")

### Create an encrypted skillset


In [None]:
import json
import os

import requests
from azure.identity import AzureCliCredential
from dotenv import load_dotenv

# Create (or update) the CMK-encrypted skillset "<index name>-skillset":
# a split skill that chunks documents, an Azure OpenAI embedding skill, and
# index projections that write one search document per chunk.
load_dotenv(override=True)

# Upper bound (seconds) for the HTTP call so a stalled connection cannot hang
# the cell forever; `requests` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 60

# Strip a trailing slash so the URL never contains "//skillsets".
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"].rstrip("/")
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
skillset_name = f"{index_name}-skillset"
embedding_model_name = os.environ["AZURE_OPENAI_EMBEDDING_MODEL_NAME"]
# The REST schema defines `dimensions` as an integer; environment variables
# are always strings, so convert explicitly instead of sending "1536".
embedding_dimensions = int(os.environ["AZURE_OPENAI_EMBEDDING_DIMENSIONS"])

url = f"{search_endpoint}/skillsets('{skillset_name}')?api-version=2024-07-01"

payload = json.dumps(
    {
        "name": skillset_name,
        "description": "Skillset to chunk documents and generate embeddings",
        "skills": [
            {
                "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
                "description": "Splitt skill to chunk documents",
                "textSplitMode": "pages",
                "context": "/document",
                "maximumPageLength": 2000,
                "pageOverlapLength": 500,
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/content",
                    },
                ],
                "outputs": [
                    {
                        "name": "textItems",
                        "targetName": "pages",
                    },
                ],
            },
            {
                "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
                "description": "Skill to generate embeddings via Azure OpenAI",
                "context": "/document/pages/*",
                "resourceUri": os.environ["AZURE_OPENAI_ENDPOINT"],
                # NOTE(review): the model name is reused as the deployment name,
                # which presumably only works when the deployment is named after
                # the model - confirm for your resource.
                "deploymentId": embedding_model_name,
                "apiKey": "",  # to use a system managed identity, leave apiKey and authIdentity blank
                # https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-azure-openai-embedding#skill-parameters
                "modelName": embedding_model_name,
                "dimensions": embedding_dimensions,
                "inputs": [
                    {
                        "name": "text",
                        "source": "/document/pages/*",
                    },
                ],
                "outputs": [
                    {
                        "name": "embedding",
                        "targetName": "vector",
                    },
                ],
            },
        ],
        "indexProjections": {
            "parameters": {
                "projectionMode": "skipIndexingParentDocuments",
            },
            "selectors": [
                {
                    "mappings": [
                        {
                            "name": "chunk",
                            "source": "/document/pages/*",
                        },
                        {
                            "name": "vector",
                            "source": "/document/pages/*/vector",
                        },
                        {
                            "name": "title",
                            "source": "/document/metadata_storage_name",
                        },
                    ],
                    "targetIndexName": index_name,
                    "parentKeyFieldName": "parent_id",
                    "sourceContext": "/document/pages/*",
                }
            ],
        },
        "encryptionKey": {
            "keyVaultKeyName": os.environ["KEYVAULT_KEY_NAME"],
            "keyVaultKeyVersion": "",  # can be empty string to retrieve latest version
            "keyVaultUri": os.environ["KEYVAULT_URI"],
        },
    },
)

# Entra ID bearer token for the search data plane (no API keys are used).
credential = AzureCliCredential()
token = credential.get_token("https://search.azure.com/.default").token

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}",
}

try:
    response = requests.request(
        "PUT", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    # The response body carries the service's explanation of the failure;
    # the exception message alone only contains the status line.
    if http_err.response is not None and http_err.response.text:
        print("Response body:", http_err.response.text)
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")
else:
    if 200 <= response.status_code < 300:
        print(f"Success: {response.status_code} - {response.reason}")
        try:
            json_data = response.json()
            print("Response JSON:", json.dumps(json_data, indent=4))
        except ValueError:
            # An empty body (e.g. 204 No Content) is not valid JSON.
            print("Response does not contain valid JSON data.")
    else:
        print(f"Unexpected status code: {response.status_code} - {response.reason}")

### Create an encrypted indexer


In [None]:
import json
import os

import requests
from azure.identity import AzureCliCredential
from dotenv import load_dotenv

# Create (or update) the CMK-encrypted indexer "<index name>-indexer" that
# connects the "<index name>-blob" data source, the "<index name>-skillset"
# skillset and the target index.
load_dotenv(override=True)

# Upper bound (seconds) for the HTTP call so a stalled connection cannot hang
# the cell forever; `requests` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 60

# Strip a trailing slash so the URL never contains "//indexers".
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"].rstrip("/")
index_name = os.environ["AZURE_SEARCH_INDEX_NAME"]
indexer_name = f"{index_name}-indexer"

url = f"{search_endpoint}/indexers('{indexer_name}')?api-version=2024-07-01"

payload = json.dumps(
    {
        "name": indexer_name,
        "description": "Indexer to index documents and generate embeddings",
        "skillsetName": f"{index_name}-skillset",
        "dataSourceName": f"{index_name}-blob",
        "targetIndexName": index_name,
        "parameters": {
            "configuration": {
                "dataToExtract": "contentAndMetadata",
                "parsingMode": "jsonArray",
                "firstLineContainsHeaders": False,
                # "executionEnvironment": "private", #never use this parameter unless Azure AI Search SKU is upper or equal to "S2"
                # otherwise it will fail
            }
        },
        "encryptionKey": {
            "keyVaultKeyName": os.environ["KEYVAULT_KEY_NAME"],
            "keyVaultKeyVersion": "",  # can be empty string to retrieve latest version
            "keyVaultUri": os.environ["KEYVAULT_URI"],
        },
    }
)

# Entra ID bearer token for the search data plane (no API keys are used).
credential = AzureCliCredential()
token = credential.get_token("https://search.azure.com/.default").token

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}",
}

try:
    response = requests.request(
        "PUT", url, headers=headers, data=payload, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    # The response body carries the service's explanation of the failure;
    # the exception message alone only contains the status line.
    if http_err.response is not None and http_err.response.text:
        print("Response body:", http_err.response.text)
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")
else:
    if 200 <= response.status_code < 300:
        print(f"Success: {response.status_code} - {response.reason}")
        try:
            json_data = response.json()
            print("Response JSON:", json.dumps(json_data, indent=4))
        except ValueError:
            # An empty body (e.g. 204 No Content) is not valid JSON.
            print("Response does not contain valid JSON data.")
    else:
        print(f"Unexpected status code: {response.status_code} - {response.reason}")

### Run the indexer


In [None]:
import json
import os

import requests
from azure.identity import AzureCliCredential
from dotenv import load_dotenv

# Trigger an on-demand run of the indexer "<index name>-indexer".
load_dotenv(override=True)

# Upper bound (seconds) for the HTTP call so a stalled connection cannot hang
# the cell forever; `requests` has no timeout by default.
REQUEST_TIMEOUT_SECONDS = 60

# Strip a trailing slash so the URL never contains "//indexers".
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"].rstrip("/")
indexerName = "{AZURE_SEARCH_INDEX_NAME}-indexer".format(
    AZURE_SEARCH_INDEX_NAME=os.environ["AZURE_SEARCH_INDEX_NAME"]
)

url = f"{search_endpoint}/indexers('{indexerName}')/search.run?api-version=2024-07-01"

# Entra ID bearer token for the search data plane (no API keys are used).
credential = AzureCliCredential()
token = credential.get_token("https://search.azure.com/.default").token

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {token}",
}

try:
    # The run request carries no body. Previously this call sent `payload`,
    # a variable this cell never defines: it was either the stale indexer
    # definition left over from the previous cell or a NameError.
    response = requests.request(
        "POST", url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS
    )
    response.raise_for_status()
except requests.exceptions.HTTPError as http_err:
    print(f"HTTP error occurred: {http_err}")
    # The response body carries the service's explanation of the failure;
    # the exception message alone only contains the status line.
    if http_err.response is not None and http_err.response.text:
        print("Response body:", http_err.response.text)
except requests.exceptions.ConnectionError as conn_err:
    print(f"Connection error occurred: {conn_err}")
except requests.exceptions.Timeout as timeout_err:
    print(f"Timeout error occurred: {timeout_err}")
except requests.exceptions.RequestException as req_err:
    print(f"An error occurred: {req_err}")
else:
    if 200 <= response.status_code < 300:
        print(f"Success: {response.status_code} - {response.reason}")
        try:
            json_data = response.json()
            print("Response JSON:", json.dumps(json_data, indent=4))
        except ValueError:
            # A successful response with an empty body is not valid JSON.
            print("Response does not contain valid JSON data.")
    else:
        print(f"Unexpected status code: {response.status_code} - {response.reason}")