# Create an AI Search (formerly Azure Cognitive Search) resource

In [None]:
import json
import os
import subprocess

import requests
from azure.identity import DefaultAzureCredential
from dotenv import load_dotenv

# Populate os.environ before the os.environ.get() lookups further down
# (load_dotenv is expected to read a local .env file if one exists).
load_dotenv()

# Deployment settings shared by the notebook live in config.json. The file is
# read with an explicit encoding and closed before its values are unpacked.
with open("config.json", encoding="utf-8") as f:
    config = json.load(f)

RESOURCE_GROUP_NAME = config["resource_group_name"]
LOCATION = config["location"]
STORAGE_ACCOUNT_NAME = config["storage_account_name"]
CONTAINER_NAME = config["container_name"]
# Misspelled alias kept so any code that still uses the old name keeps working.
CONTRAINER_NAME = CONTAINER_NAME
SUFFIX = config["suffix"]

# Account identifiers and secrets are taken from the environment; a variable
# that is not set yields None here.
SUBSCRIPTION_ID = os.environ.get("SUBSCRIPTION_ID")
TENANT_ID = os.environ.get("TENANT_ID")
AI_SERVICE_KEY = os.environ.get("AI_SERVICE_KEY")
OPENAI_URI = os.environ.get("OPENAI_URI")
OPENAI_KEY = os.environ.get("OPENAI_KEY")
MODEL_DEPLOYMENT_ID = os.environ.get("MODEL_DEPLOYMENT_ID")

# These four values are embedded in the index and skillset definitions later
# in the notebook. A None would be serialized as JSON null there, so point out
# missing ones now instead of at request time.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("AI_SERVICE_KEY", AI_SERVICE_KEY),
        ("OPENAI_URI", OPENAI_URI),
        ("OPENAI_KEY", OPENAI_KEY),
        ("MODEL_DEPLOYMENT_ID", MODEL_DEPLOYMENT_ID),
    )
    if not env_value
]
if _missing_env:
    print("Warning: environment variables not set:", ", ".join(_missing_env))

# The search service name is derived from the configured suffix.
SEARCH_SERVICE_NAME = f"azuredocs-search-{SUFFIX}"

index_name = "azuredocs-index"

# NOTE(review): this credential is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
credential = DefaultAzureCredential()

In [None]:
# In case you need to specify the tenant ID and subscription ID
# (TENANT_ID and SUBSCRIPTION_ID are the values read from the environment
# earlier in this notebook). Lines starting with "!" are IPython shell escapes
# that run the Azure CLI.
!az login --tenant $TENANT_ID
!az account set --subscription $SUBSCRIPTION_ID

# Create the search service in the configured resource group and location,
# requesting the Standard SKU with one partition and one replica.
command = f"az search service create --name {SEARCH_SERVICE_NAME} --resource-group {RESOURCE_GROUP_NAME} --location {LOCATION} --sku Standard --partition-count 1 --replica-count 1"
!{command}

In [None]:
# Ask the Azure CLI for the admin keys of the search service and keep the
# primary one; it is sent as the "api-key" header on the REST calls below.
command = " ".join([
    "az search admin-key show",
    f"--service-name {SEARCH_SERVICE_NAME}",
    f"--resource-group {RESOURCE_GROUP_NAME}",
])

# check=True raises CalledProcessError when the CLI exits with a non-zero code.
output = subprocess.run(
    command,
    shell=True,
    check=True,
    stdout=subprocess.PIPE,
).stdout.decode('utf-8')

# The CLI output is parsed as JSON; only its "primaryKey" entry is used.
output_json = json.loads(output)
admin_key = output_json['primaryKey']

In [None]:
# Record the search service details in the shared configuration and write the
# whole configuration back to config.json (the file is overwritten).
# NOTE(review): this stores the admin key in plain text on disk — make sure
# config.json is not committed or shared.
config.update({
    "SEARCH_SERVICE_NAME": SEARCH_SERVICE_NAME,
    "SEARCH_ADMIN_KEY": admin_key,
    "INDEX_NAME": index_name,
})

with open("config.json", 'w') as f:
    f.write(json.dumps(config))

In [None]:
# Value of the "api-version" query parameter used by every search service
# REST request in this notebook (indexes, datasources, skillsets, indexers).
API_VERSION = "2023-10-01-preview"

In [None]:
# Index definition sent to the search service, assembled from named parts.

# Semantic configuration: "title" is the title field and "page" the content
# field; no keyword fields are listed.
_semantic_settings = {
    "configurations": [
        {
            "name": "default",
            "prioritizedFields": {
                "titleField": {"fieldName": "title"},
                "prioritizedContentFields": [{"fieldName": "page"}],
                "prioritizedKeywordsFields": [],
            },
        }
    ]
}

# HNSW vector search algorithm using the cosine metric.
_hnsw_algorithm = {
    "name": "myalgo",
    "kind": "hnsw",
    "hnswParameters": {
        "metric": "cosine",
        "m": 4,
        "efConstruction": 400,
        "efSearch": 1000,
    },
}

# Azure OpenAI vectorizer built from the endpoint, key and deployment id that
# were read from the environment.
_openai_vectorizer = {
    "name": "openai",
    "kind": "azureOpenAI",
    "azureOpenAIParameters": {
        "resourceUri": OPENAI_URI,
        "apiKey": OPENAI_KEY,
        "deploymentId": MODEL_DEPLOYMENT_ID,
    },
}

# Profile that ties the algorithm and the vectorizer together by name.
_vector_profile = {
    "name": "myprofile",
    "algorithm": _hnsw_algorithm["name"],
    "vectorizer": _openai_vectorizer["name"],
}

# Index fields: ChunkKey is the document key, "vector" holds the 1536-dimension
# embedding and is bound to the vector search profile above.
_index_fields = [
    {
        "name": "ChunkKey",
        "type": "Edm.String",
        "key": True,
        "analyzer": "keyword",
    },
    {"name": "ParentKey", "type": "Edm.String"},
    {"name": "page", "type": "Edm.String"},
    {
        "name": "vector",
        "type": "Collection(Edm.Single)",
        "dimensions": 1536,
        "vectorSearchProfile": _vector_profile["name"],
        "searchable": True,
        "retrievable": True,
        "filterable": False,
        "sortable": False,
        "facetable": False,
    },
    {
        "name": "filename",
        "retrievable": True,
        "searchable": False,
        "type": "Edm.String",
    },
    {
        "name": "title",
        "retrievable": True,
        "searchable": True,
        "type": "Edm.String",
    },
    {
        "name": "path",
        "retrievable": True,
        "searchable": False,
        "type": "Edm.String",
    },
    {
        "name": "language",
        "retrievable": True,
        "searchable": False,
        "filterable": True,
        "facetable": True,
        "type": "Edm.String",
    },
]

index_body = {
    "name": index_name,
    "semantic": _semantic_settings,
    "vectorSearch": {
        "algorithms": [_hnsw_algorithm],
        "vectorizers": [_openai_vectorizer],
        "profiles": [_vector_profile],
    },
    "fields": _index_fields,
}

In [None]:
# Headers reused by every REST call in this notebook; the admin key is sent
# in the "api-key" header.
headers = {
    "Content-Type": "application/json",
    "api-key": admin_key
}

# Indexes collection of the search service
url = f"https://{SEARCH_SERVICE_NAME}.search.windows.net/indexes?api-version={API_VERSION}"

# POST the index definition. Without a timeout, requests waits indefinitely on
# an unresponsive endpoint, which would leave the cell hanging.
# NOTE(review): re-running this cell against an existing index presumably
# returns an error status — confirm and delete the index first if needed.
response = requests.post(url, headers=headers, data=json.dumps(index_body), timeout=60)

# Print the status code and the response. Fall back to the raw text when the
# body is empty or not JSON, so the status above is not lost to an exception.
print("Status code:", response.status_code)
try:
    print("Response:", response.json())
except ValueError:
    print("Response:", response.text)

In [None]:
# Blob storage settings come from the shared configuration loaded earlier.
storage_account_connection_string = config["storage_account_connection_string"]
container_name = config["container_name"]

# Data source definition for the blob container, built up section by section.
datasource_body = {
    "name": "azuredocs-datasource",
    "description": "Blob data source example",
    "type": "azureblob",
}
# Connection string used by the search service to reach the storage account.
datasource_body["credentials"] = {
    "connectionString": storage_account_connection_string,
}
# Deletion detection policy based on native blob soft delete.
datasource_body["dataDeletionDetectionPolicy"] = {
    "@odata.type": "#Microsoft.Azure.Search.NativeBlobSoftDeleteDeletionDetectionPolicy",
}
datasource_body["container"] = {"name": container_name}

In [None]:
# Datasources collection of the search service
url = f"https://{SEARCH_SERVICE_NAME}.search.windows.net/datasources?api-version={API_VERSION}"

# Make the POST request. The timeout prevents the cell from hanging forever
# if the service does not answer.
response = requests.post(url, headers=headers, data=json.dumps(datasource_body), timeout=60)

# Print the status code and the response. Fall back to the raw text when the
# body is empty or not JSON, so the status above is not lost to an exception.
print("Status code:", response.status_code)
try:
    print("Response:", response.json())
except ValueError:
    print("Response:", response.text)

In [None]:
# Skillset definition, assembled from one named dict per skill.

# Skill 1: detect the language of the document content and expose it as
# "language" under /document.
_language_skill = {
    "@odata.type": "#Microsoft.Skills.Text.LanguageDetectionSkill",
    "context": "/document",
    "description": "If you have multilingual content, adding a language code is useful for filtering",
    "inputs": [
        {"name": "text", "source": "/document/content"},
    ],
    "outputs": [
        {"name": "languageCode", "targetName": "language"},
    ],
}

# Skill 2: split the content into pages (maximum length 3000, overlap 600),
# using the detected language and "en" as the default language code.
_split_skill = {
    "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
    "description": "split text into pages",
    "textSplitMode": "pages",
    "maximumPageLength": 3000,
    "pageOverlapLength": 600,
    "defaultLanguageCode": "en",
    "context": "/document",
    "inputs": [
        {"name": "text", "source": "/document/content"},
        {"name": "languageCode", "source": "/document/language"},
    ],
    "outputs": [
        {"name": "textItems", "targetName": "pages"},
    ],
}

# Skill 3: run once per page and store the Azure OpenAI embedding as "vector".
_embedding_skill = {
    "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
    "description": "Azure OpenAI Embedding Skill",
    "context": "/document/pages/*",
    "resourceUri": OPENAI_URI,
    "apiKey": OPENAI_KEY,
    "deploymentId": MODEL_DEPLOYMENT_ID,
    "inputs": [
        {"name": "text", "source": "/document/pages/*"},
    ],
    "outputs": [
        {"name": "embedding", "targetName": "vector"},
    ],
}

# Index projection mappings as (index field name, enrichment source) pairs.
_projection_mappings = [
    {"name": field_name, "source": source_path}
    for field_name, source_path in (
        ("page", "/document/pages/*"),
        ("vector", "/document/pages/*/vector"),
        ("language", "/document/language"),
        ("filename", "/document/filename"),
        ("path", "/document/path"),
        ("title", "/document/title"),
    )
]

skillset_body = {
    "name": "azuredocs-skillset",
    "description": "An e2e skillset",
    "skills": [_language_skill, _split_skill, _embedding_skill],
    # AI services resource attached by key (value read from the environment).
    "cognitiveServices": {
        "@odata.type": "#Microsoft.Azure.Search.CognitiveServicesByKey",
        "description": "mycogsvcs resource",
        "key": AI_SERVICE_KEY,
    },
    # Project every page into the target index, linked to its source document
    # through the "ParentKey" field.
    "indexProjections": {
        "selectors": [
            {
                "targetIndexName": index_name,
                "parentKeyFieldName": "ParentKey",
                "sourceContext": "/document/pages/*",
                "mappings": _projection_mappings,
            }
        ],
        "parameters": {
            "projectionMode": "skipIndexingParentDocuments",
        },
    },
}

In [None]:
# {{searchUri}}/skillsets?api-version=2023-10-01-preview

url = f"https://{SEARCH_SERVICE_NAME}.search.windows.net/skillsets?api-version={API_VERSION}"

# Make the POST request. The timeout prevents the cell from hanging forever
# if the service does not answer.
response = requests.post(url, headers=headers, data=json.dumps(skillset_body), timeout=60)

# Print the status code and the response. Fall back to the raw text when the
# body is empty or not JSON, so the status above is not lost to an exception.
print("Status code:", response.status_code)
try:
    print("Response:", response.json())
except ValueError:
    print("Response:", response.text)

In [None]:
# Indexer definition: connects the blob data source, the skillset and the
# target index, and is scheduled with the interval "PT2H".

# Execution parameters. Only files with the .md / .MD extensions are indexed,
# parsed as text, one document per batch.
# NOTE(review): -1 for the maxFailedItems settings presumably disables the
# failure limit — confirm against the indexer documentation.
_indexer_parameters = {
    "maxFailedItems": -1,
    "maxFailedItemsPerBatch": -1,
    "batchSize": 1,
    "configuration": {
        "dataToExtract": "contentAndMetadata",
        "indexedFileNameExtensions": ".md,.MD",
        "parsingMode": "text",
    },
}

# Blob metadata -> index field mappings as (source, target) pairs; the blob
# name feeds both "filename" and "title".
_field_mappings = [
    {"sourceFieldName": source_field, "targetFieldName": target_field}
    for source_field, target_field in (
        ("metadata_storage_name", "filename"),
        ("metadata_storage_path", "path"),
        ("metadata_storage_name", "title"),
    )
]

# The detected language produced by the skillset is mapped to "language".
_output_field_mappings = [
    {"sourceFieldName": "/document/language", "targetFieldName": "language"},
]

indexer_body = {
    "name": "azuredocs-indexer",
    "dataSourceName": "azuredocs-datasource",
    "targetIndexName": index_name,
    "skillsetName": "azuredocs-skillset",
    "schedule": {"interval": "PT2H"},
    "parameters": _indexer_parameters,
    "fieldMappings": _field_mappings,
    "outputFieldMappings": _output_field_mappings,
}

In [None]:
# {{searchUri}}/indexers?api-version=2023-10-01-preview

url = f"https://{SEARCH_SERVICE_NAME}.search.windows.net/indexers?api-version={API_VERSION}"

# Make the POST request. The timeout prevents the cell from hanging forever
# if the service does not answer.
response = requests.post(url, headers=headers, data=json.dumps(indexer_body), timeout=60)

# Print the status code and the response. Fall back to the raw text when the
# body is empty or not JSON, so the status above is not lost to an exception.
print("Status code:", response.status_code)
try:
    print("Response:", response.json())
except ValueError:
    print("Response:", response.text)
