The Azure AI Search documentation can be found here: https://github.com/MicrosoftDocs/azure-docs/tree/main/articles/search



In [6]:
import json
import os
import time
import uuid

import requests
from azure.identity import DefaultAzureCredential
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.storage import StorageManagementClient
from azure.storage.blob import BlobServiceClient
from dotenv import load_dotenv
from requests.exceptions import HTTPError
from tqdm import tqdm

# Pull settings from a local .env file into the process environment
load_dotenv()

# Deployment settings read from the environment (None when a variable is unset)
TENANT_ID = os.environ.get("TENANT_ID")
SUBSCRIPTION_ID = os.environ.get("SUBSCRIPTION_ID")
GITHUB_PAT = os.environ.get("GITHUB_PAT")
LOCATION = os.environ.get("LOCATION")

# GitHub contents-API endpoint for the articles/search folder of the azure-docs repo
DOCS_GITHUB_URL = "https://api.github.com/repos/MicrosoftDocs/azure-docs/contents/articles/search"

RESOURCE_GROUP_NAME = "chat-with-azure-docs-rg"

# Deterministic suffix: the first 15 hex characters of a UUIDv5 computed over
# tenant ID + subscription ID + resource group name. Together with the 9-character
# "azuredocs" prefix below this yields a 24-character storage account name.
_suffix_seed = f"{TENANT_ID}{SUBSCRIPTION_ID}{RESOURCE_GROUP_NAME}"
SUFFIX = uuid.uuid5(uuid.NAMESPACE_DNS, _suffix_seed).hex[:15]

STORAGE_ACCOUNT_NAME = f"azuredocs{SUFFIX}"
CONTAINER_NAME = "azuredocs"

# Echo the effective settings, one per line
print(
    f"Location: {LOCATION}",
    f"Resource Group: {RESOURCE_GROUP_NAME}",
    f"Storage Account: {STORAGE_ACCOUNT_NAME}",
    f"Container: {CONTAINER_NAME}",
    sep="\n",
)

Location: northeurope
Resource Group: chat-with-azure-docs-rg
Storage Account: azuredocsad3761558b1a5c5
Container: azuredocs


In [None]:
# Uncomment the two commands below if you need to sign in to a specific tenant and select a subscription
# !az login --tenant $TENANT_ID
# !az account set --subscription $SUBSCRIPTION_ID

In [7]:
# One credential object, shared by the management clients in this notebook
credential = DefaultAzureCredential()

# Client for resource-group operations; reuses the credential created above
# instead of instantiating a second DefaultAzureCredential.
resource_client = ResourceManagementClient(
    credential=credential,
    subscription_id=SUBSCRIPTION_ID
)

# Create the resource group, or update it if it already exists.
# Left as the cell's last expression so the returned ResourceGroup is displayed.
resource_client.resource_groups.create_or_update(
    resource_group_name=RESOURCE_GROUP_NAME,
    parameters={"location": LOCATION}
)


<azure.mgmt.resource.resources.v2022_09_01.models._models_py3.ResourceGroup at 0x24ba9476f90>

In [8]:
# Create a storage account or check if it already exists using the SDK
storage_client = StorageManagementClient(credential, SUBSCRIPTION_ID)

# begin_create returns a poller; .result() blocks until provisioning finishes
storage_account = storage_client.storage_accounts.begin_create(
    RESOURCE_GROUP_NAME,
    STORAGE_ACCOUNT_NAME,
    {
        "sku": {
            "name": "Standard_LRS",
        },
        # NOTE(review): the recorded cell output reports kind 'StorageV2' —
        # confirm which account kind is actually intended here.
        "kind": "blobstorage",
        "location": LOCATION,
        "accessTier": "Hot",
        "tags": {
            "purpose": "demo",
            # Tag key previously carried a stray trailing colon ("demo-name:")
            "demo-name": "chat-with-azure-docs"
        },
    }
).result()
print(f"Create storage account:\n{storage_account}")

# Create the blob container that will hold the docs, with public access disabled
container = storage_client.blob_containers.create(
    RESOURCE_GROUP_NAME,
    STORAGE_ACCOUNT_NAME,
    CONTAINER_NAME,
    {
        "public_access": "None"
    }
)

Create storage account:
{'additional_properties': {}, 'id': '/subscriptions/13c1109b-ba76-4ca6-8161-8767bdf3c75c/resourceGroups/chat-with-azure-docs-rg/providers/Microsoft.Storage/storageAccounts/azuredocsad3761558b1a5c5', 'name': 'azuredocsad3761558b1a5c5', 'type': 'Microsoft.Storage/storageAccounts', 'tags': {'purpose': 'demo', 'demo-name:': 'chat-with-azure-docs'}, 'location': 'northeurope', 'sku': <azure.mgmt.storage.v2023_01_01.models._models_py3.Sku object at 0x0000024BA990E0D0>, 'kind': 'StorageV2', 'identity': None, 'extended_location': None, 'provisioning_state': 'Succeeded', 'primary_endpoints': <azure.mgmt.storage.v2023_01_01.models._models_py3.Endpoints object at 0x0000024BA990DA90>, 'primary_location': 'northeurope', 'status_of_primary': 'available', 'last_geo_failover_time': None, 'secondary_location': None, 'status_of_secondary': None, 'creation_time': datetime.datetime(2023, 12, 20, 13, 27, 59, 963472, tzinfo=<isodate.tzinfo.Utc object at 0x0000024BA8783D10>), 'custom_d

In [14]:
# Look up the storage account's access keys and take the first one
primary_key = storage_client.storage_accounts.list_keys(
    RESOURCE_GROUP_NAME, STORAGE_ACCOUNT_NAME
).keys[0].value

# Assemble the connection string from its four ';'-separated parts
connection_string = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={STORAGE_ACCOUNT_NAME};"
    f"AccountKey={primary_key};"
    "EndpointSuffix=core.windows.net"
)

# Data-plane clients: one for the account, one scoped to the docs container
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(CONTAINER_NAME)

In [15]:
!az storage blob service-properties delete-policy update --account-name $STORAGE_ACCOUNT_NAME  --enable true --days-retained 7 --connection-string $connection_string

{
  "allowPermanentDelete": null,
  "days": 7,
  "enabled": true
}


In [None]:
# Function to process directory
def process_directory(url):
    """Recursively copy the Markdown files of a GitHub directory into the blob container.

    Args:
        url: GitHub contents-API URL of the directory to process.

    The directory listing is fetched with ``get_with_retry``. Every entry of
    type ``file`` whose name ends in ``.md`` is downloaded from its
    ``download_url`` and uploaded under its repository ``path`` as the blob
    name, unless a blob with that name already exists. Entries of type ``dir``
    are processed recursively through their ``url``. A response that is not a
    JSON list of objects is ignored.
    """
    headers = {'Authorization': f'token {GITHUB_PAT}'} if GITHUB_PAT else {}

    # get_with_retry returns the raw response body (bytes); decode the JSON
    # listing first, otherwise the list check below can never succeed.
    contents = json.loads(get_with_retry(url, headers))

    # Only a list of dictionaries is a directory listing we know how to walk
    if not (isinstance(contents, list) and all(isinstance(item, dict) for item in contents)):
        return

    for content in tqdm(contents, desc="Processing files"):
        if content['type'] == 'file' and content['name'].endswith('.md'):
            # Skip files that were already uploaded on a previous run
            blob_client = container_client.get_blob_client(blob=content['path'])
            if not blob_client.exists():
                # Download and upload .md file
                file_content = get_with_retry(content['download_url'])
                blob_client.upload_blob(file_content)
        elif content['type'] == 'dir':
            # Recursive call for directories
            process_directory(content['url'])

def get_with_retry(url, headers=None, retry_after=2, max_retries=12):
    """GET ``url`` and return the raw response body, backing off on rate limiting.

    Args:
        url: Address to request.
        headers: Optional request headers passed through to ``requests.get``.
        retry_after: Seconds to wait before the first retry; the wait doubles
            after every rate-limited attempt.
        max_retries: Maximum number of retries after rate-limited responses.
            ``None`` retries without limit.

    Returns:
        The response body as bytes (``response.content``).

    Raises:
        HTTPError: For any error status other than 403/429, or once
            ``max_retries`` rate-limited attempts have been used up.
    """
    rate_limit_statuses = (403, 429)  # statuses treated as rate limiting
    retries_used = 0
    while True:
        try:
            with requests.get(url, headers=headers) as response:
                response.raise_for_status()  # Raises stored HTTPError, if one occurred
                return response.content
        except HTTPError as http_err:
            # Read the status from the exception itself rather than from the
            # outer variable, which may be unbound or stale.
            failed = http_err.response
            status = failed.status_code if failed is not None else None
            out_of_retries = max_retries is not None and retries_used >= max_retries
            if status not in rate_limit_statuses or out_of_retries:
                # Not rate limiting, or we have waited long enough: propagate
                raise
            retries_used += 1
            print(f"Rate limit exceeded. Retrying in {retry_after} seconds...")
            time.sleep(retry_after)
            retry_after *= 2  # Double the wait time for the next attempt

# Kick off the crawl at the articles/search directory given by DOCS_GITHUB_URL
process_directory(url=DOCS_GITHUB_URL)

In [11]:
# Create a config file that contains the storage account name, container name, resource group name, and location
# NOTE: the connection string embeds the storage account key, so config.json holds a
# secret in plain text — keep it out of version control.
config = {
    "storage_account_name": STORAGE_ACCOUNT_NAME,
    "container_name": CONTAINER_NAME,
    "resource_group_name": RESOURCE_GROUP_NAME,
    "location": LOCATION,
    "suffix": SUFFIX,
    "storage_account_connection_string": connection_string,
}

# json.dump handles quoting/escaping and writes unset values as null rather than
# the text "None", which the hand-built f-string template did not.
with open("config.json", "w", encoding="utf-8") as f:
    json.dump(config, f, indent=4)