In [None]:
# Notebook to load markdown content to Azure AI Search
# NOTE: There is no chunking of content, but rather a single document equals a PDF page

In [1]:
import concurrent.futures
import json
import os
import re
import time
from pathlib import Path

import openai
import requests
from openai import AzureOpenAI
from tenacity import retry, wait_random_exponential, stop_after_attempt


In [3]:
# Load the notebook settings; the context manager closes the file handle right away
with open("config.json") as config_file:
    config = json.load(config_file)

# Azure AI Search Config
search_service_name = config["search_service_name"]
# NOTE: this base URL ends with '/', so code that appends "/indexes..." must allow for it
search_service_url = "https://{}.search.windows.net/".format(search_service_name)

search_admin_key = config["search_admin_key"]

index_name = config["search_index_name"]
index_schema_file = config["search_index_schema_file"]
search_api_version = config["search_api_version"]
# Headers sent with every Azure AI Search REST request made by this notebook
search_headers = {
    'Content-Type': 'application/json',
    'api-key': search_admin_key
}

# Azure OpenAI - embeddings deployment
openai_embedding_api_base = config["openai_embedding_api_base"]
openai_embedding_api_key = config["openai_embedding_api_key"]
openai_embedding_api_version = config["openai_embedding_api_version"]
openai_embeddings_model = config["openai_embedding_model"]

# The API key is passed explicitly from config.json (it is not read from an environment variable)
embeddings_client = AzureOpenAI(
    api_version=openai_embedding_api_version,
    azure_endpoint=openai_embedding_api_base,
    api_key=openai_embedding_api_key
)

# Azure OpenAI - GPT deployment
openai_gpt_api_base = config["openai_gpt_api_base"]
openai_gpt_api_key = config["openai_gpt_api_key"]
openai_gpt_api_version = config["openai_gpt_api_version"]
openai_gpt_model = config["openai_gpt_model"]

# This client is pointed directly at the deployment URL via base_url
gpt_client = AzureOpenAI(
    api_key=openai_gpt_api_key,
    api_version=openai_gpt_api_version,
    base_url=f"{openai_gpt_api_base}/openai/deployments/{openai_gpt_model}"
)


# Echo the non-secret settings so the run is easy to audit (keys are deliberately not printed)
print ('Search Service Name:', search_service_name)
print ('Index Name:', index_name)
print ('Azure OpenAI GPT Base URL:', openai_gpt_api_base)
print ('Azure OpenAI GPT Model:', openai_gpt_model)
print ('Azure OpenAI Embeddings Base URL:', openai_embedding_api_base)
print ('Azure OpenAI Embeddings Model:', openai_embeddings_model)



In [4]:
# Create directory if it does not exist
def ensure_directory_exists(directory_path):
    """Make sure ``directory_path`` exists and print which case applied.

    Missing parent directories are created as well. Returns None.
    """
    target = Path(directory_path)
    if target.exists():
        print(f"Directory already exists: {directory_path}")
        return
    target.mkdir(parents=True, exist_ok=True)
    print(f"Directory created: {directory_path}")

# Find all files in a dir
def get_all_files(directory_path):
    """Return the paths of the regular files directly inside ``directory_path``.

    The listing is not recursive; sub-directories are left out. Each returned
    path is ``directory_path`` joined with the entry name, in ``os.listdir`` order.
    """
    candidates = (os.path.join(directory_path, name) for name in os.listdir(directory_path))
    return [candidate for candidate in candidates if os.path.isfile(candidate)]

def extract_numeric_value(filename):
    """Sort key: the first run of digits in ``filename`` as an int.

    Names without any digit map to ``float('inf')`` so they sort last.
    """
    digits = re.search(r'(\d+)', filename)
    if digits is None:
        return float('inf')
    return int(digits.group(1))
        
# Function to generate vectors for title and content fields, also used for query vectors
max_attempts = 6   # non-throttling failures tolerated before giving up
max_backoff = 60   # upper bound, in seconds, for the throttling backoff

# The loop below handles API failures itself and returns None instead of raising,
# so this tenacity wrapper only acts if an exception escapes the function.
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(max_attempts))
def generate_embedding(text):
    """Return the embedding vector for ``text``, or None when none can be produced.

    Args:
        text: The string to embed. None, or anything shorter than 10 characters,
            is not sent to the service.

    Returns:
        The embedding as returned by the embeddings deployment, or None when the
        input is skipped, the content filter rejects it, or ``max_attempts``
        non-throttling failures have occurred.

    Throttling (429) responses are retried after a growing wait capped at
    ``max_backoff`` seconds and do not count towards ``max_attempts``.
    """
    if text is None or len(text) < 10:
        return None

    failures = 0
    incremental_backoff = 1   # seconds to wait on throttling - grows by 1.5x per throttled call
    while failures < max_attempts:
        try:
            # text-embedding-3-small == 1536 dims
            # Reuse the shared module-level client rather than building one per call
            response = embeddings_client.embeddings.create(
                input=text,
                model=openai_embeddings_model
            )
            return response.data[0].embedding
        except openai.APIError as ex:
            error_code = str(getattr(ex, "code", None))
            if error_code == "content_filter":
                # Retrying cannot help: the service refuses this content
                print ('API Error', ex.code)
                return None
            if error_code == "429" or getattr(ex, "status_code", None) == 429:
                # Handle throttling - code 429
                incremental_backoff = min(max_backoff, incremental_backoff * 1.5)
                print ('Waiting to retry after', incremental_backoff, 'seconds...')
                time.sleep(incremental_backoff)
                continue
            # Any other API error counts as a failed attempt; without this the
            # loop would spin forever on a persistent non-429 error
            failures += 1
            print ('Error - Retry count:', failures, ex)
        except Exception as ex:
            failures += 1
            print ('Error - Retry count:', failures, ex)
    return None


def process_json(file):
    """Add an embedding of the document's ``content`` to a JSON file, in place.

    Args:
        file: Path of the file to process. Only paths ending in ``.json`` are
            touched; anything else is reported and skipped.

    Returns:
        The same ``file`` path, so results can be collected from a thread pool.

    The file is re-written with a ``vector`` key holding whatever
    ``generate_embedding`` returns for ``content`` (which may be None).
    """
    # Check the real extension: a substring test would also accept names such as
    # "x.json.bak" or any path that merely contains ".json"
    if not file.endswith('.json'):
        print ('Skipping non JSON file:', file)
        return file

    print ('Processing:', file)
    with open(file, 'r') as j_in:
        json_data = json.load(j_in)
    json_data['vector'] = generate_embedding(json_data['content'])
    with open(file, 'w') as j_out:
        json.dump(json_data, j_out, indent=4)

    return file



In [5]:
# generate_embedding('That asd asdfasd asdf as df')

In [6]:
# Get the doc_id: the name of the first sub-directory found under folder_path
folder_path = 'markdown'
entries = os.listdir(folder_path)
directories = [entry for entry in entries if os.path.isdir(os.path.join(folder_path, entry))]
if not directories:
    # Stop here with a clear message; carrying on would only fail later with a
    # NameError because doc_id was never assigned
    raise FileNotFoundError(f"Could not find the markdown files: no sub-directory in '{folder_path}'")

# NOTE(review): with several sub-directories the pick depends on os.listdir order
doc_id = directories[0]
print ('Doc ID:', doc_id)

markdown_out_dir = os.path.join(folder_path, doc_id)


Doc ID: 6ba17e30-a320-49a5-9d44-c0f96a0e2869


In [7]:
# List the per-page markdown files and order them by page number
files = os.listdir(markdown_out_dir)

# Keep only the .txt entries
txt_files = [name for name in files if name.endswith('.txt')]

# Order by the first number embedded in each filename
sorted_files = list(txt_files)
sorted_files.sort(key=extract_numeric_value)

total_files = len(sorted_files)
print ('Total Markdown Files:', total_files)


Total Markdown Files: 22


In [8]:
# Create JSON docs: one JSON file per markdown page
json_out_dir = os.path.join('json', doc_id)
ensure_directory_exists(json_out_dir)

counter = 0
# Walk the filtered, page-ordered list built earlier rather than re-filtering the raw
# listing with a substring test (which would also pick up names like "x.txt.bak")
for page_file in sorted_files:
    with open(os.path.join(markdown_out_dir, page_file), 'r') as c_in:
        content = c_in.read()

    # Filename without its .txt suffix, e.g. "page_3" or "3"
    page_stem = Path(page_file).stem

    json_data = {
        'doc_id': doc_id,
        'page_number': int(page_stem.replace('page_', '')),
        'content': content
        }

    with open(os.path.join(json_out_dir, page_stem + '.json'), 'w') as c_out:
        json.dump(json_data, c_out, indent=4)

    counter += 1
    # Periodic progress line for large documents
    if counter % 100 == 0:
        print (counter, 'json chunks written...')

print (counter, 'json chunks written...')



Directory created: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869
22 json chunks written...


In [9]:
# Vectorize the JSON Content
json_files = get_all_files(json_out_dir)
total_files = len(json_files)
print ('Total JSON Files:', total_files)

max_workers = 15

# Embed the files concurrently; the pool runs at most max_workers threads at once
pool = concurrent.futures.ThreadPoolExecutor(max_workers=max_workers)
with pool:
    # process_json returns the path it handled; collect one result per input file
    results = list(pool.map(process_json, json_files))

print(results)


Total JSON Files: 22
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/1.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/10.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/11.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/12.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/13.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/14.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/15.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/16.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/17.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/18.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/19.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/2.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/20.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/21.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e2869/22.json
Processing: json/6ba17e30-a320-49a5-9d44-c0f96a0e28

In [10]:
# Load the index schema and re-create the index
# Using REST for this to leverage most recent vector capabilities
with open(index_schema_file, "r") as f_in:
    index_schema = json.load(f_in)

# Point the schema at this notebook's index name and embeddings deployment
index_schema['name'] = index_name
vectorizer_params = index_schema['vectorSearch']['vectorizers'][0]['azureOpenAIParameters']
vectorizer_params['resourceUri'] = openai_embedding_api_base
vectorizer_params['deploymentId'] = openai_embeddings_model
vectorizer_params['apiKey'] = openai_embedding_api_key

# Drop any trailing '/' from the base URL so the request paths do not contain '//'
search_base_url = search_service_url.rstrip('/')
# Seconds to wait for the service before giving up instead of hanging the notebook
request_timeout = 60

# DELETE the existing index (if any) so it can be re-created from the schema
delete_url = f"{search_base_url}/indexes/{index_name}?api-version={search_api_version}"
response = requests.delete(delete_url, headers=search_headers, timeout=request_timeout)
if response.status_code == 204:
    print(f"Index {index_name} deleted successfully.")
elif response.status_code == 404:
    print(f"Index {index_name} does not exist, nothing to delete.")
else:
    # Anything else (auth failure, bad api-version, ...) is a real error worth showing
    print(f"Error deleting index {index_name} (HTTP {response.status_code}):")
    print(response.text)

# The endpoint URL for creating the index
create_index_url = f"{search_base_url}/indexes?api-version={search_api_version}"
response = requests.post(create_index_url, headers=search_headers, json=index_schema, timeout=request_timeout)

# Check the response
if response.status_code == 201:
    print(f"Index {index_name} created successfully.")
else:
    print(f"Error creating index {index_name} (HTTP {response.status_code}):")
    # Print the raw body: it is not guaranteed to be JSON on every failure
    print(response.text)


Index microsoft-results deleted successfully.
Index microsoft-results created successfully.


In [11]:
# Index the content in batches
batch_size = 50
# Drop any trailing '/' from the base URL so the request path does not contain '//'
index_doc_url = f"{search_service_url.rstrip('/')}/indexes/{index_name}/docs/index?api-version={search_api_version}"


def _index_batch(batch):
    """POST one batch of documents to the search index and print the outcome."""
    response = requests.post(index_doc_url, headers=search_headers, json={"value": batch}, timeout=120)
    # Check the response
    if response.status_code == 200:
        print(f"Document Indexed successfully.")
    else:
        # Report the whole batch: the failure is not tied to a single file
        print(f"Error indexing batch of {len(batch)} documents (HTTP {response.status_code}):")
        print(response.text)


documents = {"value": []}
for file in json_files:
    if not file.endswith('.json'):
        continue
    with open(file, 'r') as j_in:
        json_data = json.load(j_in)
    # Make the key unique per page: <doc_id>-<page_number>
    json_data['doc_id'] = json_data['doc_id'] + '-' + str(json_data['page_number'])
    documents["value"].append(json_data)
    if len(documents["value"]) == batch_size:
        _index_batch(documents["value"])
        documents = {"value": []}

# Send the remainder; skip the request when nothing is left (for example when the
# file count is an exact multiple of batch_size, or there were no files at all)
if documents["value"]:
    _index_batch(documents["value"])
documents = {"value": []}



Document Indexed successfully.
