# Important
- Edit config.json and add your service details
- Edit the `base_url` value in this notebook so that it points to your Azure Container Apps service

In [None]:
import os
import glob  
import requests  
import time
import json
from azure.storage.blob import BlobServiceClient, ContainerClient  
import concurrent.futures  
from functools import partial  
import utils


In [2]:
# Load the service configurations and set the file to process.
# config.json is read explicitly as UTF-8 so that non-ASCII values load the same
# way on every platform instead of depending on the locale's default encoding.
with open("config.json", "r", encoding="utf-8") as c_in:
    data = json.load(c_in)
data['url_file_to_process'] = 'https://github.com/liamca/GPT4oContentExtraction/raw/main/Transforming-Content-with-GPT4o.pptx'

# Base URL of the conversion service. Replace the bracketed placeholders with
# your own container service name and region before running the next cells.
# The commented-out line is an alternative local endpoint.
# base_url = "http://127.0.0.1:3100"
base_url = "https://[container service].[region].azurecontainerapps.io"

# Endpoints used to submit a job and to poll its status.
job_submit_url = f"{base_url}/start-job"
job_status_url = f"{base_url}/job-status"


In [3]:
# Submit job to convert the document to Markdown files, then poll the
# job-status endpoint until the job is no longer reported as 'in-progress'.
#
# Both HTTP calls carry an explicit timeout: without one, requests waits
# indefinitely on an unresponsive service and the notebook appears to hang.
# A timeout surfaces as requests.exceptions.Timeout instead.
request_timeout = 60  # seconds allowed for each HTTP request
poll_interval = 2     # seconds to wait between two status checks

response = requests.post(job_submit_url, json=data, timeout=request_timeout)

# Check if the request was successful
if response.status_code == 200:
    job_info = response.json()
    job_id = job_info['job_id']
    print(f"Job started successfully! Job ID: {job_id}")

    # Payload for the status endpoint: the job id plus the blob storage
    # settings taken from the loaded configuration.
    data_status = {
        "job_id": job_info['job_id'],
        "blob_storage_service_name" : data['blob_storage_service_name'],
        "blob_storage_service_api_key" : data['blob_storage_service_api_key'],
        "blob_storage_container" : data['blob_storage_container']
    }

    # Send requests to check job status
    while True:
        time.sleep(poll_interval)
        response = requests.post(job_status_url, json=data_status, timeout=request_timeout)

        # Check if the request was successful
        if response.status_code == 200:
            job_status = response.json()
            print(f"Job Status for Job ID {job_id}: {job_status['status']}")
            if 'message' in job_status:
                print(f"{job_status['message']}")
            # Any status other than 'in-progress' ends the polling loop; the
            # full status payload is printed so the final state is visible.
            if job_status['status'] != 'in-progress':
                print (job_status)
                break
        else:
            # A non-200 status response stops polling rather than retrying.
            print(f"Failed to check job status: {response.status_code} - {response.text}")
            break
else:
    # NOTE: job_info is not defined on this path, so the following cells
    # cannot run until a job has been started successfully.
    print(f"Failed to start job: {response.status_code} - {response.text}")


Job started successfully! Job ID: 695f1b55-222e-4da5-9aec-616f374b60b7
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Checking if file needs to be converted to PDF...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
Converting images to Markdown...
Job Status for Job ID 695f1b55-222e-4da5-9aec-616f374b60b7: in-progress
C

In [4]:
# Download the files that were processed.
# Every blob whose name starts with the job id is written to a local file with
# the same relative path as the blob name.
connection_string = (
    "DefaultEndpointsProtocol=https;"
    f"AccountName={data['blob_storage_service_name']};"
    f"AccountKey={data['blob_storage_service_api_key']};"
    "EndpointSuffix=core.windows.net"
)
container_name = data['blob_storage_container']
folder_name = job_info['job_id']

# Initialize the BlobServiceClient and ContainerClient
blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs(name_starts_with=folder_name)

# Define the local directory to save the downloaded files
local_path = job_info['job_id']

# Ensure the local directory exists. exist_ok=True avoids the race between a
# separate existence check and the directory creation.
os.makedirs(local_path, exist_ok=True)

# Download each blob
for blob in blobs:
    blob_client = container_client.get_blob_client(blob)
    blob_name = blob.name
    # The blob name already starts with the job id, so it doubles as the
    # local relative path.
    local_file_path = blob_name
    utils.ensure_directory_exists(os.path.dirname(local_file_path))

    # Stream the blob straight into the local file instead of first holding
    # the whole content in memory.
    with open(local_file_path, "wb") as download_file:
        blob_client.download_blob().readinto(download_file)

    print(f"Downloaded {blob_name} to {local_file_path}")

print("Download completed!")


Directory created: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b55-222e-4da5-9aec-616f374b60b7/images/1.png to 695f1b55-222e-4da5-9aec-616f374b60b7/images/1.png
Directory already exists: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b55-222e-4da5-9aec-616f374b60b7/images/2.png to 695f1b55-222e-4da5-9aec-616f374b60b7/images/2.png
Directory already exists: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b55-222e-4da5-9aec-616f374b60b7/images/3.png to 695f1b55-222e-4da5-9aec-616f374b60b7/images/3.png
Directory already exists: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b55-222e-4da5-9aec-616f374b60b7/images/4.png to 695f1b55-222e-4da5-9aec-616f374b60b7/images/4.png
Directory already exists: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b55-222e-4da5-9aec-616f374b60b7/images/5.png to 695f1b55-222e-4da5-9aec-616f374b60b7/images/5.png
Directory already exists: 695f1b55-222e-4da5-9aec-616f374b60b7/images
Downloaded 695f1b

In [5]:
# Re-create the index by handing the loaded configuration to utils.create_index.
# NOTE(review): the recorded output of this cell shows an existing index being
# deleted before it is created again, so this presumably discards previously
# indexed content — confirm before running against an index you want to keep.
utils.create_index(data)

Dimensions in Embedding Model: 1536
Index test deleted successfully.
Index test created successfully.


In [6]:
# Vectorize the content and store in an AI Search compatible JSON format.
# utils.process_json is called once per markdown text file, in parallel.
max_workers = 15  # upper bound on concurrent worker threads

print ('Vectorizing content...')

# Both the input and the output folder are derived from the job id, so this
# cell relies only on job_info and not on variables set by the download cell.
job_dir = job_info['job_id']
markdown_files = glob.glob(os.path.join(job_dir, 'markdown', "*.txt"))
json_out_dir = os.path.join(job_dir, 'json')
utils.ensure_directory_exists(json_out_dir)

# Bind the arguments shared by every call; executor.map then supplies the
# markdown file path as the remaining positional argument.
partial_process_json = partial(utils.process_json, doc_id=job_info['job_id'], json_out_dir=json_out_dir, data=data)
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
    # executor.map yields results in the same order as markdown_files.
    results = list(executor.map(partial_process_json, markdown_files))
print(results)

# Collect the JSON files now present in the output folder; the indexing cell
# reads json_files.
json_files = glob.glob(os.path.join(json_out_dir, "*.json"))
total_files = len(json_files)
print ('Total JSON Files:', total_files)


Vectorizing content...
Directory created: 695f1b55-222e-4da5-9aec-616f374b60b7/json
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/1.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/2.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/3.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/4.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/5.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/6.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/7.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/8.txt
file 695f1b55-222e-4da5-9aec-616f374b60b7/markdown/9.txt
['695f1b55-222e-4da5-9aec-616f374b60b7/markdown/1.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/2.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/3.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/4.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/5.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/6.txt', '695f1b55-222e-4da5-9aec-616f374b60b7/markdown/7.txt', '695f1b55-222e-4d

In [7]:
# Index content: pass the JSON files collected by the vectorization cell
# (json_files) together with the service configuration (data) to
# utils.index_content.
# NOTE(review): presumably this uploads the documents to the search index
# created earlier — confirm against the implementation in utils.
print ('Indexing content...')
utils.index_content(json_files, data)

Indexing content...
Documents Indexed successfully.
