In [23]:
# Key Vault lookup settings.
# The AI resource endpoint, its subscription key and the analyzer name are read
# from Key Vault secrets in a later cell; in a parameterised run the vault
# address and the secret names below would be passed in as parameters.

# Vault that holds the secrets
kv_endpoint = "https://kvaicontund.vault.azure.net/"

# Names of the secrets inside that vault
secret_name_subscription_key = "subscription-key"
secret_name_ai_endpoint = "content-understanding-endpoint"
secret_analyzer_name = "analyzer-name"

# Document that will be submitted to the analyzer
doc_location = (
    "https://raw.githubusercontent.com/contosojh/sample-files/main/"
    "summer-catalog-10-pages.pdf"
)

StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 32, Finished, Available, Finished)

In [24]:
# Polling behaviour for the long-running analyze operation.
# Give up (and try to cancel) once this many seconds have elapsed: 150 minutes.
timeout_seconds = 150 * 60
# Pause between two consecutive status checks, in seconds.
polling_interval_seconds = 20

StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 33, Finished, Available, Finished)

In [25]:
# Read the subscription key, the AI endpoint and the analyzer name from Key Vault.
# The three secrets are fetched in this order, one getSecret call per secret name.
subscription_key, ai_endpoint, analyzer_name = (
    mssparkutils.credentials.getSecret(kv_endpoint, secret_to_read)
    for secret_to_read in (
        secret_name_subscription_key,
        secret_name_ai_endpoint,
        secret_analyzer_name,
    )
)


StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 34, Finished, Available, Finished)

In [26]:
import requests
import time
from azure.keyvault.secrets import SecretClient
import pandas as pd

StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 35, Finished, Available, Finished)

In [27]:
# Start the analysis: POST the document URL to the analyzer's ":analyze" operation.
# The JSON reply carries the operation "id" that the next cell polls.

# Authenticate with the resource's subscription key and declare a JSON body
headers = {
    "Ocp-Apim-Subscription-Key": subscription_key,
    "Content-Type": "application/json"
}

# The document is passed by reference (its URL)
body = {
    "url": doc_location
}

# Normalise the endpoint so the URL has exactly one "/" before the path, whether
# or not the stored endpoint secret ends with a slash. The polling cell joins the
# same endpoint with an explicit "/", so plain concatenation here would make one
# of the two URLs malformed.
analyze_url = (
    f"{ai_endpoint.rstrip('/')}/contentunderstanding/analyzers/"
    f"{analyzer_name}:analyze?api-version=2025-05-01-preview"
)

# Send the POST request; the HTTP status is checked in the next cell
display("Sending POST request...")
response = requests.post(
    analyze_url,
    headers=headers,
    json=body
)

StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 36, Finished, Available, Finished)

'Sending POST request...'

In [28]:
# Wait for the analyze operation started by the POST request to finish.
# Leaves `status`, `result` and `elapsed_time` defined for the cells that follow.

result_headers = {
    "Ocp-Apim-Subscription-Key": subscription_key
}

# Statuses that mean the service is still working. "NotStarted" is included so a
# queued operation is polled instead of being mistaken for a finished one.
# NOTE(review): status names taken from the service's operation-status values;
# confirm against the REST reference for this api-version.
in_progress_statuses = ("NotStarted", "Running")
# Both spellings are accepted because the original code used "Cancelled".
failed_statuses = ("Failed", "Cancelled", "Canceled")

# Defaults so these names exist even when the POST request was rejected
status = None
result = {}
elapsed_time = 0

if response.status_code == 404:
    print("Resource not found. Please check the endpoint URL and the resource you are requesting.")
    # .text instead of .json(): an error body is not guaranteed to be JSON
    print(f"Response: {response.text}")
elif not response.ok:
    # Any other 4xx/5xx (bad key, throttling, server error) has no operation id to poll
    print(f"Analyze request failed with HTTP status {response.status_code}.")
    print(f"Response: {response.text}")
else:
    opid = response.json().get("id")
    if not opid:
        raise ValueError("Analyze response did not contain an operation id")

    # Strip any trailing slash so the URL never contains "//" before the path
    base_url = ai_endpoint.rstrip("/")
    status_url = f"{base_url}/contentunderstanding/analyzerResults/{opid}?api-version=2025-05-01-preview"
    start_time = time.time()

    # Poll until the operation reaches a terminal state or the timeout expires
    while True:
        response = requests.get(status_url, headers=result_headers)
        result = response.json()
        status = result.get("status")

        if status in failed_statuses:
            raise Exception(f"Operation {status}")
        if status not in in_progress_statuses:
            # "Succeeded", or an unexpected value that is reported as-is below
            break

        elapsed_time = time.time() - start_time
        if elapsed_time > timeout_seconds:
            # Best-effort cancel; the path now matches the one used for polling.
            # NOTE(review): confirm this api-version supports DELETE on analyzerResults.
            cancel_url = f"{base_url}/contentunderstanding/analyzerResults/{opid}?api-version=2025-05-01-preview"
            cancel_response = requests.delete(cancel_url, headers=result_headers)
            if not cancel_response.ok:
                print(f"Warning: cancel request returned HTTP status {cancel_response.status_code}")
            status = "Timed Out; job cancelled"
            break

        time.sleep(polling_interval_seconds)

    print(f"Status: {status}")

StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 37, Finished, Available, Finished)

Status: Succeeded


In [29]:
# Flatten the analyzer output into a DataFrame with one row per extracted course.

# Course fields to extract, mapped to the key that holds each field's value.
# NOTE(review): courseEndDate is read from 'valueString' while courseStartDate is
# read from 'valueDate' — confirm this matches the analyzer's field schema.
course_field_value_keys = {
    'courseCategory': 'valueString',
    'courseName': 'valueString',
    'courseDescription': 'valueString',
    'courseSectionNumber': 'valueString',
    'courseSectionName': 'valueString',
    'courseInstructor': 'valueString',
    'courseStartDate': 'valueDate',
    'courseTime': 'valueString',
    'numberOfSessions': 'valueNumber',
    'courseLocation': 'valueString',
    'courseCost': 'valueNumber',
    'courseEndDate': 'valueString',
}

if status == "Succeeded":
    results_json = result.get("result")
    contents = results_json.get("contents")
    catalog_field = contents[0]['fields']['summerCourseCatalog']
    # A catalog field without 'valueArray' is treated as "no courses found"
    course_list = catalog_field.get('valueArray', [])

    # Convert to a list of dictionaries with clean values.
    # A field or value key that is absent for a course becomes None instead of
    # raising KeyError, so one incomplete course does not lose the whole table.
    data = []
    for course in course_list:
        obj = course.get('valueObject', {})
        row = {
            field_name: obj.get(field_name, {}).get(value_key)
            for field_name, value_key in course_field_value_keys.items()
        }
        data.append(row)

    # Explicit columns keep the frame's shape stable even when no course was found
    df = pd.DataFrame(data, columns=list(course_field_value_keys))

    # Display the DataFrame
    display(df)
else:
    if elapsed_time >= timeout_seconds:
        print(f"Status was {status} but timed out after {elapsed_time} seconds")
    else:
        # Make every non-success outcome visible instead of finishing silently
        print(f"Status was {status}; no results to display")


StatementMeta(, 33c65ac0-332a-4578-89b4-73b2845f4f96, 38, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 8eeff0bc-b47b-419d-b153-ffdfbf7b4140)