In [1]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /config.json


In [2]:
# add .. to sys.path

import sys
sys.path.append("..")

In [8]:
from azure.ai.ml import dsl, Input, load_component, load_environment
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Load component and environment definitions
analyzer_component = load_component(source="./doc_analyzer_component.yaml")

   
@dsl.pipeline(
    description="Document analysis pipeline",
    default_compute="gpu-cluster-a100"
)
def doc_analysis_pipeline(
    pdf_folder,
    doc_intel_connection_id: str,
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    compute_name: str = "gpu-cluster-a100"
):
    # Define the analysis job
    analysis_job = analyzer_component(
        input_folder=pdf_folder,  # Pass the Input object directly
        doc_intel_connection_id=doc_intel_connection_id,
        confidence_threshold=confidence_threshold,
        min_length=min_length,
        overlap_threshold=overlap_threshold,
        ignore_roles=ignore_roles
    )

    analysis_job.compute = compute_name
    
    return {
        "output_dir": analysis_job.outputs.output_dir,
    }

# Get the document intelligence connection
doc_intelligence_connection = ml_client.connections.get("doc-intelligence-connection")

# Get the data asset with version
raw_papers = ml_client.data.get("raw_papers", version="1")

# Create Input object for the data
pdf_input = Input(type=AssetTypes.URI_FOLDER, path=raw_papers.path)

# Create and submit the pipeline
pipeline_job = ml_client.jobs.create_or_update(
    doc_analysis_pipeline(
        pdf_folder=pdf_input,  # Pass the Input object
        doc_intel_connection_id=doc_intelligence_connection.id,
        compute_name="gpu-cluster-a100",
        confidence_threshold=0.3,
        min_length=15,
        overlap_threshold=0.7,
        ignore_roles="pageFooter,footnote,pageHeader"
    ),
    experiment_name="document-analysis",
)

# Optional: Stream the job logs
try:
    ml_client.jobs.stream(pipeline_job.name)
except Exception as e:
    print(f"Error streaming logs: {str(e)}")

# Get the outputs
try:
    job_outputs = ml_client.jobs.get(pipeline_job.name).outputs
    print("\nPipeline outputs:")
    print(f"Output folder: {job_outputs['output_dir']}")
except Exception as e:
    print(f"Error getting job outputs: {str(e)}")

pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


RunId: gray_box_7qxvbklp8g
Web View: https://ml.azure.com/runs/gray_box_7qxvbklp8g?wsid=/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourcegroups/rg-acadocser-dev-04/workspaces/mlw-acadocser-dev-04

Streaming logs/azureml/executionlogs.txt

[2025-02-28 13:33:43Z] Submitting 1 runs, first five are: 258983de:921f302e-9bd3-4751-a931-ef0c17c24381
[2025-02-28 13:47:52Z] Completing processing run id 921f302e-9bd3-4751-a931-ef0c17c24381.

Execution Summary
RunId: gray_box_7qxvbklp8g
Web View: https://ml.azure.com/runs/gray_box_7qxvbklp8g?wsid=/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourcegroups/rg-acadocser-dev-04/workspaces/mlw-acadocser-dev-04


Pipeline outputs:
Output folder: ${{parent.outputs.output_dir}}


In [6]:
results

NameError: name 'results' is not defined

In [6]:
doc_intelligence_connection = ml_client.connections.get("my-doc-intelligence-connection")
raw_papers = ml_client.data.get("raw-papers", version="1") 

# Create the pipeline
pipeline = doc_analysis_pipeline(
    pdf_folder=raw_papers.path,
    doc_intel_connection_id=doc_intelligence_connection.id,  # Your connection ID
    compute_name="hp-gpu-cluster",
    confidence_threshold=0.3,
    min_length=15,
    overlap_threshold=0.7,
    ignore_roles="pageFooter,footnote,pageHeader"
)

# Submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="document-analysis",
)

# Wait for the job to complete
# ml_client.jobs.stream(pipeline_job.name)

# # Get the outputs
# job_outputs = ml_client.jobs.get(pipeline_job.name).outputs

# print("\nPipeline outputs:")
# print(f"Output folder: {job_outputs['output_dir']}")



ResourceNotFoundError: (UserError) User error when calling GenericAssetMLIndexServiceClient.MoveNext. Service invocation failed!
Request: GET swedencentral.api.azureml.ms/genericasset/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/indexes/raw-papers/versions/1
Status Code: 404 NotFound
Error Code: UserError/NotFoundError
Reason Phrase: Asset with Asset ID was not found
Response Body: {"error":{"code":"UserError","message":"Asset with Asset ID was not found","details":[],"innerError":{"code":"NotFoundError"}},"correlation":{"operation":"9f169ae04ea0a4554e00c1e0086dc283","request":"2eb5ebcf4929e1f8"},"environment":"swedencentral","location":"swedencentral","time":"2025-02-07T20:42:40.6259304+00:00","componentName":"genericasset","statusCode":404}
Code: UserError
Message: User error when calling GenericAssetMLIndexServiceClient.MoveNext. Service invocation failed!
Request: GET swedencentral.api.azureml.ms/genericasset/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/indexes/raw-papers/versions/1
Status Code: 404 NotFound
Error Code: UserError/NotFoundError
Reason Phrase: Asset with Asset ID was not found
Response Body: {"error":{"code":"UserError","message":"Asset with Asset ID was not found","details":[],"innerError":{"code":"NotFoundError"}},"correlation":{"operation":"9f169ae04ea0a4554e00c1e0086dc283","request":"2eb5ebcf4929e1f8"},"environment":"swedencentral","location":"swedencentral","time":"2025-02-07T20:42:40.6259304+00:00","componentName":"genericasset","statusCode":404}