In [1]:
from azure.ai.ml.entities import CommandComponent, Environment, BuildContext
from pathlib import Path

def create_doc_analyzer_component(
    environment_image: str = "mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
    conda_file: str = "../conda.yaml"
) -> CommandComponent:
    """
    Build the ``document_analyzer`` command component with the Azure ML SDK v2.

    The component's environment is built from the Docker build context at
    ``./docker``; its code is taken from ``../`` and its entry point is
    ``run.py``, which receives every input and the output as CLI flags.

    Args:
        environment_image: Docker image name. NOTE(review): accepted but not
            read anywhere in this function.
        conda_file: Path to a conda environment file. NOTE(review): accepted
            but not read anywhere in this function.

    Returns:
        CommandComponent: The component definition. Nothing is registered or
        submitted by this function.
    """
    # Environment comes from a Docker build context, not from the
    # `environment_image` / `conda_file` arguments.
    analyzer_env = Environment(
        name="doc-analyzer-env",
        description="Custom Environment for Document Analyzer",
        build=BuildContext(path="./docker"),
    )

    # Input schema; entries that carry a "default" are optional for callers.
    component_inputs = dict(
        input_folder=dict(
            type="uri_folder",
            description="Input folder containing PDF files",
        ),
        doc_intel_connection_id=dict(
            type="string",
            description="Azure ML connection ID for Document Intelligence",
        ),
        confidence_threshold=dict(
            type="number",
            default=0.7,
            description="Confidence threshold for element detection",
        ),
        min_length=dict(
            type="integer",
            default=10,
            description="Minimum text length to consider",
        ),
        overlap_threshold=dict(
            type="number",
            default=0.5,
            description="Threshold for overlap detection",
        ),
        ignore_roles=dict(
            type="string",
            default="pageFooter,footnote",
            description="Comma-separated list of roles to ignore",
        ),
    )

    # Single folder output.
    component_outputs = dict(
        output_dir=dict(
            type="uri_folder",
            description="Folder containing visualization images organized by PDF",
        ),
    )

    # Adjacent literals are concatenated into one command line; the
    # ${{...}} placeholders are plain text here (these are not f-strings).
    run_command = (
        "python run.py"
        " --input_folder ${{inputs.input_folder}}"
        " --doc_intel_connection_id ${{inputs.doc_intel_connection_id}}"
        " --confidence_threshold ${{inputs.confidence_threshold}}"
        " --min_length ${{inputs.min_length}}"
        " --overlap_threshold ${{inputs.overlap_threshold}}"
        " --ignore_roles ${{inputs.ignore_roles}}"
        " --output_dir ${{outputs.output_dir}}"
    )

    # No explicit version is passed (a pinned version="6" was left commented
    # out in an earlier revision).
    return CommandComponent(
        name="document_analyzer",
        display_name="Document Analyzer",
        description="Analyzes multiple PDF documents using Azure Document Intelligence and local processing",
        inputs=component_inputs,
        outputs=component_outputs,
        environment=analyzer_env,
        code="../",
        command=run_command,
    )

In [2]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Workspace client: from_config located /config.json in the recorded run;
# authentication goes through DefaultAzureCredential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

Found the config file in: /config.json


In [3]:
# Register the component
# ml_client.components.create_or_update(create_doc_analyzer_component())

In [4]:
# Add ".." to sys.path so modules one directory up can be imported.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry each time.

import sys

if ".." not in sys.path:
    sys.path.append("..")

In [5]:
from azure.ai.ml import MLClient, dsl, Input, Output
from azure.ai.ml.entities import Pipeline, Data
from azure.identity import DefaultAzureCredential

def create_document_analysis_pipeline(
    pdf_folder: str,
    doc_intel_connection_id: str,
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    compute_name: str = "cpu-cluster"
) -> Pipeline:
    """
    Assemble a one-step document analysis pipeline.

    The single step is the component returned by
    ``create_doc_analyzer_component()``; every tuning argument below is passed
    to it explicitly, so the component's own input defaults are not used.

    Args:
        pdf_folder: Path or reference to the input folder containing PDFs;
            wrapped in a ``uri_folder`` Input.
        doc_intel_connection_id: Azure ML connection ID for Document Intelligence.
        confidence_threshold: Forwarded to the step's ``confidence_threshold`` input.
        min_length: Forwarded to the step's ``min_length`` input.
        overlap_threshold: Forwarded to the step's ``overlap_threshold`` input.
        ignore_roles: Comma-separated roles, forwarded to the step's
            ``ignore_roles`` input.
        compute_name: Name of the compute target used as the pipeline's
            default compute.

    Returns:
        The object produced by calling the ``@dsl.pipeline``-decorated
        function, exposing one output named ``output_dir``.
    """
    pipeline_options = dict(
        description="Document analysis pipeline",
        default_compute=compute_name,
    )

    # NOTE(review): the DSL presumably derives the pipeline name and the node
    # name from the identifiers `doc_analysis_pipeline` and `analysis_job`,
    # so both are kept unchanged — confirm before renaming.
    @dsl.pipeline(**pipeline_options)
    def doc_analysis_pipeline():
        analyzer_component = create_doc_analyzer_component()

        step_inputs = {
            "input_folder": Input(type="uri_folder", path=pdf_folder),
            "doc_intel_connection_id": doc_intel_connection_id,
            "confidence_threshold": confidence_threshold,
            "min_length": min_length,
            "overlap_threshold": overlap_threshold,
            "ignore_roles": ignore_roles,
        }
        analysis_job = analyzer_component(**step_inputs)

        # Surface the step's folder output as the pipeline-level output.
        return {"output_dir": analysis_job.outputs.output_dir}

    # Calling the decorated function builds the pipeline object.
    return doc_analysis_pipeline()

In [None]:
from azure.ai.ml import MLClient, dsl, Input, Output
from azure.ai.ml.entities import Pipeline, Data
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

# Submit the document analysis pipeline and report its outputs.
# Relies on `ml_client` and `create_document_analysis_pipeline` defined in
# earlier cells, so those cells must have been run first.

# Look up the workspace connection by name; only its `.id` is used below.
doc_intelligence_connection = ml_client.connections.get("my-doc-intelligence-connection")

# Create the pipeline
# All tuning values are passed explicitly, overriding the function defaults.
pipeline = create_document_analysis_pipeline(
    pdf_folder="azureml:raw_papers:1",
    doc_intel_connection_id=doc_intelligence_connection.id,  # Your connection ID
    compute_name="hp-gpu-cluster",
    confidence_threshold=0.2,
    min_length=15,
    overlap_threshold=0.7,
    ignore_roles="pageFooter,footnote,pageHeader"
)

# Submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="document-analysis",
)

# Wait for the job to complete
# (streams the job's logs into the cell output)
ml_client.jobs.stream(pipeline_job.name)

# Get the outputs
# Re-fetch the job after streaming and read its `outputs` mapping.
job_outputs = ml_client.jobs.get(pipeline_job.name).outputs

# NOTE(review): in the saved run this printed the binding placeholder
# "${{parent.outputs.output_dir}}" rather than a storage path, so the value
# is not a usable location as-is — TODO confirm how to resolve the real URI.
# The saved output also shows a "Combined elements data" line that no
# statement in this cell prints; it is presumably stale from an earlier
# version of the cell — re-run to refresh.
print("\nPipeline outputs:")
print(f"Output folder: {job_outputs['output_dir']}")



Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
[32mUploading enhanced_doc_analyzer_c

RunId: epic_lettuce_z1zfxtyy5m
Web View: https://ml.azure.com/runs/epic_lettuce_z1zfxtyy5m?wsid=/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourcegroups/rg-airesearcher-dev-01/workspaces/mlw-airesearcher-dev-01

Streaming logs/azureml/executionlogs.txt

[2025-02-07 10:24:57Z] Submitting 1 runs, first five are: bd537f61:c396afeb-5dc6-454f-8006-26337abfe6ac
[2025-02-07 10:47:40Z] Completing processing run id c396afeb-5dc6-454f-8006-26337abfe6ac.

Execution Summary
RunId: epic_lettuce_z1zfxtyy5m
Web View: https://ml.azure.com/runs/epic_lettuce_z1zfxtyy5m?wsid=/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourcegroups/rg-airesearcher-dev-01/workspaces/mlw-airesearcher-dev-01


Pipeline outputs:
Output folder: ${{parent.outputs.output_dir}}
Combined elements data: ${{parent.outputs.combined_elements_data}}
