In [1]:
from azure.ai.ml.entities import CommandComponent, Environment
from pathlib import Path

def create_doc_analyzer_component(
    environment_image: str = "mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
    conda_file: str = "../conda.yaml"
) -> CommandComponent:
    """
    Create the document analyzer component using Azure ML SDK v2.
    
    Args:
        environment_image: Docker image to use for the environment
        conda_file: Path to conda environment file
        
    Returns:
        CommandComponent: The defined component
    """
    
    # Create the environment
    env = Environment(
        image=environment_image,
        conda_file=conda_file
    )
    
    return CommandComponent(
        name="document_analyzer",
        display_name="Document Analyzer",
        version="5",
        description="Analyzes multiple PDF documents using Azure Document Intelligence and local processing",
        
        # Define inputs
        inputs={
            "input_folder": {
                "type": "uri_folder",
                "description": "Input folder containing PDF files"
            },
            "doc_intel_connection_id": {
                "type": "string",
                "description": "Azure ML connection ID for Document Intelligence"
            },
            "confidence_threshold": {
                "type": "number",
                "default": 0.7,
                "description": "Confidence threshold for element detection"
            },
            "min_length": {
                "type": "integer",
                "default": 10,
                "description": "Minimum text length to consider"
            },
            "overlap_threshold": {
                "type": "number",
                "default": 0.5,
                "description": "Threshold for overlap detection"
            },
            "ignore_roles": {
                "type": "string",
                "default": "pageFooter,footnote",
                "description": "Comma-separated list of roles to ignore"
            }
        },
        
        # Define outputs
        outputs={
            "markdown_output_folder": {
                "type": "uri_folder",
                "description": "Folder containing markdown files for each processed PDF"
            },
            "combined_elements_data": {
                "type": "uri_file",
                "description": "CSV file containing combined elements data from all PDFs"
            },
            "visualizations_folder": {
                "type": "uri_folder",
                "description": "Folder containing visualization images organized by PDF"
            }
        },
        
        # Define environment
        environment=env,
        
        # Define code and command
        code="../",
        command="""
        python run.py 
        --input_folder ${{inputs.input_folder}} 
        --doc_intel_connection_id ${{inputs.doc_intel_connection_id}}
        --confidence_threshold ${{inputs.confidence_threshold}} 
        --min_length ${{inputs.min_length}} 
        --overlap_threshold ${{inputs.overlap_threshold}} 
        --ignore_roles ${{inputs.ignore_roles}} 
        --markdown_output_folder ${{outputs.markdown_output_folder}} 
        --combined_elements_data ${{outputs.combined_elements_data}} 
        --visualizations_folder ${{outputs.visualizations_folder}}
        """
    )

In [2]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /config.json


In [11]:
# Register the component
ml_client.components.create_or_update(create_doc_analyzer_component())

CommandComponent({'latest_version': None, 'intellectual_property': None, 'auto_increment_version': False, 'source': 'REMOTE.WORKSPACE.COMPONENT', 'is_anonymous': False, 'auto_delete_setting': None, 'name': 'document_analyzer', 'description': 'Analyzes multiple PDF documents using Azure Document Intelligence and local processing', 'tags': {}, 'properties': {}, 'print_as_yaml': False, 'id': '/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/components/document_analyzer/versions/5', 'Resource__source_path': None, 'base_path': '/home/azureuser/projects/academic-document-analyzer/enhanced_doc_analyzer_component/sdk-v2', 'creation_context': <azure.ai.ml._restclient.v2024_01_01_preview.models._models_py3.SystemData object at 0x7f2e1572d2b0>, 'serialize': <msrest.serialization.Serializer object at 0x7f2e15733b80>, 'command': '\n        python run.py \n        --input_folder ${

In [3]:
# add .. to sys.path

import sys
sys.path.append("..")

In [4]:
from azure.ai.ml import MLClient, dsl, Input, Output
from azure.ai.ml.entities import Pipeline, Data
from azure.identity import DefaultAzureCredential
from azure.ai.ml.constants import AssetTypes

def create_document_analysis_pipeline(
    pdf_folder: Data,
    doc_intel_connection_id: str,
    compute_name: str = "hp-gpu-cluster"
) -> Pipeline:
    """
    Create a pipeline for document analysis.
    
    Args:
        pdf_folder: Reference to the input folder containing PDFs
        doc_intel_connection_id: Azure ML connection ID for Document Intelligence
        compute_name: Name of the compute target to use
        
    Returns:
        Pipeline: The defined pipeline
    """
    
    @dsl.pipeline(
        description="Document analysis pipeline",
        default_compute=compute_name
    )
    def doc_analysis_pipeline():
        # Get the document analyzer component
        analyzer_component = create_doc_analyzer_component()
        
        # Define the analysis job
        analysis_job = analyzer_component(
            input_folder=Input(type=AssetTypes.URI_FOLDER, path=pdf_folder),
            doc_intel_connection_id=doc_intel_connection_id,
            # Optional: Override default parameters
            confidence_threshold=0.8,
            min_length=15,
            overlap_threshold=0.6,
            ignore_roles="pageFooter,footnote,pageHeader"
        )
        
        return {
            "markdown_output_folder": analysis_job.outputs.markdown_output_folder,
            "combined_elements_data": analysis_job.outputs.combined_elements_data,
            "visualizations_folder": analysis_job.outputs.visualizations_folder
        }
    
    return doc_analysis_pipeline


# Get the dataset
# pdf_folder = ml_client.data.get("azureml:raw_papers:1")

# Create the pipeline
pipeline = create_document_analysis_pipeline(
    pdf_folder="azureml:raw_papers:1",
    doc_intel_connection_id="my-doc-intelligence-connection",  # Your connection ID
    compute_name="azureml:hp-gpu-cluster"
)

# Submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="document-analysis",
)

# Wait for the job to complete
ml_client.jobs.stream(pipeline_job.name)

# Get the outputs
job_outputs = ml_client.jobs.get(pipeline_job.name).outputs

print("\nPipeline outputs:")
print(f"Markdown output folder: {job_outputs['markdown_output_folder']}")
print(f"Combined elements data: {job_outputs['combined_elements_data']}")
print(f"Visualizations folder: {job_outputs['visualizations_folder']}")



AttributeError: 'function' object has no attribute 'compute'