In [4]:
from azure.ai.ml import Input
from azure.ai.ml import load_component, MLClient
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import CommandComponent, Environment
from azure.ai.ml.entities import Component

def init_component() -> Component:
    """Build the "Document Analyzer" command component definition.

    The component runs ``run.py`` from the current directory in an
    environment described by a base Docker image plus ``conda.yaml``. It
    takes one PDF file, the Azure Document Intelligence key and endpoint,
    and four tuning parameters that have defaults; it declares two file
    outputs (markdown, elements CSV) and one folder output (visualizations).

    Returns:
        A new ``CommandComponent`` instance; nothing is registered here.
    """
    # NOTE(review): the API key is declared as a plain string input and is
    # interpolated into the command line below. Consider whether it should
    # be supplied through a secret store instead.
    component_inputs = {
        "input_pdf": dict(
            type="uri_file",
            description="Input PDF file to analyze",
        ),
        "azure_api_key": dict(
            type="string",
            description="Azure Document Intelligence API key",
        ),
        "azure_endpoint": dict(
            type="string",
            description="Azure Document Intelligence endpoint",
        ),
        "confidence_threshold": dict(
            type="number",
            default=0.7,
            description="Confidence threshold for element detection",
        ),
        "min_length": dict(
            type="integer",
            default=10,
            description="Minimum text length to consider",
        ),
        "overlap_threshold": dict(
            type="number",
            default=0.5,
            description="Threshold for overlap detection",
        ),
        "ignore_roles": dict(
            type="string",
            default="pageFooter,footnote",
            description="Comma-separated list of roles to ignore",
        ),
    }

    component_outputs = {
        "markdown_output": dict(
            type="uri_file",
            description="Markdown representation of the analyzed document",
        ),
        "elements_data": dict(
            type="uri_file",
            description="CSV file containing detected elements data",
        ),
        "visualizations": dict(
            type="uri_folder",
            description="Folder containing visualization images",
        ),
    }

    # Base image plus the conda specification that sits next to the notebook.
    runtime_env = Environment(
        image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
        conda_file="conda.yaml",
    )

    # The backslash-newline pairs are consumed by Python, so this literal is
    # one long line (plus leading/trailing whitespace). Every input and
    # output is forwarded to run.py as a same-named flag.
    run_command = """
        python run.py \
            --input_pdf ${{inputs.input_pdf}} \
            --azure_api_key ${{inputs.azure_api_key}} \
            --azure_endpoint ${{inputs.azure_endpoint}} \
            --confidence_threshold ${{inputs.confidence_threshold}} \
            --min_length ${{inputs.min_length}} \
            --overlap_threshold ${{inputs.overlap_threshold}} \
            --ignore_roles ${{inputs.ignore_roles}} \
            --markdown_output ${{outputs.markdown_output}} \
            --elements_data ${{outputs.elements_data}} \
            --visualizations ${{outputs.visualizations}}
        """

    return CommandComponent(
        name="document_analyzer",
        display_name="Document Analyzer",
        description="Analyzes PDF documents using Azure Document Intelligence and local processing",
        version="1.0.0",
        type="command",
        inputs=component_inputs,
        outputs=component_outputs,
        code="./",  # the component source is taken from the current directory
        environment=runtime_env,
        command=run_command,
    )

In [18]:
@pipeline(name="document_analysis_pipeline")
def create_example_pipeline(
    input_pdf: Input(type="uri_file"),
    azure_api_key: str,
    azure_endpoint: str
):
    """Example pipeline that runs the document analyzer component once.

    Uses the ``pipeline`` decorator imported from ``azure.ai.ml.dsl``; the
    previous ``@dsl.pipeline`` spelling referenced a ``dsl`` name that the
    notebook never imports.

    Args:
        input_pdf: PDF file to analyze (``uri_file`` input).
        azure_api_key: Azure Document Intelligence API key, forwarded to the
            component as a plain string.
        azure_endpoint: Azure Document Intelligence endpoint.

    Returns:
        A dict exposing the component's three outputs as pipeline outputs
        under the keys ``markdown_output``, ``elements_data`` and
        ``visualizations``.

    NOTE(review): a later cell defines another ``create_example_pipeline``
    with a different signature; whichever cell runs last wins.
    """
    # Build a fresh component definition for this pipeline.
    doc_analyzer = init_component()

    # The tuning inputs are not passed, so the component's declared defaults
    # apply to them.
    analysis_job = doc_analyzer(
        input_pdf=input_pdf,
        azure_api_key=azure_api_key,
        azure_endpoint=azure_endpoint
    )

    return {
        "markdown_output": analysis_job.outputs.markdown_output,
        "elements_data": analysis_job.outputs.elements_data,
        "visualizations": analysis_job.outputs.visualizations
    }

In [2]:
# import required libraries
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential


# Workspace handle built from the local config file, authenticated with
# DefaultAzureCredential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

Found the config file in: /config.json


In [5]:
# Create (or update) the component in the workspace behind `ml_client`,
# keeping the returned object for later use.
registered_component = ml_client.components.create_or_update(
    init_component()
)

Uploading enhanced_doc_analyzer_component (0.1 MBs): 100%|██████████| 104602/104602 [00:00<00:00, 902567.38it/s]

In [None]:




def create_example_pipeline():
    """Build and return the example document-analysis pipeline function.

    Nothing is submitted here: the caller receives the decorated pipeline
    function and invokes it with ``input_pdf``, ``azure_api_key`` and
    ``azure_endpoint`` to obtain a pipeline job.

    Returns:
        The ``document_analysis_pipeline`` function wrapped by ``@pipeline``.

    NOTE(review): an earlier cell defines a function with this same name
    but a different signature; this definition shadows it once the cell
    has run.
    """
    # Imported here so the `Input` annotation below resolves when this
    # factory is called, regardless of which import cells have been run.
    from azure.ai.ml import Input

    @pipeline(name="document_analysis_pipeline")
    def document_analysis_pipeline(
        input_pdf: Input(type="uri_file"),
        azure_api_key: str,
        azure_endpoint: str
    ):
        """Run the document analyzer once and expose its three outputs."""
        # Build a fresh component definition for this pipeline.
        doc_analyzer = init_component()

        # The tuning inputs are not passed, so the component's declared
        # defaults apply to them.
        analysis_job = doc_analyzer(
            input_pdf=input_pdf,
            azure_api_key=azure_api_key,
            azure_endpoint=azure_endpoint
        )

        return {
            "markdown_output": analysis_job.outputs.markdown_output,
            "elements_data": analysis_job.outputs.elements_data,
            "visualizations": analysis_job.outputs.visualizations
        }

    return document_analysis_pipeline


    
    # Initialize component
    # component = init_component()
    
    # Create example pipeline
    # pipeline = create_example_pipeline()
    
# Emit the usage instructions in one call; joining with newlines yields the
# same stdout as printing each line separately.
print("\n".join([
    "Component and pipeline created successfully!",
    "\nTo use this component in Azure ML:",
    "1. Package the component:",
    "   - Ensure all files are in the same directory:",
    "     - conda.yaml",
    "     - run.py",
    "     - enhanced_document_analyzer/",
    "2. Register the component in Azure ML:",
    "   ml_client.components.create_or_update(component)",
    "3. Use in a pipeline as shown in the example pipeline",
]))



Component and pipeline created successfully!

To use this component in Azure ML:
1. Package the component:
   - Ensure all files are in the same directory:
     - conda.yaml
     - run.py
     - enhanced_document_analyzer/
2. Register the component in Azure ML:
   ml_client.components.create_or_update(component)
3. Use in a pipeline as shown in the example pipeline
