In [1]:
# import required libraries
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment, BuildContext
from azure.identity import DefaultAzureCredential

In [2]:
# Workspace client: connection details are read from the discovered config.json,
# authentication goes through the default Azure credential chain.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

Found the config file in: /home/alibina/repo/academic-document-analyzer/config.json


In [None]:
import os
from pathlib import Path
from typing import List

import yaml
from azure.ai.ml import Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Command, CommandComponent, Component, Environment

from enhanced_document_analyzer import EnhancedDocumentAnalyzer

def init_component(code: str = "."):
    """Build the Enhanced Document Analyzer as an Azure ML command component.

    The component is only constructed in memory; nothing is registered in the
    workspace by this function.

    Args:
        code: Local directory uploaded together with the component. It must
            contain ``run.py``, the script the command line invokes. Defaults
            to the current working directory.

    Returns:
        A ``CommandComponent`` named ``enhanced_document_analyzer``
        (version ``1.0.0``) that runs ``python run.py`` with every declared
        input and output bound to a command-line flag of the same name.
    """
    # Inputs exposed by the component. Only the PDF and the two Azure Document
    # Intelligence settings have no default and must be supplied by the caller.
    # NOTE(review): azure_api_key is a plain string input that ends up on the
    # command line -- consider sourcing the secret from Key Vault instead.
    inputs = {
        "input_pdf": Input(type="uri_file", description="Input PDF file to analyze"),
        "azure_api_key": Input(type="string", description="Azure Document Intelligence API key"),
        "azure_endpoint": Input(type="string", description="Azure Document Intelligence endpoint"),
        "confidence_threshold": Input(
            type="number",
            default=0.7,
            description="Confidence threshold for element detection",
        ),
        "min_length": Input(
            type="integer",
            default=10,
            description="Minimum text length to consider",
        ),
        "overlap_threshold": Input(
            type="number",
            default=0.5,
            description="Threshold for overlap detection",
        ),
        "ignore_roles": Input(
            type="string",
            default="pageFooter,footnote",
            description="Comma-separated list of roles to ignore",
        ),
    }

    # Artifacts written by run.py.
    outputs = {
        "markdown_output": Output(
            type="uri_file",
            description="Markdown representation of the analyzed document",
        ),
        "elements_data": Output(
            type="uri_file",
            description="CSV file containing detected elements data",
        ),
        "visualizations": Output(
            type="uri_folder",
            description="Folder containing visualization images",
        ),
    }

    # The command is handed to a shell, so it must be ONE logical line: an
    # embedded newline would end the command and turn every following
    # "--flag value" line into a separate (failing) command. Adjacent string
    # literals keep the source readable without putting newlines in the value.
    command = (
        "python run.py"
        " --input_pdf ${{inputs.input_pdf}}"
        " --azure_api_key ${{inputs.azure_api_key}}"
        " --azure_endpoint ${{inputs.azure_endpoint}}"
        " --confidence_threshold ${{inputs.confidence_threshold}}"
        " --min_length ${{inputs.min_length}}"
        " --overlap_threshold ${{inputs.overlap_threshold}}"
        " --ignore_roles ${{inputs.ignore_roles}}"
        " --markdown_output ${{outputs.markdown_output}}"
        " --elements_data ${{outputs.elements_data}}"
        " --visualizations ${{outputs.visualizations}}"
    )

    # CommandComponent (not the generic Component base class) is the entity
    # that carries `code`, `command` and `environment`; the command itself is a
    # plain string rather than a `Command` pipeline node.
    return CommandComponent(
        name="enhanced_document_analyzer",
        display_name="Enhanced Document Analyzer",
        version="1.0.0",
        description="Analyzes PDF documents using Azure Document Intelligence and local layout detection",
        tags={"category": "Document Processing"},
        inputs=inputs,
        outputs=outputs,
        code=code,
        command=command,
        # Curated PyTorch base image with extra packages layered on from conda.yaml.
        environment=Environment(
            conda_file="conda.yaml",
            image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
        ),
    )


In [None]:




def create_example_pipeline():
    """Return an example pipeline function that runs the analyzer component once.

    The pipeline itself is not built or submitted here: the returned callable
    (decorated with ``@pipeline`` under the name ``document_analysis_pipeline``)
    has to be invoked with ``input_pdf``, ``azure_api_key`` and
    ``azure_endpoint`` to produce a pipeline job.

    Returns:
        The decorated ``document_analysis_pipeline`` function.
    """
    
    # NOTE(review): no docstring is added to the decorated function and its
    # locals are left unrenamed on purpose -- the Azure ML DSL is believed to
    # derive the pipeline description and step names from them; confirm before
    # refactoring.
    @pipeline(name="document_analysis_pipeline")
    def document_analysis_pipeline(
        input_pdf: Input(type="uri_file"),
        azure_api_key: str,
        azure_endpoint: str
    ):
        # Get the component (a fresh definition is built on every pipeline build)
        doc_analyzer = init_component()
        
        # Run the analysis; the remaining component inputs are not passed here,
        # so they fall back to the defaults declared on the component
        analysis_job = doc_analyzer(
            input_pdf=input_pdf,
            azure_api_key=azure_api_key,
            azure_endpoint=azure_endpoint
        )
        
        # Expose the step's three outputs as the pipeline's outputs, same names
        return {
            "markdown_output": analysis_job.outputs.markdown_output,
            "elements_data": analysis_job.outputs.elements_data,
            "visualizations": analysis_job.outputs.visualizations
        }
    
    return document_analysis_pipeline


    
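# Nothing is instantiated in this cell: the calls below are commented out.
# Uncomment them to build the objects locally.
#
# Initialize component
# component = init_component()
#
# Create example pipeline (bind it to a name other than `pipeline`, which
# would shadow the imported @pipeline decorator)
# example_pipeline = create_example_pipeline()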
    
# Emit the usage guide in a single call; sep="\n" puts each argument on its own
# line, exactly as one print() per line would.
print(
    "Component and pipeline created successfully!",
    "\nTo use this component in Azure ML:",
    "1. Package the component:",
    "   - Ensure all files are in the same directory:",
    "     - conda.yaml",
    "     - run.py",
    "     - enhanced_document_analyzer/",
    "2. Register the component in Azure ML:",
    "   ml_client.components.create_or_update(component)",
    "3. Use in a pipeline as shown in the example pipeline",
    sep="\n",
)



Component and pipeline created successfully!

To use this component in Azure ML:
1. Package the component:
   - Ensure all files are in the same directory:
     - conda.yaml
     - run.py
     - enhanced_document_analyzer/
2. Register the component in Azure ML:
   ml_client.components.create_or_update(component)
3. Use in a pipeline as shown in the example pipeline
