In [1]:
# import required libraries
from azure.ai.ml import MLClient
from azure.ai.ml.entities import Environment, BuildContext
from azure.identity import DefaultAzureCredential

In [4]:
# Build the workspace client from the local config file, authenticating with
# the default Azure credential.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

Found the config file in: ./config.json


In [5]:
# Environment definition: base Docker image plus the project's conda spec.
env_docker_conda = Environment(
    name="docker-academic-documents-analyser",
    description="Environment for the Academic Documents Analyser",
    image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
    conda_file="enviroment/conda.yaml",
)
# Kept as the cell's final expression so the registered environment is displayed.
ml_client.environments.create_or_update(env_docker_conda)

Environment({'arm_type': 'environment_version', 'latest_version': None, 'image': 'mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18', 'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'docker-academic-documents-analyser', 'description': 'Environment for the Academic Documents Analyser', 'tags': {}, 'properties': {'azureml.labels': 'latest'}, 'print_as_yaml': False, 'id': '/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/environments/docker-academic-documents-analyser/versions/1', 'Resource__source_path': '', 'base_path': '/home/alibina/repo/academic-document-analyzer', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f23f8534820>, 'serialize': <msrest.serialization.Serializer object at 0x7f23f8534d00>, 'version': '1', 'conda_file': {'channels': ['conda-

In [None]:
from pathlib import Path
import os
import yaml
from typing import List

from azure.ai.ml import Input, Output
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Component, Environment, Command
from src import EnhancedDocumentAnalyzer

def init_component():
    """Build the Enhanced Document Analyzer command component.

    The component runs ``python run.py`` with one CLI flag per declared input
    and output.

    Inputs:
        input_pdf (uri_file), azure_api_key (string), azure_endpoint (string),
        confidence_threshold (number, default 0.7), min_length (integer,
        default 10), overlap_threshold (number, default 0.5) and
        ignore_roles (string, default "pageFooter,footnote").

    Outputs:
        markdown_output (uri_file), elements_data (uri_file) and
        visualizations (uri_folder).

    Returns:
        CommandComponent: the component definition (a ``Component`` subclass),
        callable inside a ``@pipeline`` function to create a step.
    """
    # Local import keeps this function self-contained: the notebook's import
    # cell only brings in the base ``Component`` class, which cannot carry a
    # command line or environment.
    from azure.ai.ml.entities import CommandComponent

    # NOTE(review): the API key is a plain string input and is also placed on
    # the command line below, so it may be visible in job metadata/logs.
    # Consider reading it from a secret store inside run.py instead.
    inputs = {
        "input_pdf": Input(type="uri_file", description="Input PDF file to analyze"),
        "azure_api_key": Input(type="string", description="Azure Document Intelligence API key"),
        "azure_endpoint": Input(type="string", description="Azure Document Intelligence endpoint"),
        "confidence_threshold": Input(
            type="number",
            default=0.7,
            description="Confidence threshold for element detection"
        ),
        "min_length": Input(
            type="integer",
            default=10,
            description="Minimum text length to consider"
        ),
        "overlap_threshold": Input(
            type="number",
            default=0.5,
            description="Threshold for overlap detection"
        ),
        "ignore_roles": Input(
            type="string",
            default="pageFooter,footnote",
            description="Comma-separated list of roles to ignore"
        ),
    }

    outputs = {
        "markdown_output": Output(
            type="uri_file",
            description="Markdown representation of the analyzed document"
        ),
        "elements_data": Output(
            type="uri_file",
            description="CSV file containing detected elements data"
        ),
        "visualizations": Output(
            type="uri_folder",
            description="Folder containing visualization images"
        ),
    }

    # Join the arguments with spaces so the command is a single line. A
    # triple-quoted string with raw newlines would make the shell treat every
    # argument line as a separate command.
    command = " ".join([
        "python run.py",
        "--input_pdf ${{inputs.input_pdf}}",
        "--azure_api_key ${{inputs.azure_api_key}}",
        "--azure_endpoint ${{inputs.azure_endpoint}}",
        "--confidence_threshold ${{inputs.confidence_threshold}}",
        "--min_length ${{inputs.min_length}}",
        "--overlap_threshold ${{inputs.overlap_threshold}}",
        "--ignore_roles ${{inputs.ignore_roles}}",
        "--markdown_output ${{outputs.markdown_output}}",
        "--elements_data ${{outputs.elements_data}}",
        "--visualizations ${{outputs.visualizations}}",
    ])

    # NOTE(review): no `code` directory is passed, so nothing here uploads
    # run.py (or the analyzer package) with the component -- confirm how the
    # script is expected to reach the job before registering.
    component = CommandComponent(
        name="enhanced_document_analyzer",
        display_name="Enhanced Document Analyzer",
        version="1.0.0",
        description="Analyzes PDF documents using Azure Document Intelligence and local layout detection",
        tags={"category": "Document Processing"},
        inputs=inputs,
        outputs=outputs,
        environment=Environment(
            conda_file="enviroment/conda.yaml",
            image="mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18"
        ),
        command=command,
    )

    return component

def create_conda_env(output_path: str = "enviroment/conda.yaml"):
    """Write the conda environment specification for the component.

    Args:
        output_path: Destination of the YAML file. Defaults to the path the
            rest of this notebook references. Missing parent directories are
            created, and an existing file at this path is overwritten.

    Returns:
        None. The specification is written to ``output_path``.
    """
    conda_env = {
        "name": "doc_analyzer_env",
        "channels": ["conda-forge", "defaults"],
        "dependencies": [
            "python=3.9",
            "pip",
            {
                "pip": [
                    "azure-ai-formrecognizer>=3.2.0",
                    "azure-ai-ml>=1.4.0",
                    "torch>=1.12.0",
                    "Pillow>=9.0.0",
                    "pandas>=1.4.0",
                    "numpy>=1.21.0",
                    "transformers>=4.20.0",
                    "nougat-ocr>=0.1.0",
                    # The PyPI package named `fitz` is deliberately not
                    # listed: it is unrelated to PyMuPDF and conflicts with
                    # the `fitz` import that PyMuPDF itself provides.
                    "PyMuPDF>=1.19.0"
                ]
            }
        ]
    }

    # open(..., "w") does not create directories, so make sure the target
    # folder exists first. A bare filename has no parent to create.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Save conda environment specification
    with open(output_path, "w") as f:
        yaml.dump(conda_env, f)

def create_run_script(output_path: str = "run.py"):
    """Write the component's entry-point script.

    The generated script parses the component's CLI flags, runs
    ``EnhancedDocumentAnalyzer.analyze_document`` on the input PDF, writes the
    markdown and CSV outputs, and copies the visualization files into the
    visualizations folder.

    Args:
        output_path: Where to write the script. Defaults to ``run.py`` in the
            current working directory; an existing file is overwritten.

    Returns:
        None. The script is written to ``output_path``.
    """
    # Raw string on purpose: the template contains `\n` inside an f-string.
    # In a normal string literal that escape becomes a real newline in the
    # generated file, which splits the f-string over two lines and makes
    # run.py fail with a SyntaxError.
    #
    # NOTE(review): the template passes `ignor_roles=` (sic) to
    # EnhancedDocumentAnalyzer -- confirm this matches the constructor's
    # actual parameter name.
    run_script = r"""
import argparse
from pathlib import Path
import pandas as pd
from enhanced_document_analyzer import EnhancedDocumentAnalyzer

def parse_args():
    parser = argparse.ArgumentParser()
    
    # Input arguments
    parser.add_argument("--input_pdf", type=str, required=True)
    parser.add_argument("--azure_api_key", type=str, required=True)
    parser.add_argument("--azure_endpoint", type=str, required=True)
    parser.add_argument("--confidence_threshold", type=float, default=0.7)
    parser.add_argument("--min_length", type=int, default=10)
    parser.add_argument("--overlap_threshold", type=float, default=0.5)
    parser.add_argument("--ignore_roles", type=str, default="pageFooter,footnote")
    
    # Output arguments
    parser.add_argument("--markdown_output", type=str, required=True)
    parser.add_argument("--elements_data", type=str, required=True)
    parser.add_argument("--visualizations", type=str, required=True)
    
    return parser.parse_args()

def main():
    # Parse arguments
    args = parse_args()
    
    # Create output directories
    output_dir = Path("output")
    output_dir.mkdir(parents=True, exist_ok=True)
    
    vis_dir = Path(args.visualizations)
    vis_dir.mkdir(parents=True, exist_ok=True)
    
    # Initialize analyzer
    analyzer = EnhancedDocumentAnalyzer(
        api_key=args.azure_api_key,
        endpoint=args.azure_endpoint,
        output_dir=str(output_dir),
        confidence_threshold=args.confidence_threshold,
        min_length=args.min_length,
        overlap_threshold=args.overlap_threshold,
        ignor_roles=args.ignore_roles.split(",")
    )
    
    try:
        # Process document
        markdown_text, elements_df, visualizations = analyzer.analyze_document(args.input_pdf)
        
        # Save markdown output
        with open(args.markdown_output, "w", encoding="utf-8") as f:
            f.write(markdown_text)
        
        # Save elements data
        elements_df.to_csv(args.elements_data, index=False)
        
        # Copy visualizations to output directory
        import shutil
        for page_num, vis_path in visualizations.items():
            src_path = Path(vis_path)
            dst_path = vis_dir / src_path.name
            shutil.copy2(src_path, dst_path)
        
        print(f"\nProcessing complete!")
        print(f"Total elements detected: {len(elements_df)}")
        print(f"Visualization pages generated: {len(visualizations)}")
        
    except Exception as e:
        print(f"Error processing document: {str(e)}")
        raise

if __name__ == "__main__":
    main()
"""

    # Save run script (leading/trailing blank lines of the template removed)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(run_script.strip())

def create_example_pipeline():
    """Create an example pipeline using the component.

    Defines a ``@pipeline``-decorated function named
    ``document_analysis_pipeline`` with a single step built from
    ``init_component()``, and returns that decorated function without
    calling it.

    Returns:
        The decorated ``document_analysis_pipeline`` function. It takes
        ``input_pdf``, ``azure_api_key`` and ``azure_endpoint`` and exposes
        the step's ``markdown_output``, ``elements_data`` and
        ``visualizations`` outputs.
    """
    
    # Comments only (no docstring) inside the decorated function, so nothing
    # the decorator might read from it is changed.
    @pipeline(name="document_analysis_pipeline")
    def document_analysis_pipeline(
        input_pdf: Input(type="uri_file"),
        azure_api_key: str,
        azure_endpoint: str
    ):
        # Build the component definition each time the pipeline function runs
        doc_analyzer = init_component()
        
        # Create the analysis step. Only the three inputs without declared
        # defaults are passed; the other component inputs are not set here, so
        # the defaults declared in init_component are expected to apply.
        analysis_job = doc_analyzer(
            input_pdf=input_pdf,
            azure_api_key=azure_api_key,
            azure_endpoint=azure_endpoint
        )
        
        # Expose the step's outputs under the same names as pipeline outputs
        return {
            "markdown_output": analysis_job.outputs.markdown_output,
            "elements_data": analysis_job.outputs.elements_data,
            "visualizations": analysis_job.outputs.visualizations
        }
    
    return document_analysis_pipeline

def main():
    """Generate the component's support files and print usage guidance.

    Runs, in order: ``create_conda_env``, ``create_run_script``,
    ``init_component`` and ``create_example_pipeline``, then prints the
    follow-up steps. The component and pipeline objects are built but not
    kept or returned.
    """
    # On-disk artefacts first: conda spec, then the entry-point script.
    create_conda_env()
    create_run_script()

    # Build the component and the example pipeline; the results are discarded.
    init_component()
    create_example_pipeline()

    guidance = (
        "Component and pipeline created successfully!",
        "\nTo use this component in Azure ML:",
        "1. Package the component:",
        "   - Ensure all files are in the same directory:",
        "     - conda.yaml",
        "     - run.py",
        "     - enhanced_document_analyzer/",
        "2. Register the component in Azure ML:",
        "   ml_client.components.create_or_update(component)",
        "3. Use in a pipeline as shown in the example pipeline",
    )
    for message in guidance:
        print(message)


if __name__ == "__main__":
    main()