In [5]:
import sys

# Put both component source folders on the import path, in this order.
# NOTE(review): these entries are relative to the parent directory ('../'),
# while the component YAMLs are later loaded relative to the current
# directory ('./') -- confirm which location is intended.
sys.path.extend([
    '../chunk_caption_index_component/',
    '../enhanced_doc_analyzer_component/',
])

In [6]:
# Connect to the Azure ML workspace described by the local config file
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build a workspace client: credentials come from DefaultAzureCredential and
# the workspace details are read from the config file that from_config finds.
workspace_credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=workspace_credential)

Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


In [7]:
from azure.ai.ml import dsl, Input
from azure.ai.ml import load_component, load_environment
from azure.ai.ml.constants import AssetTypes

# Load the two component definitions from their YAML specs
# (paths are relative to the notebook's working directory).
analyzer_component = load_component(
    source="./enhanced_doc_analyzer_component/doc_analyzer_component.yaml"
)
chunk_caption_index = load_component(
    source="./chunk_caption_index_component/chunk-caption-index-component.yaml"
)

# Two-step pipeline definition: run the document analyzer component over a
# folder of PDFs, then pass the analyzer's output directory to the
# chunk/caption/index component.
#
# Inputs:
#   pdf_folder                 -- handed to the analyzer step as `input_folder`
#   doc_intel_connection_id    -- forwarded to the analyzer step
#   azure_openai_connection_id -- forwarded to the chunk/caption/index step
#   azure_search_connection_id -- forwarded to the chunk/caption/index step
#   confidence_threshold, min_length, overlap_threshold, ignore_roles
#                              -- forwarded unchanged to the analyzer step
#   embd_deployment_name, vision_deployment_name, index_name
#                              -- forwarded unchanged to the chunk/caption/index step
#   analyzer_compute           -- assigned as the analyzer step's compute
#   indexer_compute            -- assigned as the chunk/caption/index step's compute
#
# Returns a dict with two entries: "analyzer_output" (the analyzer step's
# `output_dir`) and "final_output" (the indexing step's `output_folder`).
#
# NOTE(review): the Azure ML DSL may derive step names from the local variable
# names used below (`analysis_job`, `chunk_caption_job`) and may read a
# function docstring for input descriptions -- confirm before renaming locals
# or converting these comments into a docstring.
@dsl.pipeline(
    description="Combined document analysis and indexing pipeline",
    # Both steps below assign their own compute explicitly.
    default_compute="gpu-cluster"
)
def document_processing_pipeline(
    # Data and connection inputs (used by both steps)
    
    pdf_folder,
    doc_intel_connection_id: str,
    azure_openai_connection_id: str,
    azure_search_connection_id: str,
    # Analyzer settings
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    # Chunk/caption/index settings
    embd_deployment_name: str = "text-embedding-ada-002",
    vision_deployment_name: str = "gpt-4",
    index_name: str = "myindex",
    # Compute settings
    analyzer_compute: str = "gpu-cluster",
    indexer_compute: str = "cpu-cluster"
):
    # Step 1: document analyzer, reading from the supplied PDF folder
    analysis_job = analyzer_component(
        input_folder=pdf_folder,
        doc_intel_connection_id=doc_intel_connection_id,
        confidence_threshold=confidence_threshold,
        min_length=min_length,
        overlap_threshold=overlap_threshold,
        ignore_roles=ignore_roles
    )
    # Run this step on the compute named by the caller
    analysis_job.compute = analyzer_compute

    # Step 2: chunk/caption/index.
    # Its input is the analyzer's `output_dir`, which chains the two steps.
    chunk_caption_job = chunk_caption_index(
        input_folder=analysis_job.outputs.output_dir,
        azure_openai_connection_id=azure_openai_connection_id,
        azure_search_connection_id=azure_search_connection_id,
        embd_deployment_name=embd_deployment_name,
        vision_deployment_name=vision_deployment_name,
        index_name=index_name
    )
    # Run this step on its own, separately configurable compute
    chunk_caption_job.compute = indexer_compute

    # Expose the intermediate analyzer output alongside the final output
    return {
        "analyzer_output": analysis_job.outputs.output_dir,
        "final_output": chunk_caption_job.outputs.output_folder
    }

# Example usage
def main(
    ml_client,
    *,
    doc_intel_connection_name: str = "my-doc-intelligence-connection",
    azure_search_connection_name: str = "aisearch505",
    azure_openai_connection_name: str = "aoai-sweden-505",
    data_asset_name: str = "raw_papers",
    data_asset_version: str = "1",
):
    """Build the document-processing pipeline job for a workspace.

    Looks up the three workspace connections and the input data asset through
    ``ml_client``, then calls ``document_processing_pipeline`` with their ids
    and the example settings below. The job is only constructed here; it is
    not submitted.

    Args:
        ml_client: Workspace client used for the connection and data lookups.
        doc_intel_connection_name: Name of the connection whose id is passed
            as ``doc_intel_connection_id``.
        azure_search_connection_name: Name of the connection whose id is
            passed as ``azure_search_connection_id``.
        azure_openai_connection_name: Name of the connection whose id is
            passed as ``azure_openai_connection_id``.
        data_asset_name: Name of the data asset holding the input PDFs.
        data_asset_version: Version of that data asset to read.

    Returns:
        The object returned by ``document_processing_pipeline(...)``.
    """
    # Resolve workspace connections by name; only their ids are passed on
    doc_intelligence_connection = ml_client.connections.get(doc_intel_connection_name)
    azure_search_connection = ml_client.connections.get(azure_search_connection_name)
    azure_openai_connection = ml_client.connections.get(azure_openai_connection_name)

    # Resolve the pinned data asset version and wrap its path as a folder input
    raw_papers = ml_client.data.get(data_asset_name, version=data_asset_version)
    pdf_input = Input(type=AssetTypes.URI_FOLDER, path=raw_papers.path)

    # Create pipeline
    pipeline = document_processing_pipeline(
        # Document Analyzer params
        pdf_folder=pdf_input,
        doc_intel_connection_id=doc_intelligence_connection.id,
        confidence_threshold=0.3,
        min_length=15,
        overlap_threshold=0.7,
        ignore_roles="pageFooter,footnote,pageHeader",

        # Chunk Caption Index params
        azure_openai_connection_id=azure_openai_connection.id,
        azure_search_connection_id=azure_search_connection.id,
        embd_deployment_name="text-embedding-ada-002",
        vision_deployment_name="gpt-4v",
        index_name="myindex",

        # Compute settings
        analyzer_compute="hp-gpu-cluster",
        indexer_compute="cpu-cluster"
    )

    return pipeline

In [8]:
# Build the pipeline job using the workspace's connections and data asset
pipeline = main(ml_client)

# Submit the job to the workspace under a named experiment, with a tag
run = ml_client.jobs.create_or_update(
    pipeline,
    tags=dict(type="document-processing"),
    experiment_name="document-processing-pipeline",
)

[32mUploading chunk_caption_index_component (0.02 MBs): 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 24636/24636 [00:00<00:00, 267211.98i