In [1]:
# from azure.ai.ml.entities import CommandComponent, Environment, BuildContext
# from pathlib import Path

# def create_doc_analyzer_component(
#     environment_image: str = "mcr.microsoft.com/azureml/curated/acpt-pytorch-2.2-cuda12.1:18",
#     conda_file: str = "../conda.yaml"
# ) -> CommandComponent:
#     """
#     Create the document analyzer component using Azure ML SDK v2.
    
#     Args:
#         environment_image: Docker image to use for the environment
#         conda_file: Path to conda environment file
        
#     Returns:
#         CommandComponent: The defined component
#     """
    
#     # Create the environment
#     env = Environment(
#         build=BuildContext(path="./docker"),
#         name="doc-analyzer-env",
#         description="Custom Environment for Document Analyzer",
#     )
    
#     return CommandComponent(
#         name="document_analyzer",
#         display_name="Document Analyzer",
#         # version="6",
#         description="Analyzes multiple PDF documents using Azure Document Intelligence and local processing",
        
#         # Define inputs
#         inputs={
#             "input_folder": {
#                 "type": "uri_folder",
#                 "description": "Input folder containing PDF files"
#             },
#             "doc_intel_connection_id": {
#                 "type": "string",
#                 "description": "Azure ML connection ID for Document Intelligence"
#             },
#             "confidence_threshold": {
#                 "type": "number",
#                 "default": 0.7,
#                 "description": "Confidence threshold for element detection"
#             },
#             "min_length": {
#                 "type": "integer",
#                 "default": 10,
#                 "description": "Minimum text length to consider"
#             },
#             "overlap_threshold": {
#                 "type": "number",
#                 "default": 0.5,
#                 "description": "Threshold for overlap detection"
#             },
#             "ignore_roles": {
#                 "type": "string",
#                 "default": "pageFooter,footnote",
#                 "description": "Comma-separated list of roles to ignore"
#             }
#         },
        
#         # Define outputs
#         outputs={
#             "output_dir": {
#                 "type": "uri_folder",
#                 "description": "Folder containing visualization images organized by PDF"
#             }
#         },
        
#         # Define environment
#         environment=env,
        
#         # Define code and command
#         code="../",
#         command="python run.py --input_folder ${{inputs.input_folder}} --doc_intel_connection_id ${{inputs.doc_intel_connection_id}} --confidence_threshold ${{inputs.confidence_threshold}} --min_length ${{inputs.min_length}} --overlap_threshold ${{inputs.overlap_threshold}} --ignore_roles ${{inputs.ignore_roles}} --output_dir ${{outputs.output_dir}}"
#     )

In [None]:
! az login

In [1]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Get workspace
ml_client = MLClient.from_config(
    credential=DefaultAzureCredential()
)

Found the config file in: /home/alibina/repo/academic-document-analyzer/config.json


In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data

# Create a data asset
data = Data(
    name="raw_papers",
    version="1",
    path="../../files",
    type="uri_folder"
)

# Register the data asset
# ml_client.data.create_or_update(data)

In [3]:
# Register the component
# ml_client.components.create_or_update(create_doc_analyzer_component())

In [4]:
# add .. to sys.path

import sys
sys.path.append("..")

In [5]:
from azure.ai.ml import dsl, Input, load_component, load_environment
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes


# Load component and environment definitions
analyzer_component = load_component(source="./doc_analyzer_component.yaml")

   
@dsl.pipeline(
    description="Document analysis pipeline",
    default_compute="gpu-cluster"
)
def doc_analysis_pipeline(
    pdf_folder,
    doc_intel_connection_id: str,
    confidence_threshold: float = 0.5,
    min_length: int = 10,
    overlap_threshold: float = 0.5,
    ignore_roles: str = "pageFooter,footnote,pageHeader",
    compute_name: str = "gpu-cluster"
):
    # Define the analysis job
    analysis_job = analyzer_component(
        input_folder=pdf_folder,  # Pass the Input object directly
        doc_intel_connection_id=doc_intel_connection_id,
        confidence_threshold=confidence_threshold,
        min_length=min_length,
        overlap_threshold=overlap_threshold,
        ignore_roles=ignore_roles
    )

    analysis_job.compute = compute_name
    
    return {
        "output_dir": analysis_job.outputs.output_dir,
    }

# Get the document intelligence connection
doc_intelligence_connection = ml_client.connections.get("my-doc-intelligence-connection")

# Get the data asset with version
raw_papers = ml_client.data.get("raw_papers", version="1")

# Create Input object for the data
pdf_input = Input(type=AssetTypes.URI_FOLDER, path=raw_papers.path)

# Create and submit the pipeline
pipeline_job = ml_client.jobs.create_or_update(
    doc_analysis_pipeline(
        pdf_folder=pdf_input,  # Pass the Input object
        doc_intel_connection_id=doc_intelligence_connection.id,
        compute_name="hp-gpu-cluster",
        confidence_threshold=0.3,
        min_length=15,
        overlap_threshold=0.7,
        ignore_roles="pageFooter,footnote,pageHeader"
    ),
    experiment_name="document-analysis",
)

# Optional: Stream the job logs
try:
    ml_client.jobs.stream(pipeline_job.name)
except Exception as e:
    print(f"Error streaming logs: {str(e)}")

# Get the outputs
try:
    job_outputs = ml_client.jobs.get(pipeline_job.name).outputs
    print("\nPipeline outputs:")
    print(f"Output folder: {job_outputs['output_dir']}")
except Exception as e:
    print(f"Error getting job outputs: {str(e)}")

Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.


HttpResponseError: Operation returned an invalid status 'This request is not authorized to perform this operation.'
ErrorCode:AuthorizationFailure

In [None]:
doc_intelligence_connection = ml_client.connections.get("my-doc-intelligence-connection")
raw_papers = ml_client.data.get("raw-papers", version="1") 

# Create the pipeline
pipeline = doc_analysis_pipeline(
    pdf_folder=raw_papers.path,
    doc_intel_connection_id=doc_intelligence_connection.id,  # Your connection ID
    compute_name="gpu-cluster",
    confidence_threshold=0.3,
    min_length=15,
    overlap_threshold=0.7,
    ignore_roles="pageFooter,footnote,pageHeader",
    top_margin_percent=10,
    bottom_margin_percent=20,
)

# Submit the pipeline job
pipeline_job = ml_client.jobs.create_or_update(
    pipeline,
    experiment_name="document-analysis",
)

# Wait for the job to complete
# ml_client.jobs.stream(pipeline_job.name)

# # Get the outputs
# job_outputs = ml_client.jobs.get(pipeline_job.name).outputs

# print("\nPipeline outputs:")
# print(f"Output folder: {job_outputs['output_dir']}")



ResourceNotFoundError: (UserError) User error when calling GenericAssetMLIndexServiceClient.MoveNext. Service invocation failed!
Request: GET swedencentral.api.azureml.ms/genericasset/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/indexes/raw-papers/versions/1
Status Code: 404 NotFound
Error Code: UserError/NotFoundError
Reason Phrase: Asset with Asset ID was not found
Response Body: {"error":{"code":"UserError","message":"Asset with Asset ID was not found","details":[],"innerError":{"code":"NotFoundError"}},"correlation":{"operation":"9f169ae04ea0a4554e00c1e0086dc283","request":"2eb5ebcf4929e1f8"},"environment":"swedencentral","location":"swedencentral","time":"2025-02-07T20:42:40.6259304+00:00","componentName":"genericasset","statusCode":404}
Code: UserError
Message: User error when calling GenericAssetMLIndexServiceClient.MoveNext. Service invocation failed!
Request: GET swedencentral.api.azureml.ms/genericasset/subscriptions/f804f2da-c27b-45ac-bf80-16d4d331776d/resourceGroups/rg-airesearcher-dev-01/providers/Microsoft.MachineLearningServices/workspaces/mlw-airesearcher-dev-01/indexes/raw-papers/versions/1
Status Code: 404 NotFound
Error Code: UserError/NotFoundError
Reason Phrase: Asset with Asset ID was not found
Response Body: {"error":{"code":"UserError","message":"Asset with Asset ID was not found","details":[],"innerError":{"code":"NotFoundError"}},"correlation":{"operation":"9f169ae04ea0a4554e00c1e0086dc283","request":"2eb5ebcf4929e1f8"},"environment":"swedencentral","location":"swedencentral","time":"2025-02-07T20:42:40.6259304+00:00","componentName":"genericasset","statusCode":404}