In [1]:
import sys

# Directory added to the module search path so that modules stored there can
# be imported by bare name (presumably where `document_processor` lives —
# confirm). The path is relative, so it is resolved against the kernel's
# current working directory.
COMPONENT_DIR = '../chunk_caption_index_component/'

# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if COMPONENT_DIR not in sys.path:
    sys.path.append(COMPONENT_DIR)
# NOTE(review): the original cell also carried the following path as a bare
# comment — confirm whether it is still needed:
# enhanced_doc_analyzer_component/enhanced_document_analyzer

In [2]:
# Example of registering the component in a workspace
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build a workspace client from the local config file, authenticating with
# the default Azure credential chain.
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

Found the config file in: /config.json


In [3]:
import argparse
import logging
import traceback
from openai import AzureOpenAI
from azureml.rag.utils.connections import get_connection_by_id_v2
from azureml.rag.utils.logging import get_logger, safe_mlflow_start_run, track_activity

from document_processor import DocumentProcessor

In [7]:
import dotenv
import os
import pandas as pd

# Populate os.environ from a local .env file before reading any setting.
dotenv.load_dotenv()

# Connection settings; each lookup yields None when the variable is unset.
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")
azure_search_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.environ.get("AZURE_SEARCH_KEY")

# Two Azure OpenAI clients against the same resource: they share the endpoint
# and key and differ only in the API version they are pinned to.
# NOTE(review): confirm that '2023-03-15-preview' supports the "gpt-4v"
# deployment this client is used with further down.
openai_vision_client = AzureOpenAI(
    api_version='2023-03-15-preview',
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
)
openai_embedding_client = AzureOpenAI(
    api_version='2024-02-01',
    azure_endpoint=azure_openai_endpoint,
    api_key=azure_openai_key,
)


In [5]:
# Initialize the document processor for a single analyzed PDF.
# Fixes relative to the earlier version of this cell:
#   * added the commas that were missing after `input_folder`,
#     `output_folder` and `embd_deployment_name` (the cell did not parse);
#   * pass `openai_embedding_client`, the embedding client created earlier in
#     the notebook, instead of the undefined name `embedding_client`.
processor = DocumentProcessor(
    input_folder="./output-azure/0.55/1-s2.0-S0927796X2030053X-am.pdf",
    output_folder="./output-azure-processed/0.55/1-s2.0-S0927796X2030053X-am.pdf",
    # Vision model: client plus the name of its deployment.
    openai_client=openai_vision_client,
    vision_deployment_name="gpt-4v",
    # Embedding model: client plus the name of its deployment.
    embedding_client=openai_embedding_client,
    embd_deployment_name="text-embedding-ada-002",
    # Azure AI Search target for the index.
    search_endpoint=azure_search_endpoint,
    search_key=azure_search_key,
    search_api_version="2023-11-01",
    index_name="test-index",
    # NOTE(review): unit of this limit (tokens vs. characters) is not visible
    # here — check DocumentProcessor.
    max_chunk_length=512,
)