In [1]:
import sys

# Make the sibling component folder importable from this notebook.
# (Presumably this is where `document_processor` and `retriever_utils` live — confirm.)
# The membership check keeps the cell idempotent: re-running it no longer
# appends another duplicate entry to sys.path each time.
component_dir = '../chunk_caption_index_component/'
if component_dir not in sys.path:
    sys.path.append(component_dir)
# NOTE(review): leftover path from an earlier revision? -> enhanced_doc_analyzer_component/enhanced_document_analyzer

In [2]:
# Connect to the Azure ML workspace (no component is registered in this cell; it only builds the client)
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the workspace client: the workspace is resolved from a config file,
# and authentication goes through DefaultAzureCredential.
workspace_credential = DefaultAzureCredential()
ml_client = MLClient.from_config(credential=workspace_credential)

Found the config file in: /config.json


In [3]:

from openai import AzureOpenAI


from document_processor import DocumentProcessor

In [4]:
import dotenv
import os
import pandas as pd

# Pull settings from a .env file (when present) into the process environment.
dotenv.load_dotenv()

# Azure OpenAI connection settings; each value is None when the variable is unset.
azure_openai_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT")
azure_openai_key = os.environ.get("AZURE_OPENAI_KEY")

# Both clients share the same endpoint and key; only the API version differs.
openai_connection = {
    "azure_endpoint": azure_openai_endpoint,
    "api_key": azure_openai_key,
}
openai_vision_client = AzureOpenAI(api_version='2023-03-15-preview', **openai_connection)
openai_embedding_client = AzureOpenAI(api_version='2024-02-01', **openai_connection)

# Azure AI Search connection settings and the index name used by the processor.
azure_search_endpoint = os.environ.get("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.environ.get("AZURE_SEARCH_KEY")
index_name = "test-index-0"


In [5]:
# Collect every DocumentProcessor argument in one mapping so the configuration
# can be read (and tweaked) separately from the constructor call.
processor_settings = {
    # Where the analysed PDF artefacts are read from / written to.
    "input_folder": "./output-azure/0.55/1-s2.0-S0927796X2030053X-am.pdf",
    "output_folder": "./output-azure-processed/0.55/1-s2.0-S0927796X2030053X-am.pdf",
    # Azure OpenAI clients and the deployment names they should call.
    "openai_client": openai_vision_client,
    "vision_deployment_name": "gpt-4v",
    "embedding_client": openai_embedding_client,
    "embd_deployment_name": "text-embedding-ada-002",
    # Azure AI Search target.
    "search_endpoint": azure_search_endpoint,
    "search_key": azure_search_key,
    "search_api_version": "2023-11-01",
    "index_name": index_name,
    # Chunking limit (unit is defined by DocumentProcessor — TODO confirm tokens vs. characters).
    "max_chunk_length": 512,
}
processor = DocumentProcessor(**processor_settings)

# process() hands back a pair: a summary of the run and the chunk table.
processing_outcome = processor.process()
stats, chunks_df = processing_outcome

In [6]:
# Display the first value returned by processor.process() (the run summary).
stats

{'processed': 311,
 'errors': 0,
 'chunks': 433,
 'timestamp': '2025-02-07T16:05:17.659796'}

In [7]:
# Preview the first five rows of the chunk table (5 is the pandas default).
chunks_df.head(n=5)

Unnamed: 0,chunk_id,pdf_file,page,bounding_box,type,text,image_path,role,confidence,source,embedding,is_caption
0,3af8da08-9953-46af-ae83-1680ddd0c31a,1-s2.0-S0927796X2030053X-am.pdf,1,"(1.49, 1.52, 6.75, 1.73)",text,Polymer Informatics: Current Status and Critic...,,title,,azure_document_intelligence,"[-0.02136324532330036, -0.005665221251547337, ...",False
1,23e4cd6d-e690-4a2e-b622-d0cf44503c3a,1-s2.0-S0927796X2030053X-am.pdf,1,"(1.24, 1.97, 7.02, 2.82)",text,"Lihua Chenª, Ghanshyam Pilaniab, Rohit Batrac,...",,,,azure_document_intelligence,"[-0.0017890825401991606, 0.014668765477836132,...",False
2,de91c4d0-0314-47b1-96d6-f35f27e9fd95,1-s2.0-S0927796X2030053X-am.pdf,1,"(0.87, 3.70, 7.38, 6.17)",text,Artificial intelligence (AI) based approaches ...,,,,azure_document_intelligence,"[-0.014036566950380802, -0.011912076734006405,...",False
3,d9ded88e-e0d7-49f3-98be-53f8b2dfe46c,1-s2.0-S0927796X2030053X-am.pdf,1,"(0.87, 3.70, 7.38, 6.17)",text,"Questions regarding synthesizability, and pote...",,,,azure_document_intelligence,"[-0.02521873265504837, -0.010758204385638237, ...",False
4,4a37b8d6-4823-4032-9d2d-5a9c4e822544,1-s2.0-S0927796X2030053X-am.pdf,1,"(0.87, 3.70, 7.38, 6.17)",text,Other major hurdles for polymer informatics ar...,,,,azure_document_intelligence,"[-0.006296014878898859, 0.002676484640687704, ...",False


In [3]:
# This cell is written to run on a fresh kernel: it imports `os` and loads the
# .env file itself instead of relying on an earlier cell having done so.
import os

import dotenv
import pandas as pd
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

from retriever_utils import ImageCaptionViewer

# Idempotent: by default load_dotenv does not override variables already set.
dotenv.load_dotenv()

azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
azure_search_key = os.getenv("AZURE_SEARCH_KEY")
# NOTE(review): this rebinds index_name to "myindex", while the processing cell
# earlier in the notebook used "test-index-0" — confirm which index is intended.
index_name = "myindex"


# Initialize your search client
search_client = SearchClient(
    endpoint=azure_search_endpoint,
    index_name=index_name,
    credential=AzureKeyCredential(azure_search_key)
)

# Get image chunks: match every document, keep only those whose `type` field
# equals 'image', and return just the listed fields (at most 1000 hits).
results = search_client.search(
    search_text="*",
    filter="type eq 'image'",
    select=["id", "pdf_file", "page", "bounding_box", "text", "image_path",
           "role", "confidence", "source"],
    top=1000
)

# Materialise the hits as plain dicts. `results` is a single-pass pager, so it
# is exhausted after this line — use `chunks` / `df` for any further work.
chunks = [dict(result) for result in results]

# Convert to DataFrame
df = pd.DataFrame(chunks)

# # Create and display the viewer
# viewer = ImageCaptionViewer(df)
# viewer.display()

In [4]:
# Preview the first five image-chunk rows (5 is the pandas default).
df.head(n=5)

In [6]:
# `results` was already consumed when `chunks` was built, and the search pager
# is single-pass, so looping over it again would print nothing. Iterate the
# materialised rows instead.
for chunk in chunks:
    print(chunk)