In [1]:
import os
from dataclasses import dataclass

from IPython.display import Markdown
from dotenv import load_dotenv
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
    ContentFormat,
    AnalyzeDocumentRequest,
    DocumentAnalysisFeature,
)
# Load variables from a .env file (when python-dotenv finds one) BEFORE they are
# read with os.getenv below; otherwise the endpoint/key are None on a fresh kernel
# unless they were exported in the shell. With its default settings load_dotenv
# does not override variables that are already present in the environment.
load_dotenv()

# NOTE(review): this flag is not referenced by any cell visible in this notebook.
USE_DOC_INTEL_PREVIEW_VERSION = True
DOC_INTEL_MODEL_ID = "prebuilt-layout" # E.g. "prebuilt-read", "prebuilt-layout", or "prebuilt-document"

# Possible Document Intelligence features
# v4.0 (Preview): ['ocrHighResolution', 'languages', 'barcodes', 'formulas', 'styleFont', 'keyValuePairs', 'queryFields']
# v3.3 (GA):      ['ocrHighResolution', 'languages', 'barcodes', 'formulas', 'styleFont']
DOC_INTEL_FEATURES = ['ocrHighResolution', 'languages', 'styleFont']

# Service endpoint and API key, taken from the environment (None when unset).
DOC_INTEL_ENDPOINT = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
DOC_INTEL_API_KEY = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")

In [2]:
# Fail fast with an explicit message when the endpoint or key is not configured,
# rather than letting a None value reach the SDK and fail less clearly there.
_missing_settings = [
    env_name
    for env_name, env_value in (
        ("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT", DOC_INTEL_ENDPOINT),
        ("AZURE_DOCUMENT_INTELLIGENCE_API_KEY", DOC_INTEL_API_KEY),
    )
    if not env_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

# Key-authenticated Document Intelligence client. The API version is pinned here
# and does not depend on USE_DOC_INTEL_PREVIEW_VERSION.
di_client = DocumentIntelligenceClient(
    endpoint=DOC_INTEL_ENDPOINT,
    credential=AzureKeyCredential(DOC_INTEL_API_KEY),
    api_version="2024-07-31-preview",
)

# Convert the configured feature names into the SDK's DocumentAnalysisFeature values.
enabled_features = [DocumentAnalysisFeature(feature) for feature in DOC_INTEL_FEATURES]



In [3]:
# PDF to analyze. The path is relative, so it depends on the kernel's working directory.
pdf_path = "../raw_documents/pdf/oral_cancer_text_5th_table&image.pdf"

In [4]:
import base64

def convert_pdf_to_base64(pdf_path: str) -> str:
    """Return the contents of the file at ``pdf_path`` as base64 text.

    The file is opened in binary mode and read in full. The base64-encoded
    bytes are decoded to ``str`` before being returned.
    """
    with open(pdf_path, "rb") as pdf_file:
        raw_bytes = pdf_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()
# Wrap the base64 text of the PDF in the request model sent to the service.
analyze_request = AnalyzeDocumentRequest(
    bytes_source=convert_pdf_to_base64(pdf_path),
)

# Start the analysis with the configured model, requesting Markdown content
# and the add-on features enabled earlier in the notebook.
poller = di_client.begin_analyze_document(
    model_id=DOC_INTEL_MODEL_ID,
    analyze_request=analyze_request,
    output_content_format=ContentFormat.MARKDOWN,
    features=enabled_features,
)

# Fetch the operation's result; later cells post-process it.
analyzedDocumentResult = poller.result()

In [5]:
import sys
import os
from dotenv import load_dotenv

# Try the .env file at the original absolute location first. load_dotenv returns
# a falsy value when nothing was loaded from it (e.g. the file does not exist on
# this machine); in that case fall back to python-dotenv's default lookup so the
# cell is not tied to a single machine's directory layout.
if not load_dotenv("/home/azureuser/slm-fine-tune-private-domain-kb-generation/.env"):
    load_dotenv()

# Make the parent directory importable (presumably where the docProcess package
# lives -- confirm). Guarded so re-running the cell does not append duplicates.
_parent_dir = os.path.abspath(os.path.join('..'))
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

# Setting passed to the figure processor configured later; None when unset.
markdown_img_tag_path_or_url = os.getenv("MARKDOWN_IMG_TAG_PATH_OR_URL")

In [6]:
from docProcess.customizedProcess.markdownImageTagFigureProcessor import MarkdownImageTagDocumentFigureProcessor
from docProcess.azureDocIntelligResultPostProcessor import DocumentIntelligenceResultPostProcessor
from docProcess.fileTools import extract_pdf_page_images, load_pymupdf_pdf

# Figure processor: caption/content text templates plus the image-tag path or URL
# read from the environment in the previous cell.
figure_processor = MarkdownImageTagDocumentFigureProcessor(
    markdown_img_tag_path_or_url=markdown_img_tag_path_or_url,
    output_figure_img=True,
    before_figure_text_formats=["*Figure Caption:* {caption}"],
    figure_img_text_format="*Figure Content:*\n{content}",
    after_figure_text_formats=None,
)

# Post processor for the Document Intelligence result, wired to the figure processor above.
doc_intel_result_processor = DocumentIntelligenceResultPostProcessor(figure_processor=figure_processor)

pdf = load_pymupdf_pdf(pdf_path=pdf_path, pdf_url=None)
doc_page_imgs = extract_pdf_page_images(pdf, img_dpi=100, starting_idx=1)

processed_content_docs = await doc_intel_result_processor.process_analyze_result(
        analyzedDocumentResult,
        doc_page_imgs=doc_page_imgs
    )


Error processing element /figures/0 (start_page_number: 1).
Exception: 'async_generator' object is not iterable
Element info: ElementInfo(element_id='/figures/0', element={'id': '1.1', 'boundingRegions': [{'pageNumber': 1, 'polygon': [4.1959, 1.4486, 7.3516, 1.4488, 7.3514, 5.1379, 4.1957, 5.1377]}], 'spans': [{'offset': 962, 'length': 202}], 'elements': ['/paragraphs/12', '/paragraphs/13', '/paragraphs/14', '/paragraphs/15', '/paragraphs/16', '/paragraphs/17', '/paragraphs/18', '/paragraphs/19', '/paragraphs/20', '/paragraphs/21', '/paragraphs/22', '/paragraphs/23'], 'caption': {'content': 'Fig. 5.1 Schematic indicating the location of the lymph node levels in the neck as described in Table 5.1', 'boundingRegions': [{'pageNumber': 1, 'polygon': [4.1896, 5.2764, 7.332, 5.2758, 7.332, 5.5387, 4.1896, 5.5393]}], 'spans': [{'offset': 983, 'length': 105}], 'elements': ['/paragraphs/24']}}, full_span_bounds=SpanBounds(offset=962, end=1164), spans=[{'offset': 962, 'length': 202}], start_page

In [None]:
from docProcess.docIntelligElementTools import convert_processed_di_docs_to_markdown

processed_content_md = await convert_processed_di_docs_to_markdown(processed_content_docs, default_text_merge_separator="\n")

Markdown(processed_content_md)