In [None]:
import os
import sys
import urllib3
from glob import glob
from tqdm.auto import tqdm
from dotenv import load_dotenv

from openai import AzureOpenAI

# Silence urllib3 warnings for the rest of the session.
# NOTE(review): with no arguments this suppresses every urllib3 HTTPWarning,
# which includes InsecureRequestWarning — confirm that is intended.
urllib3.disable_warnings()

# Add the parent directory to the import path so the `src` package below resolves.
# Assumes the notebook's working directory is one level below the project root — TODO confirm.
sys.path.append('..')
from src.dataset.preprocess import split_and_save_pdf, pdf_to_blocks_and_png  # noqa: E402
from src.tools.text_extract import analyze_image_with_blocks, extract_company_name  # noqa: E402

# Load variables from a .env file into the environment; later cells read the
# Azure OpenAI settings and the model name through os.getenv().
load_dotenv()

## PDFのパスを読み込む

In [None]:
# Source PDFs for the validation and test sets. glob() makes no ordering
# promise, so each listing is sorted to keep the processing order stable.
val_pdfs, test_pdfs = (
    sorted(glob(pattern))
    for pattern in (
        "../signate_data/validation/documents/*.pdf",
        '../signate_data/documents/*.pdf',
    )
)

## PDFを分割して再構成
- 中心で分割できそうなスライドは分割し、それぞれを1枚のスライドとして再構成する
- 処理内容については`src/dataset/preprocess.py`を参照

In [None]:
def _split_pdfs(pdf_paths, documents_root):
    """Split every source PDF and save the result under ``documents_root``.

    For a source file whose name starts with ``<id>.`` the output is written to
    ``<documents_root>/<id3>_split/<id3>_split.pdf``, where ``<id3>`` is ``<id>``
    left-padded with zeros to at least three characters.

    Args:
        pdf_paths: Paths of the source PDFs to process.
        documents_root: Directory under which the per-PDF ``*_split`` folders
            are created.
    """
    for pdf in tqdm(pdf_paths):
        # basename() understands the platform's path separators, so this also
        # works for the backslash-separated paths glob() returns on Windows.
        pdf_name = os.path.basename(pdf).split('.')[0].zfill(3)

        output_dir = f"{documents_root}/{pdf_name}_split"
        # exist_ok=True makes re-running the cell safe without a separate check.
        os.makedirs(output_dir, exist_ok=True)

        split_and_save_pdf(pdf, os.path.join(output_dir, f"{pdf_name}_split.pdf"))


_split_pdfs(val_pdfs, "../data/documents")
_split_pdfs(test_pdfs, "../data/test/documents")

## 再構成したPDFを読み込む

In [None]:
# Re-assembled PDFs, one per "<id>_split" folder, listed in sorted order so
# the downstream processing order is stable.
val_split_pdfs, test_split_pdfs = (
    sorted(glob(pattern))
    for pattern in (
        "../data/documents/*_split/*.pdf",
        "../data/test/documents/*_split/*.pdf",
    )
)

## Azure OpenAI Serviceの設定

In [None]:
# Azure OpenAI client. The API version, endpoint and key are all read from
# environment variables; os.getenv() yields None for any that are unset.
client = AzureOpenAI(
    api_version=os.getenv("API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_API_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
)

## テキスト抽出
- `src/dataset/preprocess.py` の `pdf_to_blocks_and_png` により、構造化されたテキスト情報とスライドのPNG画像を得る
- 上記の両データを入力とし、`gpt-4o-mini` を用いてPDFよりテキスト抽出
- プロンプトの詳細などについては `src/tools/text_extract.py` を参照

In [None]:
def _extract_markdowns(split_pdfs, image_root, markdown_dir):
    """Extract text from every split PDF and write one markdown file per PDF.

    Each page is passed to ``analyze_image_with_blocks`` together with its
    block data and image path; the per-page results are concatenated under
    ``## P.<n>`` headings. The output file is named after the value that
    ``extract_company_name`` returns for the first page's section.

    Args:
        split_pdfs: Paths of the re-assembled (``*_split``) PDFs.
        image_root: Directory under which ``<id>_split`` folders are handed to
            ``pdf_to_blocks_and_png`` as its output directory.
        markdown_dir: Directory that receives the ``<company_name>.md`` files.

    Raises:
        ValueError: If a PDF yields no pages, because the company name is
            taken from the first page.
    """
    # The model name does not change between pages, so read it once.
    model = os.getenv("MODEL")

    for spdf in tqdm(split_pdfs, desc="PDF Processing"):
        # "<id>_split.pdf" -> "<id>", zero-padded to three characters.
        # basename() also copes with backslash-separated paths on Windows.
        pdf_name = os.path.basename(spdf).split('.')[0].split('_')[0].zfill(3)
        output_dir = f"{image_root}/{pdf_name}_split"
        page_blocks, image_paths = pdf_to_blocks_and_png(spdf, output_dir)

        sections = []
        for page_no, (page_block, image_path) in enumerate(zip(page_blocks, image_paths), start=1):
            extracted_text = analyze_image_with_blocks(
                client, image_path, page_block, model
            )
            sections.append(f"## P.{page_no}\n\n{extracted_text}\n\n")

        if not sections:
            # Fail with a message that names the offending file instead of a
            # bare IndexError from sections[0].
            raise ValueError(f"No pages were extracted from {spdf}; cannot determine the company name.")

        # NOTE(review): the result is used directly as a file name; two PDFs
        # that resolve to the same name would overwrite each other — confirm.
        company_name = extract_company_name(sections[0])

        os.makedirs(markdown_dir, exist_ok=True)
        with open(os.path.join(markdown_dir, f"{company_name}.md"), 'w', encoding='utf-8') as f:
            f.write("".join(sections))


_extract_markdowns(val_split_pdfs, "../data/documents", "../data/documents/markdowns")

# NOTE(review): the test split PDFs are read from ../data/test/documents/, but
# their output directories and markdown files live under ../data/documents/test/.
# The paths are kept unchanged here; confirm the asymmetry is intended.
_extract_markdowns(test_split_pdfs, "../data/documents/test", "../data/documents/test/markdowns")