In [1]:
# 라이브러리 임포트
import os
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import numpy as np
import torch
from PIL import Image, ImageDraw
import matplotlib.pyplot as plt

# PDF 처리
from pdf2image import convert_from_path

# DeepSeek-OCR
from transformers import AutoModel, AutoTokenizer

# RAG 시스템
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
import faiss
from rank_bm25 import BM25Okapi
from sentence_transformers import CrossEncoder

from openai import OpenAI

# 환경 변수
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import tempfile, shutil

# Scratch directory handed to model.infer() as output_path by the OCR helper functions.
# NOTE(review): nothing in the visible cells removes this directory again — confirm cleanup happens elsewhere.
tmpdir = tempfile.mkdtemp(prefix="deepseekocr_")

# Load environment variables (API keys) from the .env file one directory up
load_dotenv(dotenv_path="../.env")
print("라이브러리 임포트 완료")

라이브러리 임포트 완료


In [2]:
# Working directories, one per pipeline stage
ORIGIN_DIR = Path("./origin")        # input PDF papers
IMAGES_DIR = Path("./images")        # page images rendered from the PDFs
EXTRACTED_DIR = Path("./extracted")  # cropped images extracted from the pages
DATA_DIR = Path("./data")            # metadata and descriptions

# Create whichever directories are missing and report each one that gets created
for directory in (ORIGIN_DIR, IMAGES_DIR, EXTRACTED_DIR, DATA_DIR):
    if directory.exists():
        continue
    directory.mkdir(exist_ok=True)
    print(f"폴더 생성: {directory}")

In [3]:
def pdf_to_high_res_images(pdf_path: str, output_dir: Path, dpi: int = 300) -> List[Path]:
    """
    Render every page of a PDF to a PNG file.

    Args:
        pdf_path: Location of the PDF (a str or a Path; it is wrapped in Path
            to derive the file stem used in the output names).
        output_dir: Directory that receives the PNG files. It is created,
            including missing parents, if it does not exist yet.
        dpi: Resolution passed to convert_from_path (default 300).

    Returns:
        Paths of the saved page images in page order, named
        "<pdf stem>_page_<NNN>.png" with 1-based, zero-padded page numbers.
    """
    print(f"\nPDF 변환 중: {pdf_path}")
    print(f"해상도: {dpi} DPI")

    pdf_name = Path(pdf_path).stem

    # Do not rely on an earlier cell having created the folder
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    images = convert_from_path(pdf_path, dpi=dpi)
    total = len(images)

    image_paths = []
    for i, image in enumerate(images, 1):
        image_path = output_dir / f"{pdf_name}_page_{i:03d}.png"
        image.save(image_path, "PNG")
        image_paths.append(image_path)
        print(f"페이지 {i}/{total} 저장: {image_path.name}")

    print(f"\n변환 완료: {total}개 페이지")
    return image_paths

In [4]:
# Collect the PDFs to process. Sorted so that runs are reproducible:
# the order in which glob yields entries depends on the filesystem.
pdf_files = sorted(ORIGIN_DIR.glob("*.pdf"))
print(f"발견된 PDF 파일: {len(pdf_files)}개")
for pdf_file in pdf_files:
    print(f"  - {pdf_file.name}")

# Render every PDF to page images, keyed by the PDF's file stem
all_page_images = {}
for pdf_file in pdf_files:
    image_paths = pdf_to_high_res_images(pdf_file, IMAGES_DIR, dpi=300)
    all_page_images[pdf_file.stem] = image_paths

발견된 PDF 파일: 1개
  - LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval.pdf

PDF 변환 중: origin/LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval.pdf
해상도: 300 DPI
페이지 1/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_001.png
페이지 2/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_002.png
페이지 3/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003.png
페이지 4/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_004.png
페이지 5/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_005.png
페이지 6/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_006.png
페이지 7/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_007.png
페이지 8/8 저장: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_008.png

변

In [5]:
# OpenAI API client; it is passed to parse_caption by the extraction loop.
# NOTE(review): no key is passed explicitly — presumably it is read from the environment populated by load_dotenv; confirm.
client = OpenAI()

In [6]:
# Model identifier used for both the tokenizer and the model from_pretrained calls
model_name = 'deepseek-ai/DeepSeek-OCR'
print(f"\nDeepSeek-OCR 모델 로딩: {model_name}")


DeepSeek-OCR 모델 로딩: deepseek-ai/DeepSeek-OCR


In [7]:
# Load the tokenizer that belongs to model_name.
# NOTE(review): trust_remote_code=True presumably lets code shipped with the model repository run locally;
# the loader's own output recommends pinning a revision — consider doing so.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

In [8]:
# Pick the attention backend: flash-attn when it can be imported, the eager implementation otherwise
try:
    import flash_attn
except ImportError:
    attn_impl = "eager"
    print("기본 attention 사용")
else:
    attn_impl = "flash_attention_2"
    print("flash-attn 사용")

# Keyword arguments for from_pretrained; the dtype/device entries depend on CUDA availability
model_kwargs = dict(trust_remote_code=True, attn_implementation=attn_impl)

if not torch.cuda.is_available():
    print("CPU 모드 (느림)")
    model_kwargs["device_map"] = "cpu"
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # bfloat16 weights, pinned explicitly to the first GPU only
    model_kwargs.update(torch_dtype=torch.bfloat16, device_map="cuda:0")

# Load the model and switch it to evaluation mode
model = AutoModel.from_pretrained(model_name, **model_kwargs).eval()
print("모델 로딩 완료\n")

flash-attn 사용
GPU: NVIDIA GeForce RTX 3090


config.json: 0.00B [00:00, ?B/s]

modeling_deepseekocr.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-OCR:
- modeling_deepseekocr.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
You are using a model of type deepseek_vl_v2 to instantiate a model of type DeepseekOCR. This is not supported for all configurations of models and can yield errors.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.67G [00:00<?, ?B/s]

Some weights of DeepseekOCRForCausalLM were not initialized from the model checkpoint at deepseek-ai/DeepSeek-OCR and are newly initialized: ['model.vision_model.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


모델 로딩 완료



In [9]:
def locate_figures_with_grounding(tokenizer, model, image_path: Path) -> str:
    """
    Run the model with a grounding prompt that asks for all figures, tables and charts on a page.

    Args:
        tokenizer: Passed through to model.infer unchanged.
        model: Object exposing infer(); it is called with eval_mode=True and
            save_results=False so that the text comes back as the return value.
        image_path: Page image to analyse (a Path or a str; converted with str()).

    Returns:
        The text returned by model.infer with surrounding whitespace removed.
        An empty string is returned when infer yields None or when any
        exception is raised (the error is printed, not propagated).
    """
    prompt = "<image>\n<|grounding|>Identify and locate all figures, tables, and charts in this page. For each, provide the bounding box coordinates and the associated caption."
    print(f"Inside locate_figures_with_grounding - Processing image: {str(image_path)}")

    try:
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=str(image_path),
            output_path=tmpdir,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            eval_mode=True,
            save_results=False  # use the return value instead of writing result files
        )

        if result is None:
            print("경고: infer가 None 반환 - 빈 결과 반환")
            return ""

        return result.strip()

    except Exception as e:
        # Best effort: a page that fails must not abort the whole run
        print(f"추출 오류: {e}")
        return ""

In [10]:
def free_ocr(tokenizer, model, image_path: Path) -> str:
    """
    Plain OCR of an image using the "Free OCR." prompt.

    Returns:
        The text returned by model.infer with surrounding whitespace removed,
        or "" when infer returns None or any exception is raised.
    """
    try:
        infer_options = dict(
            prompt="<image>\nFree OCR.",
            image_file=str(image_path),
            output_path=tmpdir,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            eval_mode=True,
            save_results=False,  # use the return value instead of writing result files
        )
        text = model.infer(tokenizer, **infer_options)

        if text is not None:
            return text.strip()

        print("경고: infer가 None 반환 - 빈 결과 반환")
    except Exception as err:
        print(f"OCR 오류: {err}")

    # Reached for a None result and for any failure above
    return ""

In [11]:
def normalize_html_quotes(html: str) -> str:
    """
    Re-serialise an HTML fragment and swap double quotes for single quotes.

    The fragment is parsed with BeautifulSoup's "html.parser" and decoded with
    formatter="minimal". Every '"' character in the serialised string is then
    replaced by "'" — this is a plain string replacement, so it is not limited
    to the quotes around attribute values.
    """
    parsed = BeautifulSoup(html, "html.parser")
    # decode() rather than prettify(), which would add indentation
    serialised = parsed.decode(formatter="minimal")
    return serialised.replace('"', "'")

In [12]:
def extract_table(tokenizer, model, image_path: Path) -> str:
    """
    Extract the first HTML table from an image.

    The model is asked to convert the image to markdown; the first
    "<table>...</table>" span of its output is then passed through
    normalize_html_quotes.

    Args:
        tokenizer: Passed through to model.infer unchanged.
        model: Object exposing infer().
        image_path: Image to process (a Path or a str; converted with str()).

    Returns:
        The normalised table markup, or "" when infer returns None, when the
        output contains no "<table>...</table>" span, or when any exception is
        raised (the error is printed, not propagated).
    """
    prompt = "<image>\n<|grounding|>Convert the document to markdown."

    try:
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=str(image_path),
            output_path=tmpdir,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            eval_mode=True,
            save_results=False  # use the return value instead of writing result files
        )

        # Check for None before touching the value; otherwise the attribute access
        # fails and the case is reported as a generic OCR error.
        if result is None:
            print("경고: infer가 None 반환 - 빈 결과 반환")
            return ""

        # Non-greedy so that only the first table is taken; DOTALL because tables span lines
        table_match = re.search(r'<table>.*?</table>', result, re.DOTALL)
        if table_match is None:
            print("경고: <table> 태그를 찾지 못함 - 빈 결과 반환")
            return ""

        return normalize_html_quotes(table_match.group(0))
    except Exception as e:
        print(f"OCR 오류: {e}")
        return ""

In [13]:
def generate_detailed_description(tokenizer, model, image_path: Path) -> str:
    """
    Ask the model for a detailed description of an image.

    Args:
        tokenizer: Passed through to model.infer unchanged.
        model: Object exposing infer().
        image_path: Image to describe (a Path or a str; converted with str()).

    Returns:
        The description with "<|...|>" markers removed and surrounding
        whitespace stripped. An empty string is returned when infer yields
        None or when any exception is raised; the error is printed instead of
        being returned, so an error message never ends up stored as a description.
    """
    prompt = "<image>\nDescribe this image in detail."

    try:
        result = model.infer(
            tokenizer,
            prompt=prompt,
            image_file=str(image_path),
            output_path=tmpdir,
            base_size=1024,
            image_size=640,
            crop_mode=True,
            eval_mode=True,
            save_results=False
        )

        if result is None:
            print("경고: infer가 None 반환 - 빈 결과 반환")
            return ""

        # Strip grounding markers. One pattern covers opening and closing markers alike:
        # "/" is not "|", so "<|/ref|>" is matched just like "<|ref|>".
        result = re.sub(r'<\|[^|]+\|>', '', result)

        return result.strip()

    except Exception as e:
        print(f"생성 오류: {e}")
        return ""

In [14]:
def crop_and_save_figure(image_path: Path, bbox: List[float], save_path: Path) -> Path:
    """
    Crop a bounding-box region out of an image and save it as PNG.

    Args:
        image_path: Source image.
        bbox: [x1, y1, x2, y2]. When every value lies in [0, 1] the values are
            treated as fractions of the image width/height; otherwise they are
            truncated to int and used as pixel coordinates.
        save_path: Destination file for the cropped region.

    Returns:
        save_path, after the cropped image has been written.
    """
    # The context manager closes the source image once the crop has been saved
    with Image.open(image_path) as image:
        width, height = image.size

        if all(0 <= coord <= 1 for coord in bbox):
            # Fractional coordinates -> pixels
            x1 = int(bbox[0] * width)
            y1 = int(bbox[1] * height)
            x2 = int(bbox[2] * width)
            y2 = int(bbox[3] * height)
        else:
            x1, y1, x2, y2 = [int(c) for c in bbox]

        cropped = image.crop((x1, y1, x2, y2))
        cropped.save(save_path, "PNG")

    print(f"    - 크롭 저장 완료: {save_path}")
    return save_path

In [18]:
def parse_grounding_result(text: str) -> List[Dict]:
    """
    Parse grounding output into figure/table entries with their caption boxes.

    Recognised markup (object followed by its caption, or caption followed by
    an optional caption line, a blank line and the object):
        <|ref|>image<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
        <|ref|>image_caption<|/ref|><|det|>[[x1, y1, x2, y2]]<|/det|>
        Caption text here...

    Args:
        text: Raw grounding output.

    Returns:
        One dict per matched object/caption pair with the keys
        "type" ("image" or "table"), "number" (1-based index of the regex
        match), "bbox" and "caption_bbox" (four floats each). A box containing
        any value above 1 is treated as being on a 0-1000 scale and divided by
        1000; a box whose values are all within [0, 1] is kept as is.
        Pairs whose coordinates cannot be parsed, or that do not have exactly
        four values, are skipped. Empty or non-string input yields [].

    Raises:
        ValueError: If a caption's kind does not match the kind of the object
            it is paired with (e.g. an image followed by a table caption).
    """
    if not text or not isinstance(text, str):
        # Callers iterate over the result, so "nothing found" must be a list too
        return []

    image_pattern = re.compile(
        r"""
        (?:
            # 패턴 1: figure/table -> caption
            <\|ref\|>(?P<kind1>image|table)<\|/ref\|>\s*
            <\|det\|>\s*\[\[(?P<box1>[^]]+)\]\]\s*<\|/det\|>\s*
            <\|ref\|>(?P<cap_kind1>image_caption|table_caption)<\|/ref\|>\s*
            <\|det\|>\s*\[\[(?P<cap_box1>[^]]+)\]\]\s*<\|/det\|>
        |
            # 패턴 2: caption -> figure/table
            <\|ref\|>(?P<cap_kind2>image_caption|table_caption)<\|/ref\|>\s*
            <\|det\|>\s*\[\[(?P<cap_box2>[^]]+)\]\]\s*<\|/det\|>\s*
            (?:[^\n<]+\n)?                  # 선택적: caption 텍스트 한 줄
            \s*\n                           # 빈 줄
            <\|ref\|>(?P<kind2>image|table)<\|/ref\|>\s*
            <\|det\|>\s*\[\[(?P<box2>[^]]+)\]\]\s*<\|/det\|>
        )
        """,
        re.IGNORECASE | re.DOTALL | re.VERBOSE
    )

    def _to_unit_scale(values: List[float]) -> List[float]:
        # A single value above 1 is enough to tell the box is not fractional yet.
        # Requiring *all* values to be >= 1 would leave boxes that touch the
        # page edge (a 0 coordinate) un-normalised.
        if any(v > 1 for v in values):
            return [v / 1000.0 for v in values]
        return values

    figures = []

    for i, match in enumerate(image_pattern.finditer(text), start=1):
        # Exactly one alternative of the pattern matched; select its groups
        suffix = '1' if match.group('kind1') else '2'
        kind = match.group('kind' + suffix).lower()
        bbox = match.group('box' + suffix)
        caption_kind = match.group('cap_kind' + suffix).lower()
        caption_bbox = match.group('cap_box' + suffix)

        if caption_kind != f"{kind}_caption":
            raise ValueError("Mismatch between figure/table and caption types.")

        try:
            # "506, 317, 914, 630" -> [506.0, 317.0, 914.0, 630.0]
            coords = [float(x.strip()) for x in bbox.split(',')]
            caption_coords = [float(x.strip()) for x in caption_bbox.split(',')]
        except ValueError as e:
            print(f"좌표 파싱 오류: {e}")
            continue

        if len(coords) != 4 or len(caption_coords) != 4:
            continue

        figures.append({
            "type": kind,
            "number": i,
            "bbox": _to_unit_scale(coords),
            "caption_bbox": _to_unit_scale(caption_coords)
        })

    return figures

In [19]:
def parse_caption(client, caption: str) -> Dict[str, str]:
    """
    Split a caption into its label type, label number and remaining text using a chat model.

    Args:
        client: Object exposing chat.completions.create (the OpenAI client).
        caption: Caption text, e.g. "Fig. 1: Overview of the pipeline."

    Returns:
        A dict with string values under the keys "type", "number" and "text".
        "type" is lower-cased. When the caption is blank (no request is sent),
        the request fails, or the reply is not a JSON object, the fallback
        {"type": "unknown", "number": "", "text": <caption>} is returned, so
        callers can always index the result by these three keys.
    """
    fallback = {"type": "unknown", "number": "", "text": caption or ""}

    if not caption or not caption.strip():
        return fallback

    # The prompt is inserted as a value below (never passed through str.format),
    # so the JSON examples must use single braces.
    prompt = """You are an expert in scientific document parsing.

Your task is to extract structured information from figure, table, or equation captions.

Each caption may begin with labels such as:

- "Figure 1", "Fig. 2", "FIGURE 3", "Fig. 1a"
- "Table I", "TABLE 3", "Tbl. 2"
- "Eq. (5)", "Equation 4", "Algorithm 2", "Listing 1"

Your goal is to **separate** the **type**, **number**, and **text** clearly.

Always output **valid JSON** in the form:

```json
{
  "type": "<one of: figure | table | equation | algorithm | listing | appendix | theorem>",
  "number": "<string — the normalized number or ID (e.g., '1', 'IV', '3b', 'B', '(5)')>",
  "text": "<the remaining caption text after the label>"
}
```

---

### Rules:

1. **Type recognition**
    - Case-insensitive.
    - Map variations to canonical types:
        - “Fig.”, “Figure”, “FIGURE” → `"figure"`
        - “Table”, “TABLE”, “Tbl.” → `"table"`
        - “Eq.”, “Equation”, “Eqs.” → `"equation"`
        - “Algorithm”, “Alg.” → `"algorithm"`
        - “Listing”, “Code”, “Example” → `"listing"`
        - “Appendix” → `"appendix"`
        - “Theorem”, “Lemma”, “Proposition” → `"theorem"`
2. **Number normalization**
    - Keep the numeric or letter identifier only (e.g., “1”, “1a”, “IV”, “B”, “(5)”).
    - Preserve parentheses if they are part of equation format.
3. **Text extraction**
    - Remove the label and delimiter (like “:”, “.”, “–”, “—”) that follows it.
    - Preserve all descriptive text exactly as written.
4. **If no recognizable label appears**, output `"type": "unknown"`, `"number": ""`, and treat the full input as `"text"`.

---

### Examples:

**Input:**

`Fig. 1: A sample response generated by the conventional RAG pipeline (Vanilla RAG) and our proposed pipeline (Advanced RAG)...`

**Output:**

```json
{
  "type": "figure",
  "number": "1",
  "text": "A sample response generated by the conventional RAG pipeline (Vanilla RAG) and our proposed pipeline (Advanced RAG) for a given user query. Our proposed advanced RAG pipeline improves the response by retrieving an additional relevant text chunk for the LLM while eliminating an irrelevant one retrieved by the Vanilla RAG pipeline. The orange symbols indicate the English translation of the Bangla texts."
}
```

**Input:**

`TABLE IV — Results of ablation experiments.`

**Output:**

```json
{
  "type": "table",
  "number": "IV",
  "text": "Results of ablation experiments."
}
```

**Input:**

`Algorithm 3: Pseudo-code for iterative training loop.`

**Output:**

```json
{
  "type": "algorithm",
  "number": "3",
  "text": "Pseudo-code for iterative training loop."
}
```

**Input:**

`Eq. (5): The loss function is defined as follows.`

**Output:**

```json
{
  "type": "equation",
  "number": "(5)",
  "text": "The loss function is defined as follows."
}
```

Return only a valid JSON object with exactly these three keys and no extra text."""

    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",  # gpt-3.5-turbo
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": "You are the best text parser for scientific documents."},
                {"role": "user", "content": f"{prompt}\nCaption: {caption}"}
            ]
        )
        # Decoding stays inside the try so that a malformed reply is handled like a failed request
        parsed = json.loads(response.choices[0].message.content)
    except Exception as e:
        print(f"캡션 파싱 오류: {e}")
        return fallback

    if not isinstance(parsed, dict):
        print(f"캡션 파싱 오류: JSON 객체가 아님 - {parsed!r}")
        return fallback

    def _as_text(value, default: str) -> str:
        # The model may answer with a bare number or null; callers expect strings
        return default if value is None else str(value)

    return {
        "type": _as_text(parsed.get("type"), "unknown").strip().lower() or "unknown",
        "number": _as_text(parsed.get("number"), ""),
        "text": _as_text(parsed.get("text"), caption),
    }

In [20]:
# Extract figures/tables (crop, caption, description) from the rendered page images

# 1-based page numbers to process; set to None to process every page.
# Kept at {3} so that this cell does the same work as the earlier single-page debug run.
PAGES_TO_PROCESS = {3}

all_figures_data = []

for pdf_name, image_paths in all_page_images.items():
    print(f"\n{'='*80}")
    print(f"처리 중: {pdf_name}")
    print(f"{'='*80}")

    for page_idx, image_path in enumerate(image_paths, 1):
        if PAGES_TO_PROCESS is not None and page_idx not in PAGES_TO_PROCESS:
            continue
        print(f"\n페이지 {page_idx}/{len(image_paths)}: {image_path.name}")
        print(str(image_path))

        # Detect figures/tables and their captions on the page
        result = locate_figures_with_grounding(tokenizer, model, str(image_path))
        print()
        print(f"OCR Result\n{result}")

        figures = parse_grounding_result(result)
        if not isinstance(figures, list):
            # Only a list of figure dicts can be iterated below; anything else means "nothing found"
            figures = []
        print(f"발견된 figure/table: {len(figures)}개")

        extracted_page_dir = EXTRACTED_DIR / image_path.stem
        extracted_page_dir.mkdir(exist_ok=True, parents=True)

        for figure_dict in figures:
            print(f"  - {figure_dict['number']}th {figure_dict['type']}")

            figure_crop_path = extracted_page_dir / f"{figure_dict['type']}_{str(figure_dict['number']).zfill(3)}.png"
            caption_crop_path = extracted_page_dir / f"{figure_dict['type']}_caption_{str(figure_dict['number']).zfill(3)}.png"

            crop_and_save_figure(image_path, figure_dict['bbox'], figure_crop_path)
            crop_and_save_figure(image_path, figure_dict['caption_bbox'], caption_crop_path)

            caption = free_ocr(tokenizer, model, caption_crop_path)
            print(f"    - caption 추출 완료")

            caption_dict = parse_caption(client, caption)
            if not isinstance(caption_dict, dict):
                # A failed caption parse must not abort the run; fall back to the raw caption text
                print(f"    - caption 파싱 실패: {caption_dict}")
                caption_dict = {}
            caption_type = str(caption_dict.get('type', 'unknown'))
            caption_number = str(caption_dict.get('number', ''))
            caption_text = caption_dict.get('text', caption)
            print(f"    - caption 파싱 완료: {caption_type} {caption_number}")

            # Tables are stored as extracted HTML only when detector and caption agree;
            # everything else gets a free-text description.
            if figure_dict['type'] == 'table' and caption_type == 'table':
                figure_description = extract_table(tokenizer, model, figure_crop_path)
                print(f"    - table 추출 완료")
            else:
                figure_description = generate_detailed_description(tokenizer, model, figure_crop_path)
                print(f"    - figure description 생성 완료")

            figure_data = {
                'pdf_name': pdf_name,
                'page': page_idx,
                'type': caption_type,
                'recognized_type': figure_dict['type'],
                'number': caption_number,
                'name': f"{caption_type.capitalize()} {caption_number}",
                'caption': caption_text,
                'description': figure_description,
                'bbox': figure_dict['bbox'],
                'caption_bbox': figure_dict['caption_bbox'],
                'figure_path': str(figure_crop_path),
                'caption_path': str(caption_crop_path),
                'source_image': str(image_path)
            }

            all_figures_data.append(figure_data)

# Persist the collected metadata as JSON
figure_data_path = DATA_DIR / "figures_data.json"
with open(figure_data_path, 'w', encoding='utf-8') as f:
    json.dump(all_figures_data, f, ensure_ascii=False, indent=2)

print(f"\n\n총 {len(all_figures_data)}개 figure/table 추출 완료")
print(f"메타데이터 저장: {figure_data_path}")


처리 중: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval

페이지 3/8: LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003.png
images/LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003.png
Inside locate_figures_with_grounding - Processing image: images/LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003.png


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([6, 100, 1280])


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.



OCR Result
<|ref|>title<|/ref|><|det|>[[210, 63, 355, 75]]<|/det|>
# III. METHODOLOGY

<|ref|>title<|/ref|><|det|>[[77, 83, 245, 94]]<|/det|>
# A. Problem Formulation

<|ref|>text<|/ref|><|det|>[[77, 102, 491, 370]]<|/det|>
Given a collection of \(n\)  government gazettes, we con-catenate them to form a single source document, denoted as \(D\) . This document is then divided into \(m\)  smaller text chunks, represented as \(C=\{C_{1},C_{2},\ldots C_{m}\}\) , to facilitate efficient retrieval. Each chunk is processed through an em-bedding model, which generates corresponding embeddings \(E=\{E_{1},E_{2}\ldots E_{m}\}\) . These embeddings are stored in a vector database to enable efficient similarity-based retrieval.During retrieval, relevant text chunks from C are retrieved by computing the similarity between the stored embeddings E and the embedding of the user query, denoted as \(E_{Q}\) . In the conventional vanilla RAG pipeline, the retrieved text chunks are directly passed to a ge

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


    - caption 파싱 완료: table I
BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([3, 100, 1280])
    - table 추출 완료
  - 2th image
    - 크롭 저장 완료: extracted/LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003/image_002.png
    - 크롭 저장 완료: extracted/LegalRAG-A_Hybrid_RAG_System_for_Multilingual_Legal_Information_Retrieval_page_003/image_caption_002.png


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([8, 100, 1280])
    - caption 추출 완료


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


    - caption 파싱 완료: figure 2
BASE:  torch.Size([1, 256, 1280])
PATCHES:  torch.Size([2, 100, 1280])
    - figure description 생성 완료


총 2개 figure/table 추출 완료
메타데이터 저장: data/figures_data.json
