In [None]:
# for multivector retriever

from typing import List, Dict
from dataclasses import dataclass
import json
from langchain_core.documents import Document


def extract_documents_for_docstore(json_data: Dict) -> List[Document]:
    """Build one full-text ``Document`` per page for the docstore.

    For every page, the page's elements are ordered by their ``id`` and
    their ``content`` values are joined with newlines to form the page text.

    Args:
        json_data: Parsed document JSON. Reads ``page_elements``,
            ``page_metadata``, ``title`` and, when present, ``images``.

    Returns:
        One ``Document`` per entry of ``json_data["page_elements"]``, in
        iteration order.

    Raises:
        KeyError: If ``page_elements``, ``page_metadata``, ``title``, a
            page's metadata entry, or an element's ``id``/``content`` is
            missing.
        ValueError: If a page key cannot be converted with ``int``.
    """
    # A missing "images" section and a page without an entry now both yield
    # an empty list, so the "images" metadata value has one fallback type.
    images_by_page = json_data.get("images", {})

    documents = []
    for page_num, page_data in json_data["page_elements"].items():
        elements = sorted(page_data["elements"], key=lambda x: x["id"])
        page_content = "\n".join(element["content"] for element in elements)
        page_metadata = json_data["page_metadata"][str(page_num)]
        metadata = {
            "id": page_num,
            "page": int(page_num),
            "images": images_by_page.get(page_num, []),
            "doc_id": f"{json_data['title']}_{page_num}",
            # Spread last: keys present in the page's own metadata override
            # the ones set above.
            **page_metadata,
        }
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents


def extract_documents_for_vectorstore(json_data: Dict) -> List[Document]:
    """Build one summary ``Document`` per page for the vector store.

    The page text is the page's text summary, followed by a newline and the
    image summary when ``image_summary`` has an entry under the same key.

    Args:
        json_data: Parsed document JSON. Reads ``text_summary``,
            ``page_metadata``, ``title`` and, when present,
            ``image_summary`` and ``images``.

    Returns:
        One ``Document`` per entry of ``json_data["text_summary"]``, in
        iteration order.

    Raises:
        KeyError: If ``text_summary``, ``page_metadata``, ``title`` or a
            page's metadata entry is missing.
        ValueError: If a page key cannot be converted with ``int``.
    """
    # Image summaries are already optional per page, so a JSON without the
    # section at all is treated the same as one with no matching entries.
    # NOTE(review): the lookup here is by page key, while
    # extract_documents_for_single_store looks image_summary up by element
    # id -- confirm which keying the JSON actually uses.
    image_summaries = json_data.get("image_summary", {})
    # A missing "images" section and a page without an entry now both yield
    # an empty list, so the "images" metadata value has one fallback type.
    images_by_page = json_data.get("images", {})

    documents = []
    for page_num, summary in json_data["text_summary"].items():
        page_content = summary
        if page_num in image_summaries:
            page_content += "\n" + image_summaries[page_num]
        page_metadata = json_data["page_metadata"][str(page_num)]
        metadata = {
            "page": int(page_num),
            "images": images_by_page.get(page_num, []),
            "doc_id": f"{json_data['title']}_{page_num}",
            # Spread last: keys present in the page's own metadata override
            # the ones set above.
            **page_metadata,
        }
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents


# Path of the parsed-document JSON. It is an empty placeholder here, so the
# open() call below fails until a real path is filled in.
json_path = ""

with open(json_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)

# Derive both document sets from the same parsed JSON.
docstore_documents = extract_documents_for_docstore(json_data)
vectorstore_documents = extract_documents_for_vectorstore(json_data)

# Preview each set: the first 100 characters of the content plus the metadata.
for heading, docs in (
    ("Docstore Documents:", docstore_documents),
    ("\nVectorstore Documents:", vectorstore_documents),
):
    print(heading)
    for doc in docs:
        print(f"Page content: {doc.page_content[:100]}...")
        print(f"Metadata: {doc.metadata}")
        print("---")

In [37]:
from langchain_core.documents import Document
import json


def extract_documents_for_single_store(json_path: str) -> list[Document]:
    """Load a parsed-document JSON file and build one ``Document`` per page.

    Text elements are wrapped in ``<text>...</text>`` and image elements are
    replaced by their summary (or a placeholder when none exists); each piece
    is followed by a newline. Elements of any other type are skipped. The
    ``content`` of every image element is collected into the ``images``
    metadata list.

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.

    Returns:
        One ``Document`` per entry of ``page_elements``, in iteration order.
        Its metadata is the page's metadata entry (empty if absent) plus
        ``page_number``, ``images`` and ``doc_id``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``page_elements``, ``page_metadata``, ``title`` or a
            required element field is missing.
        ValueError: If a page key cannot be converted with ``int``.
    """
    with open(json_path, "r", encoding="utf-8") as file:
        data = json.load(file)

    # A single missing summary already falls back to a placeholder, so a
    # file without the whole section is handled the same way instead of
    # raising on the first image element.
    image_summaries = data.get("image_summary", {})

    documents = []
    for page_number, page_data in data["page_elements"].items():
        parts = []
        images = []

        for element in page_data["elements"]:
            element_type = element["type"]
            if element_type == "text":
                parts.append(f"<text>{element['content']}</text>\n")
            elif element_type == "image":
                # The placeholder string means "no image description".
                image_summary = image_summaries.get(
                    str(element["id"]), "이미지 설명 없음"
                )
                parts.append(f"{image_summary}\n")
                images.append(element["content"])

        # Build a fresh dict rather than writing "page_number" into the
        # parsed JSON entry; later keys override same-named page metadata.
        metadata = {
            **data["page_metadata"].get(page_number, {}),
            "page_number": int(page_number),
            "images": images,
            "doc_id": f"{data['title']}_{page_number}",
        }
        documents.append(Document(page_content="".join(parts), metadata=metadata))

    return documents


# Example usage of the function.
# NOTE: json_path is an empty placeholder here; the open() call inside the
# function fails until a real path is filled in.
json_path = ""
documents = extract_documents_for_single_store(json_path)