# 문서를 Tree Graph 형태로 분해

### 텍스트 후처리

1. 코드 주석의 ` #`(공백 뒤에 오는 `#`)를 ` //`로 변환
2. 목차의 `.....` 삭제
3. 페이지 구분자 `-----` 삭제
4. 굵은 글씨 표시 `**` 삭제
5. 코드 블록 구분자 `` ``` ``를 `---`로 변환

In [None]:
def clean_text(text):
    """Return *text* with a fixed series of literal substitutions applied.

    The substitutions run in the order listed below, and the order matters:
    the code-fence marker is turned into ``---`` only after the ``-----``
    runs have been removed, so the inserted dashes are never stripped again.

    * ``" #"``  -> ``" //"`` (a ``#`` preceded by a space; a ``#`` at the very
      start of the text is left alone)
    * ``"....."`` -> removed (each run of exactly five dots; leftovers stay)
    * ``"-----"`` -> removed
    * ``"**"`` -> removed
    * three backticks -> ``"---"``
    """
    substitutions = (
        (" #", " //"),
        (".....", ""),
        ("-----", ""),
        ("**", ""),
        ("```", "---"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

# Read the raw manual; md_text_origin keeps the text exactly as stored on disk.
with open("samples/bedrock-manual.txt", mode="r", encoding="utf-8") as manual_file:
    md_text_origin = manual_file.read()

# Cleaned copy used by the header splitter below; printed for a visual check.
md_text = clean_text(md_text_origin)
print(md_text)

### Header 단위로 1차 텍스트 분리

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Heading markers "#" .. "#####" paired with the metadata keys "1" .. "5".
headers_to_split_on = [("#" * level, str(level)) for level in range(1, 6)]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
# Split the cleaned text on those headings; the items are read below through
# their page_content and metadata attributes.
md_header_splits = markdown_splitter.split_text(md_text)

# Dump every split, then its header metadata, then a separator line.
for section in md_header_splits:
    content_length = len(section.page_content)  # computed but not used in this cell
    print(section.page_content)
    print(section.metadata, end="\n=====================\n")

In [None]:
from typing import List
from langchain_aws import BedrockEmbeddings

def custom_text_splitter(text: str, min_size: int = 300, max_size: int = 1000) -> List[str]:
    """Split *text* into chunks of roughly ``min_size``..``max_size`` characters.

    The split runs in two passes:

    1. Paragraphs (separated by a blank line) are packed greedily into a
       buffer of at most ``max_size`` characters. Each flushed buffer is then
       re-split on the first delimiter (blank line, ``---`` line, newline,
       space) for which every resulting piece has at least ``min_size``
       characters; if no delimiter qualifies the buffer is kept whole.
    2. A chunk shorter than ``min_size`` is appended to the previous chunk
       when the result still fits in ``max_size``. A chunk longer than
       ``max_size`` is re-packed word by word, which collapses every
       whitespace run inside that chunk to a single space.

    Both limits are soft. A short chunk that does not fit into its
    predecessor is emitted as is, and a word that would overflow a buffer
    still shorter than ``min_size`` is appended anyway, so a chunk can end up
    longer than ``max_size``.

    Args:
        text: The text to split.
        min_size: Preferred minimum chunk length in characters.
        max_size: Preferred maximum chunk length in characters.

    Returns:
        The chunks in document order; an empty list for empty input.
    """
    delimiters = ["\n\n", "\n---\n", "\n", " "]

    def split_chunk(chunk: str) -> List[str]:
        # Use the first delimiter whose pieces are all large enough.
        for delimiter in delimiters:
            if delimiter in chunk:
                pieces = [piece.strip() for piece in chunk.split(delimiter) if piece.strip()]
                if all(len(piece) >= min_size for piece in pieces):
                    return pieces
        return [chunk]

    # Pass 1: greedy paragraph packing (the +2 accounts for the "\n\n" joiner).
    chunks: List[str] = []
    current_chunk = ""
    for paragraph in text.split("\n\n"):
        if len(current_chunk) + len(paragraph) + 2 <= max_size:
            current_chunk = f"{current_chunk}\n\n{paragraph}" if current_chunk else paragraph
        else:
            if current_chunk:
                chunks.extend(split_chunk(current_chunk))
            current_chunk = paragraph
    if current_chunk:
        chunks.extend(split_chunk(current_chunk))

    # Pass 2: merge undersized chunks, re-pack oversized ones.
    final_chunks: List[str] = []
    for chunk in chunks:
        if len(chunk) < min_size:
            if final_chunks and len(final_chunks[-1]) + len(chunk) + 2 <= max_size:
                final_chunks[-1] += "\n\n" + chunk
            else:
                final_chunks.append(chunk)
        elif len(chunk) > max_size:
            buffer = ""
            for word in chunk.split():
                fits = len(buffer) + len(word) + 1 <= max_size
                # Keep growing while the word fits, or while the buffer is
                # still too small to be emitted on its own. Joining through
                # the conditional avoids a leading space on an empty buffer.
                if fits or not buffer or len(buffer) < min_size:
                    buffer = f"{buffer} {word}" if buffer else word
                else:
                    final_chunks.append(buffer)
                    buffer = word
            if buffer:
                final_chunks.append(buffer)
        else:
            final_chunks.append(chunk)

    return final_chunks

def preprocess_content(content: str, vector_search_enabled: bool) -> List[dict]:
    """Chunk *content* and optionally attach an embedding to every chunk.

    Args:
        content: Text of one header split.
        vector_search_enabled: When true, each chunk is embedded with the
            Bedrock ``cohere.embed-multilingual-v3`` model in ``us-east-1``;
            when false, the ``'embedding'`` value is an empty list.

    Returns:
        One dict per chunk, in order, with the keys ``'text'``,
        ``'embedding'`` and ``'order'`` (0-based position within *content*).

    Side effects:
        Prints every chunk longer than 1000 characters.
    """
    # Build the embeddings client only when it will be used, so runs with
    # vector search disabled never construct one.
    embeddings = (
        BedrockEmbeddings(model_id="cohere.embed-multilingual-v3", region_name="us-east-1")
        if vector_search_enabled
        else None
    )

    processed_splits = []
    for order, chunk in enumerate(custom_text_splitter(content)):
        embedding = embeddings.embed_query(chunk) if embeddings is not None else []
        processed_splits.append({
            'text': chunk,
            'embedding': embedding,
            'order': order
        })

        # Report chunks that are longer than 1000 characters
        if len(chunk) > 1000:
            print(f"Large chunk detected (length: {len(chunk)}):")
            print(chunk)
            print("-" * 50)

    return processed_splits

# Smoke test: chunk every header split with embeddings turned off. Only the
# last split's result stays bound; the useful output is the oversized-chunk
# report printed by preprocess_content.
vector_search_enabled = False
for split_doc in md_header_splits:
    preprocessed_contents = preprocess_content(
        split_doc.page_content,
        vector_search_enabled=vector_search_enabled,
    )

### Title 노드와 Content 노드로 분리 & Content를 길이에 따라 Split

In [None]:
from typing import List
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.schema import Document

def create_graph_document_from_md_splits(md_header_splits: List[Document], vector_search_enabled: bool) -> List[GraphDocument]:
    """Build a Title/Content graph from Markdown header splits.

    For every split, the numeric metadata keys are read as heading levels and
    turned into a chain of ``Title`` nodes linked by ``HAS_CHILD``. The
    split's text is chunked with ``preprocess_content`` and every chunk
    becomes a ``Content`` node linked from the deepest ``Title`` by
    ``HAS_CONTENTS``. A split without any heading metadata gets a single
    ``Title`` node valued ``'Default Header'`` that owns all of its chunks.

    Each ``HAS_CHILD`` edge is emitted once, even when several splits share
    the same heading path.

    NOTE(review): Title ids are ``"<level>_<title>"``, so identically titled
    headings of the same level share one node regardless of their parent, and
    a Content id ``"<n>_<order>"`` could in principle coincide with a Title id
    whose title is a bare number. Confirm whether this matters for the
    documents being loaded.

    Args:
        md_header_splits: Header splits whose ``metadata`` maps level keys
            (digit strings) to heading titles.
        vector_search_enabled: Passed through to ``preprocess_content``.

    Returns:
        A one-element list holding the ``GraphDocument`` for all splits.
    """
    nodes = {}
    relationships = []
    linked_titles = set()  # (parent_id, child_id) pairs that already have a HAS_CHILD edge
    content_count = 0      # running number of Content nodes, used in their ids

    for doc in md_header_splits:
        preprocessed_contents = preprocess_content(doc.page_content, vector_search_enabled)

        # Heading entries of this split, ordered from the top level down.
        headers = sorted([(k, v) for k, v in doc.metadata.items() if k.split()[0].isdigit() and v],
                        key=lambda x: int(x[0]))

        parent_id = None
        for header_key, header_value in headers:
            node_id = f"{header_key}_{header_value}"
            if node_id not in nodes:
                nodes[node_id] = Node(id=node_id, type='Title', properties={'level': header_key, 'value': header_value})

            # Splits under the same heading path repeat the chain; link each pair once.
            if parent_id is not None and (parent_id, node_id) not in linked_titles:
                linked_titles.add((parent_id, node_id))
                relationships.append(Relationship(
                    source=nodes[parent_id],
                    target=nodes[node_id],
                    type='HAS_CHILD'
                ))

            parent_id = node_id

        # Header-less split: hang all of its chunks under one default Title
        # instead of creating a separate default Title per chunk.
        if parent_id is None and preprocessed_contents:
            parent_id = f"_{len(nodes)}"
            nodes[parent_id] = Node(id=parent_id, type='Title', properties={'level': '', 'value': 'Default Header'})

        # Attach every chunk of this split to its owning Title.
        for processed_content in preprocessed_contents:
            content_node_id = f"{content_count}_{processed_content['order']}"
            content_count += 1
            content_node = Node(id=content_node_id, type='Content', properties={
                'text': processed_content['text'],
                'embedding': processed_content['embedding'],
                'order': processed_content['order']
            })
            nodes[content_node_id] = content_node

            relationships.append(Relationship(
                source=nodes[parent_id],
                target=content_node,
                type='HAS_CONTENTS'
            ))

    source_doc = Document(page_content="Bedrock Graph")
    graph_doc = GraphDocument(
        nodes=list(nodes.values()),
        relationships=relationships,
        source=source_doc
    )

    return [graph_doc]


### 그래프 생성

| `vector_search_enabled` | 소요 시간 |
|:---|:---|
| `True` | ~30분 소요 |
| `False` | 1~2분 소요 |

In [None]:
# Build the graph with embeddings enabled (the slow setting in the table above).
vector_search_enabled = True
graph_docs = create_graph_document_from_md_splits(
    md_header_splits,
    vector_search_enabled=vector_search_enabled,
)

### 중간 처리된 그래프 파일 저장

In [None]:
import pickle

# Save the intermediate graph documents to disk with pickle.
with open('samples/graph_docs.pkl', mode='wb') as pickle_file:
    pickle.dump(graph_docs, pickle_file)