# 문서를 Tree Graph 형태로 분해

### 텍스트 후처리

1. 코드 주석의 `#`를 `//`로 변환
2. 목차의 `.....` 삭제
3. 페이지 구분자 `-----` 삭제

In [28]:
def clean_text(text):
    text = text.replace(" #", " //")
    text = text.replace(".....", "")
    text = text.replace("-----", "")
    text = text.replace("**", "")
    text = text.replace("```", "---") 
    return text

with open("samples/bedrock-manual.txt", "r", encoding="utf-8") as file:
    md_text_origin = file.read()

md_text = clean_text(md_text_origin)
print(md_text)

##### User Guide
# Amazon Bedrock

Copyright © 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.




#### Amazon Bedrock: User Guide

Copyright © 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.

Amazon's trademarks and trade dress may not be used in connection with any product or service
that is not Amazon's, in any manner that is likely to cause confusion among customers, or in any
manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are
the property of their respective owners, who may or may not be affiliated with, connected to, or
sponsored by Amazon.




### Table of Contents

What is Amazon Bedrock?  1

Features of Amazon Bedrock .. 1
Amazon Bedrock pricing  2
Supported AWS Regions  2
Key definitions ... 5

Basic concepts ... 5
Advanced features . 7

Getting started ... 8

Request access to an Amazon Bedrock foundation model .... 11
(Optional tutorials) Explore Amazon Bedrock features through the 

### Header 단위로 1차 텍스트 분리

In [29]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "1"),
    ("##", "2"),
    ("###", "3"),
    ("####", "4"),
    ("#####", "5")
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(md_text)

for header in md_header_splits:
    content_length = len(header.page_content)
    print(f"{header.page_content}")
    print(f"{header.metadata}", end="\n=====================\n")

Copyright © 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.
{'1': 'Amazon Bedrock'}
Copyright © 2024 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.  
Amazon's trademarks and trade dress may not be used in connection with any product or service
that is not Amazon's, in any manner that is likely to cause confusion among customers, or in any
manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are
the property of their respective owners, who may or may not be affiliated with, connected to, or
sponsored by Amazon.
{'1': 'Amazon Bedrock', '4': 'Amazon Bedrock: User Guide'}
What is Amazon Bedrock?  1  
Features of Amazon Bedrock .. 1
Amazon Bedrock pricing  2
Supported AWS Regions  2
Key definitions ... 5  
Basic concepts ... 5
Advanced features . 7  
Getting started ... 8  
Request access to an Amazon Bedrock foundation model .... 11
(Optional tutorials) Explore Amazon Bedrock features through the console or 

In [40]:
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_text(text: str, separator: str, chunk_size: int) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        separators=[separator],
        length_function=len,
    )
    return text_splitter.split_text(text)

def preprocess_content(content: str) -> List[dict]:
    processed_splits = []
    order = 0

    level1_splits = split_text(content, "\n\n", 500)

    for level1_split in level1_splits:
        if len(level1_split) <= 1500:
            processed_splits.append({
                'text': level1_split,
                'order': order
            })
            order += 1
        else:
            level2_splits = split_text(level1_split, "\n---\n", 500)

            for level2_split in level2_splits:
                if len(level2_split) <= 1500:
                    processed_splits.append({
                        'text': level2_split,
                        'order': order
                    })
                    order += 1
                else:
                    level3_splits = split_text(level2_split, "\n", 500)

                    for level3_split in level3_splits:
                        processed_splits.append({
                            'text': level3_split,
                            'order': order
                        })
                        order += 1
                        if len(level3_split) > 2000:
                            print("exceeded\n")
                            print(level3_split)

    return processed_splits

for doc in md_header_splits:
    preprocessed_contents = preprocess_content(doc.page_content)

### Title 노드와 Content 노드로 분리 & Content를 길이에 따라 Split

In [57]:
from typing import List
from langchain_community.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_aws import BedrockEmbeddings

def split_text(text: str, separator: str, chunk_size: int) -> List[str]:
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0,
        separators=[separator],
        length_function=len,
    )
    return text_splitter.split_text(text)


def preprocess_content(content: str, vector_search_enabled: bool) -> List[dict]:
    processed_splits = []
    embeddings = BedrockEmbeddings(model_id="cohere.embed-multilingual-v3", region_name="us-east-1")
    order = 0

    # 1단계: "\n\n"로 분리
    level1_splits = split_text(content, "\n\n", 500)

    for level1_split in level1_splits:
        if len(level1_split) <= 1000:
            embedding = embeddings.embed_query(level1_split) if vector_search_enabled else []
            processed_splits.append({
                'text': level1_split,
                'embedding': embedding,
                'order': order
            })
            order += 1
        else:
            # 2단계: "\n---\n"로 분리
            level2_splits = split_text(level1_split, "\n---\n", 500)

            for level2_split in level2_splits:
                if len(level2_split) <= 1000:
                    embedding = embeddings.embed_query(level2_split) if vector_search_enabled else []
                    processed_splits.append({
                        'text': level2_split,
                        'embedding': embedding,
                        'order': order
                    })
                    order += 1
                else:
                    # 3단계: "\n"로 분리
                    level3_splits = split_text(level2_split, "\n", 1500)

                    for level3_split in level3_splits:
                        embedding = embeddings.embed_query(level3_split) if vector_search_enabled else []
                        processed_splits.append({
                            'text': level3_split,
                            'embedding': embedding,
                            'order': order
                        })
                        order += 1
                        if len(level3_split) > 2000:
                            print("exceeded\n")
                            print(level3_split)

    return processed_splits

def create_graph_document_from_md_splits(md_header_splits: List[Document], vector_search_enabled: bool) -> List[GraphDocument]:
    nodes = {}
    relationships = []
    contents = []

    for doc in md_header_splits:
        preprocessed_contents = preprocess_content(doc.page_content, vector_search_enabled)

        headers = sorted([(k, v) for k, v in doc.metadata.items() if k.split()[0].isdigit() and v],
                        key=lambda x: int(x[0]))

        parent_node = None
        for i, (header_key, header_value) in enumerate(headers):
            node_id = f"{header_key}_{header_value}"
            if node_id not in nodes:
                nodes[node_id] = Node(id=node_id, type='Title', properties={'level': header_key, 'value': header_value})

            if parent_node:
                relationships.append(Relationship(
                    source=nodes[parent_node],
                    target=nodes[node_id],
                    type='HAS_CHILD'
                ))

            parent_node = node_id

        # Add content relationships for each preprocessed content
        for processed_content in preprocessed_contents:
            content_node_id = f"{len(contents)}_{processed_content['order']}"
            content_node = Node(id=content_node_id, type='Content', properties={
                'text': processed_content['text'],
                'embedding': processed_content['embedding'],
                'order': processed_content['order'] 
            })
            nodes[content_node_id] = content_node
            contents.append(content_node)

            if parent_node:
                relationships.append(Relationship(
                    source=nodes[parent_node],
                    target=content_node,
                    type='HAS_CONTENTS'
                ))
            else:
                # If there's no header, create a default one
                default_header_id = f"_{len(nodes)}"
                default_header = Node(id=default_header_id, type='Title', properties={'level': '', 'value': 'Default Header'})
                nodes[default_header_id] = default_header
                relationships.append(Relationship(
                    source=default_header,
                    target=content_node,
                    type='HAS_CONTENTS'
                ))

    source_doc = Document(page_content="Bedrock Graph")
    graph_doc = GraphDocument(
        nodes=list(nodes.values()),
        relationships=relationships,
        source=source_doc
    )

    return [graph_doc]


### 그래프 생성

| `vector_search_enabled` | 소요 시간 |
|:---|:---|
| `True` | ~30분 소요 |
| `False` | 1~2분 소요 |

In [58]:
vector_search_enabled=False
graph_docs = create_graph_document_from_md_splits(md_header_splits, vector_search_enabled)

### 중간 처리 된 그래프 파일 저장

In [59]:
import pickle

with open('samples/graph_docs.pkl', 'wb') as f:
    pickle.dump(graph_docs, f)    