In [None]:
# Sample HTML page used as the input for the header-based splitting examples.
# It contains h1/h2/h3 headings plus a list, a table, media tags and a code
# block.
# NOTE(review): the <img> src in the media section ends in ".mp4" — looks like
# a slip in the sample markup; confirm before reusing this page elsewhere.
html_string = """
<!DOCTYPE html>
  <html lang='en'>
  <head>
    <meta charset='UTF-8'>
    <meta name='viewport' content='width=device-width, initial-scale=1.0'>
    <title>Fancy Example HTML Page</title>
  </head>
  <body>
    <h1>Main Title</h1>
    <p>This is an introductory paragraph with some basic content.</p>

    <h2>Section 1: Introduction</h2>
    <p>This section introduces the topic. Below is a list:</p>
    <ul>
      <li>First item</li>
      <li>Second item</li>
      <li>Third item with <strong>bold text</strong> and <a href='#'>a link</a></li>
    </ul>

    <h3>Subsection 1.1: Details</h3>
    <p>This subsection provides additional details. Here's a table:</p>
    <table border='1'>
      <thead>
        <tr>
          <th>Header 1</th>
          <th>Header 2</th>
          <th>Header 3</th>
        </tr>
      </thead>
      <tbody>
        <tr>
          <td>Row 1, Cell 1</td>
          <td>Row 1, Cell 2</td>
          <td>Row 1, Cell 3</td>
        </tr>
        <tr>
          <td>Row 2, Cell 1</td>
          <td>Row 2, Cell 2</td>
          <td>Row 2, Cell 3</td>
        </tr>
      </tbody>
    </table>

    <h2>Section 2: Media Content</h2>
    <p>This section contains an image and a video:</p>
      <img src='example_image_link.mp4' alt='Example Image'>
      <video controls width='250' src='example_video_link.mp4' type='video/mp4'>
      Your browser does not support the video tag.
    </video>

    <h2>Section 3: Code Example</h2>
    <p>This section contains a code block:</p>
    <pre><code data-lang="html">
    &lt;div&gt;
      &lt;p&gt;This is a paragraph inside a div.&lt;/p&gt;
    &lt;/div&gt;
    </code></pre>

    <h2>Conclusion</h2>
    <p>This is the conclusion of the document.</p>
  </body>
  </html>
"""

In [None]:
from langchain_text_splitters import HTMLHeaderTextSplitter

# Pair each heading tag (h1..h3) with the metadata key its text is stored under.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 4)]

# Build the splitter and run it over the sample page defined above.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
html_header_splits = html_splitter.split_text(html_string)

# Bare last expression so the notebook renders the resulting documents.
html_header_splits

[Document(metadata={'Header 1': 'Main Title'}, page_content='This is an introductory paragraph with some basic content.'),
 Document(metadata={'Header 1': 'Main Title', 'Header 2': 'Section 1: Introduction'}, page_content='This section introduces the topic. Below is a list:  \nFirst item Second item Third item with bold text and a link'),
 Document(metadata={'Header 1': 'Main Title', 'Header 2': 'Section 1: Introduction', 'Header 3': 'Subsection 1.1: Details'}, page_content="This subsection provides additional details. Here's a table:"),
 Document(metadata={'Header 1': 'Main Title', 'Header 2': 'Section 2: Media Content'}, page_content='This section contains an image and a video:'),
 Document(metadata={'Header 1': 'Main Title', 'Header 2': 'Section 3: Code Example'}, page_content='This section contains a code block:'),
 Document(metadata={'Header 1': 'Main Title', 'Header 2': 'Conclusion'}, page_content='This is the conclusion of the document.')]

In [None]:
from langchain_text_splitters import HTMLHeaderTextSplitter

In [None]:
# Rebuild the splitter from the heading/metadata-key pairs defined earlier.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

In [None]:
# Split the sample HTML string into documents, one per heading-delimited section.
html_header_splits = html_splitter.split_text(html_string)

In [None]:
# Print each split's text, then its header metadata followed by a separator line.
for doc in html_header_splits:
    print(doc.page_content)
    print(doc.metadata, end="\n===================================\n")

This is an introductory paragraph with some basic content.
{'Header 1': 'Main Title'}
This section introduces the topic. Below is a list:  
First item Second item Third item with bold text and a link
{'Header 1': 'Main Title', 'Header 2': 'Section 1: Introduction'}
This subsection provides additional details. Here's a table:
{'Header 1': 'Main Title', 'Header 2': 'Section 1: Introduction', 'Header 3': 'Subsection 1.1: Details'}
This section contains an image and a video:
{'Header 1': 'Main Title', 'Header 2': 'Section 2: Media Content'}
This section contains a code block:
{'Header 1': 'Main Title', 'Header 2': 'Section 3: Code Example'}
This is the conclusion of the document.
{'Header 1': 'Main Title', 'Header 2': 'Conclusion'}


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Remote page used as the example input for URL-based splitting.
url = "https://plato.stanford.edu/entries/goedel/"

# Pair each heading tag (h1..h4) with the metadata key used for that level.
headers_to_split_on = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the page at `url`; for a local file use
# html_splitter.split_text_from_file(<path_to_file>) instead.
html_header_splits = html_splitter.split_text_from_url(url)

# 자기소개서 HTML

In [None]:
# Import from the dedicated `langchain_text_splitters` package, matching the
# other cells of this notebook; the legacy `langchain.text_splitter` path is
# deprecated and is not available in newer LangChain releases.
from langchain_text_splitters import HTMLHeaderTextSplitter

# Path to the HTML file to split.
html = "/content/NAEUN LEE 16d7ad66f39d80829f72da23ebc3c916.html"

# Heading tags to split on, each paired with the metadata key for that level.
headers_to_split_on = [
    ("h1", "Header 1"),
    ("h2", "Header 2"),
    ("h3", "Header 3"),
]

# Initialise the HTMLHeaderTextSplitter.
html_splitter = HTMLHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# Split the local HTML file and store the resulting documents in html_header_splits.
html_header_splits = html_splitter.split_text_from_file(html)

# Save the result to a .txt file.
with open("/content/HTMLHEADERTEXTSPLITTER_RESULT.txt", "w", encoding="utf-8") as f:
    for section in html_header_splits:
        # Only the page content of each Document is written; metadata is not saved.
        f.write(f"Content: {section.page_content}\n")
        f.write("=" * 50 + "\n")

# Report the name of the file that was written.
print("HTMLHEADERTEXTSPLITTER_RESULT.txt")

HTMLHEADERTEXTSPLITTER_RESULT.txt


In [None]:
# Bare expression: render the current list of split documents as the cell output.
html_header_splits

[Document(metadata={'Header 1': 'NAEUN LEE'}, page_content='😀  \nAI/ML Research Student'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'AI/ML Research Student'}, page_content='Contact'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'Contact'}, page_content='Location, State/Korea  \n+00 123 456 789  \nnaaa2004@naver.com  \nLinkedIn'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'AI/ML Research Student'}, page_content='Profile'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'Profile'}, page_content='전주대학교 인공지능학과에 재학 중이며, 머신러닝과 딥러닝을 포함한 인공지능 기술을 학습하고 있습니다. 현재 빅데이터센터에서 활동하며 데이터 분석 및 실무 경험을 쌓고 있으며, 과거에는 대학교 서포터즈로 활동하며 팀워크와 커뮤니케이션 능력을 키웠습니다. 새로운 도전을 통해 성장하고자 합니다.'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'AI/ML Research Student'}, page_content='SkillsLanguagesSide Hustles'),
 Document(metadata={'Header 1': 'NAEUN LEE', 'Header 3': 'Skills'}, page_content='Python  \n◾◾◾  \nTensorFlow  \n◾◾◾  \nOpenCV  \n◾◾◾  \nPowerPoint  \n◾