In [7]:
import os
import logging
from typing import Iterator
from tqdm import tqdm
from langchain_core.documents import Document
from langchain_community.document_loaders import ReadTheDocsLoader

In [8]:
class TrackedReadTheDocsLoader(ReadTheDocsLoader):
    """ReadTheDocsLoader variant that shows a tqdm progress bar while loading.

    One progress bar is displayed per entry in ``self.patterns``; it advances
    once for every file that has been read and yielded.
    """

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per file matching each pattern.

        Directories matched by a pattern are skipped. Every file is read with
        ``self.encoding`` / ``self.errors``, passed through
        ``self._clean_data`` and yielded with its path stored under the
        ``"source"`` metadata key.
        """
        for file_pattern in self.patterns:
            # Collect the matches once: the same list supplies both the bar's
            # total and the iteration, so the tree is walked a single time and
            # skipped directories no longer inflate the total.
            files = [p for p in self.file_path.rglob(file_pattern) if not p.is_dir()]
            with tqdm(desc=f"File pattern: {file_pattern}", total=len(files)) as pbar:
                for p in files:
                    with open(p, encoding=self.encoding, errors=self.errors) as f:
                        text = self._clean_data(f.read())
                    yield Document(page_content=text, metadata={"source": str(p)})
                    # Advance only after the consumer has taken the document.
                    pbar.update(1)


In [9]:
# Directory passed to the documentation loader as its root.
# NOTE(review): absolute path, presumably a Docker volume mount — confirm it exists in the target environment.
path = "/docker_mount/data/python_pptx_docs"

In [None]:
# Load the documents found under `path` and report how many were read.
loader = TrackedReadTheDocsLoader(path=path)
raw_documents = loader.load()
print(f"There are {len(raw_documents)} loaded raw documents")

In [None]:
# Show the text of the first loaded document (requires at least one document to have been loaded).
raw_documents[0].page_content

In [1]:
# Saved python-pptx documentation pages. Every file shares the same directory
# and the same title suffix, so only the page titles are spelled out.
pathes = [
    f"/docker_mount/data/python_pptx_docs/{page_title} — python-pptx 1.0.0 documentation.html"
    for page_title in (
        "Getting Started",
        "Working with Presentations",
        "Working with Slides",
        "Understanding Shapes",
        "Working with AutoShapes",
        "Understanding placeholders",
        "Working with placeholders",
        "Working with text",
        "Working with charts",
        "Working with tables",
        "Working with Notes Slides",
        "Concepts",
    )
]


In [None]:
# # Open and read an HTML file
# with open(pathes[1], "r", encoding="utf-8") as file:
#     html_content = file.read()
#     print(html_content)  # Or do something else with the content

In [2]:
import requests
from markitdown import MarkItDown
from bs4 import BeautifulSoup
import io
import os



def get_mk_from_html(path):
    """Convert the main content of a saved HTML page to Markdown.

    Reads the file at ``path`` as UTF-8, takes the first element whose CSS
    class is ``body``, wraps it in a minimal standalone HTML document and
    returns the text MarkItDown produces for that document.

    Args:
        path: Location of the HTML file to convert.

    Returns:
        The ``text_content`` of the MarkItDown conversion result.

    Raises:
        IndexError: If the page contains no element with class ``body``.
    """
    with open(path, 'r', encoding='utf-8') as file:
        html_content = file.read()

    soup = BeautifulSoup(html_content, 'lxml')
    # Only the first match is needed, so use find() instead of collecting
    # every match with find_all().
    body_element = soup.find(class_='body')
    if body_element is None:
        # Same exception type as the previous `find_all(...)[0]` lookup, but
        # the message now says which file is missing the content element.
        raise IndexError(f"No element with class 'body' found in {path!r}")
    target_element = str(body_element)

    # Create a new HTML document
    html_content = f"""
    <!DOCTYPE html>
    <html>
    <head>
        <meta charset="utf-8">
        <title>Extracted Content</title>
        <style>
            /* You can add any CSS styles here */
            body {{
                margin: 20px;
                font-family: Arial, sans-serif;
            }}
        </style>
    </head>
    <body>
        {target_element}
    </body>
    </html>
    """

    # Build an in-memory requests.Response around the extracted HTML so it can
    # be handed to MarkItDown.convert() without writing a temporary file.
    response = requests.Response()
    response._content = html_content.encode('utf-8')
    response.raw = io.BytesIO(response._content)
    response.status_code = 200
    response.headers = {
        'Content-Type': 'text/html',
        'Content-Length': str(len(response._content)),
    }

    # Initialize MarkItDown
    markitdown = MarkItDown()
    converted = markitdown.convert(response)
    return converted.text_content


In [3]:
# Subset of the saved python-pptx documentation pages that is converted below.
# Every file shares the same directory and the same title suffix, so only the
# page titles are spelled out.
pathes = [
    f"/docker_mount/data/python_pptx_docs/{page_title} — python-pptx 1.0.0 documentation.html"
    for page_title in (
        "Getting Started",
        "Working with Presentations",
        "Working with Slides",
        "Understanding Shapes",
        "Working with AutoShapes",
        "Understanding placeholders",
        "Working with placeholders",
        "Working with text",
        # Disabled entries:
        # "Working with charts",
        # "Working with tables",
        # "Working with Notes Slides",
        # "Concepts",
    )
]


# Convert every page in list order and concatenate the Markdown into a single
# string. str.join avoids rebuilding the string on each iteration.
all_content = ''.join(get_mk_from_html(el) for el in pathes)


In [21]:
# Display the concatenated Markdown produced from the selected pages.
all_content

'# Getting Started[¶](https://python-pptx.readthedocs.io/en/latest/user/quickstart.html#getting-started "Permalink to this headline")\n\nA quick way to get started is by trying out some of the examples below to get\na feel for how to use python-pptx.\n\nThe [API documentation](https://python-pptx.readthedocs.io/en/latest/index.html#api) can help you with the fine details of\ncalling signatures and behaviors.\n\n---\n\n## Hello World! example[¶](https://python-pptx.readthedocs.io/en/latest/user/quickstart.html#hello-world-example "Permalink to this headline")\n\n![../_images/hello-world.png](./Getting Started — python-pptx 1.0.0 documentation_files/hello-world.webp)\n\n```\nfrom pptx import Presentation\n\nprs = Presentation()\ntitle_slide_layout = prs.slide_layouts[0]\nslide = prs.slides.add_slide(title_slide_layout)\ntitle = slide.shapes.title\nsubtitle = slide.placeholders[1]\n\ntitle.text = "Hello, World!"\nsubtitle.text = "python-pptx was here!"\n\nprs.save(\'test.pptx\')\n\n```\n\

In [4]:
# Save the combined Markdown as UTF-8 to "test.md" (a relative path, so it is
# resolved against the current working directory); an existing file is overwritten.
with open("test.md", 'w', encoding='utf-8') as file:
    file.write(all_content)


In [14]:
# The python-pptx distribution is imported under the module name `pptx`.
from pptx import Presentation


b'<div class="body" role="main">\n<div class="section" id="getting-started">\n<span id="examples"></span><h1>Getting Started<a class="headerlink" href="https://python-pptx.readthedocs.io/en/latest/user/quickstart.html#getting-started" title="Permalink to this headline">\xc2\xb6</a></h1>\n<p>A quick way to get started is by trying out some of the examples below to get\na feel for how to use <cite>python-pptx</cite>.</p>\n<p>The <a class="reference internal" href="https://python-pptx.readthedocs.io/en/latest/index.html#api"><span class="std std-ref">API documentation</span></a> can help you with the fine details of\ncalling signatures and behaviors.</p>\n<hr class="docutils"/>\n<div class="section" id="hello-world-example">\n<h2>Hello World! example<a class="headerlink" href="https://python-pptx.readthedocs.io/en/latest/user/quickstart.html#hello-world-example" title="Permalink to this headline">\xc2\xb6</a></h2>\n<img alt="../_images/hello-world.png" src="./Getting Started \xe2\x80\x94 