In [None]:
!pip install beautifulsoup4
!pip install lxml
!pip install requests
!pip install langchain
!pip install langchain-text-splitters
!pip install tiktoken
!pip install docling

In [None]:
from google.colab import files
import io
import re
import json
from pathlib import Path
from datetime import datetime
from bs4 import BeautifulSoup, Comment
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from langchain.text_splitter import MarkdownHeaderTextSplitter

# Convert uploaded HTML files to Markdown and to header-split JSON chunks.
#
# For each uploaded file this script:
#   1. strips unwanted markup (<script> tags, <footer> elements),
#   2. converts the cleaned HTML to Markdown with docling,
#   3. writes "<stem>.md" to the current working directory,
#   4. splits the Markdown on its headers (levels 1-4) and writes the
#      resulting chunks to "<stem>.json".

# Ask the user to upload the files to convert (google.colab's files.upload()).
uploaded_files = files.upload()

# Everything below is loop-invariant, so it is built once and reused for
# every uploaded file instead of being re-created per iteration.

# Matches a whole <footer ...>...</footer> element; DOTALL lets "." span
# newlines so multi-line footers are removed too.
footer_pattern = re.compile(r"<footer.*?>.*?</footer>", flags=re.DOTALL)

document_converter = DocumentConverter()

# Markdown header markers and the metadata key each one is stored under.
headers_to_split_on = [
    ("#", "Header1"),
    ("##", "Header2"),
    ("###", "Header3"),
    ("####", "Header4"),
]
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=True,
    return_each_line=False,
)

for file_name, file_contents in uploaded_files.items():
    print(f"Processing... {file_name}")

    # Remove unwanted tags from the HTML.
    soup = BeautifulSoup(file_contents, "html.parser")
    for script_tag in soup.find_all("script"):
        script_tag.decompose()
    # Optional extra filters, disabled by default (uncomment as needed):
    # for element in soup.find_all("a", attrs={"class": "link"}):
    #     element.decompose()
    # for element in soup.find_all("img", attrs={"src": re.compile(r".*\.png$")}):
    #     element.decompose()
    souped_content = str(soup)

    # Remove remaining unwanted markup with a regular expression.
    cleaned_content = footer_pattern.sub('', souped_content)

    # Convert the cleaned HTML to Markdown.
    convert_result = document_converter.convert_string(
        content=cleaned_content,
        format=InputFormat.HTML,
        name=file_name,
    )
    markdown = convert_result.document.export_to_markdown()

    # Save the Markdown file next to the notebook, named after the upload's stem.
    markdown_file_path = Path(file_name).stem + ".md"
    with open(markdown_file_path, 'w', encoding='utf-8') as f:
        f.write(markdown)

    # Split the Markdown on its headers and turn each chunk into a JSON record.
    # To give every chunk an id, enumerate the chunks and add
    # "id": f"{file_name}_{i}" to the record.
    splitted_jsons = [
        {
            "content": doc.page_content.strip(),
            "metadata": dict(doc.metadata),
        }
        for doc in splitter.split_text(markdown)
    ]

    # Save the JSON file; ensure_ascii=False keeps non-ASCII text readable.
    json_file_path = Path(file_name).stem + ".json"
    with open(json_file_path, 'w', encoding='utf-8') as f:
        json.dump(splitted_jsons, f, ensure_ascii=False, indent=2)

    # Log that this file has been converted.
    print(f"{file_name} の変換が完了しました。")
