In [None]:
!apt-get update
!apt-get install -y pandoc

!pip install beautifulsoup4
!pip install lxml
!pip install requests
!pip install langchain
!pip install langchain-text-splitters

!mkdir input
!mkdir output
!mkdir in_progress


In [None]:
from pathlib import Path
import logging
import json
import re
from datetime import datetime
from bs4 import BeautifulSoup, Comment
from langchain.text_splitter import MarkdownHeaderTextSplitter

import pypandoc

"""ログ出力の準備"""
# Root logging configuration shared by every cell, plus a logger for this notebook.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Working directories: raw HTML input, final output, and intermediate artefacts.
input_dir, output_dir, in_progress_dir = map(
    Path,
    ('/content/input', '/content/output', '/content/in_progress'),
)

In [None]:
"""inputディレクトリ内のすべてのHTMLファイルを読み込む"""
# Collect *.html first, then *.htm, from the input directory.
input_html_files = [*input_dir.glob('*.html'), *input_dir.glob('*.htm')]

# Maps file name -> decoded HTML text.
input_html_contents = {}
for input_html_file in input_html_files:
    logger.info(f"読み込み中 {input_html_file.name}")
    # Read the whole file as UTF-8 text.
    input_html_contents[input_html_file.name] = input_html_file.read_text(encoding='utf-8')
    # Report the on-disk size in KB.
    file_size = input_html_file.stat().st_size / 1024
    logger.info(f"{input_html_file.name} のファイルサイズ: {file_size:.1f} KB")

logger.info(f"読み込んだHTMLファイル数: {len(input_html_files)}")

In [None]:
"""HTMLファイルの除外処理"""
# Exclusion patterns (to be finalised later; two placeholder examples for now).
exclude_patterns = [
    "これは除外パターンの例1",  # e.g. "advertisement"
    "これは除外パターンの例2",  # e.g. "sample text"
]

excluded_files = []
filtered_html_contents = {}

for filename, html_content in input_html_contents.items():
    # First pattern that occurs as a plain substring of the page, or None when none does.
    pattern = next(
        (candidate for candidate in exclude_patterns if candidate in html_content),
        None,
    )
    if pattern is None:
        filtered_html_contents[filename] = html_content
        continue
    logger.info(f"除外された({pattern}) : {filename}")
    excluded_files.append(filename)

logger.info(f"除外されたファイル数: {len(excluded_files)} 件")
logger.info(f"除外されなかったファイル数: {len(filtered_html_contents)} 件")


In [None]:
"""SourcePageURLを抽出"""
def extract_canonical_url(html_content):
    """Return the href of the first <link rel="canonical"> element.

    Args:
        html_content: HTML markup to parse (parsed with the 'html.parser' backend).

    Returns:
        The value of the element's href attribute, or "" when there is no such
        element or its href is missing or empty.
    """
    link_tag = BeautifulSoup(html_content, 'html.parser').find('link', rel='canonical')
    if not link_tag:
        return ""
    href = link_tag.get('href')
    return href if href else ""

# Extract the canonical URL from every HTML file that passed the exclusion filter.
# source_page_urls maps file name -> canonical URL ("" when none was found).
source_page_urls = {}
canonical_found_count = 0
canonical_not_found_count = 0

for filename, html_content in filtered_html_contents.items():
    logger.info(f"SourcePage抽出中: {filename}")
    canonical_url = extract_canonical_url(html_content)
    source_page_urls[filename] = canonical_url

    if canonical_url:
        logger.info(f"{filename} -> {canonical_url}")
        canonical_found_count += 1
    else:
        logger.warning(f"{filename} -> No canonical_url")
        canonical_not_found_count += 1

logger.info(f"SourcePageURL 抽出処理完了: {len(source_page_urls)} 件, 発見できた: {canonical_found_count} 件, 発見できなかった: {canonical_not_found_count} 件")

# Make sure the destination exists: the setup cell creates the folders relative
# to the working directory, while in_progress_dir is an absolute path.
in_progress_dir.mkdir(parents=True, exist_ok=True)

# Timestamped file name (e.g. 20240608153045) so repeated runs do not overwrite each other.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
in_progress_json_path = in_progress_dir / f"source_page_urls_{timestamp}.json"
# Save source_page_urls into the in_progress folder as UTF-8 JSON.
with open(in_progress_json_path, 'w', encoding='utf-8') as f:
    json.dump(source_page_urls, f, ensure_ascii=False, indent=2)

logger.info(f"source_page_urlsを {in_progress_json_path} に出力しました")

In [None]:
"""基礎ノイズを除去"""
# Tags whose elements are dropped entirely, together with everything inside them.
major_noise_tags = ['script', 'style', 'noscript', 'iframe', 'object', 'embed']

# Maps file name -> HTML text after tag and comment removal.
major_noise_removed_html_contents = {}

for filename, html_content in filtered_html_contents.items():
    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop the noise tags one tag name at a time.
    for tag in major_noise_tags:
        noisy_elements = soup.find_all(tag)
        for element in noisy_elements:
            element.decompose()
    # Drop HTML comments.
    html_comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    for comment in html_comments:
        comment.extract()
    # Keep the cleaned markup, serialised back to a string.
    major_noise_removed_html_contents[filename] = str(soup)

# Create the major_noise_removed folder inside the in_progress folder.
major_noise_removed_dir = in_progress_dir / "major_noise_removed"
major_noise_removed_dir.mkdir(parents=True, exist_ok=True)

# Write each cleaned document to that folder under its original file name.
for filename, major_noise_removed_html in major_noise_removed_html_contents.items():
    output_path = major_noise_removed_dir / filename
    output_path.write_text(major_noise_removed_html, encoding='utf-8')
    logger.info(f"{filename} を {output_path} に保存しました")

logger.info(f"基礎ノイズ除去済みHTMLを {len(major_noise_removed_html_contents)} 件保存しました")

In [None]:
"""末梢ノイズを除去"""

# Load the regex patterns from minor_noise_pattern.txt (resolved against the
# current working directory): one pattern per line, surrounding whitespace
# stripped, blank lines skipped.
minor_noise_patterns = []
minor_noise_pattern_file = Path("minor_noise_pattern.txt")
with open(minor_noise_pattern_file, "r", encoding="utf-8") as f:
    minor_noise_patterns = [line.strip() for line in f if line.strip()]
logger.info(f"minor_noise_pattern.txtから{len(minor_noise_patterns)}件のパターンを読み込みました")

# Maps file name -> HTML text with every pattern match deleted.
minor_noise_removed_html_contents = {}

for filename, html_content in major_noise_removed_html_contents.items():
    processed_content = html_content
    # Patterns are applied in file order, each on the result of the previous one.
    for pattern in minor_noise_patterns:
        # subn deletes the matches and reports how many there were, so the text
        # is scanned once per pattern instead of once to search and once to replace.
        processed_content, hit_count = re.subn(
            pattern, '', processed_content, flags=re.MULTILINE | re.DOTALL
        )
        if hit_count:
            logger.info(f"ファイル: {filename} にパターン: {pattern} がヒットしました")
    minor_noise_removed_html_contents[filename] = processed_content

# Create the minor_noise_removed folder inside the in_progress folder.
minor_noise_removed_dir = in_progress_dir / "minor_noise_removed"
minor_noise_removed_dir.mkdir(parents=True, exist_ok=True)
# Write each cleaned document to that folder under its original file name.
for filename, minor_noise_removed_html in minor_noise_removed_html_contents.items():
    output_path = minor_noise_removed_dir / filename
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(minor_noise_removed_html)
    logger.info(f"{filename} を {output_path} に保存しました")

logger.info(f"末梢ノイズ除去済みHTMLを {len(minor_noise_removed_html_contents)} 件保存しました")


In [None]:
"""Markdownに変換"""
# Maps original HTML file name -> converted Markdown text.
markdown_contents = {}

# Create the markdown folder inside the in_progress folder once, up front,
# instead of recomputing the path and calling mkdir for every file.
markdown_dir = in_progress_dir / "markdown"
markdown_dir.mkdir(parents=True, exist_ok=True)

for filename, html_content in minor_noise_removed_html_contents.items():
    # Convert HTML to CommonMark; --wrap=none asks pandoc not to re-wrap lines.
    markdown = pypandoc.convert_text(
        html_content,
        'commonmark',
        format='html',
        extra_args=['--wrap=none']
    )
    logger.info(f"{filename} のHTMLをMarkdownに変換しました")
    markdown_contents[filename] = markdown

    # Save the Markdown next to the other intermediate artefacts as <stem>.md.
    # NOTE(review): inputs that differ only by extension (a.html / a.htm) map to
    # the same .md path, so the later one overwrites the earlier one on disk.
    output_md_path = markdown_dir / (Path(filename).stem + ".md")
    with open(output_md_path, 'w', encoding='utf-8') as f:
        f.write(markdown)
    logger.info(f"{filename} を {output_md_path} に保存しました")


In [None]:

# Maps original HTML file name -> list of sections returned by the splitter.
json_contents = {}

# Split on every Markdown heading level; the second element of each pair is the
# metadata key under which the heading text is recorded.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
    ("####", "Header 4"),
    ("#####", "Header 5"),
    ("######", "Header 6"),
]
# The configuration is the same for every file, so the splitter is built once
# and reused instead of being re-created inside the loop.
# strip_headers=False keeps the heading lines in the section text.
splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False
)

for filename, markdown_content in markdown_contents.items():
    docs = splitter.split_text(markdown_content)
    json_contents[filename] = docs
    # NOTE(review): the message says "JSON", but what is stored here is the
    # splitter's return value; nothing is serialised to JSON in this cell.
    logger.info(f"{filename} を {len(docs)} セクションに分割し、JSON化しました")
