# HTML Generator 1: 全自動でWORD版の開発ガイドをHTML版に移植

In [None]:
# Paste everything into a single Colab cell and run it.
# Overall pipeline of this notebook section:
# 1) Install LibreOffice / Mammoth and the other dependencies
# 2) Parse the index page (docDownloadURL)
# 3) doc -> docx
# 4) docx -> html
# 5) Remove the frame at the top of the HTML

################################
# Prerequisite installation:
################################
!apt-get update -qq
!apt-get install -y libreoffice
!pip install mammoth requests beautifulsoup4


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor default-jre default-jre-headless dictionaries-common firebird3.0-common
  firebird3.0-common-doc firebird3.0-server-core firebird3.0-utils fonts-crosextra-caladea
  fonts-crosextra-carlito fonts-dejavu fonts-dejavu-core fonts-dejavu-extra fonts-liberation2
  fonts-linuxlibertine fonts-noto-core fonts-noto-extra fonts-noto-mono fonts-noto-ui-core
  fonts-opensymbol fonts-sil-gentium fonts-sil-gentium-basic gstreamer1.0-gl gstreamer1.0-gtk3
  hunspell-en-us libabsl20210324 libabw-0.1-1 libatk-wrapper-java libatk-wrapper-java-jni
  libbsh-java libcdr-0.1-1 libclucene-contribs1v5 libclucene-core1v5 libcolamd2 libe-book-0.1-1
  libel-api-java libe

In [None]:

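# Paste everything into a single Colab cell and run it (Word -> HTML conversion + post-processing)
# 1) Install LibreOffice / Mammoth and the other dependencies
# 2) Parse the index page (docDownloadURL)
# 3) doc -> docx
# 4) docx -> html
# 5) remove_top_frame (remove the cover image / frame)
# 6) force_inline_table_border: strip 'border:none' from inline styles
# 7) force_inline_table_border: force table borders via inline CSS
# 8) remove_top_until_keyword_keep_previous (remove the elements before "設定ガイド")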

import requests
import re
import os
import subprocess
import mammoth
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlsplit

def parse_index_page(index_url, timeout=30):
    """Fetch the index page and build one record per row of its first <table>.

    Rows with fewer than four <td> cells (for example header rows) are skipped.

    Args:
        index_url: URL of the index page; relative download links are resolved
            against it.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys "GuideNameEn", "GuideNameJp",
        "DocDownloadURL" (absolute URL of the first .doc/.docx link in the
        third cell, or None when there is none) and "BaseURL". The list is
        empty when the page has no <table>.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # requests waits forever unless a timeout is given explicitly.
    resp = requests.get(index_url, timeout=timeout)
    resp.raise_for_status()
    # Pass the raw bytes so BeautifulSoup can detect the encoding itself;
    # resp.text depends on the HTTP charset header and can garble Japanese
    # text when that header is missing.
    soup = BeautifulSoup(resp.content, "html.parser")
    result_list = []
    table = soup.find("table")
    if not table:
        print("テーブルが見つかりません。HTML構造を確認してください。")
        return result_list

    for row in table.find_all("tr"):
        cols = row.find_all("td")
        if len(cols) < 4:
            continue
        guide_en = cols[0].get_text(strip=True)
        guide_jp = cols[1].get_text(strip=True)
        download_td = cols[2]
        base_url = cols[3].get_text(strip=True)

        # Look for the first link that points at a Word document.
        doc_download_url = None
        for a in download_td.find_all("a"):
            href = a.get("href", "")
            # Check the extension on the path component only, so that a
            # "?query" or "#fragment" suffix does not hide it.
            if urlsplit(href).path.lower().endswith((".doc", ".docx")):
                doc_download_url = urljoin(index_url, href)
                break

        result_list.append({
            "GuideNameEn": guide_en,
            "GuideNameJp": guide_jp,
            "DocDownloadURL": doc_download_url,
            "BaseURL": base_url
        })
    return result_list

def doc_to_docx_via_libreoffice(doc_path):
    """Convert a .doc file to .docx by running ``soffice --convert-to docx``.

    The converted file is written next to the input (or into the current
    directory when ``doc_path`` has no directory part).

    Returns:
        Path of the generated .docx file.

    Raises:
        subprocess.CalledProcessError: If soffice exits with a non-zero status.
        FileNotFoundError: If the expected .docx does not exist afterwards.
    """
    out_dir = os.path.dirname(doc_path) or "."
    argv = [
        "/usr/bin/soffice", "--headless", "--convert-to", "docx",
        doc_path, "--outdir", out_dir,
    ]
    print("Running:", " ".join(argv))
    subprocess.run(argv, check=True)

    # soffice keeps the stem and swaps the extension, so derive the name here.
    expected = os.path.splitext(doc_path)[0] + ".docx"
    if os.path.exists(expected):
        return expected
    raise FileNotFoundError(f"{expected} が生成されませんでした。.doc→.docx失敗？")

def mammoth_docx_to_html(docx_file, html_file):
    """Convert ``docx_file`` to HTML with Mammoth and save it as UTF-8 in ``html_file``."""
    with open(docx_file, "rb") as src:
        html_content = mammoth.convert_to_html(src).value
    with open(html_file, "w", encoding="utf-8") as dst:
        dst.write(html_content)
    print(f"[OK] {docx_file} → {html_file} (HTML化)")

def ensure_body(soup):
    """Make sure ``soup`` has a <body>; create one and move the content into it if not.

    Every top-level node except <head> is moved into the new <body>, which is
    appended to ``soup``. The same ``soup`` object is returned.
    """
    if soup.body:
        return soup

    wrapper = soup.new_tag("body")
    # Snapshot the nodes first: extracting while iterating soup.contents
    # directly would skip elements.
    movable = [node for node in soup.contents if node.name != "head"]
    for node in movable:
        wrapper.append(node.extract())
    soup.append(wrapper)
    return soup

def remove_top_frame(html_path):
    """Remove the cover frame and the first embedded PNG from an HTML file.

    Deletes the first <div> whose style attribute mentions "border"
    (case-insensitive) and the first <p> that contains an <img> whose src
    starts with ``data:image/png;base64``. The file is rewritten in place.
    """
    with open(html_path, "r", encoding="utf-8") as src:
        soup = BeautifulSoup(src, "html.parser")
    body = ensure_body(soup).body

    # (1) First <div style="...border..."> in document order.
    framed_div = body.find("div", style=re.compile(r"(?i)border"))
    if framed_div is not None:
        framed_div.decompose()

    # (2) First paragraph that wraps a base64-encoded PNG.
    png_src = re.compile(r"^data:image/png;base64")
    image_paragraph = next(
        (p for p in body.find_all("p") if p.find("img", src=png_src) is not None),
        None,
    )
    if image_paragraph is not None:
        image_paragraph.decompose()

    with open(html_path, "w", encoding="utf-8") as dst:
        dst.write(str(soup))
    print(f"[OK] 表紙枠/画像を削除 => {html_path}")

def force_inline_table_border(html_path):
    """Force visible borders on every table in an HTML file, using inline styles.

    (A) Sets the legacy ``border="1"`` attribute on each <table>.
    (B) Strips ``border: none`` declarations from the inline style of every
        <table>, <tr>, <th> and <td>.
    (C) Appends ``border:1px solid black`` (plus ``border-collapse:collapse``
        on <table>) to that inline style.

    The file is rewritten in place.
    """
    with open(html_path, "r", encoding="utf-8") as src:
        soup = BeautifulSoup(src, "html.parser")
    body = ensure_body(soup).body

    # (A) Old-style HTML attribute, understood even without CSS.
    for table in body.find_all("table"):
        table["border"] = "1"

    # (B)(C) Rewrite the inline style of every table-related tag.
    border_none = re.compile(r"border\s*:\s*none[^;]*;?", flags=re.IGNORECASE)
    for node in body.find_all(["table", "tr", "th", "td"]):
        cleaned = border_none.sub("", node.get("style", ""))
        if node.name == "table":
            suffix = ";border:1px solid black;border-collapse:collapse;"
        else:
            suffix = ";border:1px solid black;"
        # Trim the separators left over at either end after concatenation.
        node["style"] = (cleaned + suffix).strip("; ")

    with open(html_path, "w", encoding="utf-8") as dst:
        dst.write(str(soup))
    print(f"[OK] テーブル枠線をインラインで強制 => {html_path}")

def remove_top_until_keyword_keep_previous(html_file, keyword="設定ガイド"):
    """Delete everything in <body> that comes before the keyword paragraph.

    The first <p> whose text contains ``keyword`` is located. The nearest <p>
    among its preceding siblings is kept as well (when there is one), and all
    body children before that point are removed. The file is rewritten in
    place; when no paragraph matches, nothing is removed.

    Args:
        html_file: Path of the UTF-8 HTML file to modify.
        keyword: Text identifying the first paragraph to keep.
    """
    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    soup = ensure_body(soup)
    body = soup.body

    # Match on the paragraph's full text. Filtering on ``.string`` misses any
    # <p> with more than one child (e.g. an anchor followed by text).
    target_p = next(
        (p for p in body.find_all("p") if keyword in p.get_text()), None
    )
    if target_p is None:
        print(f"'{keyword}' を含む <p> が見つからないためスキップ")
        with open(html_file, "w", encoding="utf-8") as f2:
            f2.write(str(soup))
        return

    prev_p = None
    for sib in target_p.previous_siblings:
        if sib.name == "p":
            prev_p = sib
            break

    # Keep the preceding <p> when there is one, otherwise the match itself.
    keep_node = prev_p if prev_p is not None else target_p

    # The paragraph may be nested inside another element. Climb to the
    # ancestor that is a direct child of <body>; otherwise the loop below
    # would never meet the node and would delete the whole document.
    while keep_node.parent is not None and keep_node.parent is not body:
        keep_node = keep_node.parent

    for child in list(body.children):
        if child is keep_node:
            break
        # extract() works for tags and text nodes alike, whereas text nodes
        # do not offer decompose() in every bs4 release.
        child.extract()

    with open(html_file, "w", encoding="utf-8") as f:
        f.write(str(soup))
    print(f"[OK] '{keyword}' より前の不要要素を削除 => {html_file}")

def main():
    """Interactively pick a guide from the index page and convert it to HTML.

    Steps: parse the index page, list the documents, ask for a number,
    download the chosen .doc/.docx, convert .doc to .docx with LibreOffice,
    convert .docx to HTML with Mammoth, then run the three clean-up passes.
    Every failure path prints a message and returns without raising.
    """
    index_url = "https://la-concur-helper.github.io/concur-docs/index.htm"
    print(f"Indexページ: {index_url} を解析します...")
    index_list = parse_index_page(index_url)
    if not index_list:
        print("index_listが空。終了")
        return

    print("\n=== ドキュメント一覧 ===")
    for i, info in enumerate(index_list, start=1):
        doc_url = info["DocDownloadURL"] or "None"
        print(f"{i}. {info['GuideNameEn']} / {info['GuideNameJp']} => {doc_url}")

    sel = input("\nどのDOCファイルを変換しますか？(番号入力, Enterで終了): ")
    if not sel.strip():
        print("キャンセル終了")
        return
    # Non-numeric input is treated like an out-of-range number instead of
    # crashing with ValueError.
    try:
        idx = int(sel)
    except ValueError:
        print("番号不正")
        return
    if idx < 1 or idx > len(index_list):
        print("番号不正")
        return

    chosen = index_list[idx - 1]
    doc_download_url = chosen["DocDownloadURL"]
    if not doc_download_url:
        print("DOCファイルURLなし。変換不可")
        return

    # Download. requests never times out by default, so set one and report
    # network errors the same way as a bad HTTP status.
    print("ダウンロードURL:", doc_download_url)
    try:
        r = requests.get(doc_download_url, timeout=60)
    except requests.RequestException as e:
        print("DL失敗", e)
        return
    if r.status_code != 200:
        print("DL失敗 HTTP", r.status_code)
        return
    fname = os.path.basename(urlsplit(doc_download_url).path)
    with open(fname, "wb") as f:
        f.write(r.content)
    print("保存:", fname)

    # .doc -> .docx (LibreOffice); a .docx download is used as is.
    if not fname.lower().endswith(".docx"):
        try:
            docx_file = doc_to_docx_via_libreoffice(fname)
        except Exception as e:
            print("doc->docx変換失敗:", e)
            return
    else:
        docx_file = fname

    # docx -> html (Mammoth)
    html_file = os.path.splitext(docx_file)[0] + ".html"
    mammoth_docx_to_html(docx_file, html_file)

    # 1) Remove the cover frame
    remove_top_frame(html_file)

    # 2) Force table borders
    force_inline_table_border(html_file)

    # 3) Remove everything before "設定ガイド" (the preceding <p> is kept)
    remove_top_until_keyword_keep_previous(html_file, "設定ガイド")

    print("最終HTML:", html_file)

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'mammoth'

# HTML Generator 2: 手動で修正したWORD版の開発ガイドをHTML版に移植

In [None]:
# Run in a Colab cell.
# Installs LibreOffice via apt, and mammoth / beautifulsoup4 via pip.
!apt-get update -qq
!apt-get install -y libreoffice
!pip install mammoth beautifulsoup4


W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  apparmor default-jre default-jre-headless dictionaries-common firebird3.0-common
  firebird3.0-common-doc firebird3.0-server-core firebird3.0-utils fonts-crosextra-caladea
  fonts-crosextra-carlito fonts-dejavu fonts-dejavu-core fonts-dejavu-extra fonts-liberation2
  fonts-linuxlibertine fonts-noto-core fonts-noto-extra fonts-noto-mono fonts-noto-ui-core
  fonts-opensymbol fonts-sil-gentium fonts-sil-gentium-basic gstreamer1.0-gl gstreamer1.0-gtk3
  hunspell-en-us libabsl20210324 libabw-0.1-1 libatk-wrapper-java libatk-wrapper-java-jni
  libbsh-java libcdr-0.1-1 libclucene-contribs1v5 libclucene-core1v5 libcolamd2 libe-book-0.1-1
  libel-api-java libe

In [None]:
import os
import re
import subprocess
import mammoth
from bs4 import BeautifulSoup
from google.colab import files

# =============== 各種関数 ===============

def doc_to_docx_via_libreoffice(doc_path):
    """Convert a .doc file to .docx with headless LibreOffice (soffice).

    The output is written to the directory of ``doc_path`` (or the current
    directory when the path has no directory part).

    Returns:
        Path of the generated .docx file.

    Raises:
        subprocess.CalledProcessError: If soffice exits with a non-zero status.
        FileNotFoundError: If the expected .docx does not exist afterwards.
    """
    target_dir = os.path.dirname(doc_path) or "."
    command = [
        "/usr/bin/soffice", "--headless", "--convert-to", "docx",
        doc_path, "--outdir", target_dir,
    ]
    print("Running:", " ".join(command))
    subprocess.run(command, check=True)

    # Same stem, new extension: this is where soffice puts the result.
    stem = os.path.splitext(doc_path)[0]
    converted = stem + ".docx"
    if os.path.exists(converted):
        return converted
    raise FileNotFoundError(f"{converted} が生成されませんでした。.doc→.docx失敗？")

def mammoth_docx_to_html(docx_file, html_file):
    """Run Mammoth on ``docx_file`` and write the resulting HTML to ``html_file`` as UTF-8."""
    with open(docx_file, "rb") as docx_stream:
        converted = mammoth.convert_to_html(docx_stream)
    with open(html_file, "w", encoding="utf-8") as html_stream:
        html_stream.write(converted.value)
    print(f"[OK] {docx_file} → {html_file} (HTML化)")

def ensure_body(soup):
    """Return ``soup`` with a <body>, creating one and moving content into it if absent.

    Every top-level node other than <head> is moved into the new <body>,
    which is then appended to ``soup``.
    """
    if soup.body:
        return soup

    body_tag = soup.new_tag("body")
    # Work on a snapshot; extract() mutates soup.contents as nodes move.
    to_move = [node for node in soup.contents if node.name != "head"]
    for node in to_move:
        body_tag.append(node.extract())
    soup.append(body_tag)
    return soup

def remove_top_frame(html_path):
    """Remove the cover frame <div> and the first base64 PNG paragraph from an HTML file.

    Deletes the first <div> whose style attribute mentions "border"
    (case-insensitive) and the first <p> that contains an <img> whose src
    starts with ``data:image/png;base64``. The file is rewritten in place.
    """
    import re
    from bs4 import BeautifulSoup

    with open(html_path, "r", encoding="utf-8") as src:
        soup = BeautifulSoup(src, "html.parser")
    body = ensure_body(soup).body

    # (1) First <div style="...border..."> in document order.
    framed_div = body.find("div", style=re.compile(r"(?i)border"))
    if framed_div is not None:
        framed_div.decompose()

    # (2) First paragraph wrapping a base64-encoded PNG.
    png_src = re.compile(r"^data:image/png;base64")
    for paragraph in body.find_all("p"):
        if paragraph.find("img", src=png_src) is None:
            continue
        paragraph.decompose()
        break

    with open(html_path, "w", encoding="utf-8") as dst:
        dst.write(str(soup))
    print(f"[OK] 表紙枠/画像を削除 => {html_path}")

def force_inline_table_border(html_path):
    """Force visible borders on every table in an HTML file, using inline styles.

    (A) Sets ``border="1"`` on each <table>.
    (B) Strips ``border: none`` from the inline style of every <table>, <tr>,
        <th> and <td>, then appends ``border:1px solid black`` (plus
        ``border-collapse:collapse`` on <table>).

    The file is rewritten in place.
    """
    import re
    from bs4 import BeautifulSoup

    with open(html_path, "r", encoding="utf-8") as src:
        soup = BeautifulSoup(src, "html.parser")
    body = ensure_body(soup).body

    # (A) Legacy attribute on the tables themselves.
    for table in body.find_all("table"):
        table["border"] = "1"

    # (B) Rewrite the inline style of every table-related tag.
    border_none = re.compile(r"border\s*:\s*none[^;]*;?", flags=re.IGNORECASE)
    for node in body.find_all(["table", "tr", "th", "td"]):
        style = border_none.sub("", node.get("style", ""))
        style += ";border:1px solid black;"
        if node.name == "table":
            style += "border-collapse:collapse;"
        # Trim the separators left over at either end after concatenation.
        node["style"] = style.strip("; ")

    with open(html_path, "w", encoding="utf-8") as dst:
        dst.write(str(soup))
    print(f"[OK] テーブル枠線をインラインで強制 =>", html_path)

def remove_top_until_keyword_keep_previous(html_file, keyword="設定ガイド"):
    """Delete everything in <body> that comes before the keyword paragraph.

    The first <p> whose text contains ``keyword`` is located. The nearest <p>
    among its preceding siblings is kept as well (when there is one), and all
    body children before that point are removed. The file is rewritten in
    place; when no paragraph matches, nothing is removed.

    Args:
        html_file: Path of the UTF-8 HTML file to modify.
        keyword: Text identifying the first paragraph to keep.
    """
    from bs4 import BeautifulSoup

    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    soup = ensure_body(soup)
    body = soup.body

    # Match on the paragraph's full text. Filtering on ``.string`` misses any
    # <p> with more than one child (e.g. an anchor followed by text).
    target_p = next(
        (p for p in body.find_all("p") if keyword in p.get_text()), None
    )
    if target_p is None:
        print(f"'{keyword}' を含む <p> が見つからないためスキップします")
        with open(html_file, "w", encoding="utf-8") as f2:
            f2.write(str(soup))
        return

    prev_p = None
    for sib in target_p.previous_siblings:
        if sib.name == "p":
            prev_p = sib
            break

    # Keep the preceding <p> when there is one, otherwise the match itself.
    keep_node = prev_p if prev_p is not None else target_p

    # The paragraph may be nested inside another element. Climb to the
    # ancestor that is a direct child of <body>; otherwise the loop below
    # would never meet the node and would delete the whole document.
    while keep_node.parent is not None and keep_node.parent is not body:
        keep_node = keep_node.parent

    for child in list(body.children):
        if child is keep_node:
            break
        # extract() works for tags and text nodes alike, whereas text nodes
        # do not offer decompose() in every bs4 release.
        child.extract()

    with open(html_file, "w", encoding="utf-8") as f:
        f.write(str(soup))
    print(f"[OK] '{keyword}' より前の不要要素を削除 => {html_file}")

# =============== メイン処理 (Colab上で実行) ===============
def convert_word_to_html_with_cleanup(keyword="設定ガイド"):
    """Upload Word files in Colab, convert each to HTML, clean it up and download it.

    For every uploaded file: a .doc is first converted to .docx with
    LibreOffice, a .docx is used directly, and any other extension is skipped.
    The .docx is converted with Mammoth, the three clean-up passes are
    applied, and the resulting HTML is offered for download.

    Args:
        keyword: Passed to remove_top_until_keyword_keep_previous as the
            marker of the first paragraph to keep.
    """
    print("Word(.doc/.docx)ファイルをアップロードしてください...")
    uploaded = files.upload()
    if not uploaded:
        print("アップロードがキャンセルされました。終了します。")
        return

    for filename in uploaded:
        print(f"\n--- 処理を開始します: {filename} ---")
        stem, suffix = os.path.splitext(filename)
        suffix = suffix.lower()

        # Only Word documents are handled.
        if suffix not in (".doc", ".docx"):
            print("対象外の拡張子です。スキップします。")
            continue

        # 1) .doc needs a LibreOffice round-trip; .docx is used as is.
        docx_file = filename
        if suffix == ".doc":
            try:
                docx_file = doc_to_docx_via_libreoffice(filename)
            except Exception as err:
                print("doc->docx変換失敗:", err)
                continue

        # 2) docx -> html (Mammoth), e.g. sample.docx -> sample.html
        html_file = stem + ".html"
        mammoth_docx_to_html(docx_file, html_file)

        # 3) Post-processing: cover frame, table borders, leading content.
        for cleanup in (remove_top_frame, force_inline_table_border):
            cleanup(html_file)
        remove_top_until_keyword_keep_previous(html_file, keyword)

        # 4) Hand the result back to the browser (Colab).
        print(f"\n変換完了。ローカル保存: {html_file}")
        print("HTMLファイルをダウンロードします...")
        files.download(html_file)
        print("\n------------------------------------------")

# Run the upload -> convert -> clean-up -> download flow, keeping content from "設定ガイド" onward
convert_word_to_html_with_cleanup(keyword="設定ガイド")


Word(.doc/.docx)ファイルをアップロードしてください...


Saving Exp_SG_Workflow_General-jp.docx to Exp_SG_Workflow_General-jp (1).docx

--- 処理を開始します: Exp_SG_Workflow_General-jp (1).docx ---
[OK] Exp_SG_Workflow_General-jp (1).docx → Exp_SG_Workflow_General-jp (1).html (HTML化)
[OK] 表紙枠/画像を削除 => Exp_SG_Workflow_General-jp (1).html
[OK] テーブル枠線をインラインで強制 => Exp_SG_Workflow_General-jp (1).html
[OK] '設定ガイド' より前の不要要素を削除 => Exp_SG_Workflow_General-jp (1).html

変換完了。ローカル保存: Exp_SG_Workflow_General-jp (1).html
HTMLファイルをダウンロードします...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


------------------------------------------
