<a href="https://colab.research.google.com/github/iim0663418/LLMDataOptimization/blob/main/Pandoc_%E5%AF%A6%E7%8F%BE%E6%A0%BC%E5%BC%8F%E4%BF%9D%E7%95%99%E7%9A%84%E6%96%87%E6%AA%94%E8%BD%89%E6%8F%9B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Pandoc 是一個通用的文檔轉換工具，可以在多種格式之間轉換文檔，如 DOCX、ODT、Markdown、HTML、LaTeX 等。它能夠保留文檔的格式，並提供多種輸出選項。
這段程式碼只支援 DOCX、ODT 運用：
1. 先將文檔轉換為 HTML 格式，保留格式資訊。
2. 然後使用自定義的解析器從 HTML 中提取所需的結構化數據，並進一步轉換為其他格式。</br>

方法論轉換成純文字檔案的過程。

同時加入將 json 轉換成語言模型友善的 .txt 方法，藉此保留原有的格式

In [1]:
!sudo apt-get install pandoc
!pip install beautifulsoup4


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc-data
Suggested packages:
  texlive-latex-recommended texlive-xetex texlive-luatex pandoc-citeproc
  texlive-latex-extra context wkhtmltopdf librsvg2-bin groff ghc nodejs php
  python ruby libjs-mathjax libjs-katex citation-style-language-styles
The following NEW packages will be installed:
  libcmark-gfm-extensions0.29.0.gfm.3 libcmark-gfm0.29.0.gfm.3 pandoc
  pandoc-data
0 upgraded, 4 newly installed, 0 to remove and 45 not upgraded.
Need to get 20.6 MB of archives.
After this operation, 156 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [115 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libcmark-gfm-extensions0.29.0.gfm.3 amd64 0.29.0.gfm.3-3 [25.1 kB

In [2]:
import os

def create_directories():
    input_dir = "/content/input"
    output_dir = "/content/output"
    txtOutput_dir = "/content/output/txtOutput"
    log_dir = "/content/log"

    # 創建 /content/input 目錄
    os.makedirs(input_dir, exist_ok=True)
    print(f"Directory created: {input_dir}")

    # 創建 /content/output 目錄
    os.makedirs(output_dir, exist_ok=True)
    print(f"Directory created: {output_dir}")
    # 創建 /content/output/txtOutput 目錄
    os.makedirs(txtOutput_dir, exist_ok=True)
    print(f"Directory created: {txtOutput_dir}")
    # 創建 /content/log 目錄
    os.makedirs(log_dir, exist_ok=True)
    print(f"Directory created: {log_dir}")
# 執行函數創建目錄
create_directories()


Directory created: /content/input
Directory created: /content/output
Directory created: /content/output/txtOutput
Directory created: /content/log


In [16]:
from bs4 import BeautifulSoup
import re
import os
import json
import subprocess
import traceback

def convert_to_html(input_file, output_file):
    try:
        subprocess.run(['pandoc', '-o', output_file, input_file], check=True)
        print(f"Converted {input_file} to {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error converting {input_file}: {e}")
        raise

def parse_html_and_extract_data(html_file):
    with open(html_file, "r", encoding="utf-8") as file:
        contents = file.read()
        soup = BeautifulSoup(contents, "html.parser")

    content = []
    current_title = None
    current_paragraphs = []

    title_pattern = re.compile('^h[1-6]$')  # 預編正則表示式提高效率

    for element in soup.find_all(lambda tag: re.match(title_pattern, tag.name) or tag.name == 'p'):
        if re.match(title_pattern, element.name):  # 使用 re.match
            if current_title is not None:
                content.append({
                    'title': current_title,
                    'paragraphs': current_paragraphs
                })
            current_title = element.text.strip()
            current_paragraphs = []
        else:
            current_paragraphs.append(element.text.strip())

    if current_title is not None:
        content.append({
            'title': current_title,
            'paragraphs': current_paragraphs
        })

    return {'content': content}


def save_data_as_json(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4)
    print(f"Saved JSON to {output_file}")

def batch_process_documents(input_folder, output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    summary_data = []
    for filename in os.listdir(input_folder):
        if filename.lower().endswith(('.docx', '.odt')):
            base_name = os.path.splitext(filename)[0]
            input_file = os.path.join(input_folder, filename)
            html_file = os.path.join(output_folder, f"{base_name}.html")
            json_file = os.path.join(output_folder, f"{base_name}.json")

            try:
                convert_to_html(input_file, html_file)
                data = parse_html_and_extract_data(html_file)
                save_data_as_json(data, json_file)
                summary_data.append({base_name: data})
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                traceback.print_exc()

    summary_file = os.path.join(output_folder, "summary.json")
    with open(summary_file, 'w', encoding='utf-8') as f:
        json.dump(summary_data, f, indent=4)
    print(f"Summary saved to {summary_file}")

# 使用範例
input_folder = "/content/input"
output_folder = "/content/output"
batch_process_documents(input_folder, output_folder)


Converted /content/input/AI_Risk_Assessment_Database_Interpretation_Document.docx to /content/output/AI_Risk_Assessment_Database_Interpretation_Document.html
Saved JSON to /content/output/AI_Risk_Assessment_Database_Interpretation_Document.json
Summary saved to /content/output/summary.json


In [17]:
import json
import os

def json_to_txt(data, output_file):
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            for item in data:
                f.write(f"## 文件來源: {item['filename']}\n\n")

                for content in item['content']:
                    if 'title' in content:
                        f.write(f"### 標題: {content['title']}\n")
                    if 'paragraphs' in content:
                        for paragraph in content['paragraphs']:
                            f.write(f"{paragraph}\n\n")
                    if 'lists' in content:
                        for lst in content['lists']:
                            f.write("列表項目:\n")
                            for list_item in lst['items']:
                                f.write(f"- {list_item}\n")
                            f.write("\n")
                    if 'tables' in content:
                        for table in content['tables']:
                            f.write("表格內容:\n")
                            for row in table['rows']:
                                f.write(" | ".join(row) + "\n")
                            f.write("\n")

        print(f"Converted JSON data to {output_file}")

    except Exception as e:
        print(f"Failed to write to {output_file}: {e}")

def batch_json_to_single_txt(input_folder, output_txt_file, output_json_file):
    combined_data = []

    for filename in os.listdir(input_folder):
        if filename.endswith('.json'):
            json_file = os.path.join(input_folder, filename)
            try:
                with open(json_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                combined_data.append({
                    "filename": filename,
                    "content": data['content']  # Assuming data structure is adjusted as per new parse method
                })

            except Exception as e:
                print(f"Failed to process {json_file}: {e}")

    json_to_txt(combined_data, output_txt_file)

    try:
        with open(output_json_file, 'w', encoding='utf-8') as f:
            json.dump(combined_data, f, ensure_ascii=False, indent=4)
        print(f"Saved combined data to {output_json_file}")
    except Exception as e:
        print(f"Failed to write combined data to {output_json_file}: {e}")

# 使用範例
input_folder = "/content/output"
output_txt_file = "/content/output/txtOutput/combined_output.txt"
output_json_file = "/content/output/txtOutput/combined_output.json"

batch_json_to_single_txt(input_folder, output_txt_file, output_json_file)


Failed to process /content/output/summary.json: list indices must be integers or slices, not str
Converted JSON data to /content/output/txtOutput/combined_output.txt
Saved combined data to /content/output/txtOutput/combined_output.json


In [6]:
import shutil
import os

def zip_output_directory():
    output_dir = "/content/output"
    zip_file_path = "/content/output_archive.zip"

    # 確保 output 目錄存在
    if os.path.exists(output_dir):
        # 使用 shutil.make_archive 創建 ZIP 文件
        shutil.make_archive(zip_file_path.replace('.zip', ''), 'zip', output_dir)
        print(f"Directory {output_dir} has been zipped into {zip_file_path}")
    else:
        print(f"Directory {output_dir} does not exist")

# 執行打包
zip_output_directory()


Directory /content/output has been zipped into /content/output_archive.zip
