In [2]:
import re
import pandas as pd
from docx import Document
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_LINE_SPACING
from docx.oxml.ns import qn
import os
from openpyxl import load_workbook
from PIL import Image
from tqdm import tqdm  # 导入tqdm

In [3]:
def read_excel(file_path):
    """读取Excel文件"""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Excel文件 {file_path} 不存在。")
    try:
        return pd.read_excel(file_path)
    except Exception as e:
        raise Exception(f"读取Excel文件时出错: {e}")


def extract_images_from_excel(excel_file):
    """从Excel中提取图片并保存"""
    image_folder = "extracted_images"
    if not os.path.exists(image_folder):
        os.makedirs(image_folder)

    wb = load_workbook(excel_file)
    sheet = wb.active

    image_paths = []

    image_index = 1
    for row_index, row in enumerate(sheet.iter_rows(), start=1):
        for img in sheet._images:
            if img.anchor._from.row == row_index - 1:
                img_data = img.ref
                image_file_path = os.path.join(image_folder, f"image_{image_index}.png")
                with open(image_file_path, 'wb') as f:
                    f.write(img_data.getvalue())

                image_paths.append(image_file_path)
                image_index += 1

    return image_paths


def resize_image(image_path, output_path, max_size=(800, 800)):
    """调整图片大小并确保其模式是RGB"""
    try:
        img = Image.open(image_path)
        
        # 如果图像是CMYK模式，转换为RGB模式
        if img.mode == 'CMYK':
            img = img.convert('RGB')
        
        img.thumbnail(max_size)  # 调整大小
        img.save(output_path, format='PNG')  # 保存为PNG格式
    except Exception as e:
        print(f"图像调整大小时出错: {e}")
        return False
    return True


def create_placeholder_image(image_path):
    """创建占位符图像"""
    img = Image.new('RGB', (1, 1), color=(255, 255, 255))
    img.save(image_path)


def set_run_font(run, font_name, font_size):
    """设置字体和字号"""
    run.font.name = font_name
    run.font.size = Pt(font_size)
    run.font.color.rgb = RGBColor(0, 0, 0)  # 黑色字体
    run.font._element.rPr.rFonts.set(qn('w:eastAsia'), font_name)

In [6]:
def set_paragraph_format(para, is_title=False):
    """设置段落格式"""
    para.paragraph_format.line_spacing_rule = WD_LINE_SPACING.EXACTLY
    para.paragraph_format.line_spacing = Pt(28.8)
    para.paragraph_format.space_before = Pt(0)
    para.paragraph_format.space_after = Pt(0)

    if is_title:
        para.paragraph_format.first_line_indent = Pt(0)  # 标题不缩进
    else:
        para.paragraph_format.first_line_indent = Pt(31.5)  # 正文首行缩进31.5磅

    para.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY  # 两端对齐


def clean_text(text):
    """去除正文中的多余空格，并保留英文单词之间的空格"""
    # 去除所有类型的空白字符，确保每个单词之间仅有一个空格
    # 替换多种类型的空白字符为标准空格，并清理多余的空格
    text = re.sub(r'[\s\u200B\u00A0\u2002\u2003\u2009\u202F]+', ' ', text)  # 替换空格和特殊空白字符为单个空格
    text = re.sub(r'\s+', ' ', text)  # 替换多个空格为一个
    text = text.strip()  # 去掉文本前后的空格
    return text





In [8]:
def create_word_from_excel(df, image_paths, output_path):
    """根据Excel数据创建Word文件"""
    required_columns = ['新闻标题', '时间', '问题一', '问题二', '问题三', '问题四', '问题五', '信息来源']  # 人物
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"DataFrame中缺少必要的列: {', '.join(missing_columns)}")

    doc = Document()
    image_index = 0
    total_images = len(image_paths)

    title_number = 2478  # 标题序号初始化
# 使用tqdm显示进度条
    for index, row in tqdm(df.iterrows(), total=len(df), desc="生成Word文档", unit="条"):
        # 生成组合标题
        title_parts = []

        # 对于每个部分，检查其是否为空，如果为空则使用“ - ”；；根据标题元素的顺序对内容进行上下调整
        title_parts.append(row['新闻标题'] if pd.notnull(row['新闻标题']) else " - ")
        title_parts.append(row['时间'] if pd.notnull(row['时间']) else " - ")
        title_parts.append(row['问题一'] if pd.notnull(row['问题一']) else " - ")
        title_parts.append(row['问题二'] if pd.notnull(row['问题二']) else " - ")
        title_parts.append(row['问题三'] if pd.notnull(row['问题三']) else " - ")
        title_parts.append(row['问题四'] if pd.notnull(row['问题四']) else " - ")
        title_parts.append(row['问题五'] if pd.notnull(row['问题五']) else " - ")
        title_parts.append(row['信息来源'] if pd.notnull(row['信息来源']) else " - ")
        # 组合标题，并添加序号
        title = f"{title_number}. " + "/".join(title_parts)  # 用“/”分隔并添加序号

        # title = row['标题']######直接导入标题，不需要组合时
        heading = doc.add_heading(title, level=1)

        # 设置标题字体为黑体，字号16pt
        for run in heading.runs:
            set_run_font(run, '黑体', 16)

        # 设置标题段落格式
        heading_paragraph = heading.paragraph_format
        heading_paragraph.line_spacing_rule = WD_LINE_SPACING.EXACTLY
        heading_paragraph.line_spacing = Pt(28.8)
        heading_paragraph.space_before = Pt(0)
        heading_paragraph.space_after = Pt(0)

        # 插入图片
        if image_index < total_images:
            image_path = image_paths[image_index]
            try:
                resized_image_path = f"resized_{image_index + 1}.png"
                if resize_image(image_path, resized_image_path):
                    paragraph = doc.add_paragraph()
                    run = paragraph.add_run()
                    run.add_picture(resized_image_path, width=Inches(3))####设置图片大小
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    image_index += 1
                    os.remove(resized_image_path)
                else:
                    placeholder_path = "placeholder_image.png"
                    create_placeholder_image(placeholder_path)
                    paragraph = doc.add_paragraph()
                    run = paragraph.add_run()
                    run.add_picture(placeholder_path, width=Inches(3))####设置图片大小
                    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                    os.remove(placeholder_path)
            except Exception as e:
                print(f"插入图片时出错: {e}")
                placeholder_path = "placeholder_image.png"
                create_placeholder_image(placeholder_path)
                paragraph = doc.add_paragraph()
                run = paragraph.add_run()
                run.add_picture(placeholder_path, width=Inches(3))####设置图片大小
                paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
                os.remove(placeholder_path)

        # 插入正文
        if str(row['问题七']) == '未知':
            text = str(row['正文'])
        else:
            text = str(row['问题七'])
            
        clean_text_content = clean_text(text)  # 去除正文中的多余空格
        text_paragraph = doc.add_paragraph(clean_text_content)
        # 设置正文为仿宋，字号16.5pt
        for run in text_paragraph.runs:
            set_run_font(run, '仿宋', 16.5)

        # 设置正文段落格式
        set_paragraph_format(text_paragraph)  # 正文首行缩进，格式设置

        # 更新标题序号
        title_number += 1

    doc.save(output_path)
    print(f"Word文档已保存到 {output_path}")


# 使用示例
if __name__ == "__main__":
    # 读取Excel文件
    excel_file = r"D:\学习\Naraka\Book1.xlsx"  # Excel文件路径
    df = read_excel(excel_file)
    df = df.iloc[125:]
    # 提取图片
    image_paths = extract_images_from_excel(excel_file)

    # 输出的Word文件路径
    output_path = r"D:\学习\Naraka\加勒比地区数据\Book1.docs"

    # 创建Word文档
    create_word_from_excel(df, image_paths, output_path)

生成Word文档: 100%|███████████████████████████████████████████████████████████████████| 11/11 [00:00<00:00, 407.14条/s]

Word文档已保存到 D:\学习\Naraka\加勒比地区数据\Book1.docs



