In [1]:
# 引入所需的库和模块
# 先在terminal中输入: pip install python-docx
from modelscope import AutoModelForCausalLM, AutoTokenizer
from modelscope import GenerationConfig
import tqdm, os, docx
from datetime import datetime

# Load the pretrained "Xunzillm4cc/Xunzi-Qwen-Chat" model and its matching
# tokenizer (the tokenizer converts text to and from model inputs).
# NOTE(review): trust_remote_code=True presumably allows code shipped with the
# model repository to run locally - confirm the source is trusted.
print("尝试加载模型中...")
try:
    tokenizer = AutoTokenizer.from_pretrained("Xunzillm4cc/Xunzi-Qwen-Chat", trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained("Xunzillm4cc/Xunzi-Qwen-Chat", device_map="auto", trust_remote_code=True).eval()
except Exception as e:
    print(f"加载模型时发生错误: {e}")
    # Nothing below can work without the model. Re-raise so the real cause is
    # reported here instead of surfacing later as a NameError on `model`.
    raise
else:
    print("模型和tokenizer加载完成。")

# Smoke-test the model: send the greeting "你好" and print the reply.
print("生成初始文本的回应中...")
response, history = model.chat(tokenizer, "你好", history=None)
print("初始回应：", response)

def read_text_from_file(input_file):
    """Read the non-empty paragraphs of a text or Word document.

    Args:
        input_file: Path to a ``.txt``, ``.doc`` or ``.docx`` file. The
            extension is matched case-insensitively and selects the reader.

    Returns:
        A list of whitespace-stripped strings in file order: one per
        non-blank line for ``.txt``, one per non-blank paragraph for Word
        documents.

    Raises:
        ValueError: If the file extension is not one of the supported ones.
    """
    file_ext = os.path.splitext(input_file)[1].lower()

    if file_ext == '.txt':
        # 'utf-8-sig' reads plain UTF-8 unchanged but also drops a leading
        # byte-order mark; with plain 'utf-8' the BOM would stay glued to the
        # first paragraph because str.strip() does not treat it as whitespace.
        with open(input_file, 'r', encoding='utf-8-sig') as file:
            stripped_lines = (line.strip() for line in file)
            return [text for text in stripped_lines if text]

    if file_ext in ('.doc', '.docx'):
        # NOTE(review): python-docx is documented for .docx packages; confirm
        # that legacy binary .doc files actually load through this path.
        doc = docx.Document(input_file)
        stripped_paras = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped_paras if text]

    raise ValueError("不支持的文件类型。请提供.txt或.doc/.docx文件。")


def process_text_file(input_file):
    """Punctuate every paragraph of a document with the chat model.

    Reads the paragraphs of ``input_file`` via ``read_text_from_file``, sends
    each one to ``model.chat`` with a prompt asking for punctuation to be
    added, and writes each reply followed by a blank line to a new
    ``output_<timestamp>.txt`` file in the current working directory.

    Args:
        input_file: Path of the document to process.

    Returns:
        None. If ``input_file`` does not exist, a message is printed and the
        function returns without creating an output file.

    Raises:
        ValueError: Propagated from ``read_text_from_file`` for unsupported
            file types. Errors on individual paragraphs are reported and
            skipped rather than raised.
    """
    if not os.path.exists(input_file):
        print(f"错误: 文件 {input_file} 不存在。")
        print("请确保文件路径正确且文件存在于指定位置。")
        return

    # Announce the read before doing it so the log reflects the real order.
    print(f"正在打开文件 {input_file} 进行读取...")
    paragraphs = read_text_from_file(input_file)

    total = len(paragraphs)
    print(f"总段落数量: {total}")

    # Timestamped name so repeated runs never overwrite earlier results.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = f'output_{timestamp}.txt'
    print(f"输出文件将被保存为：{output_file}")

    with open(output_file, 'w', encoding='utf-8') as outfile, tqdm.tqdm(total=total, desc="Processing", unit="para", ncols=75) as progress:
        for number, para in enumerate(paragraphs, start=1):
            try:
                # Each paragraph is sent as an independent request (no chat history).
                input_text = '请对冒号后的话正确添加标点符号（断句），如果不需要就直接回复原文：' + para
                response, _ = model.chat(tokenizer, input_text, history=None)

                outfile.write(response + '\n\n')
                # Flush per paragraph so finished work survives an interrupted run.
                outfile.flush()
            except Exception as e:
                # Best effort: report the failing paragraph and keep going.
                # tqdm.write prints without corrupting the active progress bar.
                tqdm.tqdm.write(f"处理段落 {number} 时发生错误: {e}")
                tqdm.tqdm.write("尝试检查模型的输入是否正确或联系技术支持。")
            finally:
                # Count failed paragraphs too, so the bar always reaches the total.
                progress.update(1)

    print('处理完毕！')
    print(f"处理结果已保存到 {output_file}")

# Name of the input file to process (change this to match your actual file),
# then run the punctuation pipeline on it.
input_file = '史料辑录2.docx'
process_text_file(input_file)


2024-04-22 23:48:11,047 - modelscope - INFO - PyTorch version 2.1.2+cu121 Found.
2024-04-22 23:48:11,050 - modelscope - INFO - TensorFlow version 2.14.0 Found.
2024-04-22 23:48:11,050 - modelscope - INFO - Loading ast index from /mnt/workspace/.cache/modelscope/ast_indexer
2024-04-22 23:48:11,325 - modelscope - INFO - Loading done! Current index file version is 1.13.3, with md5 55e7043102d017111a56be6e6d7a6a16 and a total number of 972 components indexed
  from .autonotebook import tqdm as notebook_tqdm
2024-04-22 23:48:13.040423: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-04-22 23:48:13.067754: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-04-22 23:48:13.108218: E tensorflow/compiler/xla/stream_

尝试加载模型中...


  return self.fget.__get__(instance, owner)()
Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.33s/it]


模型和tokenizer加载完成。
生成初始文本的回应中...
初始回应： 你好！很高兴能为你提供帮助。有什么我可以帮忙的吗？
正在打开文件 史料辑录2.docx 进行读取...
总段落数量: 1092
输出文件将被保存为：output_20240422_234824.txt


Processing:  11%|██▎                  | 123/1092 [07:06<56:03,  3.47s/para]


KeyboardInterrupt: 