In [None]:
# ==============================================================
# 0⃣️ 安装依赖
# ============================================================== 
!pip -q install --upgrade openai chardet tqdm

# ==============================================================
# 1⃣️ DeepSeek API 基本配置
# ============================================================== 
import os, re, json, unicodedata, datetime, time
from pathlib import Path
from typing  import List, Tuple, Union
import openai, chardet
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

# Prefer the DEEPSEEK_API_KEY environment variable so a real key never has to
# be pasted into the notebook; the literal is only a fallback placeholder.
DEEPSEEK_API_KEY  = os.environ.get("DEEPSEEK_API_KEY", "your-default-api-key")
DEEPSEEK_URL      = "https://api.deepseek.com"
DEEPSEEK_MODEL    = "deepseek-chat"

openai.api_key = DEEPSEEK_API_KEY
deep_client    = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

# ==============================================================
# 2⃣️ User parameters
# ==============================================================
SRC_ROOT        = "/content/novels_chapters"        # ← top-level directory; each sub-directory is one novel
OUT_EN_ROOT     = "/content/translate_EN"           # ← root directory for the English translations
OUT_ZH_ROOT     = "/content/translate_back_ZH"      # ← root directory for the Chinese back-translations
CHAPTER_RANGE   = [1,3,5,25,50,75,100,125]                          # ← chapters to translate: a (start, end) tuple or a list such as [3,7,9]
WORKERS         = 2                              # number of parallel threads

PROMPT_ZH2EN = "You are a professional translator. Translate the following Chinese literary text into vivid, fluent English. Keep paragraph breaks. Output ONLY the translation:"
PROMPT_EN2ZH = "你是一位专业文学译者，请将下面的英文文本精准地译回中文，保持段落划分，不要加入额外说明，只输出译文："

# ==============================================================
# 3⃣️ 工具函数
# ============================================================== 
_CHAP_NO_RE = re.compile(r"(\d{1,4})")

def _chapter_no(path: Path) -> int:
    m = _CHAP_NO_RE.search(path.stem)
    if not m:
        raise ValueError(f"文件名无法识别章节号: {path}")
    return int(m.group(1))

def _auto_read(path: Path) -> str:
    """Read a text file of unknown encoding and return its content as ``str``.

    The encoding is guessed with ``chardet`` (UTF-8 when it has no guess);
    bytes that cannot be decoded are dropped instead of raising.
    """
    raw = path.read_bytes()
    enc = chardet.detect(raw)["encoding"] or "utf-8"
    # chardet commonly reports GB2312 for Chinese text that also contains
    # GBK/GB18030-only characters; with errors="ignore" those characters would
    # vanish silently.  GB18030 decodes all three, so widen the guess.
    if enc.lower() in ("gb2312", "gbk"):
        enc = "gb18030"
    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        # The detector may name an encoding Python ships no codec for
        # (e.g. EUC-TW); fall back to UTF-8 rather than crash the run.
        return raw.decode("utf-8", errors="ignore")

def _call_deepseek(prompt: str, text: str) -> str:
    """Ask the DeepSeek chat model to process ``text`` under ``prompt``.

    ``prompt`` is sent as the system message and ``text`` as the user message.
    Returns the content of the first choice, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text},
    ]
    reply = deep_client.chat.completions.create(
        model=DEEPSEEK_MODEL,
        messages=conversation,
        temperature=0.1,
        max_tokens=4096,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content.strip()

# ==============================================================
# 4⃣️ 翻译单本小说 translate_one_book()
# ============================================================== 
def translate_one_book(book_dir: Path,
                       out_en_root: Path,
                       out_zh_root: Path,
                       chapters: Union[Tuple[int, int], List[int]],
                       workers: int = 4):
    """Translate selected chapters of one novel to English and back to Chinese.

    Args:
        book_dir: Directory containing the novel's chapter ``*.txt`` files.
        out_en_root: Root for English output; a sub-directory named after the
            book is created beneath it.
        out_zh_root: Root for the back-translated Chinese output, same layout.
        chapters: Either an inclusive ``(start, end)`` tuple or an explicit
            list of chapter numbers.
        workers: Number of threads translating chapters concurrently.

    Raises:
        FileNotFoundError: If a requested chapter has no matching file.
    """
    if isinstance(chapters, tuple):
        s, e = chapters
        chap_set = set(range(s, e + 1))
    else:
        chap_set = set(chapters)

    #── Collect the chapter files and check that none is missing ──────
    file_pairs = []
    for fp in book_dir.glob("*.txt"):
        try:
            no = _chapter_no(fp)
        except ValueError:
            # A stray .txt without any digits in its name is not a chapter;
            # ignore it instead of aborting the whole book.
            continue
        if no in chap_set:
            file_pairs.append((no, fp))
            chap_set.remove(no)

    if chap_set:
        raise FileNotFoundError(f"《{book_dir.name}》缺少章节: {sorted(chap_set)}")

    file_pairs.sort()       # order by chapter number

    #── Prepare the per-book output directories ──────────────────────
    en_dir = out_en_root / book_dir.name
    zh_dir = out_zh_root / book_dir.name
    en_dir.mkdir(parents=True, exist_ok=True)
    zh_dir.mkdir(parents=True, exist_ok=True)

    #── Translate a single chapter ───────────────────────────────────
    def _translate(no: int, fp: Path):
        zh_text = _auto_read(fp)

        # zh → en
        en_text = _call_deepseek(PROMPT_ZH2EN, zh_text)
        en_path = en_dir / f"{fp.stem}_en.txt"
        en_path.write_text(en_text, encoding="utf-8")

        # en → zh (back-translation)
        zh_back = _call_deepseek(PROMPT_EN2ZH, en_text)
        zh_path = zh_dir / f"{fp.stem}_back_cn.txt"
        zh_path.write_text(zh_back, encoding="utf-8")
        return fp.name

    #── Run the chapters in parallel ─────────────────────────────────
    with ThreadPoolExecutor(max_workers=workers) as ex:
        list(tqdm(ex.map(lambda t: _translate(*t), file_pairs),
                  total=len(file_pairs),
                  desc=f"🔄 {book_dir.name}"))

# ==============================================================
# 5⃣️ 翻译全集 translate_library()
# ============================================================== 
def translate_library(src_root: str,
                      out_en_root: str,
                      out_zh_root: str,
                      chapters: Union[Tuple[int, int], List[int]],
                      workers: int = 4):
    """Translate the given chapters of every novel found under ``src_root``.

    Each immediate sub-directory of ``src_root`` is treated as one novel and
    handed to ``translate_one_book``.

    Raises:
        FileNotFoundError: If ``src_root`` has no sub-directory.
    """
    src_root, out_en_root, out_zh_root = (
        Path(src_root), Path(out_en_root), Path(out_zh_root))

    novels = []
    for entry in src_root.iterdir():
        if entry.is_dir():
            novels.append(entry)
    if len(novels) == 0:
        raise FileNotFoundError(f"{src_root} 下未找到任何小说子目录")

    for novel_dir in novels:
        translate_one_book(novel_dir, out_en_root, out_zh_root, chapters, workers)

    print("\n✅ 全部小说翻译完成")
    print(f"英文译文目录 : {out_en_root}")
    print(f"中文回译目录 : {out_zh_root}")

# ==============================================================
# 6⃣️ 运行
# ============================================================== 
# Kick off the translation of the whole library with the settings above.
translate_library(
    src_root=SRC_ROOT,
    out_en_root=OUT_EN_ROOT,
    out_zh_root=OUT_ZH_ROOT,
    chapters=CHAPTER_RANGE,
    workers=WORKERS,
)

In [9]:
# ==============================================================
# 0⃣️ 安装依赖
# ============================================================== 
!pip -q install --upgrade openai chardet tqdm

# ==============================================================
# 1⃣️ DeepSeek API 基本配置
# ============================================================== 
import os, re, json, unicodedata, datetime, time, random
from pathlib import Path
from typing  import List, Tuple, Union
import openai, chardet
from tqdm.auto import tqdm
from concurrent.futures import ThreadPoolExecutor

# Take the key from the DEEPSEEK_API_KEY environment variable when it is set,
# so the secret does not have to live in the notebook source.
DEEPSEEK_API_KEY  = os.environ.get("DEEPSEEK_API_KEY", "your-default-api-key")
DEEPSEEK_URL      = "https://api.deepseek.com"
DEEPSEEK_MODEL    = "deepseek-chat"

openai.api_key = DEEPSEEK_API_KEY
deep_client    = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

# ==============================================================
# 2⃣️ User parameters
# ==============================================================
SRC_ROOT        = "/content/novels_normalized"
OUT_EN_ROOT     = "/content/translate_EN_normalized"
OUT_ZH_ROOT     = "/content/translate_back_ZH_normalized"
CHAPTER_RANGE   = (1, 200)       # ✅ either a (start, end) tuple or a list such as [1,3,5,10]
N_CHAPTERS      = 8
WORKERS         = 2

PROMPT_ZH2EN = "You are a professional translator. Translate the following Chinese literary text into vivid, fluent English. Keep paragraph breaks. Output ONLY the translation:"
PROMPT_EN2ZH = "你是一位专业文学译者，请将下面的英文文本精准地译回中文，保持段落划分，不要加入额外说明，只输出译文："

# ==============================================================
# 3⃣️ 工具函数
# ============================================================== 
_CHAP_NO_RE = re.compile(r"(\d{1,4})")

def _chapter_no(path: Path) -> int:
    m = _CHAP_NO_RE.search(path.stem)
    if not m:
        raise ValueError(f"文件名无法识别章节号: {path}")
    return int(m.group(1))

def _auto_read(path: Path) -> str:
    """Return the text of ``path``, guessing its encoding with ``chardet``.

    Falls back to UTF-8 when no encoding is detected.  Undecodable bytes are
    ignored, so the function does not raise on malformed input.
    """
    raw = path.read_bytes()
    enc = chardet.detect(raw)["encoding"] or "utf-8"
    # A GB2312/GBK guess is widened to GB18030: it decodes the same bytes and
    # also covers the extra characters that errors="ignore" would otherwise
    # drop without any warning.
    if enc.lower() in ("gb2312", "gbk"):
        enc = "gb18030"
    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        # Detected name has no Python codec (e.g. EUC-TW) — use UTF-8.
        return raw.decode("utf-8", errors="ignore")

def _call_deepseek(prompt: str, text: str) -> str:
    """Run one chat completion against DeepSeek.

    The system message carries ``prompt`` and the user message carries
    ``text``; the stripped content of the first returned choice is the result.
    """
    request = {
        "model": DEEPSEEK_MODEL,
        "messages": [{"role": "system", "content": prompt},
                     {"role": "user", "content": text}],
        "temperature": 0.1,
        "max_tokens": 4096,
    }
    completion = deep_client.chat.completions.create(**request)
    answer = completion.choices[0].message.content
    return answer.strip()

# ==============================================================
# 4⃣️ 翻译单本小说 translate_one_book()
# ============================================================== 
def translate_one_book(book_dir: Path,
                       out_en_root: Path,
                       out_zh_root: Path,
                       chapters: List[int],
                       workers: int = 4):
    """Translate the listed chapters of one novel (zh → en → zh).

    Args:
        book_dir: Directory with the novel's chapter ``*.txt`` files.
        out_en_root: Root for English output; a per-book sub-directory is created.
        out_zh_root: Root for back-translated Chinese output, same layout.
        chapters: Chapter numbers to translate.
        workers: Number of threads translating chapters concurrently.

    Raises:
        FileNotFoundError: If a requested chapter has no matching file.
    """
    chap_set = set(chapters)

    file_pairs = []
    for fp in book_dir.glob("*.txt"):
        try:
            no = _chapter_no(fp)
        except ValueError:
            # Files without digits in their name are not chapters; skipping
            # them keeps one stray .txt from aborting the whole book.
            continue
        if no in chap_set:
            file_pairs.append((no, fp))
            chap_set.remove(no)

    if chap_set:
        raise FileNotFoundError(f"《{book_dir.name}》缺少章节: {sorted(chap_set)}")

    file_pairs.sort()  # process in chapter order

    en_dir = out_en_root / book_dir.name
    zh_dir = out_zh_root / book_dir.name
    en_dir.mkdir(parents=True, exist_ok=True)
    zh_dir.mkdir(parents=True, exist_ok=True)

    def _translate(no: int, fp: Path):
        zh_text = _auto_read(fp)

        # Chinese → English
        en_text = _call_deepseek(PROMPT_ZH2EN, zh_text)
        en_path = en_dir / f"{fp.stem}_en.txt"
        en_path.write_text(en_text, encoding="utf-8")

        # English → Chinese (back-translation of the English output)
        zh_back = _call_deepseek(PROMPT_EN2ZH, en_text)
        zh_path = zh_dir / f"{fp.stem}_back_cn.txt"
        zh_path.write_text(zh_back, encoding="utf-8")
        return fp.name

    with ThreadPoolExecutor(max_workers=workers) as ex:
        list(tqdm(ex.map(lambda t: _translate(*t), file_pairs),
                  total=len(file_pairs),
                  desc=f"🔄 {book_dir.name}"))

# ==============================================================
# 5⃣️ 翻译全集 translate_library()
# ============================================================== 
def translate_library(src_root: str,
                      out_en_root: str,
                      out_zh_root: str,
                      chapter_range: Union[Tuple[int, int], List[int]],
                      n_chapters: int,
                      workers: int = 4):
    """Translate ``n_chapters`` randomly chosen chapters of every novel.

    ``chapter_range`` is either an inclusive ``(start, end)`` tuple or an
    explicit list of candidate chapter numbers; a fresh sample is drawn for
    each novel directory under ``src_root``.

    Raises:
        ValueError: If ``n_chapters`` exceeds the number of candidates.
        FileNotFoundError: If ``src_root`` has no sub-directory.
    """
    src_root, out_en_root, out_zh_root = (
        Path(src_root), Path(out_en_root), Path(out_zh_root))

    # ✅ Expand the chapter range into a concrete list of candidates
    if isinstance(chapter_range, tuple):
        first, last = chapter_range[0], chapter_range[1]
        full_range = [*range(first, last + 1)]
    else:
        full_range = [*chapter_range]

    if len(full_range) < n_chapters:
        raise ValueError(f"指定章节数 n_chapters={n_chapters} 超过章节范围长度 {len(full_range)}")

    novels = []
    for entry in src_root.iterdir():
        if entry.is_dir():
            novels.append(entry)
    if len(novels) == 0:
        raise FileNotFoundError(f"{src_root} 下未找到任何小说子目录")

    for novel_dir in novels:
        picked = random.sample(full_range, k=n_chapters)
        translate_one_book(novel_dir, out_en_root, out_zh_root, picked, workers)

    print("\n✅ 全部小说翻译完成")
    print(f"英文译文目录 : {out_en_root}")
    print(f"中文回译目录 : {out_zh_root}")

# ==============================================================
# 6⃣️ 运行翻译任务
# ============================================================== 
# Start the sampled translation run using the parameters configured above.
translate_library(
    src_root=SRC_ROOT,
    out_en_root=OUT_EN_ROOT,
    out_zh_root=OUT_ZH_ROOT,
    chapter_range=CHAPTER_RANGE,
    n_chapters=N_CHAPTERS,
    workers=WORKERS,
)

🔄 《奋斗在新明朝》（校对版全本）作者：随轻风去_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

🔄 《反正我是超能力者》（校对版全本）作者：吃书妖_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

🔄 《天可汗》（校对版全本）作者：西风紧_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

🔄 《崩坏世界的传奇大冒险》（精校版全本）作者：国王陛下_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

🔄 《全球进化》（精校版全本）作者：咬狗_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

🔄 《武林半侠传》（校对版全本）作者：文抄公_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [8]:
import os
from pathlib import Path
import re
import unicodedata

# Path configuration
TRANSLATED_ROOT = Path("/content/translate_EN")   # scanned below for per-book "*_en.txt" files
NOVELS_DIR      = Path("/content/novels")         # entries here supply the original titles
OUTPUT_DIR      = Path("/content/randomseed")     # one chapter-list file per book is written here

# Create the output directory (and any missing parents) up front.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Regular expression used to locate a chapter number
chapter_num_re = re.compile(r"(\d{1,4})")


# Extract the chapter number
def extract_chapter_no(filename: str) -> int:
    """Return the first run of one to four digits in ``filename`` as an int.

    Raises:
        ValueError: If ``filename`` contains no digit.
    """
    hit = chapter_num_re.search(filename)
    if hit is None:
        raise ValueError(f"无法解析章节号: {filename}")
    return int(hit.group(1))

# Normalise a book title (drops punctuation, whitespace, full-width symbols, ...)
def normalize(text: str) -> str:
    """Reduce a title or file name to a comparison key.

    Steps: NFKC-normalise (folds full-width forms), lower-case, drop a
    trailing ``.txt``, then remove separators, brackets, punctuation and
    whitespace.
    """
    text = unicodedata.normalize("NFKC", text)
    # Lower-case BEFORE stripping the extension so that ".TXT"/".Txt" are
    # removed too instead of leaving a stray "txt" in the key.
    text = text.lower()
    text = re.sub(r"\.txt$", "", text)  # drop the .txt suffix
    text = re.sub(r"[_\-：:（）\(\)\[\]【】·•—…,.!！?？\s]", "", text)
    return text

# Map a translated directory name back to the original title when possible
def match_original_title(trans_name: str, original_names: list) -> str:
    """Return the original title whose normalised form equals that of ``trans_name``.

    When several originals share the same normalised form the last one wins;
    when none matches, ``trans_name`` itself is returned unchanged.
    """
    wanted = normalize(trans_name)
    resolved = trans_name
    for candidate in original_names:
        if normalize(candidate) == wanted:
            # keep overwriting so the last equal candidate is the result
            resolved = candidate
    return resolved

# Collect the original titles (both files and directories count)
original_titles = [d.name for d in NOVELS_DIR.iterdir() if d.is_file() or d.is_dir()]
# Normalised forms of every original title, built once for the match test below
known_titles = {normalize(name) for name in original_titles}

unmatched_books = []

# Walk the translated books
for book_dir in TRANSLATED_ROOT.iterdir():
    if not book_dir.is_dir():
        continue

    # Chapter numbers that already have an English translation
    chapter_nos = set()
    for file in book_dir.glob("*_en.txt"):
        try:
            chapter_nos.add(extract_chapter_no(file.name))
        except ValueError:
            # file name carries no chapter number — not a chapter output
            continue

    chapter_nos = sorted(chapter_nos)
    if not chapter_nos:
        continue

    matched_name = match_original_title(book_dir.name, original_titles)
    # Decide "unmatched" from the lookup itself.  Comparing matched_name with
    # book_dir.name would also flag books whose original title is spelled
    # exactly like the translated directory.
    if normalize(book_dir.name) not in known_titles:
        unmatched_books.append(book_dir.name)

    # ✅ Build the output name with a _randomseed.txt suffix to avoid clashes.
    # Only a trailing ".txt" is removed; a ".txt" in the middle of the name
    # must not truncate the title.
    base_name = matched_name[:-len(".txt")] if matched_name.endswith(".txt") else matched_name
    final_name = base_name + "_randomseed.txt"
    out_path = OUTPUT_DIR / final_name

    with out_path.open("w", encoding="utf-8") as f:
        f.write("\n".join(str(no) for no in chapter_nos))

# ✅ Save the list of book names that could not be matched
if unmatched_books:
    unmatched_path = OUTPUT_DIR / "unmatched_books.txt"
    with unmatched_path.open("w", encoding="utf-8") as f:
        f.write("以下书名未匹配到 novels 原始标题：\n\n")
        for name in unmatched_books:
            f.write(f"{name}\n")
    print(f"\n📄 未匹配书名清单已保存到：{unmatched_path}")

print("\n✅ 所有章节记录已保存到 randomseed 文件夹（统一命名为 *_randomseed.txt）。")


✅ 所有章节记录已保存到 randomseed 文件夹（统一命名为 *_randomseed.txt）。


In [16]:
# ==============================================================
# 0⃣️ 安装依赖
# ============================================================== 
!pip -q install --upgrade openai google-generativeai chardet tqdm

# ==============================================================
# 1⃣️ 配置
# ============================================================== 
import os, re, json, unicodedata, time, random
from pathlib import Path
from typing import List, Tuple, Union
from tqdm.auto import tqdm
import chardet
from concurrent.futures import ThreadPoolExecutor

# DeepSeek
import openai
# Keys are read from the environment when set, so real secrets need not be
# stored in the notebook; the literals are fallback placeholders only.
DEEPSEEK_API_KEY  = os.environ.get("DEEPSEEK_API_KEY", "your-default-api-key")
DEEPSEEK_URL      = "https://api.deepseek.com"
openai.api_key    = DEEPSEEK_API_KEY
deep_client       = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

# Gemini
import google.generativeai as genai
GEMINI_API_KEY     = os.environ.get("GEMINI_API_KEY", "your-default-api-key")
GEMINI_MODEL       = "gemini-2.0-flash"  # a "pro" model is recommended
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(GEMINI_MODEL)

# Translation engine selection
TRANSLATION_ENGINE = "gemini"  # options: "deepseek" / "gemini"

# Path parameters
SRC_ROOT    = "/content/novels_normalized"
OUT_EN_ROOT = "/content/translate_EN_normalized"
OUT_ZH_ROOT = "/content/translate_back_ZH_normalized"
CHAPTER_RANGE = (1, 200)
N_CHAPTERS    = 8
WORKERS       = 2

PROMPT_ZH2EN = "You are a professional translator. Translate the following Chinese literary text into vivid, fluent English. Keep paragraph breaks. Output ONLY the translation:"
PROMPT_EN2ZH = "你是一位专业文学译者，请将下面的英文文本精准地译回中文，保持段落划分，不要加入额外说明，只输出译文："

# ==============================================================
# 2⃣️ 工具函数
# ============================================================== 
_CHAP_NO_RE = re.compile(r"(\d{1,4})")

def _chapter_no(path: Path) -> int:
    m = _CHAP_NO_RE.search(path.stem)
    if not m:
        raise ValueError(f"文件名无法识别章节号: {path}")
    return int(m.group(1))

def _auto_read(path: Path) -> str:
    """Read ``path`` as text using the encoding ``chardet`` detects.

    UTF-8 is used when detection yields nothing; bytes that do not decode are
    silently skipped.
    """
    raw = path.read_bytes()
    enc = chardet.detect(raw)["encoding"] or "utf-8"
    # GB2312/GBK guesses are decoded as GB18030, which reads the same bytes
    # and additionally keeps characters outside those smaller sets that
    # errors="ignore" would otherwise discard.
    if enc.lower() in ("gb2312", "gbk"):
        enc = "gb18030"
    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        # No Python codec for the detected name (e.g. EUC-TW): fall back.
        return raw.decode("utf-8", errors="ignore")

def _call_deepseek(prompt: str, text: str) -> str:
    """Translate ``text`` with the ``deepseek-chat`` model.

    ``prompt`` becomes the system message, ``text`` the user message.  The
    first choice's content is returned with leading/trailing whitespace removed.
    """
    system_msg = {"role": "system", "content": prompt}
    user_msg = {"role": "user", "content": text}
    reply = deep_client.chat.completions.create(
        model="deepseek-chat",
        messages=[system_msg, user_msg],
        temperature=0.1,
        max_tokens=8092,
    )
    content = reply.choices[0].message.content
    return content.strip()

def _call_gemini(prompt: str, text: str) -> str:
    """Translate ``text`` with the configured Gemini model.

    The stripped prompt and stripped text are joined by a blank line into a
    single request; the stripped response text is returned.
    """
    instruction = prompt.strip()
    payload = text.strip()
    full_prompt = instruction + "\n\n" + payload
    settings = {"temperature": 0.3}
    answer = gemini_model.generate_content(full_prompt, generation_config=settings)
    return answer.text.strip()

def _call_translate(prompt: str, text: str, engine: str) -> str:
    if engine == "gemini":
        return _call_gemini(prompt, text)
    elif engine == "deepseek":
        return _call_deepseek(prompt, text)
    else:
        raise ValueError(f"未知翻译引擎: {engine}")
    
# ==============================================================
# 3⃣️ 翻译单本小说
# ============================================================== 
def translate_one_book(book_dir: Path,
                       out_en_root: Path,
                       out_zh_root: Path,
                       chapters: List[int],
                       engine: str = "deepseek",
                       workers: int = 4):
    """Translate the given chapters of one novel, skipping finished ones.

    A chapter is skipped when both its English file and its back-translated
    Chinese file already exist.

    Args:
        book_dir: Directory with the novel's chapter ``*.txt`` files.
        out_en_root: Root for English output; a per-book sub-directory is created.
        out_zh_root: Root for back-translated Chinese output, same layout.
        chapters: Chapter numbers to translate.
        engine: Translation backend name passed on to ``_call_translate``.
        workers: Number of threads translating chapters concurrently.

    Raises:
        FileNotFoundError: If a requested chapter has no matching file.
    """
    chap_set = set(chapters)
    file_pairs = []
    for fp in book_dir.glob("*.txt"):
        try:
            no = _chapter_no(fp)
        except ValueError:
            # A .txt without digits in its name is not a chapter file; one
            # such stray file must not abort the whole book.
            continue
        if no in chap_set:
            file_pairs.append((no, fp))
            chap_set.remove(no)

    if chap_set:
        raise FileNotFoundError(f"《{book_dir.name}》缺少章节: {sorted(chap_set)}")

    file_pairs.sort()
    en_dir = out_en_root / book_dir.name
    zh_dir = out_zh_root / book_dir.name
    en_dir.mkdir(parents=True, exist_ok=True)
    zh_dir.mkdir(parents=True, exist_ok=True)

    def _translate(no: int, fp: Path):
        en_path = en_dir / f"{fp.stem}_en.txt"
        zh_path = zh_dir / f"{fp.stem}_back_cn.txt"

        # Skip chapters whose two output files already exist
        if en_path.exists() and zh_path.exists():
            return f"✅ skipped: {fp.name}"

        zh_text = _auto_read(fp)

        # Chinese → English
        en_text = _call_translate(PROMPT_ZH2EN, zh_text, engine)
        en_path.write_text(en_text, encoding="utf-8")

        # English → Chinese back-translation
        zh_back = _call_translate(PROMPT_EN2ZH, en_text, engine)
        zh_path.write_text(zh_back, encoding="utf-8")

        return f"✅ translated: {fp.name}"

    with ThreadPoolExecutor(max_workers=workers) as ex:
        results = list(tqdm(ex.map(lambda t: _translate(*t), file_pairs),
                            total=len(file_pairs),
                            desc=f"🔄 {book_dir.name}"))
    for line in results:
        print(line)



# ==============================================================
# 4⃣️ 翻译全集
# ============================================================== 
def translate_library(src_root: str,
                      out_en_root: str,
                      out_zh_root: str,
                      chapter_range: Union[Tuple[int, int], List[int]],
                      n_chapters: int,
                      engine: str = "deepseek",
                      workers: int = 4):
    """Translate a sample of ``n_chapters`` chapters from every novel.

    A novel is skipped when its English output directory already holds at
    least ``n_chapters`` ``*_en.txt`` files.  A partly translated novel keeps
    the chapters it already has and is only topped up to ``n_chapters``.

    Args:
        src_root: Directory whose sub-directories are the novels.
        out_en_root: Root directory for English output.
        out_zh_root: Root directory for back-translated Chinese output.
        chapter_range: Inclusive ``(start, end)`` tuple or explicit list of
            candidate chapter numbers.
        n_chapters: Number of chapters wanted per novel.
        engine: Translation backend name.
        workers: Threads used per novel.

    Raises:
        ValueError: If ``n_chapters`` exceeds the number of candidates.
        FileNotFoundError: If ``src_root`` has no sub-directory.
    """
    src_root = Path(src_root)
    out_en_root = Path(out_en_root)
    out_zh_root = Path(out_zh_root)

    full_range = list(range(chapter_range[0], chapter_range[1] + 1)) if isinstance(chapter_range, tuple) else list(chapter_range)

    if n_chapters > len(full_range):
        raise ValueError(f"指定章节数 n_chapters={n_chapters} 超过章节范围长度 {len(full_range)}")

    novels = [d for d in src_root.iterdir() if d.is_dir()]
    if not novels:
        raise FileNotFoundError(f"{src_root} 下未找到任何小说子目录")

    for book in novels:
        en_dir = out_en_root / book.name
        done_nos = set()
        if en_dir.exists():
            en_files = list(en_dir.glob("*_en.txt"))
            completed = len(en_files)
            if completed >= n_chapters:
                print(f"✅ 跳过《{book.name}》，已有 {completed} 个章节翻译")
                continue
            for en_file in en_files:
                try:
                    done_nos.add(_chapter_no(en_file))
                except ValueError:
                    continue  # not a chapter output

        # Resume instead of re-sampling from scratch: keep the chapters that
        # already have English output (so a missing back-translation gets
        # finished) and draw only as many new chapters as are still needed.
        # A fresh full sample here would leave the book with more than
        # n_chapters translations.
        remaining = [c for c in full_range if c not in done_nos]
        extra = random.sample(remaining, k=n_chapters - len(done_nos))
        chapters_sample = sorted(done_nos) + extra
        translate_one_book(book, out_en_root, out_zh_root, chapters_sample, engine, workers)

    print("\n✅ 全部小说翻译完成")
    print(f"英文译文目录 : {out_en_root}")
    print(f"中文回译目录 : {out_zh_root}")


# ==============================================================
# 5⃣️ 运行任务
# ============================================================== 
# Launch the library translation with the engine and paths configured above.
translate_library(
    src_root=SRC_ROOT,
    out_en_root=OUT_EN_ROOT,
    out_zh_root=OUT_ZH_ROOT,
    chapter_range=CHAPTER_RANGE,
    n_chapters=N_CHAPTERS,
    engine=TRANSLATION_ENGINE,
    workers=WORKERS,
)

✅ 跳过《《奋斗在新明朝》（校对版全本）作者：随轻风去_utf8》，已有 8 个章节翻译
✅ 跳过《《反正我是超能力者》（校对版全本）作者：吃书妖_utf8》，已有 8 个章节翻译
✅ 跳过《《天可汗》（校对版全本）作者：西风紧_utf8》，已有 8 个章节翻译
✅ 跳过《《崩坏世界的传奇大冒险》（精校版全本）作者：国王陛下_utf8》，已有 8 个章节翻译
✅ 跳过《《全球进化》（精校版全本）作者：咬狗_utf8》，已有 8 个章节翻译
✅ 跳过《《武林半侠传》（校对版全本）作者：文抄公_utf8》，已有 8 个章节翻译
✅ 跳过《国宴大厨在八零》，已有 8 个章节翻译
✅ 跳过《《搜神记》（精校版全本）作者：树下野狐_utf8》，已有 8 个章节翻译
✅ 跳过《重生八零：毒妻不好惹》，已有 8 个章节翻译
✅ 跳过《《窃明》（校对版全本）作者：大爆炸(灰熊猫)_utf8》，已有 8 个章节翻译
✅ 跳过《《蜀山》（精校版全本）作者：流浪的蛤蟆_utf8》，已有 8 个章节翻译
✅ 跳过《《陈二狗的妖孽人生》（校对版全本）作者：烽火戏诸侯_utf8》，已有 8 个章节翻译
✅ 跳过《《贩罪》（精校版全本）作者：三天两觉_utf8》，已有 8 个章节翻译
✅ 跳过《《重生之出人头地》（校对版全本）作者：闹闹不爱闹_utf8》，已有 8 个章节翻译
✅ 跳过《八零喜事：当家肥妻大翻身》，已有 8 个章节翻译
✅ 跳过《《肆虐韩娱》（校对版全本）作者：姬叉_utf8》，已有 8 个章节翻译
✅ 跳过《八零年代好时光》，已有 8 个章节翻译
✅ 跳过《《食物链顶端的男人》（校对版全本）作者：熊狼狗_utf8》，已有 8 个章节翻译
✅ 跳过《《高手寂寞2》（校对版全本）作者：兰帝魅晨_utf8》，已有 8 个章节翻译
✅ 跳过《《黑龙法典》（校对版全本）作者：欢声_utf8》，已有 8 个章节翻译
✅ 跳过《《诛仙》（校对版全本）作者：萧鼎_utf8》，已有 8 个章节翻译
✅ 跳过《《回到过去变成猫》（精校版全本）作者：陈词懒调_utf8》，已有 8 个章节翻译
✅ 跳过《《神游》（校对版全本）作者：徐公子胜治_utf8》，已有 8 个章节翻译
✅ 跳过《《老子是癞蛤蟆》（校对版全本） 作者：烽火戏诸侯_utf8》，已有 8 个章节翻译
✅ 跳过《《未来天王》（校对版全

🔄 《上品寒士》（校对版全本）作者：贼道三痴_utf8:   0%|          | 0/8 [00:00<?, ?it/s]


KeyboardInterrupt



In [10]:
!rm -rf /content/translate_back_ZH_normalized
!rm -rf /content/translate_EN_normalized

In [24]:
# ==============================================================
# 0⃣️ 安装依赖
# ==============================================================
!pip -q install --upgrade openai google-generativeai chardet tqdm

# ==============================================================
# 1⃣️ 配置
# ==============================================================
import os, re, json, unicodedata, time, random
from pathlib import Path
from typing import List, Tuple, Union
from tqdm.auto import tqdm
import chardet
from concurrent.futures import ThreadPoolExecutor

# DeepSeek
import openai
# Environment variables take precedence so real keys stay out of the notebook;
# the literal strings are fallback placeholders only.
DEEPSEEK_API_KEY  = os.environ.get("DEEPSEEK_API_KEY", "your-default-api-key")
DEEPSEEK_URL      = "https://api.deepseek.com"
openai.api_key    = DEEPSEEK_API_KEY
deep_client       = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

# Gemini
import google.generativeai as genai
GEMINI_API_KEY     = os.environ.get("GEMINI_API_KEY", "your-default-api-key")
GEMINI_MODEL       = "gemini-2.0-flash"
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(GEMINI_MODEL)

# Path parameters
SRC_ROOT    = Path("/content/novels_normalized")
OUT_EN_ROOT = Path("/content/translate_EN_normalized")
OUT_ZH_ROOT = Path("/content/translate_back_ZH_normalized")
SEED_ROOT   = Path("/content/randomseed")

CHAPTER_RANGE = (1, 200)
N_CHAPTERS    = 8
WORKERS       = 2
TRANSLATION_ENGINE = "gemini"  # options: "deepseek" / "gemini"

PROMPT_ZH2EN = "You are a professional translator. Translate the following Chinese literary text into vivid, fluent English. Keep paragraph breaks. Output ONLY the translation:"
PROMPT_EN2ZH = "你是一位专业文学译者，请将下面的英文文本精准地译回中文，保持段落划分，不要加入额外说明，只输出译文："

# ==============================================================
# 2⃣️ 工具函数
# ==============================================================
_CHAP_NO_RE = re.compile(r"(\d{1,4})")

def _chapter_no(path: Path) -> int:
    m = _CHAP_NO_RE.search(path.stem)
    return int(m.group(1)) if m else -1

def _auto_read(path: Path) -> str:
    """Load a text file whose encoding is not known in advance.

    ``chardet`` supplies the encoding guess (UTF-8 if it has none) and
    undecodable bytes are ignored.
    """
    raw = path.read_bytes()
    enc = chardet.detect(raw)["encoding"] or "utf-8"
    # Treat a GB2312/GBK guess as GB18030 so characters outside the smaller
    # sets are decoded rather than silently removed by errors="ignore".
    if enc.lower() in ("gb2312", "gbk"):
        enc = "gb18030"
    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        # Detected encoding has no Python codec (e.g. EUC-TW); use UTF-8.
        return raw.decode("utf-8", errors="ignore")

def _call_deepseek(prompt, text):
    """Send ``text`` to ``deepseek-chat`` with ``prompt`` as the system message.

    Returns the stripped content of the first choice in the response.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user", "content": text},
    ]
    reply = deep_client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        temperature=0.1,
        max_tokens=8092,
    )
    top_choice = reply.choices[0]
    return top_choice.message.content.strip()

def _call_gemini(prompt, text):
    """Send the prompt and text to the Gemini model as one combined request.

    Both parts are stripped and separated by a blank line; the stripped
    response text is returned.
    """
    head, body = prompt.strip(), text.strip()
    full_prompt = head + "\n\n" + body
    generation = {"temperature": 0.3}
    response = gemini_model.generate_content(full_prompt, generation_config=generation)
    return response.text.strip()

def _call_translate(prompt, text, engine):
    """Translate ``text`` with Gemini when ``engine == "gemini"``, else DeepSeek.

    Failures from the backend propagate to the caller.  Returning the error
    message as if it were the translation would get it written to the output
    file, sent on for back-translation, and counted as a finished chapter on
    later runs.
    """
    if engine == "gemini":
        return _call_gemini(prompt, text)
    return _call_deepseek(prompt, text)

# ✅ Translate the requested chapters of one novel (finished chapters are skipped)
def translate_one_book(book_dir: Path, out_en_root: Path, out_zh_root: Path,
                       chapters: List[int], engine: str = "gemini", workers: int = 2):
    """Translate the listed chapters of ``book_dir`` to English and back.

    Requested chapters without a source file are reported and skipped.  A
    chapter whose English and back-translated files both exist is not redone.
    Per-chapter failures are reported in the printed summary instead of
    raising.
    """
    wanted = set(chapters)
    jobs = []
    for src in book_dir.glob("*.txt"):
        number = _chapter_no(src)
        if number not in wanted:
            continue
        # first file seen for a chapter number wins
        wanted.discard(number)
        jobs.append((number, src))

    if wanted:
        print(f"⚠️ 缺章节: {wanted}，跳过部分")

    jobs.sort()
    en_dir = out_en_root / book_dir.name
    zh_dir = out_zh_root / book_dir.name
    for target in (en_dir, zh_dir):
        target.mkdir(parents=True, exist_ok=True)

    def _translate_job(job):
        _, src = job
        try:
            en_path = en_dir / f"{src.stem}_en.txt"
            zh_path = zh_dir / f"{src.stem}_back_cn.txt"

            if not (en_path.exists() and zh_path.exists()):
                source_text = _auto_read(src)
                english = _call_translate(PROMPT_ZH2EN, source_text, engine)
                en_path.write_text(english, encoding="utf-8")
                back_text = _call_translate(PROMPT_EN2ZH, english, engine)
                zh_path.write_text(back_text, encoding="utf-8")
                return f"✅ translated: {src.name}"
            return f"✅ skipped: {src.name}"
        except Exception as exc:
            return f"❌ error in {src.name}: {exc}"

    with ThreadPoolExecutor(max_workers=workers) as pool:
        progress = tqdm(pool.map(_translate_job, jobs), total=len(jobs),
                        desc=f"🔄 {book_dir.name}")
        results = list(progress)
    for summary in results:
        print(summary)

# ✅ Translate every novel directory (finished or failing books are skipped)
def translate_library(src_root: str, out_en_root: str, out_zh_root: str,
                      chapter_range: Union[Tuple[int, int], List[int]],
                      n_chapters: int, engine: str = "gemini", workers: int = 2):
    """Translate a sample of ``n_chapters`` chapters from every novel.

    A novel with at least ``n_chapters`` ``*_en.txt`` files is skipped.  A
    partly translated novel keeps its existing chapters and is only topped up
    to ``n_chapters``.  Any error while handling one novel is printed and the
    loop moves on to the next.

    Args:
        src_root: Directory whose sub-directories are the novels.
        out_en_root: Root directory for English output.
        out_zh_root: Root directory for back-translated Chinese output.
        chapter_range: Inclusive ``(start, end)`` tuple or explicit list of
            candidate chapter numbers.
        n_chapters: Number of chapters wanted per novel.
        engine: Translation backend name.
        workers: Threads used per novel.
    """
    src_root = Path(src_root)
    out_en_root = Path(out_en_root)
    out_zh_root = Path(out_zh_root)
    full_range = list(range(chapter_range[0], chapter_range[1] + 1)) if isinstance(chapter_range, tuple) else list(chapter_range)

    novels = [d for d in src_root.iterdir() if d.is_dir()]
    for book in novels:
        try:
            en_dir = out_en_root / book.name
            done_nos = set()
            if en_dir.exists():
                en_files = list(en_dir.glob("*_en.txt"))
                done = len(en_files)
                if done >= n_chapters:
                    print(f"✅ 跳过《{book.name}》，已有 {done} 章")
                    continue
                # _chapter_no yields -1 for names without digits; drop those
                done_nos = {no for no in map(_chapter_no, en_files) if no >= 0}

            # Resume rather than re-sample: keep chapters that already have
            # English output and draw only the number still missing, so the
            # book ends with n_chapters translations instead of more.
            remaining = [c for c in full_range if c not in done_nos]
            chapters = sorted(done_nos) + random.sample(remaining, k=n_chapters - len(done_nos))
            translate_one_book(book, out_en_root, out_zh_root, chapters, engine, workers)
        except Exception as e:
            print(f"❌ 跳过《{book.name}》，错误：{e}")

    print("\n🎉 全部小说处理完成。")

# ✅ Run the job
translate_library(
    src_root=SRC_ROOT,
    out_en_root=OUT_EN_ROOT,
    out_zh_root=OUT_ZH_ROOT,
    chapter_range=CHAPTER_RANGE,
    n_chapters=N_CHAPTERS,
    engine=TRANSLATION_ENGINE,
    workers=WORKERS,
)

✅ 跳过《《奋斗在新明朝》（校对版全本）作者：随轻风去_utf8》，已有 8 章
✅ 跳过《《反正我是超能力者》（校对版全本）作者：吃书妖_utf8》，已有 8 章
✅ 跳过《《天可汗》（校对版全本）作者：西风紧_utf8》，已有 8 章
✅ 跳过《《崩坏世界的传奇大冒险》（精校版全本）作者：国王陛下_utf8》，已有 8 章
✅ 跳过《《全球进化》（精校版全本）作者：咬狗_utf8》，已有 8 章
✅ 跳过《《武林半侠传》（校对版全本）作者：文抄公_utf8》，已有 8 章
✅ 跳过《国宴大厨在八零》，已有 8 章
✅ 跳过《《搜神记》（精校版全本）作者：树下野狐_utf8》，已有 8 章
✅ 跳过《重生八零：毒妻不好惹》，已有 8 章
✅ 跳过《《窃明》（校对版全本）作者：大爆炸(灰熊猫)_utf8》，已有 8 章
✅ 跳过《《蜀山》（精校版全本）作者：流浪的蛤蟆_utf8》，已有 8 章
✅ 跳过《《陈二狗的妖孽人生》（校对版全本）作者：烽火戏诸侯_utf8》，已有 8 章
✅ 跳过《《贩罪》（精校版全本）作者：三天两觉_utf8》，已有 8 章
✅ 跳过《《重生之出人头地》（校对版全本）作者：闹闹不爱闹_utf8》，已有 8 章
✅ 跳过《八零喜事：当家肥妻大翻身》，已有 8 章
✅ 跳过《《肆虐韩娱》（校对版全本）作者：姬叉_utf8》，已有 8 章
✅ 跳过《八零年代好时光》，已有 8 章
✅ 跳过《《食物链顶端的男人》（校对版全本）作者：熊狼狗_utf8》，已有 8 章
✅ 跳过《《高手寂寞2》（校对版全本）作者：兰帝魅晨_utf8》，已有 8 章
✅ 跳过《《黑龙法典》（校对版全本）作者：欢声_utf8》，已有 8 章
✅ 跳过《《诛仙》（校对版全本）作者：萧鼎_utf8》，已有 8 章
✅ 跳过《《回到过去变成猫》（精校版全本）作者：陈词懒调_utf8》，已有 8 章
✅ 跳过《《神游》（校对版全本）作者：徐公子胜治_utf8》，已有 8 章
✅ 跳过《《老子是癞蛤蟆》（校对版全本） 作者：烽火戏诸侯_utf8》，已有 8 章
✅ 跳过《《未来天王》（校对版全本）作者：陈词懒调_utf8》，已有 8 章
✅ 跳过《《大画家》（校对版全本）作者：醛石_utf8》，已有 8 章
✅ 跳过《《超级惊悚直播》作者：宇文长弓_utf8》，已有 8 章
✅ 跳

🔄 《十州风云志》（校对版全本）作者：知秋_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 007.txt
✅ translated: 011.txt
✅ translated: 048.txt
✅ translated: 078.txt
✅ translated: 122.txt
✅ translated: 146.txt
✅ translated: 152.txt
✅ translated: 195.txt


🔄 《史上第一混乱》（校对版全本）作者：张小花_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 015.txt
✅ translated: 050.txt
✅ translated: 088.txt
✅ translated: 112.txt
✅ translated: 125.txt
✅ translated: 142.txt
✅ translated: 175.txt
✅ translated: 176.txt


🔄 《随波逐流之一代军师》（校对版全本）作者：随波逐流_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 058.txt
✅ translated: 060.txt
✅ translated: 144.txt
✅ translated: 164.txt
✅ translated: 185.txt
✅ translated: 193.txt
✅ translated: 194.txt
✅ translated: 196.txt


🔄 重生八零：佳妻致富忙:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 024.txt
✅ translated: 025.txt
✅ translated: 049.txt
✅ translated: 068.txt
✅ translated: 085.txt
✅ translated: 145.txt
✅ translated: 166.txt
✅ translated: 183.txt


🔄 重回八零过好日子:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 063.txt
✅ translated: 066.txt
✅ translated: 129.txt
✅ translated: 150.txt
✅ translated: 162.txt
✅ translated: 176.txt
✅ translated: 180.txt
✅ translated: 186.txt


🔄 《绝对一番》（校对版全本）作者：海底漫步者_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 009.txt
✅ translated: 029.txt
✅ translated: 037.txt
✅ translated: 046.txt
✅ translated: 089.txt
✅ translated: 099.txt
✅ translated: 145.txt
✅ translated: 161.txt


🔄 《我的女友是恶女》（校对版全本）作者：海底漫步者_utf8:   0%|          | 0/8 [00:00<?, ?it/s]

✅ translated: 034.txt
✅ translated: 053.txt
✅ translated: 063.txt
✅ translated: 116.txt
✅ translated: 142.txt
✅ translated: 154.txt
✅ translated: 179.txt
✅ translated: 198.txt

🎉 全部小说处理完成。


In [None]:
# ==============================================================
# 0⃣️ 安装依赖
# ============================================================== 
!pip -q install --upgrade openai google-generativeai chardet tqdm

# ==============================================================
# 1⃣️ 配置
# ============================================================== 
import os, re, json, unicodedata, time, random
from pathlib import Path
from typing import List, Tuple, Union
from tqdm.auto import tqdm
import chardet
from concurrent.futures import ThreadPoolExecutor

# DeepSeek
import openai
# Use the environment for the API keys when available; the literals below are
# placeholders so the notebook never needs to contain a real secret.
DEEPSEEK_API_KEY  = os.environ.get("DEEPSEEK_API_KEY", "your-default-api-key")
DEEPSEEK_URL      = "https://api.deepseek.com"
openai.api_key    = DEEPSEEK_API_KEY
deep_client       = openai.OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_URL)

# Gemini
import google.generativeai as genai
GEMINI_API_KEY     = os.environ.get("GEMINI_API_KEY", "your-default-api-key")
GEMINI_MODEL       = "gemini-2.0-flash"  # a "pro" model is recommended
genai.configure(api_key=GEMINI_API_KEY)
gemini_model = genai.GenerativeModel(GEMINI_MODEL)

# Translation engine selection
TRANSLATION_ENGINE = "gemini"  # options: "deepseek" / "gemini"

# Path parameters
SRC_ROOT    = "/content/novels_normalized"
OUT_EN_ROOT = "/content/translate_EN_normalized"
OUT_ZH_ROOT = "/content/translate_back_ZH_normalized"
CHAPTER_RANGE = (1, 200)
N_CHAPTERS    = 8
WORKERS       = 2

PROMPT_ZH2EN = "You are a professional translator. Translate the following Chinese literary text into vivid, fluent English. Keep paragraph breaks. Output ONLY the translation:"
PROMPT_EN2ZH = "你是一位专业文学译者，请将下面的英文文本精准地译回中文，保持段落划分，不要加入额外说明，只输出译文："

# ==============================================================
# 2⃣️ 工具函数
# ============================================================== 
_CHAP_NO_RE = re.compile(r"(\d{1,4})")

def _chapter_no(path: Path) -> int:
    m = _CHAP_NO_RE.search(path.stem)
    if not m:
        raise ValueError(f"文件名无法识别章节号: {path}")
    return int(m.group(1))

def _auto_read(path: Path) -> str:
    """Read a text file whose encoding is unknown and return it as ``str``.

    The encoding is guessed by ``chardet`` from the raw bytes; when chardet
    gives no answer UTF-8 is assumed.  Bytes that cannot be decoded are
    dropped (``errors="ignore"``) so a few bad bytes never abort a run.

    Args:
        path: file to read.

    Returns:
        The decoded file content.
    """
    raw = path.read_bytes()
    enc = chardet.detect(raw)["encoding"] or "utf-8"

    # Text reported as GB2312 is frequently GBK/GB18030 in practice.  Decoding
    # it with the narrower codec while ignoring errors silently deletes every
    # character outside GB2312, so decode with the superset codec instead.
    if enc.lower().replace("_", "-") in ("gb2312", "gbk"):
        enc = "gb18030"

    try:
        return raw.decode(enc, errors="ignore")
    except LookupError:
        # The detector named a codec this Python build does not provide;
        # fall back to the same default used when detection fails.
        return raw.decode("utf-8", errors="ignore")

def _call_deepseek(prompt: str, text: str) -> str:
    """Run one DeepSeek chat completion and return the stripped reply.

    ``prompt`` is sent as the system message and ``text`` as the user
    message, using the ``deepseek-chat`` model at temperature 0.1 with a
    ``max_tokens`` limit of 8092.
    """
    conversation = [
        {"role": "system", "content": prompt},
        {"role": "user",   "content": text},
    ]
    completion = deep_client.chat.completions.create(
        model="deepseek-chat",
        messages=conversation,
        temperature=0.1,
        max_tokens=8092,
    )
    reply = completion.choices[0].message.content
    return reply.strip()

def _call_gemini(prompt: str, text: str) -> str:
    """Run one Gemini generation and return the stripped reply.

    The stripped ``prompt`` and the stripped ``text`` are joined by a blank
    line into a single request, generated at temperature 0.3.
    """
    request_body = "\n\n".join((prompt.strip(), text.strip()))
    options = {"temperature": 0.3}
    answer = gemini_model.generate_content(request_body, generation_config=options)
    return answer.text.strip()

def _call_translate(prompt: str, text: str, engine: str) -> str:
    """Dispatch one translation request to the selected backend.

    Args:
        prompt: instruction handed to the backend.
        text: text to translate.
        engine: ``"gemini"`` or ``"deepseek"``.

    Raises:
        ValueError: if ``engine`` is neither of the two supported names.
    """
    # Reject unknown engines first; the backend is only looked up afterwards.
    if engine not in ("gemini", "deepseek"):
        raise ValueError(f"未知翻译引擎: {engine}")
    backend = _call_gemini if engine == "gemini" else _call_deepseek
    return backend(prompt, text)
    
# ==============================================================
# 3⃣️ 翻译单本小说
# ============================================================== 
def translate_one_book(book_dir: Path,
                       out_en_root: Path,
                       out_zh_root: Path,
                       chapters: List[int],
                       engine: str = "deepseek",
                       workers: int = 4):
    """Translate selected chapters of one novel to English and back to Chinese.

    Each requested chapter file is translated ZH -> EN, and the English text
    is then translated back EN -> ZH.  Output goes to
    ``out_en_root/<book>/<stem>_en.txt`` and
    ``out_zh_root/<book>/<stem>_back_cn.txt``.

    Resuming: a chapter whose two output files both exist is skipped.  A
    chapter that only has a non-empty English file (a previous run stopped
    between the two steps) reuses that file, so only the missing
    back-translation is requested.

    Args:
        book_dir: directory containing the chapter ``*.txt`` files.
        out_en_root: root directory for the English translations.
        out_zh_root: root directory for the Chinese back-translations.
        chapters: chapter numbers to process.
        engine: backend name forwarded to ``_call_translate``.
        workers: size of the thread pool.

    Raises:
        FileNotFoundError: if a requested chapter has no matching file.
        ValueError: propagated from ``_chapter_no`` when a ``*.txt`` file name
            contains no digits.
    """
    wanted = set(chapters)
    file_pairs = []
    # Sorted so that, when several files share a chapter number, the same
    # file is selected on every run.
    for fp in sorted(book_dir.glob("*.txt")):
        no = _chapter_no(fp)
        if no in wanted:
            file_pairs.append((no, fp))
            wanted.remove(no)

    if wanted:
        raise FileNotFoundError(f"《{book_dir.name}》缺少章节: {sorted(wanted)}")

    file_pairs.sort()
    en_dir = out_en_root / book_dir.name
    zh_dir = out_zh_root / book_dir.name
    en_dir.mkdir(parents=True, exist_ok=True)
    zh_dir.mkdir(parents=True, exist_ok=True)

    def _translate(no: int, fp: Path):
        en_path = en_dir / f"{fp.stem}_en.txt"
        zh_path = zh_dir / f"{fp.stem}_back_cn.txt"

        # Nothing to do when both halves are already on disk.
        if en_path.exists() and zh_path.exists():
            return f"✅ skipped: {fp.name}"

        # Reuse an English translation left behind by an interrupted run
        # instead of requesting (and overwriting) it again.
        en_text = ""
        if en_path.exists():
            try:
                en_text = en_path.read_text(encoding="utf-8")
            except UnicodeDecodeError:
                en_text = ""  # unreadable leftover: translate again

        if not en_text.strip():
            # Chinese -> English
            zh_text = _auto_read(fp)
            en_text = _call_translate(PROMPT_ZH2EN, zh_text, engine)
            en_path.write_text(en_text, encoding="utf-8")

        # English -> Chinese back-translation
        zh_back = _call_translate(PROMPT_EN2ZH, en_text, engine)
        zh_path.write_text(zh_back, encoding="utf-8")

        return f"✅ translated: {fp.name}"

    with ThreadPoolExecutor(max_workers=workers) as ex:
        results = list(tqdm(ex.map(lambda t: _translate(*t), file_pairs),
                            total=len(file_pairs),
                            desc=f"🔄 {book_dir.name}"))
    for line in results:
        print(line)



# ==============================================================
# 4⃣️ 翻译全集
# ============================================================== 
def _count_finished_chapters(en_dir: Path, zh_dir: Path) -> int:
    """Count ``*_en.txt`` files in ``en_dir`` that have a back-translation in ``zh_dir``."""
    if not en_dir.is_dir():
        return 0
    finished = 0
    for en_file in en_dir.glob("*_en.txt"):
        src_stem = en_file.stem[:-len("_en")]
        if (zh_dir / f"{src_stem}_back_cn.txt").exists():
            finished += 1
    return finished


def translate_library(src_root: str,
                      out_en_root: str,
                      out_zh_root: str,
                      chapter_range: Union[Tuple[int, int], List[int]],
                      n_chapters: int,
                      engine: str = "deepseek",
                      workers: int = 4):
    """Translate a random sample of chapters from every novel under ``src_root``.

    For each novel sub-directory, ``n_chapters`` chapters are selected and
    handed to ``translate_one_book``.

    Selection rules:
      * Only chapters that exist as files in the novel AND fall inside
        ``chapter_range`` are eligible, so a short novel no longer aborts the
        whole run with a missing-chapter error.
      * Chapters that already have output from an earlier run are kept first
        (fully translated ones, then those with only the English half); new
        chapters are drawn at random only to top the selection up to
        ``n_chapters``.
      * A novel is skipped once ``n_chapters`` chapters have BOTH the English
        file and the back-translation file.
      * If fewer than ``n_chapters`` chapters are eligible, all of them are
        used and a warning is printed.

    Args:
        src_root: directory whose sub-directories are the novels.
        out_en_root: root directory for the English translations.
        out_zh_root: root directory for the Chinese back-translations.
        chapter_range: a tuple is an inclusive ``(first, last)`` range; any
            other sequence is an explicit list of chapter numbers.
        n_chapters: number of chapters to translate per novel.
        engine: backend name forwarded to ``translate_one_book``.
        workers: thread count forwarded to ``translate_one_book``.

    Raises:
        ValueError: if ``n_chapters`` exceeds the size of ``chapter_range``.
        FileNotFoundError: if ``src_root`` contains no sub-directory.
    """
    src_root = Path(src_root)
    out_en_root = Path(out_en_root)
    out_zh_root = Path(out_zh_root)

    if isinstance(chapter_range, tuple):
        full_range = list(range(chapter_range[0], chapter_range[1] + 1))
    else:
        full_range = list(chapter_range)

    if n_chapters > len(full_range):
        raise ValueError(f"指定章节数 n_chapters={n_chapters} 超过章节范围长度 {len(full_range)}")

    # Sorted for a reproducible processing order.
    novels = sorted(d for d in src_root.iterdir() if d.is_dir())
    if not novels:
        raise FileNotFoundError(f"{src_root} 下未找到任何小说子目录")

    in_range = set(full_range)

    for book in novels:
        en_dir = out_en_root / book.name
        zh_dir = out_zh_root / book.name

        completed = _count_finished_chapters(en_dir, zh_dir)
        if completed >= n_chapters:
            print(f"✅ 跳过《{book.name}》，已有 {completed} 个章节翻译")
            continue

        # Classify the eligible source chapters by how far they have got.
        finished, pending, fresh = [], [], []
        seen = set()
        for fp in sorted(book.glob("*.txt")):
            no = _chapter_no(fp)
            if no not in in_range or no in seen:
                continue
            seen.add(no)
            has_en = (en_dir / f"{fp.stem}_en.txt").exists()
            has_zh = (zh_dir / f"{fp.stem}_back_cn.txt").exists()
            if has_en and has_zh:
                finished.append(no)
            elif has_en:
                pending.append(no)
            else:
                fresh.append(no)

        # Keep earlier work first, then top up with new random chapters.
        keep = (finished + pending)[:n_chapters]
        need = n_chapters - len(keep)
        if need > len(fresh):
            print(f"⚠️ 《{book.name}》范围内仅有 {len(keep) + len(fresh)} 个可用章节，"
                  f"少于 n_chapters={n_chapters}")
            need = len(fresh)

        chapters_sample = keep + random.sample(fresh, k=need)
        if not chapters_sample:
            continue
        translate_one_book(book, out_en_root, out_zh_root, chapters_sample, engine, workers)

    print("\n✅ 全部小说翻译完成")
    print(f"英文译文目录 : {out_en_root}")
    print(f"中文回译目录 : {out_zh_root}")


# ==============================================================
# 5⃣️ 运行任务
# ============================================================== 
# Run the whole pipeline with the settings defined in the configuration
# section of this cell.
translate_library(
    src_root=SRC_ROOT,
    out_en_root=OUT_EN_ROOT,
    out_zh_root=OUT_ZH_ROOT,
    chapter_range=CHAPTER_RANGE,
    n_chapters=N_CHAPTERS,
    engine=TRANSLATION_ENGINE,
    workers=WORKERS,
)