In [1]:
!pip install -q tqdm

In [2]:
from pathlib import Path
import re, csv, tqdm, pandas as pd

# ① Change these to your own Google Drive paths
INPUT_DIR  = Path("/content/novels")     # raw .txt files, one per book
OUTPUT_DIR = Path("/content/novels_normalized")   # output root directory
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# ② Splitting parameters
CAP_WORDS  = 1_000_000      # only the first 1,000,000 "words" of a book are used
N_NORM     = 200            # number of normalized chapters to split each book into
TOLERANCE  = 200            # when snapping to a paragraph break, back up at most 200 words

In [3]:
# === Tokenisation: each maximal run of regex word characters (letters, digits, CJK, "_") is one "word" ===
WORD_RE = re.compile(r"\w+", re.UNICODE)   # swap in a more sophisticated tokenizer if needed

def words_list(text: str):
    """Tokenise ``text`` into ``(word, start_char_index)`` pairs, in order of appearance."""
    pairs = []
    for match in WORD_RE.finditer(text):
        pairs.append((match.group(), match.start()))
    return pairs

def nearest_par_break(text: str, abs_pos: int, tol: int = TOLERANCE):
    """Snap ``abs_pos`` back to the closest preceding blank-line boundary.

    Searches backwards from ``abs_pos`` over a window of ``tol * 8`` characters
    (8 is a rough word-to-character scale factor) for two consecutive newline
    characters. On success the index of the second newline is returned; when
    no such pair lies inside the window, ``abs_pos`` is returned unchanged
    (a hard cut).
    """
    if abs_pos <= 0:
        return abs_pos
    window_start = max(0, abs_pos - tol * 8)
    # The pair may end exactly on abs_pos, hence the "+ 1" on the search limit.
    hit = text.rfind("\n\n", window_start, abs_pos + 1)
    return abs_pos if hit < 0 else hit + 1

In [4]:
def split_by_words(book_path: Path):
    """Split one book into at most ``N_NORM`` chapters of roughly equal word count.

    The file is read as UTF-8 and tokenised with ``words_list``; only the first
    ``CAP_WORDS`` words are considered. Each chapter boundary is first placed
    after a quota of words and then moved back to the nearest paragraph break
    (``nearest_par_break``) when one exists inside the chapter.

    Guarantees:
      * chapter texts are contiguous, non-overlapping slices of the file;
      * ``start_word_idx`` / ``end_word_idx`` / ``chapter_word_cnt`` describe
        exactly the words contained in the corresponding text slice;
      * no word below the cap is dropped: the quota is recomputed from the
        words still unassigned, so the last chapter absorbs the remainder;
      * the last chapter runs to the end of the file when the cap did not
        truncate the book, otherwise to the end of the last counted word.

    Args:
        book_path: path of a UTF-8 encoded ``.txt`` file.

    Returns:
        ``(meta_rows, chapter_texts)`` - two parallel lists. Each meta row is a
        dict with keys ``book_id``, ``norm_chap_id``, ``start_word_idx``,
        ``end_word_idx`` (exclusive), ``chapter_word_cnt`` and
        ``book_word_total``. Both lists are empty for a book without words.
    """
    raw_text = book_path.read_text(encoding="utf-8")
    all_words = words_list(raw_text)                # [(word, char_pos), ...]
    wlist = all_words[:CAP_WORDS]
    total_words = len(wlist)
    if total_words == 0:                            # nothing to split
        return [], []

    # Character offset at which the final chapter must stop.
    if total_words == len(all_words):
        text_end = len(raw_text)                    # uncapped: keep trailing text
    else:
        text_end = wlist[-1][1] + len(wlist[-1][0]) # capped: stop after last word

    meta_rows, chapter_texts = [], []
    start_idx = 0
    prev_char_pos = 0

    for k in range(1, N_NORM + 1):
        # Spread the still-unassigned words over the chapters that are left,
        # so shortening one chapter does not make words fall off the end.
        chapters_left = N_NORM - k + 1
        quota = max(1, (total_words - start_idx) // chapters_left)
        hard_end = min(start_idx + quota, total_words)

        if hard_end == total_words:
            # Final chapter: take everything that remains, no backtracking.
            end_idx, char_pos = total_words, text_end
        else:
            hard_cut = wlist[hard_end - 1][1] + len(wlist[hard_end - 1][0])
            char_pos = nearest_par_break(raw_text, hard_cut)

            # Re-sync the word index with the (possibly earlier) boundary.
            end_idx = hard_end
            while end_idx > start_idx and wlist[end_idx - 1][1] >= char_pos:
                end_idx -= 1
            if end_idx == start_idx:
                # The break lies at/before this chapter's first word; using it
                # would yield an empty or overlapping chapter, so cut hard.
                end_idx, char_pos = hard_end, hard_cut

        meta_rows.append({
            "book_id"        : book_path.stem,
            "norm_chap_id"   : f"{k:03}",
            "start_word_idx" : start_idx,
            "end_word_idx"   : end_idx,
            "chapter_word_cnt": end_idx - start_idx,
            "book_word_total": total_words
        })
        chapter_texts.append(raw_text[prev_char_pos:char_pos])
        prev_char_pos = char_pos
        start_idx = end_idx

        if end_idx >= total_words:
            break

    return meta_rows, chapter_texts

In [5]:
# Split every root-level .txt book and write its chapters to
# OUTPUT_DIR/split_txt/<book stem>/<book stem>_<chapter id>.txt
all_meta = []
book_paths = sorted(INPUT_DIR.glob("*.txt"))
split_root = OUTPUT_DIR / "split_txt"

for book_path in tqdm.tqdm(book_paths, desc="Splitting by words"):
    rows, texts = split_by_words(book_path)
    all_meta += rows

    book_dir = split_root / book_path.stem
    book_dir.mkdir(parents=True, exist_ok=True)
    for row, text in zip(rows, texts):
        chapter_file = book_dir / f"{book_path.stem}_{row['norm_chap_id']}.txt"
        chapter_file.write_text(text, encoding="utf-8")

Splitting by words: 100%|██████████| 40/40 [00:04<00:00,  8.73it/s]


In [6]:
# Persist the per-chapter metadata of all books, then preview the first rows.
meta_csv_path = OUTPUT_DIR / "normalized_200chapters_meta_words.csv"
df = pd.DataFrame(all_meta)
df.to_csv(meta_csv_path, index=False, encoding="utf-8")
df.head()

Unnamed: 0,book_id,norm_chap_id,start_word_idx,end_word_idx,chapter_word_cnt,book_word_total
0,《上品寒士》（校对版全本）作者：贼道三痴_utf8,1,0,823,823,164712
1,《上品寒士》（校对版全本）作者：贼道三痴_utf8,2,823,1646,823,164712
2,《上品寒士》（校对版全本）作者：贼道三痴_utf8,3,1646,2469,823,164712
3,《上品寒士》（校对版全本）作者：贼道三痴_utf8,4,2469,3292,823,164712
4,《上品寒士》（校对版全本）作者：贼道三痴_utf8,5,3292,4115,823,164712


In [7]:
# Irreversibly delete the output tree written by the cells above (same path as
# OUTPUT_DIR); the next cell recreates the directory before writing to it.
!rm -rf /content/novels_normalized

In [10]:
# ===============================================================
# ① 依赖
# ===============================================================
!pip install -q tqdm pandas

from pathlib import Path
import re, tqdm, pandas as pd

INPUT_DIR  = Path("/content/novels")             # change to your own root directory
OUTPUT_DIR = Path("/content/novels_normalized")  # output root directory
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

CAP_WORDS  = 1_000_000        # use only the first 1,000,000 words; set to None for the full text
N_NORM     = 200              # number of normalized chapters written per book
TOLERANCE  = 200              # paragraph-break search window, in words (scaled by 8 to characters)

# ===============================================================
# ② 基础工具
# ===============================================================
# A "word" is any maximal run of regex word characters.
WORD_RE = re.compile(r"\w+", re.UNICODE)

def words_list(text):
    """Return ``[(word, start_offset), ...]`` for every word found in ``text``."""
    spans = (hit.span() for hit in WORD_RE.finditer(text))
    return [(text[lo:hi], lo) for lo, hi in spans]

def nearest_par_break(text, abs_pos, tol=TOLERANCE):
    """Move ``abs_pos`` back to the nearest blank-line boundary, if one is close.

    Walks backwards one character at a time, covering fewer than ``tol * 8``
    characters, and returns the index of the second newline of the first
    double-newline pair met. Falls back to ``abs_pos`` itself when the window
    (or the start of the text) is reached without finding one.
    """
    window = tol * 8
    pos = abs_pos
    while True:
        if pos <= 0 or abs_pos - pos >= window:
            return abs_pos                      # nothing found: hard cut
        if text.startswith("\n\n", pos - 1):    # newlines at pos-1 and pos
            return pos
        pos -= 1

def split_by_words(raw_text: str, book_id: str):
    """Split ``raw_text`` into exactly ``N_NORM`` chapters of similar word count.

    Words come from ``words_list``; when ``CAP_WORDS`` is not ``None`` only the
    first ``CAP_WORDS`` words are used. Each boundary is placed after a quota
    of words and then moved back to the nearest paragraph break
    (``nearest_par_break``) when one exists inside the chapter.

    Guarantees:
      * chunks are contiguous, non-overlapping slices of ``raw_text``;
      * the word indices in each meta dict describe exactly the words inside
        the matching chunk;
      * no word below the cap is dropped: the quota is recomputed from the
        words still unassigned, so the last real chapter absorbs the rest;
      * the last real chapter runs to the end of ``raw_text`` when the cap did
        not truncate it, otherwise to the end of the last counted word;
      * when the text has fewer words than ``N_NORM`` (including none at
        all), empty chapters are appended so ``N_NORM`` entries are returned.

    Args:
        raw_text: full text of the book.
        book_id: identifier copied into every meta dict.

    Returns:
        ``(metas, chunks)`` - parallel lists of length ``N_NORM``. Each meta is
        a dict with keys ``book_id``, ``norm_chap_id``, ``start_word_idx``,
        ``end_word_idx`` (exclusive), ``chapter_word_cnt``, ``book_word_total``.
    """
    all_words = words_list(raw_text)
    w = all_words if CAP_WORDS is None else all_words[:CAP_WORDS]
    tot = len(w)

    # Character offset at which the last real chapter must stop.
    if tot == len(all_words):
        text_end = len(raw_text)                 # uncapped: keep trailing text
    else:
        text_end = w[-1][1] + len(w[-1][0])      # capped: stop after last word

    metas, chunks = [], []
    cur = 0            # index of the first word not yet assigned to a chapter
    prev_char = 0      # character offset where the next chunk starts

    for k in range(1, N_NORM + 1):
        if cur >= tot:
            break
        start = cur
        # Spread the still-unassigned words over the chapters that are left.
        quota = max(1, (tot - start) // (N_NORM - k + 1))
        hard_end = min(start + quota, tot)

        if hard_end == tot:
            # Last real chapter: take everything that remains.
            cur, char_pos = tot, text_end
        else:
            hard_cut = w[hard_end - 1][1] + len(w[hard_end - 1][0])
            char_pos = nearest_par_break(raw_text, hard_cut)

            # Re-sync the word index with the (possibly earlier) boundary.
            cur = hard_end
            while cur > start and w[cur - 1][1] >= char_pos:
                cur -= 1
            if cur == start:
                # Break lies at/before this chapter's first word: cut hard to
                # avoid an empty or overlapping chunk.
                cur, char_pos = hard_end, hard_cut

        chunks.append(raw_text[prev_char:char_pos])
        metas.append(dict(book_id=book_id, norm_chap_id=f"{k:03}",
                          start_word_idx=start, end_word_idx=cur,
                          chapter_word_cnt=cur - start, book_word_total=tot))
        prev_char = char_pos

    while len(chunks) < N_NORM:                   # pad with empty chapters
        k = len(chunks) + 1
        chunks.append("")
        metas.append(dict(book_id=book_id, norm_chap_id=f"{k:03}",
                          start_word_idx=cur, end_word_idx=cur,
                          chapter_word_cnt=0, book_word_total=tot))
    return metas, chunks

def natural_key(p: Path):
    """Sort key that orders file names "naturally" (``ch2`` before ``ch10``).

    The file name is split on runs of decimal digits. ``re.split`` with a
    capturing group always puts the captured digit runs at odd positions, so
    those become ``int`` and the text in between is lower-cased. Deciding by
    position rather than ``str.isdigit()`` matters: ``isdigit()`` is also true
    for characters such as superscript digits that the pattern does not split
    on and ``int()`` cannot parse.

    Args:
        p: path whose ``name`` component is used.

    Returns:
        A list alternating ``str`` and ``int`` items, starting with a ``str``.
    """
    parts = re.split(r'(\d+)', p.name)
    return [int(part) if pos % 2 else part.lower()
            for pos, part in enumerate(parts)]

# ===============================================================
# ③ Discover "books" under the root directory
#    - every *.txt file directly in the root is a book of its own
#    - every top-level sub-folder that contains *.txt files (at any
#      depth) is one book
# ===============================================================
def yield_books(root: Path):
    """Yield ``(book_id, txt_paths)`` once for every book found under ``root``.

    * A ``.txt`` file directly in ``root`` is a single-file book; its id is the
      file stem.
    * A top-level sub-folder is one book whose id is the folder name. All
      ``.txt`` files below it, including those in nested sub-folders, are
      returned together, so a book split over several sub-folders is yielded
      only once under its id.

    Books and files are produced in sorted path order so repeated runs give
    the same sequence. Folders without any ``.txt`` file are skipped.

    Args:
        root: directory to scan.

    Yields:
        Tuples ``(book_id: str, txt_paths: list[Path])``.
    """
    # Case B: stand-alone .txt files in the root
    for single in sorted(root.glob("*.txt")):
        if single.is_file():
            yield single.stem, [single]          # book_id, list[Path]

    # Case A/C: one book per top-level folder
    for top in sorted(root.iterdir()):
        if not top.is_dir():
            continue
        txts = sorted(f for f in top.rglob("*.txt") if f.is_file())
        if txts:
            yield top.name, txts                 # book_id = top-level folder name

# ===============================================================
# ④ Main pipeline: merge each book's files, split, write chapters
# ===============================================================
all_meta = []
book_cnt = 0

books = list(yield_books(INPUT_DIR))
for book_cnt, (book_id, txt_files) in enumerate(
        tqdm.tqdm(books, desc="Books"), start=1):
    # Concatenate the book's files in natural (numeric-aware) name order.
    ordered_files = sorted(txt_files, key=natural_key)
    pieces = [f.read_text(encoding="utf-8", errors="ignore")
              for f in ordered_files]
    full_text = "\n".join(pieces)

    metas, chunks = split_by_words(full_text, book_id)

    # One folder per book, one file per normalized chapter.
    book_dir = OUTPUT_DIR / book_id
    book_dir.mkdir(parents=True, exist_ok=True)
    for meta, chunk in zip(metas, chunks):
        chapter_file = book_dir / f"{meta['norm_chap_id']}.txt"
        chapter_file.write_text(chunk, encoding="utf-8")
    all_meta += metas

# ===============================================================
# ⑤ Summary CSV with the metadata of every chapter
# ===============================================================
meta_csv_path = OUTPUT_DIR / "normalized_200chapters_meta_words.csv"
pd.DataFrame(all_meta).to_csv(meta_csv_path, index=False, encoding="utf-8")

summary_lines = (
    "✅ 全部完成：",
    f"  - 拆分小说数：{book_cnt}",
    f"  - 输出目录  : {OUTPUT_DIR}",
    "  - 每本书生成 200 个文件 001.txt–200.txt",
)
for line in summary_lines:
    print(line)

Books: 100%|██████████| 40/40 [00:04<00:00,  8.94it/s]

✅ 全部完成：
  - 拆分小说数：40
  - 输出目录  : /content/novels_normalized
  - 每本书生成 200 个文件 001.txt–200.txt





In [16]:
# Housekeeping: recursively and irreversibly delete these working directories
# under /content.
# NOTE(review): the cells that created these directories are not visible in
# this part of the notebook - confirm nothing still needed lives there.
!rm -rf /content/translate_back_ZH
!rm -rf /content/translate_EN
!rm -rf /content/novels_chapters
!rm -rf /content/最终精修保存路径
!rm -rf /content/randomseed
!rm -rf /content/outputs
!rm -rf /content/outlines
!rm -rf /content/1000_word_chapters_expanded