In [102]:
import pandas as pd
import numpy as np
import seaborn as sns

from glob import glob
import re
from tqdm import tqdm

from ast import literal_eval

def return_liter_eval(x):
    """Parse a stringified Python literal collection into a list.

    Parameters
    ----------
    x : str
        Text holding a Python literal, e.g. ``"{'Fiction', 'War'}"``.

    Returns
    -------
    list or None
        The parsed elements as a list, or ``None`` when ``x`` cannot be
        parsed as a literal or the parsed value is not iterable.
    """
    try:
        return list(literal_eval(x))
    except (ValueError, SyntaxError, TypeError):
        # ValueError: non-literal content or a non-string input such as NaN.
        # SyntaxError: text that is not valid Python (e.g. an unclosed bracket).
        # TypeError: a valid literal that is not iterable (e.g. "5").
        return None
        
def flatten_genre(genre):
    """Collapse an iterable of genre labels into one lower-cased string.

    Each label is split on single spaces, every piece is lower-cased, and
    all pieces from all labels are joined back together with single spaces.
    """
    tokens = []
    for label in genre:
        tokens.extend(piece.lower() for piece in label.split(" "))
    return " ".join(tokens)

In [103]:
# Load the metadata table; the trailing expression displays (rows, columns).
metadata_csv = "../../gutenberg_standard/gutenberg/metadata/metadata.csv"
meta_df = pd.read_csv(metadata_csv)
meta_df.shape

(74389, 9)

In [104]:
# Apply filtering criteria
# Parse the stringified `subjects` column into lists; rows that cannot be
# parsed come back as None and are removed by the mask below.
meta_df["genre"] = meta_df["subjects"].apply(return_liter_eval)

# Keep rows with a parsed subject list, language "['en']", type "Text",
# a non-null author born after 1800, and a positive download count.
keep_mask = (
    meta_df["genre"].notnull()
    & (meta_df["language"] == "['en']")
    & (meta_df["type"] == "Text")
    & meta_df["author"].notnull()
    & (meta_df["downloads"] > 0)
    & (meta_df["authoryearofbirth"] > 1800)
)

# .copy() detaches the filtered rows from the loaded frame so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
meta_df = meta_df[keep_mask].drop_duplicates(subset="title").copy()

# Extract and normalize genre labels (flatten_genre already lower-cases them)
meta_df["genre_items"] = meta_df["genre"].apply(flatten_genre)

# Match if ANY genre term is present
# NOTE(review): \b...\b matches whole words only, so plural forms such as
# "stories" or "novels" do not match "story" / "novel" -- confirm intended.
target_terms = ["fiction", "novel", "story", "literature"]

pattern = r"\b(?:{})\b".format("|".join(map(re.escape, target_terms)))

is_target = meta_df["genre_items"].str.contains(pattern, regex=True, na=False)

# Copy again so later cells can add columns to an independent frame.
meta_df = meta_df[is_target].copy()

In [76]:
# Extract the top genres from the genre items
genres = ["war", "biography", "romance", "drama", "fantasy", "family", 
          "science", "action", "thriller", "western", "horror", 
          "mystery", "crime", "history", "periodicals", "christian"]

# Parse genres: one 0/1 indicator column per keyword.
# NOTE(review): str.contains matches substrings, so e.g. "war" also flags
# "toward" and "action" also flags "transaction" -- confirm this is intended.
for keyword in genres:
    keyword_hit = meta_df["genre_items"].str.contains(keyword)
    meta_df["genre_" + keyword] = keyword_hit.astype("int64")

# genre_other is 1 only for rows that matched none of the keywords above.
flag_columns = ["genre_" + keyword for keyword in genres]
has_known_genre = meta_df[flag_columns].any(axis=1)
meta_df["genre_other"] = (~has_known_genre).astype("int64")

In [80]:
# List the raw text files and reduce each path to its book id, i.e. the
# part of the file name before the first underscore.
raw_book_paths = glob("../../gutenberg_standard/gutenberg/data/text/*_text.txt")
raw_books = [path.rsplit("/", 1)[-1].partition("_")[0] for path in raw_book_paths]

In [83]:
import os
import traceback

# Directory holding the raw texts and directory receiving the cleaned texts.
INPUT_DIR = "../../gutenberg_standard/gutenberg/data/text"
OUTPUT_DIR = "../../gutenberg_standard/new_analysis/proc_text_new"
# A text is saved only when it has more than this many word tokens.
MIN_WORD_COUNT = 50000

# Ensure the directory the loop writes to exists (previously a different,
# unused directory was created and the real target was never created).
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Precompile regexes for speed and clarity.
# The START pattern uses a non-greedy `.*?` so it stops at the first closing
# `***` after the banner; a greedy `.*` with DOTALL would run to the last
# `***` in the file and swallow the whole book. DOTALL is kept so the banner
# may span a line break.
START_RE = re.compile(r'\*\*\*\s?START OF TH(?:IS|E) PROJECT GUTENBERG EBOOK.*?\*\*\*', re.IGNORECASE | re.DOTALL)
# The optional leading `***` is part of the match so it is cut off as well.
END_RE = re.compile(r'(?:\*\*\*\s?)?END OF TH(?:IS|E) PROJECT GUTENBERG EBOOK', re.IGNORECASE)
WORD_RE = re.compile(r"[a-zA-Z_]+")


def _strip_gutenberg_markers(raw):
    """Return the text between the START and END banners, stripped.

    When the START banner is absent the text is kept from the beginning;
    when the END banner is absent the text is kept through the end.
    """
    start_match = START_RE.search(raw)
    start = start_match.end() if start_match else 0

    # Search for the END banner only after the START banner so the cut
    # position can never precede the start position.
    end_match = END_RE.search(raw, start)
    end = end_match.start() if end_match else len(raw)

    return raw[start:end].strip()


ok_books = []
for b in tqdm(meta_df.id.unique(), desc="Processing Gutenberg books"):
    input_path = f"{INPUT_DIR}/{b}_text.txt"
    output_path = f"{OUTPUT_DIR}/{b}_text.txt"
    try:
        with open(input_path, "r", encoding="utf-8", errors="ignore") as f:
            raw = f.read()
    except OSError as e:
        # A missing or unreadable file is reported and skipped.
        print(f"Failed processing book {b}: {e}")
        continue

    # Truncate text to the book body
    truncated_raw = _strip_gutenberg_markers(raw)

    # Compute word count
    word_count = len(WORD_RE.findall(truncated_raw))

    # Save only long texts
    if word_count > MIN_WORD_COUNT:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(truncated_raw)
        ok_books.append(b)


Processing Gutenberg books:   1%|▊                                                                                                   | 151/19092 [00:01<01:59, 158.06it/s]

Failed processing book PG10511: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG10511_text.txt'
Failed processing book PG10547: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG10547_text.txt'


Processing Gutenberg books:   4%|████▎                                                                                               | 835/19092 [00:06<02:08, 142.24it/s]

Failed processing book PG12: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG12_text.txt'


Processing Gutenberg books:   8%|████████▏                                                                                          | 1584/19092 [00:13<02:28, 117.78it/s]

Failed processing book PG15: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG15_text.txt'


Processing Gutenberg books:  10%|█████████▍                                                                                         | 1830/19092 [00:15<02:17, 125.51it/s]

Failed processing book PG16: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG16_text.txt'


Processing Gutenberg books:  10%|█████████▊                                                                                         | 1891/19092 [00:15<02:10, 131.45it/s]

Failed processing book PG1723: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG1723_text.txt'


Processing Gutenberg books:  13%|████████████▋                                                                                      | 2454/19092 [00:19<02:04, 133.81it/s]

Failed processing book PG1939: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG1939_text.txt'


Processing Gutenberg books:  17%|█████████████████                                                                                  | 3280/19092 [00:26<01:48, 145.83it/s]

Failed processing book PG21850: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG21850_text.txt'


Processing Gutenberg books:  18%|█████████████████▋                                                                                 | 3401/19092 [00:27<01:35, 164.53it/s]

Failed processing book PG22203: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG22203_text.txt'
Failed processing book PG22206: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG22206_text.txt'


Processing Gutenberg books:  23%|██████████████████████▍                                                                            | 4324/19092 [00:32<01:47, 137.13it/s]

Failed processing book PG24895: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG24895_text.txt'


Processing Gutenberg books:  23%|██████████████████████▋                                                                            | 4366/19092 [00:32<01:27, 168.45it/s]

Failed processing book PG24: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG24_text.txt'


Processing Gutenberg books:  24%|███████████████████████▍                                                                           | 4519/19092 [00:33<02:08, 113.24it/s]

Failed processing book PG25564: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG25564_text.txt'


Processing Gutenberg books:  27%|██████████████████████████▍                                                                        | 5100/19092 [00:38<01:39, 141.28it/s]

Failed processing book PG2770: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG2770_text.txt'
Failed processing book PG2774: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG2774_text.txt'


Processing Gutenberg books:  30%|█████████████████████████████▊                                                                     | 5750/19092 [00:42<01:12, 184.33it/s]

Failed processing book PG2994: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG2994_text.txt'


Processing Gutenberg books:  33%|████████████████████████████████▋                                                                  | 6300/19092 [00:46<01:24, 151.68it/s]

Failed processing book PG3184: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG3184_text.txt'
Failed processing book PG3185: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG3185_text.txt'


Processing Gutenberg books:  33%|█████████████████████████████████                                                                  | 6379/19092 [00:47<01:13, 174.06it/s]

Failed processing book PG32060: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG32060_text.txt'


Processing Gutenberg books:  35%|███████████████████████████████████                                                                | 6757/19092 [00:49<01:34, 130.21it/s]

Failed processing book PG32: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG32_text.txt'


Processing Gutenberg books:  39%|██████████████████████████████████████▉                                                            | 7499/19092 [00:56<01:27, 131.75it/s]

Failed processing book PG35101: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG35101_text.txt'


Processing Gutenberg books:  41%|████████████████████████████████████████▎                                                          | 7785/19092 [00:58<01:29, 126.41it/s]

Failed processing book PG35: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG35_text.txt'
Failed processing book PG36050: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG36050_text.txt'


Processing Gutenberg books:  42%|█████████████████████████████████████████▊                                                         | 8074/19092 [01:01<01:31, 120.19it/s]

Failed processing book PG36: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG36_text.txt'


Processing Gutenberg books:  44%|███████████████████████████████████████████▎                                                       | 8352/19092 [01:03<01:29, 120.13it/s]

Failed processing book PG3803: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG3803_text.txt'


Processing Gutenberg books:  45%|████████████████████████████████████████████▊                                                      | 8639/19092 [01:06<01:14, 140.94it/s]

Failed processing book PG3926: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG3926_text.txt'


Processing Gutenberg books:  50%|█████████████████████████████████████████████████▊                                                 | 9618/19092 [01:13<01:19, 119.89it/s]

Failed processing book PG4274: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG4274_text.txt'


Processing Gutenberg books:  51%|██████████████████████████████████████████████████▎                                                | 9699/19092 [01:14<01:20, 117.25it/s]

Failed processing book PG42: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG42_text.txt'


Processing Gutenberg books:  52%|███████████████████████████████████████████████████▊                                               | 9997/19092 [01:16<01:05, 138.12it/s]

Failed processing book PG4405: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG4405_text.txt'


Processing Gutenberg books:  54%|████████████████████████████████████████████████████▌                                             | 10245/19092 [01:18<01:07, 131.56it/s]

Failed processing book PG44: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG44_text.txt'


Processing Gutenberg books:  55%|█████████████████████████████████████████████████████▊                                            | 10483/19092 [01:20<01:12, 118.73it/s]

Failed processing book PG45: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG45_text.txt'


Processing Gutenberg books:  55%|██████████████████████████████████████████████████████▎                                           | 10572/19092 [01:21<01:03, 134.36it/s]

Failed processing book PG462: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG462_text.txt'


Processing Gutenberg books:  56%|██████████████████████████████████████████████████████▉                                           | 10692/19092 [01:22<01:07, 125.26it/s]

Failed processing book PG46: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG46_text.txt'


Processing Gutenberg books:  57%|███████████████████████████████████████████████████████▉                                          | 10897/19092 [01:24<01:11, 115.27it/s]

Failed processing book PG47: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG47_text.txt'


Processing Gutenberg books:  61%|███████████████████████████████████████████████████████████▍                                      | 11583/19092 [01:29<00:40, 184.33it/s]

Failed processing book PG5124: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG5124_text.txt'


Processing Gutenberg books:  62%|████████████████████████████████████████████████████████████▌                                     | 11806/19092 [01:30<00:41, 177.57it/s]

Failed processing book PG51924: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51924_text.txt'
Failed processing book PG51936: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51936_text.txt'
Failed processing book PG51939: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51939_text.txt'
Failed processing book PG51940: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51940_text.txt'
Failed processing book PG51948: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51948_text.txt'
Failed processing book PG51951: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51951_text.txt'
Failed processing book PG51970: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51970_text.txt'


Processing Gutenberg books:  62%|████████████████████████████████████████████████████████████▋                                     | 11825/19092 [01:30<00:43, 167.62it/s]

Failed processing book PG51: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG51_text.txt'


Processing Gutenberg books:  64%|██████████████████████████████████████████████████████████████▊                                   | 12227/19092 [01:33<00:48, 142.91it/s]

Failed processing book PG5373: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG5373_text.txt'


Processing Gutenberg books:  66%|████████████████████████████████████████████████████████████████▌                                 | 12584/19092 [01:36<00:44, 147.70it/s]

Failed processing book PG54: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG54_text.txt'


Processing Gutenberg books:  74%|████████████████████████████████████████████████████████████████████████▉                         | 14205/19092 [01:48<00:25, 188.11it/s]

Failed processing book PG60: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG60_text.txt'


Processing Gutenberg books:  76%|██████████████████████████████████████████████████████████████████████████▍                       | 14493/19092 [01:50<00:20, 229.29it/s]

Failed processing book PG6191: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG6191_text.txt'


Processing Gutenberg books:  78%|███████████████████████████████████████████████████████████████████████████▉                      | 14802/19092 [01:51<00:23, 184.33it/s]

Failed processing book PG62: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG62_text.txt'


Processing Gutenberg books:  87%|█████████████████████████████████████████████████████████████████████████████████████▏            | 16591/19092 [02:01<00:13, 184.62it/s]

Failed processing book PG68: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG68_text.txt'


Processing Gutenberg books:  87%|█████████████████████████████████████████████████████████████████████████████████████▋            | 16689/19092 [02:02<00:12, 185.62it/s]

Failed processing book PG69279: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG69279_text.txt'


Processing Gutenberg books:  89%|███████████████████████████████████████████████████████████████████████████████████████           | 16961/19092 [02:04<00:13, 152.68it/s]

Failed processing book PG70251: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG70251_text.txt'


Processing Gutenberg books:  93%|███████████████████████████████████████████████████████████████████████████████████████████▍      | 17815/19092 [02:09<00:08, 144.91it/s]

Failed processing book PG72: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG72_text.txt'


Processing Gutenberg books:  96%|█████████████████████████████████████████████████████████████████████████████████████████████▉    | 18290/19092 [02:12<00:06, 117.54it/s]

Failed processing book PG74: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG74_text.txt'


Processing Gutenberg books:  96%|██████████████████████████████████████████████████████████████████████████████████████████████▌   | 18414/19092 [02:13<00:03, 183.09it/s]

Failed processing book PG7684: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG7684_text.txt'
Failed processing book PG76: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG76_text.txt'


Processing Gutenberg books:  97%|███████████████████████████████████████████████████████████████████████████████████████████████   | 18512/19092 [02:13<00:04, 132.25it/s]

Failed processing book PG77: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG77_text.txt'


Processing Gutenberg books:  97%|███████████████████████████████████████████████████████████████████████████████████████████████▏  | 18545/19092 [02:14<00:04, 123.54it/s]

Failed processing book PG78: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG78_text.txt'


Processing Gutenberg books:  98%|████████████████████████████████████████████████████████████████████████████████████████████████▌  | 18617/19092 [02:14<00:05, 94.60it/s]

Failed processing book PG81: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG81_text.txt'


Processing Gutenberg books:  98%|███████████████████████████████████████████████████████████████████████████████████████████████▋  | 18642/19092 [02:15<00:04, 103.26it/s]

Failed processing book PG83: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG83_text.txt'


Processing Gutenberg books:  98%|████████████████████████████████████████████████████████████████████████████████████████████████  | 18707/19092 [02:15<00:03, 111.97it/s]

Failed processing book PG85: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG85_text.txt'


Processing Gutenberg books:  98%|████████████████████████████████████████████████████████████████████████████████████████████████▏ | 18734/19092 [02:15<00:03, 107.93it/s]

Failed processing book PG86: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG86_text.txt'


Processing Gutenberg books:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████▌ | 18813/19092 [02:16<00:02, 97.42it/s]

Failed processing book PG90: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG90_text.txt'


Processing Gutenberg books:  99%|████████████████████████████████████████████████████████████████████████████████████████████████▉ | 18895/19092 [02:17<00:00, 210.62it/s]

Failed processing book PG91: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG91_text.txt'
Failed processing book PG9255: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG9255_text.txt'
Failed processing book PG92: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG92_text.txt'


Processing Gutenberg books:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████▏| 18941/19092 [02:17<00:00, 182.45it/s]

Failed processing book PG93: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG93_text.txt'


Processing Gutenberg books:  99%|█████████████████████████████████████████████████████████████████████████████████████████████████▎| 18961/19092 [02:17<00:00, 154.82it/s]

Failed processing book PG94: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG94_text.txt'
Failed processing book PG95: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG95_text.txt'


Processing Gutenberg books: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████▌| 19018/19092 [02:18<00:00, 127.19it/s]

Failed processing book PG96: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG96_text.txt'


Processing Gutenberg books: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████▊| 19048/19092 [02:18<00:00, 119.55it/s]

Failed processing book PG9830: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG9830_text.txt'
Failed processing book PG98: [Errno 2] No such file or directory: '../../gutenberg_standard/gutenberg/data/text/PG98_text.txt'


Processing Gutenberg books: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 19092/19092 [02:18<00:00, 137.50it/s]


In [85]:
# Number of books whose cleaned text passed the word-count threshold and was written out.
len(ok_books)

10884

In [86]:
# Recover book ids from the files in the cleaned-text directory: take each
# file name and drop its last 9 characters (the length of "_text.txt").
book_selected = []
for processed_path in glob("../../gutenberg_standard/new_analysis/proc_text_new/*"):
    file_name = processed_path.rsplit("/", 1)[-1]
    book_selected.append(file_name[:-9])

In [121]:
# Peek at the first five recovered book ids.
book_selected[:5]

['PG72860', 'PG16957', 'PG41021', 'PG70070', 'PG18579']

In [87]:
# Keep only the metadata rows whose id appears in book_selected, then
# renumber the index from zero.
in_selection = meta_df["id"].isin(book_selected)
meta_df = meta_df.loc[in_selection].reset_index(drop=True)

In [97]:
# Write the filtered metadata to CSV (the index is written as the first
# column, which is pandas' default).
meta_csv_path = "./gutenberg_meta_df.csv"
meta_df.to_csv(meta_csv_path)