# Gutenberg

https://huggingface.co/datasets/sedthh/gutenberg_english

In [3]:
from datasets import load_dataset
import json
import re

# Library of Congress Classification (LoCC) codes for the literature classes to keep.
# Outline of class P:
# https://www.loc.gov/aba/cataloging/classification/lcco/lcco_p.pdf
# metadata_filter keeps a book only if at least one of these codes occurs in its
# "locc" metadata field.
LOCC_WHITELIST = [
    "PN",  # Literature (General)
    "PQ",  # French, Italian, Spanish, Portuguese literature
    "PR",  # English literature
    "PS",  # American literature
    "PT",  # German, Dutch, Flemish, Afrikaans, Scandinavian literature
    "PZ",  # Fiction and juvenile belles lettres
]
# Terms that disqualify a book when found in its "subjects" or "bookshelves" metadata.
# metadata_filter lowercases those fields and tests containment, so every entry here
# must be lowercase; truncated stems such as "biograph" or "philosoph" cover several
# word forms with a single entry.
# NOTE(review): assuming those fields are plain strings, short entries such as "law"
# or "play" also match inside longer words ("outlaw", "display") — confirm intended.
BLACKLIST = [
    "poem",
    "opera",
    "music",
    "drama",
    "play",
    "theater",
    "theatre",
    "song",
    "religious",
    "poetry",
    "lyric",
    "biograph",
    "memoir",
    "journal",
    "political",
    "christianity",
    "diary",
    "essay",
    "criticism",
    "review",
    "commentary",
    "philosoph",
    "religion",
    "spiritual",
    "self-help",
    "psycholog",
    "sociolog",
    "politics",
    "economic",
    "business",
    "finance",
    "law",
    "mathematic",
    "didactic",
    "conflict of generation",
    "love stor",
    "domestic",
    "marriage",
    "kentucky",
    "tennessee",
    "virginia",
    "civil war",
    "native american",
    "bildungsroman",
    "indiana",
    "illinois",
    "social life and customs",
    "correspondence",
    "slavery",
    "humor",
    "christmas",
]
# Matches any run of four consecutive digits; metadata_filter uses it to pull
# candidate years out of the "issued" and "authors" metadata fields.
year_re = re.compile(r"\d{4}")

# Load the Gutenberg English dataset by its Hugging Face dataset id. No split is
# requested here; later code indexes the result with ds["train"].
ds = load_dataset("sedthh/gutenberg_english")


def metadata_filter(row):
    """Return True when a dataset row should be kept.

    The row's "METADATA" value is parsed as JSON. A row is kept only when all of
    the following hold (checked in this order, stopping at the first failure):

    - "locc" contains at least one code from LOCC_WHITELIST
    - neither "subjects" nor "bookshelves" contains any BLACKLIST term
      (case-insensitive containment)
    - every 4-digit number found in "issued" is >= 1900
    - every 4-digit number found in "authors" is >= 1850
    - len(row["TEXT"]) >= 50000

    A field without any 4-digit number passes its year check vacuously.

    Args:
        row: mapping with a "METADATA" entry (JSON text) and a "TEXT" entry.

    Returns:
        bool: True to keep the row, False to drop it.

    Raises:
        KeyError: if the row or its parsed metadata lacks one of the keys above.
        json.JSONDecodeError: if "METADATA" is not valid JSON.
    """
    meta = json.loads(row["METADATA"])

    if not any(code in meta["locc"] for code in LOCC_WHITELIST):
        return False

    for field in ("subjects", "bookshelves"):
        # Lowercase once per field rather than once per blacklist term.
        haystack = meta[field].lower()
        if any(term in haystack for term in BLACKLIST):
            return False

    if any(int(year) < 1900 for year in year_re.findall(meta["issued"])):
        return False

    if any(int(year) < 1850 for year in year_re.findall(meta["authors"])):
        return False

    return len(row["TEXT"]) >= 50000


# Keep only the rows accepted by metadata_filter. `ds` is rebound to the filtered
# result, so the unfiltered dataset is no longer reachable under this name.
ds = ds.filter(metadata_filter)
# Number of rows left in the "train" split after filtering.
book_count = len(ds["train"])
print(f"Book count after filter: {book_count}")

Filter: 100%|██████████| 48284/48284 [01:18<00:00, 618.13 examples/s]


Book count after filter: 8777


In [7]:
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the project-local
# imports below resolve. Guarded so that re-running this cell does not keep adding
# the same entry to sys.path.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from utils import PROCESSED_BOOKS_PATH, BOOK_DIR
from book_collection import BookCollector

# Hand `ds` (the dataset bound by an earlier cell) to the collector and start its
# interface.
# NOTE(review): judging by the recorded output, this starts a local web server and
# runs until interrupted — confirm against BookCollector.launch_interface.
collector = BookCollector(ds, BOOK_DIR, PROCESSED_BOOKS_PATH)
collector.launch_interface()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Keyboard interruption in main thread... closing server.


In [4]:
import json
import sys
from pathlib import Path

# Make the parent of the current working directory importable so the project-local
# import below resolves. Guarded so that re-running this cell does not keep adding
# the same entry to sys.path.
_parent_dir = str(Path.cwd().parent)
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)
from utils import BOOK_DIR

# One (year, title, filename) tuple per metadata file whose "year" is an integer.
entries = []

for book in (BOOK_DIR / "meta").glob("*.json"):
    # NOTE(review): the file is decoded with the platform default encoding; if the
    # writer of these files uses a fixed encoding, pass the same one here.
    with open(book, "r") as f:
        meta = json.load(f)
    y = meta.get("year")
    if isinstance(y, int):
        entries.append((y, meta.get("title"), book.name))

if not entries:
    print("No entries with year")
else:
    # Sort by year, then title, then filename. The explicit key substitutes "" for a
    # missing title: a plain tuple sort would compare None with a str whenever two
    # books share a year, and raise TypeError.
    entries.sort(key=lambda e: (e[0], e[1] or "", e[2]))
    earliest_year = entries[0][0]
    latest_year = entries[-1][0]
    # Several books can share the extreme year, so collect all of them.
    earliest = [e for e in entries if e[0] == earliest_year]
    latest = [e for e in entries if e[0] == latest_year]
    print("EARLIEST YEAR:", earliest_year)
    for _year, title, filename in earliest:
        print("  -", title, f"({filename})")
    print("LATEST YEAR:", latest_year)
    for _year, title, filename in latest:
        print("  -", title, f"({filename})")

EARLIEST YEAR: 1889
  - The Witch of Prague: A Fantastic Tale (the-witch-of-prague-a-fantastic-tale.json)
LATEST YEAR: 2005
  - The Revolutions of Time (the-revolutions-of-time.json)
