# Gutenberg

Source dataset: https://huggingface.co/datasets/sedthh/gutenberg_english (loaded below with `load_dataset` and filtered by metadata).

In [1]:
from datasets import load_dataset
import json
import re

# Library of Congress Classification codes that a book's "locc" metadata is
# checked against (see metadata_filter).
# Reference: https://www.loc.gov/aba/cataloging/classification/lcco/lcco_p.pdf
#   PN  Literature (General)
#   PQ  French, Italian, Spanish, Portuguese literature
#   PR  English literature
#   PS  American literature
#   PT  Other european literature
#   PZ  Fiction and juvenile belles lettres
LOCC_WHITELIST = "PN PQ PR PS PT PZ".split()

# Lowercase fragments that disqualify a book when found inside its "subjects"
# or "bookshelves" metadata. Matching is by substring, so some entries are
# word stems (e.g. "biograph", "philosoph", "love stor") rather than whole
# words. Entries are comma-separated below; none of them contains a comma.
BLACKLIST = [
    fragment.strip()
    for fragment in (
        "poem, opera, music, drama, play, theater, theatre, song, religious, "
        "poetry, lyric, biograph, memoir, journal, political, christianity, "
        "diary, essay, criticism, review, commentary, philosoph, religion, "
        "spiritual, self-help, psycholog, sociolog, politics, economic, "
        "business, finance, law, mathematic, didactic, conflict of generation, "
        "love stor, domestic, marriage, kentucky, tennessee, virginia, "
        "civil war, native american, bildungsroman, indiana, illinois, "
        "social life and customs, correspondence, slavery, humor"
    ).split(",")
]

# Matches any run of four consecutive digits; used to pull years out of the
# "issued" and "authors" metadata strings.
year_re = re.compile(r"\d{4}")

# Load the "sedthh/gutenberg_english" dataset through the Hugging Face
# `datasets` library. Later cells access its "train" split via ds["train"].
ds = load_dataset("sedthh/gutenberg_english")


def metadata_filter(row):
    """Decide whether a dataset row (one book) should be kept.

    ``row["METADATA"]`` is parsed as a JSON object and ``row["TEXT"]`` holds
    the book text. A row is kept only when ALL of the following hold:

    - ``locc`` contains at least one LOCC_WHITELIST term
    - neither ``subjects`` nor ``bookshelves`` contains a BLACKLIST term
      (case-insensitive substring match)
    - every four-digit number found in ``issued`` is >= 1900
    - every four-digit number found in ``authors`` is >= 1850
    - ``len(row["TEXT"])`` >= 50000

    A metadata field that is missing or null is treated as an empty string,
    so it can never satisfy the whitelist and never trips the blacklist or
    the year checks.

    NOTE(review): in the sample record shown in this notebook, "issued" is
    "2008-06-25 00:00:00" for Peter Pan, which looks like the Project
    Gutenberg release date rather than the original publication year. If so,
    the ``issued`` check rarely excludes anything -- confirm the intent.

    Args:
        row: mapping with at least the keys "METADATA" (JSON string) and
            "TEXT".

    Returns:
        bool: True to keep the row, False to drop it.
    """
    meta = json.loads(row["METADATA"])

    def field(key):
        # Missing keys and JSON nulls both collapse to "".
        return meta.get(key) or ""

    locc = field("locc")
    if not any(term in locc for term in LOCC_WHITELIST):
        return False

    # Lowercase once per field instead of once per blacklist term.
    subjects = field("subjects").lower()
    bookshelves = field("bookshelves").lower()
    if any(term in subjects or term in bookshelves for term in BLACKLIST):
        return False

    # all() over an empty sequence is True, so fields without any
    # four-digit number pass these checks.
    issued_ok = all(int(year) >= 1900 for year in year_re.findall(field("issued")))
    authors_ok = all(int(year) >= 1850 for year in year_re.findall(field("authors")))

    return issued_ok and authors_ok and len(row["TEXT"]) >= 50000


# Keep only the rows for which metadata_filter returns True. `ds` is rebound
# to the filtered dataset, so later cells see the filtered version.
ds = ds.filter(metadata_filter)
# Number of rows remaining in the "train" split after filtering.
book_count = len(ds["train"])
print(f"Book count after filter: {book_count}")

  from .autonotebook import tqdm as notebook_tqdm


Book count after filter: 8867


In [6]:
# Show the raw METADATA value (a JSON string) of the first row in the "train"
# split, to see which fields metadata_filter has to work with.
ds["train"][0]["METADATA"]

'{"language": "en", "text_id": 16, "title": "Peter Pan", "issued": "2008-06-25 00:00:00", "authors": "Barrie, J. M. (James Matthew), 1860-1937", "subjects": "Fantasy literature; Peter Pan (Fictitious character) -- Fiction; Never-Never Land (Imaginary place) -- Fiction; Pirates -- Fiction; Fairies -- Fiction", "locc": "PR; PZ", "bookshelves": "Children\'s Literature; Movie Books"}'

In [4]:
import sys
from pathlib import Path

# Add the parent of the current working directory to the import path. This
# must run before the two imports below -- presumably `utils` and
# `book_collection` live in that parent directory (TODO confirm). Note that
# it depends on the directory the kernel was started in.
sys.path.append(str(Path.cwd().parent))
from utils import PROCESSED_BOOKS_PATH, BOOK_DIR
from book_collection import BookCollector

# NOTE(review): BookCollector is project code not visible here. The recorded
# output of this cell shows a server on http://127.0.0.1:7860 that ran until
# a keyboard interrupt, so launch_interface() appears to start a local web UI
# and block the cell -- confirm against book_collection.
collector = BookCollector(ds, BOOK_DIR, PROCESSED_BOOKS_PATH)
collector.launch_interface()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Opening in existing browser session.
Keyboard interruption in main thread... closing server.


In [10]:
# Print the title of every book whose metadata file under BOOK_DIR / "meta"
# has "genre" set to "Fiction".
# sorted(): glob yields paths in filesystem-dependent order; sorting makes the
# printed list reproducible across runs and machines.
for book in sorted((BOOK_DIR / "meta").glob("*.json")):
    # Explicit UTF-8 instead of the locale-dependent default, since titles can
    # contain non-ASCII characters. NOTE(review): assumes the files were
    # written as UTF-8 (or ASCII-escaped JSON) -- confirm against the writer.
    with open(book, "r", encoding="utf-8") as f:
        meta = json.load(f)
    # The file is closed at this point; only the parsed dict is needed.
    if meta.get("genre") == "Fiction":
        print(meta.get("title"))

The Shadow Line: A Confession
Anne of Green Gables
Dream Days
My Ántonia
Three Elephant Power, and Other Stories
Herland
The Mucker
O Pioneers!
The Call of the Wild
The Insidious Dr. Fu Manchu
The Golden Road
The Wisdom of Father Brown
Dear Enemy
Frivolous Cupid
Penrod
Moran of the Lady Letty
Bunner Sisters
The Return of Sherlock Holmes
Anne of Avonlea
Anne of the Island
The Innocence of Father Brown
The Burial of the Guns
Jungle Tales of Tarzan
Freckles
