# Project Gutenberg: scraping, downloading, and metadata
This notebook provides a single workflow with options for scraping a bookshelf, downloading texts, cleaning, and writing metadata CSVs.

In [6]:
import csv
import re
import time
import urllib.error
import urllib.request
from pathlib import Path

from bs4 import BeautifulSoup

# Line prefixes treated as "start of text" markers (license banners, production
# credits, START lines). strip_gutenberg() tests each line of a downloaded book
# against these with str.startswith. This is a set, so the listing order below
# carries no meaning. Two entries ("More information about this book..." and
# "We need your donations...") also appear in END_TEXT.
BEGIN_TEXT = {
    "*END*THE SMALL PRINT",
    "*** START OF THE PROJECT GUTENBERG",
    "*** START OF THIS PROJECT GUTENBERG",
    " *** START OF THIS PROJECT GUTENBERG",
    "***START OF THE PROJECT GUTENBERG",
    "*** START OF THE COPYRIGHTED",
    "**The Project Gutenberg",
    "*SMALL PRINT!",
    "****     SMALL PRINT!",
    "[\"Small Print\" V.",
    "This etext was prepared by",
    "This etext was produced by",
    "This Etext was prepared by",
    "This eBook was prepared by",
    "This Project Gutenberg Etext was prepared by",
    "E-text prepared by",
    "Produced by",
    "Distributed Proofreading Team",
    "Proofreading Team at http://www.pgdp.net",
    "http://gallica.bnf.fr)",
    "      http://archive.org/details/",
    "      (http://www.ibiblio.org/gutenberg/",
    "http://www.pgdp.net",
    "by The Internet Archive)",
    "by The Internet Archive/Canadian Libraries",
    "by The Internet Archive/American Libraries",
    "public domain material from the Internet Archive",
    "Internet Archive)",
    "Internet Archive/Canadian Libraries",
    "Internet Archive/American Libraries",
    "material from the Google Print project",
    "*END THE SMALL PRINT",
    "The Project Gutenberg",
    "http://gutenberg.spiegel.de/ erreichbar.",
    "http://gutenberg2000.de erreichbar.",
    "Project Runeberg publishes",
    "Beginning of this Project Gutenberg",
    "Project Gutenberg Online Distributed",
    "Gutenberg Online Distributed",
    "the Project Gutenberg Online Distributed",
    "the Project Gutenberg Online Distributed Proofreading Team",
    "Project Gutenberg TEI",
    "Gutenberg Distributed Proofreaders",
    "Project Gutenberg Distributed Proofreaders",
    "and the Project Gutenberg Online Distributed Proofreading Team",
    "Mary Meehan, and the Project Gutenberg Online Distributed Proofreading",
    "                this Project Gutenberg edition.",
    "More information about this book is at the top of this file.",
    "tells you about restrictions in how the file may be used.",
    "of the etext through OCR.",
    "*****These eBooks Were Prepared By Thousands of Volunteers!*****",
    "We need your donations more than ever!",
}

# Line prefixes treated as "end of text" markers (END banners in English,
# German and French, plus a few boilerplate sentences). strip_gutenberg() tests
# each line of a downloaded book against these with str.startswith. The last
# two entries are also present in BEGIN_TEXT.
END_TEXT = {
    "*** END OF THE PROJECT GUTENBERG",
    "*** END OF THIS PROJECT GUTENBERG",
    " *** END OF THIS PROJECT GUTENBERG",
    "        ***END OF THE PROJECT GUTENBERG",
    "***END OF THE PROJECT GUTENBERG",
    "*** END OF THE COPYRIGHTED",
    "End of the Project Gutenberg",
    " End of the Project Gutenberg",
    "End of The Project Gutenberg",
    "End of Project Gutenberg",
    "End of this Project Gutenberg",
    "END OF PROJECT GUTENBERG",
    "End of this is COPYRIGHTED",
    "by Project Gutenberg",
    "The Project Gutenberg Etext of ",
    "**This is a COPYRIGHTED Project Gutenberg Etext, Details Above**",
    "Ende dieses Project Gutenberg",
    "Ende dieses Projekt Gutenberg",
    "Ende dieses Etextes ",
    "Ende diese Project Gutenberg",
    "Ende dieses Project Gutenber",
    "Fin de Project Gutenberg",
    "More information about this book is at the top of this file.",
    "We need your donations more than ever!",
}

def fetch_url(url, timeout=30):
    """Download ``url`` and return the raw response body.

    Args:
        url: Address to fetch; passed to ``urllib.request.Request``.
        timeout: Seconds a blocking network operation may take before the
            request is abandoned. Without a timeout, a server that stops
            responding would stall the notebook indefinitely.

    Returns:
        bytes: The undecoded response body.

    Raises:
        urllib.error.HTTPError: The server answered with an error status.
        urllib.error.URLError: Any other failure to fetch the resource,
            including low-level socket errors raised while reading the body.
    """
    # Send a browser-style User-Agent rather than urllib's default one.
    request = urllib.request.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            return response.read()
    except urllib.error.URLError:
        # Already the exception type callers handle (HTTPError included).
        raise
    except OSError as exc:
        # Read timeouts and dropped connections surface as plain OSError
        # subclasses; wrap them so the callers' URLError handlers still apply.
        raise urllib.error.URLError(exc) from exc

def soup_from_url(url):
    """Fetch ``url`` and parse the response with BeautifulSoup's ``html.parser``.

    Any exception raised by ``fetch_url`` propagates to the caller.
    """
    markup = fetch_url(url)
    return BeautifulSoup(markup, "html.parser")

def extract_book_ids(bookshelf_url):
    """Return the ebook ids linked from a bookshelf page.

    Every ``<a>`` tag with an ``href`` is searched for ``/ebooks/<digits>``.
    Each id is returned once, as a string, in order of first appearance.
    """
    page = soup_from_url(bookshelf_url)
    ebook_link = re.compile(r"/ebooks/(\d+)")
    hits = (ebook_link.search(anchor["href"]) for anchor in page.find_all("a", href=True))
    # dict keys keep insertion order, so this drops repeats without reordering.
    return list(dict.fromkeys(hit.group(1) for hit in hits if hit))

def safe_slug(text, max_length=80):
    """Turn ``text`` into a filename-friendly slug.

    Runs of characters other than ASCII letters and digits collapse into a
    single "-", and leading/trailing dashes are removed.

    Args:
        text: Source string, e.g. a book title.
        max_length: Maximum number of characters kept in the slug.

    Returns:
        str: The slug, or "untitled" when nothing usable remains.
    """
    slug = re.sub(r"[^a-zA-Z0-9]+", "-", text).strip("-")
    # Cutting at max_length can land right after a separator, so strip again
    # to avoid filenames that end in "-".
    slug = slug[:max_length].rstrip("-")
    return slug or "untitled"

def parse_bibrec(soup):
    """Extract bibliographic fields from a book page's ``bibrec`` table.

    Each table row with both a ``<th>`` and a ``<td>`` is read as a
    label/value pair. Only the labels "title", "author", "language" and
    "subject" (case-insensitive) are kept.

    Args:
        soup: Parsed book page exposing BeautifulSoup's ``find`` API.

    Returns:
        dict: ``title`` (str), ``author`` (str), ``language`` (str) and
        ``subject`` (list of str). Repeated author or language rows are joined
        with "; " so no contributor is lost. All fields are empty when the
        page has no ``bibrec`` table.
    """
    data = {"title": "", "author": "", "language": "", "subject": []}
    bibrec = soup.find("table", id="bibrec")
    if not bibrec:
        return data
    authors = []
    languages = []
    for row in bibrec.find_all("tr"):
        header = row.find("th")
        value = row.find("td")
        if not header or not value:
            continue
        key = header.get_text(strip=True).lower()
        text = value.get_text(" ", strip=True)
        if key == "title":
            data["title"] = text
        elif key == "author":
            # Collect every row: plain assignment kept only the last author.
            if text:
                authors.append(text)
        elif key == "language":
            if text:
                languages.append(text)
        elif key == "subject":
            data["subject"].append(text)
    data["author"] = "; ".join(authors)
    data["language"] = "; ".join(languages)
    return data

def download_text(book_id):
    """Download the plain-text edition of a book, trying several URLs.

    The candidate URLs are tried in order. The first one whose body decodes
    (UTF-8, undecodable bytes replaced) to non-blank text wins.

    Returns:
        tuple[str, str]: ``(text, url)`` for the first usable download, or
        ``("", "")`` when every candidate fails or is blank.
    """
    candidates = (
        f"https://www.gutenberg.org/ebooks/{book_id}.txt.utf-8",
        f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt",
        f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt.utf-8",
    )
    for candidate in candidates:
        try:
            payload = fetch_url(candidate)
        except urllib.error.URLError:
            # HTTPError subclasses URLError, so one handler covers both.
            continue
        decoded = payload.decode("utf-8", errors="replace")
        if decoded.strip():
            return decoded, candidate
    return "", ""

def strip_gutenberg(text, begin_markers=None, end_markers=None, header_lines=600):
    """Remove Project Gutenberg header and footer boilerplate from ``text``.

    The text is scanned once, line by line:

    * Within the first ``header_lines`` lines, every line that starts with a
      begin marker moves the start of the body to the following line, so the
      last such marker wins. Taking only the first marker would keep most of
      the preamble whenever the title line itself matches a marker.
    * The first later line that starts with an end marker closes the body.
      Lines already consumed as begin markers are never treated as end
      markers, so the result can no longer end before it starts.

    Lines are compared after ``str.strip()``, and markers are stripped the
    same way so entries written with surrounding spaces can still match.
    A leading byte-order mark is dropped before scanning.

    Args:
        text: Full text of a downloaded book.
        begin_markers: Iterable of start-of-text prefixes; defaults to
            ``BEGIN_TEXT``.
        end_markers: Iterable of end-of-text prefixes; defaults to
            ``END_TEXT``.
        header_lines: Number of leading lines in which begin markers are
            honoured.

    Returns:
        str: The body between the markers, stripped of surrounding
        whitespace. When no marker matches, the whole text is returned
        stripped.
    """
    if begin_markers is None:
        begin_markers = BEGIN_TEXT
    if end_markers is None:
        end_markers = END_TEXT
    # str.startswith accepts a tuple; drop markers that are empty once stripped,
    # because an empty prefix would match every line.
    begin_prefixes = tuple(filter(None, (marker.strip() for marker in begin_markers)))
    end_prefixes = tuple(filter(None, (marker.strip() for marker in end_markers)))

    # str.strip() does not remove U+FEFF, so a BOM would hide a first-line marker.
    lines = text.lstrip("\ufeff").splitlines()
    start = 0
    end = len(lines)
    for index, line in enumerate(lines):
        check = line.strip()
        if index < header_lines and check.startswith(begin_prefixes):
            start = index + 1
            continue
        if check.startswith(end_prefixes):
            end = index
            break
    return "\n".join(lines[start:end]).strip()

def write_text(path, text):
    """Write ``text`` to ``path`` as UTF-8, creating parent folders as needed.

    ``path`` must be a ``pathlib.Path``; an existing file is overwritten.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        handle.write(text)

def run_bookshelf_pipeline(bookshelf_url, max_books, out_dir, sleep_sec=0.2, download_raw=True, write_clean=True, write_metadata=True):
    """Scrape a bookshelf page and save texts and/or metadata for its books.

    For each of the first ``max_books`` ids found on the bookshelf page, the
    book page is fetched and its bibrec table parsed. Depending on the
    toggles, the plain text is then downloaded and written to
    ``<out_dir>/raw`` and/or ``<out_dir>/clean``. A book whose page or text
    cannot be fetched is reported and skipped.

    Args:
        bookshelf_url: URL of the bookshelf listing page.
        max_books: Upper bound on the number of ids taken from the listing
            (used as a slice bound).
        out_dir: Output folder; resolved to an absolute path and created.
        sleep_sec: Pause, in seconds, between consecutive books. The pause
            also applies after a failed book so that a run of errors does not
            turn into rapid-fire requests.
        download_raw: Write the downloaded text unchanged under ``raw/``.
        write_clean: Write the text with Gutenberg boilerplate stripped
            under ``clean/``.
        write_metadata: Collect one metadata row per book and write
            ``<out_dir>/metadata.csv``.

    Returns:
        list[dict]: The metadata rows (empty when ``write_metadata`` is off).
    """
    out_dir = Path(out_dir).resolve()
    raw_dir = out_dir / "raw"
    clean_dir = out_dir / "clean"
    out_dir.mkdir(parents=True, exist_ok=True)
    needs_text = download_raw or write_clean
    book_ids = extract_book_ids(bookshelf_url)[:max_books]
    metadata_rows = []

    for position, book_id in enumerate(book_ids):
        # Pause before every book except the first, whether or not the
        # previous one succeeded.
        if position:
            time.sleep(sleep_sec)

        book_url = f"https://www.gutenberg.org/ebooks/{book_id}"
        try:
            soup = soup_from_url(book_url)
        except urllib.error.URLError as exc:
            # HTTPError subclasses URLError, so this handles both.
            print(f"Skipping book {book_id}: could not load {book_url} ({exc})")
            continue

        meta = parse_bibrec(soup)
        title = meta["title"] or f"gutenberg-{book_id}"
        slug = safe_slug(title)
        raw_path = raw_dir / f"{book_id}__{slug}.txt"
        clean_path = clean_dir / f"{book_id}__{slug}.txt"

        if needs_text:
            text, _ = download_text(book_id)
            if not text:
                print(f"Skipping book {book_id}: no plain-text download available")
                continue
            if download_raw:
                write_text(raw_path, text)
            if write_clean:
                write_text(clean_path, strip_gutenberg(text))

        if write_metadata:
            local_path = ""
            if needs_text:
                saved_path = clean_path if write_clean else raw_path
                # Stored relative to the folder two levels above out_dir.
                # NOTE(review): presumably the repository root - confirm.
                local_path = str(saved_path.relative_to(out_dir.parent.parent))
            metadata_rows.append(
                {
                    "title": title,
                    "author": meta["author"],
                    "gutenberg_id": book_id,
                    "language": meta["language"],
                    "subject": "; ".join(meta["subject"]) if meta["subject"] else "",
                    "url": book_url,
                    "local_path": local_path,
                }
            )

    if write_metadata:
        csv_path = out_dir / "metadata.csv"
        fieldnames = [
            "title",
            "author",
            "gutenberg_id",
            "language",
            "subject",
            "url",
            "local_path",
        ]
        # newline="" lets the csv module control line endings itself.
        with csv_path.open("w", encoding="utf-8", newline="") as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(metadata_rows)
        print(f"Wrote {len(metadata_rows)} items to {csv_path}")

    return metadata_rows

## Configuration
Set the bookshelf URL, output folder, and toggles below.

In [7]:
# Bookshelf listing page that is scraped for ebook links.
bookshelf_url = "https://www.gutenberg.org/ebooks/bookshelf/68"
# Only the first `max_books` ids found on the listing are processed.
max_books = 2
# Output root; text files and metadata.csv are written beneath it.
out_dir = Path("outputs")
# Seconds the pipeline sleeps between books.
sleep_sec = 0.2

# Toggles passed straight through to run_bookshelf_pipeline().
download_raw = True
write_clean = True
write_metadata = True

## Run pipeline
This produces optional raw/clean text files and a metadata CSV.

In [8]:
# Run the pipeline with the values from the configuration cell.
metadata_rows = run_bookshelf_pipeline(
    bookshelf_url=bookshelf_url,
    max_books=max_books,
    out_dir=out_dir,
    sleep_sec=sleep_sec,
    download_raw=download_raw,
    write_clean=write_clean,
    write_metadata=write_metadata,
)

Wrote 2 items to /Users/alexwermer-colan/Code/SF-Nexus/webscraping-SF/gutenberg/outputs/metadata.csv


## Download-only example (bookshelf by women)
Raw .txt only, no metadata or cleaning. Update the URL if the wiki page changes.

In [10]:
download_only_url = "https://www.gutenberg.org/ebooks/bookshelf/68"
download_only_dir = Path("downloads")

# Raw downloads only; the returned rows are discarded because metadata is off.
# `sleep_sec` comes from the configuration cell.
_ = run_bookshelf_pipeline(
    bookshelf_url=download_only_url,
    max_books=10,
    out_dir=download_only_dir,
    sleep_sec=sleep_sec,
    download_raw=True,
    write_clean=False,
    write_metadata=False,
)

## Optional: package-based Gutenberg API
- Gutenberg (PyPI package): https://github.com/ageitgey/Gutenberg
- Gutenberg HTTP: https://github.com/c-w/gutenberg-http/

If a mirror error occurs, set a mirror such as https://gutenberg.pglaf.org/ in your API calls.
If bookshelf wiki pages 404, prefer the numeric bookshelf endpoints like https://www.gutenberg.org/ebooks/bookshelf/68.

In [18]:
# The gutenberg package is optional, so it is imported inside the try block
# and a hint is printed when it is not installed.
try:
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers

    # Load etext 2701 from the given mirror, strip the Gutenberg headers,
    # and preview the first 500 characters.
    text = strip_headers(load_etext(2701, mirror="https://gutenberg.pglaf.org/")).strip()
    print(text[:500])
except ImportError:
    print("Install the gutenberg package to use API queries.")

Install the gutenberg package to use API queries.


### Metadata cache (required for `gutenberg.query`)
Populate the cache once, then queries are fast. This can take a long time.

In [19]:
# Build (once) and register a SQLite-backed metadata cache.
try:
    from gutenberg.acquire import set_metadata_cache
    from gutenberg.acquire.metadata import SqliteMetadataCache

    sqlite_cache = SqliteMetadataCache("/tmp/gutenberg_cache.sqlite")
    # populate() refuses to run on a cache that already exists, so only build
    # it the first time; this keeps the cell safe to re-run.
    if not sqlite_cache.exists:
        sqlite_cache.populate()
    set_metadata_cache(sqlite_cache)
except ImportError:
    print("Install the gutenberg package to configure metadata cache.")

Install the gutenberg package to configure metadata cache.


In [20]:
# Populate whichever metadata cache is currently registered, if still empty.
try:
    from gutenberg.acquire import get_metadata_cache
    cache = get_metadata_cache()
    # Skip the (slow) build when the cache is already there; calling
    # populate() on an existing cache raises instead of refreshing it.
    if not cache.exists:
        cache.populate()
except ImportError:
    print("Install the gutenberg package to populate the cache.")

Install the gutenberg package to populate the cache.


In [14]:
# Optional metadata queries (requires populated cache)
# from gutenberg.query import get_etexts, get_metadata
# print(get_metadata("title", 2701))
# print(get_metadata("author", 2701))
# print(get_etexts("title", "Moby Dick; Or, The Whale"))
# print(get_etexts("author", "Melville, Herman"))