In [7]:
# Yajurveda (TITUS) — Devanagari scraper with simple keys
# Outputs:
#   devanagari_black.json  # {"x.y.z": "देवना…", ...}
#   devanagari_white.json  # {"a.b":   "देवना…", ...}
#
# Notes:
# - BLACK source:  http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvs/ts/ts001.htm ...
#   key = Book.Chapter.Verse (x.y.z)  (Devanagari sentences joined)
# - WHITE source: http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvw/vs/vs001.htm ...
#   key = Paragraph.Verse (a.b)       (Devanagari sentences joined)

import os, re, json, time, html as htmlmod
import requests

# ----------------------------
# HTTP fetch with sane headers
# ----------------------------
# One shared session so every request reuses the same connection pool and
# carries the same browser-like User-Agent header.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = (
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/124 Safari/537.36"
)

def fetch(url: str) -> str:
    """
    Download `url` through the shared SESSION and return the body as text.

    The request uses a 30 second timeout; `raise_for_status()` turns an
    HTTP error status into an exception. The body is always decoded as
    UTF-8, overriding whatever charset the response would otherwise use.
    """
    response = SESSION.get(url, timeout=30)
    response.raise_for_status()
    # Force the decoding before .text is read.
    response.encoding = "utf-8"
    body = response.text
    return body

# --------------------------------------
# HTML → plain text normalization (TITUS)
# --------------------------------------
def titus_html_to_text(html: str) -> str:
    """
    Flatten an HTML page into newline-separated plain-text lines.

    <br> variants and closing </p> tags become line breaks, every other tag
    is removed, HTML entities are decoded, and whitespace inside each line
    is collapsed to single spaces. Empty lines are dropped.
    """
    # Line-breaking markup first, so the breaks survive the tag removal.
    text = re.sub(r'(?is)<\s*br\s*/?\s*>', '\n', html)
    text = re.sub(r'(?is)</\s*p\s*>', '\n', text)
    # Remove every remaining tag.
    text = re.sub(r'(?is)<[^>]+>', '', text)
    # Entities are decoded only after the tags are gone (&nbsp; etc.).
    text = htmlmod.unescape(text)
    # Non-breaking and thin spaces become ordinary spaces.
    for odd_space in ('\xa0', '\u2009'):
        text = text.replace(odd_space, ' ')
    text = text.replace('\r', '')

    kept = []
    for raw_line in text.split('\n'):
        squeezed = re.sub(r'\s+', ' ', raw_line).strip()
        if squeezed:
            kept.append(squeezed)
    return '\n'.join(kept)

# Patterns for the structural labels found on a normalized text line.
# All are case-insensitive and tolerate leading/trailing whitespace.
RE_BOOK = re.compile(r'^\s*Book:\s*(\d+)\s*$', re.IGNORECASE)
RE_CHAP = re.compile(r'^\s*Chapter:\s*(\d+)\s*$', re.IGNORECASE)
# "Paragraph" corresponds to the Adhyāya level in the White text.
RE_PARA = re.compile(r'^\s*Paragraph:\s*(\d+)\s*$', re.IGNORECASE)
# "Verse" corresponds to the Mantra level.
RE_VERSE = re.compile(r'^\s*Verse:\s*(\d+)\s*$', re.IGNORECASE)
# Sentence label (letters, digits or '='), then whitespace, then the text.
RE_SENT = re.compile(r'^\s*Sentence:\s*([A-Za-z0-9=]+)\s+(.*)$', re.IGNORECASE)

def parse_titus_page(html: str):
    """
    Turn one page of HTML into a list of sentence records, in page order.

    The page is flattened to text lines. Book / Chapter / Paragraph / Verse
    label lines update the running position; every Sentence line emits a
    dict with keys book, chapter, paragraph, verse, sentence, text, using
    the position seen so far on this page (None for labels not yet seen).
    Intended for both the VS (white) and TS (black) pages.
    """
    # Running position; starts empty for every page.
    position = {"book": None, "chapter": None, "paragraph": None, "verse": None}
    label_patterns = (
        ("book", RE_BOOK),
        ("chapter", RE_CHAP),
        ("paragraph", RE_PARA),
        ("verse", RE_VERSE),
    )
    records = []
    for line in titus_html_to_text(html).split('\n'):
        for field, pattern in label_patterns:
            hit = pattern.match(line)
            if hit:
                position[field] = int(hit.group(1))
                break
        else:
            # Not a structural label: emit a record if it is a sentence line.
            hit = RE_SENT.match(line)
            if hit:
                records.append({
                    **position,
                    "sentence": hit.group(1),
                    "text": hit.group(2),
                })
    return records

def iter_parts(base_url: str, stem: str, start=1, max_holes=3, sleep=0.25):
    """
    Iterate sequential part pages: base/stem001.htm, stem002.htm, ...

    Yields (index, url, html) for each page that downloads successfully.
    A download that fails with a `requests.RequestException` (HTTP error
    status, connection problem, timeout) counts as a "hole"; iteration
    stops after `max_holes` consecutive holes, which is taken as the end
    of the series. Any other exception propagates to the caller.

    Parameters:
        base_url:  URL prefix without a trailing slash.
        stem:      file-name stem placed before the zero-padded index.
        start:     first index to request.
        max_holes: number of consecutive failures that ends the iteration.
        sleep:     seconds to wait after each request (success or failure).
    """
    holes = 0
    i = start
    while True:
        url = f"{base_url}/{stem}{i:03d}.htm"
        try:
            html = fetch(url)
        except requests.RequestException:
            # Only request failures are treated as missing pages.
            holes += 1
            if holes >= max_holes:
                break
        else:
            # The yield sits outside the try so that nothing raised while
            # the generator is suspended can be miscounted as a hole.
            holes = 0
            yield i, url, html
        i += 1
        time.sleep(sleep)

def group_join(records, key_fields):
    """
    Concatenate the "text" of all records that share the same key.

    The key is the dot-joined string of the record's values for
    `key_fields`. Records where any key field is missing or None are
    skipped. Texts are joined with a single space in encounter order.
    Returns {key: joined_text}.
    """
    grouped = {}
    for rec in records:
        values = [rec.get(field) for field in key_fields]
        if all(value is not None for value in values):
            label = '.'.join(map(str, values))
            if label not in grouped:
                grouped[label] = []
            grouped[label].append(rec["text"])
    return {label: ' '.join(texts) for label, texts in grouped.items()}

# -----------------------
# WHITE: a.b (adhyaya.mantra)
# -----------------------
def scrape_white_to_json(out_path="devanagari_white.json"):
    """
    Scrape the White Yajurveda (VS) part pages and write them as JSON.

    All part pages are parsed into sentence records, grouped by
    Paragraph.Verse ("a.b"), and written to `out_path` as a UTF-8 JSON
    object {"a.b": joined text}. A one-line summary is printed; its
    `parts` value is the index of the last page fetched.

    Raises:
        RuntimeError: if no part page could be fetched. `out_path` is not
            touched in that case, so an earlier result is not overwritten.
    """
    base = "http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvw/vs"
    allrecs = []
    last_idx = None  # stays None when the iterator yields nothing
    for last_idx, _url, html in iter_parts(base, "vs", start=1, max_holes=3):
        allrecs.extend(parse_titus_page(html))
    if last_idx is None:
        raise RuntimeError(
            f"[WHITE] no part pages could be fetched from {base}; "
            f"{out_path} was not written"
        )
    # Paragraph=Adhyaya (a), Verse=Mantra (b)
    rolled = group_join(allrecs, ("paragraph", "verse"))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rolled, f, ensure_ascii=False, indent=2)
    print(f"[WHITE] parts={last_idx} sentences={len(allrecs)} unique a.b={len(rolled)} → {out_path}")

# -----------------------
# BLACK: x.y.z (book.chapter.verse)
# -----------------------
def scrape_black_to_json(out_path="devanagari_black.json"):
    """
    Scrape the Black Yajurveda (TS) part pages and write them as JSON.

    All part pages are parsed into sentence records, grouped by
    Book.Chapter.Verse ("x.y.z"), and written to `out_path` as a UTF-8
    JSON object {"x.y.z": joined text}. A one-line summary is printed; its
    `parts` value is the index of the last page fetched.

    Raises:
        RuntimeError: if no part page could be fetched. `out_path` is not
            touched in that case, so an earlier result is not overwritten.
    """
    base = "http://titus.uni-frankfurt.de/texte/etcd/ind/aind/ved/yvs/ts"
    allrecs = []
    last_idx = None  # stays None when the iterator yields nothing
    for last_idx, _url, html in iter_parts(base, "ts", start=1, max_holes=3):
        allrecs.extend(parse_titus_page(html))
    if last_idx is None:
        raise RuntimeError(
            f"[BLACK] no part pages could be fetched from {base}; "
            f"{out_path} was not written"
        )
    # Book=Kāṇḍa (x), Chapter=Prapāṭhaka (y), Verse (z)
    # NOTE(review): the paragraph level is not part of this key, and the
    # recorded run of this notebook grouped 16405 sentences into only 54
    # keys. That suggests keys collide across paragraphs and/or records
    # with a missing book/chapter/verse are dropped — confirm against the
    # page structure before relying on the output.
    rolled = group_join(allrecs, ("book", "chapter", "verse"))
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(rolled, f, ensure_ascii=False, indent=2)
    print(f"[BLACK] parts={last_idx} sentences={len(allrecs)} unique x.y.z={len(rolled)} → {out_path}")

# To run the scrapers, call scrape_white_to_json() and/or scrape_black_to_json() from a separate cell.



In [9]:
# The White Yajurveda scrape is disabled for this run; uncomment the next
# line to (re)generate devanagari_white.json as well.
# scrape_white_to_json("devanagari_white.json")
# Fetch all Black Yajurveda part pages and write devanagari_black.json.
scrape_black_to_json("devanagari_black.json")

[BLACK] parts=46 sentences=16405 unique x.y.z=54 → devanagari_black.json
