In [40]:
import requests
from bs4 import BeautifulSoup
import json
from dataclasses import dataclass, field, asdict
from typing import Optional
import re

In [41]:
# Data classes to hold the scraped information

@dataclass
class GreekWord:
    """One Greek lexicon entry attached to a verse.

    Attributes:
        english_word: English word or phrase that the Greek word is rendered as.
        word: The Greek word itself.
        transliteration: Romanized spelling of the Greek word.
        strongs_number: Strong's reference number, stored as a string.
        part_of_speech: Parsing label, e.g. "Noun - Nominative Masculine Singular".
        definition: English definition / meaning text.
    """

    english_word: str
    word: str
    transliteration: str
    strongs_number: str
    part_of_speech: str
    definition: str

@dataclass
class Translation:
    """The text of a verse in one Bible version.

    Attributes:
        version: Version abbreviation, e.g. "ESV" or "NIV".
        text: The verse text as rendered by that version.
    """

    version: str
    text: str

@dataclass
class CrossReference:
    """A pointer from the scraped verse to another Bible verse.

    Attributes:
        reference: Human-readable reference, e.g. "Acts 2:9-11".
        text: The text of the referenced verse.
    """

    reference: str
    text: str

@dataclass
class VerseData:
    """Everything scraped for a single Bible verse from BibleHub.

    Attributes:
        reference: Full reference string, e.g. "1 Peter 1:1".
        book: Book name.
        chapter: Chapter number.
        verse: Verse number.
        translations: Verse text per Bible version (empty list by default).
        greek_words: Greek lexicon entries for the verse (empty list by default).
        cross_references: Related verses (empty list by default).
    """

    reference: str
    book: str
    chapter: int
    verse: int
    translations: list[Translation] = field(default_factory=list)
    greek_words: list[GreekWord] = field(default_factory=list)
    cross_references: list[CrossReference] = field(default_factory=list)

    def to_dict(self):
        """Return this verse, including all nested dataclasses, as plain dicts/lists."""
        return asdict(self)

In [42]:
# Abbreviations of the Bible versions that are extracted by default
TARGET_VERSIONS = {"NIV", "NLT", "ESV", "NKJV"}

def extract_translations(soup: BeautifulSoup, target_versions: set[str] = TARGET_VERSIONS) -> list[Translation]:
    """Collect the verse text for each wanted Bible version.

    Looks inside ``div#par`` for ``span.versiontext`` headers, maps the
    version name found in each header's link to an abbreviation, and then
    gathers the sibling nodes that follow the header as the verse text.

    Args:
        soup: Parsed page.
        target_versions: Abbreviations to keep; other versions are skipped.

    Returns:
        One ``Translation`` per matching version that has non-empty text,
        in page order. Empty list if ``div#par`` is absent.
    """
    found = []

    parallel_section = soup.find("div", id="par")
    if not parallel_section:
        return found

    # Checked in this order; the first name contained in the link text wins.
    known_versions = (
        ("New International Version", "NIV"),
        ("New Living Translation", "NLT"),
        ("English Standard Version", "ESV"),
        ("New King James Version", "NKJV"),
    )

    for header in parallel_section.find_all("span", class_="versiontext"):
        anchor = header.find("a")
        if not anchor:
            continue

        full_name = anchor.get_text(strip=True)
        abbrev = next(
            (short for long_name, short in known_versions if long_name in full_name),
            None,
        )
        if not abbrev or abbrev not in target_versions:
            continue

        # Walk the siblings after the header until the verse ends.
        pieces = []
        node = header.next_sibling
        while node:
            tag_name = getattr(node, 'name', None)
            if tag_name:
                css_classes = node.get('class', [])
                # A new version header or a paragraph marker ends the verse.
                if tag_name == 'span' and ('versiontext' in css_classes or 'p' in css_classes):
                    break
                # A div starts a new section.
                if tag_name == 'div':
                    break
                # Italic text is kept; every other tag (including <br>) is skipped.
                if tag_name == 'i':
                    pieces.append(node.get_text())
            else:
                # Plain text node
                stripped = str(node).strip()
                if stripped:
                    pieces.append(stripped)
            node = node.next_sibling

        # Join the fragments and collapse any run of whitespace to one space.
        combined = re.sub(r'\s+', ' ', " ".join(pieces).strip())
        if combined:
            found.append(Translation(version=abbrev, text=combined))

    return found

In [43]:
def extract_greek_words(soup: BeautifulSoup) -> list[GreekWord]:
    """Extract Greek lexicon information from the page.

    The Greek section is located through the first ``div.vheading`` whose
    text contains "Greek"; entries are read from that heading's parent.
    Each entry starts at a ``span.word`` (English) and may be followed by
    ``span.grk`` (Greek), ``span.translit``, ``span.parse``, ``span.str``
    (Strong's link) and ``span.str2`` (definition).

    Detail spans are attached only to the entry they follow, i.e. the
    nearest preceding ``span.word`` inside the section. An unbounded
    ``find_next`` per field would instead borrow a missing field from the
    following entry, or from markup after the section for the last entry.

    Args:
        soup: Parsed page.

    Returns:
        One ``GreekWord`` per entry that has a non-empty Greek word, in page
        order. Fields missing from an entry are left as empty strings. Empty
        list if the Greek section is not found.
    """
    greek_words = []

    # Find the Greek section by its heading
    greek_heading = next(
        (h for h in soup.find_all("div", class_="vheading") if "Greek" in h.get_text()),
        None,
    )
    if not greek_heading:
        return greek_words

    parent = greek_heading.parent
    if not parent:
        return greek_words

    # Group spans per entry in document order: a span.word opens a new entry,
    # and the first span of each detail class seen before the next span.word
    # belongs to that entry.
    detail_classes = ("grk", "translit", "parse", "str", "str2")
    entries = []
    for span in parent.find_all("span"):
        classes = span.get("class") or []
        if "word" in classes:
            entries.append({"word": span})
        elif entries:
            current = entries[-1]
            for cls in detail_classes:
                if cls in classes:
                    current.setdefault(cls, span)

    def span_text(span):
        """Return the stripped text of a span, or "" when the span is absent."""
        return span.get_text(strip=True) if span is not None else ""

    for entry in entries:
        greek_word = span_text(entry.get("grk"))
        if not greek_word:
            # No Greek word in this entry; nothing meaningful to record.
            continue

        # Strong's number is taken from the link target inside span.str
        strongs_number = ""
        str_span = entry.get("str")
        link = str_span.find("a") if str_span is not None else None
        if link:
            match = re.search(r"strongs_(\d+)", link.get("href", ""))
            if match:
                strongs_number = match.group(1)

        greek_words.append(GreekWord(
            english_word=entry["word"].get_text(strip=True),
            word=greek_word,
            # Transliteration is rendered in parentheses on the page
            transliteration=span_text(entry.get("translit")).strip("()"),
            strongs_number=strongs_number,
            part_of_speech=span_text(entry.get("parse")),
            definition=span_text(entry.get("str2")),
        ))

    return greek_words

In [44]:
def extract_cross_references(soup: BeautifulSoup) -> list[CrossReference]:
    """Collect the cross-references listed in ``div#crf``.

    Each ``span.crossverse`` supplies the reference label (from its link);
    the text nodes that follow it, up to the next ``span.crossverse`` or
    ``span.p``, supply the verse text.

    Args:
        soup: Parsed page.

    Returns:
        One ``CrossReference`` per labelled ``span.crossverse``, in page
        order. The text may be empty. Empty list if ``div#crf`` is absent.
    """
    section = soup.find("div", id="crf")
    if not section:
        return []

    collected = []
    for marker in section.find_all("span", class_="crossverse"):
        anchor = marker.find("a")
        label = anchor.get_text(strip=True) if anchor else ""
        if not label:
            continue

        # Gather the text nodes that trail the marker span.
        fragments = []
        node = marker.next_sibling
        while node:
            if getattr(node, 'name', None) is None:
                # Text node: keep it when it is not just whitespace.
                chunk = str(node).strip()
                if chunk:
                    fragments.append(chunk)
            elif node.name == 'span':
                css_classes = node.get('class', [])
                # Next cross-reference or paragraph break ends this one.
                if 'crossverse' in css_classes or 'p' in css_classes:
                    break
            # Any other tag (e.g. <br>) contributes nothing and is stepped over.
            node = node.next_sibling

        collected.append(CrossReference(reference=label, text=" ".join(fragments)))

    return collected

In [45]:
def parse_verse_reference(url: str) -> tuple[str, str, int, int]:
    """Derive the verse reference from a BibleHub verse URL.

    The URL is lower-cased, then searched for ``/<book_slug>/<chapter>-<verse>.htm``.
    The slug's underscores become spaces and the result is title-cased.

    Example: https://biblehub.com/1_peter/1-1.htm -> ("1 Peter 1:1", "1 Peter", 1, 1)

    Args:
        url: BibleHub verse URL.

    Returns:
        ``(reference, book, chapter, verse)``, or ``("", "", 0, 0)`` when the
        URL does not contain the expected pattern.
    """
    found = re.search(r'/([a-z0-9_]+)/(\d+)-(\d+)\.htm', url.lower())
    if found is None:
        return ("", "", 0, 0)

    slug, chapter_text, verse_text = found.groups()
    chapter_number = int(chapter_text)
    verse_number = int(verse_text)

    # e.g. "1_peter" -> "1 Peter"
    book_name = " ".join(slug.split("_")).title()

    return (f"{book_name} {chapter_number}:{verse_number}", book_name, chapter_number, verse_number)


def scrape_verse(url: str, timeout: float = 30.0) -> VerseData:
    """Scrape all data for a Bible verse from BibleHub.

    Downloads the page, derives the reference from the URL, and runs the
    translation, Greek-word and cross-reference extractors over the parsed
    HTML.

    Args:
        url: BibleHub verse URL, e.g. "https://biblehub.com/1_peter/1-1.htm".
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may wait indefinitely on an unresponsive
            server, which would hang the notebook cell.

    Returns:
        A ``VerseData`` populated from the page. If the URL does not match the
        expected pattern, the reference fields are empty/zero as returned by
        ``parse_verse_reference``.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    # Fetch the page; bounded wait so a stalled connection cannot block forever
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Parse the reference
    reference, book, chapter, verse = parse_verse_reference(url)

    return VerseData(
        reference=reference,
        book=book,
        chapter=chapter,
        verse=verse,
        translations=extract_translations(soup),
        greek_words=extract_greek_words(soup),
        cross_references=extract_cross_references(soup),
    )

In [46]:
# Run the scraper against one verse and print a readable report
verse_data = scrape_verse("https://biblehub.com/1_peter/1-1.htm")

divider = "=" * 60

print(f"Reference: {verse_data.reference}")
print(f"Book: {verse_data.book}, Chapter: {verse_data.chapter}, Verse: {verse_data.verse}")

# Each section header: blank line, rule, title, rule
print("", divider, "TRANSLATIONS", divider, sep="\n")
for t in verse_data.translations:
    print(f"\n[{t.version}]\n{t.text}")

print("", divider, "GREEK WORDS", divider, sep="\n")
for g in verse_data.greek_words:
    print(f"\n'{g.english_word}' -> {g.word} ({g.transliteration}) - Strong's {g.strongs_number}")
    # Optional details are printed only when present
    if g.part_of_speech:
        print(f"  Part of Speech: {g.part_of_speech}")
    if g.definition:
        print(f"  Definition: {g.definition}")

print("", divider, f"CROSS REFERENCES ({len(verse_data.cross_references)} total)", divider, sep="\n")
for cr in verse_data.cross_references:  # every cross-reference, no truncation
    print(f"\n{cr.reference}")
    if cr.text:
        print(f"  {cr.text}")

Reference: 1 Peter 1:1
Book: 1 Peter, Chapter: 1, Verse: 1

TRANSLATIONS

[NIV]
Peter, an apostle of Jesus Christ, To God’s elect, exiles scattered throughout the provinces of Pontus, Galatia, Cappadocia, Asia and Bithynia,

[NLT]
This letter is from Peter, an apostle of Jesus Christ. I am writing to God’s chosen people who are living as foreigners in the provinces of Pontus, Galatia, Cappadocia, Asia, and Bithynia.

[ESV]
Peter, an apostle of Jesus Christ, To those who are elect exiles of the Dispersion in Pontus, Galatia, Cappadocia, Asia, and Bithynia,

[NKJV]
Peter, an apostle of Jesus Christ, To the pilgrims of the Dispersion in Pontus, Galatia, Cappadocia, Asia, and Bithynia,

GREEK WORDS

'Peter,' -> Πέτρος (Petros) - Strong's 4074
  Part of Speech: Noun - Nominative Masculine Singular
  Definition: Peter, a Greek name meaning rock. Apparently a primary word; a rock; as a name, Petrus, an apostle.

'an apostle' -> ἀπόστολος (apostolos) - Strong's 652
  Part of Speech: Noun - Nom

In [47]:
# Serialize the scraped verse as pretty-printed JSON (non-ASCII characters such as Greek are kept unescaped)
verse_json = json.dumps(verse_data.to_dict(), indent=2, ensure_ascii=False)
print(verse_json)

{
  "reference": "1 Peter 1:1",
  "book": "1 Peter",
  "chapter": 1,
  "verse": 1,
  "translations": [
    {
      "version": "NIV",
      "text": "Peter, an apostle of Jesus Christ, To God’s elect, exiles scattered throughout the provinces of Pontus, Galatia, Cappadocia, Asia and Bithynia,"
    },
    {
      "version": "NLT",
      "text": "This letter is from Peter, an apostle of Jesus Christ. I am writing to God’s chosen people who are living as foreigners in the provinces of Pontus, Galatia, Cappadocia, Asia, and Bithynia."
    },
    {
      "version": "ESV",
      "text": "Peter, an apostle of Jesus Christ, To those who are elect exiles of the Dispersion in Pontus, Galatia, Cappadocia, Asia, and Bithynia,"
    },
    {
      "version": "NKJV",
      "text": "Peter, an apostle of Jesus Christ, To the pilgrims of the Dispersion in Pontus, Galatia, Cappadocia, Asia, and Bithynia,"
    }
  ],
  "greek_words": [
    {
      "english_word": "Peter,",
      "word": "Πέτρος",
      "tra