In [9]:
from bs4 import BeautifulSoup, NavigableString, Tag
from typing import List, Tuple, Union, Optional
from dataclasses import dataclass

@dataclass
class StageDirection:
    text: str

@dataclass
class Lyric:
    text: str
    character: str
    start_time: Optional[float]
    end_time: Optional[float]

@dataclass
class Line:
    language: str
    content: Union[StageDirection, Lyric]

@dataclass
class Libretto:
    lines: List[Tuple[Line]]

def clean_text(text) -> str:
    if text is None:
        return ""
    if isinstance(text, list):
        return ' '.join(clean_text(t) for t in text if t.strip() if isinstance(t, (NavigableString, Tag)))
    if isinstance(text, NavigableString):
        return text.strip()
    if isinstance(text, Tag):
        return text.get_text().strip()
    return str(text).strip()

def get_stage_direction_text(group):
    stage_texts = []
    for element in group:
        if isinstance(element, Tag) and element.name == 'i':
            stage_texts.append(clean_text(element))
    return ' '.join(stage_texts)

def parse_libretto(html_content: str) -> Libretto:
    soup = BeautifulSoup(html_content, 'html.parser')
    table = soup.find('table')
    rows = table.find_all('tr')
    
    libretto_lines = []
    
    for row in rows:
        columns = row.find_all('td')
        if len(columns) == 2:
            # Split content by double line breaks
            german_groups = []
            english_groups = []
            current_german = []
            current_english = []
            
            # Process German column
            for element in columns[0].contents:
                if isinstance(element, Tag) and element.name == 'br':
                    if current_german and isinstance(current_german[-1], Tag) and current_german[-1].name == 'br':
                        if len(current_german) > 1:  # Only add if there's actual content
                            german_groups.append(current_german[:-1])
                        current_german = []
                    else:
                        current_german.append(element)
                else:
                    if element.strip() if isinstance(element, NavigableString) else True:
                        current_german.append(element)
            if current_german:
                german_groups.append(current_german)

            # Process English column
            for element in columns[1].contents:
                if isinstance(element, Tag) and element.name == 'br':
                    if current_english and isinstance(current_english[-1], Tag) and current_english[-1].name == 'br':
                        if len(current_english) > 1:  # Only add if there's actual content
                            english_groups.append(current_english[:-1])
                        current_english = []
                    else:
                        current_english.append(element)
                else:
                    if element.strip() if isinstance(element, NavigableString) else True:
                        current_english.append(element)
            if current_english:
                english_groups.append(current_english)

            # Process each group pair
            for g_group, e_group in zip(german_groups, english_groups):
                german_current_character = None
                english_current_character = None
                stage_direction = False
                lyrics = []

                # Check if this is a stage direction group
                stage_direction = any(isinstance(element, Tag) and element.name == 'i' 
                                    for element in g_group + e_group)

                # Get character names if present
                for element in g_group:
                    if isinstance(element, Tag) and element.name == 'b':
                        german_current_character = clean_text(element)
                for element in e_group:
                    if isinstance(element, Tag) and element.name == 'b':
                        english_current_character = clean_text(element)

                # Get lyrics if present
                for element in g_group:
                    if isinstance(element, NavigableString) and element.strip():
                        lyrics.append(clean_text(element))

                # Create appropriate Line objects
                if stage_direction:
                    german = Line(
                        language="de", 
                        content=StageDirection(text=get_stage_direction_text(g_group))
                    )
                    english = Line(
                        language="en", 
                        content=StageDirection(text=get_stage_direction_text(e_group))
                    )
                    libretto_lines.append((german, english))
                elif lyrics:
                    for lyric in lyrics:
                        german = Line(
                            language="de",
                            content=Lyric(
                                text=lyric,
                                character=german_current_character or "",
                                start_time=None,
                                end_time=None
                            )
                        )
                        # Find corresponding English lyric
                        eng_lyric = next((clean_text(e) for e in e_group 
                                        if isinstance(e, NavigableString) and e.strip()), "")
                        english = Line(
                            language="en",
                            content=Lyric(
                                text=eng_lyric,
                                character=english_current_character or "",
                                start_time=None,
                                end_time=None
                            )
                        )
                        libretto_lines.append((german, english))

    return Libretto(lines=libretto_lines)

In [11]:
# Usage:
with open('libretti/rheingold_table.html', 'r', encoding='utf-8') as f:
    html_content = f.read()

libretto = parse_libretto(html_content)

# Print some examples to verify
for pair in libretto.lines[:50]:
    print("\nGerman:", pair[0].language)
    print("Content:", pair[0].content)
    print("English:", pair[1].language)
    print("Content:", pair[1].content)


German: de
Content: Lyric(text='Woglinde (eine Rheintochter) - hoher sopran', character='', start_time=None, end_time=None)
English: en
Content: Lyric(text='Woglinde (a Rhine daughter) - soprano', character='', start_time=None, end_time=None)

German: de
Content: Lyric(text='Wellgunde (eine Rheintochter) - hoher sopran', character='', start_time=None, end_time=None)
English: en
Content: Lyric(text='Woglinde (a Rhine daughter) - soprano', character='', start_time=None, end_time=None)

German: de
Content: Lyric(text='Flosshilde (eine Rheintochter) - tiefer sopran', character='', start_time=None, end_time=None)
English: en
Content: Lyric(text='Woglinde (a Rhine daughter) - soprano', character='', start_time=None, end_time=None)

German: de
Content: Lyric(text='Alberich (Nibelunge) - hoher baß', character='', start_time=None, end_time=None)
English: en
Content: Lyric(text='Woglinde (a Rhine daughter) - soprano', character='', start_time=None, end_time=None)

German: de
Content: Lyric(text

In [7]:
import pandas as pd

# read in html table as df while preserving the html tags

df = pd.read_html('libretti/rheingold_table.html')[0]

In [2]:
df

Unnamed: 0,0,1
0,Personen Woglinde (eine Rheintochter) - hoher ...,Characters Woglinde (a Rhine daughter) - sopra...
1,Vorspiel un erste Scene In der Tiefe des Rhein...,Prelude and Scene One At the bottom of the Rhi...
2,Wellgunde (taucht aus der Fluth zum Riff herab...,Wellgunde (dives down to the rock.) How safe i...
3,Alberich Hehe! ihr Nicker! (Die MÃ¤dchen halte...,Alberich Hehe! ye nixies! (The maidens stop pl...
4,Woglinde Mit uns will er spielen? Wellgunde Is...,Woglinde Would he be our playmate? Wellgunde D...
...,...,...
76,"RheintÃ¶chter (in der Tiefe des Thales, unsich...","Rhine Daughters (in the valley, unseen) Rhine-..."
77,Wotan Wehre ihrem Geneck's! Loge (in das Thal ...,Wotan Cease their clamorous taunts. Loge (call...
78,,"Glossary aught. - anything aye. - always, fore..."
79,,"weal. - prosperity, advantage ween. - to imagi..."
