In [1]:
from bs4 import BeautifulSoup
import re
import html

class HTMLTextExtractor:
    def __init__(self):
        pass

    def extract_with_beautifulsoup(self, html_content):
        """Extract text using BeautifulSoup (most popular method)"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove script and style elements
            for script in soup(["script", "style"]):
                script.decompose()

            # Get text and clean it up
            text = soup.get_text()

            # Break into lines and remove leading/trailing space
            lines = (line.strip() for line in text.splitlines())

            # Break multi-headlines into a line each
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

            # Drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            return f"Error with BeautifulSoup: {e}"

    def extract_with_regex(self, html_content):
        """Extract text using regex (basic method)"""
        try:
            # Remove HTML tags
            clean = re.compile('<.*?>')
            text = re.sub(clean, '', html_content)

            # Decode HTML entities
            text = html.unescape(text)

            # Clean up whitespace
            text = re.sub(r'\s+', ' ', text).strip()

            return text
        except Exception as e:
            return f"Error with regex: {e}"

    def extract_specific_elements(self, html_content, tags=['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        """Extract text from specific HTML elements only"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            texts = []
            for tag in tags:
                elements = soup.find_all(tag)
                for element in elements:
                    text = element.get_text().strip()
                    if text:
                        texts.append(text)

            return '\n\n'.join(texts)
        except Exception as e:
            return f"Error extracting specific elements: {e}"

    def extract_with_metadata(self, html_content):
        """Extract text along with metadata"""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Extract title
            title = soup.find('title')
            title_text = title.get_text().strip() if title else "No title"

            # Extract meta description
            meta_desc = soup.find('meta', attrs={'name': 'description'})
            description = meta_desc.get('content', '') if meta_desc else ''

            # Extract headings
            headings = []
            for i in range(1, 7):
                h_tags = soup.find_all(f'h{i}')
                for h in h_tags:
                    headings.append(f"H{i}: {h.get_text().strip()}")

            # Extract paragraphs
            paragraphs = []
            p_tags = soup.find_all('p')
            for p in p_tags:
                text = p.get_text().strip()
                if text and len(text) > 20:  # Filter out very short paragraphs
                    paragraphs.append(text)

            # Combine everything
            result = {
                'title': title_text,
                'description': description,
                'headings': headings,
                'paragraphs': paragraphs,
                'full_text': self.extract_with_beautifulsoup(html_content)
            }

            return result
        except Exception as e:
            return f"Error extracting with metadata: {e}"

    def extract_main_content(self, html_content):
        """Try to extract main content by removing navigation, ads, etc."""
        try:
            soup = BeautifulSoup(html_content, 'html.parser')

            # Remove unwanted elements
            unwanted_tags = ['nav', 'footer', 'header', 'aside', 'script', 'style', 'noscript']
            for tag in unwanted_tags:
                for element in soup.find_all(tag):
                    element.decompose()

            # Remove elements with common ad/navigation class names
            unwanted_classes = ['nav', 'navigation', 'menu', 'sidebar', 'ad', 'advertisement',
                              'footer', 'header', 'social', 'share', 'comment']

            for class_name in unwanted_classes:
                for element in soup.find_all(class_=re.compile(class_name, re.I)):
                    element.decompose()

            # Look for main content areas
            main_content = soup.find(['main', 'article']) or soup.find('div', class_=re.compile('content|main|article', re.I))

            if main_content:
                text = main_content.get_text()
            else:
                text = soup.get_text()

            # Clean up text
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)

            return text
        except Exception as e:
            return f"Error extracting main content: {e}"

# Example usage and testing
def test_extractor():
    # Sample HTML content
    sample_html = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Sample Page Title</title>
        <meta name="description" content="This is a sample page description">
        <style>body { font-family: Arial; }</style>
    </head>
    <body>
        <nav>Navigation menu</nav>
        <header>
            <h1>Main Title</h1>
        </header>
        <main>
            <article>
                <h2>Article Heading</h2>
                <p>This is the first paragraph of the article. It contains some meaningful content.</p>
                <p>This is the second paragraph with more information about the topic.</p>
                <h3>Subheading</h3>
                <p>Another paragraph under the subheading with additional details.</p>
            </article>
        </main>
        <aside class="sidebar">Sidebar content</aside>
        <footer>Footer information</footer>
        <script>console.log('Some JavaScript');</script>
    </body>
    </html>
    """

    extractor = HTMLTextExtractor()

    print("1. Basic text extraction with BeautifulSoup:")
    print("-" * 50)
    basic_text = extractor.extract_with_beautifulsoup(sample_html)
    print(basic_text)
    print("\n")

    print("2. Text extraction with regex:")
    print("-" * 50)
    regex_text = extractor.extract_with_regex(sample_html)
    print(regex_text)
    print("\n")

    print("3. Specific elements extraction:")
    print("-" * 50)
    specific_text = extractor.extract_specific_elements(sample_html)
    print(specific_text)
    print("\n")

    print("4. Main content extraction:")
    print("-" * 50)
    main_text = extractor.extract_main_content(sample_html)
    print(main_text)
    print("\n")

    print("5. Extraction with metadata:")
    print("-" * 50)
    metadata = extractor.extract_with_metadata(sample_html)
    if isinstance(metadata, dict):
        print(f"Title: {metadata['title']}")
        print(f"Description: {metadata['description']}")
        print(f"Headings: {metadata['headings']}")
        print(f"Number of paragraphs: {len(metadata['paragraphs'])}")
    else:
        print(metadata)

if __name__ == "__main__":
    test_extractor()

1. Basic text extraction with BeautifulSoup:
--------------------------------------------------
Sample Page Title
Navigation menu
Main Title
Article Heading
This is the first paragraph of the article. It contains some meaningful content.
This is the second paragraph with more information about the topic.
Subheading
Another paragraph under the subheading with additional details.
Sidebar content
Footer information


2. Text extraction with regex:
--------------------------------------------------
Sample Page Title body { font-family: Arial; } Navigation menu Main Title Article Heading This is the first paragraph of the article. It contains some meaningful content. This is the second paragraph with more information about the topic. Subheading Another paragraph under the subheading with additional details. Sidebar content Footer information console.log('Some JavaScript');


3. Specific elements extraction:
--------------------------------------------------
This is the first paragraph of th