In [1]:
'''
Documentation Scraper
Author: Krittaprot Tangkittikun
Date: Mar 19, 2024
Purpose: The notebook includes the code for loading HTML source code using selenium.
The page content is then parsed using beautifulsoup for a structured string/text format
for further use in LLM fine-tuning datasets generation.

Environment Requirements:
pip install beautifulsoup4
pip install selenium
'''

from selenium import webdriver
from bs4 import BeautifulSoup

def load_content(link):
    """Return the HTML source of ``link`` as loaded in a Chrome WebDriver session.

    A new Chrome WebDriver instance is created for every call and is always
    shut down before the function returns or raises.

    Args:
        link: URL of the page to open.

    Returns:
        The value of ``driver.page_source`` after the page has been opened.
    """
    # Create a WebDriver instance
    driver = webdriver.Chrome()
    try:
        # Open the webpage and grab its source
        driver.get(link)
        return driver.page_source
    finally:
        # Close the browser even if navigation fails, so a failed load does
        # not leave a browser/driver process behind.
        driver.quit()

# Punctuation that should sit directly after a link instead of after a space.
_CLOSING_PUNCTUATION = (',', '.', ';', ':', '!', '?', ')', ']')


def _inline_tokens(node):
    """Yield ``(is_link, text)`` pairs for the inline content of ``node``.

    Text nodes (``name`` is ``None``) are yielded as-is, ``<a>`` tags are
    yielded as ``'label (href)'`` (or just ``'label'`` when there is no
    ``href``), and any other tag is descended into so that only its text and
    links are emitted, never its markup.
    """
    for child in node.contents:
        if child.name is None:
            yield False, str(child)
        elif child.name == 'a':
            label = child.get_text(strip=True)
            # .get() avoids a KeyError on anchors without an href attribute.
            href = child.get('href')
            yield True, (f'{label} ({href})' if href else label)
        else:
            yield from _inline_tokens(child)


def _render_paragraph(paragraph):
    """Return the text of ``paragraph`` with links expanded and spaces collapsed."""
    text = ''
    previous_was_link = False
    for is_link, piece in _inline_tokens(paragraph):
        if is_link:
            # Add space before link if paragraph doesn't end with space
            if text and not text.endswith(' '):
                text += ' '
            text += f'{piece} '
        else:
            # Drop the separator added after a link when punctuation follows,
            # so the result reads "(url), next" rather than "(url) , next".
            if previous_was_link and piece.startswith(_CLOSING_PUNCTUATION):
                text = text.rstrip(' ')
            text += piece
        previous_was_link = is_link

    # Clean up the paragraph text to remove extra spaces
    return ' '.join(text.split())


def get_content_with_hyperlinks(soup):
    """Flatten H3/H4 sections and their paragraphs into an indented text outline.

    Walks ``soup.descendants`` in document order. Each ``<h3>`` starts a new
    section, each ``<h4>`` starts a sub-section (indented two spaces), and
    each ``<p>`` is emitted under the current heading: four spaces of indent
    under an H4, two under an H3. Paragraphs that appear before any heading,
    and paragraphs with no text, are skipped. Links are written as
    ``text (url)``.

    Args:
        soup: A parsed document (or tag) exposing ``descendants``.

    Returns:
        The assembled outline as a single string with leading and trailing
        whitespace stripped.
    """
    content_parts = []  # List to hold all parts of the content

    current_h3 = None
    current_h4 = None

    for descendant in soup.descendants:
        if descendant.name == 'h3':
            current_h3 = descendant.get_text(strip=True)
            current_h4 = None  # Reset H4 because we're in a new H3 section
            content_parts.append(f'\n\n{current_h3}\n')  # Two newlines before H3, one after
        elif descendant.name == 'h4':
            current_h4 = descendant.get_text(strip=True)
            content_parts.append(f'\n  {current_h4}\n')  # Newline before and after H4, with indentation
        elif descendant.name == 'p':
            paragraph = _render_paragraph(descendant)
            if not paragraph:
                # An empty <p> would only add an indentation-only line.
                continue

            # Append paragraph text with appropriate indentation
            if current_h4:  # If inside an H4 section
                content_parts.append(f'    {paragraph}\n')
            elif current_h3:  # Directly under an H3 section
                content_parts.append(f'  {paragraph}\n')

    # Join all parts into a single string
    return ''.join(content_parts).strip()  # Strip leading/trailing whitespace

# Fetch the page, parse it, and print the structured text outline.
page_source = load_content('https://www.kaggle.com/community-guidelines')
# Name the parser explicitly: without it Beautiful Soup picks whichever parser
# happens to be installed, which can change the tree between environments and
# triggers a GuessedAtParserWarning.
soup = BeautifulSoup(page_source, 'html.parser')
web_content = get_content_with_hyperlinks(soup)

print(web_content)

General Guidelines
  These guidelines apply to all user communication on kaggle.com (http://kaggle.com) , including Discussions, Notebooks, Datasets, etc. whether private or public.
  Nuance is easily lost when communicating online especially when many people are not using their first language. Instead of making assumptions, stay calm and ask clarifying questions. If you feel you can’t be patient or friendly, take a step back and respond later.
  We’re all here to learn and share ideas. When you have critical feedback, focus on the ideas that others are sharing, not the person.
  Low-level harassment is still harassment. Even minor or subtle put downs set a negative tone in our community that will alienate others.
  We strive to be a community that welcomes and supports people of all backgrounds and identities. This includes, but is not limited to, members of any race, ethnicity, culture, national origin, color, immigration status, social and economic class, educational level, sex, sex

Resources: 
1) [How to SCRAPE DYNAMIC websites with Selenium](https://www.youtube.com/watch?v=lTypMlVBFM4)
2) [Helium - high level webscraper (Selenium-based)](https://github.com/mherrmann/helium?tab=readme-ov-file)