In [18]:
'''
Documentation Scraper
Author: Krittaprot Tangkittikun
Date: Mar 19, 2024
Purpose: This notebook loads the rendered HTML source of a web page using Selenium.
The page content is then parsed with BeautifulSoup into a structured plain-text format
for later use in generating LLM fine-tuning datasets.

Environment Requirements:
pip install beautifulsoup4
pip install selenium
'''

from selenium import webdriver
from bs4 import BeautifulSoup

def load_content(link):
    """Return the HTML source of ``link`` as rendered by a Chrome browser.

    A new Chrome WebDriver session is started for every call and is always
    shut down again before the function returns, even when navigation fails.

    Args:
        link: URL of the page to open.

    Returns:
        The page source string reported by the browser after ``driver.get``
        returns.

    Raises:
        Whatever Selenium raises when the browser cannot be started or the
        page cannot be opened; the browser is still closed in that case.
    """
    # Create a WebDriver instance
    driver = webdriver.Chrome()
    try:
        # Open the webpage
        driver.get(link)

        # NOTE(review): the source is read immediately after navigation; if a
        # page fills in its content asynchronously, an explicit wait may be
        # needed here -- confirm against the pages that come back empty.
        return driver.page_source
    finally:
        # Close the browser on success *and* on failure so that no orphaned
        # Chrome/chromedriver process is left behind.
        driver.quit()

def _render_inline(node):
    """Return the plain text of one inline node, writing links as 'text (url)'."""
    name = getattr(node, 'name', None)
    if name is None:
        # Text node: use it verbatim (whitespace is normalised by the caller).
        return str(node)
    if name == 'br':
        # A line break separates words; keep them from fusing together.
        return ' '
    if name == 'a':
        link_text = node.get_text(strip=True)
        # .get() instead of ['href']: anchors without an href (e.g. named
        # anchors) must not abort the whole page with a KeyError.
        link_url = node.get('href')
        rendered = f'{link_text} ({link_url})' if link_url else link_text
        # Pad with spaces so the link never sticks to neighbouring words.
        return f' {rendered} '
    # Any other inline tag (<b>, <code>, <span>, ...): keep only its text and
    # recurse so that links nested inside it still get their URL.
    return ''.join(_render_inline(child) for child in node.contents)


def get_content_with_hyperlinks(soup, include_unsectioned=False):
    """Flatten a parsed page into indented plain text, keeping link targets.

    The tree is walked in document order. ``h3`` headings start a section,
    ``h4`` headings start a sub-section, and every ``p`` element becomes one
    line of text in which each link is written as ``text (url)``.

    Layout of the result:
        * ``h3`` heading: preceded by a blank line, not indented.
        * ``h4`` heading: preceded by a blank line, indented by 2 spaces.
        * paragraph under an ``h4``: indented by 4 spaces.
        * paragraph directly under an ``h3``: indented by 2 spaces.

    Args:
        soup: Parsed document (or any node) exposing ``descendants``.
        include_unsectioned: When true, paragraphs that appear before the
            first ``h3``/``h4`` heading are emitted without indentation.
            The default (``False``) skips them, as before.

    Returns:
        The assembled text with leading/trailing whitespace stripped; an
        empty string when nothing was collected.
    """
    content_parts = []  # List to hold all parts of the content

    current_h3 = None
    current_h4 = None

    for descendant in soup.descendants:
        tag_name = getattr(descendant, 'name', None)

        if tag_name == 'h3':
            current_h3 = descendant.get_text(strip=True)
            current_h4 = None  # Reset H4 because we're in a new H3 section
            content_parts.append(f'\n\n{current_h3}\n')  # Add two newlines before H3, one after
        elif tag_name == 'h4':
            current_h4 = descendant.get_text(strip=True)
            content_parts.append(f'\n  {current_h4}\n')  # Newline before and after H4, with indentation
        elif tag_name == 'p':
            raw_text = ''.join(_render_inline(child) for child in descendant.contents)

            # Clean up the paragraph text to remove extra spaces
            paragraph = ' '.join(raw_text.split())
            if not paragraph:
                # An empty <p> would only add a whitespace-only line.
                continue

            # Pick the indentation from the innermost open section.
            if current_h4:  # If inside an H4 section
                indent = '    '
            elif current_h3:  # Directly under an H3 section
                indent = '  '
            elif include_unsectioned:  # Before any heading, opted in
                indent = ''
            else:
                continue

            content_parts.append(f'{indent}{paragraph}\n')

    # Join all parts into a single string
    return ''.join(content_parts).strip()  # Strip leading/trailing whitespace

In [21]:
import os

# Folder (relative to the working directory) that receives one text file per
# scraped page.
OUTPUT_DIR = 'scraped_data'

# Ensure the output directory exists before anything is written. This must not
# depend on `file_path`, which is only assigned inside the loop below; relying
# on it made the cell fail with a NameError on a fresh kernel.
os.makedirs(OUTPUT_DIR, exist_ok=True)

links = ['https://www.kaggle.com/docs/competitions',
         'https://www.kaggle.com/docs/datasets',
         'https://www.kaggle.com/docs/notebooks',
         'https://www.kaggle.com/docs/api',
         'https://www.kaggle.com/docs/efficient-gpu-usage',
         'https://www.kaggle.com/docs/tpu',
         'https://www.kaggle.com/docs/models',
         'https://www.kaggle.com/docs/competitions-setup',
         'https://www.kaggle.com/docs/organizations'
         ]

for i, link in enumerate(links):
    # Last path segment names the output file; tolerate a trailing slash.
    pagename = link.rstrip('/').split('/')[-1]
    page_source = load_content(link)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = BeautifulSoup(page_source, 'html.parser')
    web_content = get_content_with_hyperlinks(soup)
    file_path = f'{OUTPUT_DIR}/{i}_{pagename}.txt'

    if web_content:
        # Write as UTF-8 regardless of the platform's default encoding so
        # that non-ASCII characters in the page text cannot break the write.
        with open(file_path, 'w', encoding='utf-8') as out_file:
            out_file.write(web_content)
        print("Web content saved to", file_path)
    else:
        # Nothing was extracted for this page; say which one so it can be
        # investigated, and do not create an empty file.
        print(f"Error: no content extracted from {link}; nothing written to {file_path}.")

Web content saved to scraped_data/0_competitions.txt
Web content saved to scraped_data/1_datasets.txt
Web content saved to scraped_data/2_notebooks.txt
Web content saved to scraped_data/3_api.txt
Error: 'web_content' is not defined or empty.
Web content saved to scraped_data/5_tpu.txt
Web content saved to scraped_data/6_models.txt
Web content saved to scraped_data/7_competitions-setup.txt
Web content saved to scraped_data/8_organizations.txt


Resources: 
1) [How to SCRAPE DYNAMIC websites with Selenium](https://www.youtube.com/watch?v=lTypMlVBFM4)
2) [Helium - high level webscraper (Selenium-based)](https://github.com/mherrmann/helium?tab=readme-ov-file)