In [1]:
import os
import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
import html2text
import hashlib
import shutil
import readability


# Set of visited URLs to prevent infinite recursion.
# scrape_site() adds every URL it processes here and skips URLs that are
# already present. The set lives at module level and nothing in this file
# clears it, so it carries over between separate scrape_site() calls.
visited_urls = set()

def download_page(url, timeout=30):
    """
    Downloads the content of a web page from the given URL.

    Args:
        url (str): The URL of the web page to download.
        timeout (float or tuple, optional): Seconds to wait for the server
            before giving up; passed straight to ``requests.get``.
            Defaults to 30.

    Returns:
        str: The content of the web page as a string, or None if there was an
        error (connection failure, timeout, or a 4xx/5xx status code).

    Note:
        ``requests.RequestException`` is caught here and reported with
        ``print``; it is not propagated to the caller.
    """
    try:
        # Without an explicit timeout requests waits forever, so a single
        # unresponsive server would stall the whole crawl.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raises an error for bad status codes
        return response.text
    except requests.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None

def extract_urls(html, base_url, ignored_extensions=('.txt', '.pdf', '.docx')):
    """
    Extracts all same-host URLs from the given HTML content, resolving relative
    URLs and ignoring hash fragments.

    Args:
        html (str): The HTML content to extract URLs from.
        base_url (str): The base URL used to resolve relative URLs. Only links
            whose network location matches this URL's are returned.
        ignored_extensions (iterable of str, optional): File extensions
            (including the leading dot) whose links are skipped. The comparison
            is case-insensitive and is made against the URL path, so a query
            string after the file name does not defeat the filter.

    Returns:
        set: A set of URLs extracted from the HTML content.
    """
    soup = BeautifulSoup(html, 'html.parser')
    base_netloc = urlparse(base_url).netloc
    # str.endswith needs a tuple; normalising here also accepts lists/sets.
    skipped_suffixes = tuple(ext.lower() for ext in ignored_extensions)

    urls = set()
    for link in soup.find_all('a', href=True):
        # Resolve relative URLs and drop the hash fragment
        full_url = urljoin(base_url, link['href'].split('#', 1)[0])
        parsed = urlparse(full_url)
        # Stay on the same host (this also drops mailto:, javascript:, ...)
        if parsed.netloc != base_netloc:
            continue
        # Match the extension on the path only, e.g. "/a.pdf?download=1"
        if parsed.path.lower().endswith(skipped_suffixes):
            continue
        urls.add(full_url)
    return urls



def html_to_markdown(html):
    """
    Turn an HTML document into Markdown text.

    The HTML is first reduced with readability's ``Document.summary()``
    (intended to keep the main content of the page) and the result is then
    rendered by html2text with hyperlinks preserved.

    Parameters:
    html (str): The HTML content to be converted.

    Returns:
    str: The Markdown representation of the HTML content.
    """
    # Reduce the page to readability's summary before converting
    main_content = readability.Document(html).summary()

    md_converter = html2text.HTML2Text()
    md_converter.ignore_links = False  # keep links in the Markdown output
    return md_converter.handle(main_content)

def save_markdown(markdown, folder, filename):
    """
    Writes Markdown text to ``folder/filename`` as UTF-8, creating the folder
    (and any missing parents) first. An existing file is overwritten.

    Args:
        markdown (str): The text to write.
        folder (str): Destination directory. An empty string means the current
            working directory.
        filename (str): Name of the file to create inside ``folder``.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
    """
    # os.makedirs('') raises, so only create a directory when one is named.
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if folder:
        os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(markdown)


def generate_filename(url, base_url):
    """
    Generate a Markdown filename based on the given URL and base URL.

    The path of ``base_url`` is removed from the front of the path of ``url``
    (only when it matches on whole path segments), the remaining segments are
    joined with hyphens and lower-cased, and ``.md`` is appended. Query strings
    and fragments are not part of the filename.

    Args:
        url (str): The URL from which the filename will be generated.
        base_url (str): The base URL used to remove the common path from the URL.

    Returns:
        str: The generated filename, or ``"index.md"`` when nothing remains
        after removing the base path.

    """
    # Remove the base URL path to get the unique part of the path
    base_path = urlparse(base_url).path.strip('/')
    unique_path = urlparse(url).path.strip('/')

    if base_path:
        if unique_path == base_path:
            unique_path = ''
        elif unique_path.startswith(base_path + '/'):
            # Require the '/' boundary so that base "docs" does not eat the
            # front of an unrelated sibling such as "docs-old/page".
            unique_path = unique_path[len(base_path) + 1:].strip('/')

    # Join the remaining path segments with hyphens
    if unique_path:
        return unique_path.replace('/', '-').lower() + ".md"
    return "index.md"


def scrape_site(url, base_url, base_folder=''):
    """
    Scrapes a website starting at ``url``, saving each page as a markdown file.

    Every page that is downloaded successfully is converted to Markdown and
    written to ``<base_folder>/<host of base_url>/``; the links found on it are
    then followed in turn. Only URLs that start with ``base_url`` and share its
    host are visited. URLs already present in the module-level ``visited_urls``
    set are skipped, and each URL processed here is added to that set.

    Args:
        url (str): The URL of the website to scrape.
        base_url (str): The base URL of the website.
        base_folder (str, optional): The base folder to save the markdown files. Defaults to ''.

    Returns:
        None
    """
    base_netloc = urlparse(base_url).netloc
    folder = os.path.join(base_folder, base_netloc)

    # An explicit stack replaces recursion: a long chain of page -> link ->
    # link hops on a large site would otherwise hit Python's recursion limit.
    pending = [url]
    while pending:
        current = pending.pop()

        # Ensure the URL starts with the base URL
        if not current.startswith(base_url):
            continue
        # The prefix test alone would accept e.g. "https://a.com.evil.org"
        # for base "https://a.com", so the host is compared as well.
        if current in visited_urls or urlparse(current).netloc != base_netloc:
            continue
        visited_urls.add(current)

        print(f"Scraping {current}")
        html = download_page(current)
        if not html:
            continue

        markdown = html_to_markdown(html)
        save_markdown(markdown, folder, generate_filename(current, base_url))

        # Links are resolved relative to the page they were found on
        pending.extend(
            link for link in extract_urls(html, current)
            if link not in visited_urls
        )



def clean_directory(folder):
    """
    Empties the specified directory while leaving the directory itself in place.

    Regular files and symbolic links are unlinked; sub-directories are removed
    together with their contents. Nothing happens when ``folder`` does not
    exist. A failure to delete an individual entry is printed and the loop
    moves on to the next entry.

    Args:
        folder (str): The path to the directory to be cleaned.

    """
    if not os.path.exists(folder):
        return

    for entry_name in os.listdir(folder):
        file_path = os.path.join(folder, entry_name)
        try:
            # Links are checked before directories so that a symlink pointing
            # at a directory is unlinked rather than followed.
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print(f'Failed to delete {file_path}. Reason: {e}')

# Example usage: empty the output folder, then crawl the site from its root.

start_url = 'https://vectorbt.pro/pvt_321460c7/'  # Change this URL to your target
base_folder = 'downloaded_markdowns'

clean_directory(base_folder)
# The start URL doubles as the base URL, so only pages beneath it are followed.
scrape_site(url=start_url, base_url=start_url, base_folder=base_folder)
print("Scraping complete.")


Scraping https://vectorbt.pro/pvt_321460c7/
Scraping https://vectorbt.pro/pvt_321460c7/api/records/mapped_array/
Scraping https://vectorbt.pro/pvt_321460c7/api/generic/drawdowns/
Scraping https://vectorbt.pro/pvt_321460c7/terms/remarks/
Scraping https://vectorbt.pro/pvt_321460c7/documentation/data/scheduling/
Scraping https://vectorbt.pro/pvt_321460c7/tutorials/pairs-trading/
Scraping https://vectorbt.pro/pvt_321460c7/api/indicators/factory/
Scraping https://vectorbt.pro/pvt_321460c7/tutorials/cross-validation/splitter/
Scraping https://vectorbt.pro/pvt_321460c7/api/utils/enum_/
Scraping https://vectorbt.pro/pvt_321460c7/api/indicators/enums/
Scraping https://vectorbt.pro/pvt_321460c7/api/portfolio/decorators/
Scraping https://vectorbt.pro/pvt_321460c7/documentation/fundamentals/
Scraping https://vectorbt.pro/pvt_321460c7/api/data/base/
Scraping https://vectorbt.pro/pvt_321460c7/api/records/chunking/
Scraping https://vectorbt.pro/pvt_321460c7/features/indicators/
Scraping https://vecto