In [1]:
!pip install fake_useragent



In [19]:
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def _table_to_markdown(table):
    """Render one ``<table>`` tag as a Markdown pipe table ("" if it has no cells)."""
    rows = []
    for row in table.find_all("tr"):
        # Collapse whitespace and escape pipes: a raw newline or "|" inside a
        # cell would split the Markdown row apart.
        cells = [
            " ".join(cell.text.split()).replace("|", "\\|")
            for cell in row.find_all(["td", "th"])
        ]
        if cells:
            rows.append(cells)

    if not rows:
        return ""

    # The first non-empty row becomes the header whether it uses <th> or <td>,
    # so tables without <th> cells no longer lose their first row.
    header, *body = rows
    lines = ["| " + " | ".join(header) + " |", "|" + " --- |" * len(header)]
    lines.extend("| " + " | ".join(cells) + " |" for cells in body)
    return "\n".join(lines) + "\n\n"


def html_to_markdown(url, soup):
    """
    Converts HTML content to Markdown.

    The output is grouped by element type, not by document order: headings
    come first, followed by paragraphs, images, links, lists, code,
    blockquotes, iframes, horizontal rules and finally tables.

    Args:
        url (str): Address the page was fetched from; relative ``src`` and
            ``href`` values are resolved against it with ``urljoin``.
        soup (BeautifulSoup): The BeautifulSoup object representing the HTML content.

    Returns:
        str: The Markdown content.
    """

    parts = []

    # Handle headings
    for heading in soup.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        level = int(heading.name[1:])
        parts.append(f"{'#' * level} {heading.text.strip()}\n\n")

    # Handle paragraphs
    for paragraph in soup.find_all("p"):
        parts.append(f"{paragraph.text.strip()}\n\n")

    # Handle images
    for image in soup.find_all("img"):
        src = image.get("src", "")
        if src:
            alt_text = image.get("alt", "")
            parts.append(f"![{alt_text}]({urljoin(url, src)})\n\n")

    # Handle links
    for link in soup.find_all("a"):
        href = link.get("href", "")
        if href:
            parts.append(f"[{link.text.strip()}]({urljoin(url, href)})\n\n")

    # Handle lists (items are kept on consecutive lines so they stay one list)
    for list_tag in soup.find_all(["ul", "ol"]):
        items = [item.text.strip() for item in list_tag.find_all("li")]
        if not items:
            continue
        if list_tag.name == "ul":
            lines = [f"- {item}" for item in items]
        else:
            lines = [f"{number}. {item}" for number, item in enumerate(items, start=1)]
        parts.append("\n".join(lines) + "\n\n")

    # Handle code blocks
    for code_block in soup.find_all("code"):
        parts.append(f"```\n{code_block.text.strip()}\n```\n\n")

    # Handle blockquotes
    for blockquote in soup.find_all("blockquote"):
        parts.append(f"> {blockquote.text.strip()}\n\n")

    # Handle iframes (e.g., embedded videos)
    for iframe in soup.find_all("iframe"):
        src = iframe.get("src", "")
        if src:
            parts.append(f"[Embedded content]({urljoin(url, src)})\n\n")

    # Handle horizontal rules
    parts.extend("---\n\n" for _ in soup.find_all("hr"))

    # Handle tables
    parts.extend(_table_to_markdown(table) for table in soup.find_all("table"))

    # Collapse runs of blank lines into one blank line. Single newlines must be
    # left alone: list items, table rows and fenced code depend on them.
    return re.sub(r"\n{3,}", "\n\n", "".join(parts))


def scrape_urls_to_markdown_and_html(urls, output_prefix="scraped_content", timeout=30):
    """
    Scrapes HTML content from a list of URLs and saves the textual content
    for each URL in both a Markdown and an HTML file, converting any html tag
    that can be mapped directly to markdown into markdown.

    For the URL at position ``i`` the cleaned HTML is written to
    ``{output_prefix}_{i}.html`` and its Markdown to ``{output_prefix}_{i}.md``.
    The Markdown of every successfully fetched page is also concatenated into
    ``{output_prefix}_merged.md``. URLs whose request fails are reported with
    ``print`` and skipped.

    Args:
        urls (list): A list of URLs to scrape.
        output_prefix (str, optional): The prefix for the output file names.
            May contain a directory part, which is created if missing.
            Defaults to "scraped_content".
        timeout (float, optional): Seconds to wait for each HTTP request
            before giving up on that URL. Defaults to 30.
    """

    # Prefixes such as "courses/courses" point into a sub-directory; without
    # this, open() would raise FileNotFoundError on a fresh checkout.
    output_dir = os.path.dirname(output_prefix)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The request identifies itself with a Googlebot User-Agent string.
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }

    merged_sections = []

    for i, url in enumerate(urls):
        try:
            # A timeout keeps one unresponsive host from stalling the whole run.
            response = requests.get(
                url, headers=headers, allow_redirects=True, timeout=timeout
            )
            response.raise_for_status()  # Raise an exception for bad status codes
        except requests.exceptions.RequestException as e:
            print(f"Error scraping {url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser", from_encoding="utf-8")

        # Drop elements that carry no readable content. extract() is used
        # because a tag nested inside an already-removed one can still be
        # detached safely.
        for tag in soup(["script", "style", "template", "meta", "link"]):
            tag.extract()

        # Remove div and span wrappers but keep what they contain
        for wrapper in soup.find_all(["div", "span"]):
            wrapper.unwrap()

        # Remove class attributes from all tags
        for tag in soup.find_all(True, attrs={"class": True}):
            del tag["class"]

        # Save HTML content
        with open(f"{output_prefix}_{i}.html", "w", encoding="utf-8") as html_file:
            html_file.write(soup.prettify())

        # Convert HTML to Markdown and wrap it in a collapsible details tag
        markdown_content = html_to_markdown(url, soup)
        markdown_content = (
            f"<details><summary>{url}</summary>\n\n{markdown_content}\n</details>"
        )

        # Save Markdown content
        with open(f"{output_prefix}_{i}.md", "w", encoding="utf-8") as md_file:
            md_file.write(markdown_content)

        merged_sections.append(markdown_content + "\n\n")

    # Save final merged Markdown content
    with open(f"{output_prefix}_merged.md", "w", encoding="utf-8") as merged_md_file:
        merged_md_file.write("".join(merged_sections))


# Example usage: scrape one documentation page. On success this writes
# scraped_content_0.html and scraped_content_0.md; scraped_content_merged.md
# is written in every case.
urls = [
    # "https://llama.meta.com/docs/model-cards-and-prompt-formats/",
    "https://platform.openai.com/docs/guides/images/usage?context=node"
]

scrape_urls_to_markdown_and_html(urls=urls, output_prefix="scraped_content")

Error scraping https://platform.openai.com/docs/guides/images/usage?context=node: 403 Client Error: Forbidden for url: https://platform.openai.com/docs/guides/images/usage?context=node


In [20]:
# deeplearning.ai short-course pages, scraped into short_courses/short_courses_*.
# Each page is listed once: the google-cloud-vertex-ai entry previously appeared
# twice (with and without a trailing slash), which scraped and merged it twice.
urls = [
    "https://www.deeplearning.ai/short-courses/ai-agentic-design-patterns-with-autogen",
    "https://www.deeplearning.ai/short-courses/building-multimodal-search-and-rag",
    "https://www.deeplearning.ai/short-courses/introduction-to-on-device-ai",
    "https://www.deeplearning.ai/short-courses/multi-ai-agent-systems-with-crewai",
    "https://www.deeplearning.ai/short-courses/building-agentic-rag-with-llamaindex",
    "https://www.deeplearning.ai/short-courses/quantization-in-depth",
    "https://www.deeplearning.ai/short-courses/prompt-engineering-for-vision-models",
    "https://www.deeplearning.ai/short-courses/preprocessing-unstructured-data-for-llm-applications",
    "https://www.deeplearning.ai/short-courses/red-teaming-llm-applications",
    "https://www.deeplearning.ai/short-courses/getting-started-with-mistral",
    "https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face",
    "https://www.deeplearning.ai/short-courses/efficiently-serving-llms",
    "https://www.deeplearning.ai/short-courses/javascript-rag-web-apps-with-llamaindex",
    "https://www.deeplearning.ai/short-courses/open-source-models-hugging-face",
    "https://www.deeplearning.ai/short-courses/knowledge-graphs-rag",
    "https://www.deeplearning.ai/short-courses/prompt-engineering-with-llama-2",
    "https://www.deeplearning.ai/short-courses/serverless-llm-apps-amazon-bedrock",
    "https://www.deeplearning.ai/short-courses/building-applications-vector-databases",
    "https://www.deeplearning.ai/short-courses/build-llm-apps-with-langchain-js",
    "https://www.deeplearning.ai/short-courses/automated-testing-llmops",
    "https://www.deeplearning.ai/short-courses/llmops",
    "https://www.deeplearning.ai/short-courses/advanced-retrieval-for-ai",
    "https://www.deeplearning.ai/short-courses/reinforcement-learning-from-human-feedback",
    "https://www.deeplearning.ai/short-courses/building-evaluating-advanced-rag",
    "https://www.deeplearning.ai/short-courses/vector-databases-embeddings-applications",
    "https://www.deeplearning.ai/short-courses/functions-tools-agents-langchain",
    "https://www.deeplearning.ai/short-courses/quality-safety-llm-applications",
    "https://www.deeplearning.ai/short-courses/pair-programming-llm",
    "https://www.deeplearning.ai/short-courses/google-cloud-vertex-ai",
    "https://www.deeplearning.ai/short-courses/microsoft-semantic-kernel",
    "https://www.deeplearning.ai/short-courses/finetuning-large-language-models",
    "https://www.deeplearning.ai/short-courses/large-language-models-semantic-search",
    "https://www.deeplearning.ai/short-courses/evaluating-debugging-generative-ai",
    "https://www.deeplearning.ai/short-courses/langchain-chat-with-your-data",
    "https://www.deeplearning.ai/short-courses/building-generative-ai-applications-with-gradio",
    "https://www.deeplearning.ai/short-courses/building-systems-with-chatgpt",
    "https://www.deeplearning.ai/short-courses/langchain-for-llm-application-development",
    "https://www.deeplearning.ai/short-courses/how-diffusion-models-work",
    "https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers",
]
scrape_urls_to_markdown_and_html(urls, output_prefix="short_courses/short_courses")

In [21]:
# deeplearning.ai full-length course pages; results are written with the
# "courses/courses" file-name prefix.
urls_courses = [
    "https://www.deeplearning.ai/courses/machine-learning-in-production/",
    "https://www.deeplearning.ai/courses/generative-ai-for-everyone/",
    "https://www.deeplearning.ai/courses/generative-ai-with-llms/",
    "https://www.deeplearning.ai/courses/ai-for-everyone/",
]
scrape_urls_to_markdown_and_html(
    urls=urls_courses,
    output_prefix="courses/courses",
)

In [22]:
# Scrape the pages listed below into files prefixed "specialization/specializations".
# NOTE(review): these four URLs are identical to the entries of `urls_courses`
# and all live under /courses/ — presumably the specialization pages were meant
# here instead; confirm and replace before relying on this output.
url_specializations = [
    "https://www.deeplearning.ai/courses/machine-learning-in-production/",
    "https://www.deeplearning.ai/courses/generative-ai-for-everyone/",
    "https://www.deeplearning.ai/courses/generative-ai-with-llms/",
    "https://www.deeplearning.ai/courses/ai-for-everyone/",
]
scrape_urls_to_markdown_and_html(
    url_specializations, output_prefix="specialization/specializations"
)

In [23]:
# Landing and documentation pages of LLM-related tools, scraped with the
# "llm-sig-urls/llm-sig-urls" file-name prefix.
# https://python.langchain.com/ was listed twice; the second occurrence is
# removed so the page is fetched and merged only once.
urls_notebooks = [
    "https://www.ollama.com/",
    "https://ai.google.dev/gemma/docs",
    "https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/",
    "https://huggingface.co/models",
    "https://python.langchain.com/",
    "https://faiss.ai/",
    "https://docs.trychroma.com/",
    "https://www.pinecone.io/",
    "https://www.devitoproject.org/",
    "https://www.devitoproject.org/tutorials.html",
    "https://www.devitoproject.org/api/",
    "https://python.langchain.com/v0.2/docs/tutorials/rag/",
    "https://python.langchain.com/v0.1/docs/use_cases/question_answering/",
    "https://ai.google.dev/",
]
scrape_urls_to_markdown_and_html(
    urls_notebooks, output_prefix="llm-sig-urls/llm-sig-urls"
)

Error scraping https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/: 429 Client Error: Too Many Requests for url: https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3/


In [25]:
# Pages of promptingguide.ai, grouped below by their first path segment.
# Results are written with the "prompt_engineering/prompt_engineering" prefix.
urls_prompt_engineering = [
    # /introduction
    "https://www.promptingguide.ai/introduction",
    "https://www.promptingguide.ai/introduction/settings",
    "https://www.promptingguide.ai/introduction/basics",
    "https://www.promptingguide.ai/introduction/elements",
    "https://www.promptingguide.ai/introduction/tips",
    "https://www.promptingguide.ai/introduction/examples",
    # /techniques
    "https://www.promptingguide.ai/techniques",
    "https://www.promptingguide.ai/techniques/zeroshot",
    "https://www.promptingguide.ai/techniques/fewshot",
    "https://www.promptingguide.ai/techniques/cot",
    "https://www.promptingguide.ai/techniques/consistency",
    "https://www.promptingguide.ai/techniques/knowledge",
    "https://www.promptingguide.ai/techniques/prompt_chaining",
    "https://www.promptingguide.ai/techniques/tot",
    "https://www.promptingguide.ai/techniques/rag",
    "https://www.promptingguide.ai/techniques/art",
    "https://www.promptingguide.ai/techniques/ape",
    "https://www.promptingguide.ai/techniques/activeprompt",
    "https://www.promptingguide.ai/techniques/dsp",
    "https://www.promptingguide.ai/techniques/pal",
    "https://www.promptingguide.ai/techniques/react",
    "https://www.promptingguide.ai/techniques/reflexion",
    "https://www.promptingguide.ai/techniques/multimodalcot",
    "https://www.promptingguide.ai/techniques/graph",
    # /applications
    "https://www.promptingguide.ai/applications",
    "https://www.promptingguide.ai/applications/function_calling",
    "https://www.promptingguide.ai/applications/generating",
    "https://www.promptingguide.ai/applications/synthetic_rag",
    "https://www.promptingguide.ai/applications/generating_textbooks",
    "https://www.promptingguide.ai/applications/coding",
    "https://www.promptingguide.ai/applications/workplace_casestudy",
    "https://www.promptingguide.ai/applications/pf",
    # /prompts
    "https://www.promptingguide.ai/prompts",
    "https://www.promptingguide.ai/prompts/classification",
    "https://www.promptingguide.ai/prompts/coding",
    "https://www.promptingguide.ai/prompts/creativity",
    "https://www.promptingguide.ai/prompts/evaluation",
    "https://www.promptingguide.ai/prompts/information-extraction",
    "https://www.promptingguide.ai/prompts/image-generation",
    "https://www.promptingguide.ai/prompts/mathematics",
    "https://www.promptingguide.ai/prompts/question-answering",
    "https://www.promptingguide.ai/prompts/reasoning",
    "https://www.promptingguide.ai/prompts/text-summarization",
    "https://www.promptingguide.ai/prompts/truthfulness",
    "https://www.promptingguide.ai/prompts/adversarial-prompting",
    # /models
    "https://www.promptingguide.ai/models",
    "https://www.promptingguide.ai/models/chatgpt",
    "https://www.promptingguide.ai/models/claude-3",
    "https://www.promptingguide.ai/models/code-llama",
    "https://www.promptingguide.ai/models/flan",
    "https://www.promptingguide.ai/models/gemini",
    "https://www.promptingguide.ai/models/gemini-advanced",
    "https://www.promptingguide.ai/models/gemini-pro",
    "https://www.promptingguide.ai/models/gemma",
    "https://www.promptingguide.ai/models/gpt-4",
    "https://www.promptingguide.ai/models/grok-1",
    "https://www.promptingguide.ai/models/llama",
    "https://www.promptingguide.ai/models/llama-3",
    "https://www.promptingguide.ai/models/mistral-7b",
    "https://www.promptingguide.ai/models/mistral-large",
    "https://www.promptingguide.ai/models/mixtral",
    "https://www.promptingguide.ai/models/mixtral-8x22b",
    "https://www.promptingguide.ai/models/olmo",
    "https://www.promptingguide.ai/models/phi-2",
    "https://www.promptingguide.ai/models/sora",
    "https://www.promptingguide.ai/models/collection",
    # /risks
    "https://www.promptingguide.ai/risks",
    "https://www.promptingguide.ai/risks/adversarial",
    "https://www.promptingguide.ai/risks/factuality",
    "https://www.promptingguide.ai/risks/biases",
    # /research
    "https://www.promptingguide.ai/research",
    "https://www.promptingguide.ai/research/llm-agents",
    "https://www.promptingguide.ai/research/rag",
    "https://www.promptingguide.ai/research/llm-reasoning",
    "https://www.promptingguide.ai/research/rag-faithfulness",
    "https://www.promptingguide.ai/research/llm-recall",
    "https://www.promptingguide.ai/research/rag_hallucinations",
    "https://www.promptingguide.ai/research/synthetic_data",
    "https://www.promptingguide.ai/research/thoughtsculpt",
    "https://www.promptingguide.ai/research/infini-attention",
    "https://www.promptingguide.ai/research/guided-cot",
    "https://www.promptingguide.ai/research/trustworthiness-in-llms",
    "https://www.promptingguide.ai/research/llm-tokenization",
    "https://www.promptingguide.ai/research/groq",
    # single top-level pages
    "https://www.promptingguide.ai/papers",
    "https://www.promptingguide.ai/tools",
    "https://www.promptingguide.ai/notebooks",
    "https://www.promptingguide.ai/datasets",
    "https://www.promptingguide.ai/readings",
]
scrape_urls_to_markdown_and_html(
    urls=urls_prompt_engineering,
    output_prefix="prompt_engineering/prompt_engineering",
)