In [106]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

import re

import pandas as pd

In [89]:
# List of file extensions to skip
SKIP_EXTENSIONS = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg", ".pdf", ".zip", ".tar", ".gz", ".mp4", ".webp")

def is_valid(url):
    parsed = urlparse(url)
    path = parsed.path.lower()
    return (
        parsed.scheme in {"http", "https"}
        and parsed.netloc == "learning.rc.virginia.edu"
        and path.startswith("/notes")
        and not path.endswith(SKIP_EXTENSIONS)
    )

def crawl(url):
    if url in visited:
        return
    visited.add(url)

    try:
        response = requests.get(url, timeout=5)
        content_type = response.headers.get("Content-Type", "")

        # Skip non-HTML content (e.g. image files served without .png in URL)
        if "text/html" not in content_type:
            print(f"Skipping non-HTML URL: {url}")
            return

        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.text, "html.parser")

        # Remove images before text extraction
        for tag in soup.find_all("img"):
            tag.decompose()

        # Extract and clean text
        text = soup.get_text(separator="\n")
        text = re.sub(r"https?:\/\/\S+?\.png", "", text)  # remove .png URLs
        text = re.sub(r"\S+\.png", "", text)              # remove local refs
        text = re.sub(r"\n{3,}", "\n\n", text)
        text = re.sub(r"[ \t]+", " ", text)

        documents[url] = text.strip()

        # Crawl linked pages recursively
        for a_tag in soup.find_all("a", href=True):
            next_url = urljoin(url, a_tag['href'])
            if is_valid(next_url):
                crawl(next_url)

        time.sleep(0.1)

    except Exception as e:
        print(f"Failed to crawl {url}: {e}")

In [77]:
def clean_html_text(html_text):
    soup = BeautifulSoup(html_text, "html.parser")

    # Remove script and style elements
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    # Optionally remove common navbars by class or id (you can tweak these)
    for div in soup.find_all(["div", "section"], class_=re.compile("(nav|footer|menu|header)", re.I)):
        div.decompose()

    # Extract visible text
    text = soup.get_text(separator="\n")

    # Remove excessive blank lines
    text = re.sub(r"\n{2,}", "\n\n", text)  # max two newlines
    text = re.sub(r"[ \t]+", " ", text)     # normalize whitespace

    # Remove copyright or repeated footer junk
    text = re.sub(r"©\s*20\d{2}.*University of Virginia.*", "", text, flags=re.I)
    text = re.sub(r"Last updated on.*", "", text, flags=re.I)

    # Trim
    return text.strip()


In [78]:
def refine_cleaned_text(text):
    lines = text.strip().splitlines()

    # Skip first line when looking for "Container Basics"
    start_index = 0
    for i in range(1, len(lines)):  # start from second line
        if "github actions" in lines[i].lower():
            start_index = i + 1
            break

    # Strip from "Previous" and after
    end_index = len(lines)
    for i, line in enumerate(lines):
        if "previous" in line.lower():
            end_index = i
            break

    cleaned_lines = lines[start_index:end_index]
    cleaned_text = "\n".join(cleaned_lines).strip()

    return cleaned_text


In [100]:
BASE_URL = "https://learning.rc.virginia.edu/notes/containers/"
visited = set()
documents = {}

In [101]:
crawl(BASE_URL)

In [102]:
documents

{'https://learning.rc.virginia.edu/notes/containers/': 'Introduction to Building and Deploying Containers | RC Learning Portal\n\nSearch\n\nRC Learning Portal\n\nRC Learning Portal\n\nHome\n\nShort Courses\n\nTutorials\n\nPeople\n\nContact\n\nContainers\n\nWhat Are Containers?\n\nWhy Use Containers?\n\nContainers vs VMs\n\nContainer Services\n\nContainer Basics\n\nA Quick Example\n\nRunning WebApps in a Container\n\nBuilding a Docker Image\n\nServing a ShinyApp\n\nWriting the Dockerfile\n\nBuilding the App Image\n\nRunning and Debugging\n\nPushing our Image to a Container Registry\n\nVersion Control\n\nCreating a New GitHub Repository\n\nCommon Git Commands\n\nGitHub Actions\n\nIntroduction to Building and Deploying Containers\n\nYou’ve developed an app and written the paper. Now it’s time to deploy the app so the world (and the reviewers) can see how awesome it is. This is Part 1 of a two-part workshop that will cover how to deploy web apps for publication. In Part 1 we will go over h

In [103]:
cleaned_docs = {}

for url, html_text in documents.items():
    cleaned_docs[url] = clean_html_text(html_text)


In [96]:
print(cleaned_docs['https://learning.rc.virginia.edu/notes/containers/basics/'])

Container Basics | RC Learning Portal

Search

RC Learning Portal

RC Learning Portal

Home

Short Courses

Tutorials

People

Contact

Containers

What Are Containers?

Why Use Containers?

Containers vs VMs

Container Services

Container Basics

A Quick Example

Running WebApps in a Container

Building a Docker Image

Serving a ShinyApp

Writing the Dockerfile

Building the App Image

Running and Debugging

Pushing our Image to a Container Registry

Version Control

Creating a New GitHub Repository

Common Git Commands

GitHub Actions

Container Basics

To run and build containers, you will need Docker Desktop installed on your local machine. Instructions and installation files can be found here: 
https://docs.docker.com/engine/install/
.

Terminology

Image
: The layers of libraries, code, and configuration that make up the environment that you need to run your application.

Container
: A running instance of an image. You can have many containers of a single image run simultaneously

In [104]:
refined_docs = {
    url: refine_cleaned_text(text)
    for url, text in cleaned_docs.items()
}

In [98]:
print(refined_docs['https://learning.rc.virginia.edu/notes/containers/basics/'])

Container Basics

To run and build containers, you will need Docker Desktop installed on your local machine. Instructions and installation files can be found here: 
https://docs.docker.com/engine/install/
.

Terminology

Image
: The layers of libraries, code, and configuration that make up the environment that you need to run your application.

Container
: A running instance of an image. You can have many containers of a single image run simultaneously.

DockerHub
: An online registry for Docker images (similar to GitHub)

Commonly Used Docker Commands

docker pull
: Fetches an image from a container registry to your local machine

docker images
: List all locally available images (kind of like ls)

docker run
: Run a container based on a particular image


In [105]:
refined_docs

{'https://learning.rc.virginia.edu/notes/containers/': 'Introduction to Building and Deploying Containers\n\nYou’ve developed an app and written the paper. Now it’s time to deploy the app so the world (and the reviewers) can see how awesome it is. This is Part 1 of a two-part workshop that will cover how to deploy web apps for publication. In Part 1 we will go over how to containerize our apps with Docker and maintain them with GitHub.\n\n \n\n\n\n Â©Â\xa02025 The Rector and Visitors of the University of Virginia \n \n\nCite\n\n×\n\n Copy\n \n\n Download',
 'https://learning.rc.virginia.edu/notes/containers/overview/': 'What Are Containers?\n\nShipping containers are used to transport cargo around the world. In computing, containers allow you to transport and share entire filesystems, processes, scripts, and more!\n\nNext\n\nWhy Use Containers?\n\n\n\n Â©Â\xa02025 The Rector and Visitors of the University of Virginia \n \n\nCite\n\n×\n\n Copy\n \n\n Download',
 'https://learning.rc.vir

In [107]:
data = pd.DataFrame(list(refined_docs.items()), columns=["url", "content"])
data.head()

Unnamed: 0,url,content
0,https://learning.rc.virginia.edu/notes/contain...,Introduction to Building and Deploying Contain...
1,https://learning.rc.virginia.edu/notes/contain...,What Are Containers?\n\nShipping containers ar...
2,https://learning.rc.virginia.edu/notes/contain...,Why Use Containers?\n\nHave you ever tried usi...
3,https://learning.rc.virginia.edu/notes/contain...,Containers vs VMs\n\nYou may be familiar with ...
4,https://learning.rc.virginia.edu/notes/contain...,Container Services\n\nContainer-based architec...


In [109]:
def extract_tail(url):
    path = urlparse(url).path
    tail = path.rstrip("/").split("/")[-1]
    return tail or "index"


In [110]:
data["title"] = data["url"].apply(extract_tail)
data.head()

Unnamed: 0,url,content,title
0,https://learning.rc.virginia.edu/notes/contain...,Introduction to Building and Deploying Contain...,containers
1,https://learning.rc.virginia.edu/notes/contain...,What Are Containers?\n\nShipping containers ar...,overview
2,https://learning.rc.virginia.edu/notes/contain...,Why Use Containers?\n\nHave you ever tried usi...,overview-purpose
3,https://learning.rc.virginia.edu/notes/contain...,Containers vs VMs\n\nYou may be familiar with ...,overview-vms
4,https://learning.rc.virginia.edu/notes/contain...,Container Services\n\nContainer-based architec...,overview-services


In [111]:
data

Unnamed: 0,url,content,title
0,https://learning.rc.virginia.edu/notes/contain...,Introduction to Building and Deploying Contain...,containers
1,https://learning.rc.virginia.edu/notes/contain...,What Are Containers?\n\nShipping containers ar...,overview
2,https://learning.rc.virginia.edu/notes/contain...,Why Use Containers?\n\nHave you ever tried usi...,overview-purpose
3,https://learning.rc.virginia.edu/notes/contain...,Containers vs VMs\n\nYou may be familiar with ...,overview-vms
4,https://learning.rc.virginia.edu/notes/contain...,Container Services\n\nContainer-based architec...,overview-services
5,https://learning.rc.virginia.edu/notes/contain...,Container Basics\n\nTo run and build container...,basics
6,https://learning.rc.virginia.edu/notes/contain...,A Quick Example\n\nCowsay is a Linux game that...,basics-example
7,https://learning.rc.virginia.edu/notes/contain...,Running WebApps in a Container\n\nJust like wi...,basics-webapps
8,https://learning.rc.virginia.edu/notes/contain...,Building a Docker Image\n\nIn order to serve o...,basics-dockerfile
9,https://learning.rc.virginia.edu/notes/contain...,Serving a ShinyApp\n\nIn this section of the w...,shinyapp


In [113]:
data.to_csv('containers_data.csv')