In [12]:
import time
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup, Comment

# URL the crawl starts from; its host is also what ALLOW_HOST is derived from.
START_URL = "https://www.musashino-u.ac.jp/"
# HTTP headers sent with every request made by fetch().
HEADERS = {"User-Agent": "Mozilla/5.0"}
# Passed as `timeout=` to requests.get() in fetch() (seconds).
TIMEOUT = 7

def host(url):
    """Return the lower-cased host (with port, if any) of *url*.

    The scheme prefix (``http://``, ``https://`` or a protocol-relative
    ``//``) is matched case-insensitively and removed.  The authority is
    then cut at the first ``/``, ``?`` or ``#``, any ``user:password@``
    prefix is dropped, and the result is lower-cased because host names
    compare case-insensitively.

    A string without a recognised prefix is treated as ``host/path``, so
    ``"example.com/a"`` yields ``"example.com"`` and a path such as
    ``"/a"`` yields ``""``.
    """
    rest = url
    lowered = url.lower()
    for prefix in ("http://", "https://", "//"):
        if lowered.startswith(prefix):
            rest = url[len(prefix):]
            break

    # The authority ends at whichever of these delimiters comes first;
    # cutting at each in turn leaves exactly that leading part.
    for delimiter in ("/", "?", "#"):
        rest = rest.split(delimiter, 1)[0]

    # Drop optional userinfo ("user:pw@host") and normalise the case.
    return rest.rpartition("@")[2].lower()

# Host of START_URL; same_host() compares candidate URLs against this value.
ALLOW_HOST = host(START_URL)

def normalize(url):
    """Return *url* without its fragment and without trailing slashes.

    Everything from the first ``#`` onward is discarded, then all ``/``
    characters at the end of what remains are removed.
    """
    without_fragment, _, _ = url.partition("#")
    return without_fragment.rstrip("/")

def to_abs(base, href):
    """Resolve *href* against *base* and return a normalized http(s) URL.

    Args:
        base: Absolute URL of the page the link was found on.
        href: Raw ``href`` attribute value (may be empty or ``None``).

    Returns:
        The absolute URL passed through ``normalize()``, or ``None`` when
        *href* is empty or does not resolve to an ``http``/``https`` URL
        (``javascript:``, ``mailto:``, ``tel:``, ``ftp:``, ``data:`` ...).

    Resolution is delegated to ``urllib.parse.urljoin`` so that
    protocol-relative links (``//host/x``), query-only and fragment-only
    links (``?p=2``, ``#top``) and dot segments (``a/../b``) are handled
    according to the standard relative-reference rules.
    """
    if not href:
        return None
    h = href.strip()

    # Explicit early exit for the common non-navigational schemes; kept
    # separate from the generic scheme check below so these are rejected
    # regardless of how the URL parser classifies e.g. "tel:12345".
    if h.lower().startswith(("javascript:", "mailto:", "tel:")):
        return None

    absolute = urljoin(base, h)

    # Anything that did not end up as http(s) cannot be fetched by the crawler.
    if urlsplit(absolute).scheme not in ("http", "https"):
        return None
    return normalize(absolute)

def same_host(url):
    """Tell whether the host of *url* equals ``ALLOW_HOST``."""
    candidate = host(url)
    return ALLOW_HOST == candidate

def fetch(url):
    """Download *url* and return its body as text, or ``None``.

    ``None`` is returned when the request raises
    ``requests.RequestException``, when the status code is not 200, or
    when the ``Content-Type`` header contains neither ``html`` nor
    ``text/``.

    The text is decoded with the charset named in ``Content-Type`` when
    one is present; otherwise with the encoding detected from the body
    (``r.apparent_encoding``), falling back to UTF-8.
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
    except requests.RequestException:
        return None
    if r.status_code != 200:
        return None

    ct = r.headers.get("Content-Type", "").lower()
    if "html" not in ct and "text/" not in ct:
        return None

    enc = ""
    if "charset=" in ct:
        # The parameter value may be quoted (charset="utf-8"); strip the
        # quotes so the codec name is usable.
        enc = ct.split("charset=")[-1].split(";")[0].strip().strip("\"'")
    if not enc:
        # Detection inspects the whole body, so only run it when the
        # header did not name a charset.
        enc = r.apparent_encoding
    r.encoding = enc or "utf-8"
    return r.text

def title_of(html):
    """Return the stripped text of the first ``<title>`` element in *html*.

    An empty string is returned when *html* is falsy or when no
    ``<title>`` element is found.
    """
    if not html:
        return ""
    title_tag = BeautifulSoup(html, "html.parser").find("title")
    if not title_tag:
        return "".strip()
    return title_tag.get_text().strip()

def links(html, base):
    """Collect the same-host link targets found in *html*.

    Args:
        html: Page markup; a falsy value yields an empty list.
        base: URL of the page, used to resolve each ``href`` via ``to_abs()``.

    Returns:
        A list (document order, duplicates kept) of the URLs returned by
        ``to_abs()`` for every ``<a href=...>`` that are truthy and pass
        ``same_host()``.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, "html.parser")

    # Detach comment nodes from the tree before looking for anchors.
    comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
    for node in comment_nodes:
        node.extract()

    resolved = (
        to_abs(base, anchor["href"]) for anchor in soup.find_all("a", href=True)
    )
    return [target for target in resolved if target and same_host(target)]

def crawl(start, max_pages=None, delay=1.0):
    """Crawl depth-first from *start*, staying on the allowed host.

    Args:
        start: URL to begin from; it is passed through ``normalize()``.
        max_pages: Optional upper bound on the number of pages recorded.
            ``None`` (the default) keeps crawling until no unvisited
            same-host link remains.
        delay: Seconds to sleep after each fetch (default 1.0).

    Returns:
        A dict mapping each visited URL to the value of ``title_of()`` for
        its body; pages for which ``fetch()`` returned ``None`` are
        recorded with an empty title.
    """
    # An insertion-ordered dict used as a LIFO stack without duplicates.
    # Re-inserting a key moves it to the top, which gives the same visit
    # order as pushing duplicates onto a list and skipping already-seen
    # entries on pop, but without the list growing with every repeated link.
    pending = {normalize(start): None}
    seen = set()
    result = {}

    while pending:
        if max_pages is not None and len(result) >= max_pages:
            break

        url, _ = pending.popitem()
        if url in seen or not same_host(url):
            continue

        html = fetch(url)
        seen.add(url)
        result[url] = title_of(html)
        time.sleep(delay)

        if not html:
            continue
        for v in links(html, url):
            v = normalize(v)
            if v not in seen:
                pending.pop(v, None)
                pending[v] = None
    return result

if __name__ == "__main__":
    # Crawl from START_URL and print the resulting {url: title} mapping.
    # NOTE(review): this call sets no page limit, so it runs until no
    # unvisited same-host link remains, with a pause after every fetch.
    data = crawl(START_URL)
    print(data)

KeyboardInterrupt: 