  # 最終課題 

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time # It's polite to pause between requests

In [3]:
# The starting URL and the domain we must stay within
BASE_URL = "https://www.musashino-u.ac.jp/"
DOMAIN = urlparse(BASE_URL).netloc  # This will be "www.musashino-u.ac.jp"

# This is the dictionary for the final output (key: URL, value: Title)
sitemap = {}

# This set stores URLs we've already visited or added to the queue,
# to prevent re-crawling and getting into loops.
visited_urls = set()

# This list will act as a "queue" of URLs we need to crawl.
# We start by adding the base URL.
urls_to_crawl = [BASE_URL]

In [4]:
# We'll limit the crawl to 100 pages to prevent it from running forever
# You can change or remove this limit.
MAX_PAGES = 100 

while urls_to_crawl and len(sitemap) < MAX_PAGES:
    # 1. Get the next URL from the queue
    current_url = urls_to_crawl.pop(0) # .pop(0) gets from the front (Queue)
    
    # 2. Check if we've already processed this URL
    if current_url in visited_urls:
        continue
        
    # 3. Mark it as visited
    visited_urls.add(current_url)

    # 4. Fetch and Parse the Page (See Step 5)
    print(f"Crawling: {current_url}") # Good for seeing progress
    
    try:
        response = requests.get(current_url, timeout=5)
        # Be polite! Wait a moment before the next request
        time.sleep(0.5) 
        
        if response.status_code != 200:
            continue # Skip if the page is broken (404, 500, etc.)
            
        soup = BeautifulSoup(response.text, 'html.parser')

        # 5. Extract the Title (as required by the assignment)
        title = "No Title Found" # Default
        if soup.title and soup.title.string:
            title = soup.title.string.strip() # .strip() removes whitespace

        # 6. Store in the dictionary
        sitemap[current_url] = title

        # 7. Find all new links (See Step 6)
        all_links = soup.find_all('a', href=True)
        
        for link in all_links:
            href = link['href']
            
            # Create an absolute URL (e.g., turn "/about" into "https://.../about")
            full_url = urljoin(BASE_URL, href)
            
            # --- This is the filtering logic ---
            
            # 1. Check if it's in the same domain
            if urlparse(full_url).netloc != DOMAIN:
                continue
                
            # 2. Check if we've already visited it or queued it
            if full_url in visited_urls:
                continue
                
            # 3. Ignore "fragment" links (like #section1)
            if '#' in full_url:
                continue
                
            # If the link is good, add it to the queue!
            urls_to_crawl.append(full_url)
            # Also add to visited_urls here to avoid queuing duplicates
            visited_urls.add(full_url) 

    except requests.RequestException as e:
        print(f"Error fetching {current_url}: {e}")
        
# --- End of while loop ---

# 8. Print the final result
print("--- CRAWLING COMPLETE ---")
print("Found pages:")
print(sitemap)

Crawling: https://www.musashino-u.ac.jp/
--- CRAWLING COMPLETE ---
Found pages:
{'https://www.musashino-u.ac.jp/': 'æ\xad¦è\x94µé\x87\x8eå¤§å\xad¦'}
