In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from robotexclusionrulesparser import RobotExclusionRulesParser
import time

# --- Configuration ---
# The list of 10 traditional websites to crawl
TARGET_WEBSITES = [
    "https://www.thehindu.com/",
    "https://www.iisc.ac.in/",
    "https://www.isro.gov.in/",
    "https://archive.org/",
    "https://news.mit.edu/",
    "https://www.nasa.gov/",
    "https://www.theguardian.com/world",
    "https://www.gutenberg.org/",
    "https://www.nationalgeographic.com/",
    "https://eng.bangaloreuniversity.ac.in/","https://timesofindia.indiatimes.com/",
    "https://www.ncbs.res.in/",
    "https://www.nytimes.com/international/",
    "https://www.espncricinfo.com/",
    "https://www.reuters.com/",
    "https://www.taralaya.org/",
    "https://www.kyoto-u.ac.jp/en",
    "https://www.britishmuseum.org/",
    "https://www.livemint.com/",
    "https://www.who.int/"
]
OUTPUT_FILE = "combined_dataset.txt"
MAX_LINKS_PER_SITE = 40  # The limit for links from each website
USER_AGENT = "SecureLoggerBot/1.0"   # How your bot identifies itself

# --- Crawler Logic ---
all_collected_paths = set()

print(f"[*] Starting multi-site crawl...")

for base_url in TARGET_WEBSITES:
    print(f"\n--- Crawling {base_url} ---")
    
    try:
        # 1. Fetch and parse robots.txt for the current site
        parsed_base_url = urlparse(base_url)
        robots_url = f"{parsed_base_url.scheme}://{parsed_base_url.netloc}/robots.txt"
        rerp = RobotExclusionRulesParser()
        rerp.fetch(robots_url)
        print(f"[+] robots.txt for {base_url} loaded.")
        
        urls_to_visit = [base_url]
        visited_urls = set()
        links_found_for_site = 0
        
        # 2. Start crawling the site until the limit is reached
        while urls_to_visit and links_found_for_site < MAX_LINKS_PER_SITE:
            current_url = urls_to_visit.pop(0)
            
            if current_url in visited_urls:
                continue

            # Respect the rules from robots.txt
            if not rerp.is_allowed(USER_AGENT, current_url):
                print(f"  [-] Disallowed by robots.txt: {current_url}")
                continue

            visited_urls.add(current_url)
            
            try:
                # 3. Fetch the page
                headers = {'User-Agent': USER_AGENT}
                response = requests.get(current_url, headers=headers, timeout=5)
                
                if response.status_code == 200:
                    # 4. Save the allowed URL path
                    path = urlparse(current_url).path
                    if not path: path = '/'
                    
                    if path not in all_collected_paths:
                        all_collected_paths.add(path)
                        links_found_for_site += 1
                        print(f"  [{links_found_for_site}/{MAX_LINKS_PER_SITE}] Found: {path}")

                    # 5. Find all new links on the page
                    soup = BeautifulSoup(response.content, 'html.parser')
                    for link in soup.find_all('a', href=True):
                        absolute_url = urljoin(base_url, link['href'])
                        # Only follow links that are on the same website
                        if urlparse(absolute_url).netloc == parsed_base_url.netloc:
                            if absolute_url not in visited_urls:
                                urls_to_visit.append(absolute_url)
                                
            except requests.RequestException as e:
                print(f"[!] Error fetching {current_url}: {e}")
    
    except Exception as e:
        print(f"[!] Critical error processing {base_url}: {e}")

# --- Save the Final Dataset ---
print(f"\n[*] Crawl finished. Total unique paths found: {len(all_collected_paths)}")
with open(OUTPUT_FILE, 'w') as f:
    # Sort the list for a clean output file
    for path in sorted(list(all_collected_paths)):
        f.write(path + '\n')
print(f"[*] Dataset saved to {OUTPUT_FILE}")

[*] Starting multi-site crawl...

--- Crawling https://www.thehindu.com/ ---
[+] robots.txt for https://www.thehindu.com/ loaded.
  [-] Disallowed by robots.txt: https://www.thehindu.com/

--- Crawling https://www.iisc.ac.in/ ---
[+] robots.txt for https://www.iisc.ac.in/ loaded.
  [-] Disallowed by robots.txt: https://www.iisc.ac.in/

--- Crawling https://www.isro.gov.in/ ---
[+] robots.txt for https://www.isro.gov.in/ loaded.
  [-] Disallowed by robots.txt: https://www.isro.gov.in/

--- Crawling https://archive.org/ ---
[+] robots.txt for https://archive.org/ loaded.
  [1/40] Found: /

--- Crawling https://news.mit.edu/ ---
[+] robots.txt for https://news.mit.edu/ loaded.
  [1/40] Found: /topics
  [2/40] Found: /topic/machine-learning
  [3/40] Found: /topic/sustainability
  [4/40] Found: /topic/startups
  [5/40] Found: /topic/black-holes
  [6/40] Found: /topic/classes-and-programs
  [7/40] Found: /departments
  [8/40] Found: /department/aeronautics-and-astronautics
  [9/40] Found: /d