In [6]:
import requests
from bs4 import BeautifulSoup

In [7]:
# Step 1: Fetch robots.txt (served at <site>/robots.txt), which usually lists the sitemap location
def fetch_robots_txt(website_url, timeout=10):
    """Download the robots.txt file of a website.

    Args:
        website_url: Base URL of the site including the scheme, e.g.
            "https://example.com". Trailing slashes are tolerated.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The body of robots.txt as text when the server answers with
        status 200; otherwise None, after printing a diagnostic message.
    """
    # Drop trailing slashes so the request never targets "//robots.txt".
    robots_url = website_url.rstrip("/") + "/robots.txt"
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(robots_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport errors the same way as a bad status code.
        print(f"Failed to fetch robots.txt: {exc}")
        return None

    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch robots.txt: {response.status_code}")
    return None
# Step 2: Extract the sitemap URL from robots.txt
def extract_sitemap_from_robots(robots_txt, base_url, verbose=True):
    """Return the first sitemap URL declared in a robots.txt body.

    Args:
        robots_txt: Text content of robots.txt.
        base_url: Site base URL (with scheme); used to build an absolute
            URL when the directive holds only a path.
        verbose: When True (the default), print the robots.txt content
            before scanning it, as a debugging aid.

    Returns:
        The absolute sitemap URL from the first non-empty "Sitemap:"
        directive, or None when the file declares none.
    """
    if verbose:
        print("Content of robots.txt:")
        print(robots_txt)

    for raw_line in robots_txt.splitlines():
        line = raw_line.strip()  # tolerate indented directives
        if not line.lower().startswith("sitemap:"):
            continue

        # Split on the FIRST colon only: the value is itself a URL that
        # contains colons ("https://..."), so a plain split(":") would
        # truncate it to just the scheme.
        sitemap_url = line.split(":", 1)[1].strip()
        if not sitemap_url:
            continue  # empty directive; keep looking

        if not sitemap_url.lower().startswith(("http://", "https://")):
            # Path-only value: anchor it to the site root without
            # producing a double slash.
            sitemap_url = base_url.rstrip("/") + "/" + sitemap_url.strip("/")
        return sitemap_url

    return None

In [8]:
# Step 3: Fetch the sitemap XML from the URL found in robots.txt (commonly <site>/sitemap.xml)
def fetch_sitemap(sitemap_url, timeout=10):
    """Download a sitemap document.

    Args:
        sitemap_url: Absolute URL of the sitemap.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status
        200; otherwise None, after printing a diagnostic message.
    """
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(sitemap_url, timeout=timeout)
    except requests.RequestException as exc:
        # Report transport errors the same way as a bad status code.
        print(f"Failed to fetch sitemap: {exc} - URL: {sitemap_url}")
        return None

    if response.status_code == 200:
        return response.text
    print(f"Failed to fetch sitemap: {response.status_code} - URL: {sitemap_url}")
    return None

# Step 4: Parse the sitemap XML and extract URLs
def parse_sitemap(sitemap_xml):
    """Collect the text of every <loc> element in a sitemap document.

    Args:
        sitemap_xml: Sitemap content as a string.

    Returns:
        A list with the text of each <loc> tag, in document order.
    """
    document = BeautifulSoup(sitemap_xml, 'xml')
    locations = []
    for loc_tag in document.find_all('loc'):
        locations.append(loc_tag.get_text())
    return locations

In [9]:
# Main function to extract sitemap and URLs
def main(website_url):
    """Locate a site's sitemap through robots.txt and print a URL preview.

    Args:
        website_url: Site address; "https://" is prepended when it does
            not already start with "http://" or "https://".

    Prints the sitemap location, the number of URLs found and at most
    the first ten of them, or a message describing which step failed.
    Returns None.
    """
    # Default to https when the caller passed an address without a scheme.
    if not website_url.startswith(("http://", "https://")):
        website_url = "https://" + website_url

    robots_body = fetch_robots_txt(website_url)
    if not robots_body:
        print("Failed to retrieve robots.txt")
        return

    # The base URL is passed along so a path-only sitemap entry can be resolved.
    sitemap_location = extract_sitemap_from_robots(robots_body, website_url)
    if not sitemap_location:
        print("No sitemap found in robots.txt")
        return

    print(f"Sitemap found: {sitemap_location}")
    sitemap_body = fetch_sitemap(sitemap_location)
    if not sitemap_body:
        # Nothing further is printed here when the sitemap body is missing or empty.
        return

    found_urls = parse_sitemap(sitemap_body)
    print(f"Found {len(found_urls)} URLs in sitemap.")
    # Cap the listing at ten entries to keep the cell output short.
    for entry in found_urls[:10]:
        print(entry)

# Example usage: the placeholder address has no scheme, so main() prepends "https://".
website_url = "url"
main(website_url=website_url)

Content of robots.txt:
# Notice: Collection of data on Facebook through automated means is
# prohibited unless you have express written permission from Facebook
# and may only be conducted for the limited purpose contained in said
# permission.
# See: http://www.facebook.com/apps/site_scraping_tos_terms.php
User-agent: Amazonbot
Disallow: /

User-agent: Applebot-Extended
Disallow: /

User-agent: ClaudeBot
Disallow: /

User-agent: Google-Extended
Disallow: /

User-agent: GPTBot
Disallow: /

User-agent: PetalBot
Disallow: /

User-agent: viberbot
Disallow: /

User-agent: YaK
Disallow: /

User-agent: Yandex
Disallow: /

User-agent: Applebot
Disallow: /*/plugins/*
Disallow: /?*next=
Disallow: /a/bz?
Disallow: /ajax/
Disallow: /album.php
Disallow: /checkpoint/
Disallow: /contact_importer/
Disallow: /dialog/
Disallow: /fbml/ajax/dialog/
Disallow: /feeds/
Disallow: /file_download.php
Disallow: /job_application/
Disallow: /l.php
Disallow: /login.php*&next=
Disallow: /login.php/?next=
Disallow: 