# In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import defaultdict
import json
import time
import threading
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
import sys

class URLHierarchyBuilder:
    """Crawl the internal links of a site and record which page links where.

    Starting from ``base_url``, pages are fetched depth-first. For every
    fetched page the internal links found in its ``<a href>`` tags are stored
    in ``hierarchy`` (page URL -> list of ``{'url', 'text'}`` dicts), the page
    URL is added to ``visited_urls``, and request/parsing failures are
    collected in ``errors``.
    """

    def __init__(self, base_url, max_depth=2, max_urls=500, rate_limit=1):
        """Store the crawl limits and create the HTTP session.

        Args:
            base_url: URL the crawl starts from; its network location decides
                which links count as internal.
            max_depth: Pages at this depth or deeper are not fetched (the
                start page is depth 0).
            max_urls: Upper bound on the number of URLs marked as visited.
            rate_limit: Minimum number of seconds between two requests.
        """
        self.base_url = base_url
        self.hierarchy = defaultdict(list)
        self.visited_urls = set()
        self.max_depth = max_depth
        self.max_urls = max_urls
        self.rate_limit = rate_limit
        self.last_request_time = 0
        self.lock = threading.Lock()
        self.processed_count = 0
        self.errors = defaultdict(list)
        self.setup_session()

    def setup_session(self):
        """Create ``self.session`` with a retry policy and a browser User-Agent."""
        self.session = requests.Session()
        retry_strategy = Retry(
            total=3,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("http://", adapter)
        self.session.mount("https://", adapter)
        # Update instead of assigning: assignment would throw away the
        # session's default headers and replace its case-insensitive mapping
        # with a plain dict.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def rate_limited_request(self, url):
        """GET ``url``, waiting first so requests are ``rate_limit`` seconds apart.

        Returns:
            The response on success, or ``None`` when the request failed or
            returned an error status; the failure is appended to
            ``errors['request']`` as ``(url, message)``.
        """
        # The sleep happens while holding the lock so that callers are
        # serialized and each one observes the previous request's timestamp.
        with self.lock:
            current_time = time.time()
            time_since_last_request = current_time - self.last_request_time
            if time_since_last_request < self.rate_limit:
                time.sleep(self.rate_limit - time_since_last_request)
            self.last_request_time = time.time()

        try:
            response = self.session.get(url, timeout=10)
            response.raise_for_status()
            return response
        except requests.exceptions.RequestException as e:
            self.errors['request'].append((url, str(e)))
            return None

    def get_page_links(self, url, current_depth=0):
        """Fetch ``url``, record its internal links and recurse into them.

        Does nothing when ``current_depth`` has reached ``max_depth``, when
        ``url`` was already visited, or when ``max_urls`` URLs have been
        visited. Parsing failures are appended to ``errors['parsing']``.
        """
        if current_depth >= self.max_depth:
            return

        # Check and mark in one critical section so a URL is claimed once.
        with self.lock:
            if (url in self.visited_urls or
                    len(self.visited_urls) >= self.max_urls):
                return
            self.visited_urls.add(url)
            self.processed_count += 1
            percentage = (self.processed_count / self.max_urls) * 100
            print(f"Processing {self.processed_count}/{self.max_urls} ({percentage:.1f}%): {url} (Depth: {current_depth})")

        response = self.rate_limited_request(url)
        if not response:
            return

        try:
            soup = BeautifulSoup(response.text, 'html.parser')
            for link in soup.find_all('a'):
                href = link.get('href')
                if not href:
                    continue

                try:
                    absolute_url = urljoin(url, href)
                except ValueError as e:
                    # A single malformed href must not discard the remaining
                    # links of this page.
                    self.errors['parsing'].append((url, f"{href}: {e}"))
                    continue

                if not self.is_valid_internal_url(absolute_url):
                    continue

                link_text = link.get_text(strip=True) or absolute_url
                with self.lock:
                    self.hierarchy[url].append({
                        'url': absolute_url,
                        'text': link_text
                    })
                if absolute_url not in self.visited_urls:
                    self.get_page_links(absolute_url, current_depth + 1)

        except Exception as e:
            self.errors['parsing'].append((url, str(e)))

    def is_valid_internal_url(self, url):
        """Return True if ``url`` is an http(s) URL on the base host without '#'.

        A URL is accepted when its network location equals that of
        ``base_url``, its scheme is ``http`` or ``https`` and it contains no
        ``#`` character. Values that cannot be parsed yield ``False``.
        """
        try:
            parsed_base = urlparse(self.base_url)
            parsed_url = urlparse(url)
            return (parsed_url.netloc == parsed_base.netloc and
                    parsed_url.scheme in ['http', 'https'] and
                    '#' not in url)
        except (ValueError, TypeError, AttributeError):
            # Only malformed input is treated as "not valid"; KeyboardInterrupt
            # and SystemExit are deliberately allowed to propagate.
            return False

    def create_simplified_structure(self):
        """Build the JSON-serializable summary of the crawl.

        Returns:
            A dict with ``metadata`` (base URL, visited count, max depth,
            local timestamp), ``urls`` (visited URLs, sorted), ``links``
            (page URL -> list of linked URLs) and ``errors``.
        """
        return {
            'metadata': {
                'base_url': self.base_url,
                'total_urls': len(self.visited_urls),
                'max_depth': self.max_depth,
                'timestamp': time.strftime('%Y-%m-%d %H:%M:%S')
            },
            # Sorted so the output does not depend on set iteration order.
            'urls': sorted(self.visited_urls),
            'links': {url: [link['url'] for link in links]
                      for url, links in self.hierarchy.items()},
            'errors': dict(self.errors)
        }

    def save_results(self, filename='url_structure.json'):
        """Write the crawl summary to ``filename`` and the URLs to 'url_list.txt'.

        If either write fails, the error is printed and the visited URLs are
        written to 'emergency_url_list.txt' instead; a failure of that
        fallback is printed as well. No exception is raised to the caller.
        """
        print("\nPreparing to save results...")
        try:
            data = self.create_simplified_structure()
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Results successfully saved to '{filename}'")

            # Save a simple URL list for backup
            with open('url_list.txt', 'w', encoding='utf-8') as f:
                f.writelines(f"{url}\n" for url in sorted(self.visited_urls))
            print("URL list saved to 'url_list.txt'")

        except Exception as e:
            print(f"Error saving results: {e}")
            # Emergency save of just the URLs
            try:
                with open('emergency_url_list.txt', 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.visited_urls))
                print("Emergency URL list saved to 'emergency_url_list.txt'")
            except Exception as e2:
                print(f"Emergency save failed: {e2}")

    def crawl(self):
        """Run the crawl from ``base_url``, print statistics and save results.

        Results go to the default file on success, to
        'interrupted_results.json' after a KeyboardInterrupt and to
        'error_results.json' after any other exception.
        """
        start_time = time.time()
        print(f"Starting crawl of {self.base_url}")
        print(f"Max depth: {self.max_depth}, Max URLs: {self.max_urls}")

        try:
            self.get_page_links(self.base_url)

            end_time = time.time()
            duration = end_time - start_time
            visited_count = len(self.visited_urls)

            print("\nCrawl Statistics:")
            print(f"Total URLs processed: {visited_count}")
            print(f"Time taken: {duration:.2f} seconds")
            # Nothing is visited when max_depth or max_urls is 0; dividing
            # then would turn a clean run into a reported crawl failure.
            if visited_count:
                print(f"Average time per URL: {duration/visited_count:.2f} seconds")
            else:
                print("Average time per URL: n/a (no URLs processed)")
            print(f"Total errors: {sum(len(errors) for errors in self.errors.values())}")

            self.save_results()

        except KeyboardInterrupt:
            print("\nCrawl interrupted by user")
            self.save_results('interrupted_results.json')
        except Exception as e:
            print(f"\nCrawl failed: {e}")
            self.save_results('error_results.json')

def main():
    """Crawl the configured site; exit with status 1 on a fatal error."""
    settings = {
        'base_url': "BASEURL",
        'max_depth': 3,    # keep the crawl shallow
        'max_urls': 1000,  # cap on the number of URLs
        'rate_limit': 1,   # seconds between requests
    }

    try:
        URLHierarchyBuilder(**settings).crawl()
    except Exception as exc:
        print(f"Fatal error: {exc}")
        sys.exit(1)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()