In [1]:
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import urllib.robotparser
from urllib.parse import urlparse, urljoin

In [2]:
# Parsed robots.txt rules keyed by robots.txt URL, so a crawl that visits many
# pages on one host downloads that host's robots.txt only once.
_ROBOTS_CACHE = {}


def is_allowed(url, user_agent = '*'):
    """Return whether robots.txt permits ``user_agent`` to fetch ``url``.

    The robots.txt location is derived from the scheme and host of ``url``.
    The parsed rules are cached per robots.txt URL for the lifetime of the
    module, so repeated checks against the same host do not trigger another
    download.

    Args:
        url: Absolute URL of the page to check.
        user_agent: User-agent token to match against the rules
            (defaults to the wildcard ``'*'``).

    Returns:
        The result of ``RobotFileParser.can_fetch`` for the given agent/URL.

    Raises:
        Whatever ``RobotFileParser.read`` raises when robots.txt cannot be
        downloaded; nothing is cached in that case, so a later call retries.
    """
    parsed_url = urlparse(url)
    base_url = f'{parsed_url.scheme}://{parsed_url.netloc}'
    robots_url = urljoin(base_url, '/robots.txt')

    rp = _ROBOTS_CACHE.get(robots_url)
    if rp is None:
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        # Store only after a successful read. Two threads racing here merely
        # fetch the same file twice; the last writer wins, which is harmless.
        _ROBOTS_CACHE[robots_url] = rp

    return rp.can_fetch(user_agent, url)

In [3]:
def fetch_page(url, timeout = 10):
    """Download ``url`` and return it parsed as a BeautifulSoup document.

    The page is fetched only when ``is_allowed`` reports that robots.txt
    permits it. Every failure mode (disallowed URL, non-200 status, network
    or parsing error, robots.txt lookup error) is reported with ``print`` and
    results in ``None`` so that callers running many fetches in parallel are
    not interrupted by a single bad URL.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``;
            without it a stalled server would block the calling thread
            indefinitely.

    Returns:
        A ``BeautifulSoup`` object built with the ``'html.parser'`` backend
        on a 200 response, otherwise ``None``.
    """
    try:
        # The robots.txt check performs network I/O too, so it lives inside
        # the try block alongside the page request.
        if not is_allowed(url):
            print(f'Scrapping not allowed for {url}')
            return None

        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            print(f'Successfully fetched {url}')
            return BeautifulSoup(response.content, 'html.parser')

        print(f'Failed to fetch the {url} with status code : {response.status_code}')
    except Exception as e:
        # Best-effort fetch: report and fall through to the None return.
        print(f'Exception occured while fetching from {url} : {e}')

    return None

In [4]:
def extract_links(soup, base_url):
    """Collect the absolute target of every ``<a href=...>`` in ``soup``.

    Args:
        soup: Parsed document exposing ``find_all``; a falsy value (for
            example ``None`` from a failed fetch) yields an empty list.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        A list of URLs in document order, each produced by
        ``urljoin(base_url, href)``. Duplicates are kept.
    """
    if not soup:
        return []

    # 'a' is the tag name; href=True restricts the match to anchors that
    # actually carry an href attribute.
    return [urljoin(base_url, link['href']) for link in soup.find_all('a', href = True)]

In [5]:
def scrape_urls(urls, max_workers = 5):
    """Fetch every URL in ``urls`` concurrently and return the pages obtained.

    Each URL is handed to ``fetch_page`` on a thread pool. Results are
    gathered in submission order; falsy results (failed fetches) are dropped.
    If a task raises, the error is printed together with its URL and the
    remaining tasks are still collected.

    Args:
        urls: Iterable of URLs to fetch.
        max_workers: Size of the thread pool.

    Returns:
        A list of the truthy values returned by ``fetch_page``, ordered as
        the corresponding URLs were submitted.
    """
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Keep each URL next to its future so a failure can be attributed.
        submitted = [(url, executor.submit(fetch_page, url)) for url in urls]
        for url, future in submitted:
            try:
                result = future.result()
            except Exception as e:
                # One broken URL must not discard every other page.
                print(f'Exception occured while scraping {url} : {e}')
                continue
            if result:
                results.append(result)
    return results
        

In [6]:
def main(start_url = 'https://example.com'):
    """Crawl one level deep from ``start_url`` and print each page title.

    The start page is fetched, its links are resolved against ``start_url``,
    the linked pages are fetched concurrently via ``scrape_urls``, and the
    text of each page's ``<title>`` element is printed. Pages without a
    ``<title>`` are skipped.

    Args:
        start_url: Page whose outgoing links are crawled.

    Returns:
        None. Returns early, printing nothing further, when the start page
        could not be fetched.
    """
    soup = fetch_page(start_url)
    if not soup:
        return

    # Relative hrefs are resolved against the page they were found on.
    links = extract_links(soup, start_url)
    pages = scrape_urls(links)
    for page in pages:
        if not page:
            continue
        title_tag = page.find('title')
        if title_tag is None:
            # find() returns None when the document has no <title>.
            continue
        print(f'Page Title : {title_tag.get_text()}')

In [7]:
# Entry point: start the crawl only when this code runs as the top-level
# program (``__name__`` is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Exception occured while fetching from https://example.com : HTTPSConnectionPool(host='example.com', port=443): Max retries exceeded with url: / (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1032)')))


In [9]:
import time

class Execution:
    """Decorator that prints how long each call to the wrapped callable takes.

    Works on plain functions and, through the descriptor protocol, on
    methods: accessing the decorated attribute on an instance yields a
    callable that passes that instance as the first argument.
    """

    def __init__(self, func):
        """Remember the callable to be timed."""
        self.func = func

    def __get__(self, instance, owner):
        """Bind the timer to ``instance`` on attribute access.

        Access through the class (``instance is None``) returns the decorator
        itself, so ``Owner.method(obj, ...)`` behaves like an unbound call
        instead of having ``None`` injected as the instance.
        """
        if instance is None:
            return self
        return lambda *args, **kwargs: self(instance, *args, **kwargs)

    def __call__(self, *args, **kwargs):
        """Invoke the wrapped callable, print its duration, return its result.

        Arguments are forwarded unchanged, so a decorated function that takes
        no arguments can be called with none.
        """
        # perf_counter is a monotonic, high-resolution clock intended for
        # measuring intervals; wall-clock time can jump.
        start_time = time.perf_counter()
        result = self.func(*args, **kwargs)
        execution_time = time.perf_counter() - start_time
        print(f"Execution time of {self.func.__name__}: {execution_time:.4f} seconds")
        return result

In [10]:
class Example:
    """Small demo class whose single method is wrapped by ``Execution``."""

    @Execution
    def get_execution_time(self):
        """Run an empty 10000-iteration loop so there is something to time."""
        iterations = 10000
        for _step in range(iterations):
            pass

# Demo: create an instance and call the decorated method. The call goes
# through the Execution decorator applied in the class body above.
example_class = Example()
example_class.get_execution_time()
        

Execution time of get_execution_time: 0.0011 seconds
