In [2]:
import logging
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    format='%(asctime)s %(levelname)s:%(message)s',
    level=logging.INFO)

# NOTES:
# No parallelism offered
# No retries

class Crawler:
    def __init__(self, urls=[]):
        self.visited_urls = []
        self.urls_to_visit = urls
        
    def download_url(self, url):
        return requests.get(url).text
    
    def get_linked_urls(self, url, html):
        soup = BeautifulSoup(html, 'html.parser')
        # get all links in the page using the <a href> tag
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path
            
    def add_url_to_visit(self, url):
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)
            
    def crawl(self, url):
        html = self.download_url(url)
        for url in self.get_linked_urls(url, html):
            self.add_url_to_visit(url)
            
    def run(self):
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            logging.info(f'Crawling: {url}')
            try:
                self.crawl(url)
            except Exception:
                logging.exception(f'Failed to crawl: {url}')
            finally:
                self.visited_urls.append(url)
                
if __name__ == '__main__':
    Crawler(urls=['https://www.imbd.com/']).run()
            

2023-10-28 08:16:46,321 INFO:Crawling: https://www.imbd.com/
2023-10-28 08:16:46,602 INFO:Crawling: #page-top
2023-10-28 08:16:46,603 ERROR:Failed to crawl: #page-top
Traceback (most recent call last):
  File "<ipython-input-2-e7734c85d5c6>", line 41, in run
    self.crawl(url)
  File "<ipython-input-2-e7734c85d5c6>", line 32, in crawl
    html = self.download_url(url)
  File "<ipython-input-2-e7734c85d5c6>", line 16, in download_url
    return requests.get(url).text
  File "/home/jandogonzales/.local/lib/python3.8/site-packages/requests/api.py", line 73, in get
    return request("get", url, params=params, **kwargs)
  File "/home/jandogonzales/.local/lib/python3.8/site-packages/requests/api.py", line 59, in request
    return session.request(method=method, url=url, **kwargs)
  File "/home/jandogonzales/.local/lib/python3.8/site-packages/requests/sessions.py", line 575, in request
    prep = self.prepare_request(req)
  File "/home/jandogonzales/.local/lib/python3.8/site-packages/reques

2023-10-28 08:16:53,361 INFO:Crawling: https://www.HugeDomains.com/faq.cfm
2023-10-28 08:16:53,648 INFO:Crawling: https://www.HugeDomains.com/about.cfm
2023-10-28 08:16:53,899 INFO:Crawling: https://www.HugeDomains.com/contact.cfm
2023-10-28 08:16:54,129 INFO:Crawling: https://www.HugeDomains.com/payment-plan-login.cfm
2023-10-28 08:16:54,473 INFO:Crawling: https://www.HugeDomains.com/my-favorites.cfm
2023-10-28 08:16:54,779 INFO:Crawling: https://www.HugeDomains.com/shopping_cart.cfm
2023-10-28 08:16:55,206 INFO:Crawling: https://www.HugeDomains.com/shopping_cart.cfm?d=ManChart&e=com
2023-10-28 08:16:55,539 INFO:Crawling: https://www.HugeDomains.com/payment-plan-setup.cfm?d=ManChart.com
2023-10-28 08:16:56,019 INFO:Crawling: https://www.HugeDomains.com/testimonials.cfm
2023-10-28 08:16:56,431 INFO:Crawling: https://www.HugeDomains.com/case-studies.cfm
2023-10-28 08:16:56,668 INFO:Crawling: https://www.HugeDomains.com/case-study-pixelbull.cfm
2023-10-28 08:16:56,952 INFO:Crawling: #
20

2023-10-28 08:17:33,250 INFO:Crawling: https://www.facebook.com/settings
2023-10-28 08:17:34,068 INFO:Crawling: https://www.facebook.com/allactivity?privacy_source=activity_log_top_menu
2023-10-28 08:17:34,678 INFO:Crawling: https://www.HugeDomains.com/index.cfm
2023-10-28 08:17:34,981 INFO:Crawling: https://www.HugeDomains.com/domain_search.cfm
2023-10-28 08:17:36,313 INFO:Crawling: https://www.HugeDomains.com/domain_profile.cfm?d=DwRealty.com
2023-10-28 08:17:36,673 INFO:Crawling: https://www.HugeDomains.com/shopping_cart.cfm?d=DwRealty.com
2023-10-28 08:17:36,980 INFO:Crawling: https://www.HugeDomains.com/domain_profile.cfm?d=SkElectrical.com
2023-10-28 08:17:37,224 INFO:Crawling: https://www.HugeDomains.com/shopping_cart.cfm?d=SkElectrical.com
2023-10-28 08:17:37,528 INFO:Crawling: https://www.HugeDomains.com/domain_profile.cfm?d=GgcOnline.com
2023-10-28 08:17:37,903 INFO:Crawling: https://www.HugeDomains.com/shopping_cart.cfm?d=GgcOnline.com
2023-10-28 08:17:38,234 INFO:Crawling: 

KeyboardInterrupt: 