<a href="https://colab.research.google.com/github/josouane/web-crawler/blob/main/chatgpt-web-crawler.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

class WebCrawler:
    """Recursive single-domain crawler that collects paragraph text per page.

    Starting from ``start_url`` the crawler fetches pages with ``requests``,
    parses them with BeautifulSoup, stores the text of every ``<p>`` element in
    ``corpus`` (keyed by page title) and follows links that stay on the start
    URL's domain.

    Args:
        start_url: URL of the first page to fetch; its network location
            defines the domain the crawl is restricted to.
        visiting_strategy: Traversal strategy (case-insensitive). Only
            ``'preorder'`` follows links; with any other value just the
            start page is fetched.
        max_depth: Deepest link distance from ``start_url`` that is still
            crawled (the start page is depth 0).
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, start_url, visiting_strategy='preorder', max_depth=10, timeout=10):
        self.start_url = start_url
        self.visiting_strategy = visiting_strategy.lower()
        self.max_depth = max_depth
        self.timeout = timeout
        self.visited_urls = set()
        self.corpus = {}
        # Host names are case-insensitive, so compare them lower-cased.
        self.main_domain = urlparse(start_url).netloc.lower()

    def crawl(self, url, depth=0):
        """Fetch ``url``, record its text and recurse into its internal links.

        Pages that were already visited, live on another domain, or sit deeper
        than ``max_depth`` are skipped. Fetch and parse errors are reported on
        stdout and do not abort the rest of the crawl.

        Args:
            url: Absolute URL of the page to visit.
            depth: Link distance of ``url`` from the start page.
        """
        if depth > self.max_depth:  # Limiting depth to avoid potential infinite loops
            print(f"Reached maximum depth for {url}")
            return

        # "page#a" and "page#b" are the same document; fetch it only once.
        url = self._normalize_url(url)
        if url in self.visited_urls or not self.is_same_domain(url):
            return

        print(f"Visiting: {url}")
        self.visited_urls.add(url)

        try:
            # Without a timeout a single unresponsive server stalls the crawl.
            response = requests.get(url, timeout=self.timeout)
            # Do not store 4xx/5xx error pages as if they were content.
            response.raise_for_status()
            soup = BeautifulSoup(response.content, 'html.parser')
            title = self._page_title(soup)
            text_content = self.extract_text_content(soup)
            # Relative links must be resolved against the page they appear on
            # (after redirects), not against the crawl's start URL.
            links = self.extract_links(soup, base_url=response.url)
        except Exception as e:
            # Deliberate best-effort boundary: one bad page must not stop the crawl.
            print(f"Error crawling {url}: {e}")
            return

        self.corpus[self._unique_title(title)] = text_content
        print(f"Text Content: {text_content[:100]}...")  # Output a snippet of text

        if self.visiting_strategy == 'preorder':
            for link in links:
                self.crawl(link, depth + 1)

        # Additional visiting strategies (inorder, postorder) can be implemented here

    def extract_text_content(self, soup):
        """Return the text of all ``<p>`` elements joined by single spaces.

        Paragraphs are taken from ``<body>``; documents without a ``<body>``
        tag are searched as a whole instead of raising.
        """
        root = soup.body if soup.body is not None else soup
        return ' '.join(p.get_text(separator=' ', strip=True) for p in root.find_all('p'))

    def extract_links(self, soup, base_url=None):
        """Return the crawlable internal links found in ``soup``.

        Every ``<a href>`` is resolved to an absolute URL and stripped of its
        fragment. Links are dropped when they are not http(s), point to
        another domain, point to a PDF, or contain ``'resources'``. Duplicates
        are removed while keeping document order.

        Args:
            soup: Parsed page to scan for anchors.
            base_url: URL that relative links are resolved against; defaults
                to ``start_url``.

        Returns:
            List of absolute URLs.
        """
        base = base_url or self.start_url
        seen = set()
        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor.get('href')
            if not href:
                continue
            absolute = self._normalize_url(urljoin(base, href.strip()))
            parsed = urlparse(absolute)
            # Skip mailto:, javascript:, tel: and similar non-web schemes.
            if parsed.scheme not in ('http', 'https'):
                continue
            # Filter out external links
            if not self.is_same_domain(absolute):
                continue
            # Exclude PDF links (checked on the path so "?query" cannot hide them)
            if parsed.path.lower().endswith('.pdf'):
                continue
            # Exclude links with 'resources' in the URL
            if 'resources' in absolute.lower():
                continue
            if absolute in seen:
                continue
            seen.add(absolute)
            links.append(absolute)
        return links

    def is_same_domain(self, url):
        """Return True when ``url`` has the same network location as the start URL."""
        return urlparse(url).netloc.lower() == self.main_domain

    def start_crawling(self):
        """Crawl from ``start_url`` and then write the corpus to text files."""
        self.crawl(self.start_url)
        self.save_corpus()

    def save_corpus(self):
        """Write each corpus entry to ``<title>.txt`` in the current directory.

        Titles come from remote pages, so they are reduced to characters that
        are safe in a file name; entries whose sanitized names collide get a
        numeric suffix instead of overwriting each other.
        """
        used = set()
        for title, content in self.corpus.items():
            base = self._safe_filename(title)
            name = base
            counter = 2
            # Compare lower-cased so case-insensitive file systems are safe too.
            while name.lower() in used:
                name = f"{base}_{counter}"
                counter += 1
            used.add(name.lower())
            with open(f"{name}.txt", 'w', encoding='utf-8') as file:
                file.write(content)

    def _unique_title(self, title):
        """Return ``title``, suffixed with `` (n)`` if that key is already in the corpus."""
        if title not in self.corpus:
            return title
        counter = 2
        while f"{title} ({counter})" in self.corpus:
            counter += 1
        return f"{title} ({counter})"

    @staticmethod
    def _page_title(soup):
        """Return the stripped ``<title>`` text, or ``'Untitled'`` if missing or empty."""
        tag = soup.title
        # ``.string`` is None for an empty <title> or one with nested markup.
        text = tag.string if tag is not None else None
        text = text.strip() if text else ''
        return text or 'Untitled'

    @staticmethod
    def _normalize_url(url):
        """Return ``url`` without its ``#fragment`` part."""
        return urlparse(url)._replace(fragment='').geturl()

    @staticmethod
    def _safe_filename(title):
        """Map a page title to a file-system-safe base name (never empty)."""
        cleaned = ''.join(ch if ch.isalnum() or ch in ' ._-' else '_' for ch in title)
        # Leading/trailing dots and spaces are stripped so the name cannot be
        # "." / ".." or a hidden file; length is capped for file-system limits.
        cleaned = cleaned.strip(' .')[:150]
        return cleaned or 'Untitled'

if __name__ == "__main__":
    # Ask which site to crawl; the crawler only follows links on this URL's domain.
    start_url = input("Enter the website's URL: ")

    # Build a pre-order crawler rooted at that URL.
    crawler = WebCrawler(start_url, 'preorder')

    # Run the crawl; this also writes the collected corpus to .txt files.
    crawler.start_crawling()