In [None]:
import threading
import time
from urllib.parse import urljoin, urlparse
import re
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


# Create webcrawler class
class WebCrawler:
    """Crawl a single website with headless Chrome and collect paragraph text.

    Pages are rendered through Selenium so JavaScript-generated content is
    visible, parsed with BeautifulSoup, and the text of their ``<p>`` elements
    is stored in ``corpus`` keyed by URL. Only URLs on the same network
    location as ``start_url`` are followed.

    Attributes:
        start_url: URL the crawl starts from.
        visiting_strategy: Lower-cased traversal strategy; only ``'preorder'``
            follows links.
        visited_urls: Set of URLs that have been claimed for crawling.
        corpus: Mapping of URL -> extracted paragraph text.
        main_domain: ``netloc`` of ``start_url``; links elsewhere are ignored.
        lock: Guards ``visited_urls``, ``corpus`` and the robots.txt cache.
        driver_path: Filesystem path of the chromedriver executable.
    """

    def __init__(self, start_url, visiting_strategy='preorder', driver_path=None):
        """Set up crawl state.

        Args:
            start_url: URL to start crawling from.
            visiting_strategy: Traversal strategy name (case-insensitive).
            driver_path: Optional chromedriver path; defaults to the path
                this notebook was originally written against.
        """
        self.start_url = start_url
        self.visiting_strategy = visiting_strategy.lower()
        self.visited_urls = set()
        self.corpus = {}
        self.main_domain = urlparse(start_url).netloc
        self.lock = threading.Lock()  # Lock for thread-safe access to shared data
        # Parsed robots.txt rules keyed by robots.txt URL, so the file is
        # fetched once per site instead of once per crawled page.
        self._robots_cache = {}
        self.driver_path = driver_path or r'D:\2024\School\St Johns\Spring 2024\Web Data Mining\chromedriver-win32\chromedriver.exe'  # Update this with your chromedriver path

    def can_crawl(self, url):
        """Return True if robots.txt allows the generic ``*`` agent to fetch ``url``.

        The robots.txt body is fetched through the browser, stripped of the
        HTML wrapper Chrome puts around plain-text responses, and evaluated
        with the standard-library robots parser. Any failure while fetching
        or parsing is reported and treated as "not allowed".
        """
        # Local import keeps this class independent of the file's import block.
        from urllib.robotparser import RobotFileParser

        try:
            robots_url = urljoin(url, '/robots.txt')
            with self.lock:
                rules = self._robots_cache.get(robots_url)
            if rules is None:
                raw_source = self.get_page_source(robots_url)
                # Chrome wraps text responses in <html><body><pre>...; keep
                # only the text so the parser sees the actual directives.
                robots_text = BeautifulSoup(raw_source, 'html.parser').get_text()
                rules = RobotFileParser()
                rules.parse(robots_text.splitlines())
                with self.lock:
                    self._robots_cache[robots_url] = rules
            return rules.can_fetch('*', url)
        except Exception as e:
            print(f"Error checking robots.txt for {url}: {e}")
            return False

    def crawl(self, url):
        """Crawl ``url`` and, for the pre-order strategy, its same-site links.

        The URL is skipped when it is off-site, already visited, or disallowed
        by robots.txt. Errors while fetching or parsing a page are printed and
        do not propagate.
        """
        if not self.is_same_domain(url):
            return
        with self.lock:
            if url in self.visited_urls:
                return
        if not self.can_crawl(url):
            return
        # Re-check and claim atomically: another thread may have claimed the
        # same URL while robots.txt was being consulted.
        with self.lock:
            if url in self.visited_urls:
                return
            self.visited_urls.add(url)

        print(f"Visiting: {url}")
        try:
            # The browser is closed before descending into child links so
            # that ancestors do not each hold a Chrome instance open.
            page_source = self._fetch_rendered_html(url, render_delay=2)

            # Parse the page source using BeautifulSoup
            soup = BeautifulSoup(page_source, 'html.parser')
            text_content = self.extract_text_content(soup)

            with self.lock:  # Thread-safe update of shared data
                self.corpus[url] = text_content

            print(f"Text Content: {text_content[:100]}...")  # Output a snippet of text

            if self.visiting_strategy == 'preorder':
                threads = []
                for link in self.extract_links(soup, url):
                    thread = threading.Thread(target=self.crawl, args=(link,))
                    threads.append(thread)
                    thread.start()
                    time.sleep(1)  # Add a delay of 1 second between requests

                # Wait for all threads to complete
                for thread in threads:
                    thread.join()

            # Additional visiting strategies (inorder, postorder) can be implemented here

        except Exception as e:
            print(f"Error crawling {url}: {e}")

    def get_page_source(self, url):
        """Return the browser's page source for ``url`` without a render delay."""
        return self._fetch_rendered_html(url)

    def _fetch_rendered_html(self, url, render_delay=0):
        """Load ``url`` in headless Chrome and return its page source.

        Args:
            url: Address to load.
            render_delay: Seconds to wait after loading so JavaScript can
                finish rendering.

        The browser session and the chromedriver service are always shut
        down, even when loading fails.
        """
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run headless if you don't want browser window to pop up
        service = Service(self.driver_path)
        service.start()
        driver = None
        try:
            driver = webdriver.Remote(service.service_url, options=chrome_options)
            driver.get(url)
            if render_delay:
                time.sleep(render_delay)  # Give some time for JavaScript to render
            return driver.page_source
        finally:
            try:
                if driver is not None:
                    driver.quit()
            finally:
                service.stop()

    def extract_text_content(self, soup):
        """Return the space-joined text of every ``<p>`` inside ``<body>``.

        Returns an empty string when the document has no ``<body>``.
        """
        body = soup.body
        if body is None:
            return ''
        return ' '.join(p.get_text(separator=' ', strip=True) for p in body.find_all('p'))

    def extract_links(self, soup, base_url=None):
        """Return the crawlable same-site links found in ``soup``.

        Args:
            soup: Parsed document exposing ``find_all('a', href=True)``.
            base_url: URL that relative hrefs are resolved against; defaults
                to ``start_url``.

        Returns:
            Absolute http(s) URLs on ``main_domain`` in document order, with
            fragments removed and duplicates dropped. PDF links and URLs
            containing ``'resources'`` are excluded.
        """
        base = base_url or self.start_url
        # A dict is used as an insertion-ordered set.
        found = {}
        for anchor in soup.find_all('a', href=True):
            try:
                target = urlparse(urljoin(base, anchor.get('href')))
            except ValueError:
                # Malformed href (e.g. a broken IPv6 literal); skip it.
                continue
            if target.scheme not in ('http', 'https'):
                continue
            # Filter out external links
            if target.netloc != self.main_domain:
                continue
            # Exclude PDF links
            if target.path.lower().endswith('.pdf'):
                continue
            # Fragments address the same document, so drop them.
            link = target._replace(fragment='').geturl()
            # Exclude links with 'resources' in the URL
            if 'resources' in link.lower():
                continue
            found[link] = None
        return list(found)

    def is_same_domain(self, url):
        """Return True if ``url`` has the same network location as ``start_url``."""
        return urlparse(url).netloc == self.main_domain

    def start_crawling(self):
        """Run the crawl from ``start_url``; returns when all pages are done."""
        self.crawl(self.start_url)

    def get_crawled_data(self):
        """Return the mapping of crawled URL -> extracted text."""
        return self.corpus


if __name__ == "__main__":
    # Ask the user which site should be crawled.
    start_url = input("Enter the website's URL: ")

    # Build a pre-order crawler rooted at that URL and run it to completion.
    crawler = WebCrawler(start_url=start_url, visiting_strategy='preorder')
    crawler.start_crawling()

    # Report every crawled page with the first 100 characters of its text.
    crawled_data = crawler.get_crawled_data()
    for url in crawled_data:
        content = crawled_data[url]
        print(f"URL: {url}")
        print(f"Content: {content[:100]}...")
# Link for St. John's website: https://www.stjohns.edu/

Enter the website's URL: https://www.stjohns.edu/
Visiting: https://www.stjohns.edu/
Text Content: See how your journey aligns with what drives you. Hannah M. Queens, NY Maria Orlando, FL Jenna Charl...
Visiting: https://www.stjohns.edu/life-st-johns/career-services
Visiting: https://www.stjohns.edu/about/leadership-and-administration/office-president/presidents-society
Visiting: https://www.stjohns.edu/who-we-are/faith-and-mission/campus-ministry/opportunities/plunge-program
Visiting: https://www.stjohns.edu/who-we-are/campus-sustainability
Visiting: https://www.stjohns.edu/academics/programs?level%5B151%5D=151
Visiting: https://www.stjohns.edu/admission/graduate-admission
Text Content: Plunges, or service immersion, are weeklong experiences where students are given the opportunity to ...
Text Content: Sustainability is a long-term responsibility to meet the needs of the present without compromising t...
Visiting: https://www.stjohns.edu/who-we-are/history-and-facts/vincentian-heritag

In [1]:
!pip install Selenium




[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip
