In [None]:
import logging
import random
from collections import deque
from time import sleep
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class StJohnsWebCrawler:
    """Breadth-first crawler that stays on the host of ``start_url``.

    Each page is fetched through a shared ``requests.Session``, parsed with
    BeautifulSoup, and its title is logged. Same-host links found on the page
    are queued for later visits.
    """

    def __init__(self, start_url, user_agents=None, proxies=None, timeout=10):
        """Store the crawl configuration and open the HTTP session.

        Args:
            start_url: URL the crawl starts from.
            user_agents: Optional sequence of User-Agent strings; one is
                picked at random for every request.
            proxies: Optional sequence of proxy URLs; one is picked at random
                for every request.
            timeout: Seconds to wait for a response before giving up.
        """
        self.start_url = start_url
        # NOTE: despite the name, this holds only the host (netloc) part.
        self.base_url = self.extract_base_url(start_url)
        self.user_agents = user_agents
        self.proxies = proxies
        self.timeout = timeout
        self.session = requests.Session()
        self.logger = self.setup_logger()

    def setup_logger(self):
        """Return the module logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # getLogger() hands back the same object on every call, so adding a
        # handler unconditionally makes each message print once per crawler
        # instance ever created.
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            logger.addHandler(handler)
        return logger

    def extract_base_url(self, url):
        """Return the host (netloc) part of ``url``.

        Scheme-less input such as ``"www.example.com/path"`` is accepted too.
        """
        netloc = urlparse(url).netloc
        if not netloc and '//' not in url:
            # Without a leading "//" urlparse treats the host as a path.
            netloc = urlparse('//' + url).netloc
        return netloc

    def fetch_url(self, url):
        """Download ``url`` and return the body bytes, or ``None`` on failure.

        Any ``requests`` error (connection problem, timeout, HTTP error
        status) is logged and reported as ``None``.
        """
        headers = {'User-Agent': random.choice(self.user_agents)} if self.user_agents else {}
        if self.proxies:
            # Route both schemes through the chosen proxy; mapping only
            # "http" would leave https:// requests un-proxied.
            proxy = random.choice(self.proxies)
            proxies = {'http': proxy, 'https': proxy}
        else:
            proxies = {}
        try:
            response = self.session.get(
                url, headers=headers, proxies=proxies, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to fetch URL: {url}, Error: {e}")
            return None
        return response.content

    def parse_html(self, html_content):
        """Parse ``html_content`` into a soup; ``None`` for empty input."""
        if not html_content:
            return None
        return BeautifulSoup(html_content, 'html.parser')

    def _normalize_link(self, href, page_url):
        """Resolve ``href`` against ``page_url`` for crawling.

        Returns the absolute URL without its fragment, or ``None`` when the
        link is malformed, is not http(s), or points to a different host.
        """
        try:
            absolute = urldefrag(urljoin(page_url, href.strip()))[0]
            parsed = urlparse(absolute)
        except ValueError:
            # urllib raises ValueError for malformed authorities.
            return None
        if parsed.scheme not in ('http', 'https'):
            return None
        # Compare hosts exactly; a substring test would also accept foreign
        # URLs that merely mention this host in their path or query.
        if parsed.netloc.lower() != self.base_url.lower():
            return None
        if not parsed.path:
            # "http://host" and "http://host/" name the same resource.
            absolute = parsed._replace(path='/').geturl()
        return absolute

    def extract_links(self, soup, page_url=None):
        """Return the unique same-host links found in ``soup``.

        Args:
            soup: Parsed page, or ``None``.
            page_url: URL the page was fetched from, used to resolve relative
                links. Defaults to ``start_url``.

        Returns:
            Absolute, fragment-free URLs in document order, without
            duplicates. Empty list when ``soup`` is falsy.
        """
        if not soup:
            return []
        origin = page_url or self.start_url
        seen = set()
        links = []
        for anchor in soup.find_all('a', href=True):
            link = self._normalize_link(anchor['href'], origin)
            if link and link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def crawl(self, max_pages=None):
        """Crawl breadth-first from ``start_url``, logging every page title.

        Args:
            max_pages: Optional cap on successfully parsed pages; ``None``
                crawls until no unseen same-host links remain.
        """
        # A URL is recorded when it is queued, so it is fetched at most once
        # even if the request fails or many pages link to it.
        seen_urls = {self.start_url}
        queue = deque([self.start_url])
        pages_crawled = 0

        while queue:
            url = queue.popleft()
            soup = self.parse_html(self.fetch_url(url))
            if soup:
                pages_crawled += 1
                self.logger.info(f"Crawling: {url}")
                # .string is None for an empty or nested <title>.
                title_text = soup.title.string if soup.title else None
                title = title_text.strip() if title_text else "No title found"
                self.logger.info(f"Page Title: {title}")
                for link in self.extract_links(soup, url):
                    if link not in seen_urls:
                        seen_urls.add(link)
                        queue.append(link)
            if max_pages is not None and pages_crawled >= max_pages:
                break
            if queue:
                # Random pause between requests to avoid hammering the site.
                sleep(random.uniform(1, 3))

# Entry point of the crawl: the university home page.
start_url = "https://www.stjohns.edu/"

# Pool of browser-like User-Agent strings; the proxy pool is optional and unused here.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
]
proxies = None  # Supply a list of proxy URLs to route requests through them

# Build the crawler and run it; this blocks until the link queue is exhausted.
crawler = StJohnsWebCrawler(
    start_url,
    user_agents=user_agents,
    proxies=proxies,
)
crawler.crawl()


In [None]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from time import sleep
import random
import logging

class StJohnsWebCrawler:
    """Breadth-first crawler that stays on the origin of ``start_url``.

    Each page is fetched through a shared ``requests.Session``, parsed with
    BeautifulSoup, and its title is logged. Links to the same scheme and host
    found on the page are queued for later visits.
    """

    def __init__(self, start_url, user_agents=None, proxies=None, timeout=10):
        """Store the crawl configuration and open the HTTP session.

        Args:
            start_url: URL the crawl starts from.
            user_agents: Optional sequence of User-Agent strings; one is
                picked at random for every request.
            proxies: Optional sequence of proxy URLs; one is picked at random
                for every request.
            timeout: Seconds to wait for a response before giving up.
        """
        self.start_url = start_url
        parsed_url = urlparse(start_url)
        # Origin of the site, e.g. "https://www.example.com" (no trailing slash).
        self.base_url = parsed_url.scheme + "://" + parsed_url.netloc
        self.user_agents = user_agents
        self.proxies = proxies
        self.timeout = timeout
        self.session = requests.Session()
        self.logger = self.setup_logger()

    def setup_logger(self):
        """Return the module logger, attaching a stream handler only once."""
        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)
        # getLogger() hands back the same object on every call, so adding a
        # handler unconditionally makes each message print once per crawler
        # instance ever created.
        if not logger.handlers:
            handler = logging.StreamHandler()
            handler.setFormatter(
                logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
            )
            logger.addHandler(handler)
        return logger

    def extract_base_url(self, url):
        """Return the host (netloc) part of ``url``.

        Scheme-less input such as ``"www.example.com/path"`` is accepted too.
        """
        netloc = urlparse(url).netloc
        if not netloc and '//' not in url:
            # Without a leading "//" urlparse treats the host as a path.
            netloc = urlparse('//' + url).netloc
        return netloc

    def fetch_url(self, url):
        """Download ``url`` and return the body bytes, or ``None`` on failure.

        Any ``requests`` error (connection problem, timeout, HTTP error
        status) is logged and reported as ``None``.
        """
        headers = {'User-Agent': random.choice(self.user_agents)} if self.user_agents else {}
        if self.proxies:
            # Route both schemes through the chosen proxy; mapping only
            # "http" would leave https:// requests un-proxied.
            proxy = random.choice(self.proxies)
            proxies = {'http': proxy, 'https': proxy}
        else:
            proxies = {}
        try:
            response = self.session.get(
                url, headers=headers, proxies=proxies, timeout=self.timeout
            )
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            self.logger.error(f"Failed to fetch URL: {url}, Error: {e}")
            return None
        return response.content

    def parse_html(self, html_content):
        """Parse ``html_content`` into a soup; ``None`` for empty input."""
        if not html_content:
            return None
        return BeautifulSoup(html_content, 'html.parser')

    def _normalize_link(self, href, page_url):
        """Resolve ``href`` against ``page_url`` for crawling.

        Returns the absolute URL without its fragment, or ``None`` when the
        link is malformed, is not http(s), or leaves the site's origin.
        """
        try:
            absolute = urldefrag(urljoin(page_url, href.strip()))[0]
            parsed = urlparse(absolute)
        except ValueError:
            # urllib raises ValueError for malformed authorities.
            return None
        if parsed.scheme not in ('http', 'https'):
            return None
        # Compare the origin exactly; a substring test would also accept
        # foreign URLs that merely mention this site in their path or query.
        origin = parsed.scheme + "://" + parsed.netloc
        if origin.lower() != self.base_url.lower():
            return None
        if not parsed.path:
            # "https://host" and "https://host/" name the same resource.
            absolute = parsed._replace(path='/').geturl()
        return absolute

    def extract_links(self, soup, page_url=None):
        """Return the unique same-origin links found in ``soup``.

        Args:
            soup: Parsed page, or ``None``.
            page_url: URL the page was fetched from, used to resolve relative
                links. Defaults to the site origin (``base_url``).

        Returns:
            Absolute, fragment-free URLs in document order, without
            duplicates. Empty list when ``soup`` is falsy.
        """
        if not soup:
            return []
        origin = page_url or self.base_url
        seen = set()
        links = []
        for anchor in soup.find_all('a', href=True):
            link = self._normalize_link(anchor['href'], origin)
            if link and link not in seen:
                seen.add(link)
                links.append(link)
        return links

    def crawl(self, max_pages=None):
        """Crawl breadth-first from ``start_url``, logging every page title.

        Args:
            max_pages: Optional cap on successfully parsed pages; ``None``
                crawls until no unseen same-origin links remain.
        """
        # A URL is recorded when it is queued, so it is fetched at most once
        # even if the request fails or many pages link to it.
        seen_urls = {self.start_url}
        queue = deque([self.start_url])
        pages_crawled = 0

        while queue:
            url = queue.popleft()
            soup = self.parse_html(self.fetch_url(url))
            if soup:
                pages_crawled += 1
                self.logger.info(f"Crawling: {url}")
                # .string is None for an empty or nested <title>.
                title_text = soup.title.string if soup.title else None
                title = title_text.strip() if title_text else "No title found"
                self.logger.info(f"Page Title: {title}")
                for link in self.extract_links(soup, url):
                    if link not in seen_urls:
                        seen_urls.add(link)
                        queue.append(link)
            if max_pages is not None and pages_crawled >= max_pages:
                break
            if queue:
                # Random pause between requests to avoid hammering the site.
                sleep(random.uniform(1, 3))

# Entry point of the crawl: the university home page.
start_url = "https://www.stjohns.edu/"

# Pool of browser-like User-Agent strings; the proxy pool is optional and unused here.
user_agents = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36",
]
proxies = None  # Supply a list of proxy URLs to route requests through them

# Build the crawler and run it; this blocks until the link queue is exhausted.
crawler = StJohnsWebCrawler(
    start_url,
    user_agents=user_agents,
    proxies=proxies,
)
crawler.crawl()


2024-01-31 01:43:44,598 - INFO - Crawling: https://www.stjohns.edu/
2024-01-31 01:43:44,598 - INFO - Crawling: https://www.stjohns.edu/
INFO:__main__:Crawling: https://www.stjohns.edu/
2024-01-31 01:43:44,607 - INFO - Page Title: Turn Passion into Purpose | St. John's University
2024-01-31 01:43:44,607 - INFO - Page Title: Turn Passion into Purpose | St. John's University
INFO:__main__:Page Title: Turn Passion into Purpose | St. John's University
2024-01-31 01:43:45,997 - INFO - Crawling: https://www.stjohns.edu#main-menu
2024-01-31 01:43:45,997 - INFO - Crawling: https://www.stjohns.edu#main-menu
INFO:__main__:Crawling: https://www.stjohns.edu#main-menu
2024-01-31 01:43:46,003 - INFO - Page Title: Turn Passion into Purpose | St. John's University
2024-01-31 01:43:46,003 - INFO - Page Title: Turn Passion into Purpose | St. John's University
INFO:__main__:Page Title: Turn Passion into Purpose | St. John's University
2024-01-31 01:43:47,786 - INFO - Crawling: https://www.stjohns.edu#main

KeyboardInterrupt: 