In [5]:
"""
Web scrapping in Python
This is an example of scrapping a web page in Python using beautiful soup

"""

import requests
from bs4 import BeautifulSoup

import requests
from bs4 import BeautifulSoup
import time


class WVUScraper:
    """
    A web scraper class for scraping the WVU website.

    Attributes:
    ----------
    base_url : str
        The base URL of the site to scrape (e.g., "https://www.wvu.edu").

    Methods:
    -------
    get_html(url):
        Sends a GET request to the URL and retrieves the HTML content.

    parse_html(html):
        Parses HTML content using BeautifulSoup.

    extract_links(soup):
        Extracts all anchor tags and retrieves link text and URLs.

    extract_headings(soup):
        Extracts all headings (h1, h2, h3) from the parsed HTML content.

    scrape_page(url):
        Scrapes a single page for links and headings.

    follow_links(url, max_pages=5, delay=2):
        Follows links on the page up to a specified number of pages.

    grab_specific_item(soup, selector):
        Extracts specific content based on a CSS selector (e.g., class, id).
    """

    def __init__(self, base_url):
        self.base_url = base_url

    def get_html(self, url):
        """
        Fetches HTML content from a given URL.

        Parameters:
        ----------
        url : str
            The URL to retrieve HTML from.

        Returns:
        -------
        str
            HTML content if the request is successful; None otherwise.
        """
        try:
            response = requests.get(url)
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            print(f"Error fetching {url}: {e}")
            return None

    def parse_html(self, html):
        """
        Parses HTML content using BeautifulSoup.

        Parameters:
        ----------
        html : str
            Raw HTML content to be parsed.

        Returns:
        -------
        BeautifulSoup
            A BeautifulSoup object of the parsed HTML.
        """
        return BeautifulSoup(html, 'html.parser') if html else None

    def extract_links(self, soup):
        """
        Extracts all anchor tags and retrieves link text and URLs.

        Parameters:
        ----------
        soup : BeautifulSoup
            Parsed HTML content.

        Returns:
        -------
        list of dict
            A list of dictionaries containing link text and URLs.
        """
        links = []
        for link in soup.find_all('a', href=True):
            links.append({
                'text': link.text.strip(),
                'url': link.get('href')
            })
        return links

    def extract_headings(self, soup):
        """
        Extracts all headings (h1, h2, h3) from the parsed HTML content.

        Parameters:
        ----------
        soup : BeautifulSoup
            Parsed HTML content.

        Returns:
        -------
        dict
            A dictionary with heading levels as keys and a list of text for each heading as values.
        """
        headings = {}
        for level in ['h1', 'h2', 'h3']:
            headings[level] = [heading.text.strip() for heading in soup.find_all(level)]
        return headings

    def scrape_page(self, url):
        """
        Scrapes a single page for links and headings.

        Parameters:
        ----------
        url : str
            The URL of the page to scrape.

        Returns:
        -------
        dict
            A dictionary containing the page URL, extracted links, and headings.
        """
        html = self.get_html(url)
        if not html:
            return None

        soup = self.parse_html(html)
        data = {
            'url': url,
            'links': self.extract_links(soup),
            'headings': self.extract_headings(soup),
        }
        return data

    def follow_links(self, url, max_pages=5, delay=2):
        """
        Follows links on the main page and scrapes each one up to a specified number of pages.

        Parameters:
        ----------
        url : str
            The initial URL to start scraping from.

        max_pages : int, optional
            The maximum number of pages to scrape (default is 5).

        delay : int, optional
            Delay in seconds between requests to avoid server overload (default is 2).

        Returns:
        -------
        list of dict
            A list of dictionaries, each containing scraped data from a page.
        """
        main_data = self.scrape_page(url)
        if not main_data:
            return []

        all_data = [main_data]
        visited_urls = {url}

        for link in main_data['links']:
            full_url = link['url'] if link['url'].startswith('http') else f"{self.base_url}{link['url']}"
            if full_url not in visited_urls and len(all_data) < max_pages:
                print(f"Scraping {full_url}")
                page_data = self.scrape_page(full_url)
                if page_data:
                    all_data.append(page_data)
                    visited_urls.add(full_url)
                time.sleep(delay)
        return all_data

    def grab_specific_item(self, soup, selector):
        """
        Extracts specific content based on a CSS selector.

        Parameters:
        ----------
        soup : BeautifulSoup
            Parsed HTML content.

        selector : str
            The CSS selector for the item to grab (e.g., '.class' or '#id').

        Returns:
        -------
        list
            A list of strings of content matching the selector.
        """
        items = soup.select(selector)
        return [item.get_text(strip=True) for item in items]




In [8]:
import Web_Scraping_beautiful_soup as ws


# Main function to initialize and run the scraper
def main():
    # Initialize the scraper with the base URL
    scraper = ws.WVUScraper("https://bugzilla.redhat.com/")

    # Scrape the main page and follow links
    data = scraper.follow_links(scraper.base_url, max_pages=3, delay=1)

    # # Print data for each page
    # for page in data:
    #     print("Page URL:", page['url'])
    #     print("Headings:", page['headings'])
    #     print("Links:", page['links'])

    # Example of grabbing a specific item using a CSS selector
    html = scraper.get_html(scraper.base_url)
    soup = scraper.parse_html(html)

    # Attempt to retrieve the main heading using an alternative selector
    main_heading = scraper.grab_specific_item(soup, ".dataTables_info")
    print("Main Heading:", main_heading)

    main_heading = scraper.grab_specific_item(soup, ".wvu-nav-wrapper")
    for something in main_heading:
        print( something,'\t')

    # Attempt to retrieve paragraphs within the about section using an alternative selector
    about_paragraphs = scraper.grab_specific_item(soup, ".about-content p")
    print("About Section Paragraphs:")
    for para in about_paragraphs:
        print(para)
        
if __name__ == "__main__":
    main()

Scraping https://bugzilla.redhat.com/saml2_login.cgi?idp=Fedora%20Account%20System&target=index.cgi
Error fetching https://bugzilla.redhat.com/saml2_login.cgi?idp=Fedora%20Account%20System&target=index.cgi: 401 Client Error: Unauthorized for url: https://id.fedoraproject.org/login/gssapi/negotiate?ipsilon_transaction_id=5ac1af3b-b5f4-43df-9d4e-0a44e5a88def
Scraping https://bugzilla.redhat.com/saml2_login.cgi?idp=Red%20Hat%20Associate&target=index.cgi
Error fetching https://bugzilla.redhat.com/saml2_login.cgi?idp=Red%20Hat%20Associate&target=index.cgi: 401 Client Error: Unauthorized for url: https://auth.redhat.com/auth/realms/EmployeeIDP/protocol/saml?SAMLRequest=hVLLbtswELznKwjeLYmyUDuE5cCNW9RAWgRO0kMvxZpaxwT4ULmUW%2FfrQwkVGqBCyhvJnZ2dmV3d%2FLKGnTGQ9q7mIis4owiuAeMd1vyCxG%2FWVysCa1q56eLJ7fFHhxRZAjqSw0fNu%2BCkB9IkHVgkGZV82Hy%2Bk2VWyDb46JU3%2FIpNnL9t3u4CRBhimnK6zWb8vvWOOovhAcNZK3za39X8FGNLMs8P3fNvbQxkAZsTxEx5m%2FfM5XdQlKlnzdmOqMOd6z2INS%2BLspoJMRPlo6jkXEhRfZvm321rPgc4LubLRbMsGnVUVSWK40Ic

In [9]:
FIND IDs / HEADINGS / NUMBER OF ENTRIES

SyntaxError: invalid syntax (2085600238.py, line 1)

In [10]:
HAVE DONE BEFORE THURSDAY CLASS

SyntaxError: invalid syntax (3379066740.py, line 1)