In [None]:
pip install selenium

In [1]:
import time
import re
import csv
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager  # Optional: For WebDriver Manager
from selenium.webdriver.chrome.options import Options

class WVUScraperSelenium:
    """
    A web scraper class using Selenium for scraping Bugzilla and extracting bug data.

    The instance owns a Chrome WebDriver session. Release it with
    ``close_driver()`` or by using the scraper as a context manager::

        with WVUScraperSelenium(url) as scraper:
            scraper.get_page()
            ids = scraper.extract_bug_ids()

    Attributes:
    ----------
    base_url : str
        The base URL of the Bugzilla bug list.
    driver : webdriver.Chrome or None
        The Selenium WebDriver instance; None once the driver has been closed.

    Methods:
    -------
    setup_driver(headless=True):
        Initializes the Selenium WebDriver.

    get_page(wait_seconds=5):
        Loads the Bugzilla bug list page.

    extract_total_entries():
        Extracts the total number of bug entries.

    extract_bug_ids():
        Extracts all bug IDs from the bug list table.

    extract_table_headings():
        Extracts table headings from the bug list table.

    save_bugs_to_csv(bug_ids, filename='bug_ids.csv'):
        Saves bug IDs to a CSV file.

    close_driver():
        Closes the Selenium WebDriver.
    """

    def __init__(self, base_url, headless=True):
        """
        Stores the target URL and starts the WebDriver.

        Parameters:
        ----------
        base_url : str
            URL of the Bugzilla bug list page to scrape.
        headless : bool, optional
            Passed through to ``setup_driver`` (default is True).
        """
        self.base_url = base_url
        self.driver = None
        self.setup_driver(headless=headless)

    def __enter__(self):
        """Returns the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Closes the driver on leaving a ``with`` block; exceptions propagate."""
        self.close_driver()

    def setup_driver(self, headless=True):
        """
        Initializes the Selenium WebDriver with optional headless mode.

        Parameters:
        ----------
        headless : bool, optional
            When True (the default) Chrome is started with ``--headless``.
        """
        chrome_options = Options()
        if headless:
            chrome_options.add_argument("--headless")  # Run without a visible window
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")

        # Initialize WebDriver using WebDriver Manager
        self.driver = webdriver.Chrome(
            service=ChromeService(ChromeDriverManager().install()),
            options=chrome_options,
        )

    def get_page(self, wait_seconds=5):
        """
        Loads the Bugzilla bug list page.

        Parameters:
        ----------
        wait_seconds : int or float, optional
            Fixed pause after navigation, giving the page time to finish
            loading (default is 5). A value of 0 or None skips the pause.
        """
        self.driver.get(self.base_url)
        # Fixed delay rather than an explicit wait; tune via wait_seconds.
        if wait_seconds and wait_seconds > 0:
            time.sleep(wait_seconds)

    def extract_total_entries(self):
        """
        Extracts the total number of bug entries from the page.

        The count is parsed from the text of the element with class
        ``dataTables_info``, which must contain ``of <number> entries``.

        Returns:
        -------
        int
            Total number of bug entries; None if not found.
        """
        try:
            # Locate the div containing the entries info
            info_div = self.driver.find_element(By.CLASS_NAME, "dataTables_info")
        except NoSuchElementException:
            print("Could not find element with class 'dataTables_info'.")
            return None

        text = info_div.text
        print(f"Found dataTables_info text: {text}")

        # The number may carry thousands separators, e.g. "1,234".
        match = re.search(r'of\s+([\d,]+)\s+entries', text)
        if not match:
            print("Regex did not match for total entries.")
            return None
        return int(match.group(1).replace(',', ''))

    def extract_bug_ids(self):
        """
        Extracts all bug IDs from the bug list table.

        Each ID is taken from the ``id=<digits>`` part of the link found in
        the row's ``bz_id`` cell. Rows without that cell or link are reported
        and skipped; links without a matching ``href`` are skipped silently.

        Returns:
        -------
        list of str
            A list of bug IDs (empty if the table is missing).
        """
        bug_ids = []
        try:
            # Locate the bug list table
            table = self.driver.find_element(By.ID, "buglist")
            tbody = table.find_element(By.TAG_NAME, "tbody")
        except NoSuchElementException:
            print("Could not find the bug list table with id 'buglist'.")
            return bug_ids

        for row in tbody.find_elements(By.TAG_NAME, "tr"):
            try:
                # The bug ID lives in the cell carrying the 'bz_id' class
                bug_id_cell = row.find_element(By.CLASS_NAME, "bz_id")
                bug_link = bug_id_cell.find_element(By.TAG_NAME, "a")
            except NoSuchElementException:
                print("Bug ID cell or link not found in a row.")
                continue

            # get_attribute yields None for an anchor without href; fall back
            # to "" so re.search does not raise TypeError and abort the scan.
            href = bug_link.get_attribute("href") or ""
            match = re.search(r'id=(\d+)', href)
            if match:
                bug_ids.append(match.group(1))
        return bug_ids

    def extract_table_headings(self):
        """
        Extracts table headings from the bug list table.

        Returns:
        -------
        list of str
            The stripped text of every ``<th>`` in the first header row
            (empty if the table or its header is missing).
        """
        try:
            table = self.driver.find_element(By.ID, "buglist")
            thead = table.find_element(By.TAG_NAME, "thead")
            header_row = thead.find_element(By.TAG_NAME, "tr")
        except NoSuchElementException:
            print("Could not find table headings in the bug list table.")
            return []

        return [
            th.text.strip()
            for th in header_row.find_elements(By.TAG_NAME, "th")
        ]

    def save_bugs_to_csv(self, bug_ids, filename='bug_ids.csv'):
        """
        Saves bug IDs to a CSV file.

        The file gets a ``bug_id`` header row followed by one ID per row.
        Nothing is written when ``bug_ids`` is empty. I/O errors are reported
        on stdout rather than raised.

        Parameters:
        ----------
        bug_ids : list of str
            List of bug IDs to save.
        filename : str, optional
            Name of the CSV file to save data (default is 'bug_ids.csv').
        """
        if not bug_ids:
            print("No bug IDs to save.")
            return

        try:
            # newline='' lets the csv module control line endings itself.
            with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow(['bug_id'])
                writer.writerows([bug_id] for bug_id in bug_ids)
            print(f"Bug IDs saved to {filename}")
        except IOError as e:
            print(f"Error saving to CSV: {e}")

    def close_driver(self):
        """
        Closes the Selenium WebDriver.

        Safe to call more than once: the driver reference is cleared after
        the first call, so later calls do nothing.
        """
        if self.driver:
            try:
                self.driver.quit()
            finally:
                # Drop the reference even if quit() fails, so the dead
                # session is never reused or quit a second time.
                self.driver = None

ModuleNotFoundError: No module named 'webdriver_manager'