In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import pandas as pd

import time
def extract_text_or_na(soup, selector, child_tag=None):
    try:
        element = soup.find(*selector)
        if element:
            if child_tag:
                child_element = element.find(child_tag)
                if child_element and child_element.text:
                    return child_element.text.strip()
                else:
                    print(f"Child element not found or has no text for selector {selector} and child tag {child_tag}")
            else:
                if element.text:
                    return element.text.strip()
                else:
                    print(f"Element found but has no text for selector {selector}")
        else:
            print(f"Element not found for selector {selector}")
    except AttributeError as e:
        print(f"AttributeError for selector {selector} and child tag {child_tag}: {e}")
    return None

def extract_href_or_na(soup, selector):
    try:
        element = soup.find(*selector)
        if element:
            link_element = element.find('a')
            if link_element and 'href' in link_element.attrs:
                return link_element['href']
            else:
                print(f"Anchor element not found or has no href for selector {selector}")
        else:
            print(f"Element not found for selector {selector}")
    except AttributeError as e:
        print(f"AttributeError for selector {selector}: {e}")
    return None

In [2]:
try:
    # Set up the WebDriver
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

    # Navigate to the main webpage
    driver.get('https://integratecic.my.salesforce-sites.com/directory/')
    print("Navigated to main page")

    data = []

    while True:
        # Wait for the anchor tags with class 'resultsBlock' to be present
        WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CLASS_NAME, 'resultsBlock')))
        page_source = driver.page_source

        soup = BeautifulSoup(page_source, 'html.parser')
#         anchor_tags = soup.find_all(class_='resultsBlock')
        
        anchor_tags = driver.find_elements(By.CLASS_NAME, 'resultsBlock')
        print(len(anchor_tags))
        filtered_tags = [tag for tag in anchor_tags if tag.get_attribute('name').endswith('j_id35')]
#         filtered_tags = [tag for tag in anchor_tags if tag.find_parent('h3')]

        print(f"Found {len(filtered_tags)} unique organisations on the current page")
#         for tag in filtered_tags:
#             print(tag.get_attribute('onclick'))
        
        # Collect all href attributes
        for tag in filtered_tags:
            tag_id = tag.get_attribute('id')
            all_ids = set()
            if tag_id not in all_ids:
                all_ids.add(tag_id)
                driver.execute_script("arguments[0].scrollIntoView(true);", tag)  # Scroll element into view
    #             time.sleep(1)  # Wait a bit to ensure the element is interactable

                WebDriverWait(driver, 10).until(EC.element_to_be_clickable(tag))  # Wait until the element is clickable
                tag.click()
    #             print(tag.get_attribute('onclick'))
    #             tag.click()  # Click on the ResultsBlock to open the popup

                # Wait for the popup content to load
                popup_content = WebDriverWait(driver, 5).until(
    #                 EC.presence_of_element_located((By.ID, "j_id0:accountBox"))
                    EC.presence_of_element_located((By.CLASS_NAME, "Popup"))
                )

                # Extract the HTML content of the popup
                print(popup_content.get_attribute('h2'))
                popup_html = popup_content.get_attribute('outerHTML')

                # Parse the HTML content with BeautifulSoup
                soup = BeautifulSoup(popup_html, 'html.parser')

                name = extract_text_or_na(soup, ('h2',))
                print(name)
                size = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id58'}), 'h4').split(' ', 1)[0]
                if size not in ['Micro','Small','Medium','Large', 'Major', 'Super-major']:
                    size = None
                    orgtype = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id58'}), 'h4')
                else:
                    orgtype = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id58'}), 'h4').split(' ', 1)[1:]
                    orgtype = ' '.join(orgtype)  
                print(size)
                print(orgtype)
                description = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id60'}), 'p')
                print(description)
                areas_of_work = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id62'}), 'p')
                print(areas_of_work)
                contact_url = extract_href_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id68'}))
                print(contact_url)
                contact_phone = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id70'}), 'p')
                print(contact_phone)
                address = extract_text_or_na(soup, ('span', {'id': 'j_id0:j_id53:j_id72'}), 'p')
                print(address)

                # Store the extracted data in a list
                data.append({
                    'name': name,
                    'size': size,
                    'description': description,
                    'areas_of_work': areas_of_work,
                    'contact_url': contact_url,
                    'contact_phone': contact_phone,
                    'address': address
                })
    #             print(data)

                # Close the popup
                close_button = driver.find_element(By.XPATH, "//input[@value='close']")
                close_button.click()
                # Wait for a short time to ensure the popup is closed before moving to the next block
                time.sleep(1)
        

        # Try to find the 'Next Page' button by text
        try:
            next_button = driver.find_element(By.LINK_TEXT, 'Next Page >')
            print("Found 'Next Page' button, clicking...")
            next_button.click()
            WebDriverWait(driver, 10).until(EC.staleness_of(next_button))  # Wait until the next button is no longer stale
            # Re-fetch elements after navigating to the next page
            time.sleep(10)  # Ensure the page is fully loaded
        except Exception as e:
            print(f"Could not find or click 'Next Page' button: {e}")
            # If the next button is not found or clickable, break the loop
            break
    # Create a DataFrame
    df = pd.DataFrame(data, columns=['Name', 'Size', 'Org Type', 'Description', 'Areas of Work', 'Contact Link', 'Phone', 'Address'])

    # Save the DataFrame to an Excel file
    df.to_excel('directory_data.xlsx', index=False)    

except Exception as e:
    print("An error occurred:", e)
    
finally:
    # Close the WebDriver
    if 'driver' in locals():
        driver.quit()

An error occurred: [WinError 193] %1 is not a valid Win32 application
