In [1]:
from selenium import webdriver
from bs4 import BeautifulSoup
import time

def get_driver():
    """Create a Chrome WebDriver configured to run headless.

    Returns:
        A new ``webdriver.Chrome`` instance built from ``ChromeOptions``
        carrying the ``--headless`` argument. The caller is responsible
        for calling ``quit()`` on it.
    """
    chrome_opts = webdriver.ChromeOptions()
    # Headless mode keeps Chrome from opening a visible browser window.
    chrome_opts.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_opts)
    return browser

def scrape_profile_links(url):
    """Collect broker names and profile links from a listing page.

    Loads ``url`` in a headless browser, parses the rendered HTML and reads
    every ``div.brokers__box`` element found on the page.

    Args:
        url: Address of the listing page to load.

    Returns:
        A dict with two parallel lists, ``'name'`` and ``'link'``, holding one
        entry per broker box. A field that cannot be found (missing heading,
        missing anchor, or an anchor without an ``href``) is recorded as
        ``'N/A'`` so both lists always have the same length.

    Raises:
        Any exception raised by the WebDriver while loading the page is
        propagated to the caller; the browser is still shut down first.
    """
    driver = get_driver()
    try:
        driver.get(url)
        # NOTE(review): per the Selenium docs an implicit wait only affects
        # element lookups, not ``page_source``; kept from the original, but an
        # explicit wait on the broker boxes may be what is actually wanted.
        driver.implicitly_wait(5)
        html_content = driver.page_source
    finally:
        # Always release the browser process, even when navigation fails.
        driver.quit()

    soup = BeautifulSoup(html_content, 'html.parser')
    profile = {'name': [], 'link': []}

    for box in soup.find_all('div', class_='brokers__box'):
        name_tag = box.find('h4', class_='brokers__item--topTitle')
        name = name_tag.get_text(strip=True) if name_tag else 'N/A'

        link_tag = box.find('a', class_='brokers__item--link')
        # .get() instead of ['href']: an anchor without an href must not
        # abort the whole scrape with a KeyError.
        link_href = link_tag.get('href', 'N/A') if link_tag else 'N/A'

        profile['name'].append(name)
        profile['link'].append(link_href)

    return profile

def scrape_email(link):
    """Fetch the text of the contact link shown on a broker profile page.

    Opens ``link`` in a fresh headless browser, waits for the page, and looks
    at the ``div.brokers__profile--leftPhone`` elements. When exactly one such
    div exists its first ``<a>`` is used; otherwise the ``<a>`` of the second
    div is used.

    Args:
        link: URL of the broker profile page.

    Returns:
        The stripped text of the selected ``<a>`` tag, or ``"N/A"`` when the
        div or the anchor is missing or when any exception occurs while
        scraping (the exception is printed, not raised).
    """
    browser = get_driver()
    try:
        browser.get(link)
        browser.implicitly_wait(10)  # Wait for the page to load

        # Fixed pause on top of the implicit wait so the page can finish loading.
        time.sleep(3)

        page = BeautifulSoup(browser.page_source, 'html.parser')
        phone_blocks = page.find_all('div', class_='brokers__profile--leftPhone')
        print(f"Debug: Found {len(phone_blocks)} locations for link {link}")

        if not phone_blocks:
            print(f"Debug: No location div found for link {link}")
            return "N/A"

        # A lone block is used directly; with several, the second one is read.
        chosen_block = phone_blocks[0] if len(phone_blocks) == 1 else phone_blocks[1]
        anchor = chosen_block.find('a')
        if not anchor:
            print(f"Debug: No <a> tag found in location for link {link}")
            return "N/A"

        return anchor.get_text(strip=True)
    except Exception as e:
        print(f"Error occurred while scraping {link}: {e}")
    finally:
        # Runs on every exit path, including the early returns above.
        browser.quit()

    return "N/A"

# Entry point: read the listing page, then visit each profile for its email
url = 'https://www.ibba.org/state/florida/'
profile = scrape_profile_links(url)
print(profile)

# One scrape_email call per profile link, kept in listing order
email_list = []
for profile_link in profile['link']:
    scraped = scrape_email(profile_link)
    email_list.append(scraped)
    print(f"Scraped email: {scraped}")

print("Final email list:", email_list)
name_count, email_count = len(profile['name']), len(email_list)
print(f"Number of names: {name_count}, Number of emails: {email_count}")

# Sanity check: every name should have exactly one email entry
if name_count == email_count:
    print("The number of names and emails match.")
else:
    print("Error: The number of names and emails do not match.")



{'name': ['Tom Milana', 'Dwight Altman', 'Barry Berkowitz', 'Anthony Rigney', 'Moche Hazout, P.A., CBI, CFC, CM&AP', 'Kent Cooper', 'Marty Fishman', 'Mark Harmon', 'Thomas McLenahan', 'Manny Fernandez', 'Gary Hallett', 'Eric Gagnon', 'Pablo Langesfeld', 'Baris Guler', 'Brad Coffman', 'George Chaconas', 'Timothy Bellon', 'Lou Vescio', 'Robin Gagnon', 'Brenda Sali', 'Andrew Cagnetta', 'Jose Miguel Arreaza', 'Alan Steinberg', 'Erica Lewandowski', 'Ken Eisenband', 'Paul Ihrig, M&AMI, CBI, CM&AP', 'Chris Gutierrez', 'David Rummell', 'Sean Brelsford', 'Michael Shea', 'Jim Parker', 'Jon Franz', 'Matthew FitzGibbon', 'Susan Moyer', 'Fernando Mello', 'Stacy Alario-Chrisman, LFCBI', 'Michelle Royce', 'BIBIANA GILLIS', 'Shaun Thornton', 'Gerard Perillo III', 'Erin Crawford', 'Jack Beraha', 'Bianca Evans', 'Nivedita Buzzetta', 'Don Imbus', 'James Lascano', 'Randy Bring', 'Frank Giacomelli', 'Eric Gall, MBA, CM&AP, CBI, ABI, M&AMI', 'Bruce Pockrandt', 'Robert Leone', 'Kenneth Hamner', 'Joe Shemansk

In [2]:
# Store the scraped emails alongside the names and links under the 'emails' key.
profile.update({'emails': email_list})
print(profile)

{'name': ['Tom Milana', 'Dwight Altman', 'Barry Berkowitz', 'Anthony Rigney', 'Moche Hazout, P.A., CBI, CFC, CM&AP', 'Kent Cooper', 'Marty Fishman', 'Mark Harmon', 'Thomas McLenahan', 'Manny Fernandez', 'Gary Hallett', 'Eric Gagnon', 'Pablo Langesfeld', 'Baris Guler', 'Brad Coffman', 'George Chaconas', 'Timothy Bellon', 'Lou Vescio', 'Robin Gagnon', 'Brenda Sali', 'Andrew Cagnetta', 'Jose Miguel Arreaza', 'Alan Steinberg', 'Erica Lewandowski', 'Ken Eisenband', 'Paul Ihrig, M&AMI, CBI, CM&AP', 'Chris Gutierrez', 'David Rummell', 'Sean Brelsford', 'Michael Shea', 'Jim Parker', 'Jon Franz', 'Matthew FitzGibbon', 'Susan Moyer', 'Fernando Mello', 'Stacy Alario-Chrisman, LFCBI', 'Michelle Royce', 'BIBIANA GILLIS', 'Shaun Thornton', 'Gerard Perillo III', 'Erin Crawford', 'Jack Beraha', 'Bianca Evans', 'Nivedita Buzzetta', 'Don Imbus', 'James Lascano', 'Randy Bring', 'Frank Giacomelli', 'Eric Gall, MBA, CM&AP, CBI, ABI, M&AMI', 'Bruce Pockrandt', 'Robert Leone', 'Kenneth Hamner', 'Joe Shemansk

In [3]:
import pandas as pd
# Turn the dict of parallel lists into a table (one column per key) and preview it.
df = pd.DataFrame(data=profile)
df.head(5)

Unnamed: 0,name,link,emails
0,Tom Milana,https://www.ibba.org/broker-profile/florida/fo...,tom@tworld.com
1,Dwight Altman,https://www.ibba.org/broker-profile/florida/lo...,dta135@aol.com
2,Barry Berkowitz,https://www.ibba.org/broker-profile/florida/bo...,Barry@berkbiz.com
3,Anthony Rigney,https://www.ibba.org/broker-profile/florida/ja...,arigney@qbusinessadvisors.com
4,"Moche Hazout, P.A., CBI, CFC, CM&AP",https://www.ibba.org/broker-profile/florida/fo...,mhazout@tworld.com


In [4]:
# Write the table to Excel. index=False leaves out the auto-generated row
# numbers, which would otherwise be written as an unlabelled first column.
df.to_excel('florida_emails.xlsx', index=False)