In [1]:
import time
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select

In [2]:
# Label that becomes part of the exported CSV file name.
database_name = 'Pennsylvania'

# Search-form filters. Each one is applied only when its value is truthy,
# so leaving a filter as False skips it.
state = 'PA'      # two-letter state code such as 'CA' or 'NY'
zip_code = False  # ZIP code to search around
radius = False    # radius as a string: '0', '5', '10', '25', '50' or '100'

In [3]:
# Address of the agent-search form that gets filled in and submitted below.
url = "https://forms.nabip.org/consumer/findagent2.cfm"

# Absolute XPaths of the form controls this cell interacts with.
zip_input_xpath = '/html/body/div/div[3]/form/table/tbody/tr[2]/td[2]/input'
radius_select_xpath = '/html/body/div/div[3]/form/table/tbody/tr[2]/td[2]/select'
state_select_xpath = '/html/body/div/div[3]/form/table/tbody/tr[8]/td[2]/select'

# Open a Chrome window on the search form.
browser = webdriver.Chrome()
browser.get(url)

# ZIP code filter: type the configured value into the ZIP input.
if zip_code:
    zip_input = browser.find_element(By.XPATH, zip_input_xpath)
    zip_input.send_keys(zip_code)

# Radius filter: select the 'Single Zip Code' entry first, then the
# option whose value matches the configured radius.
if radius:
    radius_dropdown = browser.find_element(By.XPATH, radius_select_xpath)
    radius_select = Select(radius_dropdown)
    radius_select.select_by_visible_text('Single Zip Code')
    radius_select.select_by_value(radius)

# State filter: select the placeholder entry first, then the option whose
# value matches the configured state code.
if state:
    state_dropdown = browser.find_element(By.XPATH, state_select_xpath)
    state_select = Select(state_dropdown)
    state_select.select_by_visible_text('Choose a State/Province')
    state_select.select_by_value(state)

# Submit the search by pressing ENTER inside the ZIP input.
submit_target = browser.find_element(By.XPATH, zip_input_xpath)
submit_target.send_keys(Keys.ENTER)

In [4]:
# Parse the page currently shown by the browser (the search results).
html = browser.page_source
soup = BeautifulSoup(html, 'html.parser')

# The <p> elements hold the result summary that the broker count is read from.
summary_paragraphs = soup.find_all('p')

# Fail with a clear message instead of an IndexError when the summary is missing.
if not summary_paragraphs:
    browser.quit()
    raise RuntimeError(
        "Results page contains no <p> summary; the page may not have "
        "loaded or its layout may have changed"
    )

# Stop if the search matched nobody. SystemExit is raised explicitly because
# the interactive exit() helper is not meant for programs and shuts down the
# whole kernel when called inside a notebook.
if summary_paragraphs[0].text == 'No members found.':
    print("No members found, exiting")
    browser.quit()  # do not leave the browser session open
    raise SystemExit("No members found")

if len(summary_paragraphs) == 1:
    # Single page of results: the count is the 2nd whitespace-separated
    # token of the only paragraph.
    one_page = True
    total_brokers = int(summary_paragraphs[0].text.split()[1])
else:
    # Multiple pages of results: the count is the 5th whitespace-separated
    # token of the second paragraph.
    one_page = False
    total_brokers = int(summary_paragraphs[1].text.split()[4])

In [5]:
# Number of consecutive result pages that may contribute no new broker before
# the scrape gives up. Without this limit the loop below never terminates when
# the de-duplicated count cannot reach total_brokers.
MAX_STALE_PAGES = 20


def _company_from_address(address):
    """Return the words of ``address`` that precede the first numeric token or 'PO'.

    Each kept word is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    kept_words = []
    for word in address.split():
        if word.isnumeric() or word == "PO":
            break
        kept_words.append(word)
    return ''.join(word + ' ' for word in kept_words)


def _parse_broker(lines):
    """Build one broker record from the text lines of a contact block.

    Parameters
    ----------
    lines : list of str
        Non-empty text lines of one contact block, with the leading vCard
        line already removed. The first line is taken as the broker's name.

    Returns
    -------
    dict
        Record with the keys 'Name', 'Company', 'Website', 'Work Phone',
        'Fax', 'Email', 'Address', 'Zip Code', 'Certifications',
        'Certified States' and 'Practice Areas'; fields that were not found
        stay as empty strings.
    """
    record = {'Name': '', 'Company': '', 'Website': '', 'Work Phone': '', 'Fax': '', 'Email': '',
              'Address': '', 'Zip Code': '', 'Certifications': '', 'Certified States': '',
              'Practice Areas': ''}

    for index, item in enumerate(lines):
        if index == 0:
            record['Name'] = item
        elif 'Certifications' in item:
            record['Certifications'] = item
        elif 'Work Phone' in item:
            # Phone and fax may share one line: "Work Phone: ... Fax: ..."
            phone_text = item.split('Work Phone: ')[1]
            if 'Fax' in phone_text:
                phone_parts = phone_text.split(' Fax: ')
                record['Work Phone'] = phone_parts[0]
                record['Fax'] = phone_parts[1]
            else:
                record['Work Phone'] = phone_text
        elif 'Chapter' in item:
            record['Certified States'] = item.split('Chapter: ')[1]
        elif 'Web Site' in item:
            record['Website'] = item.split('Web Site: ')[1]
        elif 'E-mail' in item:
            # E-mail and practice areas may share one line as well.
            mail_text = item.split('E-mail: ')[1]
            if 'Practice Areas' in mail_text:
                mail_parts = mail_text.split(' Practice Areas: ')
                record['Email'] = mail_parts[0]
                record['Practice Areas'] = mail_parts[1]
            else:
                record['Email'] = mail_text
        elif 'Practice Areas' in item:
            record['Practice Areas'] = item.split('Practice Areas: ')[1]
        else:
            # Any line without a known label is treated as the address line;
            # its last space-separated token is stored as the ZIP code.
            record['Address'] = item
            record['Zip Code'] = item.split(' ')[-1]
            record['Company'] = _company_from_address(item)

    return record


brokers = []
seen_records = set()  # value tuples of records already stored, for O(1) de-duplication
stale_pages = 0

try:
    # loop until all brokers are found
    while len(brokers) < total_brokers:

        # All <div> elements inside the results container are candidate contact blocks.
        container = soup.find('div', {'class': 'contentContainer'})
        contact_divs = container.find_all('div') if container is not None else []

        count_before = len(brokers)
        for contact_div in contact_divs:
            lines = list(filter(None, contact_div.text.split('\n')))

            # make sure it is a full contact div
            if len(lines) < 5:
                continue

            # lines[0] is the vCard item, which is not part of the record
            record = _parse_broker(lines[1:])

            # only add the broker if it is not already in the list
            record_key = tuple(record.values())
            if record_key not in seen_records:
                seen_records.add(record_key)
                brokers.append(record)

        print(f'Found {len(brokers)}/{total_brokers} brokers')

        # A single results page has nothing further to load, and once every
        # broker is collected there is no reason to request another page.
        if one_page or len(brokers) >= total_brokers:
            break

        # Give up when page after page contributes nothing new.
        stale_pages = stale_pages + 1 if len(brokers) == count_before else 0
        if stale_pages >= MAX_STALE_PAGES:
            print(f'No new brokers in {MAX_STALE_PAGES} consecutive pages, stopping early')
            break

        # Load more results and re-parse the page.
        # NOTE(review): 'bluebutton' is presumably the next-page control, and
        # page_source is read without waiting for the new page - confirm.
        browser.find_element(By.CLASS_NAME, 'bluebutton').send_keys(Keys.ENTER)
        html = browser.page_source
        soup = BeautifulSoup(html, 'html.parser')
finally:
    # Always end the browser session, even when scraping fails part-way.
    browser.quit()

print('Done!')

Found 20/640 brokers
Found 39/640 brokers
Found 57/640 brokers
Found 74/640 brokers
Found 92/640 brokers
Found 107/640 brokers
Found 124/640 brokers
Found 140/640 brokers
Found 154/640 brokers
Found 169/640 brokers
Found 182/640 brokers
Found 194/640 brokers
Found 206/640 brokers
Found 219/640 brokers
Found 233/640 brokers
Found 246/640 brokers
Found 260/640 brokers
Found 270/640 brokers
Found 280/640 brokers
Found 288/640 brokers
Found 299/640 brokers
Found 308/640 brokers
Found 319/640 brokers
Found 326/640 brokers
Found 332/640 brokers
Found 343/640 brokers
Found 351/640 brokers
Found 364/640 brokers
Found 373/640 brokers
Found 381/640 brokers
Found 387/640 brokers
Found 393/640 brokers
Found 403/640 brokers
Found 413/640 brokers
Found 420/640 brokers
Found 426/640 brokers
Found 430/640 brokers
Found 437/640 brokers
Found 441/640 brokers
Found 449/640 brokers
Found 455/640 brokers
Found 461/640 brokers
Found 462/640 brokers
Found 469/640 brokers
Found 473/640 brokers
Found 477/640 b

In [6]:
# Convert the list of broker records into a DataFrame and export it to
# Output/<database_name>_brokers.csv.
brokers_final = pd.DataFrame(brokers)

# Build the path with pathlib so the separator is right on every OS, and
# create the folder first because to_csv does not create missing directories.
output_dir = Path('Output')
output_dir.mkdir(parents=True, exist_ok=True)

brokers_final.to_csv(output_dir / f'{database_name}_brokers.csv', index=False)