In [1]:
# Import packages
import time
import itertools
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin



In [2]:
# Start a web driver and open a window
driver = webdriver.Chrome()

# Go to the page with the BI dashboard
driver.get("https://www.hud.gov/program_offices/public_indian_housing/ehv/dashboard")

# Find the iframe that contains the dashboard and switch to it
frame_finder_result = WebDriverWait(driver, 10).until(EC.frame_to_be_available_and_switch_to_it((By.TAG_NAME, "iframe")))

# Click on next page to get to page 2
WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.CLASS_NAME, 'card')))
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Next Page"]'))).click()

# Click on next page to get to page 3
WebDriverWait(driver, 5).until(EC.text_to_be_present_in_element((By.CLASS_NAME, 'middleText'), '2of5'))
WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Next Page"]'))).click()

# Click on the button to get more data
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'fill.ui-role-button-fill.sub-selectable'))).click()

In [3]:
# Give the table sometime to load
table_load_result = WebDriverWait(driver, 5).until(
    EC.text_to_be_present_in_element((By.XPATH, '//div[@role="columnheader" and @aria-colindex="1"]'), 'Row Selection')
)

In [4]:
# Define a function to grab and format data into a list of tuples with the second element being the row id
def get_rows():
  return([(x, x.get_attribute('aria-rowindex')) for x in driver.find_elements(By.XPATH, '//div[@role="row"]') if x.get_attribute('aria-rowindex') != '1'])

In [None]:
# Initialize an empty list to store results
all_rows = list()
last_indexes = list()
done_flag = False

# Enter the loop
while len(all_rows) <= 1000:
    # Read in rows available  
    current_rows = get_rows()
    
    # Double click on the last visible cell in the second column 
    select_cell = current_rows[len(current_rows) - 1][0].find_elements(By.XPATH, '*/*/div')[2]
    ActionChains(driver)\
      .double_click(select_cell)\
      .perform()
 
    # Scroll down
    key_to_press = "\ue015"
    actions = ActionChains(driver)
    for _ in range(10):
        actions.key_down(key_to_press).perform()
        #time.sleep(0.01)
        actions.perform()
    
    # Combine rows
    if len(all_rows) > 0:
        last_indexes.append(max([int(x[1]) for x in all_rows]))
        all_rows += [x for x in current_rows if int(x[1]) > last_indexes[-1]]
        print(last_indexes[-1])
    else:
        all_rows += current_rows
    
    # Check if the last five indexes did not change then terminate
    if len(last_indexes) > 5 and len(set(last_indexes[-5:])) == 1:
        done_flg = True

In [10]:
#sort the list
sorted_lst = sorted([int(x[1]) for x in all_rows])
 
#check if all elements are consecutive
is_consecutive = all(sorted_lst[i] == sorted_lst[i-1] + 1 for i in range(1, len(sorted_lst)))
is_consecutive

In [4]:
# Grab the column headers
col_headers = driver.find_elements(By.XPATH, '//div[@role="columnheader"]')
col_headers = [x.text for x in col_headers if x.text != 'Row Selection']
col_headers

['PHA Code',
 'PHA Formal Name',
 'Unique ID',
 'Issuance Effective Date',
 'Leased Effective Date',
 'Expired Effective Date',
 'Exit Effective Date\n ',
 'Days since Issuance',
 'Time to Success',
 'Race']

In [6]:
# Define a function to grab and group the data into a nested list
def getSplitTextRows(row):
  cells = row.find_elements(By.XPATH, '//div[@role="gridcell"]')
  text_from_cells = [cell.text for cell in cells] 
  return([list(group) for k, group in itertools.groupby(text_from_cells, lambda x: x=='Select Row') if not k])

In [38]:
# Main scraper loop 
# 1. Scrap the table
# 2. Count rows (X)
# 3. Move down X rows
# 4. Scrap and repeat

# Scrap
#stacked_table = pd.DataFrame()
cnt_rows = 0
cnt_new_rows = 1

while cnt_new_rows > 0: 
    current_rows = driver.find_element(By.XPATH, '//div[@role="columnheader"]')
    current_data = pd.DataFrame(getSplitTextRows(current_rows), columns = col_headers)
    
    # Double click on the last cell available
    cells_in_view = current_rows.find_elements(By.XPATH, '//div[@role="gridcell"]')
    first_cell_last_row = cells_in_view[-20]
    ActionChains(driver)\
      .double_click(first_cell_last_row)\
      .perform()
     
    # Scroll down
    key_to_press = "\ue015"
    actions = ActionChains(driver)
    
    # Perform the key press action multiple times with a delay between each press
    for _ in range(15):
        actions.key_down(key_to_press).perform()
        time.sleep(0.05)
        
    actions.perform()

    # Count number of new rows
    if len(stacked_table) == 0:
        cnt_new_rows = len(current_data)
    else: 
        cnt_new_rows = 1
        #cnt_new_rows = (~current_data['Unique ID'].isin(stacked_table['Unique ID'])).sum()

    # Combine scrapped data and display rows scrapped
    stacked_table = pd.concat([stacked_table, current_data], ignore_index=True).drop_duplicates()
    cnt_rows = len(stacked_table)
    print(f"Processed {cnt_rows} rows.")


Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 77467 rows.
Processed 

StaleElementReferenceException: Message: stale element reference: stale element not found
  (Session info: chrome=122.0.6261.94); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#stale-element-reference-exception
Stacktrace:
0   chromedriver                        0x000000010070853c chromedriver + 3966268
1   chromedriver                        0x0000000100700ac8 chromedriver + 3934920
2   chromedriver                        0x0000000100383da0 chromedriver + 277920
3   chromedriver                        0x0000000100388648 chromedriver + 296520
4   chromedriver                        0x0000000100389fe0 chromedriver + 303072
5   chromedriver                        0x000000010038a054 chromedriver + 303188
6   chromedriver                        0x000000010038a078 chromedriver + 303224
7   chromedriver                        0x00000001003c1500 chromedriver + 529664
8   chromedriver                        0x00000001003bc8a8 chromedriver + 510120
9   chromedriver                        0x00000001003febf0 chromedriver + 781296
10  chromedriver                        0x00000001003bafb0 chromedriver + 503728
11  chromedriver                        0x00000001003bba28 chromedriver + 506408
12  chromedriver                        0x00000001006cd724 chromedriver + 3725092
13  chromedriver                        0x00000001006d1c18 chromedriver + 3742744
14  chromedriver                        0x00000001006b620c chromedriver + 3629580
15  chromedriver                        0x00000001006d2714 chromedriver + 3745556
16  chromedriver                        0x00000001006a9584 chromedriver + 3577220
17  chromedriver                        0x00000001006f0f74 chromedriver + 3870580
18  chromedriver                        0x00000001006f1118 chromedriver + 3871000
19  chromedriver                        0x0000000100700738 chromedriver + 3934008
20  libsystem_pthread.dylib             0x000000018dda6034 _pthread_start + 136
21  libsystem_pthread.dylib             0x000000018dda0e3c thread_start + 8


In [39]:
stacked_table.drop_duplicates().to_csv('output_try4.csv', index=False)

In [15]:
stacked_table.drop_duplicates().shape

(77346, 10)