In [1]:
# Import packages
import time
import itertools
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.common.actions.wheel_input import ScrollOrigin

In [2]:
# Start a web driver and open a window
driver = webdriver.Chrome()

# Go to the page with the BI dashboard
driver.get("https://www.hud.gov/program_offices/public_indian_housing/ehv/dashboard")

# Find the iframe that contains the dashboard and switch to it
iframe = driver.find_element(By.TAG_NAME, "iframe")
driver.switch_to.frame(iframe)

In [3]:
# Click on next page to get to page 2
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Next Page"]'))).click()

In [4]:
# Click on next page to get to page 3
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'button[aria-label="Next Page"]'))).click()
# Click on the button to get more data
WebDriverWait(driver, 20).until(EC.element_to_be_clickable((By.CLASS_NAME, 'fill.ui-role-button-fill.sub-selectable'))).click()

In [9]:
# Grab the column headers
col_headers = driver.find_elements(By.XPATH, '//div[@role="columnheader"]')
col_headers = [x.text for x in col_headers if x.text != 'Row Selection']
col_headers

['PHA Code',
 'PHA Formal Name',
 'Unique ID',
 'Issuance Effective Date',
 'Leased Effective Date',
 'Expired Effective Date',
 'Exit Effective Date\n ',
 'Days since Issuance',
 'Time to Success',
 'Race']

In [6]:
# Define a function to grab and group the data into a nested list
def getSplitTextRows(row):
  cells = row.find_elements(By.XPATH, '//div[@role="gridcell"]')
  text_from_cells = [cell.text for cell in cells] 
  return([list(group) for k, group in itertools.groupby(text_from_cells, lambda x: x=='Select Row') if not k])

In [10]:
# Main scraper loop 
# 1. Scrap the table
# 2. Count rows (X)
# 3. Move down X rows
# 4. Scrap and repeat

# Scrap
stacked_table = pd.DataFrame()
cnt_rows = 0

while cnt_rows < 1000: 
    current_row = driver.find_element(By.XPATH, '//div[@role="columnheader"]')
    current_data = pd.DataFrame(getSplitTextRows(current_row), columns = col_headers)
    
    # Double click on the first cell
    cells_in_view = driver.find_elements(By.XPATH, '//div[@role="gridcell"]')
    first_cell = cells_in_view[1]
    ActionChains(driver)\
      .double_click(first_cell)\
      .perform()
    
    # Press down
    key_to_press = "\ue015"
    actions = ActionChains(driver)
    
    # Perform the key press action multiple times with a delay between each press
    for _ in range(len(current_data) * 2 - 5):
        actions.key_down(key_to_press).perform()
        time.sleep(0.1)
        
    actions.perform()
    stacked_table = pd.concat([stacked_table, current_data], ignore_index=True)
    cnt_rows = len(stacked_table)
    print("Rows scrapped:", cnt_rows)


Rows scrapped: 28
Rows scrapped: 56
Rows scrapped: 78
Rows scrapped: 98
Rows scrapped: 120
Rows scrapped: 150
Rows scrapped: 173
Rows scrapped: 193
Rows scrapped: 217
Rows scrapped: 244
Rows scrapped: 264
Rows scrapped: 284
Rows scrapped: 304
Rows scrapped: 330
Rows scrapped: 358
Rows scrapped: 382
Rows scrapped: 406
Rows scrapped: 430
Rows scrapped: 450
Rows scrapped: 479
Rows scrapped: 499
Rows scrapped: 526
Rows scrapped: 546
Rows scrapped: 571
Rows scrapped: 595
Rows scrapped: 622
Rows scrapped: 646
Rows scrapped: 671
Rows scrapped: 698
Rows scrapped: 718
Rows scrapped: 742
Rows scrapped: 769
Rows scrapped: 790
Rows scrapped: 814
Rows scrapped: 838
Rows scrapped: 868
Rows scrapped: 891
Rows scrapped: 914
Rows scrapped: 934
Rows scrapped: 954
Rows scrapped: 978
Rows scrapped: 998
Rows scrapped: 1018


In [18]:
stacked_table.drop_duplicates().to_excel('output.xlsx', index=False)