In [253]:
import datetime
import re
import time
from io import StringIO

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import StaleElementReferenceException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait

pd.set_option('display.expand_frame_repr', False)

In [252]:
# Define a custom expected condition to handle StaleElementReferenceException
def custom_wait_func(refreshed_locator):
    """Build a wait condition that looks up a single element afresh on every poll.

    Args:
        refreshed_locator: A ``(by, value)`` pair that is unpacked into
            ``driver.find_element``.

    Returns:
        A callable taking a driver (or any object exposing ``find_element``).
        It returns the located element, or ``False`` when the lookup raises,
        so that ``WebDriverWait.until`` keeps polling.
    """
    def check_for_elem(driver):
        try:
            return driver.find_element(*refreshed_locator)
        except Exception:
            # The lookup failed on this poll; report "not ready yet".
            # Only Exception subclasses are swallowed, so KeyboardInterrupt and
            # SystemExit still propagate and can stop a long-running scrape.
            return False

    return check_for_elem


def find_els_wait_func(refreshed_locator):
    """Build a wait condition that looks up all matching elements on every poll.

    Args:
        refreshed_locator: A ``(by, value)`` pair that is unpacked into
            ``driver.find_elements``.

    Returns:
        A callable taking a driver (or any object exposing ``find_elements``).
        It returns whatever ``find_elements`` returns, or ``False`` when the
        lookup raises. An empty result is falsy, so ``WebDriverWait.until``
        keeps polling until at least one element is found.
    """
    def check_for_elem(driver):
        try:
            return driver.find_elements(*refreshed_locator)
        except Exception:
            # The lookup failed on this poll; report "not ready yet".
            # Only Exception subclasses are swallowed, so KeyboardInterrupt and
            # SystemExit still propagate and can stop a long-running scrape.
            return False

    return check_for_elem

In [268]:
# Initialize the Chrome WebDriver and open the Senate eFD search site
driver = webdriver.Chrome()

url = "https://efdsearch.senate.gov/search/"
driver.get(url)


def _ensure_checked(checkbox):
    """Click `checkbox` only if it is not already selected, then return it."""
    if not checkbox.is_selected():
        checkbox.click()
    return checkbox


# Tick the agreement checkbox
agree_checkbox = _ensure_checked(driver.find_element(By.ID, "agree_statement"))

# Select current senators (the first element carrying the "form-check-input" class)
senator_checkbox = _ensure_checked(driver.find_element(By.CLASS_NAME, "form-check-input"))

# Select previous senators
prev_senators_checkbox = _ensure_checked(driver.find_element(By.XPATH, "//input[@value='5']"))

# Select annual and periodic transaction reports
# NOTE(review): only the single checkbox with value '7' is ticked here; confirm it
# covers both report types.
annual_checkbox = _ensure_checked(driver.find_element(By.XPATH, "//input[@value='7']"))

# Start the search at 01/01/2013 (most 2012 reports are handwritten and can't be scraped)
start_date_box = driver.find_element(By.XPATH, "//input[@name='submitted_start_date']")
start_date_box.send_keys("01/01/2013")

# Hit the submit button
submit_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//button[@class='btn btn-primary']"))
)
submit_button.click()

# At this point we are on a page listing all former and current senator
# annual/periodic filings dating back to 01/01/2013.
# We now need to iterate through each of the filings (1,704 when this was written)
# and skip the ones that are scanned images, because those can't be read.

# Click the "Date Received/Filed" header twice: the first click sorts ascending,
# the second flips the order to descending.
sort_data_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//th[@aria-label='Date Received/Filed: activate to sort column ascending']"))
)
sort_data_button.click()
sort_data_button = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//th[@aria-label='Date Received/Filed: activate to sort column descending']"))
)
sort_data_button.click()

# Show 100 entries per page
reports_length = driver.find_element(By.XPATH, "//select[@name='filedReports_length']")
reports_length_select = Select(reports_length)
reports_length_select.select_by_value("100")

# Pause briefly after changing the page length before touching the table again
time.sleep(2)






# The results span 17 pages at 100 reports per page, and one CSV is saved per page.
# `checkpoint` is the number of result pages to page past before scraping starts;
# the scraping loop's page index (used in the CSV filename) begins at this value.
# After an interrupted connection, set it to the most recently saved page
# (original guidance: interrupted on page 4 -> put 3).
# NOTE(review): with checkpoint = 1 the first results page is skipped; confirm
# that is intended for a fresh run.
checkpoint = 1

results_table_xpath = "//table[@class='table table-striped dataTable no-footer']"
next_page_xpath = "//a[@class='paginate_button next']"

# Page forward `checkpoint` times without scraping anything
for i in range(checkpoint):
    driver.execute_script("window.scrollTo(0, 0)")
    # Wait for the results table before looking for the pager link
    table = WebDriverWait(driver, 10).until(custom_wait_func((By.XPATH, results_table_xpath)))
    next_button = WebDriverWait(driver, 10).until(custom_wait_func((By.XPATH, next_page_xpath)))
    next_button.click()


# Iterate over the remaining result pages, saving one CSV per page so progress
# survives a dropped connection.
total_pages = 17  # result pages at 100 reports per page
results_table_locator = (By.XPATH, "//table[@class='table table-striped dataTable no-footer']")

try:
    for i in range(checkpoint, total_pages):
        # HTML of each parsed report table on this page, and the filer name for each
        senator_filings_list = []
        senator_name = []

        # Scroll to the top of the page
        driver.execute_script("window.scrollTo(0, 0)")
        table = WebDriverWait(driver, 10).until(custom_wait_func(results_table_locator))
        rows = WebDriverWait(table, 10).until(find_els_wait_func((By.TAG_NAME, "tr")))

        # Visit the report linked from each row
        for j in range(len(rows)):
            # Look the table and row up again on every pass instead of reusing
            # references captured before a report tab was opened and closed.
            table = WebDriverWait(driver, 10).until(custom_wait_func(results_table_locator))
            row = table.find_elements(By.TAG_NAME, "tr")[j]
            cells = row.find_elements(By.TAG_NAME, "td")
            if len(cells) <= 4:
                # Not a filing row (e.g. the header row has no <td> cells)
                continue

            current_window_handle = driver.current_window_handle
            handles_before = driver.window_handles

            link = cells[3].find_element(By.TAG_NAME, "a")
            link.click()

            # The report opens in a new window; wait for it to exist before
            # indexing into the handle list.
            WebDriverWait(driver, 10).until(EC.new_window_is_opened(handles_before))
            new_window_handle = [
                handle for handle in driver.window_handles if handle not in handles_before
            ][0]
            driver.switch_to.window(new_window_handle)

            try:
                # Wait for the report page to load
                WebDriverWait(driver, 10).until(
                    custom_wait_func((By.XPATH, "//main[@class='container-fluid pgContent']"))
                )

                # Only proceed when the filing is not a scanned (handwritten) image
                if not driver.find_elements(By.XPATH, "//img[@class='filingImage']"):
                    html_content = driver.page_source
                    soup = BeautifulSoup(html_content, 'html.parser')

                    # Filer name: prefer the text in parentheses in the heading,
                    # unless that text is just "Former Senator".
                    heading_text = driver.find_element(By.XPATH, "//h2[@class='filedReport']").text
                    matches = re.findall(r"\((.*?)\)", heading_text)
                    if matches and matches[0] != "Former Senator":
                        name = matches[0]
                    else:
                        name = heading_text

                    print(name)

                    # Only parts 4a and 4b are of interest: the 4th and 5th <section>
                    sections = soup.find_all('section')
                    if len(sections) >= 5:
                        for section in (sections[3], sections[4]):
                            report_table = section.find('table')
                            if report_table:
                                senator_filings_list.append(StringIO(str(report_table)))
                                senator_name.append(name)
            finally:
                # Always close the report tab and return to the results list, even
                # if parsing failed, so no stray tabs are left open.
                driver.close()
                driver.switch_to.window(current_window_handle)

        # Build this page's DataFrame and save it in case we are disconnected later
        dataframes = []
        for filer, table_html in zip(senator_name, senator_filings_list):
            df = pd.read_html(table_html)[0]  # Read the HTML table into a DataFrame
            df['Senator Name'] = filer
            dataframes.append(df)

        if dataframes:
            # Concatenate all DataFrames for this page into a single DataFrame
            result_df = pd.concat(dataframes, ignore_index=True)
            filename = f'senator_filings_page_{i}.csv'
            result_df.to_csv(filename)
        else:
            # pd.concat raises on an empty list; a page with only scanned filings
            # simply has nothing to save.
            print(f"No parseable filings on page {i}; nothing saved")

        # After the final page there is nothing to advance to, and waiting for the
        # "next" link there would only time out.
        if i < total_pages - 1:
            next_button = WebDriverWait(driver, 10).until(
                custom_wait_func((By.XPATH, "//a[@class='paginate_button next']"))
            )
            next_button.click()
finally:
    # Shut the browser down even when a wait times out mid-scrape
    driver.quit()

# NOTE(review): a later cell reads `senator_filing_data_df`; its only visible
# assignment (`result_df.copy()`) was commented out here. Confirm where it is built.


Capito, Shelley Moore
Heinrich, Martin
Hickenlooper, John
Heinrich, Martin
Hickenlooper, John
Stabenow, Debbie A.
Merkley, Jeff
Klobuchar, Amy
McConnell, A. Mitchell Jr.
McConnell, A. Mitchell Jr.
Wyden, Ron
Murkowski, Lisa
Manchin, Joe
Johnson, Ron
Ossoff, Jon
Hoeven, John
Collins, Susan M.
Capito, Shelley Moore
Crapo, Michael D.
Fischer, Deb
Smith, Tina
Hawley, Josh
The Honorable John Fetterman
Bennet, Michael
Rosen, Jacky
Fetterman, John
Cantwell, Maria
Carper, Thomas R.
Peters, Gary
Cardin, Benjamin L.
King, Angus
Baldwin, Tammy
Booker, Cory
Lee, Mike
Wicker, Roger
Thune, John
Cornyn, John
Graham, Lindsey
Barrasso, John
Sanders, Bernard
Young, Todd
Grassley, Charles E.
Tester, Jon
King, Angus
Schumer, Charles E.
Former Senator
Ernst, Joni
Welch, Peter
Lankford, James
Schumer, Charles E.
Peters, Gary
Gillibrand, Kirsten E.
Former Senator
Kaine, Tim
Ricketts, Pete
Former Senator
Daines, Steve
Murray, Patty
Warren, Elizabeth
Sinema, Kyrsten
Cramer, Kevin
Former Senator
Former Senator


TimeoutException: Message: 


In [251]:
pd.set_option('display.max_rows', None)

# NOTE(review): `senator_filing_data_df` is not assigned in any visible cell; confirm
# which cell builds it before relying on Restart & Run All.

# Merge the two transaction-type columns into a single 'Type' column, preferring
# 'Type' and falling back to 'Transaction Type'. The guards make the cell safe to
# re-run: once the source columns are gone, these steps are no-ops rather than KeyErrors.
if 'Transaction Type' in senator_filing_data_df.columns:
    if 'Type' in senator_filing_data_df.columns:
        merged_type = senator_filing_data_df['Type'].fillna(
            senator_filing_data_df['Transaction Type']
        )
    else:
        merged_type = senator_filing_data_df['Transaction Type']
    # Drop both originals first so the merged 'Type' column is appended last
    senator_filing_data_df = (
        senator_filing_data_df
        .drop(columns=['Type', 'Transaction Type'], errors='ignore')
        .assign(Type=merged_type)
    )

# Drop the index/row-number and comment columns
senator_filing_data_df = senator_filing_data_df.drop(
    columns=['Unnamed: 0', '#', 'Comment', 'Comments'], errors='ignore'
)
senator_filing_data_df.head(1000)


Unnamed: 0,Transaction Date,Owner,Ticker,Asset Name,Amount,Senator Name,Type
0,02/06/2023,Spouse,--,chevron,"$1,001 - $15,000","Capito, Shelley Moore",Purchase
1,02/23/2023,Spouse,--,wells fargo,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
2,04/04/2023,Spouse,MCD,McDonald's Corporation Common Stock,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
3,04/04/2023,Spouse,DUK,Duke Energy Corporation (Holding Company) Comm...,"$15,001 - $50,000","Capito, Shelley Moore",Sale (Full)
4,04/04/2023,Spouse,UPS,"United Parcel Service, Inc. Common Stock","$1,001 - $15,000","Capito, Shelley Moore",Sale (Full)
5,05/10/2023,Spouse,BMY,Bristol-Myers Squibb Company Common Stock,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Full)
6,05/10/2023,Spouse,--,Wells Fargo & Company Common Stock,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
7,05/10/2023,Spouse,XOM,Exxon Mobil Corporation Common Stock,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
8,05/10/2023,Spouse,C,"Citigroup, Inc. Common Stock","$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
9,06/07/2023,Spouse,WMT,Walmart Inc. Common Stock,"$1,001 - $15,000","Capito, Shelley Moore",Sale (Partial)
