# Introduction

This notebook uses the BeautifulSoup, Selenium, requests, and snscrape libraries to scrape raw data from several websites; each dataset is saved to a JSON file.

### Explanation of Each Website
- Votesmart: VoteSmart is a non-profit organization that provides information on the voting records, issue positions, biographical information, and campaign finances of elected officials and candidates in the United States.
- Twitter: Twitter is a social media platform where users can post and interact with short messages called tweets. It is often used by politicians to share their thoughts and communicate with their constituents.
- Billtrack: Billtrack is a legislative tracking service that provides up-to-date information on bills and their status in the United States Congress and state legislatures.
- Reaproject: Reaproject is a website that focuses on raising awareness about regional economic analysis techniques and methodologies in local and regional communities throughout the US.

### Scraping Methodologies
- Votesmart: The website is accessed using Selenium, and the following steps are taken:
    1) Type the politician's name in the search box.
    2) Click on the folders for the categories of interest, including votes, ratings, and funding:
        2-1) Votes: 
            2-1-1) Click on the voting category "environment" and collect all information.
            2-1-2) Click on each bill sequentially and collect information on committee sponsors, sponsors, and co-sponsors. Sponsors or co-sponsors that are currently out-of-office are not collected.
        2-2) Ratings: Collect organization and match rate information.
        2-3) Funding: Collect top contributors list information.
- Twitter: Collect all Twitter posts' text data of the three politicians using the snscrape library.
- Billtrack: Collect all information under the three politicians' profiles, including bio, staff, votes, etc.
- Reaproject: Collect all employment information divided by each industry in each politician's district.

#### Votesmart scraping part

In [None]:
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from time import sleep



def getPoliticianProfileDriver(name):
    '''
    Start a headless Chrome session on VoteSmart and open a politician's page.

    Types ``name`` into the site's search box and clicks the first entry of
    the suggestion dropdown.

    Args:
        name: politician name to type into the search box.

    Returns:
        The selenium Chrome driver, positioned on the page reached by clicking
        the first suggestion. The caller owns the driver and is responsible
        for calling ``quit()`` on it.

    Raises:
        Any selenium exception raised while searching (e.g. a timeout when no
        suggestion appears). The browser is shut down before the exception
        propagates so no headless Chrome process is left behind.
    '''
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("--window-size=1920,1080")
    s = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=s, options=options)
    try:
        # implicitly_wait only configures the element-lookup timeout for the
        # session (it does not pause), so it is set once instead of after
        # every step. Callers rely on this 10 s timeout for their lookups.
        driver.implicitly_wait(10)
        driver.get("https://justfacts.votesmart.org")
        # searching politician
        driver.find_element(by=By.XPATH, value='//*[@id="ispysearch"]').send_keys(name)
        # give the suggestion dropdown a moment to start rendering
        sleep(0.5)
        # lookup only: fails early if the dropdown container never shows up
        driver.find_element(by=By.XPATH, value='//*[@id="iSpyResultsDropdown"]')
        # click on the politician (first suggestion in the dropdown)
        first_result = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "iSpy-dropdown-item"))
        )
        first_result.click()
    except Exception:
        # do not leak the browser process when the search fails
        driver.quit()
        raise
    return driver

def support_button_dismiss(driver):
    '''
    Dismiss the "help us" support pop-up if it is currently shown.

    Waits one second, looks for the pop-up's "no thanks" button and presses
    ENTER on it when it is displayed. If the button is not in the page at all,
    nothing happens.

    Args:
        driver: selenium driver positioned on a VoteSmart page.

    Returns:
        The same ``driver`` that was passed in.
    '''
    # Imported locally: the notebook cell's top-level imports do not include it.
    from selenium.common.exceptions import NoSuchElementException

    sleep(1)
    try:
        # The lookup is what raises NoSuchElementException, so it has to be
        # inside the try for the "otherwise pass" behaviour to work.
        support_button = driver.find_element(by=By.XPATH, value='//*[@id="helpusPopUpNoThanksQuote"]')
    except NoSuchElementException:
        return driver

    # Check if the button is displayed on the page
    if support_button.is_displayed():
        # Click the button
        support_button.send_keys(Keys.ENTER)
        print("Button clicked successfully")
    else:
        print("Button exists but is not visible on the page")
    return driver

def getDonorList(name):
    '''
    Scrape the top-contributors table from a politician's VoteSmart funding folder.

    Args:
        name: politician name, passed to ``getPoliticianProfileDriver``.

    Returns:
        A list of ``{'name': ..., 'amount': ...}`` dicts, one per table row
        after the header row. Returns ``[]`` when the page has no
        ``nimspContributors`` div.
    '''
    print(name)
    driver = getPoliticianProfileDriver(name)
    try:
        sleep(1)
        # click on funding
        funding_folder = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, '//*[@id="fundingFolderClosed"]/span'))
        )
        funding_folder.click()
        driver = support_button_dismiss(driver)
        html_content = driver.page_source
    finally:
        # the page source is all that is needed; always release the browser
        driver.quit()

    soup = BeautifulSoup(html_content, 'html.parser')
    # Find the div that contains the top contributors table and get the rows
    table = soup.find('div', {'id': 'nimspContributors'})
    if table is None:
        return []

    res_donors = []
    for row in table.find_all('tr')[1:]:  # skip the header row
        cells = row.find_all('td')
        if len(cells) < 2:
            # not a contributor/amount row (e.g. spacer row); nothing to record
            continue
        res_donors.append({
            'name': cells[0].text.strip(),
            'amount': cells[1].text.strip(),
        })
    return res_donors



def getRatingList(name):
    '''
    Scrape organization ratings from a politician's VoteSmart page.

    Args:
        name: politician name, passed to ``getPoliticianProfileDriver``.

    Returns:
        A list of ``{'name': ..., 'match': ...}`` dicts, one per distinct
        organization title found in the ``card card-collapse`` divs. When a
        title occurs more than once, the last occurrence's value is kept.
        Returns ``[]`` when no such div is found.
    '''
    print(name)
    driver = getPoliticianProfileDriver(name)
    try:
        # click on the folder link addressed by this absolute XPath
        # NOTE(review): presumably the ratings folder — the original comment
        # said "funding"; an absolute path like this breaks on any layout change.
        driver.find_element(by=By.XPATH, value='/html/body/div[1]/nav[2]/div/div/div/div[3]/div[2]/div/div[2]/div/div[1]/a/span/span').click()
        driver = support_button_dismiss(driver)
        html_content = driver.page_source
    finally:
        # the page source is all that is needed; always release the browser
        driver.quit()

    # Use Beautiful Soup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # find_all returns an empty list (never None) when nothing matches,
    # in which case the loop below simply yields an empty result.
    divs = soup.find_all('div', {'class': 'card card-collapse'})

    # Keyed by organization name so duplicates collapse to the last value
    # while keeping first-seen order.
    match_by_name = {}
    for div in divs:
        title = div.find('div', {'class': "col col-sm col-"})
        perc = div.find('div', {'class': 'col-md-2 col-sm-2 col-2 evaluations-item-primary text-nowrap'})
        if title is None or perc is None:
            # card without a title or a percentage: not a rating entry
            continue
        org_name = title.text.strip()
        match_by_name[org_name] = {'name': org_name, 'match': perc.text.strip()}
    return list(match_by_name.values())


def _active_member_names(soup_bill, collapse_id):
    '''Return whitespace-normalized names listed under div ``collapse_id``, skipping "(Out Of Office)" entries.'''
    collapse = soup_bill.find('div', {'id': collapse_id})
    if collapse is None:
        return []
    names = []
    for item in collapse.find_all('li', {'class': 'list-group-item'}):
        # collapse the runs of whitespace inside the name
        member_name = " ".join(item.text.split())
        # if out of office, not included in the list
        if '(Out Of Office)' not in member_name:
            names.append(member_name)
    return names


def _committee_sponsor_names(soup_bill):
    '''Return the item texts of the list whose sibling heading reads "Committee Sponsors" ([] if none).'''
    sponsors_group = None
    for group in soup_bill.find_all('ul', {'class': 'list-group key-votes-detail-list'}):
        heading = group.parent.find('h4', {'class': "title"})
        if heading is not None and heading.text.strip() == 'Committee Sponsors':
            # no break: the last matching list wins
            sponsors_group = group
    if sponsors_group is None:
        return []
    return [item.text.strip() for item in sponsors_group.find_all('li', {'class': 'list-group-item'})]


def getVotingInfo(name):
    '''
    Scrape a politician's votes in one category, plus each bill's sponsors.

    Opens the votes folder, selects the 20th entry of the category dropdown
    (presumably the "environment" category described in the notebook intro —
    TODO confirm, the position is hard-coded), then visits every listed bill
    to collect committee sponsors, sponsors and co-sponsors.

    Args:
        name: politician name, passed to ``getPoliticianProfileDriver``.

    Returns:
        A list with one dict per vote card, with keys ``'Bill No.'``,
        ``'Date'``, ``'Synopsis'``, ``'Vote'``, ``'Outcome'``,
        ``'Committee'``, ``'Sponsor'`` and ``'Cosponsor'`` (the last three
        are lists of names).
    '''
    driver = getPoliticianProfileDriver(name)
    try:
        # click on the votes folder
        driver.find_element(by=By.XPATH, value='//*[@id="votesFolderClosed"]/span').click()
        print('vote folder click')
        driver = support_button_dismiss(driver)
        driver.find_element(by=By.ID, value='dropdownMenuButton').send_keys(Keys.ENTER)
        # let the category dropdown open before picking an entry
        sleep(1.5)
        driver.find_element(by=By.XPATH,value='//*[@id="votesCategoriesDropdownMenu"]/a[20]').send_keys(Keys.ENTER)
        html_content = driver.page_source

        # Use Beautiful Soup to parse the HTML content
        soup = BeautifulSoup(html_content, 'html.parser')

        # One collapsible card per vote
        divs = soup.find_all('div',{'class':'card card-collapse'})
        res_vote = []
        for vote_idx, div in enumerate(divs):
            date = div.find('div', {'class':"col-lg-3 col-md-3 v-congstatus-statusdate"}).text.strip()
            synopsis = div.find('div',{'class':'col col-sm col- text-left'}).text.strip()
            vote = div.find('div', {'class':"col-xl-2 col-lg-3 col-md-3 col-sm-3 col-10 text-left vote-header-collapse-item candidate-votes-congaction-col"}).text.strip()
            val = div.find('div',{'class':'card-body'}).find('tbody').find_all('tr')

            # first table row holds the bill number, second the outcome
            outcome = val[1].find('td').text.strip()
            bill_no = val[0].find('td').text.strip()

            print('before vote title click')
            driver = support_button_dismiss(driver)
            # open the bill page belonging to this card; vote_idx is a separate
            # name from any inner index so the card position cannot be clobbered
            driver.find_elements(by=By.CLASS_NAME, value='vote-title-link')[vote_idx].send_keys(Keys.ENTER)
            sleep(1)
            soup_bill = BeautifulSoup(driver.page_source, 'html.parser')

            committee_list = _committee_sponsor_names(soup_bill)
            sponsor_list = _active_member_names(soup_bill, 'collapseSponsor')
            cosponsor_list = _active_member_names(soup_bill, 'collapseCo-sponsor')

            # return to the vote list for the next card
            driver.back()

            res_vote.append({
                'Bill No.': bill_no,
                'Date': date,
                'Synopsis': synopsis,
                'Vote': vote,
                'Outcome': outcome,
                'Committee': committee_list,
                'Sponsor': sponsor_list,
                'Cosponsor': cosponsor_list,
            })
    finally:
        # always release the browser, also when a lookup fails midway
        driver.quit()
    return res_vote

if __name__ == '__main__':
    # Driver script: scrape donors, ratings and votes for the three seed
    # politicians, then repeat donors/ratings for every in-office sponsor or
    # co-sponsor found on their bills ("second depth").
    import json
    import os

    # every output below is written into raw_data/; make sure it exists
    os.makedirs("raw_data", exist_ok=True)

    #first depth collecting
    politician_name_list=['Karen Spilka','Ana Rodriguez','Liz Krueger']

    #donor
    container = {name: getDonorList(name) for name in politician_name_list}
    with open("raw_data/votesmart_donors.json", "w") as fp:
        json.dump(container, fp)

    #rating
    container = {name: getRatingList(name) for name in politician_name_list}
    with open("raw_data/votesmart_match.json", "w") as fp:
        json.dump(container, fp)

    # voting + second depth politicians
    container = dict()
    second_depth_politician_list = []
    for name in politician_name_list:
        container[name] = getVotingInfo(name)
        for bill_info in container[name]:
            second_depth_politician_list.extend(bill_info['Sponsor'])
            second_depth_politician_list.extend(bill_info['Cosponsor'])
    with open("raw_data/votesmart_vote.json", "w") as fp:
        json.dump(container, fp)

    # saving only unique values; sorted so the output file is reproducible
    second_depth_politician_list = sorted(set(second_depth_politician_list))
    with open("raw_data/second_depth_politician_list.json", "w") as fp:
        json.dump(second_depth_politician_list, fp)

    # Second depth collecting: read the list back from disk
    with open('raw_data/second_depth_politician_list.json') as f:
        second_depth_politician_list = json.load(f)

    # Second depth donor
    container = {name: getDonorList(name) for name in second_depth_politician_list}
    with open("raw_data/votesmart_donors_second_depth.json", "w") as fp:
        json.dump(container, fp)

    # Second depth rating
    container = {name: getRatingList(name) for name in second_depth_politician_list}
    with open("raw_data/votesmart_match_second_depth.json", "w") as fp:
        json.dump(container, fp)

#### Twitter scraping part

In [None]:
# import os
# os.system("snscrape --jsonl --max-results 100 twitter-search 'from:KarenSpilka'> user-tweets.json")


import snscrape.modules.twitter as sntwitter
import pandas as pd
import json
# Twitter handles and the politician names they belong to (same order).
politician_twitter_ids=["KarenSpilka","SenatorAMR","LizKrueger"]
politician_names = ["Karen Spilka", "Ana Rodriguez","Liz Krueger"]
# One JSON string (serialized dataframe) per politician, in handle order
data_retrieved_for_each_politicians = []
ret = {}
# twitter_id rather than `id`: in a notebook a top-level `id` would shadow
# the builtin for every later cell.
for twitter_id in politician_twitter_ids:
    tmp = []
    # Using TwitterSearchScraper to scrape data and append tweets to list
    for i,tweet in enumerate(sntwitter.TwitterSearchScraper('from:{}'.format(twitter_id), top = True).get_items()):
        # Indices 0..100 pass this check, i.e. up to 101 tweets are kept.
        # NOTE(review): confirm whether a cap of 100 was intended.
        if i>100:
            break
        tmp.append([tweet.date, tweet.content])
    # Creating a dataframe from the tweets list above
    tweets_df = pd.DataFrame(tmp, columns=['Datetime',  'Text'])
    data_retrieved_for_each_politicians.append(tweets_df.to_json())

# Pair each politician name with its serialized tweets
for politician_name, data in zip(politician_names, data_retrieved_for_each_politicians):
    ret[politician_name]=json.loads(data)

with open("raw_data/twitter_posts.json", "w") as fp:
    json.dump(ret,fp)

#### Billtrack scraping part

In [None]:
from bs4 import BeautifulSoup
import json  # used by json.dump below; previously only in scope if an earlier cell had run
import requests

url_list = ['https://www.billtrack50.com/legislatordetail/16923','https://www.billtrack50.com/legislatordetail/23016','https://www.billtrack50.com/legislatordetail/2226']
name_list=['Karen Spilka','Ana Rodriguez','Liz Krueger']

# For each tabbed section of a legislator page: output key -> index of the
# <td> cell it is read from. 'committee' starts at index 1 (cell 0 is not used).
TABLE_SECTIONS = {
    'bills': {'bill': 0, 'billname': 1, 'summary': 2, 'progress': 3},
    'votes': {'bill': 0, 'billname': 1, 'motion': 2, 'votedate': 3, 'vote': 4},
    'committee': {'role': 1, 'committee': 2},
    'staff': {
        'name': 0,
        'title': 1,
        'role description': 2,
        'location': 3,
        'address': 4,
        'phone': 5,
        'email': 6,
    },
}


def _table_records(soup, section_id, columns):
    '''
    Read the table inside ``<div id=section_id>`` into a list of dicts.

    Args:
        soup: parsed page.
        section_id: ``id`` attribute of the div holding the table.
        columns: mapping of output key -> ``<td>`` index within a row.

    Returns:
        ``None`` when the div is absent; otherwise a list with one dict per
        ``<tbody>`` row (empty when there is no ``<tbody>``). Rows with fewer
        cells than ``columns`` requires are skipped.
    '''
    section = soup.find('div', {'id': section_id})
    if section is None:
        return None
    tbody = section.find('tbody')
    if tbody is None:
        return []
    cells_needed = max(columns.values()) + 1
    records = []
    for row in tbody.find_all('tr'):
        cells = row.find_all('td')
        if len(cells) < cells_needed:
            continue
        records.append({key: cells[pos].text.strip() for key, pos in columns.items()})
    return records


res = dict()
for politician_name, url in zip(name_list, url_list):
    tmp_data_container={}
    # Send a GET request to the URL and get the HTML content; the timeout
    # keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=30)

    # Use Beautiful Soup to parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # A section key is only written when its div exists on the page
    for section_id, columns in TABLE_SECTIONS.items():
        records = _table_records(soup, section_id, columns)
        if records is not None:
            tmp_data_container[section_id] = records

    # The bio section is free text rather than a table
    bio_div = soup.find('div', {'id': 'bio'})
    if bio_div is not None:
        tmp_data_container['bio'] = {'bio': bio_div.text.strip()}

    res[politician_name] = tmp_data_container



with open("raw_data/billtrack_info.json", "w") as fp:
    json.dump(res,fp) 
    
    

#### Reaproject scraping part

In [None]:
from bs4 import BeautifulSoup
import requests
import json

# One REAProject "industries by region / employment" page per politician,
# in the same order as name_list.
url_list = ['https://massachusetts.reaproject.org/analysis/industry-structure/industries_by_region/employment/tools/250017/','https://florida.reaproject.org/analysis/industry-structure/industries_by_region/employment/tools/120087/','https://new-york.reaproject.org/analysis/industry-structure/industries_by_region/employment/tools/360061/']
final_res = dict()
name_list=['Karen Spilka','Ana Rodriguez','Liz Krueger']

## Industry employment scraping
for idx, url in enumerate(url_list):
    # Send a GET request to the URL and get the HTML content; the timeout
    # keeps a stalled connection from hanging the notebook forever
    response = requests.get(url, timeout=30)
    html_content = response.text

    # Use Beautiful Soup to parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # The employment table sits in div.report_table_data inside div#page_2.
    # A page missing either div yields an empty list instead of crashing.
    page2 = soup.find('div', {'id': 'page_2'})
    table = page2.find('div', {'class': 'report_table_data'}) if page2 is not None else None
    rows = table.find_all('tr') if table is not None else []

    res=[]
    # The first two rows are skipped (presumably header rows — TODO confirm)
    for row in rows[2:]:
        cells = row.find_all('td')
        if len(cells) < 5:
            # not a full data row; cells 0, 2, 3 and 4 are all required
            continue
        res.append({
            'industry': cells[0].text.strip(),
            'jobs': cells[2].text.strip(),
            'percent': cells[3].text.strip(),
            'location_quotient': cells[4].text.strip(),
        })
    final_res[name_list[idx]] = res

with open("raw_data/industry_employment.json", "w") as fp:
    json.dump(final_res,fp) 