## 04 - Scraping party votes

The code in this notebook uses the data created in `03 - Cleaning data` to scape each individual senate webpage for confirmations of circuit judges and extract each senator's vote.

In [2]:
import pandas as pd
import re
import numpy as np

df = pd.read_csv("votes.csv", na_values=['NaN'])

df['confirmation_date'] = pd.to_datetime(df.confirmation_date)

# Convert the votes and record vote numbers into integers instead of float to remove the decimals. 
df['yea_votes'] = df['yea_votes'].astype('Int64')
df['nay_votes'] = df['nay_votes'].astype('Int64')
df['record_vote_number'] = df['record_vote_number'].astype('Int64')

And then convert the record_vote_number into a three digit string – this will be important later when I have to loop through URLs.

In [3]:
df['record_vote_number'] = df['record_vote_number'].apply(lambda x: '{0:0>3}'.format(x))
df.head(5)

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331


In [4]:
df['confirmation_year']=df['confirmation_date'].dt.to_period('Y')

df.confirmation_year = df.confirmation_year.map(str)
df.confirmation_year = df.confirmation_year.astype(int)

df['session'] = df.confirmation_year.apply(lambda x: "2" if x % 2 == 0 else "1")

Because the record vote numbers are repeated every year, some nominations share the number. Later on I will need a unique number for each nomination, so I create a vote_id by merging the congress #, the session # and and the record vote #.

In [5]:
df['vote_id'] = df.congress_no.astype(str) + df.session.astype(str) + df.record_vote_number.astype(str)

In [6]:
df.to_csv("voteinfo.csv", index=False)

### Scraping each senators vote for each confirmation

In [20]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

The loop below loops through every webpage and scrapes the content.

In [21]:
# DO NOT RUN ALL THE TIME! MIGHT UPSET THE SERVER...

votes = [] 
for index, row in df.iterrows():
    record_vote_number = row['record_vote_number']
    congress_no = row['congress_no']
    session = row['session']
    await page.goto(f"https://www.senate.gov/legislative/LIS/roll_call_votes/vote{congress_no}{session}/vote_{congress_no}_{session}_00{record_vote_number}.htm")

    html = await page.content()

    doc = BeautifulSoup(html)

    nominations = {}
    nominations["congress_no"] = row['congress_no']
    nominations["session"] = row['session']
    nominations["vote_number"] = doc.select("#secondary_col2 > div:nth-child(7) > div:nth-child(2)")

    results = doc.select("#secondary_col2 > div:nth-child(14) > span ")

    nomination = []
    for row in results:
        nomination.append(row.text)

    nomination1 = [item.split('\n') for item in nomination]

    nomination1_flat = [item for l in nomination1 for item in l]

    nomination2 = [item.split(',') for item in nomination1_flat]

    nominations["votes"] = nomination2

    votes.append(nominations)

This loop loops through the scrape and extracts the democratic, republican and independent votes

In [22]:
for vote in votes:
    vote1 = {}

    D_yea = 0
    D_nay = 0
    D_no_vote = 0
    R_yea = 0
    R_nay = 0
    R_no_vote = 0
    I_yea = 0
    I_nay = 0
    I_no_vote = 0
    for senator in vote['votes']:
        dem = "(D"
        rep = "(R"
        ind = "(I"
        if dem in senator[0]:
            if senator[1] == " Yea":
                D_yea = D_yea + 1
            elif senator[1] == " Nay":
                D_nay = D_nay + 1
            else:
                D_no_vote = D_no_vote + 1
        elif rep in senator[0]:
            if senator[1] == " Yea":
                R_yea = R_yea + 1
            elif senator[1] == " Nay":
                R_nay = R_nay + 1
            else:
                R_no_vote = R_no_vote + 1
        elif ind in senator[0]:
            if senator[1] == " Yea":
                I_yea = I_yea + 1
            elif senator[1] == " Nay":
                I_nay = I_nay + 1
            else:
                I_no_vote = I_no_vote + 1        
        else:
            print("What is going on here?") 

    vote['D_yea'] = D_yea
    vote['D_nay'] = D_nay
    vote['D_no_vote'] = D_no_vote
    vote['R_yea'] = R_yea
    vote['R_nay'] = R_nay
    vote['R_no_vote'] = R_no_vote
    vote['I_yea'] = I_yea
    vote['I_nay'] = I_nay
    vote['I_no_vote'] = I_no_vote

What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is goi

In [23]:
df = pd.DataFrame(votes)

In [24]:
df['vote_number'] = df['vote_number'].astype(str)

In [25]:
df['vote_number'] = df['vote_number'].str.replace(r'<[^<>]*>', '', regex=True)
df['vote_number'] = df['vote_number'].str.extract('(\d+)', expand=False).str.strip()

In [26]:
df['vote_number'] = df['vote_number'].apply(lambda x: '{0:0>3}'.format(x))
df['vote_id'] = df.congress_no.astype(str) + df.session.astype(str) + df.vote_number.astype(str)
df.head(5)

Unnamed: 0,congress_no,session,vote_number,votes,D_yea,D_nay,D_no_vote,R_yea,R_nay,R_no_vote,I_yea,I_nay,I_no_vote,vote_id
0,117,2,284,"[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",46,0,2,19,29,2,2,0,0,1172284
1,117,2,340,"[[Baldwin (D-WI), Not Voting], [Barrasso (R-W...",46,0,2,4,42,4,2,0,0,1172340
2,117,2,337,"[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",48,0,0,3,44,3,2,0,0,1172337
3,117,2,335,"[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",47,0,1,3,47,0,2,0,0,1172335
4,117,2,331,"[[Baldwin (D-WI), Yea], [Barrasso (R-WY), No...",41,0,7,3,40,7,2,0,0,1172331


In [27]:
df.to_csv("party_votes.csv", index=False)