## 04 - Scraping party votes

In [2]:
import pandas as pd
import re
import numpy as np

df = pd.read_csv("votes.csv", na_values=['NaN'])

df['confirmation_date'] = pd.to_datetime(df.confirmation_date)

# Convert the votes and record vote numbers into integers instead of float to remove the decimals. 
df['yea_votes'] = df['yea_votes'].astype('Int64')
df['nay_votes'] = df['nay_votes'].astype('Int64')
df['record_vote_number'] = df['record_vote_number'].astype('Int64')

In [3]:
# Convert the record_vote_number into a three digit string – this will be important later when I have to loop
# through URLs.
df['record_vote_number'] = df['record_vote_number'].apply(lambda x: '{0:0>3}'.format(x))
df.head(20)

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331
5,John Z. Lee,PN1965,117,Seventh,2022-09-07,50,44,327
6,Stephanie Dawkins Davis,PN1748,117,Sixth,2022-05-24,49,43,194
7,Julianna Michelle Childs,PN1671,117,District of Columbia,2022-07-19,64,34,260
8,Leonard Philip Stark,PN1508,117,Federal,2022-02-09,61,35,49
9,Alison J. Nathan,PN1504,117,Second,2022-03-23,49,47,106


In [4]:
df['confirmation_year']=df['confirmation_date'].dt.to_period('Y')

df.confirmation_year = df.confirmation_year.map(str)
df.confirmation_year = df.confirmation_year.astype(int)

df['session'] = df.confirmation_year.apply(lambda x: "2" if x % 2 == 0 else "1")
df.head(20)

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number,confirmation_year,session
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284,2022,2
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340,2022,2
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337,2022,2
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335,2022,2
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331,2022,2
5,John Z. Lee,PN1965,117,Seventh,2022-09-07,50,44,327,2022,2
6,Stephanie Dawkins Davis,PN1748,117,Sixth,2022-05-24,49,43,194,2022,2
7,Julianna Michelle Childs,PN1671,117,District of Columbia,2022-07-19,64,34,260,2022,2
8,Leonard Philip Stark,PN1508,117,Federal,2022-02-09,61,35,49,2022,2
9,Alison J. Nathan,PN1504,117,Second,2022-03-23,49,47,106,2022,2


In [5]:
# Because the record vote numbers are repeated every year, some nominations share the number. Later on I will need a unique number for each nomination,
# so I create a vote_id by merging the congress #, the session # and and the record vote #
df['vote_id'] = df.congress_no.astype(str) + df.session.astype(str) + df.record_vote_number.astype(str)
df

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number,confirmation_year,session,vote_id
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284,2022,2,1172284
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340,2022,2,1172340
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337,2022,2,1172337
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335,2022,2,1172335
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331,2022,2,1172331
...,...,...,...,...,...,...,...,...,...,...,...
189,Terrence L. O'Brien,PN882,107,Tenth,2002-04-15,98,0,068,2002,2,1072068
190,Michael J. Melloy,PN881,107,Eighth,2002-02-11,91,0,021,2002,2,1072021
191,Michael W. McConnell,PN880,107,Tenth,2002-11-15,,,,2002,2,1072<NA>
192,William J. Riley,PN458,107,Eighth,2001-08-02,97,0,270,2001,1,1071270


In [6]:
from bs4 import BeautifulSoup
import requests
from playwright.async_api import async_playwright

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [7]:
# DO NOT RUN ALL THE TIME! MIGHT UPSET THE SERVER...

votes = [] 
for index, row in df.iterrows():
    record_vote_number = row['record_vote_number']
    congress_no = row['congress_no']
    session = row['session']
    await page.goto(f"https://www.senate.gov/legislative/LIS/roll_call_votes/vote{congress_no}{session}/vote_{congress_no}_{session}_00{record_vote_number}.htm")

    html = await page.content()

    doc = BeautifulSoup(html)

    nominations = {}
    nominations["congress_no"] = row['congress_no']
    nominations["session"] = row['session']
    nominations["vote_number"] = doc.select("#secondary_col2 > div:nth-child(7) > div:nth-child(2)")

    results = doc.select("#secondary_col2 > div:nth-child(14) > span ")

    nomination = []
    for row in results:
        nomination.append(row.text)

    nomination1 = [item.split('\n') for item in nomination]

    nomination1_flat = [item for l in nomination1 for item in l]

    nomination2 = [item.split(',') for item in nomination1_flat]

    nominations["votes"] = nomination2

    votes.append(nominations)

In [8]:
import pandas as pd

In [9]:
for vote in votes:
    vote1 = {}

    D_yea = 0
    D_nay = 0
    D_no_vote = 0
    R_yea = 0
    R_nay = 0
    R_no_vote = 0
    I_yea = 0
    I_nay = 0
    I_no_vote = 0
    for senator in vote['votes']:
        dem = "(D"
        rep = "(R"
        ind = "(I"
        if dem in senator[0]:
            if senator[1] == " Yea":
                D_yea = D_yea + 1
            elif senator[1] == " Nay":
                D_nay = D_nay + 1
            else:
                D_no_vote = D_no_vote + 1
        elif rep in senator[0]:
            if senator[1] == " Yea":
                R_yea = R_yea + 1
            elif senator[1] == " Nay":
                R_nay = R_nay + 1
            else:
                R_no_vote = R_no_vote + 1
        elif ind in senator[0]:
            if senator[1] == " Yea":
                I_yea = I_yea + 1
            elif senator[1] == " Nay":
                I_nay = I_nay + 1
            else:
                I_no_vote = I_no_vote + 1        
        else:
            print("What is going on here?") 

    vote['D_yea'] = D_yea
    vote['D_nay'] = D_nay
    vote['D_no_vote'] = D_no_vote
    vote['R_yea'] = R_yea
    vote['R_nay'] = R_nay
    vote['R_no_vote'] = R_no_vote
    vote['I_yea'] = I_yea
    vote['I_nay'] = I_nay
    vote['I_no_vote'] = I_no_vote

print(votes)

What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is going on here?
What is goi

In [14]:
df = pd.DataFrame(votes)
df.head(20)

df.shape

(194, 13)

In [18]:
df['vote_number'] = df['vote_number'].astype(str)

#tags_list = ['[<div class="contenttext" style="float:left; min-width:200px; padding-bottom:10px;"><b>Vote Number:   </b>', '</div>']

#for tag in tags_list:
 #df['vote_number'].replace(to_replace=tag, value='', regex=True, inplace=True)
#print(df)
df

Unnamed: 0,congress_no,session,vote_number,votes,D_yea,D_nay,D_no_vote,R_yea,R_nay,R_no_vote,I_yea,I_nay,I_no_vote
0,117,2,"[<div class=""contenttext"" style=""float:left; m...","[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",46,0,2,19,29,2,2,0,0
1,117,2,"[<div class=""contenttext"" style=""float:left; m...","[[Baldwin (D-WI), Not Voting], [Barrasso (R-W...",46,0,2,4,42,4,2,0,0
2,117,2,"[<div class=""contenttext"" style=""float:left; m...","[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",48,0,0,3,44,3,2,0,0
3,117,2,"[<div class=""contenttext"" style=""float:left; m...","[[Baldwin (D-WI), Yea], [Barrasso (R-WY), Na...",47,0,1,3,47,0,2,0,0
4,117,2,"[<div class=""contenttext"" style=""float:left; m...","[[Baldwin (D-WI), Yea], [Barrasso (R-WY), No...",41,0,7,3,40,7,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
189,107,2,"[<div class=""contenttext"" style=""float:left; m...","[[Akaka (D-HI), Yea], [Allard (R-CO), Yea], ...",49,0,1,48,0,1,1,0,0
190,107,2,"[<div class=""contenttext"" style=""float:left; m...","[[Akaka (D-HI), Yea], [Allard (R-CO), Yea], ...",48,0,2,42,0,7,1,0,0
191,107,2,[],[],0,0,0,0,0,0,0,0,0
192,107,1,"[<div class=""contenttext"" style=""float:left; m...","[[Akaka (D-HI), Yea], [Allard (R-CO), Yea], ...",49,0,1,47,0,2,1,0,0


In [12]:
df.to_csv("party_votes.csv", index=False)