## 04 - Scraping party votes

In [2]:
import pandas as pd
import re
import numpy as np

# Load the confirmation-vote table; the literal text 'NaN' is read as a missing value.
df = pd.read_csv("votes.csv", na_values=['NaN'])

# Turn the confirmation dates into real datetimes so the .dt accessor can be used later.
df['confirmation_date'] = pd.to_datetime(df['confirmation_date'])

# Cast the vote counts and the record vote number to pandas' nullable integer
# dtype in one go, so they are shown as whole numbers rather than floats.
df = df.astype({
    'yea_votes': 'Int64',
    'nay_votes': 'Int64',
    'record_vote_number': 'Int64',
})

df.dtypes


name                          object
nomination_no                 object
congress_no                    int64
circuit                       object
confirmation_date     datetime64[ns]
yea_votes                      Int64
nay_votes                      Int64
record_vote_number             Int64
dtype: object

In [3]:
# Left-pad every record vote number with zeros to a width of three characters
# (e.g. 49 -> '049'). The scraping loop later splices this string into the
# roll-call URL, so it has to be text of a predictable width.
# NOTE(review): a missing record vote number is formatted as text here rather
# than staying missing -- confirm that is acceptable for the URL loop.
df['record_vote_number'] = df['record_vote_number'].map('{0:0>3}'.format)
df.head(20)

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331
5,John Z. Lee,PN1965,117,Seventh,2022-09-07,50,44,327
6,Stephanie Dawkins Davis,PN1748,117,Sixth,2022-05-24,49,43,194
7,Julianna Michelle Childs,PN1671,117,District of Columbia,2022-07-19,64,34,260
8,Leonard Philip Stark,PN1508,117,Federal,2022-02-09,61,35,49
9,Alison J. Nathan,PN1504,117,Second,2022-03-23,49,47,106


In [4]:
# Calendar year of the confirmation, taken straight from the datetime accessor
# instead of round-tripping through Period -> str -> int. The cast keeps the
# column a plain integer and, like the original, fails loudly on a missing date.
df['confirmation_year'] = df['confirmation_date'].dt.year.astype(int)

# Session string used in the roll-call URL: "2" for an even confirmation year,
# "1" for an odd one.
# NOTE(review): this is decided purely from the calendar year -- confirm that no
# vote in the data falls in the first days of January, before a new session opens.
df['session'] = (df['confirmation_year'] % 2).map({0: "2", 1: "1"})
df.head(50)

Unnamed: 0,name,nomination_no,congress_no,circuit,confirmation_date,yea_votes,nay_votes,record_vote_number,confirmation_year,session
0,Roopali H. Desai,PN2262,117,Ninth,2022-08-04,67,29,284,2022,2
1,Florence Y. Pan,PN2193,117,District of Columbia,2022-09-20,52,42,340,2022,2
2,Sarah A. L. Merriam,PN2141,117,Second,2022-09-15,53,44,337,2022,2
3,Lara E. Montecalvo,PN2140,117,First,2022-09-14,52,47,335,2022,2
4,Salvador Mendoza Jr,PN1966,117,Ninth,2022-09-12,46,40,331,2022,2
5,John Z. Lee,PN1965,117,Seventh,2022-09-07,50,44,327,2022,2
6,Stephanie Dawkins Davis,PN1748,117,Sixth,2022-05-24,49,43,194,2022,2
7,Julianna Michelle Childs,PN1671,117,District of Columbia,2022-07-19,64,34,260,2022,2
8,Leonard Philip Stark,PN1508,117,Federal,2022-02-09,61,35,49,2022,2
9,Alison J. Nathan,PN1504,117,Second,2022-03-23,49,47,106,2022,2


In [5]:
from bs4 import BeautifulSoup
import requests
from playwright.async_api import async_playwright

# Start the Playwright driver, open a visible (non-headless) Chromium window and
# create the single page object that the scraping loop navigates with.
# These are top-level awaits, so the cell relies on the notebook's running event loop.
# NOTE(review): no matching browser.close() / playwright.stop() is visible in this
# part of the notebook -- confirm the browser is shut down once scraping is done.
playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless = False)
page = await browser.new_page()

In [6]:
# DO NOT RUN ALL THE TIME! MIGHT UPSET THE SERVER...


votes = [] 
for index, row in df.iterrows():
    record_vote_number = row['record_vote_number']
    congress_no = row['congress_no']
    session = row['session']
    await page.goto(f"https://www.senate.gov/legislative/LIS/roll_call_votes/vote{congress_no}{session}/vote_{congress_no}_{session}_00{record_vote_number}.htm")

    html = await page.content()

    doc = BeautifulSoup(html)

    nominations = {}

    nominations["vote_number"] = doc.select("#secondary_col2 > div:nth-child(7) > div:nth-child(2)")

    results = doc.select("#secondary_col2 > div:nth-child(14) > span ")

    nomination = []
    for row in results:
        nomination.append(row.text)

    nomination1 = [item.split('\n') for item in nomination]

    nomination1_flat = [item for l in nomination1 for item in l]

    nomination2 = [item.split(',') for item in nomination1_flat]

    nominations["votes"] = nomination2

    votes.append(nominations)

In [11]:
# Display everything scraped above: one dict per row of df, each holding the
# 'vote_number' elements and the list of split 'votes' lines.
votes

[{'vote_number': [<div class="contenttext" style="float:left; min-width:200px; padding-bottom:10px;">
   <b>Vote Number:   </b>284</div>],
  'votes': [['Baldwin (D-WI)', ' Yea'],
   ['Barrasso (R-WY)', ' Nay'],
   ['Bennet (D-CO)', ' Yea'],
   ['Blackburn (R-TN)', ' Nay'],
   ['Blumenthal (D-CT)', ' Yea'],
   ['Blunt (R-MO)', ' Nay'],
   ['Booker (D-NJ)', ' Yea'],
   ['Boozman (R-AR)', ' Nay'],
   ['Braun (R-IN)', ' Nay'],
   ['Brown (D-OH)', ' Yea'],
   ['Burr (R-NC)', ' Not Voting'],
   ['Cantwell (D-WA)', ' Yea'],
   ['Capito (R-WV)', ' Yea'],
   ['Cardin (D-MD)', ' Yea'],
   ['Carper (D-DE)', ' Yea'],
   ['Casey (D-PA)', ' Yea'],
   ['Cassidy (R-LA)', ' Yea'],
   ['Collins (R-ME)', ' Yea'],
   ['Coons (D-DE)', ' Yea'],
   ['Cornyn (R-TX)', ' Not Voting'],
   ['Cortez Masto (D-NV)', ' Yea'],
   ['Cotton (R-AR)', ' Nay'],
   ['Cramer (R-ND)', ' Yea'],
   ['Crapo (R-ID)', ' Yea'],
   ['Cruz (R-TX)', ' Nay'],
   ['Daines (R-MT)', ' Nay'],
   ['Duckworth (D-IL)', ' Yea'],
   ['Durbin (D