In [1]:
# Imports

import requests
from bs4 import BeautifulSoup
import re
from playwright.async_api import async_playwright
import pandas as pd
import time
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
#Starting up Playwright

url_hear = 'https://www.judiciary.senate.gov/hearings'

playwright = await async_playwright().start()
browser = await playwright.chromium.launch(headless=True)
page = await browser.new_page()

await page.goto(url_hear)

<Response url='https://www.judiciary.senate.gov/hearings' request=<Request url='https://www.judiciary.senate.gov/hearings' method='GET'>>

In [3]:
# Going through each nomination meeting (on the first page) and grabbing the district judges and orgs who subitted briefs in support of them.
# Strangely, when I have Playwright .click() on each link, it's as if I right clicked. Never encountered that before.

meetings = await page.query_selector_all('a[href*="meetings"]')
links = [await meeting.get_attribute('href') for meeting in meetings]
data = []

for i in range(len(links)):
    link = links[i]
    if link.find('nominations') != -1:
        full_link = 'https://www.judiciary.senate.gov' + link
        await page.goto(full_link)
        time.sleep(1)
    
        names = await page.query_selector_all('li[class=" col-xs-12 "] span.fn')
        positions = await page.query_selector_all('li[class=" col-xs-12 "] div.title')
        docs = await page.query_selector_all('li[class="acrobat"] a')

        for j in range(len(positions)):
            position = str(await positions[j].inner_text()).strip()
            name = str(await names[j].inner_text()).strip()
            last_name = name.split(' ')[-1]
            if position.find('District Judge') != -1:
                temp_dict = {
                    'name': name,
                    'position': position,
                    'docs': []
                }
                for k in range(len(docs)):
                    doc = str(await docs[k].inner_text()).strip()
                    if doc.find(last_name) != -1:
                        temp_dict['docs'].append(doc)

            data.append(temp_dict)

        await page.go_back()
        

In [4]:
# Going through all the pages! Well, all the pages that have suporting documents posted, which I manually checked.



all_data = []

for i in range(1, 9):
    await page.goto(url_hear + '?PageNum_rs=' + str(i))
    time.sleep(1)
    meetings = await page.query_selector_all('a[href*="meetings"]')
    links = [await meeting.get_attribute('href') for meeting in meetings]
    data = []

    for j in range(len(links)):
        link = links[j]
        if link.find('nominations') != -1:
            full_link = 'https://www.judiciary.senate.gov' + link
            await page.goto(full_link)
            time.sleep(1)

            names = await page.query_selector_all('li[class=" col-xs-12 "] span.fn')
            positions = await page.query_selector_all('li[class=" col-xs-12 "] div.title')
            docs = await page.query_selector_all('li[class="acrobat"] a')
            dates = await page.query_selector_all('tr[class="date "] td')

            try:
                date = await dates[1].inner_text()
            except:
                dates = await page.query_selector_all('tr[class="date"] td')
                date = await dates[1].inner_text()

            beginning = 'https://www.judiciary.senate.gov/'
            end = '&download=1'

            for k in range(len(positions)):
                position = str(await positions[k].inner_text()).strip()
                temp_dict={}
                
                if position.find('District Judge') != -1:
                    name = str(await names[k].inner_text()).strip()
                    last_name = name.split(' ')[-1]
                    temp_dict = {
                        'name': name,
                        'position': position.lower().replace('to be ',''),
                        'docs': [],
                        'doc_links': [],
                        'date': date
                    }
                    
                    # This is because the statements/letters/briefs in support of someone are in the format "group support for [last name]"
                    for m in range(len(docs)):
                        doc = str(await docs[m].inner_text()).strip()
                        if (doc.find(last_name) != -1) & (doc.find('For') != -1):
                            temp_dict['docs'].append(doc)
                            link_doc = doc.lower().replace(' ','-')
                            temp_dict['doc_links'].append(beginning+link_doc+end)
                    
                    if len(temp_dict['docs']) != 0:
                        data.append(temp_dict)

            await page.go_back()
    
    all_data.extend(data)

Just a note, it's bad that I hardcoded looping through the first 8 pages. Making this auto-updating would require fixing that.

In [5]:
# Separating organizations from document names

orgs = []
loose_orgs = []

for i in range(len(all_data)):
    small_orgs = []
    for j in range(len(all_data[i]['docs'])):
        if all_data[i]['docs'][j].find('Letter') != -1:
            org = all_data[i]['docs'][j].split('Letter')[0].strip()
            small_orgs.append(org)
            loose_orgs.append(org)
        elif all_data[i]['docs'][j].find('Support') != -1:
            org = all_data[i]['docs'][j].split('Support')[0].strip()
            small_orgs.append(org)
            loose_orgs.append(org)
        elif all_data[i]['docs'][j].find('Statement') != -1:
            org = all_data[i]['docs'][j].split('Statement')[0].strip()
            small_orgs.append(org)
            loose_orgs.append(org)
        else:
            pass
        
    orgs.append(small_orgs)

In [6]:
# Separating districts from the titles

districts = []

for i in range(len(all_data)):
    district = str(all_data[i]['position']).lower()
    district = district.split('for the')[-1]
    districts.append(district)


In [7]:
# Making and prepping a dataframe

df = pd.DataFrame(all_data)

In [8]:
df['orgs'] = orgs
df['district'] = districts


In [9]:

pd.to_datetime(df['date'])

0    2022-12-13
1    2022-12-13
2    2022-12-13
3    2022-12-13
4    2022-11-30
5    2022-11-30
6    2022-11-30
7    2022-11-15
8    2022-11-15
9    2022-11-15
10   2022-11-15
11   2022-11-15
12   2022-10-12
13   2022-10-12
14   2022-10-12
15   2022-10-12
16   2022-10-12
17   2022-09-21
18   2022-09-21
19   2022-09-21
20   2022-09-07
21   2022-09-07
22   2022-09-07
23   2022-09-07
24   2022-07-27
25   2022-07-27
26   2022-07-27
27   2022-06-22
28   2022-06-22
29   2022-05-25
30   2022-05-25
31   2022-05-11
32   2022-05-11
33   2022-05-11
34   2022-04-27
35   2022-04-27
36   2022-03-02
37   2022-03-02
38   2022-03-02
39   2022-02-16
40   2022-02-16
41   2022-02-16
42   2022-02-01
43   2022-02-01
44   2022-02-01
45   2022-01-12
46   2022-01-12
47   2022-01-12
48   2021-12-15
49   2021-12-15
50   2021-12-15
51   2021-12-15
52   2021-12-15
53   2021-12-01
54   2021-12-01
55   2021-12-01
56   2021-12-01
57   2021-11-17
58   2021-11-17
59   2021-11-17
60   2021-11-03
61   2021-11-03
62   202

In [10]:
df.head()

Unnamed: 0,name,position,docs,doc_links,date,orgs,district
0,Gordon P. Gallagher,united states district judge for the district of colorado,"[Colorado DA Rubinstein Support For Gallagher, Former MCBA Presidents And Attorneys Support For Gallagher, Former CO US Attorneys Support For Gallagher, Grand Junction Legal Community Support For Gallagher, HAP And Mesa County Libraries Support For Gallagher, Mesa County DAs And Sheriffs Support For Gallagher, Riverside Educational Center Support For Gallagher, Ute Mountain Ute Tribe Support For Gallagher, Western CO Attorneys Support For Gallagher]","[https://www.judiciary.senate.gov/colorado-da-rubinstein-support-for-gallagher&download=1, https://www.judiciary.senate.gov/former-mcba-presidents-and-attorneys-support-for-gallagher&download=1, https://www.judiciary.senate.gov/former-co-us-attorneys-support-for-gallagher&download=1, https://www.judiciary.senate.gov/grand-junction-legal-community-support-for-gallagher&download=1, https://www.judiciary.senate.gov/hap-and-mesa-county-libraries-support-for-gallagher&download=1, https://www.judiciary.senate.gov/mesa-county-das-and-sheriffs-support-for-gallagher&download=1, https://www.judiciary.senate.gov/riverside-educational-center-support-for-gallagher&download=1, https://www.judiciary.senate.gov/ute-mountain-ute-tribe-support-for-gallagher&download=1, https://www.judiciary.senate.gov/western-co-attorneys-support-for-gallagher&download=1]","Tuesday, December 13, 2022","[Colorado DA Rubinstein, Former MCBA Presidents And Attorneys, Former CO US Attorneys, Grand Junction Legal Community, HAP And Mesa County Libraries, Mesa County DAs And Sheriffs, Riverside Educational Center, Ute Mountain Ute Tribe, Western CO Attorneys]",district of colorado
1,P. Casey Pitts,united states district judge for the northern district of california,"[Former AAG Eric Dreiband Support For Pitts, Stephen Cannon Support For Pitts, National Education Association Support For Pitts, William Baude Support For Pitts, FCC Deputy GC Boizelle Support For Pitts]","[https://www.judiciary.senate.gov/former-aag-eric-dreiband-support-for-pitts&download=1, https://www.judiciary.senate.gov/stephen-cannon-support-for-pitts&download=1, https://www.judiciary.senate.gov/national-education-association-support-for-pitts&download=1, https://www.judiciary.senate.gov/william-baude-support-for-pitts&download=1, https://www.judiciary.senate.gov/fcc-deputy-gc-boizelle-support-for-pitts&download=1]","Tuesday, December 13, 2022","[Former AAG Eric Dreiband, Stephen Cannon, National Education Association, William Baude, FCC Deputy GC Boizelle]",northern district of california
2,Andrew G. Schopler,united states district judge for the southern district of california,"[Major General Mark Malanka Support For Schopler, Judge Irma Gonzalez Support For Schopler, Former Law Clerks Letter Of Support For Schopler, Harvard Law School Classmates Letter Of Support For Schopler]","[https://www.judiciary.senate.gov/major-general-mark-malanka-support-for-schopler&download=1, https://www.judiciary.senate.gov/judge-irma-gonzalez-support-for-schopler&download=1, https://www.judiciary.senate.gov/former-law-clerks-letter-of-support-for-schopler&download=1, https://www.judiciary.senate.gov/harvard-law-school-classmates-letter-of-support-for-schopler&download=1]","Tuesday, December 13, 2022","[Major General Mark Malanka, Judge Irma Gonzalez, Former Law Clerks, Harvard Law School Classmates]",southern district of california
3,Arun Subramanian,united states district judge for the southern district of new york,"[Former SCOTUS Clerks Support For Subramanian, SABANY/AABANY/SABANA Support For Subramanian, Colleagues At Susman Godfrey Support For Subramanian, Columbia Law Review Board Support For Subramanian, Co-Counsel Eric Havian Support For Subramanian, Columbia Law Review Support For Subramanian]","[https://www.judiciary.senate.gov/former-scotus-clerks-support-for-subramanian&download=1, https://www.judiciary.senate.gov/sabany/aabany/sabana-support-for-subramanian&download=1, https://www.judiciary.senate.gov/colleagues-at-susman-godfrey-support-for-subramanian&download=1, https://www.judiciary.senate.gov/columbia-law-review-board-support-for-subramanian&download=1, https://www.judiciary.senate.gov/co-counsel-eric-havian-support-for-subramanian&download=1, https://www.judiciary.senate.gov/columbia-law-review-support-for-subramanian&download=1]","Tuesday, December 13, 2022","[Former SCOTUS Clerks, SABANY/AABANY/SABANA, Colleagues At Susman Godfrey, Columbia Law Review Board, Co-Counsel Eric Havian, Columbia Law Review]",southern district of new york
4,Jonathan James Canada Grey,united states district judge for the eastern district of michigan,"[Assistant USA And Retired Navy JAG Support For Grey, DEA Assistant Special Agent Support For Grey, Former USA Barbara McQuade Support For Grey, Former USA David DeVillers Support For Grey, Retired DHS Special Agent Support For Grey]","[https://www.judiciary.senate.gov/assistant-usa-and-retired-navy-jag-support-for-grey&download=1, https://www.judiciary.senate.gov/dea-assistant-special-agent-support-for-grey&download=1, https://www.judiciary.senate.gov/former-usa-barbara-mcquade-support-for-grey&download=1, https://www.judiciary.senate.gov/former-usa-david-devillers-support-for-grey&download=1, https://www.judiciary.senate.gov/retired-dhs-special-agent-support-for-grey&download=1]","Wednesday, November 30, 2022","[Assistant USA And Retired Navy JAG, DEA Assistant Special Agent, Former USA Barbara McQuade, Former USA David DeVillers, Retired DHS Special Agent]",eastern district of michigan


In [11]:
# Looks alright? I'm going to save it as a csv so avoid scraping it again a bunch of times

df.to_csv('../data/judiciary.csv', index=False)