Docket numbers were retrieved manually from the list maintained by findlaw.com. Only cases decided in 2018 or later have their documents hosted on the SCOTUS website. Briefs from the 2012 term onward are stored on scotusblog.com, although some of them are in a very messy format (e.g. a picture of a scanned document). Earlier briefs used to be hosted by the ABA, but they have since been taken down.

In [1]:
# Text file holding the docket list. The parsing cell below expects
# "<year> YYYY" marker lines and entry lines containing "No. <docket number>".
docket_nos_path = "./docket_nos_valid.txt"

In [2]:
import pandas as pd
import re

# Patterns for the two kinds of meaningful lines in the docket list file:
# a year marker (the year the decision was released) and a docket entry.
year_line_re = re.compile(r'^<year>\s+(\d{4})$')
docket_re = re.compile(r'No\.\s+([\w-]+)')

# Empty frame that fixes the full column layout; only docket_number and year
# are populated in this cell.
df = pd.DataFrame(columns=['docket_number', 'year', 'url_list', 'caption', 'brief_type', 'subject', 'filing_date'])

current_year = None
rows = []

with open(docket_nos_path, 'r') as listing:
    for raw_line in listing:
        entry = raw_line.strip()

        # A year marker applies to every docket entry that follows it.
        year_hit = year_line_re.match(entry)
        if year_hit is not None:
            current_year = year_hit.group(1)
            continue

        # Anything that is neither a year marker nor a docket entry is ignored.
        docket_hit = docket_re.search(entry)
        if docket_hit is None:
            continue

        docket_number = docket_hit.group(1)
        rows.append({'docket_number': docket_number, 'year': current_year})

# Append the parsed entries to the empty frame so the remaining columns exist
# (as NaN) for later cells to fill in.
df = pd.concat([df, pd.DataFrame(rows)], ignore_index=True)

print(df.head())


  docket_number  year url_list caption brief_type subject filing_date
0         17-71  2018      NaN     NaN        NaN     NaN         NaN
1       17-1676  2018      NaN     NaN        NaN     NaN         NaN
2       18-5181  2018      NaN     NaN        NaN     NaN         NaN
3        17-587  2018      NaN     NaN        NaN     NaN         NaN
4       17-7894  2018      NaN     NaN        NaN     NaN         NaN


In [3]:
# Number of docket entries parsed from the list file.
print(len(df))

412


In [4]:
from webdriver_manager.chrome import ChromeDriverManager

# install() returns the local path of the ChromeDriver binary; keep it in
# driver_path (used when the WebDriver service is created) and print it.
driver_path = ChromeDriverManager().install()
print(driver_path)


/Users/jessewoo/.wdm/drivers/chromedriver/mac64/129.0.6668.89/chromedriver-mac-arm64/chromedriver


In [5]:
import pandas as pd
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# Set up Chrome options
options = Options()
options.add_argument("--headless")  # Run browser in headless mode

# driver_path is the ChromeDriver location returned by ChromeDriverManager().install()

# Set up WebDriver: one shared browser session used by the scraping functions
service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)
page_num = 0  # NOTE(review): the scraping loop assigns its own page_num; this initial value looks unused

def process_archive_page(page_num):
    """Scrape one page of the DOJ OSG Supreme Court briefs listing.

    Loads ``https://www.justice.gov/osg/supreme-court-briefs?page=<page_num>``
    with the module-level Selenium ``driver`` and converts every table row
    that has exactly seven ``<td>`` cells into a dictionary.

    Args:
        page_num: Value substituted into the ``page`` query parameter.

    Returns:
        A list of dicts with the keys ``year``, ``docket_number``,
        ``caption``, ``file_url``, ``caption_url``, ``brief_type``,
        ``subject`` and ``filing_date``. ``file_url`` is the href of the
        "main document" link in the fourth cell, or ``""`` when there is
        none; ``caption_url`` is the href of the first link in the caption
        cell, or ``""`` when the caption is not linked; ``subject`` is always
        ``""`` here. The list is empty when the page cannot be loaded or the
        rows cannot be located. Errors are printed, never raised.
    """
    url = f"https://www.justice.gov/osg/supreme-court-briefs?page={page_num}"
    row_data_list = []  # List to hold row data

    try:
        driver.get(url)
    except Exception as e:
        print(f"Error loading page {url}: {str(e)}")
        return row_data_list  # Return empty list in case of an error

    try:
        # Find all table rows
        rows = driver.find_elements(By.TAG_NAME, 'tr')
    except Exception as e:
        print(f"Error finding table rows: {str(e)}")
        return row_data_list

    for row in rows:
        try:
            tds = row.find_elements(By.TAG_NAME, 'td')
            if len(tds) != 7:
                # Rows without exactly 7 <td> cells are not data rows
                continue

            # Only the first whitespace-separated token of the first cell is
            # kept; an empty cell yields '' instead of discarding the row.
            year_field = tds[0].text.split()
            year = year_field[0] if year_field else ''

            docket_number = tds[1].text.strip()
            caption = tds[2].text.strip()

            # find_elements returns an empty list rather than raising, so a
            # caption without a link no longer drops the whole row.
            caption_links = tds[2].find_elements(By.TAG_NAME, 'a')
            caption_url = caption_links[0].get_attribute('href') if caption_links else ''

            # Locate the "main document" link in the fourth cell; when several
            # links carry that text the last one wins.
            file_url = ""
            for link in tds[3].find_elements(By.TAG_NAME, 'a'):
                if link.text.strip().lower() == 'main document':
                    file_url = link.get_attribute('href')

            brief_type = tds[4].text.strip()
            filing_date = tds[6].text.strip()

            row_data_list.append({
                'year': year,
                'docket_number': docket_number,
                'caption': caption,
                'file_url': file_url,
                'caption_url': caption_url,
                'brief_type': brief_type,
                # The listing's subject cell (tds[5]) is not used; the field
                # is left blank at this stage.
                'subject': '',
                'filing_date': filing_date
            })
        except Exception as e:
            # Best-effort scraping: report the bad row and keep going
            print(f"Error processing row: {str(e)}")
            continue

    return row_data_list




In [6]:
# Pool of row dictionaries collected from every listing page processed below.
all_data = []

# As written only listing page 0 is fetched; widen the range to cover more pages.
for page_num in range(0, 1):  # Adjust range for number of pages you want to process
    page_data = process_archive_page(page_num)
    all_data.extend(page_data)

# Once all pages are processed, convert the list of dictionaries to a dataframe.
# NOTE: this rebinds df, replacing the docket-number dataframe built earlier.
df = pd.DataFrame(all_data)

In [7]:
# Shut down the headless Chrome session used for the archive listing scrape.
# NOTE(review): download_files() below calls driver.get(), so a live driver
# has to exist again before that function is run.
driver.quit()

In [8]:
# Preview the first rows of the scraped archive listing.
df.head()

Unnamed: 0,year,docket_number,caption,file_url,caption_url,brief_type,subject,filing_date
0,2024,23-624,United States v. Trump,,https://www.justice.gov/osg/brief/united-state...,Petition for Writ of Certiorari,,"Monday, December 11, 2023"
1,2024,23-175,City of Grants Pass v. Johnson,,https://www.justice.gov/osg/brief/city-grants-...,Merits Stage Amicus Brief,,"Monday, March 4, 2024"
2,2024,23-708,Mark v. Republic of Sudan,,https://www.justice.gov/osg/brief/mark-v-repub...,Petition Stage Reply Brief,,"Monday, March 25, 2024"
3,2024,23-799,Magellan Tech. v. FDA,,https://www.justice.gov/osg/brief/magellan-tec...,Petition Stage Response,,"Monday, March 25, 2024"
4,2024,23-367,Starbucks Corp. v. McKinney,,https://www.justice.gov/osg/brief/starbucks-co...,Merits Stage Brief,,"Friday, March 22, 2024"


In [9]:
import os
import requests
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.common.by import By
import time


def download_files(df_row):
    """Visit a brief's detail page, record its topics and download its PDF.

    Uses the module-level Selenium ``driver`` to open ``df_row['caption_url']``,
    reads the text of every ``field__item`` element inside the ``node-topics``
    element, and fetches the link carried by the ``downloadable-src`` element
    into ``./osg_briefs``.

    Args:
        df_row: Row (e.g. a pandas Series from ``DataFrame.iterrows``) with a
            ``caption_url`` entry. It is modified in place: ``subject`` is set
            to the list of topic strings and ``file_url`` to the PDF URL.

    Returns:
        The same ``df_row`` object. Any error is printed, not raised, and the
        row is returned with whatever fields were updated before the failure.
    """
    try:
        # Get the caption_url from the dataframe row
        caption_url = df_row['caption_url']

        # Navigate to the caption_url page
        driver.get(caption_url)

        # Crude fixed wait for the page to load (an explicit wait could replace it)
        time.sleep(1)

        # Scrape all 'field__item' elements under the 'node-topics' element
        topics_div = driver.find_element(By.CLASS_NAME, 'node-topics')
        field_items = topics_div.find_elements(By.CLASS_NAME, 'field__item')
        df_row['subject'] = [item.text for item in field_items]

        # Find the downloadable PDF link
        pdf_element = driver.find_element(By.CLASS_NAME, 'downloadable-src')
        pdf_url = pdf_element.get_attribute('href')  # Get the PDF URL

        # The file is stored under the last path segment of the URL
        output_dir = './osg_briefs'
        pdf_filename = os.path.join(output_dir, pdf_url.split('/')[-1])

        # Create the target folder on first use instead of failing in open()
        os.makedirs(output_dir, exist_ok=True)

        # requests has no default timeout; without one a stalled server
        # would hang the whole run.
        response = requests.get(pdf_url, timeout=60)
        if response.status_code == 200:
            with open(pdf_filename, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {pdf_filename}")
        else:
            # Report the failure instead of skipping it silently
            print(f"Failed to download {pdf_url}. Status code: {response.status_code}")

        # Update the 'file_url' field with the PDF URL (set whether or not the
        # download returned 200, as long as no exception occurred)
        df_row['file_url'] = pdf_url

    except Exception as e:
        # Best-effort: report and hand back the (possibly partially updated) row
        print(f"Error processing file for row: {str(e)}")

    return df_row

# Example usage: Apply this function to each row in your dataframe
# Assuming you have a dataframe called 'df' with a column 'caption_url'


In [23]:
# Iterate through each row in the dataframe and update it with subjects and file URLs.
# download_files() returns the row object it was given with 'subject' and
# 'file_url' updated; the result is written back into df by index label
# (pandas documents that rows yielded by iterrows may be copies, so the
# explicit write-back is what makes the changes stick).
for index, row in df.iterrows():
    df.loc[index] = download_files(row)

# Close the browser after the process is done
# driver.quit()

# Save the updated dataframe (optional)
# df.to_csv('updated_dataframe.csv', index=False)

Downloaded: ./osg_briefs/23-624_u.s_v._trump_pet.pdf
Downloaded: ./osg_briefs/23-175npunitedstates.pdf
Downloaded: ./osg_briefs/mark_v._sudan_23-708.pdf
Downloaded: ./osg_briefs/magellan_response.pdf
Downloaded: ./osg_briefs/23-367bsunitedstates.pdf
Downloaded: ./osg_briefs/23-411tsunitedstates.pdf
Downloaded: ./osg_briefs/23-235_us_intervention_opp_-_final.pdf
Downloaded: ./osg_briefs/23-189_debique_opp_-_final.pdf
Downloaded: ./osg_briefs/22-976tsunitedstates.pdf
Downloaded: ./osg_briefs/23-235tsunitedstates_0.pdf
Downloaded: ./osg_briefs/23-717_alvarado_opp._-_final.pdf
Downloaded: ./osg_briefs/22-1025npacunitedstates.pdf
Downloaded: ./osg_briefs/lissack_opp.pdf
Downloaded: ./osg_briefs/city_of_sf_041224.4_final.pdf
Downloaded: ./osg_briefs/23-405_-_ross_v._ftc.pdf
Downloaded: ./osg_briefs/22-1078bsunitedstates.pdf
Downloaded: ./osg_briefs/23-380_military-veterans_advocacy_opp_final.pdf
Downloaded: ./osg_briefs/22-1178rb_fbi_v._fikre.pdf
Downloaded: ./osg_briefs/22-1008bsunitedstate

In [20]:
df.head()

Unnamed: 0,year,docket_number,caption,file_url,caption_url,brief_type,subject,filing_date
0,2024,23-624,United States v. Trump,,https://www.justice.gov/osg/brief/united-state...,Petition for Writ of Certiorari,[CRIMINAL (INCLUDING HABEAS/2255)],"Monday, December 11, 2023"
1,2024,23-175,City of Grants Pass v. Johnson,,https://www.justice.gov/osg/brief/city-grants-...,Merits Stage Amicus Brief,[CRIMINAL (INCLUDING HABEAS/2255)],"Monday, March 4, 2024"
2,2024,23-708,Mark v. Republic of Sudan,,https://www.justice.gov/osg/brief/mark-v-repub...,Petition Stage Reply Brief,"[CIVIL DIV. I: GENERAL (E.G., DOT, FEC, FOIA, ...","Monday, March 25, 2024"
3,2024,23-799,Magellan Tech. v. FDA,,https://www.justice.gov/osg/brief/magellan-tec...,Petition Stage Response,"[ADMINISTRATIVE LAW, CIVIL DIV. II: HHS (MEDIC...","Monday, March 25, 2024"
4,2024,23-367,Starbucks Corp. v. McKinney,,https://www.justice.gov/osg/brief/starbucks-co...,Merits Stage Brief,"[CIVIL DIV. I: GENERAL (E.G., DOT, FEC, FOIA, ...","Friday, March 22, 2024"


In [5]:
import time
import random

# List of docket numbers to process
docket_numbers = df['docket_number'].tolist()

# Dictionary to hold docket numbers and their associated PDF URLs.
# Keyed by docket number, so rows that repeat a docket number share one entry.
docket_pdf_urls = {}

# NOTE(review): process_docket_page is not defined in the cells shown here;
# presumably it returns the list of PDF URLs for one docket number - confirm.
try:
    for idx, row in df.iterrows():
        docket_number = row['docket_number']
        pdf_urls = process_docket_page(docket_number)
        docket_pdf_urls[docket_number] = pdf_urls
        # Keep the same list on the dataframe row as well
        df.at[idx, 'url_list'] = pdf_urls

        time.sleep(random.uniform(1, 2))  # Hopefully avoid rate limits
finally:
    # Always release the browser, even if a docket page fails midway
    driver.quit()

In [6]:
# Number of distinct docket numbers for which a URL list was collected.
print(len(docket_pdf_urls))

410


In [7]:
import requests

import os

# Browser-like User-Agent sent with every PDF request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36',
}

# Create the output folder up front so open() below cannot fail on a missing directory
os.makedirs('./brief_pdfs', exist_ok=True)

# Running total of PDFs actually written to disk
num_briefs = 0

for docket_number, urls_list in docket_pdf_urls.items():
    # Per-docket counter used in the file name; only advances on a successful download
    num = 1
    # Tolerate a docket whose URL list is missing (None) as well as empty
    for url in urls_list or []:
        try:
            # The whole body is read via .content, so streaming is not requested:
            # with stream=True an unread (e.g. non-200) response is never released.
            # The timeout keeps one stalled server from hanging the run.
            response = requests.get(url, headers=headers, timeout=60)
        except requests.RequestException as e:
            # One bad URL must not abort a run covering thousands of briefs
            print(f"Failed to download PDF for docket number {docket_number}. Error: {e}")
            continue

        if response.status_code == 200:
            with open(f'./brief_pdfs/Docket{docket_number}_Brief{num:03}.pdf', 'wb') as f:
                f.write(response.content)

            num += 1
            num_briefs += 1
            # Pause between successful downloads to stay under rate limits
            time.sleep(random.uniform(1, 2))
        else:
            print(f"Failed to download PDF for docket number {docket_number}. Status code: {response.status_code}")


In [8]:
# num_briefs counts only the PDFs that were fetched with status 200 and saved.
print(f"You scraped {num_briefs} briefs.")

You scraped 4377 briefs.


In [1]:
# Persist the dataframe as a JSON array of row objects. This must run in the
# same kernel session that built df; the recorded output below is a NameError
# from executing it in a fresh session.
df.to_json('./scraped_briefs.json', orient='records')

NameError: name 'df' is not defined

This is for DOJ's archive of OSG briefs, which would serve as a backup. It is probably better to download directly from supremecourt.gov, though.