In [None]:
%pip install openai librosa python-dotenv selenium soundfile torchaudio transformers webdriver-manager -qqq

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
import os

# Chrome launch flags (these two are typically needed when Chrome runs in a
# container or as root -- NOTE(review): confirm they are required here).
options = webdriver.ChromeOptions()
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)

# webdriver-manager installs a chromedriver binary and returns its path,
# which is handed to Selenium through a Service object.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=options)

In [None]:
from tqdm import tqdm

# Upper bound on how many case pages are collected and downloaded.
NUM_ITEMS_TO_DOWNLOAD = 10

# CourtListener search URL (type=oa, ordered by score descending); the crawl
# appends "&page=N" to walk through the result pages.
BASE_URL = "https://www.courtlistener.com/?type=oa&q=&type=oa&order_by=score%20desc"

# Folder that receives the downloaded MP3 files; created if it is missing.
OUTPUT_DIR = "courtlistener_mp3s"
os.makedirs(OUTPUT_DIR, exist_ok=True)


In [None]:
# Walk the paginated search results and collect up to NUM_ITEMS_TO_DOWNLOAD
# unique case URLs, preserving the order in which they appear.
case_links = []
seen_links = set()  # O(1) membership test; `case_links` keeps the ordering
page = 1

# The implicit wait is a session-wide setting, so it is configured once rather
# than on every iteration. It makes find_elements poll for up to 5 seconds
# before returning an empty list.
driver.implicitly_wait(5)

while len(case_links) < NUM_ITEMS_TO_DOWNLOAD:
    paged_url = f"{BASE_URL}&page={page}"
    driver.get(paged_url)

    links_on_page = driver.find_elements(By.CSS_SELECTOR, "h3.bottom.serif a")
    # Read each href once and drop anchors that have none.
    hrefs_on_page = (link.get_attribute("href") for link in links_on_page)
    new_links = [href for href in hrefs_on_page if href]

    if not new_links:
        print(f"No more results found on page {page}.")
        break

    added_on_page = 0
    for link in new_links:
        if link not in seen_links:
            seen_links.add(link)
            case_links.append(link)
            added_on_page += 1
        if len(case_links) >= NUM_ITEMS_TO_DOWNLOAD:
            break

    # Guard against an endless loop: if a page only repeats links that were
    # already collected (for example the site keeps serving its last page for
    # out-of-range page numbers), further pages cannot make progress.
    if added_on_page == 0:
        print(f"No new results found on page {page}.")
        break

    page += 1

print(f"Collected {len(case_links)} case URLs.")


In [None]:
import pandas as pd

# Accumulates one dict per successfully processed case; the download loop
# appends to it and it is later converted into a DataFrame. Re-run this cell
# before re-running the download loop, otherwise rows are appended twice.
metadata = []

In [None]:
from urllib.parse import urlparse
import re
import requests
import time

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

# For every collected case page: locate the MP3 link, scrape the case
# metadata, download the audio into OUTPUT_DIR and record one metadata row.
# Any failure skips the case as a whole (no file, no row) and is reported
# through tqdm.write so the progress bar is not garbled.
for case_url in tqdm(case_links[:NUM_ITEMS_TO_DOWNLOAD], desc="Downloading MP3s", unit="file"):
    try:
        driver.get(case_url)

        # --- Extract MP3 URL first ---
        # `until` returns the located element, so no second lookup is needed.
        mp3_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.LINK_TEXT, "Download MP3"))
        )
        mp3_url = mp3_button.get_attribute("href")

        # --- Build filename from URL ---
        # The last path segment of the case URL is used as the slug.
        # NOTE(review): two cases sharing a slug would overwrite each other's
        # file; the transcription step rebuilds this exact name from the case
        # name, so the naming scheme must change in both places together.
        path_parts = urlparse(case_url).path.strip('/').split('/')
        case_slug = path_parts[-1]
        filename_base = re.sub(r'[^\w\-]', '', case_slug)
        filename = f"{filename_base}.mp3"
        filepath = os.path.join(OUTPUT_DIR, filename)

        # --- Extract metadata ---
        # Done before the download so that a page with missing fields does not
        # leave behind an MP3 that has no metadata row.
        # Court Case Name from slug
        court_case_name = case_slug.replace('-', ' ').title()

        # Court name
        # Find the h3 element *above* the "Date Argued" span
        date_argued_label = driver.find_element(By.XPATH, "//span[contains(text(), 'Date Argued:')]")
        court_name = date_argued_label.find_element(By.XPATH, "./ancestor::p/preceding-sibling::h3[1]").text.strip()

        # Date Argued
        date_text = driver.find_element(By.XPATH, "//span[contains(text(), 'Date Argued:')]/following-sibling::span").text.strip()

        # Duration
        duration = driver.find_element(By.XPATH, "//span[contains(text(), 'Duration:')]/following-sibling::span").text.strip()

        # Docket Number
        docket_number = driver.find_element(By.XPATH, "//span[contains(text(), 'Docket Number:')]/following-sibling::a").text.strip()

        # --- Download MP3 ---
        # A timeout keeps a stalled connection from hanging the notebook, and
        # raise_for_status prevents an HTTP error page being saved as audio.
        response = requests.get(mp3_url, timeout=60)
        response.raise_for_status()
        mp3_data = response.content

        # --- Save MP3 ---
        with open(filepath, "wb") as f:
            f.write(mp3_data)

        metadata.append({
            "Court Case Name": court_case_name,
            "Court": court_name,
            "Date Argued": date_text,
            "Duration": duration,
            "Docket Number": docket_number,
            "MP3 URL": mp3_url
        })

    except Exception as e:
        # Best effort: report the failure and continue with the next case.
        tqdm.write(f"Failed to process {case_url}: {e}")


In [None]:
# Scraping is finished, so the browser session can be shut down.
driver.quit()

# One row per successfully processed case; show the first five rows.
df = pd.DataFrame(data=metadata)
df.head(n=5)


In [None]:
import os
import torch
import librosa
import soundfile as sf
from transformers import pipeline
from dotenv import load_dotenv

# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# os.environ only accepts strings, so assigning the None that os.getenv
# returns for a missing key would raise TypeError. Export the token only when
# HF_API_KEY is actually set.
hf_token = os.getenv("HF_API_KEY")
if hf_token:
    # Original variable name, kept for backward compatibility.
    os.environ["HUGGINGFACE_HUB_TOKEN"] = hf_token
    # HF_TOKEN is the variable huggingface_hub documents for authentication;
    # setdefault avoids overriding a value the user has already exported.
    os.environ.setdefault("HF_TOKEN", hf_token)

# Pipeline device index: first CUDA GPU when available, otherwise -1 (CPU).
device = 0 if torch.cuda.is_available() else -1


In [None]:
def split_audio(audio_data, sr, chunk_duration=30):
    """Cut `audio_data` into consecutive slices of `chunk_duration` seconds.

    Args:
        audio_data: Sliceable sequence of samples (anything supporting
            ``len`` and ``[start:end]``).
        sr: Sample rate, in samples per second.
        chunk_duration: Length of each slice in seconds.

    Returns:
        A list of slices in order. The last slice is shorter when the input
        length is not a multiple of the chunk size; empty input gives ``[]``.
    """
    samples_per_chunk = int(chunk_duration * sr)
    return [
        audio_data[offset:offset + samples_per_chunk]
        for offset in range(0, len(audio_data), samples_per_chunk)
    ]

def transcribe_long_audio(audio_path, transcriber, chunk_duration=30):
    """Transcribe a long audio file by feeding it to `transcriber` in chunks.

    The file is loaded at its native sample rate, split into pieces of
    `chunk_duration` seconds, and each piece is written to a temporary WAV
    file that is passed to `transcriber`. The chunk texts are joined with
    single spaces.

    Args:
        audio_path: Path of the audio file to transcribe.
        transcriber: Callable accepting a WAV path plus a `generate_kwargs`
            keyword and returning a mapping with a "text" entry.
        chunk_duration: Chunk length in seconds.

    Returns:
        The joined transcription, or the string "Error: <message>" if any
        step fails. Failures are reported in-band and never raised.
    """
    # Local import so the function does not depend on an import in another cell.
    import tempfile

    try:
        audio_data, sr = librosa.load(audio_path, sr=None)
        chunks = split_audio(audio_data, sr, chunk_duration)
        transcriptions = []

        # Chunks go into a private temporary directory instead of fixed names
        # in the working directory. The directory is removed even when writing
        # or transcribing raises, so no chunk files are left behind.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for i, chunk in enumerate(chunks):
                chunk_path = os.path.join(tmp_dir, f"temp_chunk_{i}.wav")
                sf.write(chunk_path, chunk, sr)
                result = transcriber(chunk_path, generate_kwargs={"language": "en"})
                transcriptions.append(result["text"])
                # Delete eagerly so disk usage stays at one chunk at a time.
                os.remove(chunk_path)

        return " ".join(transcriptions)
    except Exception as e:
        return f"Error: {e}"



In [None]:
from transformers import pipeline
import warnings

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

# Pipeline device index: -1 selects the CPU, 0 the first CUDA GPU.
device = -1 if not torch.cuda.is_available() else 0

# Speech-recognition pipeline backed by the openai/whisper-base checkpoint.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
    device=device,
)


In [None]:
# Collect one transcript (or an error string) per metadata row, in row order.
transcriptions = []

for row_idx, row in tqdm(df.iterrows(), total=len(df), desc="Transcribing MP3s via HF"):
    try:
        # Rebuild the MP3 filename from the case name: spaces become hyphens,
        # the text is lower-cased, and anything other than word characters
        # and hyphens is stripped.
        case_slug = row["Court Case Name"].replace(" ", "-").lower()
        filepath = os.path.join(OUTPUT_DIR, re.sub(r"[^\w\-]", "", case_slug) + ".mp3")
        transcription = transcribe_long_audio(filepath, transcriber)
    except Exception as e:
        transcription = f"ERROR: {e}"

    transcriptions.append(transcription)

# Attach the results as a new column aligned with the existing rows.
df["Transcription"] = transcriptions
