# Audio transcription with OpenAI Whisper large-v3

## Import libraries
Install dependencies

In [1]:
%%capture
# NOTE: %%capture hides all pip output, including build failures (flash-attn
# compiles from source); remove it temporarily when debugging an install.
# `%pip` installs into the environment of the running kernel, which `!pip`
# does not guarantee.
%pip install --quiet --upgrade pip
# The extras specifier is quoted so the shell cannot treat `[audio]` as a glob.
%pip install --quiet --upgrade "git+https://github.com/huggingface/transformers.git" accelerate "datasets[audio]"
%pip install --quiet flash-attn --no-build-isolation

Import libraries

In [18]:
import ast
import io
import os
import re

import pandas as pd
import requests
import torch
from google.colab import files
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

## Device setup
Get the available device

In [3]:
# Pick the best available accelerator: CUDA first, then Apple-silicon MPS,
# otherwise fall back to the CPU. `torch.backends.mps.is_available()` is the
# documented availability check and exists on torch builds that lack
# `torch.mps.is_available`.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Half precision on accelerators, full precision on CPU.
torch_dtype = torch.float32 if device == "cpu" else torch.float16
print(f"Device: {device}")

Device: cuda


## Data
Upload the Project Oyez CSV and read it into a dataframe

In [4]:
# Prompt for a file upload; `uploaded` maps each uploaded file name to its bytes.
uploaded = files.upload()
data_path = 'oyez_data.csv'

# The uploaded file may be stored under a different name (presumably Colab
# renames re-uploads to e.g. "oyez_data (1).csv" -- verify). If exactly one
# file was uploaded, use it; otherwise fail with an actionable message
# instead of a bare KeyError.
uploaded_name = data_path
if uploaded_name not in uploaded:
    if len(uploaded) != 1:
        raise KeyError(
            f"Expected an uploaded file named {data_path!r}, got: {sorted(uploaded)}"
        )
    uploaded_name = next(iter(uploaded))

df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))
df.head()

Saving oyez_data.csv to oyez_data.csv


Unnamed: 0,id,name,facts_of_the_case,question,conclusion,description,audio_links
0,54829,Arizona v. California,"In 1952, Arizona invoked the U.S. Supreme Cour...",Are the Quechan Tribe and the United States cl...,No. In a opinion deliver by Justice Ruth Bader...,Supplemental decree approved.,['https://api.oyez.org/case_media/oral_argumen...
1,54856,United States v. Mead Corporation,Under the Harmonized Tariff Schedule of the Un...,Does a tariff classification ruling by the Uni...,No. In an 8-1 opinion delivered by Justice Dav...,,['https://api.oyez.org/case_media/oral_argumen...
2,54857,"Cooper Industries, Inc. v. Leatherman Tool Gro...","Leatherman Tool Group, Inc., manufactures a mu...",Did the Court of Appeals review the constituti...,No. In an 8-1 opinion delivered by Justice Joh...,,['https://api.oyez.org/case_media/oral_argumen...
3,54858,Lopez v. Davis,Congress has provided the Bureau of Prisons (B...,Does the Bureau of Prisons have the authority ...,Yes. In a 6-3 opinion delivered by Justice Rut...,,['https://api.oyez.org/case_media/oral_argumen...
4,54859,Egelhoff v. Egelhoff,"David A. Egelhoff designated his wife, Donna R...",Does the Employee Retirement Income Security A...,Yes. In a 7-2 opinion delivered by Justice Cla...,,['https://api.oyez.org/case_media/oral_argumen...


Filter out cases where there are no oral arguments

In [5]:
# Keep only the cases that have at least one audio link recorded.
df = df.dropna(subset=["audio_links"])

## Get transcripts
A helper function for fetching an audio clip from a media URL

In [11]:
def get_audio(url: str, timeout: float = 30.0) -> tuple[bool, bytes]:
    """Fetch the MP3 audio referenced by a media-metadata URL.

    The URL is expected to return a JSON object with an ``unavailable`` flag
    and a ``media_file`` list whose entries carry ``mime`` and ``href`` keys.
    The first entry with mime type ``audio/mpeg`` is downloaded.

    Args:
        url: Metadata endpoint to query.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        ``(True, mp3_bytes)`` on success. ``(False, None)`` when the metadata
        request does not return HTTP 200, the payload is empty or not a JSON
        object, the media is flagged unavailable, or no MP3 entry is listed.

    Raises:
        requests.RequestException: On network errors, or when the MP3
            download itself returns an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return False, None

    data = response.json()
    # An empty or non-object payload carries no media information.
    if not isinstance(data, dict) or not data:
        return False, None

    # A missing flag is treated as "available" rather than raising KeyError.
    if data.get("unavailable"):
        return False, None

    media_files = data.get("media_file")
    if not media_files:
        return False, None

    # Entries that are not dicts (e.g. null placeholders) are skipped.
    mp3_url = next(
        (
            entry.get("href")
            for entry in media_files
            if isinstance(entry, dict) and entry.get("mime") == "audio/mpeg"
        ),
        None,
    )
    # Without this guard, requests.get(None) would raise on media that only
    # offers non-MP3 formats.
    if not mp3_url:
        return False, None

    response = requests.get(mp3_url, timeout=timeout)
    response.raise_for_status()

    return True, response.content


Get an example audio clip

In [12]:
# Single known case used to check fetching and transcription end to end.
name = "Cooper Industries, Inc. v. Leatherman Tool Group, Inc."
audio_url = "https://api.oyez.org/case_media/oral_argument_audio/21358"

# `audio` holds the downloaded bytes and is reused by the transcription cell.
status, audio = get_audio(url=audio_url)
print(f"Audio fetch successful: {status}")

Audio fetch successful: True


## Model
Get OpenAI Whisper

In [13]:
# Checkpoint identifier on the Hugging Face Hub.
model_id = "openai/whisper-large-v3"

# Load the weights in the dtype chosen during device setup and move the
# model to that device (`.to` returns the module itself).
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
).to(device)

# The processor bundles the tokenizer and feature extractor for the checkpoint.
processor = AutoProcessor.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.90k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/340 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

Set up the pipeline for audio transcription

In [14]:
# Speech-recognition pipeline built on the model and processor loaded above.
pipe = pipeline(
    task="automatic-speech-recognition",
    # Model components
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    # Placement and precision
    device=device,
    torch_dtype=torch_dtype,
    # Long-form audio is split into 15-second chunks, processed 16 at a time
    chunk_length_s=15,
    batch_size=16,
    # Generation settings
    max_new_tokens=128,
    return_timestamps=True,
)

Device set to use cuda


Get an example transcript from the whisper model

In [15]:
# Transcribe the example clip, pinning the decoding language to English.
result = pipe(
    audio,
    generate_kwargs=dict(language="english"),
)

You have passed language=english, but also have set `forced_decoder_ids` to [[1, None], [2, 50360]] which creates a conflict. `forced_decoder_ids` will be ignored in favor of language=english.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [16]:
# Full transcript text of the example clip.
result["text"]



Get the oral arguments for each case

In [25]:
# Transcribe every audio link of every case. `oral_arguments` ends up with one
# entry per dataframe row: the case's transcripts joined by newlines, or None
# when nothing could be transcribed.
oral_arguments = []
# The CSV stores each link list as its string repr; parse it back into a list.
audio_links = df["audio_links"].apply(ast.literal_eval)
total_cases = len(audio_links)

for idx, case_links in enumerate(audio_links, start=1):

    case_arguments = []
    for link in case_links:
        # This run takes hours, so a single bad link (network failure, HTTP
        # error, undecodable audio) is skipped instead of aborting the loop.
        try:
            status, audio = get_audio(link)
            if not status: continue
            transcript = pipe(audio, generate_kwargs={"language": "english"})
        except (requests.RequestException, ValueError) as error:
            print(f"Skipping {link}: {error}")
            continue

        if transcript: case_arguments.append(transcript["text"])

    case_transcript = "\n".join(case_arguments) if case_arguments else None
    oral_arguments.append(case_transcript)

    # Report every 50 cases and once at the end. Folding the final report into
    # the loop avoids a NameError on empty input and a duplicate last line.
    if idx % 50 == 0 or idx == total_cases:
        print(f"Fetched transcript ({idx}/{total_cases})")

Fetched transcript (50/1735)
Fetched transcript (100/1735)
Fetched transcript (150/1735)
Fetched transcript (200/1735)
Fetched transcript (250/1735)
Fetched transcript (300/1735)
Fetched transcript (350/1735)
Fetched transcript (400/1735)
Fetched transcript (450/1735)
Fetched transcript (500/1735)
Fetched transcript (550/1735)
Fetched transcript (600/1735)
Fetched transcript (650/1735)
Fetched transcript (700/1735)
Fetched transcript (750/1735)
Fetched transcript (800/1735)
Fetched transcript (850/1735)
Fetched transcript (900/1735)
Fetched transcript (950/1735)
Fetched transcript (1000/1735)
Fetched transcript (1050/1735)
Fetched transcript (1100/1735)
Fetched transcript (1150/1735)
Fetched transcript (1200/1735)
Fetched transcript (1250/1735)
Fetched transcript (1300/1735)
Fetched transcript (1350/1735)
Fetched transcript (1400/1735)
Fetched transcript (1450/1735)
Fetched transcript (1500/1735)
Fetched transcript (1550/1735)
Fetched transcript (1600/1735)
Fetched transcript (1650/173

Add the oral arguments into the dataframe

In [26]:
# `df` is a filtered slice of the original frame, so item assignment can
# trigger SettingWithCopyWarning. `assign` returns a new frame with the column
# added; the list is matched to rows by position.
df = df.assign(oral_arguments=oral_arguments)

Filter out the rows without oral arguments

In [27]:
# Drop the cases for which no transcript could be produced.
df = df.dropna(subset=["oral_arguments"])

Save the dataframe locally

In [None]:
data_path = "./data/project_oyez_oral_arguments.csv"
# `to_csv` does not create missing parent directories, and a fresh runtime
# has no `./data` folder.
os.makedirs(os.path.dirname(data_path), exist_ok=True)
df.to_csv(data_path, index=False)

## Save the dataset into Hugging Face
Login to HuggingFace (run `huggingface-cli login` with HuggingFace client)

Convert `pd.DataFrame` into HuggingFace `Dataset`

In [None]:
from datasets import Dataset

# After the row filters above, the dataframe index is no longer a plain range.
# Without `preserve_index=False` it would be stored in the dataset as an extra
# `__index_level_0__` column.
dataset = Dataset.from_pandas(df, preserve_index=False)

Push to hub

In [None]:
username = "" # Your username here
dataset_name = "project_oyez_oral_arguments_2000-2024"

# Fail fast with an actionable message: an empty username would produce the
# invalid repository id "/<dataset_name>".
if not username.strip():
    raise ValueError("Set `username` to your Hugging Face account name before pushing.")

dataset.push_to_hub(f"{username}/{dataset_name}")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/Juh6973/project_oyez_oral_arguments_2000-2024/commit/725c0d079bd330928e70c2578d755f43c62452e0', commit_message='Upload dataset', commit_description='', oid='725c0d079bd330928e70c2578d755f43c62452e0', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/Juh6973/project_oyez_oral_arguments_2000-2024', endpoint='https://huggingface.co', repo_type='dataset', repo_id='Juh6973/project_oyez_oral_arguments_2000-2024'), pr_revision=None, pr_num=None)