In [2]:
import pandas as pd
from pathlib import Path
import json
import warnings
warnings.filterwarnings("ignore")

In [3]:
def parse_transcript(json_file):
    """Flatten one transcript JSON file into a frame of speaker turns.

    Parameters
    ----------
    json_file : str or path-like
        Path to a JSON file with top-level ``id`` and ``title`` keys and a
        ``transcript -> sections -> turns`` hierarchy.

    Returns
    -------
    pandas.DataFrame or None
        One row per turn with columns ``Case_ID``, ``Case_Title``,
        ``Speaker`` and ``Text`` (the turn's text blocks joined with single
        spaces). The frame is empty when the file carries no usable
        transcript. ``None`` is returned when the file is not valid JSON.

    Raises
    ------
    KeyError, TypeError
        If the decoded document lacks the top-level ``id`` or ``title``.
    """
    columns = ['Case_ID', 'Case_Title', 'Speaker', 'Text']
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_file, 'r', encoding='utf-8') as f:
        try:
            transcript = json.load(f)
        except ValueError:
            # Covers json.JSONDecodeError and UnicodeDecodeError alike.
            return None
    case_id = transcript['id']
    case_title = transcript['title']
    print(case_id, case_title)

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.
    records = []
    try:
        for section in transcript['transcript']['sections']:
            for turn in section['turns']:
                try:
                    speaker = turn['speaker']['name']
                    text = ' '.join(block['text'] for block in turn['text_blocks'])
                except (KeyError, TypeError):
                    # Malformed turn (e.g. no speaker or no text): skip it.
                    continue
                records.append({'Case_ID': case_id, 'Case_Title': case_title,
                                'Speaker': speaker, 'Text': text})
    except (KeyError, TypeError):
        # Transcript or section structure missing/null: keep what was read.
        pass
    return pd.DataFrame(records, columns=columns)

In [4]:
# Walk every transcript file in oyez/cases, parse the ones whose file name
# starts with a year in the window below, and stack them into one frame.
csv_file = Path.cwd().parent.joinpath('transcripts.csv')

# Inclusive year window. This is the same set the previous
# `range(1990, 2022)` test selected: 2022-prefixed files are NOT included.
# NOTE(review): the earlier comment said "1990 - 2022"; raise LAST_YEAR to
# 2022 if those files are meant to be part of the corpus.
FIRST_YEAR, LAST_YEAR = 1990, 2021

frames = []
for json_file in Path.cwd().parent.joinpath('oyez/cases').glob('*-t*.json'):
    year_prefix = json_file.name[:4]
    # isdecimal() accepts exactly the strings int() can convert.
    if not (year_prefix.isdecimal() and FIRST_YEAR <= int(year_prefix) <= LAST_YEAR):
        continue
    temp_df = parse_transcript(json_file)
    # parse_transcript returns None for unreadable JSON; empty frames add nothing.
    if temp_df is not None and not temp_df.empty:
        frames.append(temp_df)

# One concat at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the accumulated frame on every call.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=['Case_ID', 'Case_Title', 'Speaker', 'Text'])
df

20063 Oral Argument - March 18, 1991
20902 Oral Argument - April 16, 1991
24062 No oral argument
20552 Oral Argument - November 06, 1990
20483 Oral Argument - November 06, 1990
19791 Oral Argument - February 19, 1991
19920 Oral Argument - November 05, 1990
20313 Oral Argument - December 03, 1990
21066 Oral Argument - October 02, 1990
20708 Oral Argument - October 10, 1990
19873 Oral Argument - October 02, 1990
21120 Oral Argument - October 01, 1990
20176 Oral Argument - October 29, 1990
19781 Oral Argument - October 03, 1990
20827 Oral Argument - October 10, 1990
20930 Oral Argument - October 10, 1990
20668 Oral Argument - November 05, 1990
21172 Oral Argument - October 03, 1990
20077 Oral Argument - October 01, 1990
20418 Oral Argument - October 09, 1990
19799 Oral Argument - January 07, 1991
20665 Oral Argument - November 27, 1990
20447 Oral Argument - October 29, 1990
20681 Oral Argument - October 02, 1990
20314 Oral Argument - October 30, 1990
20582 Oral Argument - November 28, 199

Unnamed: 0,Case_ID,Case_Title,Speaker,Text
0,20063,"Oral Argument - March 18, 1991",William H. Rehnquist,"We'll hear argument now in No. 106 Original, t..."
1,20063,"Oral Argument - March 18, 1991",Rickie Leon Pearson,"Mr. Chief Justice, and may it please the Court..."
2,20063,"Oral Argument - March 18, 1991",Anthony M. Kennedy,"Now, when you say low-water mark you have to t..."
3,20063,"Oral Argument - March 18, 1991",Rickie Leon Pearson,"As it exists from time to time, or currently."
4,20063,"Oral Argument - March 18, 1991",Anthony M. Kennedy,"The changing low-water mark, not the 1792 low-..."
...,...,...,...,...
545829,25314,"Oral Argument - January 07, 2022",Elizabeth B. Prelogar,"The limits, I think, are the ones written into..."
545830,25314,"Oral Argument - January 07, 2022",Amy Coney Barrett,Thank you.
545831,25314,"Oral Argument - January 07, 2022","John G. Roberts, Jr.","Mr. Keller, rebuttal?"
545832,25314,"Oral Argument - January 07, 2022",Scott A. Keller,"Two points, Mr. Chief Justice. First, we need ..."


In [5]:
# Persist the combined transcripts as a gzip-compressed CSV next to the
# notebook's parent directory; neither a header row nor the index is written.
csv_file = Path.cwd().parent / 'transcripts.csv.gzip'
df.to_csv(
    csv_file,
    compression='gzip',
    index=False,
    header=False,
)

In [7]:
# Read the headerless gzip CSV back in, supplying the column names by hand,
# add a per-row character count of the Text column, and summarise it.
new_df = pd.read_csv(
    csv_file,
    names=['Case_ID', 'Case_Title','Speaker', 'Text'],
    header=None,
    compression='gzip',
).assign(text_len=lambda frame: frame['Text'].str.len())
new_df['text_len'].describe()

count    545834.000000
mean        237.035077
std         356.398553
min           2.000000
25%          38.000000
50%         119.000000
75%         292.000000
max        9518.000000
Name: text_len, dtype: float64

In [24]:
# Fetch one Extensions of Remarks page from congress.gov and collect its
# <pre> elements for the parsing step that follows.
import requests
from bs4 import BeautifulSoup

# Candidate pages from the 2023-02-10 Congressional Record. Only the
# uncommented one is fetched; swap the comment markers to try another.
# url = 'https://www.congress.gov/118/crec/2023/02/10/169/28/modified/CREC-2023-02-10-pt1-PgE115-2.htm'
url = 'https://www.congress.gov/118/crec/2023/02/10/169/28/modified/CREC-2023-02-10-pt1-PgE115-3.htm'
# url = 'https://www.congress.gov/118/crec/2023/02/10/169/28/modified/CREC-2023-02-10-pt1-PgE115-4.htm'

# requests has no default timeout, so an unresponsive server would hang the kernel.
response = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were the record.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')
text = soup.find_all('pre')


In [26]:
import re

# Header matcher: captures the member's name after "HON. " and the date that
# follows the weekday name (e.g. "February 10, 2023"). The dot after HON is
# escaped so it matches a literal period rather than any character.
pattern = r'(?:HON\. )(.*)[.\s]*of .*[.\s]*.*[.\s]*(?:Friday|Saturday|Monday|Tuesday|Wednesday|Thursday), (.*, \d{4})'


for t in text:
    match = re.search(pattern, t.text)
    if match is None:
        # Not every <pre> block is guaranteed to carry a remarks header;
        # indexing the findall() result used to raise IndexError here.
        print('No remarks header found in <pre> block; skipping.')
        continue
    speaker, text_date = match.groups()
    # Split on the first occurrence only, so a repeat of the date inside the
    # remarks does not cut the transcript short.
    transcript = t.text.split(text_date, 1)[1].strip()
    print(speaker, text_date, transcript, sep='\n')



SALUD O. CARBAJAL
February 10, 2023
Mr. CARBAJAL. Mr. Speaker, today I rise to applaud the innovative and 
compassionate approach exemplified by one of my exceptional 
constituents--Ty Safreno who implemented a childcare center as part of 
his business Trust Automation, Inc.
  Ty embodies the true spirit of American entrepreneurship. Utilizing 
his degree in Manufacturing Engineering from Cal Poly San Luis Obispo, 
Ty began his business in the 1990's alongside his wife Trudie. In just 
three short decades, he turned his technology start-up into a staple of 
the Central Coast. Boasting over 100 employees, Trust Automation, Inc. 
works on cybersecurity, semiconductors, and other technologic 
advancements.
  What makes Ty remarkable is how he treats and cares for community 
members. He cultivates a workplace environment that emulates a close 
family dynamic and is described as brightening people's day. Amongst 
the Safreno family's achievements is successfully installing the Trust 
Childr

: 

In [None]:
# Fetch one opinion-announcement record from the Oyez API and decode the
# response body as JSON.
import requests
import json

url = 'https://api.oyez.org/case_media/opinion_announcement_audio/24917'

# requests has no default timeout, so an unresponsive server would hang the kernel.
response = requests.get(url, timeout=30)
# Surface HTTP errors here rather than as a confusing JSON decode failure.
response.raise_for_status()
json_data = json.loads(response.text)
json_data