## Scraping, downloading, and converting many PDFs
This is the cornerstone of the Supreme Court transcript project, but it may also be useful for the federal judge project--especially if you decide to download hearing materials and want to include Q&A text.

I begin by scraping the links to all the transcripts using **Beautiful Soup**--if you were to choose to do this project, you would also want to scrape this page for the rest of its information, such as the name of the case, the docket number, etc.

In [3]:
from bs4 import BeautifulSoup
import requests
# Index page for the 2021 oral-argument transcripts.
my_url = "https://www.supremecourt.gov/oral_arguments/argument_transcript/2021"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from silently parsing an HTTP error page.
index_response = requests.get(my_url, timeout=30)
index_response.raise_for_status()
raw_html = index_response.content

In [5]:
# Parse the downloaded page bytes with the "html.parser" backend.
soup_doc = BeautifulSoup(markup=raw_html, features="html.parser")

In [6]:
# Collect every element whose class attribute matches "table table-bordered".
the_tables = soup_doc.find_all(attrs={"class": "table table-bordered"})


In [7]:
# Gather the relative path of every transcript PDF linked from the tables.
all_pdf_links = []
for table in the_tables:
    for row in table.find_all('tr'):
        # Rows without a <td> cell (presumably header rows) carry no link.
        if row.td is None:
            continue
        anchor = row.a
        # Skip data rows that have no usable link instead of crashing on
        # None['href'].
        if anchor is None or not anchor.get('href'):
            continue
        href = anchor['href']
        print(href)
        # Strip the leading "../" only when it is really there, so an href in
        # any other form is not mangled by a blind [3:] slice.
        if href.startswith('../'):
            href = href[3:]
        all_pdf_links.append(href)

../argument_transcripts/2021/21-404_4246.pdf
../argument_transcripts/2021/21-441_cc8k.pdf
../argument_transcripts/2021/21-234_j0ei.pdf
../argument_transcripts/2021/21-5726_4246.pdf
../argument_transcripts/2021/21-499_bqmc.pdf
../argument_transcripts/2021/21-418_3f07.pdf
../argument_transcripts/2021/21-439_83j8.pdf
../argument_transcripts/2021/21-954_m6hn.pdf
../argument_transcripts/2021/21-511_71o9.pdf
../argument_transcripts/2021/21-429_09m1.pdf
../argument_transcripts/2021/21-328_e2pg.pdf
../argument_transcripts/2021/21-248_j4ek.pdf
../argument_transcripts/2021/20-1034_i425.pdf
../argument_transcripts/2021/21-401_apm1.pdf
../argument_transcripts/2021/20-807_m64o.pdf
../argument_transcripts/2021/21-309_2135.pdf
../argument_transcripts/2021/20-603_4h2p.pdf
../argument_transcripts/2021/20-1573_5368.pdf
../argument_transcripts/2021/20-493_8759.pdf
../argument_transcripts/2021/20-7622_1a7d.pdf
../argument_transcripts/2021/20-1775_2co3.pdf
../argument_transcripts/2021/20-1530_758b.pdf
../a

In [8]:
# Bare expression: shows the collected relative PDF paths as the cell output.
all_pdf_links

['argument_transcripts/2021/21-404_4246.pdf',
 'argument_transcripts/2021/21-441_cc8k.pdf',
 'argument_transcripts/2021/21-234_j0ei.pdf',
 'argument_transcripts/2021/21-5726_4246.pdf',
 'argument_transcripts/2021/21-499_bqmc.pdf',
 'argument_transcripts/2021/21-418_3f07.pdf',
 'argument_transcripts/2021/21-439_83j8.pdf',
 'argument_transcripts/2021/21-954_m6hn.pdf',
 'argument_transcripts/2021/21-511_71o9.pdf',
 'argument_transcripts/2021/21-429_09m1.pdf',
 'argument_transcripts/2021/21-328_e2pg.pdf',
 'argument_transcripts/2021/21-248_j4ek.pdf',
 'argument_transcripts/2021/20-1034_i425.pdf',
 'argument_transcripts/2021/21-401_apm1.pdf',
 'argument_transcripts/2021/20-807_m64o.pdf',
 'argument_transcripts/2021/21-309_2135.pdf',
 'argument_transcripts/2021/20-603_4h2p.pdf',
 'argument_transcripts/2021/20-1573_5368.pdf',
 'argument_transcripts/2021/20-493_8759.pdf',
 'argument_transcripts/2021/20-7622_1a7d.pdf',
 'argument_transcripts/2021/20-1775_2co3.pdf',
 'argument_transcripts/2021/2

Next I used the **requests** library to download all of the PDFs to a folder on my computer.

In [6]:
import time
import requests
# Download every transcript PDF into a local folder, one request at a time.
for urls in all_pdf_links:
    # Pause between requests (presumably to go easy on the server).
    time.sleep(2)
    link = 'https://www.supremecourt.gov/oral_arguments/' + urls
    file_name = "/Users/thirkield/Documents/Columbia2022/court_docs2021/" + urls.split('/')[-1]
    # Using the response as a context manager releases the streamed
    # connection even if writing the file fails.
    with requests.get(link, stream=True, timeout=30) as r:
        # Fail loudly on 4xx/5xx rather than saving an error page as a ".pdf".
        r.raise_for_status()
        with open(file_name, 'wb') as Pypdf:
            # iter_content() defaults to 1-byte chunks, which is extremely
            # slow for multi-megabyte PDFs; read 8 KiB at a time instead.
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    Pypdf.write(chunk)

In [7]:
# Keep only the file name (the text after the last "/") of every PDF link.
pdf_names = [
    link_path.rpartition('/')[2]
    for link_path in all_pdf_links
]
pdf_names

['21-404_4246.pdf',
 '21-441_cc8k.pdf',
 '21-234_j0ei.pdf',
 '21-5726_4246.pdf',
 '21-499_bqmc.pdf',
 '21-418_3f07.pdf',
 '21-439_83j8.pdf',
 '21-954_m6hn.pdf',
 '21-511_71o9.pdf',
 '21-429_09m1.pdf',
 '21-328_e2pg.pdf',
 '21-248_j4ek.pdf',
 '20-1034_i425.pdf',
 '21-401_apm1.pdf',
 '20-807_m64o.pdf',
 '21-309_2135.pdf',
 '20-603_4h2p.pdf',
 '20-1573_5368.pdf',
 '20-493_8759.pdf',
 '20-7622_1a7d.pdf',
 '20-1775_2co3.pdf',
 '20-1530_758b.pdf',
 '20-1410_97be.pdf',
 '20-1641_j5f0.pdf',
 '21-147_3f0m.pdf',
 '21a244_c185.pdf',
 '21a240_21o3.pdf',
 '20-1263_5i26.pdf',
 '19-896_hejm.pdf',
 '20-322_4h25.pdf',
 '20-1472_l6h1.pdf',
 '20-1800_4813.pdf',
 '20-1566_4f14.pdf',
 '21-12_ap6c.pdf',
 '20-1650_2035.pdf',
 '20-1312_j5fl.pdf',
 '20-219_bocf.pdf',
 '20-1114_l6h2.pdf',
 '19-1392_4h25.pdf',
 '20-979_7mi8.pdf',
 '19-1401_869d.pdf',
 '20-1459_1an2.pdf',
 '20-1088_97be.pdf',
 '20-1009_5j31.pdf',
 '21-463_4315.pdf',
 '21-588_i3jm.pdf',
 '20-804_d18f.pdf',
 '20-1143_bqmc.pdf',
 '20-843_7m5e.pdf',



Here I use **tika** to extract the text from the pdfs and write txt files to my computer.

In [2]:
import os
#you only need this if there is a timeout error when you first run this
os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser
import time
# Convert each listed PDF to a plain-text file in the same folder using Tika.
one_pdf = ["bill_id-1639439.pdf"]
for urls in one_pdf:
    time.sleep(2)
    file_n = urls.split('/')[-1]
    print(file_n)
    file_name = "/Users/thirkield/Documents/MajorStudio2023/Gabriel_python/" + file_n
    parsed_pdf = parser.from_file(file_name)
    # 'content' can be None when Tika extracts no text (e.g. an image-only
    # PDF); write an empty file instead of crashing in outfile.write(None).
    txt_data = parsed_pdf['content'] or ""
    # splitext drops only the final extension, so "a.b.pdf" becomes
    # "a.bNEW.txt" rather than being truncated to "aNEW.txt".
    txt_name = os.path.splitext(file_n)[0] + "NEW.txt"
    print(txt_name)
    file_out = "/Users/thirkield/Documents/MajorStudio2023/Gabriel_python/" + txt_name
    # Explicit UTF-8 so non-ASCII characters in the extracted text do not
    # depend on the platform's default encoding.
    with open(file_out, 'w', encoding='utf-8') as outfile:
        outfile.write(txt_data)


    

bill_id-1639439.pdf
bill_id-1639439NEW.txt


In [None]:
# Load one previously converted transcript into memory as a sample.
# The with-block closes the file handle as soon as the text has been read.
with open('/Users/thirkield/Documents/Columbia2020/2019pdfs/18-217_5hdk.txt', 'r') as f:
    sample_transcript = f.read()

In [None]:
## Scraping, downloading, and converting many PDFs
This is the cornerstone of the Supreme Court transcript project, but it may also be useful for the federal judge project--especially if you decide to download hearing materials and want to include Q&A text.

I begin by scraping the links to all the transcripts using **Beautiful Soup**--if you were to choose to do this project, you would also want to scrape this page for the rest of its information, such as the name of the case, the docket number, etc.

from bs4 import BeautifulSoup
import requests
# Index page for the 2021 oral-argument transcripts.
my_url = "https://www.supremecourt.gov/oral_arguments/argument_transcript/2021"
# A timeout keeps the request from hanging forever, and raise_for_status()
# stops us from silently parsing an HTTP error page.
index_response = requests.get(my_url, timeout=30)
index_response.raise_for_status()
raw_html = index_response.content

soup_doc = BeautifulSoup(raw_html, "html.parser")

the_tables = soup_doc.find_all(class_="table table-bordered")


# Gather the relative path of every transcript PDF linked from the tables.
all_pdf_links = []
for table in the_tables:
    for row in table.find_all('tr'):
        # Rows without a <td> cell (presumably header rows) carry no link.
        if row.td is None:
            continue
        anchor = row.a
        # Skip data rows that have no usable link instead of crashing on
        # None['href'].
        if anchor is None or not anchor.get('href'):
            continue
        href = anchor['href']
        print(href)
        # Strip the leading "../" only when it is really there, so an href in
        # any other form is not mangled by a blind [3:] slice.
        if href.startswith('../'):
            href = href[3:]
        all_pdf_links.append(href)

all_pdf_links

Next I used the **requests** library to download all of the PDFs to a folder on my computer.



import time
import requests
# Download every transcript PDF into a local folder, one request at a time.
for urls in all_pdf_links:
    # Pause between requests (presumably to go easy on the server).
    time.sleep(2)
    link = 'https://www.supremecourt.gov/oral_arguments/' + urls
    file_name = "/Users/thirkield/Documents/Columbia2022/court_docs2021/" + urls.split('/')[-1]
    # Using the response as a context manager releases the streamed
    # connection even if writing the file fails.
    with requests.get(link, stream=True, timeout=30) as r:
        # Fail loudly on 4xx/5xx rather than saving an error page as a ".pdf".
        r.raise_for_status()
        with open(file_name, 'wb') as Pypdf:
            # iter_content() defaults to 1-byte chunks, which is extremely
            # slow for multi-megabyte PDFs; read 8 KiB at a time instead.
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    Pypdf.write(chunk)

# Keep only the file name (the text after the last "/") of every PDF link.
pdf_names = [
    link_path.rpartition('/')[2]
    for link_path in all_pdf_links
]
pdf_names




Here I use **tika** to extract the text from the pdfs and write txt files to my computer.

import os
#you only need this if there is a timeout error when you first run this
os.environ['TIKA_SERVER_JAR'] = 'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
import tika
from tika import parser
import time
# Convert each listed PDF to a plain-text file in the same folder using Tika.
one_pdf = ["bill_id-1639439.pdf"]
for urls in one_pdf:
    time.sleep(2)
    file_n = urls.split('/')[-1]
    print(file_n)
    file_name = "/Users/thirkield/Documents/MajorStudio2023/Gabriel_python/" + file_n
    parsed_pdf = parser.from_file(file_name)
    # 'content' can be None when Tika extracts no text (e.g. an image-only
    # PDF); write an empty file instead of crashing in outfile.write(None).
    txt_data = parsed_pdf['content'] or ""
    # splitext drops only the final extension, so "a.b.pdf" becomes
    # "a.bNEW.txt" rather than being truncated to "aNEW.txt".
    txt_name = os.path.splitext(file_n)[0] + "NEW.txt"
    print(txt_name)
    file_out = "/Users/thirkield/Documents/MajorStudio2023/Gabriel_python/" + txt_name
    # Explicit UTF-8 so non-ASCII characters in the extracted text do not
    # depend on the platform's default encoding.
    with open(file_out, 'w', encoding='utf-8') as outfile:
        outfile.write(txt_data)


    



# Load one previously converted transcript into memory as a sample.
# The with-block closes the file handle as soon as the text has been read.
with open('/Users/thirkield/Documents/Columbia2020/2019pdfs/18-217_5hdk.txt', 'r') as f:
    sample_transcript = f.read()