## NC Court of Appeals Scraper

This notebook scrapes the NC Court of Appeals PDF opinions and converts them into a DataFrame.

In [None]:
import io
import os
import pickle

import requests
from bs4 import BeautifulSoup
from pdfminer3.converter import PDFPageAggregator
from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams, LTTextBox
from pdfminer3.pdfinterp import PDFPageInterpreter
from pdfminer3.pdfinterp import PDFResourceManager
from pdfminer3.pdfpage import PDFPage

### Get List of All Opinions (Stored as PDFs)

In [None]:
# Make a get request to retrieve the court of appeals' listing page for 2020.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently parsing an error
# page (which would simply yield zero opinion links further down).
html_page = requests.get(
    'https://appellate.nccourts.org/opinion-filings/?c=coa&year=2020',
    timeout=30,
)
html_page.raise_for_status()
# Pass the page contents for parsing
soup = BeautifulSoup(html_page.content, 'html.parser')

In [None]:
# Preview the parsed document. prettify is a method and has to be *called*;
# the bare attribute only shows the bound-method repr. Only the first part is
# printed so the cell output stays readable.
print(soup.prettify()[:2000])

In [None]:
# Collect every <span class="title"> element on the page and inspect the first one
titles = soup.find_all('span', class_='title')
titles[0]

In [None]:
# Look at the raw onclick handler of the first title span
titles[0]['onclick']

In [None]:
# Pull the PDF URL out of the handler, which is expected to look like
# viewOpinion("<url>"). str.strip() removes a *set of characters* rather than a
# literal prefix/suffix, so it can also eat leading URL characters that happen
# to be in that set; taking the text between the first pair of double quotes
# extracts exactly the URL.
titles[0].attrs['onclick'].split('"')[1]

In [None]:
# Build the list of PDF links on this page. Each onclick handler is expected to
# look like viewOpinion("<url>"); the URL is the text between the first pair of
# double quotes. (str.strip() is avoided because it strips a set of characters,
# not a prefix, and could trim characters belonging to the URL itself.)
page_links = [
    span.attrs['onclick'].split('"')[1]
    for span in soup.find_all('span', class_='title')
]

In [None]:
# Number of opinion links extracted into page_links
len(page_links)

In [None]:
# Scrape all PDF links (web site stores links from 1998 to the present).
# Loop-local names are used so the single-year `soup` / `page_links` built in
# earlier cells are not overwritten.
all_links = []
for year in range(1998, 2021):
    year_url = 'https://appellate.nccourts.org/opinion-filings/?c=coa&year={}'.format(year)
    # Time out rather than hang, and fail loudly on an HTTP error instead of
    # parsing an error page into zero links for that year.
    year_page = requests.get(year_url, timeout=30)
    year_page.raise_for_status()
    year_soup = BeautifulSoup(year_page.content, 'html.parser')
    # Each onclick handler is expected to look like viewOpinion("<url>"); take
    # the text between the first pair of double quotes. str.strip() is avoided
    # because it strips a set of characters, not a literal prefix.
    all_links.extend(
        span.attrs['onclick'].split('"')[1]
        for span in year_soup.find_all('span', class_='title')
    )

In [None]:
# Total number of opinions to be scraped (one collected link per opinion PDF);
# the bare `length` on the last line displays the count as the cell output
length = len(all_links)
length

In [None]:
# Delete any possible duplicates. The result has to be assigned back for the
# removal to take effect; dict.fromkeys keeps the first occurrence of each URL
# and preserves the original order (a plain set would scramble it).
all_links = list(dict.fromkeys(all_links))
# Refresh the count so later cells work with the de-duplicated total, and show
# just the number instead of dumping the entire list into the output.
length = len(all_links)
length

In [None]:
# Persist the collected PDF links to disk so they can be reloaded later
with open('pdf_addresses.data', mode='wb') as links_file:
    # pickle emits a binary stream, hence the 'wb' mode above
    pickle.dump(all_links, links_file)

### Create Function that Downloads All of the PDF Opinions

The COA's web page contains entries for 1998-2020. The cell below loops over every collected link, downloads the corresponding opinion PDF, and saves it to the `Scraped_PDFs/` folder.

In [None]:
# Download every opinion PDF in all_links into Scraped_PDFs/.
# A plain loop run alone timed out after downloading 4,484 of the 28,861
# files, so this version is resumable and tolerant of individual failures:
#   * files that already exist on disk are skipped,
#   * each request has a timeout and its HTTP status is checked,
#   * a failed download is recorded in failed_links and the loop carries on.
os.makedirs("Scraped_PDFs", exist_ok=True)
failed_links = []  # URLs that could not be fetched; retry these later

for target in all_links:
    # name for the saved PDF, adding leading 0 to four-digit PDF names
    # NOTE(review): assumes every URL ends in "=<4- or 5-digit id>" — confirm
    name = target[-5:].replace('=', '0')
    pdf_path = "Scraped_PDFs/" + name + ".pdf"

    if os.path.exists(pdf_path):
        continue  # already downloaded in a previous run

    try:
        r = requests.get(target, timeout=60)  # create HTTP response object
        r.raise_for_status()
    except requests.RequestException as err:
        failed_links.append(target)
        print("Failed to download {}: {}".format(target, err))
        continue

    # Write to a temporary name first and rename afterwards, so an interrupted
    # write never leaves a truncated PDF that a later run would skip.
    tmp_path = pdf_path + ".part"
    with open(tmp_path, 'wb') as f:
        f.write(r.content)
    os.replace(tmp_path, pdf_path)


The download run received an error at the PDF named 21317, after 4,484 items had been retrieved.

### Get Proxies to Scrape PDF List

In [None]:
from proxies import get_proxies, check_proxies #Attempt these before going with a paid proxy
working_proxies = check_proxies()
# import pickle file of all current PDFs.
# The context manager closes the file even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code; only load a
# 'pdf_addresses.data' file that this notebook itself produced.
with open('pdf_addresses.data', 'rb') as infile:
    pdf_addresses = pickle.load(infile)

### Convert PDFs into Text Strings and Build Into a DataFrame

In [None]:
# Code from Stack Overflow to use PDFminer to import PDF to text reworked into a function
# One resource manager is reused across calls; the output buffer, converter and
# interpreter are created per call so the function can be called repeatedly.
resource_manager = PDFResourceManager()

def pdf2txt(path):
    """
    Convert a PDF file to a single text string.

    Parameters
    ----------
    path : str
        File path of the PDF to read.

    Returns
    -------
    str
        The text extracted from every page of the PDF, in page order.

    Notes
    -----
    A fresh StringIO buffer and TextConverter are built on each call and
    closed before returning. Sharing module-level handles and closing them
    inside the function would make every call after the first fail on a
    closed buffer.
    """
    output = io.StringIO()
    converter = TextConverter(resource_manager, output, laparams=LAParams())
    try:
        page_interpreter = PDFPageInterpreter(resource_manager, converter)
        with open(path, 'rb') as fh:
            for page in PDFPage.get_pages(fh,
                                          caching=True,
                                          check_extractable=True):
                page_interpreter.process_page(page)
        # Read the accumulated text before the buffer is closed below
        text = output.getvalue()
    finally:
        # close open handles, even if a page failed to parse
        converter.close()
        output.close()

    return text

In [None]:
# Try the converter on the sample PDF
path = 'SampleData/SamplePDF.pdf'
text = pdf2txt(path=path)

In [None]:
# Only the last expression of a cell is rendered, so a bare `text` on the
# first line would show nothing. Print a short preview of the extracted text
# to confirm the conversion works, then display the type as the cell output.
print(text[:1000])
type(text)