In [3]:
# Module imports
import requests
import os
from io import BytesIO
from bs4 import BeautifulSoup
import pickle
import PyPDF2
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import matplotlib.pyplot as plt



Guidance and sample code taken from https://medium.com/@rqaiserr/how-to-convert-pdfs-into-searchable-key-words-with-python-85aab86c544f and https://github.com/adashofdata/nlp-in-python-tutorial/blob/master/1-Data-Cleaning.ipynb

We need to get text out of the PDFs we downloaded earlier. To do that, we use the PyPDF2 library to extract the PDF text. The function below takes the path to a PDF file and returns its text as a string.

In [None]:
def pdf_to_text(filename):
    """Extract the text of every page of a PDF and return it as one string.

    Works with both generations of the PyPDF2 API: the snake_case
    ``PdfReader`` interface is used when the installed PyPDF2 provides it,
    otherwise the legacy camelCase ``PdfFileReader`` interface is used.

    Parameters
    ----------
    filename : str
        Path of the PDF file to read.

    Returns
    -------
    str
        The text of all pages concatenated in page order, with nothing
        inserted between pages.

    Raises
    ------
    AssertionError
        If no text at all could be extracted from the file.
    """
    with open(filename, 'rb') as pdf_file:
        # The reader is handed the live file object, so all extraction is
        # done before the ``with`` block closes it.
        if hasattr(PyPDF2, 'PdfReader'):
            reader = PyPDF2.PdfReader(pdf_file)
            # Guard against a page yielding None instead of a string.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        else:
            reader = PyPDF2.PdfFileReader(pdf_file)
            page_texts = [reader.getPage(page_index).extractText()
                          for page_index in range(reader.numPages)]
    text = "".join(page_texts)
    # Raised explicitly (rather than with ``assert``) so the check still runs
    # under ``python -O``; the exception type is the same as before.
    if text == "":
        raise AssertionError("Text not readable from %s" % filename)
    return text

Now that we have the opinion in plain text form, we need to clean it up. 

In [None]:
# Well, first let's see if it works on one of the opinions.
# Directory the notebook is running from:
currentdir = os.path.abspath('')
# Build the path from separate components so it works on every OS; a
# hard-coded backslash is only a path separator on Windows (and "\c" / "\B"
# are invalid string escapes).
testpath = os.path.join(currentdir, 'Opinions', 'cl_20', 'B228808.pdf')
testpdf = pdf_to_text(testpath)
print(testpdf)

Okay, excellent! Comparing the readout to the original PDF, there are some extraneous line breaks and spaces, but the text appears to be mostly present. Some things we might have to fix: footnotes interrupting other text, special characters causing following words not to be read, spaces breaking up names/keywords.

Documentation from PyPDF2 for the extractText() function: 

extractText()

    Locate all text drawing commands, in the order they are provided in the content stream, and extract the text. This works well for some PDF files, but poorly for others, depending on the generator used. This will be refined in the future. Do not rely on the order of text coming out of this function, as it will change if this function is made more sophisticated.
    Returns:	a unicode string object.

It looks like if I'm relying on PyPDF2 to extract the text, I'm going to have to work with incomplete text for right now. Maybe I can remove all the line breaks and compare word counts for the documents to check how many words got missed?


In [None]:
# Pickle the raw extracted text for later use.
# NOTE: despite the .txt extension the file holds a pickled object, so it has
# to be read back with pickle.load rather than opened as plain text.
pickletest = os.path.join(currentdir, 'textopinions','B228808.txt')
# Create the output folder on first use so the write cannot fail on a fresh
# checkout.
os.makedirs(os.path.dirname(pickletest), exist_ok=True)
with open(pickletest, "wb") as file:
    pickle.dump(testpdf, file)

Drawing from https://github.com/adashofdata/nlp-in-python-tutorial/blob/master/1-Data-Cleaning.ipynb

In [None]:
# Initial round of cleanup
def cleantext1(text):
    """Return ``text`` lower-cased with every newline character removed.

    Newlines are deleted outright (not replaced by a space), so the words on
    either side of a line break end up joined together.
    """
    lowered = text.lower()
    return lowered.replace('\n', '')

In [None]:
# Eyeball the cleaned (lower-cased, newline-free) text of the test opinion.
print(cleantext1(testpdf))

In [None]:
testclean = cleantext1(testpdf)
# Pickle the cleaned text for later use. It is written to its own *_clean file
# so that the raw text pickled for this opinion is not overwritten.
pickletestclean = os.path.join(currentdir, 'textopinions','B228808_clean.txt')
with open(pickletestclean, "wb") as file:
    pickle.dump(testclean, file)

Here, we use Scikit-Learn's feature extraction module to get a count for each word in the text, ignoring commonly used words like "the" and "and".

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(input='content',stop_words = 'english')
data_cv = cv.fit_transform([testclean])
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); use whichever the installed version provides.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
data_dtm = pd.DataFrame(data_cv.toarray(), columns = feature_names)
# Sort the word columns by their count in the single document row (index 0),
# most frequent first.
data_sort = data_dtm.sort_values(by=0,axis=1, ascending=False)
# The matrix is wide (one row, one column per word); transpose it so that
# each word gets its own row.
data_sort = data_sort.transpose()

Now that we've checked one file, time to do the other 20,000.


In [None]:
def batchconvert(folder, skip_existing=False):
    """Convert every ``.pdf`` in ``folder`` to a pickled text file.

    For each PDF the extracted text is pickled to
    ``<currentdir>/textopinions/<case number>.txt``, where the case number is
    the PDF's file name without its extension. Files that fail to convert are
    reported and skipped; they leave no output file behind.

    Parameters
    ----------
    folder : str
        Directory containing the PDF files (not searched recursively).
    skip_existing : bool, optional
        When True, PDFs whose output file already exists are not converted
        again, which makes it cheap to restart an interrupted run. The default
        (False) re-converts everything, as before.

    Returns
    -------
    None
    """
    outdir = os.path.join(currentdir, 'textopinions')
    os.makedirs(outdir, exist_ok=True)
    for file in os.listdir(folder):
        if not file.endswith('.pdf'):
            continue
        casenum = os.path.splitext(file)[0]
        savepath = os.path.join(outdir, '%s.txt' % casenum)
        if skip_existing and os.path.isfile(savepath):
            continue
        try:
            # Extract first and open the output second, so a PDF that cannot
            # be read does not leave an empty output file behind.
            text = pdf_to_text(os.path.join(folder, file))
            with open(savepath, "wb") as textfile:
                pickle.dump(text, textfile)
        except Exception as err:
            # ``Exception`` (not a bare except) lets Ctrl-C still stop the run.
            print('Error occurred during %s: %r' % (casenum, err))
    return

In [None]:
# Convert every PDF in the Opinions/cl_15 folder to pickled text.
folder = os.path.join(currentdir, 'Opinions','cl_15')
batchconvert(folder)

In [None]:
# Convert every PDF in the Opinions/cl_20 folder to pickled text.
folder = os.path.join(currentdir, 'Opinions','cl_20')
batchconvert(folder)

TODO: Make it easy to start up again if it stops, handle errors, or both. Ideally both.

In [None]:
# Convert every PDF in the Opinions/cl_25 folder to pickled text.
folder = os.path.join(currentdir, 'Opinions','cl_25')
batchconvert(folder)

In [None]:
# Convert every PDF in the Opinions/cl_30 folder to pickled text.
folder = os.path.join(currentdir, 'Opinions','cl_30')
batchconvert(folder)

In [None]:
# Convert every PDF in the Opinions/cl_10 folder to pickled text.
folder = os.path.join(currentdir, 'Opinions','cl_10')
batchconvert(folder)

At this point, we should have all the opinions in text form. Now we can load the pickled files and associate each case number with its corpus, check to see that it came from the Los Angeles Superior Court and is a felony, and add the word-frequency matrix.

For testing this part, we want to have a felony case. B250042 is a felony case which originated in the Los Angeles Superior Court.

In [None]:
# Load the pickled text of a known felony case to test against.
# The converted opinions are written to the 'textopinions' folder (no
# underscore), so read from that same folder.
# pickle.load is only safe here because the file was produced by this notebook.
with open(os.path.join(currentdir, 'textopinions', 'B250042.txt'), "rb") as file:
    felony_case = pickle.load(file)

print(type(felony_case))


We're looking for the presence of a string of the form "Super. Ct. No. NA088447" - specifically, the lower court case number should be [letter] A [6-digit number]. I am not sure whether all cases use 6 digits following the A. We're also looking for the words "Los Angeles County" - and we want them to be on the first page of the document.

Upon further consideration, it may be worthwhile to process all the opinions and look for the case number afterward. 

In [4]:
# Let's try setting up a class Case with properties "corpus", "wordcount", "casenum"
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class Case:
    """One court opinion, loaded from a pickled text file.

    Attributes
    ----------
    text
        The object unpickled from the file; expected to be the raw opinion
        text (a str) extracted from the PDF.
    casenumber : str
        The file name without its directory or extension.
    corpus
        Cleaned text; only present after make_corpus() has been called.
    unigram_matrix : pandas.DataFrame
        Only present after make_unigram_matrix() has been called.
    ngram_matrix : pandas.DataFrame
        Only present after make_ngram_matrix() has been called.
    """

    def __init__(self, textfile):
        """Load the pickled text stored at ``textfile``.

        Raises
        ------
        AssertionError
            If ``textfile`` is not an existing file.
        """
        # Raised explicitly (rather than with ``assert``) so the check still
        # runs under ``python -O``; the exception type is the same as before.
        if not os.path.isfile(textfile):
            raise AssertionError('Not an existing file: %s' % textfile)
        # We start out with just the raw text string from the pdf.
        # NOTE: pickle.load can execute arbitrary code; only point this at
        # files this notebook produced itself.
        with open(textfile, "rb") as file:
            self.text = pickle.load(file)
        self.casenumber = os.path.splitext(os.path.basename(textfile))[0]

    def make_corpus(self):
        '''Adds corpus attribute to the Case object by cleaning
        the raw text string: the text is lower-cased and every newline
        character is removed.'''
        lowered = self.text.lower()
        self.corpus = re.sub('\n', '', lowered)

    @staticmethod
    def _feature_names(vectorizer):
        '''Returns the fitted vocabulary of a vectorizer. scikit-learn 1.0
        introduced get_feature_names_out() and 1.2 removed
        get_feature_names(), so use whichever one is available.'''
        if hasattr(vectorizer, 'get_feature_names_out'):
            return vectorizer.get_feature_names_out()
        return vectorizer.get_feature_names()

    def _count_matrix(self, **vectorizer_options):
        '''Returns a one-row DataFrame of term counts for the corpus, with one
        column per term. English stop words are always ignored; any extra
        keyword arguments are passed on to CountVectorizer.'''
        cv = CountVectorizer(input='content', stop_words='english',
                             **vectorizer_options)
        freq_cv = cv.fit_transform([self.corpus])
        # The matrix is deliberately left unsorted and untransposed:
        # transposing makes the to_string method produce a nicer output, but
        # may make indexing by ngram slightly harder.
        return pd.DataFrame(freq_cv.toarray(), columns=self._feature_names(cv))

    def make_unigram_matrix(self):
        '''Adds frequency-matrix-of-unigrams attribute to Case object. Requires
        that corpus attribute already exists (AttributeError otherwise).'''
        self.unigram_matrix = self._count_matrix()

    def make_ngram_matrix(self, n):
        '''Adds frequency-matrix-of-ngrams attribute to Case object, counting
        every ngram of length 1 up to n with accents stripped. Requires
        that corpus attribute already exists (AttributeError otherwise).'''
        self.ngram_matrix = self._count_matrix(strip_accents='unicode',
                                               ngram_range=(1, n))

  
        

In [4]:
# testing initialization method for class
# NOTE(review): `testpath` is assigned earlier in this notebook to a PDF path
# (B228808.pdf), but Case unpickles its argument and the recorded output of
# this cell shows case number B300885 -- presumably testpath was rebound to a
# pickled text file in a cell that is no longer here. Confirm before re-running.
b = Case(testpath)
print(b.casenumber)
# print(b.text)
b.make_corpus()
# print(b.corpus)
b.make_unigram_matrix()
b.make_ngram_matrix(5)
print(b.unigram_matrix)
print(b.ngram_matrix)

B300885
   10  11  1115  11278  11280  1128111284  1128411285  1170  1192  12  ...  \
0   2   1     3      1      1           1           1     1     1   3  ...   

   witnesses  wl  woodell  working  writ  writs  xavier  year  years  yun  
0          1   1        2        1     4      1       1     3      5    1  

[1 rows x 709 columns]
   10  10 348  10 348 352353  10 348 352353 gallardo  \
0   2       1              1                       1   

   10 348 352353 gallardo intended  10 years  10 years prior  \
0                                1         1               1   

   10 years prior felony  10 years prior felony enhancements  11  ...  \
0                      1                                   1   1  ...   

   years life robbery 25 years  years prior  years prior felony  \
0                            1            1                   1   

   years prior felony enhancements  years prior felony enhancements 667  yun  \
0                                1                     

In [None]:
# Look for the same phrase in three different places:
# 1) in the ngram matrix rendered as one big string,
print('los angeles county super' in b.ngram_matrix.to_string())
# 2) among the DataFrame's column labels, i.e. as an exact ngram,
print('los angeles county super' in b.ngram_matrix)
# 3) as a substring of the cleaned text itself.
print('los angeles county super' in b.corpus)
# The reason that these give different results is that the corpus
# includes whitespace between "county" and "super".

In [5]:
# Go through the text opinions.
# Collect the path of every file under the textopinions folder.
text_file_list = [
    os.path.join(root, file)
    for root, dirs, files in os.walk(os.path.join(os.path.abspath(''),'textopinions'), topdown=False)
    for file in files
]

# Build a Case (with its corpus and 1- to 5-gram matrix) for every file.
# Files that cannot be processed are recorded in flagged_cases rather than
# stopping the run.
processed_cases = []
flagged_cases = []
processed_count = 0
flagged_count = 0
for filepath in text_file_list:
    try:
        case = Case(filepath)
        case.make_corpus()
        case.make_ngram_matrix(5)
    except Exception as err:
        # ``Exception`` does not cover KeyboardInterrupt or SystemExit, so
        # interrupting the kernel still stops the loop. The reason is printed
        # so a flagged file can be diagnosed without re-running it.
        flagged_cases.append(filepath)
        flagged_count += 1
        print('Flagged case at %s (%s: %s)' % (filepath, type(err).__name__, err))
        continue
    processed_cases.append(case)
    processed_count += 1
    if processed_count % 100 == 0:
        print('%d cases processed successfully. %d cases flagged.' % (processed_count, flagged_count))


100 cases processed successfully. 0 cases flagged.
200 cases processed successfully. 0 cases flagged.
300 cases processed successfully. 0 cases flagged.
400 cases processed successfully. 0 cases flagged.
500 cases processed successfully. 0 cases flagged.
600 cases processed successfully. 0 cases flagged.
700 cases processed successfully. 0 cases flagged.
800 cases processed successfully. 0 cases flagged.
900 cases processed successfully. 0 cases flagged.
1000 cases processed successfully. 0 cases flagged.
1100 cases processed successfully. 0 cases flagged.
1200 cases processed successfully. 0 cases flagged.
1300 cases processed successfully. 0 cases flagged.
1400 cases processed successfully. 0 cases flagged.
1500 cases processed successfully. 0 cases flagged.
1600 cases processed successfully. 0 cases flagged.
1700 cases processed successfully. 0 cases flagged.
1800 cases processed successfully. 0 cases flagged.
1900 cases processed successfully. 0 cases flagged.
2000 cases processed 

15800 cases processed successfully. 0 cases flagged.
15900 cases processed successfully. 0 cases flagged.
16000 cases processed successfully. 0 cases flagged.
16100 cases processed successfully. 0 cases flagged.
16200 cases processed successfully. 0 cases flagged.
16300 cases processed successfully. 0 cases flagged.
16400 cases processed successfully. 0 cases flagged.
16500 cases processed successfully. 0 cases flagged.
16600 cases processed successfully. 0 cases flagged.
16700 cases processed successfully. 0 cases flagged.
16800 cases processed successfully. 0 cases flagged.
16900 cases processed successfully. 0 cases flagged.
17000 cases processed successfully. 0 cases flagged.
17100 cases processed successfully. 0 cases flagged.
17200 cases processed successfully. 0 cases flagged.
17300 cases processed successfully. 0 cases flagged.
17400 cases processed successfully. 0 cases flagged.
17500 cases processed successfully. 0 cases flagged.
17600 cases processed successfully. 0 cases fl

In [6]:
# Persist both result lists as pickles for later use. The files carry a .txt
# extension but are binary pickles and must be read back with pickle.load.
for outname, payload in (('processed_cases.txt', processed_cases),
                         ('flagged_cases.txt', flagged_cases)):
    with open(outname, "wb") as outfile:
        pickle.dump(payload, outfile)