In [48]:
# Module imports
import os
from bs4 import BeautifulSoup
import pickle
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# Let's try setting up a class Case with properties "corpus", "wordcount", "casenum""
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class Case:
    def __init__(self, textfile):
        assert os.path.isfile(textfile) == True
        # We start out with just the raw text string from the pdf
        # We should also initialize the casenumber attribute?
        with open(textfile, "rb") as file: 
            self.text = pickle.load(file)
            self.casenumber = os.path.splitext(os.path.split(textpath)[1])[0]
    # We should have a method for doing text cleanup and a method for 
    # count vectorization
    def make_corpus(self):
        '''Adds corpus attribute to the Case object by cleaning
        the raw text string'''
        # Make text lowercase: 
        text = self.text
        text = text.lower()
        # Strip all newlines: 
        text = re.sub('\n', '', text)
        self.corpus = text
    def make_unigram_matrix(self):
        '''Adds frequency-matrix-of-unigrams attribute to Case object. Requires
        that corpus attribute already exists.'''
        # currently redundant with ngram matrix
        cv = CountVectorizer(input='content',stop_words = 'english')
        freq_cv = cv.fit_transform([self.corpus])
        freq_matrix = pd.DataFrame(freq_cv.toarray(), columns = cv.get_feature_names())
        # Sort the words by frequency, most frequent first
        # freq_matrix = freq_matrix.sort_values(by=0,axis=1, ascending=False)
        # freq_matrix = freq_matrix.transpose()
        self.unigram_matrix = freq_matrix
    def make_ngram_matrix(self, n):
        '''Adds frequency-matrix-of-ngrams attribute to Case object. Requires
        that corpus attribute already exists.'''
        cv = CountVectorizer(input='content',stop_words = 'english',strip_accents='unicode',ngram_range=(1,n))
        freq_cv = cv.fit_transform([self.corpus])
        freq_matrix = pd.DataFrame(freq_cv.toarray(), columns = cv.get_feature_names())
        # Sort the words by frequency, most frequent first
        # freq_matrix = freq_matrix.sort_values(by=0,axis=1, ascending=False)
        # freq_matrix = freq_matrix.transpose()
        # note: transposing matrix makes the to_string method produce
        # a nice output, but makes indexing by ngram slightly harder?
        self.ngram_matrix = freq_matrix

  
        

In [6]:
print(os.path.join(os.path.abspath(''),'processed_cases.txt'))

C:\Users\leodb\Documents\THESIS\ldbw\processed_cases.txt


In [3]:
with open(os.path.join(os.path.abspath(''),'processed_cases.txt'), 'rb') as file:
    processed_cases = pickle.load(file)

In [4]:
la_cases = []
maybe_cases = []
location_flag = []
failed_cases = []
i = 0
for case in processed_cases: 
    if i % 100 == 0: 
        print('%d cases processed' % i)
    try: 
        cols = case.ngram_matrix.columns
        if 'los angeles super' in cols or 'los angeles county super' in cols:
            la_cases.append(case)
        elif 'los angeles county' in cols:
            maybe_cases.append(case)
        else:
            location_flag.append(case)
    except (KeyboardInterrupt,SystemExit): 
        raise
    except:
        failed_cases.append(case)
        print('Failed case at %d' % case.casenumber)
        continue
    finally: 
        i += 1

0 cases processed
100 cases processed
200 cases processed
300 cases processed
400 cases processed
500 cases processed
600 cases processed
700 cases processed
800 cases processed
900 cases processed
1000 cases processed
1100 cases processed
1200 cases processed
1300 cases processed
1400 cases processed
1500 cases processed
1600 cases processed
1700 cases processed
1800 cases processed
1900 cases processed
2000 cases processed
2100 cases processed
2200 cases processed
2300 cases processed
2400 cases processed
2500 cases processed
2600 cases processed
2700 cases processed
2800 cases processed
2900 cases processed
3000 cases processed
3100 cases processed
3200 cases processed
3300 cases processed
3400 cases processed
3500 cases processed
3600 cases processed
3700 cases processed
3800 cases processed
3900 cases processed
4000 cases processed
4100 cases processed
4200 cases processed
4300 cases processed
4400 cases processed
4500 cases processed
4600 cases processed
4700 cases processed
4800

In [5]:
la_felonies = []
maybe_felonies = []
felony_flag = []
failed_cases = []
i = 0
for case in la_cases: 
    if i % 100 == 0: 
        print('%d cases processed' % i)
    try: 
        m = case.ngram_matrix
        if len(m.filter(regex='super ct no [a-z]a\d{6}').columns)!=0 or len(m.filter(regex='super ct nos [a-z]a\d{6}').columns)!=0:
            la_felonies.append(case)
        elif len(m.filter(regex='[a-z]a\d{6}').columns)!=0:
            maybe_felonies.append(case)
        else:
            felony_flag.append(case)
    except (KeyboardInterrupt,SystemExit): 
        raise
    except:
        failed_cases.append(case)
        print('Failed case at %d' % case.casenumber)
        continue
    finally: 
        i += 1

0 cases processed
100 cases processed
200 cases processed
300 cases processed
400 cases processed
500 cases processed
600 cases processed
700 cases processed
800 cases processed
900 cases processed
1000 cases processed
1100 cases processed
1200 cases processed
1300 cases processed
1400 cases processed
1500 cases processed
1600 cases processed
1700 cases processed
1800 cases processed
1900 cases processed
2000 cases processed
2100 cases processed
2200 cases processed
2300 cases processed
2400 cases processed
2500 cases processed
2600 cases processed
2700 cases processed
2800 cases processed
2900 cases processed
3000 cases processed
3100 cases processed
3200 cases processed
3300 cases processed
3400 cases processed
3500 cases processed
3600 cases processed
3700 cases processed
3800 cases processed
3900 cases processed
4000 cases processed
4100 cases processed
4200 cases processed
4300 cases processed
4400 cases processed
4500 cases processed
4600 cases processed
4700 cases processed
4800

In [40]:
len(la_cases)

19290

In [41]:
len(maybe_cases)

1229

In [79]:
print(len(la_felonies))
print(len(maybe_felonies))
felonies_second_pass = []
hhh = []
i = 0
for f in maybe_felonies[1001:]: 
    if len(m.filter(regex='super ct [a-z]a\d{6}').columns) == 0:
        hhh.append(f)
    else: 
        felonies_second_pass.append(f)
    if i % 100 == 0:
        print(i)
    i += 1

131
8343
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300


In [6]:
len(maybe_felonies)
probable_felonies = la_felonies + maybe_felonies

In [46]:
t1 = 'ba333959'
t2 = '0a345909'
t3 = 'bb220020'
t4 = 'ba242'


print(re.search('[a-z]a\d{6}', t1))
print(re.search('[a-z]a\d{6}', t2))
print(re.search('[a-z]a\d{6}', t3))
print(re.search('[a-z]a\d{6}', t4))

<re.Match object; span=(0, 8), match='ba333959'>
None
None
None


In [7]:
with open('la_felonies', 'wb') as file:
    pickle.dump(probable_felonies, file)
with open('la_cases', 'wb') as file: 
    pickle.dump(la_cases, file)

In [52]:
c=la_cases[0]

In [55]:
c.ngram_matrix.index.contains('[a-z]a\d{6}')

AttributeError: 'RangeIndex' object has no attribute 'contains'

In [67]:
print(len(c.ngram_matrix.filter(regex='[a-z]a\d{6}').columns)==0)
print(len(c.ngram_matrix.filter(regex='[a-z]a\d{10}').columns)==0)

False
True


Eventually, we probably want to have a dataframe that has the case number as the index and columns like "THREE_STRIKES", "APPOINTED_ATTORNEY", "WEAPONS_ENHANCEMENT", "PROSECUTOR", "DATE_FILED", etc. 

In [71]:
q = la_cases[0:20]
qd = pd.DataFrame(q)
qd

Unnamed: 0,0
0,<__main__.Case object at 0x000001F403C76088>
1,<__main__.Case object at 0x000001EE683CAF88>
2,<__main__.Case object at 0x000001F3FBD90448>
3,<__main__.Case object at 0x000001F3FDE35148>
4,<__main__.Case object at 0x000001F40AF76BC8>
5,<__main__.Case object at 0x000001F40B948C88>
6,<__main__.Case object at 0x000001F40D01F3C8>
7,<__main__.Case object at 0x000001F40F25E6C8>
8,<__main__.Case object at 0x000001F400223BC8>
9,<__main__.Case object at 0x000001F411DBD588>


Before we go too much farther, we should make this a dataframe.


In [11]:
columns_to_use = ['APPEALS_CASE_NUMBER', 'CORPUS', 'NGRAM_MATRIX']
lst = []
for case in probable_felonies:
    lst.append([case.casenumber, case.corpus, case.ngram_matrix])
df = pd.DataFrame(lst, columns=columns_to_use)

In [12]:
df

Unnamed: 0,APPEALS_CASE_NUMBER,CORPUS,NGRAM_MATRIX
0,B213582,filed 4/2/12 p. v. khrayan ca2/2 not to be pu...,00 00 10 00 10 00 00 10 00 detective 00...
1,B226851,filed 7/31/12 p. v. scott ca2/7 not to be p...,100 100 feet 100 feet aqueduct 100 feet ...
2,B227717,filed 3/20/12 p. v. frazier ca2/5 not to be p...,000 000 past 000 past years 000 past yea...
3,B228643,filed 5/21/12 p. v. mcclelland ca2/7 not to b...,000 000 cash 000 cash days 000 cash days...
4,B231195,filed 4/24/12 p. v. rodriguez ca2/7 not to b...,10 10 2010 10 2010 rodriguez 10 2010 rod...
...,...,...,...
8469,B299414,filed 12/4/19 p. v. moore ca2/1 not to be pub...,1115 1115 court 1115 court appeal 1115 c...
8470,B299419,filed 1/17/20 p. v. bracamonte ca2/8 not to b...,106 106 wende 106 wende supra 106 wende ...
8471,B299427,filed 12/10/19 p. v. cole ca2/1 not to be pub...,10 10 19 10 19 cole 10 19 cole ca2 10 1...
8472,B300189,filed 1/14/20 p. v. keith ca2/3 not to be p...,10 10 2019 10 2019 keith 10 2019 keith d...


In [14]:
with open('felonies_dataframe', 'wb') as file:
    pickle.dump(df, file)

In [23]:
c = probable_felonies[0].ngram_matrix
last = c.index[-1]
c = c.rename(index={last: probable_felonies[0].casenumber})
c

Unnamed: 0,00,00 10,00 10 00,00 10 00 detective,00 10 00 detective currie,00 detective,00 detective currie,00 detective currie interviewed,00 detective currie interviewed gina,00 january,...,zapien 1993 cal 4th 929,zapien supra,zapien supra cal,zapien supra cal 4th,zapien supra cal 4th 964,zapienapparent,zapienapparent evidence,zapienapparent evidence destroyed,zapienapparent evidence destroyed nature,zapienapparent evidence destroyed nature defendant
B213582,3,1,1,1,1,1,1,1,1,1,...,1,2,2,2,2,1,1,1,1,1


In [4]:
with open('la_felonies', 'rb') as file: 
    probable_felonies = pickle.load(file)

In [8]:
c = probable_felonies[0].ngram_matrix
c_int = c.astype(dtype = 'int8', errors='ignore')
c_int

Unnamed: 0,00,00 10,00 10 00,00 10 00 detective,00 10 00 detective currie,00 detective,00 detective currie,00 detective currie interviewed,00 detective currie interviewed gina,00 january,...,zapien 1993 cal 4th 929,zapien supra,zapien supra cal,zapien supra cal 4th,zapien supra cal 4th 964,zapienapparent,zapienapparent evidence,zapienapparent evidence destroyed,zapienapparent evidence destroyed nature,zapienapparent evidence destroyed nature defendant
0,3,1,1,1,1,1,1,1,1,1,...,1,2,2,2,2,1,1,1,1,1


In [44]:
master_table = pd.DataFrame
lst = []
for case in probable_felonies: 
    df = case.ngram_matrix
    last = df.index[-1]
    # the resulting list is too large with default 64-bit float
    df = df.astype(dtype='int8', errors='ignore')
    df = df.rename(index={last: case.casenumber})
    lst.append(df)



In [45]:
nansafe_list = []
for df in lst:
    df = df.astype(dtype=pd.Int8Dtype(), errors='ignore')
    nansafe_list.append(df)

KeyboardInterrupt: 

In [46]:
len(nansafe_list)

737

In [10]:
with open('lst', 'wb') as file:
    pickle.dump(lst, file)

In [11]:
master_table = pd.concat(lst, axis=1, join='outer', sort=False)

MemoryError: Unable to allocate 5.20 GiB for an array with shape (82379, 8474) and data type float64

In [23]:
subtable1 = lst[0]
subtable1
type(lst[0])

pandas.core.frame.DataFrame

In [25]:
subtable1 = subtable1.append(lst[2:99])

In [49]:
subtable1 = subtable1.astype(pd.SparseDtype("int8", np.nan))


In [52]:
subtable1.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99 entries, B213582 to B270646
Columns: 453421 entries, 00 to years prison imposed mandatory fines
dtypes: Sparse[int8, nan](453421)
memory usage: 2.7+ MB


In [53]:
with open('lst.bin','wb') as file:
    pickle.dump(lst, file)

In [56]:
with open('probable_felonies.bin', 'wb') as file:
    pickle.dump(probable_felonies, file)

In [36]:
subtable1[subtable1['prior felony'].notnull()]

Unnamed: 0,00,00 10,00 10 00,00 10 00 detective,00 10 00 detective currie,00 detective,00 detective currie,00 detective currie interviewed,00 detective currie interviewed gina,00 january,...,waiving preliminary hearing,waiving preliminary hearing plea,waiving preliminary hearing plea contest,wende raising issues september,wende raising issues september 19,wish considered time boren,wish considered time boren ashmann,years prison imposed,years prison imposed mandatory,years prison imposed mandatory fines
B213582,3.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,,,,,,,,,,
B231195,,,,,,,,,,,...,,,,,,,,,,
B234590,,,,,,,,,,,...,,,,,,,,,,
B234597,,,,,,,,,,,...,,,,,,,,,,
B236683,,,,,,,,,,,...,,,,,,,,,,
B239566,,,,,,,,,,,...,,,,,,,,,,
B240370,,,,,,,,,,,...,,,,,,,,,,
B243070,,,,,,,,,,,...,,,,,,,,,,
B244731,,,,,,,,,,,...,,,,,,,,,,
B249216,,,,,,,,,,,...,,,,,,,,,,
