In [2]:
# Module imports
import os
import pickle
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [12]:
# Let's try setting up a class Case with properties "corpus", "wordcount", "casenum""
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

class Case:
    def __init__(self, textfile):
        assert os.path.isfile(textfile) == True
        # We start out with just the raw text string from the pdf
        # We should also initialize the casenumber attribute?
        with open(textfile, "rb") as file: 
            self.text = pickle.load(file)
            self.casenumber = os.path.splitext(os.path.split(textpath)[1])[0]
    # We should have a method for doing text cleanup and a method for 
    # count vectorization
    def make_corpus(self):
        '''Adds corpus attribute to the Case object by cleaning
        the raw text string'''
        # Make text lowercase: 
        text = self.text
        text = text.lower()
        # Strip all newlines: 
        text = re.sub('\n', '', text)
        self.corpus = text
    
        
        
    def make_unigram_matrix(self):
        '''Adds frequency-matrix-of-unigrams attribute to Case object. Requires
        that corpus attribute already exists.'''
        # currently redundant with ngram matrix
        cv = CountVectorizer(input='content',stop_words = 'english')
        freq_cv = cv.fit_transform([self.corpus])
        freq_matrix = pd.DataFrame(freq_cv.toarray(), columns = cv.get_feature_names())
        # Sort the words by frequency, most frequent first
        # freq_matrix = freq_matrix.sort_values(by=0,axis=1, ascending=False)
        # freq_matrix = freq_matrix.transpose()
        self.unigram_matrix = freq_matrix
    def make_ngram_matrix(self, n):
        '''Adds frequency-matrix-of-ngrams attribute to Case object. Requires
        that corpus attribute already exists.'''
        cv = CountVectorizer(input='content',stop_words = 'english',strip_accents='unicode',ngram_range=(1,n))
        freq_cv = cv.fit_transform([self.corpus])
        freq_matrix = pd.DataFrame(freq_cv.toarray(), columns = cv.get_feature_names())
        # Sort the words by frequency, most frequent first
        # freq_matrix = freq_matrix.sort_values(by=0,axis=1, ascending=False)
        # freq_matrix = freq_matrix.transpose()
        # note: transposing matrix makes the to_string method produce
        # a nice output, but makes indexing by ngram slightly harder?
        self.ngram_matrix = freq_matrix

  
        

        

One thing that's caused problems using the corpus is that the whitespace is very irregular, meaning matching expressions was impeded by the presence of tabs, returns, and long strings of space between words.

In [13]:
with open('short.bin', 'rb') as file: 
    shortlist = pickle.load(file)

In [14]:
def make_refined_corpus(string):
    regex_spaces = '(\s+)'
    refined_corpus = re.sub(r'(\s+)', ' ', string)
    refined_corpus = re.sub(r'[^\x00-\x7F]+', '', refined_corpus)
    return refined_corpus



In [15]:
test_case = shortlist[0]
test_case.corpus


'filed 4/2/12  p. v. khrayan ca2/2 not to be published in the official reports  california rules of court, rule 8.1115(a), prohibits courts and parties from citing or relying on opinions not certified for publication or ordered published, except as specified by rule 8.1115(b).  this opinion has not been certified for publication or ordered published for purposes of rule 8.1115.  in the court of appeal of the state of california  second appellate district  division two   the people,   plaintiff and respondent,   v.  arutyun khrayan,   defendant and appellant.        b213582        (los angeles county       super. ct. nos. ba255474, ba255302)      appeal from a judgment of the superior court of los angeles county.  kathleen kennedy, judge.  affirmed.   geragos & geragos and mark j. geragos for defendant and appellant.   kamala d. harris, attorney general, dane r. gillette, chief assistant attorney general, lance e. winters, assistant attorney general, linda c. johnson, robert david breto

In [16]:
clean_corpus_list = []
folder = os.path.join(os.path.abspath(''),'refined_corpuses')
for case in shortlist:
    clean_corpus = make_refined_corpus(case.corpus)
    writefile = os.path.join(folder,'%s.txt' % case.casenumber)
    with open(writefile, 'w') as f: 
        f.write(clean_corpus)
    clean_corpus_list.append(clean_corpus)

In [17]:
len(clean_corpus_list)

8474

In [18]:
casenumbers_la_felonies = []
for case in shortlist:
    casenumbers_la_felonies.append(case.casenumber)
with open('list_of_casenumbers.txt', 'wb') as f: 
    pickle.dump(casenumbers_la_felonies, f)

In [19]:
clean_corpus_list[6]

'filed 4/11/13 p. v. lozano ca2/3 not to be published in the official reports california rules of court, rule 8.1115(a), prohibits courts and parties from citing or relying on opinions not certified for publication or ordered published, except as specified by rule 8.1115(b). this opinion has not been certified for publication or ordered published for purposes of rule 8.1115. in the court of appeal of the state of california second appellate district division three the people, plaintiff and respondent, v. sammy lozano, defendant and appellant. b233393 (los angeles county super. ct. nos. ta077104 & ta080053) appeal from a judgment of the superior court of los angeles county, allen joseph webster, judge. affirmed with directions. george l. schraer for defendant and appellant. kamala d. harris, attorney general, dane r. gillette, chief assistant attorney general, lance e. winters, assistant attorney general, paul m. roadarmel, jr., stephanie a. miyoshi and william n. frank, deputy attorney

In [20]:
test_corpus_clean = clean_corpus_list[10]

In [21]:
# Get filing date
# Search for the string "filed " followed by a date
# return the date
def get_filing_date(corpus):
    strings = re.search(r'filed \d+/\d+/\d+', corpus)
    if len(strings) == 1:
        string = strings[0]
        date = string.replace('filed ','')
        l = date.split('/')
        date_info = {'date':date, 'month':int(l[0]), 'day':int(l[1]), 'year':int(l[2]) + 2000}
        return date_info
    elif len(strings) == 0: 
        print('No dates found.')
    else: 
        print('Multiple dates found.')
    return strings

In [22]:
# Get defendant name
# Search for "respondent, v. " or similar followed by
# "name, defendant"
# return name
def get_defendant(corpus):
    defendant_name = re.findall(r'respondent, vs?\. .{3,100}?, defendants?\b', corpus)
    if len(defendant_name) == 0:
        defendant_name = ['oop', 'oh no']
    return defendant_name

In [23]:
defendant_search_term = r'respondent, vs?\. .+, defendants?'
p = get_defendant(test_corpus_clean[0])
p

['oop', 'oh no']

In [24]:
re.findall(r'filed \d+/\d+/\d+', test_corpus_clean)

['filed 8/22/12']

In [25]:
attorney = r'(\. .{3,100}? for defendants?)'
appointed = r'(under appointment)'
outcome = r'(judgm?en?t?\. .{3,100}?\.)'


In [26]:
SEARCH_TERMS = [
    'felony', 
    'three strikes', 
    'prior conviction', 
    '3 strikes', 
    'strikes enhancement', 
    'kamala harris', 
    'firearm',
    'firearm enhancement', 
    'sentencing', 
    'high term', 
    'low term', 
    'mid term', 
    'habeas corpus',
    'prior convictions',
    'reversed',
    'remanded',
    'affirmed',
    'serious felony',
    'strike',
    'to life',
    'years life',
    'assault', 
    'deadly weapon',
    'murder',
    'felony enhancements',
    'felony enhancement',
    'guilty plea',
    'resentencing',
    'under appointment',
    'no appearance',
    'affirm', 
    'determinate term', 
    'indeterminate term',
    'raised no',
    'no issues',
    'felony priors',
    'strikes law',
    'second strike', 
    'third strike',
    'enhancement',
    'ineffective assistance'
]

In [27]:
def search_corpus(corpus, string): 
    found = re.findall(string, corpus[0:2000])
    return found
     

In [None]:
for case in clean_corpus_list[0:99]: 
    print((case.casenum, search_corpus(case, outcome)))

In [29]:
for corpus in clean_corpus_list[0:20]:
    print((search_corpus(corpus, attorney), search_corpus(corpus, appointed), search_corpus(corpus, outcome)))

(['. kathleen kennedy, judge. affirmed. geragos & geragos and mark j. geragos for defendant'], [], ['judge. affirmed.'])
(['. affirmed. richard c. neuhoff and barbara a. zuras, under appointment by the court of appeal, for defendant'], ['under appointment'], ['judge. affirmed.'])
(['. affirmed as modified. joseph shipp, under appointment by the court of appeal, for defendant', '. edward j. haggerty, under appointment by the court of appeal, for defendant'], ['under appointment', 'under appointment'], ['judge. affirmed as modified.'])
(['. affirmed as modified. sarah a. stockwell, under appointment by the court of appeal, for defendant', '. thomas owen, under appointment by the court of appeal, for defendant'], ['under appointment', 'under appointment'], ['judge. affirmed as modified.'])
(['. appeal dismissed. gary v. crooks, under appointment by the court of appeal, for defendant'], ['under appointment'], ['judge. appeal dismissed.'])
(['. charles a. chung, judge. affirmed. karyn h. bu

In [54]:
def is_affirmed(corpus):
    check = search_corpus(corpus, outcome)
    if len(check) == 0:
        out = 0
    elif len(check) != 0 and 'affirmed' in check[0]:
        out = 1
    else:
        out = 0
    return out    

In [31]:
def is_appointed(corpus):
    check = search_corpus(corpus, appointed)
    if len(check) == 0:
        out = 0
    else:
        out = 1
    return out  

In [32]:
def exists(string, corpus):
    if len(re.findall(string, corpus)) == 0:
        return 0
    else:
        return 1
    
        

In [None]:
def dict_exists(d, corpus):
    for k in 

In [55]:
dicts = []
for corpus in clean_corpus_list:
    d = {
        'Affirmed':is_affirmed(corpus),
        'Appointed':is_appointed(corpus)
        }
    dicts.append(d)

In [56]:
affirmed_and_appointed = 0
for d in dicts:
    affirmed_and_appointed += d['Affirmed']*d['Appointed']
affirmed_and_appointed

6612

In [57]:
affirmed = 0
for d in dicts:
    affirmed += d['Affirmed']
affirmed

7109

In [58]:
apptd = 0
for d in dicts:
    apptd += d['Appointed']
apptd

7540

In [37]:
len(dicts)

8474

In [38]:
columns_to_use = [
    'case_number',
    'raw_corpus',
    'clean_corpus',
    'filing_date',
    'filing_month',
    'filing_year',
    'defendant_name',
    'atty_appointed',
    'outcome',
    'prosecutor',
    'high_term',
    'sentencing_related'
]

In [39]:
# DON'T RUN
dicts = []
for case in clean:
    clean_corpus = make_refined_corpus(case.corpus)
    d = {
        'case_number':case.casenumber,
        'raw_corpus':case.corpus,
        'clean_corpus':clean_corpus,
        'filing_date':get_filing_date(clean_corpus),
        'is_appointed':is_appointed(clean_corpus),
        'is_affirmed':is_affirmed(clean_corpus)
    }
    dicts.append(d)

TypeError: object of type 're.Match' has no len()

In [44]:
new_dicts = []
for case in clean_corpus_list:
    d = {}
    for term in SEARCH_TERMS:
        d[term] = exists(term, case)
    new_dicts.append(d)
    

In [45]:
len(new_dicts)

8474

In [59]:
case_matrix = pd.DataFrame(dicts)
case_matrix

Unnamed: 0,Affirmed,Appointed
0,1,0
1,1,1
2,1,1
3,1,1
4,0,1
...,...,...
8469,1,1
8470,1,1
8471,1,1
8472,0,1


In [60]:
new_case_matrix = pd.DataFrame(new_dicts)
new_case_matrix.describe()

Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
count,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,...,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0
mean,0.603257,0.239438,0.331367,0.000826,0.0,0.001416,0.397923,0.135827,0.556172,0.067383,...,0.077059,0.056054,0.043309,0.166391,0.003422,0.230588,0.052278,0.044253,0.462945,0.196601
std,0.489251,0.426766,0.470732,0.028731,0.0,0.037607,0.489498,0.342625,0.496864,0.250698,...,0.266701,0.230039,0.203564,0.372454,0.058403,0.421234,0.222599,0.205669,0.498654,0.397452
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [61]:
new_case_matrix['affirmed'] = case_matrix['Affirmed']

'affirmed' in new_case_matrix.columns

True

In [64]:
AAA = (new_case_matrix['affirmed'] * new_case_matrix['under appointment']*new_case_matrix['ineffective assistance'])
AAA.describe()

count    8474.000000
mean        0.150224
std         0.357312
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max         1.000000
dtype: float64

In [63]:
apptd = new_case_matrix[new_case_matrix['under appointment']==1]
appaff = apptd[apptd['affirmed']==1]
retd = new_case_matrix[new_case_matrix['under appointment']==0]
retdaff = retd[retd['affirmed']==1]
diff = apptd.describe() - retd.describe()
display(apptd)
display(retd)
display(appaff)
display(retdaff)
display(new_case_matrix[new_case_matrix['affirmed']==1])

Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
4,1,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,1,0,1,0,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
8470,1,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
8471,1,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
8472,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
0,1,1,1,0,0,0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
6,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
15,0,0,0,0,0,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
23,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32,1,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8452,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8455,0,0,0,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,0
8459,1,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1
8463,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
5,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8466,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
8468,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,1,0
8469,1,0,1,0,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
8470,1,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
0,1,1,1,0,0,0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
6,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
79,1,1,1,0,0,0,1,0,1,0,...,1,0,0,0,0,1,0,0,1,0
87,1,0,1,0,0,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,1
93,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8346,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8351,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8381,0,0,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
8383,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,felony,three strikes,prior conviction,3 strikes,strikes enhancement,kamala harris,firearm,firearm enhancement,sentencing,high term,...,determinate term,indeterminate term,raised no,no issues,felony priors,strikes law,second strike,third strike,enhancement,ineffective assistance
0,1,1,1,0,0,0,1,0,1,1,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,1
2,1,0,0,0,0,0,1,0,1,0,...,1,1,0,0,0,0,0,0,1,1
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,1
5,1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8466,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
8468,1,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,1,0
8469,1,0,1,0,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
8470,1,0,0,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [None]:
with open('new_case_matrix.bin', 'wb') as file:
    pickle.dump(new_case_matrix, file)