In [1]:
# Module imports
import os
import pickle
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
SEARCH_DICTIONARY = {
    'search_attorney': r'(\. .{3,100}? for defendants?)',
    'search_appointed': r'under appointment',
    'search_outcome': r'(judgm?en?t?\. .{3,100}?\.)',
    'pitchess_motion': r'pitchess .{0,5}?motion',
    'prejudice':'unduly prejudicial',
    'no_error':'no error',
    'mid_term': r'mid(dle)?[ -]?term',
    'disposition': r'disposition .{0:100}?\.',
    'full_disposition':r'disposition.{0:100}?not to be published',
    'prior_conviction': r'prior .{3,20}conviction',
    'prior_felony_conviction': r'prior felony convictions?',
    'ineffective_assistance': r'ineffective assistance',
    'habeas_corpus': r'habeas corpus',
    'prosecutorial_misconduct': r'(prosecutor|prosecutorial) misconduct',
    'polygraph': 'polygraph',
    'kamala_harris': r'kamala .{0,10}?harris',
    'constitutional_violation': r'constitutional violation',
    'three_strikes':r'(?:three|3)(?:\s|-)strikes?',
    'enhancement':r'(?:firearm|strikes|felony) enhancements?',    
    'any_enhancement':r'[\w+]{3,20}\senhancement\b\s?[\w+]{3,20}',    
    'term':r'\b[\w+]*\b(?:\s|-)?term',
    'resentencing':r're-?sentenc'}


In [22]:
MORE_TERMS = {
    'felony':r'felon(y|ies)',
    'three_strikes':r'(three|3) ?-?strikes'
    }

In [None]:
SEARCH_TERMS = [
    'felony',
    'three strikes',
    'prior conviction',
    '3 strikes',
    'strikes enhancement',
    'kamala harris',
    'firearm',
    'firearm enhancement',
    'sentencing',
    'high term',
    'low term',
    'mid term',
    'habeas corpus',
    'prior convictions',
    'reversed',
    'remanded',
    'affirmed',
    'serious felony',
    'strike',
    'to life',
    'years life',
    'assault',
    'deadly weapon',
    'murder',
    'felony enhancements',
    'felony enhancement',
    'guilty plea',
    'resentencing',
    'under appointment',
    'no appearance',
    'affirm',
    'determinate term',
    'indeterminate term',
    'raised no',
    'no issues',
    'felony priors',
    'strikes law',
    'second strike',
    'third strike',
    'enhancement',
    'ineffective assistance'
]



In [3]:


class Case:
    # occurrence_dict = {}
    # Note to self: These are class elements, so if you want to change them,
    # you can change the whole class by setting Case.<element> = <value>
    # and change individual cases with case.<element> = value
    def __init__(self):
        self.occurrence_dict = {}
        pass
    def set_scanned_text_path(self, scanned_text_path):
        self.scanned_text_path = scanned_text_path
    def read_scanned_text(self):
        assert os.path.isfile(self.scanned_text_path) == True
        with open(self.scanned_text_path, "r") as file:
            self.scanned_text = file.read()
            self.appeals_case_number = os.path.splitext(os.path.split(self.scanned_text_path)[1])[0]
            self.occurrence_df = pd.DataFrame(index=[self.appeals_case_number])
    def set_clean_text_path(self, clean_text_path):
        self.clean_text_path = clean_text_path
    def read_clean_text(self):
        assert os.path.isfile(self.clean_text_path) == True
        with open(self.clean_text_path, "r") as file:
            self.clean_text = file.read()
            self.appeals_case_number = os.path.splitext(os.path.split(self.clean_text_path)[1])[0]
            self.occurrence_df = pd.DataFrame(index=[self.appeals_case_number])
    def make_clean_text(self):
        text = self.scanned_text
        text = text.lower()
        # Strip all newlines:
        text = re.sub('\n', '', text)
        # turn all whitespace into single spaces:
        self.clean_text = re.sub(r'(\s+)', ' ', text)
    # TODO: either merge the text functions or have them check if casenum
    # already exists?
    def set_pdf_path(self, pdf_path):
        assert os.path.isfile(pdf_path) == True, \
            "pdf_path must lead to a file."
        self.pdf_path = pdf_path
    def set_save_path(self):
        assert (self.appeals_case_number is not None), \
            "Appeals case number must be set before object can be saved."
        current_path = os.path.abspath('')
        folder = os.path.join(current_path, 'case_objects')
        save_path = os.path.join(folder,'%s.bin' % self.appeals_case_number)
        self.save_path = save_path
    def save(self):
        with open(self.save_path, 'wb') as f:
            pickle.dump(self, f)
        # TODO: test read/write from binary

# want a function that takes a dict {key:value}, searches for value,
# if found: puts the result in column "key" in DataFrame
# if not: puts ... None? False? 'Not found?'

    def find(self, search_dictionary):
    # search_dictionary should be of form {'category':'regex'}
        assert (self.clean_text is not None), \
            'Cleaned text must be available to search.'
        assert (len(search_dictionary) != 0), \
            'Size of dictionary is zero. Dictionary must have contents.'
        text = self.clean_text
        for key in search_dictionary:
            found = re.findall(search_dictionary[key], text)
            if len(found) != 0:
                self.occurrence_df[key] = 1
                self.occurrence_dict[key] = found
            else:
                self.occurrence_df[key] = 0
                self.occurrence_dict[key] = 'Not found'


As I'm writing this, I do have all of the refined corpuses that I want to use in one folder. 

In [4]:
corpus_folder = os.path.join(os.path.abspath(''),'refined_corpuses')
cases = []
for f in os.listdir(corpus_folder):
    clean_text_path = os.path.join(corpus_folder, f)
    case = Case()
    case.set_clean_text_path(clean_text_path)
    case.read_clean_text()
    case.set_save_path()
    case.save()
    cases.append(case)

In [5]:
# Checking that we got all the cases: should be 8474
len(cases)


8474

In [6]:
# Checking that properties exist as expected: 
for c in cases[0:9]:
    print(c.appeals_case_number) # should be str of form 'BXXXXXX'
    print(c.save_path) # should be absolute path ending in casenum
    print(c.clean_text[0:100]) # should be first 101 chars of text

B120629
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B120629.bin
filed 4/10/13 p. v. downing ca2/5 not to be published in the official reports california rules of co
B158266
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B158266.bin
 filed 6/16/15 p. v. pilola ca2/4 not to be published in the official reports california rules of co
B197574
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B197574.bin
filed 1/26/15 p. v. goodwin ca2/8 not to be published in the official reports california rules of co
B211622
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B211622.bin
filed 8/30/13 p. v. gutierrez ca2/1 not to be published in the official reports california rules of 
B213582
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B213582.bin
filed 4/2/12 p. v. khrayan ca2/2 not to be published in the official reports california rules of cou
B217909
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B217909.bin
 filed 9/27/12 p. v. alonzo ca2/3 not to be published in the official repor

In [7]:
with open('list_of_felony_cases.bin','wb') as f:
    pickle.dump(cases, f)

list_of_felony_cases.bin does not include the cases as processed with SEARCH_DICTIONARY.

In [8]:
for c in cases:
    c.find(SEARCH_DICTIONARY)

In [9]:
with open('list_of_processed_felony_cases.bin','wb') as f:
    pickle.dump(cases, f)

In [11]:
list_of_dfs = []
for c in cases: 
    list_of_dfs.append(c.occurrence_df)


In [12]:
len(list_of_dfs)
type(list_of_dfs[0])

pandas.core.frame.DataFrame

In [13]:
with open('list_of_case_dfs.bin','wb') as f:
    pickle.dump(list_of_dfs, f)

In [14]:
all_cases = pd.concat(list_of_dfs, join='outer')
display(all_cases.head(30))

Unnamed: 0,search_attorney,search_appointed,search_outcome,pitchess_motion,prejudice,no_error,mid_term,disposition,full_disposition,prior_conviction,...,habeas_corpus,prosecutorial_misconduct,polygraph,kamala_harris,constitutional_violation,three_strikes,enhancement,any_enhancement,term,resentencing
B120629,1,1,1,0,0,1,0,0,0,0,...,1,0,0,1,0,1,0,0,1,0
B158266,1,1,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
B197574,1,1,1,0,0,1,0,0,0,0,...,0,1,1,1,1,0,0,0,1,0
B211622,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0
B213582,1,0,1,0,0,0,1,0,0,1,...,0,0,0,1,1,1,0,0,1,0
B217909,1,1,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,1,1,1,1
B218637,0,0,1,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
B220072,1,0,1,0,1,1,0,0,0,1,...,0,1,0,1,1,0,0,0,1,0
B220664,1,1,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
B221110,1,1,1,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,1,1


In [15]:
all_cases.describe()

Unnamed: 0,search_attorney,search_appointed,search_outcome,pitchess_motion,prejudice,no_error,mid_term,disposition,full_disposition,prior_conviction,...,habeas_corpus,prosecutorial_misconduct,polygraph,kamala_harris,constitutional_violation,three_strikes,enhancement,any_enhancement,term,resentencing
count,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,...,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0,8474.0
mean,0.949375,0.889781,0.946542,0.040477,0.05334,0.153646,0.20262,0.0,0.0,0.291244,...,0.086618,0.072693,0.004248,0.521359,0.02372,0.241444,0.160609,0.289002,0.807175,0.369601
std,0.219245,0.313182,0.224958,0.197086,0.224723,0.360631,0.401975,0.0,0.0,0.454362,...,0.281291,0.259647,0.065044,0.499573,0.152183,0.427984,0.367192,0.453325,0.39454,0.482725
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
with open('processed_cases_df.bin','wb') as f:
    pickle.dump(all_cases, f)

In [17]:
found_cases = all_cases[all_cases['search_attorney']==1]
found_cases.describe()

Unnamed: 0,search_attorney,search_appointed,search_outcome,pitchess_motion,prejudice,no_error,mid_term,disposition,full_disposition,prior_conviction,...,habeas_corpus,prosecutorial_misconduct,polygraph,kamala_harris,constitutional_violation,three_strikes,enhancement,any_enhancement,term,resentencing
count,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,...,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0,8045.0
mean,1.0,0.928278,0.975761,0.041641,0.055935,0.157862,0.203356,0.0,0.0,0.294344,...,0.081666,0.075699,0.004475,0.542946,0.024612,0.242387,0.163083,0.294966,0.812803,0.369671
std,0.0,0.258042,0.153799,0.199779,0.229811,0.364634,0.40252,0.0,0.0,0.455776,...,0.273872,0.264533,0.066748,0.498183,0.154948,0.428554,0.369464,0.456056,0.390094,0.482745
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
with open('found_cases_df.bin','wb') as f: 
    pickle.dump(found_cases, f)

In [19]:
test_string = ' three strikes and you\'re out, 3-strike policy, three-strikes left right' 

In [20]:
f = found_cases['ineffective_assistance']*found_cases['search_appointed']
found_cases[f==1].describe()

Unnamed: 0,search_attorney,search_appointed,search_outcome,pitchess_motion,prejudice,no_error,mid_term,disposition,full_disposition,prior_conviction,...,habeas_corpus,prosecutorial_misconduct,polygraph,kamala_harris,constitutional_violation,three_strikes,enhancement,any_enhancement,term,resentencing
count,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,...,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0,1411.0
mean,1.0,1.0,0.967399,0.050319,0.102764,0.231751,0.228916,0.0,0.0,0.336641,...,0.227498,0.214033,0.009922,0.588235,0.04394,0.264352,0.24309,0.38696,0.869596,0.341602
std,0.0,0.0,0.177653,0.21868,0.303758,0.4221,0.420284,0.0,0.0,0.472728,...,0.419366,0.410295,0.099149,0.492327,0.205035,0.441144,0.429101,0.487227,0.336867,0.474415
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [21]:
reg = r'(?:three|3)(?:\s|-)strikes?'

In [55]:
re.findall(reg, test_string)

['three strikes', '3-strike', 'three-strikes']

In [56]:
# r'(?:this|or this)'

In [57]:
disposition_test = '. affirmed as modified. sarah a. stockwell, under appointment by the court of appeal, for defendant'


In [58]:
term_test = 'defendant was sentenced to the high term for count one, mid-term for count two, and the middle-term for count three.'
term_reg = r'\b[\w+]*\b(?:\s|-)?term'
re.findall(term_reg, term_test)

['high term', 'mid-term', 'middle-term']

In [59]:
df.groupby([col1, col2]).sum()

NameError: name 'df' is not defined

In [60]:
for case in cases[0:10]:
    print(case.appeals_case_number)
    print(case.occurrence_dict)

B120629
{'search_attorney': ['. ronald s. coen, judge. affirmed. siri shetty, under appointment by the court of appeal, for defendant'], 'search_appointed': ['under appointment'], 'search_outcome': ['judge. affirmed.'], 'pitchess_motion': 'Not found', 'prejudice': 'Not found', 'no_error': ['no error'], 'mid_term': 'Not found', 'disposition': 'Not found', 'full_disposition': 'Not found', 'prior_conviction': 'Not found', 'prior_felony_conviction': 'Not found', 'ineffective_assistance': ['ineffective assistance'], 'habeas_corpus': ['habeas corpus', 'habeas corpus'], 'prosecutorial_misconduct': 'Not found', 'polygraph': 'Not found', 'kamala_harris': ['kamala d. harris'], 'constitutional_violation': 'Not found', 'three_strikes': ['three strikes', 'three strikes', 'three strikes', 'three strikes'], 'enhancement': 'Not found', 'any_enhancement': 'Not found', 'term': ['a term', 'prison term'], 'resentencing': 'Not found'}
B158266
{'search_attorney': ['. torribio, judge. affirmed. charles r. kh

In [61]:
cases[1].find(SEARCH_DICTIONARY)

In [62]:
cases[1].occurrence_dict

{'search_attorney': ['. torribio, judge. affirmed. charles r. khoury, jr., under appointment by the court of appeal, for defendant'],
 'search_appointed': ['under appointment'],
 'search_outcome': ['judge. affirmed.'],
 'pitchess_motion': 'Not found',
 'prejudice': 'Not found',
 'no_error': ['no error'],
 'mid_term': 'Not found',
 'disposition': 'Not found',
 'full_disposition': 'Not found',
 'prior_conviction': 'Not found',
 'prior_felony_conviction': 'Not found',
 'ineffective_assistance': ['ineffective assistance'],
 'habeas_corpus': 'Not found',
 'prosecutorial_misconduct': 'Not found',
 'polygraph': 'Not found',
 'kamala_harris': ['kamala d. harris'],
 'constitutional_violation': 'Not found',
 'three_strikes': 'Not found',
 'enhancement': 'Not found',
 'any_enhancement': 'Not found',
 'term': 'Not found',
 'resentencing': 'Not found'}