In [3]:
# Module imports
import os
import pickle
import re
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [69]:


class Case:
    """A single appeals case: its text, file locations and regex search results.

    Attributes are filled in step by step by the setter/reader methods:

    * ``scanned_text_path`` / ``scanned_text`` -- raw text file and its contents
    * ``clean_text_path`` / ``clean_text``     -- cleaned text file and its contents
    * ``appeals_case_number``                  -- file name (without extension) of
      whichever text file was read last
    * ``occurrence_df``   -- one-row DataFrame indexed by the case number, holding
      a 0/1 column per search category
    * ``occurrence_dict`` -- per-case mapping of category -> list of matches, or
      the string ``'Not found'``
    * ``pdf_path``, ``save_path`` -- location of the source PDF / pickle file
    """

    # Class-level default, kept so that ``Case.occurrence_dict`` still exists for
    # old code. Results are written to a per-instance dict (see __init__/find):
    # writing into this shared dict would mix the results of every case together
    # and would not be included when an instance is pickled.
    occurrence_dict = {}

    def __init__(self):
        # Each case gets its own results dict instead of sharing the class one.
        self.occurrence_dict = {}

    def _register_source_path(self, text_path):
        """Set the case number from ``text_path`` and start an empty results frame.

        The case number is the file name of ``text_path`` without its extension.
        Any existing ``occurrence_df`` is replaced.
        """
        # TODO: check whether a case number already exists before overwriting it?
        file_name = os.path.split(text_path)[1]
        self.appeals_case_number = os.path.splitext(file_name)[0]
        self.occurrence_df = pd.DataFrame(index=[self.appeals_case_number])

    def set_scanned_text_path(self, scanned_text_path):
        """Remember where the scanned (raw) text file lives."""
        self.scanned_text_path = scanned_text_path

    def read_scanned_text(self):
        """Load ``scanned_text`` from ``scanned_text_path`` and set the case number.

        Raises:
            AssertionError: if ``scanned_text_path`` is not an existing file.
        """
        assert os.path.isfile(self.scanned_text_path), \
            "scanned_text_path must lead to a file."
        with open(self.scanned_text_path, "r") as file:
            self.scanned_text = file.read()
        self._register_source_path(self.scanned_text_path)

    def set_clean_text_path(self, clean_text_path):
        """Remember where the cleaned text file lives."""
        self.clean_text_path = clean_text_path

    def read_clean_text(self):
        """Load ``clean_text`` from ``clean_text_path`` and set the case number.

        Raises:
            AssertionError: if ``clean_text_path`` is not an existing file.
        """
        assert os.path.isfile(self.clean_text_path), \
            "clean_text_path must lead to a file."
        with open(self.clean_text_path, "r") as file:
            self.clean_text = file.read()
        self._register_source_path(self.clean_text_path)

    def make_clean_text(self):
        """Build ``clean_text`` from ``scanned_text``.

        The text is lower-cased, newlines are removed, and every remaining run
        of whitespace is collapsed to a single space.
        """
        text = self.scanned_text.lower()
        # Strip all newlines. NOTE: they are removed, not replaced by a space,
        # so words separated only by a newline end up joined together.
        text = re.sub('\n', '', text)
        # Turn all remaining whitespace into single spaces.
        self.clean_text = re.sub(r'(\s+)', ' ', text)

    def set_pdf_path(self, pdf_path):
        """Remember the PDF location.

        Raises:
            AssertionError: if ``pdf_path`` is not an existing file.
        """
        assert os.path.isfile(pdf_path), \
            "pdf_path must lead to a file."
        self.pdf_path = pdf_path

    def set_save_path(self):
        """Set ``save_path`` to ``<cwd>/case_objects/<case number>.bin``.

        Raises:
            AssertionError: if the appeals case number has not been set yet.
        """
        # getattr: an unset attribute should trigger the assertion message
        # rather than an AttributeError.
        assert getattr(self, 'appeals_case_number', None) is not None, \
            "Appeals case number must be set before object can be saved."
        current_path = os.path.abspath('')
        folder = os.path.join(current_path, 'case_objects')
        self.save_path = os.path.join(
            folder, '%s.bin' % self.appeals_case_number)

    def save(self):
        """Pickle this object to ``save_path``, creating its folder if needed."""
        folder = os.path.dirname(self.save_path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        with open(self.save_path, 'wb') as f:
            pickle.dump(self, f)
        # TODO: test read/write from binary

    def find(self, search_dictionary):
        """Search ``clean_text`` for each regex and record the outcome.

        Args:
            search_dictionary: mapping of the form ``{'category': 'regex'}``.

        For every category, ``occurrence_df[category]`` is set to 1 if the
        regex matches at least once and 0 otherwise, and
        ``occurrence_dict[category]`` is set to the ``re.findall`` result or to
        the string ``'Not found'``.

        Raises:
            AssertionError: if no cleaned text is available or the dictionary
                is empty.
        """
        assert getattr(self, 'clean_text', None) is not None, \
            'Cleaned text must be available to search.'
        assert (len(search_dictionary) != 0), \
            'Size of dictionary is zero. Dictionary must have contents.'
        # Instances created without __init__ (e.g. unpickled older objects)
        # have no dict of their own; give them one so the shared class-level
        # dict is never written to.
        if 'occurrence_dict' not in vars(self):
            self.occurrence_dict = {}
        text = self.clean_text
        for key, pattern in search_dictionary.items():
            found = re.findall(pattern, text)
            if found:
                self.occurrence_df[key] = 1
                self.occurrence_dict[key] = found
            else:
                self.occurrence_df[key] = 0
                self.occurrence_dict[key] = 'Not found'


At the time of writing, all of the refined corpuses that I want to use are in a single folder (`refined_corpuses`).

In [70]:
# Build one Case per file in the refined-corpus folder, pickle it, and keep
# it in `cases` for the cells below.
corpus_folder = os.path.join(os.path.abspath(''), 'refined_corpuses')
cases = []
# os.listdir returns names in arbitrary order; sorting keeps slices such as
# cases[0:9] the same from run to run.
for f in sorted(os.listdir(corpus_folder)):
    clean_text_path = os.path.join(corpus_folder, f)
    # Skip anything that is not a regular file (e.g. a sub-directory);
    # read_clean_text() asserts that the path is a file.
    if not os.path.isfile(clean_text_path):
        continue
    case = Case()
    case.set_clean_text_path(clean_text_path)
    case.read_clean_text()
    case.set_save_path()
    case.save()
    cases.append(case)

In [71]:
# Sanity check that every case was loaded: the expected count is 8474.
len(cases)


8474

In [72]:
# Spot-check the first few cases. For each one, print on separate lines:
#   - the appeals case number (should be a str of form 'BXXXXXX')
#   - the save path (should be absolute and end in '<casenum>.bin')
#   - the first 100 characters of the cleaned text
for c in cases[:9]:
    print(c.appeals_case_number, c.save_path, c.clean_text[:100], sep='\n')

B120629
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B120629.bin
filed 4/10/13 p. v. downing ca2/5 not to be published in the official reports california rules of co
B158266
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B158266.bin
 filed 6/16/15 p. v. pilola ca2/4 not to be published in the official reports california rules of co
B197574
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B197574.bin
filed 1/26/15 p. v. goodwin ca2/8 not to be published in the official reports california rules of co
B211622
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B211622.bin
filed 8/30/13 p. v. gutierrez ca2/1 not to be published in the official reports california rules of 
B213582
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B213582.bin
filed 4/2/12 p. v. khrayan ca2/2 not to be published in the official reports california rules of cou
B217909
C:\Users\leodb\Documents\THESIS\ldbw\case_objects\B217909.bin
 filed 9/27/12 p. v. alonzo ca2/3 not to be published in the official repor

In [73]:
# Run the regex searches on the first few cases only.
# NOTE(review): SEARCH_DICTIONARY is assigned in a later cell of this
# notebook, so on a fresh kernel that cell has to be run before this one
# (otherwise this raises NameError). Consider moving the definition up.
for c in cases[0:9]:
    c.find(SEARCH_DICTIONARY)

In [None]:
# Maps a result category to the regex searched for in a case's cleaned
# (lower-cased, single-spaced) text. Used as the argument to Case.find().
# Groups are non-capturing unless the whole pattern is the group, so that
# re.findall returns the full matched text rather than just a group.
SEARCH_DICTIONARY = {
    'search_attorney': r'(\. .{3,100}? for defendants?)',
    'search_appointed': r'under appointment',
    'search_outcome': r'\. judgment .{3,100}?\.',
    'pitchess_motion': r'pitchess .{0,5}?motion',
    'prejudice': 'unduly prejudicial',
    'no_error': 'no error',
    'mid_term': r'mid(?:dle)?[ -]?term',
    # Fixed: was misspelled 'diposition' and used '{0:100}', which is not a
    # quantifier (re treats it as literal text), so it could never match.
    'disposition': r'disposition .{0,100}?\.',
    'prior_conviction': r'prior .{3,20}conviction',
    'prior_felony_conviction': r'prior felony convictions?',
    'ineffective_assistance': r'ineffective assistance',
    'habeas_corpus': r'habeas corpus',
    'prosecutorial_misconduct': r'(?:prosecutor|prosecutorial) misconduct',
    'polygraph': 'polygraph',
    'kamala_harris': r'kamala .{0,10}?harris',
    'constitutional_violation': r'constitutional violation',
}


In [76]:
# Sample sentence for trying out the "three strikes" regex.
test_string = " three strikes and you're out, 3-strike policy, three-strikes left right"

In [92]:
# 'three' or '3', then one whitespace character or a hyphen, then 'strike'
# with an optional 's'. Both parenthesised parts are capturing groups.
reg = r'(three|3)(\s|-)strikes?'

In [93]:
# Because `reg` contains capturing groups, re.findall returns one tuple of
# the captured groups per match instead of the full matched text.
re.findall(reg, test_string)

[('three', ' '), ('3', '-'), ('three', '-')]