In [207]:
#import required packages
import nltk
import pandas as pd
import csv
import re
import numpy as np
import os
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [208]:
from nltk.corpus import stopwords
#Add more stopwords
adtl_stopwords = pd.read_csv("./preprocessing/test_stopwords.csv", header = None)
adtl_stopwords = adtl_stopwords.values.tolist()
adtl_stopwords = adtl_stopwords[0]

#Combine with default stopwords list
stopwords_def = stopwords.words('english')
stopwords_def.extend(adtl_stopwords)
print(stopwords_def)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [209]:
#Open sample case (CEQA: 04672)
draft_case = open(r"./data/TEST1.txt", "r")
draft_case_original = draft_case.read()
draft_case = draft_case_original

In [210]:
#Use regex to do a preliminary cleaning of the content
### Remove punctuation and numbers
draft_case = re.sub("[0-9]", "", draft_case) #Removes numbers
draft_case = re.sub(r"\/", " ", draft_case) #Replaces slashes with spaces
draft_case = re.sub(r"\'", " ", draft_case) #Replaces apostrophes with spaces
draft_case = re.sub(r"\-", " ", draft_case) #Replaces en-dash with spaces
draft_case = re.sub(r"\–", " ", draft_case) #Replaces em-dash with spaces
draft_case = re.sub(r"[.,?!();:@§%&~\[\]\"]", "", draft_case) #Removes extraneous punctuation
draft_case = re.sub(r"^$n','\n", "", draft_case, re.MULTILINE) #Removes empty lines
draft_case = re.sub(r"\b\w{1,2}\b", '', draft_case) #Removes all one and two-character words (none have meaning)
draft_case = re.sub(r"\n","", draft_case) #Removes extraneous line breaks

#Remove extraneous information about lawyers and case detail that doesn't impact the content of the case
starting_phrase = "PRELIMINARY STATEMENT" #Use this as the starter for when the actual lawsuit begins
remove_before = r'^.*?{}'.format(re.escape(starting_phrase))
draft_case = re.sub(remove_before, "", draft_case, flags=re.DOTALL)

#Remove "FIRST AMENDED COMPLAINT", which is at the bottom of every page
draft_case = re.sub(r"FIRST AMENDED COMPLAINT", "", draft_case)

#These steps are done last since other words' removal will impact them
draft_case = re.sub(r"\s+", " ", draft_case) #Removes multiple whitespaces
draft_case = re.sub(r"([a-z])([A-Z])", r"\1 \2", draft_case) #Separates words that were joined together by double spaces
print(draft_case[0:6000])

Petitioners are bringing this action prevent Respondents Defendants California Department Public Health Karen Smith and Orange County Needle Exchange Program collectively Respondents from instituting needle exchange program that does not comport with the legal requirements necessary ensure the health and safety the public large Indeed the needle exchange program authorized commence August poses serious threat the health and safety the citizens Orange County This action also challenges the Orange County Needle Exchange Program violation the California Medical Waste Management Act medical waste generator Finally this action challenges the violation the California OFFICE THE UNTIKN GLNSEL Environmental Quality Act CEQA Respondents California Department Public Health and Karen Smith hereinafter State Respondents because they have entirely failed performthe required environmental review needed for the collection and disposal used needles wellthe violation the California Medical Waste Manage

In [211]:
#Word tokenization
draft_case = word_tokenize(draft_case)

#Depluralize all nouns
import pattern
from pattern.en import singularize

#Find words that end with 's' that need to be handled differently than plurals
exceptions = pd.read_csv('./preprocessing/singularized_exceptions.csv')
exceptions_list = exceptions.to_dict('records')
exceptions_list = exceptions_list[0]
print(exceptions_list)

#Set a function to process these words differently
def singularize_esp(word):
    if word in exceptions_list:
        return exceptions_list.get(word)
    return singularize(word)
    
print(singularize_esp("California"))

draft_case = [singularize_esp(word) for word in draft_case]

print(draft_case[0:2000])


{'California': 'California', 'Costa': 'Costa', 'serious': 'serious', 'fictitious': 'fictitious', 'numerous': 'numerous', 'intravenous': 'intravenous', 'process': 'process', 'this': 'this', 'has': 'has', 'virus': 'virus', 'various': 'various', 'basis': 'basis', 'business': 'business', 'thus': 'thus', 'debris': 'debris', 'excess': 'excess', 'was': 'was'}
California
['Petitioner', 'are', 'bringing', 'this', 'action', 'prevent', 'Respondent', 'Defendant', 'California', 'Department', 'Public', 'Health', 'Karen', 'Smith', 'and', 'Orange', 'County', 'Needle', 'Exchange', 'Program', 'collectively', 'Respondent', 'from', 'instituting', 'needle', 'exchange', 'program', 'that', 'do', 'not', 'comport', 'with', 'the', 'legal', 'requirement', 'necessary', 'ensure', 'the', 'health', 'and', 'safety', 'the', 'public', 'large', 'Indeed', 'the', 'needle', 'exchange', 'program', 'authorized', 'commence', 'August', 'pose', 'serious', 'threat', 'the', 'health', 'and', 'safety', 'the', 'citizen', 'Orange', '

In [212]:
#Remove all non-real words
from nltk.corpus import words
from nltk.corpus import wordnet

#Define words as anything distinct in these two NLTK sets
real_word_set = set(words.words() + list(wordnet.words()))

#Want to keep certain words capitalized
proper_nouns = pd.read_csv('./preprocessing/proper_nouns.csv', header = None)
proper_nouns = proper_nouns.values.tolist()
proper_nouns = proper_nouns[0]
print(proper_nouns)

def recapitalize(word):
    if word in proper_nouns:
        return word
    if word.lower() in real_word_set:
        return word.lower()   
    else:
        return word
        
draft_case = [recapitalize(word) for word in draft_case]
        
draft_case = [word for word in draft_case if (word in real_word_set or word in proper_nouns)]

print(draft_case[0:400])

['Costa', 'OCNEP', 'CEQA', 'CDPH', 'California', 'IUD', 'Los', 'Angeles', 'San', 'Francisco', 'Bernardino', 'Diego', 'Jose', 'Fresno', 'Sacramento', 'Oakland', 'Ana', 'Bakersfield', 'Santa', 'Anaheim', 'Riverside', 'Stokckton', 'Fremont', 'Irvine', 'Modesto', 'Clara', 'Clarita', 'Rosa', 'Corona', 'Glendale', 'Moreno', 'Escondido', 'Torrance', 'Pomona', 'Pasadena', 'Simi', 'Vallejo', 'Berkeley', 'Compton', 'Viejo', 'Barbara', 'Leandro', 'Monica', 'Huntington', 'Newport', 'Manhattan', 'Oxnard', 'Vista', 'Chula', 'Mesa']
['petitioner', 'are', 'bringing', 'this', 'action', 'prevent', 'respondent', 'defendant', 'California', 'department', 'public', 'health', 'karen', 'smith', 'and', 'orange', 'county', 'needle', 'exchange', 'program', 'collectively', 'respondent', 'from', 'needle', 'exchange', 'program', 'that', 'do', 'not', 'comport', 'with', 'the', 'legal', 'requirement', 'necessary', 'ensure', 'the', 'health', 'and', 'safety', 'the', 'public', 'large', 'indeed', 'the', 'needle', 'exchang

In [196]:
#Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

#Tag by part of speech
from nltk import pos_tag
tagged_word_content = pos_tag(word_content_filtered)


#Need to lemmatize using part of speech to ensure accuracy

#Need to source this code: https://stackoverflow.com/questions/32957895/wordnetlemmatizer-not-returning-the-right-lemma-unless-pos-is-explicit-python
def pos_lemmatize(word):
    word = [word]
    pos_tag_word = pos_tag(word)
    for word, tag in pos_tag_word:
        lemma_tag = tag[0].lower()
        if lemma_tag in ['a', 'r', 'n', 'v']:
            lemma_tag = lemma_tag 
        else:
            lemma_tag = None
        if not lemma_tag:
             return word
        else:
            return lemmatizer.lemmatize(word, lemma_tag)

lemmatized_output = [pos_lemmatize(word) for word in word_content_filtered]
print(type(lemmatized_output))

print(lemmatized_output[0:200])

<class 'list'>
['petitioner', 'be', 'bring', 'this', 'action', 'prevent', 'respondent', 'defendant', 'California', 'department', 'public', 'health', 'karen', 'smith', 'and', 'orange', 'county', 'needle', 'exchange', 'program', 'collectively', 'respondent', 'from', 'needle', 'exchange', 'program', 'that', 'do', 'not', 'comport', 'with', 'the', 'legal', 'requirement', 'necessary', 'ensure', 'the', 'health', 'and', 'safety', 'the', 'public', 'large', 'indeed', 'the', 'needle', 'exchange', 'program', 'authorize', 'commence', 'august', 'pose', 'serious', 'threat', 'the', 'health', 'and', 'safety', 'the', 'citizen', 'orange', 'county', 'action', 'also', 'challenge', 'the', 'orange', 'county', 'needle', 'exchange', 'program', 'violation', 'the', 'California', 'medical', 'waste', 'management', 'act', 'medical', 'waste', 'generator', 'finally', 'this', 'action', 'challenge', 'the', 'violation', 'the', 'California', 'office', 'the', 'environmental', 'quality', 'act', 'CEQA', 'respondent', 'Calif

In [None]:
#Create n-grams
