In [58]:
#import required packages
import nltk
import pandas as pd
import csv
import re
import numpy as np
import os
import gensim
import glob
import spacy
import random
import matplotlib.pyplot
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk import ngrams
from gensim import corpora, models
import pyLDAvis
import pyLDAvis.gensim
import pyLDAvis.gensim_models as gensimvis

In [59]:
#Generate file names using glob
_2018filenames = [I for I in glob.glob('data/2018/*.txt')]
_2019filenames = [I for I in glob.glob('data/2019/*.txt')]
_2020filenames = [I for I in glob.glob('data/2020/*.txt')]
_2021filenames = [I for I in glob.glob('data/2021/*.txt')]
_2022filenames = [I for I in glob.glob('data/2022/*.txt')]

#TO REMOVE: TEST CASE FOR BIGRAMS
# test_data = open(r"./data/TEST1.txt", "r").read()
# list_test = list()
# list_test.append(test_data)

#Save them as one massive list
list_cases = list()


for file in _2018filenames:
    case = open(r"" + file + "", "r")
    case = case.read()
    list_cases.append(case)

for file in _2019filenames:
    case = open(r"" + file + "", "r")
    case = case.read()
    list_cases.append(case)

for file in _2020filenames:
    case = open(r"" + file + "", "r")
    case = case.read()
    list_cases.append(case)

for file in _2021filenames:
    case = open(r"" + file + "", "r")
    case = case.read()
    list_cases.append(case)

""" for file in _2022filenames:
    case = open(r"" + file + "", "r")
    case = case.read()
    list_cases.append(case) """



' for file in _2022filenames:\n    case = open(r"" + file + "", "r")\n    case = case.read()\n    list_cases.append(case) '

In [60]:
#Create helper functions for pre-processing

#Default stopwords list
from nltk.corpus import stopwords
stopwords_def = stopwords.words('english')

#Keep only real words
from nltk.corpus import words
from nltk.corpus import wordnet

#Define words as anything distinct in these two NLTK sets
real_word_set = set(words.words() + list(wordnet.words()))
    
#Depluralize all nouns
import pattern
from pattern.en import singularize

#Find words that end with 's' that need to be handled differently than plurals
exceptions = pd.read_csv('./preprocessing/singularized_exceptions.csv')
exceptions_list = exceptions.to_dict('records')
exceptions_list = exceptions_list[0]

#Set a function to process these words differently
def singularize_esp(word):
    if word in exceptions_list:
        return exceptions_list.get(word)
    return singularize(word)
    
#Want to keep certain words capitalized
proper_nouns = pd.read_csv('./preprocessing/proper_nouns.csv', header = None)
proper_nouns = proper_nouns.values.tolist()
proper_nouns = proper_nouns[0]

def recapitalize(word):
    if word in proper_nouns:
        return word
    if word.lower() in real_word_set:
        return word.lower()   
    else:
        return word
    
#n-gram helper function
def replace_ngram(text, bigrams, trigrams):
    
    for gram in trigrams:
        pattern = r'\b' + r'\s+'.join(gram.split()) + r'\b'
        replacement = '_'.join(gram.split())
        text = re.sub(pattern, replacement, text)
    
    for gram in bigrams:
        pattern = r'\b' + r'\s+'.join(gram.split()) + r'\b'
        replacement = '_'.join(gram.split())
        text = re.sub(pattern, replacement, text)
    
    return text

#Lemmatize based on part of speech (pos tagging)

def pos_lemmatize(word, tag):
    if tag.startswith('J'):
        return lemmatizer.lemmatize(word, pos='a')  # Adjective
    elif tag.startswith('V'):
        return lemmatizer.lemmatize(word, pos='v')  # Verb
    elif tag.startswith('N'):
        return lemmatizer.lemmatize(word, pos='n')  # Noun
    elif tag.startswith('R'):
        return lemmatizer.lemmatize(word, pos='r')  # Adverb
    else:
        return lemmatizer.lemmatize(word)  # Default to noun

#Get rid of proper nouns
name_remove = spacy.load("en_core_web_sm")
def remove_proper_nouns(text):
    doc = name_remove(text)
    filtered_text = " ".join([token.text for token in doc if token.pos_ != "PROPN"])
    return filtered_text

#Filter to keep only nouns
def nouns(word):
    pos_word = nltk.pos_tag(word)
    filtered = [word[0] for word in pos_word if word[1] in ['NN', 'NNP', 'NNS', 'NNPS']]
    return filtered 

In [61]:
random.seed(32)
from nltk.probability import FreqDist

legal_corpus = pd.DataFrame()

for i in range(10):
    selected_samples = random.sample(list_cases, 8)
    print(type(selected_samples))
    lemmatized_documents = []

    for text in selected_samples: 
        draft_case = text
        #Use regex to do a preliminary cleaning of the content
        ### Remove punctuation and numbers
        draft_case = re.sub("[0-9]", "", draft_case) #Removes numbers
        draft_case = re.sub(r"\/", " ", draft_case) #Replaces slashes with spaces
        draft_case = re.sub(r"\'", " ", draft_case) #Replaces apostrophes with spaces
        draft_case = re.sub(r"\-", " ", draft_case) #Replaces en-dash with spaces
        draft_case = re.sub(r"\–", " ", draft_case) #Replaces em-dash with spaces
        draft_case = re.sub(r"[.,?!()•$;:@§%&~\[\]\"]", "", draft_case) #Removes extraneous punctuation
        draft_case = re.sub(r"^$n','\n", "", draft_case, re.MULTILINE) #Removes empty lines
        draft_case = re.sub(r"\b\w{1,2}\b", '', draft_case) #Removes all one and two-character words (none have meaning)
        draft_case = re.sub(r"\n","", draft_case) #Removes extraneous line breaks


        #Remove extraneous information about lawyers and case detail that doesn't impact the content of the case
        starting_phrase = "PRELIMINARY STATEMENT" #Use this as the starter for when the actual lawsuit begins
        remove_before = r'^.*?{}'.format(re.escape(starting_phrase))
        draft_case = re.sub(remove_before, "", draft_case, flags=re.DOTALL)

        #Remove "FIRST AMENDED COMPLAINT", which is at the bottom of every page
        draft_case = re.sub(r"FIRST AMENDED COMPLAINT", "", draft_case)

        #These steps are done last since other words' removal will impact them
        draft_case = re.sub(r"\s+", " ", draft_case) #Removes multiple whitespaces
        draft_case = re.sub(r"([a-z])([A-Z])", r"\1 \2", draft_case) #Separates words that were joined together by double spaces

        #Word tokenization
        draft_case = word_tokenize(draft_case)
        print(draft_case[0:100])


        #Singularize
        draft_case = [singularize_esp(word) for word in draft_case]

        #Recapitalize proper nouns only
        draft_case = [recapitalize(word) for word in draft_case]

        #Remove stopwords
        #Remove all non-real words (done twice since wordnet "real words" set includes names)
        draft_case = [word for word in draft_case if ((word not in stopwords_def) and (word in real_word_set or word in proper_nouns))]
        draft_case = [remove_proper_nouns(word) for word in draft_case]

        #After removing non-real words, implement n-grams

        ##Return draft_case into a string
        draft_case = ' '.join(draft_case)
        ##Create n-grams using pointwise mutual information

        ###Create bigrams and use PMI to score them
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        bigram_finder = nltk.collocations.BigramCollocationFinder.from_words(str.split(draft_case))
        ####Apply a frequency filter
        bigram_finder.apply_freq_filter(3)
        bigram_scores = bigram_finder.score_ngrams(bigram_measures.pmi)

        ###Do the same for trigrams
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        trigram_finder = nltk.collocations.TrigramCollocationFinder.from_words(str.split(draft_case))
        ####Frequency filter
        trigram_finder.apply_freq_filter(3)
        trigram_scores = trigram_finder.score_ngrams(trigram_measures.pmi)

        #Store the scores as dataframes
        bigram_pmi = pd.DataFrame(bigram_scores)
        bigram_pmi.columns = ['bigram', 'pmi']
        bigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
        #Save only the bigrams with PMI greater than 5 (arbitrary threshold)
        bigram_pmi = bigram_pmi[bigram_pmi.apply(lambda bigram: bigram.pmi > 5, axis = 1)][:500]

        trigram_pmi = pd.DataFrame(trigram_scores)
        trigram_pmi.columns = ['trigram', 'pmi']
        trigram_pmi.sort_values(by='pmi', axis = 0, ascending = False, inplace = True)
        #Save only the trigrams with PMI greater than 5 (arbitrary threshold)
        trigram_pmi = trigram_pmi[trigram_pmi.apply(lambda trigram: trigram.pmi > 5, axis = 1)][:500]

        #Keep only the values
        bigrams = [' '.join(x) for x in bigram_pmi.bigram.values]
        trigrams = [' '.join(x) for x in trigram_pmi.trigram.values]

        # Concatenate n-grams
        draft_case = replace_ngram(draft_case, bigrams, trigrams)

        #Lemmatization
        from nltk.stem import WordNetLemmatizer
        lemmatizer = WordNetLemmatizer()

        #Re-tokenize
        draft_case = word_tokenize(draft_case)

        #Tag by part of speech
        from nltk import pos_tag
        tagged_word_content = pos_tag(draft_case)

        #Need to lemmatize using part of speech to ensure accuracy
        # Lemmatize using part of speech
        lemmatized_output = [pos_lemmatize(word, tag) for word, tag in tagged_word_content]
        lemmatized_output = nouns(lemmatized_output)
        
        lemmatized_documents.append(lemmatized_output)

        flattened_corpus = [word for doc in lemmatized_documents for word in doc]
        # Create a frequency distribution
        freq_dist = FreqDist(flattened_corpus)
        # Get the most common words
        most_common_words = freq_dist.most_common(100)
        column_name_words = f"Sample_{i+1}_Words"
        column_name_count = f"Sample_{i+1}_Count"
        legal_corpus[column_name_words] = [word for word, _ in most_common_words]
        legal_corpus[column_name_count] = [freq for _, freq in most_common_words]

legal_corpus.to_csv('preprocessing/legal_corpus.csv')
        

<class 'list'>
['FREDRIC', 'WOOCHER', 'SBN', 'BEVERLY', 'GROSSMAN', 'PALMER', 'SBN', 'DALE', 'LARSON', 'SBN', 'STRUMWASSER', 'WOOCHER', 'LLP', 'Wilshire', 'Boulevard', 'Suite', 'Los', 'Angeles', 'California', 'Telephone', 'Facsimile', 'mail', 'bpalmerstrumwoochcom', 'Attorney', 'for', 'Petitioner', 'and', 'Plaintiff', 'Crenshaw', 'Subway', 'Coalition', 'SUPERIOR', 'COURT', 'CALIFORNIA', 'COUNTY', 'LOS', 'ANGELESCRENSHAW', 'SUBWAY', 'COALITION', 'Case', 'nonprofit', 'organization', 'Petitioner', 'Plaintiff', 'VERIFIED', 'PETITION', 'FOR', 'WRIT', 'MANDATE', 'AND', 'COMPLAINT', 'FOR', 'INJUNCTIVE', 'AND', 'DECLARATORY', 'RELIEFCITY', 'LOS', 'ANGELES', 'LOS', 'ANGELES', 'CITY', 'COUNCIL', 'and', 'DOES', 'inclusive', 'Pub', 'Res', 'Code', 'Code', 'Civ', 'Proc', 'Respondents', 'Defendants', 'CALIFORNIA', 'ENVIRONMENTAL', 'QUALITYCAPRI', 'URBAN', 'BALDWIN', 'LLC', 'limited', 'ACT', 'CEQA', 'ACTION', 'liability', 'corporation', 'CAPRI', 'UBRAN', 'CRENSHAW', 'LLC', 'limited', 'liability', 'cor

In [65]:
legal_corpus_words = legal_corpus.select_dtypes(include = ['object'])
unique_legal_corpus_words = legal_corpus_words.stack().unique()
print(unique_legal_corpus_words)

final_legal_corpus = []
print(type(final_legal_corpus))

for word in unique_legal_corpus_words:
    occurrences = legal_corpus_words.applymap(lambda x: x == word).sum().sum()
    if occurrences > 6:
        final_legal_corpus.append(word)

final_legal_corpus_df = pd.DataFrame(final_legal_corpus, columns = ['Word'])
final_legal_corpus_df.to_csv('preprocessing/final_legal_corpus.csv')


['project' 'city' 'water' 'county' 'plan' 'petitioner' 'CEQA' 'state'
 'action' 'impact' 'area' 'district' 'ranch' 'space' 'contractor'
 'respondent' 'commission' 'fire' 'article' 'noise' 'trail' 'development'
 'cost' 'council' 'code' 'emission' 'agency' 'land' 'year' 'approval'
 'petition' 'facility' 'site' 'use' 'law' 'court' 'contract' 'staff'
 'property' 'verified_petition_writ' 'amount' 'factory' 'portion' 'relief'
 'park' 'amendment' 'hotel' 'tribe' 'measure' 'permit' 'charge' 'section'
 'delivery' 'act' 'general_plan' 'seawall' 'notice' 'construction'
 'equipment' 'transportation' 'case' 'mitigation' 'party' 'mandate'
 'community' 'review' 'entitlement' 'San_Jose' 'writ_mandate' 'payment'
 'plaintiff' 'operation' 'resource' 'complaint' 'traffic' 'board'
 'interest' 'capacity' 'document' 'service' 'record' 'neighborhood'
 'attorney' 'petition_writ_mandate' 'hazard' 'thereof' 'violation'
 'allege' 'capital' 'date' 'mail' 'person' 'CEQA_guideline' 'barn'
 'comment' 'beach' 'San' 'p

  occurrences = legal_corpus_words.applymap(lambda x: x == word).sum().sum()


In [66]:
#Implement tf-idf weights

from sklearn.feature_extraction.text import TfidfVectorizer

legal_corpus_joined = [' '.join(doc) for doc in lemmatized_documents]

print(legal_corpus_joined)

# Create a TfidfVectorizer object

tfidf_vectorizer = TfidfVectorizer(tokenizer=None, preprocessor=None)

# Fit and transform the documents
tfidf_matrix = tfidf_vectorizer.fit_transform(legal_corpus_joined)

# Get the feature names (words)
feature_names = tfidf_vectorizer.get_feature_names_out()

tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)

legal_corpus_total_weights = tfidf_df.sum(axis=0)

legal_corpus_total_weights.to_csv(('preprocessing/legal_corpus_tfidf_weights.csv'))

