In [5]:
"""
In this notebook we examine several datasets to look for a basic set of legal stopwords.
"""
import pandas as pd
import os

import time

data_header = '/Users/jhamer90811/Documents/Insight/legal_topic_modeling/data_uncompressed/baseline_12k'


datasets = ['random_cases2','cases_after1950_12k', 'cases_IL_12k', 'cases_IL_after1950_12k']


In [2]:
# Helper funcitons
def parse_list_col(df, col_to_parse):
    df.loc[:, col_to_parse] = df[col_to_parse].apply(lambda x: x.strip('[]').split(','))
    df.loc[:, col_to_parse] = df[col_to_parse].apply(lambda x: [t.strip().strip("'") for t in x])

# ************** THIS FUNCTION COURTESY OF EDWIN ZHANG **********
# Yields top n words by average frequency per document.
def sort_common_words(text, ngram = 1, n = 10,  tfidf = False, stopwords = None, binary = False,
                     rare = False):
    '''
    Return incidence of the n words that appear in the highest proportion of text samples
    
    Input:
        text (pd.Series): text to be analyzed
        n-gram(int): n-gram to analyze (default: unigram)
        n (int): number of words to return
        tfidf (boolean): if True, use tf-idf vector instead of binary count
        stopwords (str or None): common/basic words to remove in the vectorization process (keep None if using to find stopwords) 
        binary (boolean): if True, indicate presence or absence of word in doc
        rare (boolean): if True, return the rarest words instead of the most common
    Output:
        top_n_words: proportion of text samples that contain each of the top n words
    '''
    from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
    from scipy.sparse import csr_matrix
    
    # Transform data into vectorized word binary counts or tf-idf counts
    if tfidf == True: 
        vect =  TfidfVectorizer(lowercase=True, analyzer='word', ngram_range=(ngram, ngram), stop_words = stopwords,
                               binary=binary)
    else:
        vect = CountVectorizer(lowercase=True, analyzer='word', ngram_range=(ngram, ngram), stop_words = stopwords,
                              binary=binary)
    word_counts = vect.fit_transform(text)
    vocab = vect.get_feature_names()
    num_entries = word_counts.shape[0]
    
    # Convert sparse matrix to a 1-column pandas DataFrame then to a pandas Series
    word_counts = word_counts.sum(axis = 0)
    word_counts = pd.DataFrame(word_counts)
    word_counts.columns = vocab
    word_counts = word_counts.transpose()
    word_counts = word_counts.iloc[:, 0]
    
    # Sort by word's prevalence and convert to proportion of text entires that includes the word
    if rare:
        top_n_words = word_counts.nsmallest(n) / num_entries
    else:
        top_n_words = word_counts.nlargest(n) / num_entries
    
    return top_n_words

def get_common_words(data_header, datasets, opinion_col, n=50, binary=True, rare=False):
    
    common_words = pd.DataFrame()

    for dataset in datasets:
        data = pd.read_csv(os.path.join(data_header, dataset+'_processed.csv'))
        parse_list_col(data, opinion_col)
        data['opinion_string'] = data['opinion'].apply(lambda x: ' '.join(x))
        data_cw = pd.DataFrame(sort_common_words(data['opinion_string'], n=n, binary=binary,
                                                rare=rare)).transpose()
        data_cw.index = [dataset]
        common_words = common_words.append(data_cw)

    return common_words.transpose()


In [8]:
# Common words: list of top 50 words from each corpus according to the proportion of documents
# in which they appear.
common_words = get_common_words(data_header, datasets, 'opinion')
# Frequent words: list of top 50 words from each corpus according to the average frequency with
# which they appear in each document.
frequent_words = get_common_words(data_header, datasets, 'opinion', binary=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [27]:
print(common_words.to_string())


           random_cases2  cases_after1950_12k  cases_IL_12k  cases_IL_after1950_12k
act             0.526333                  NaN      0.550500                     NaN
action          0.553083             0.557000      0.555917                0.629500
affirm          0.587583             0.613417      0.616833                0.667833
agree                NaN             0.567500           NaN                     NaN
allege               NaN                  NaN      0.492167                     NaN
appeal          0.705000             0.806500      0.624333                0.826667
appear          0.487333                  NaN      0.548667                     NaN
appellant       0.482167                  NaN           NaN                     NaN
argue                NaN             0.579750           NaN                0.646583
base                 NaN             0.558083           NaN                0.610917
because         0.527833                  NaN      0.615417                0

In [42]:
print(frequent_words.to_string())

           random_cases2  cases_after1950_12k  cases_IL_12k  cases_IL_after1950_12k
2d              2.022167             2.686417           NaN                     NaN
act             2.399917             1.940167      2.507250                     NaN
action          2.437167             2.766333      2.471083                3.750167
allege               NaN                  NaN           NaN                2.387500
appeal          2.921083             3.836000      2.265167                3.906000
appellant       3.816250             2.957750      2.732750                     NaN
appellee        2.111333                  NaN      2.072667                     NaN
argue                NaN             2.230417           NaN                2.643500
because              NaN                  NaN      1.948167                2.537333
case            7.126917             7.553333      6.880750                8.750000
child                NaN             2.313333           NaN                 

In [9]:
# Which words are common across all four corpuses?
common_words_all = common_words.dropna()
frequent_words_all = frequent_words.dropna()

common_words_all['mean'] = common_words_all.mean(axis=1)
common_words_all['std'] = common_words_all.std(axis=1)
frequent_words_all['mean'] = frequent_words_all.mean(axis=1)
frequent_words_all['std'] = frequent_words_all.std(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus

In [10]:
common_words_all = common_words_all.sort_values('mean', ascending=False)

In [11]:
frequent_words_all = frequent_words_all.sort_values('mean', ascending=False)

In [12]:
basic_legal_stop_words = set(common_words_all[common_words_all['mean']>0.7].index)|\
                              set(frequent_words_all[frequent_words_all['mean']>10].index)
print(basic_legal_stop_words)


{'fact', 'case', 'time', 'trial', 'state', 'find', 'question', 'defendant', 'court', 'hold', 'order', 'appeal', 'judgment', 'opinion'}


In [13]:
# There are also several regional reporters left in the vocabulary that we should remove.
reporters = set(['p.2d', 'p.3d', 's.e.', 's.e.2d', 's.e.3d', 'n.e.', 'n.e.2d',
                'p.', 'a.', 'a.2d', 'a.3d', 'so.','so.2d', 's.w.', 's.w.2d', 's.w.3d.',
                'f. supp.', 'f.supp.', 'f.supp.2d', 'f.supp.3d'])
basic_legal_stop_words|=reporters

In [14]:
# I will also add plaintiff as this seems to carry about as much meaning as "defendant", even if it
# appears less frequently. Same for appellee, appellant, see, and cir.

basic_legal_stop_words|={'plaintiff', 'appellee', 'appellant', 'see', 'cir'}

In [15]:
basic_legal_stop_words

{'a.',
 'a.2d',
 'a.3d',
 'appeal',
 'appellant',
 'appellee',
 'case',
 'cir',
 'court',
 'defendant',
 'f. supp.',
 'f.supp.',
 'f.supp.2d',
 'f.supp.3d',
 'fact',
 'find',
 'hold',
 'judgment',
 'n.e.',
 'n.e.2d',
 'opinion',
 'order',
 'p.',
 'p.2d',
 'p.3d',
 'plaintiff',
 'question',
 's.e.',
 's.e.2d',
 's.e.3d',
 's.w.',
 's.w.2d',
 's.w.3d.',
 'see',
 'so.',
 'so.2d',
 'state',
 'time',
 'trial'}