In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import data
from gensim.models import word2vec
from tqdm import tqdm
tqdm.pandas()

np.random.seed(42)

In [2]:
# This is the list of all the appellate cases from the 9th circuit

df=pd.read_pickle('./circuit_w_text')

In [4]:
# This the list of all the appellate cases except the 9th circuit

df2=pd.read_pickle('./circuit_dc_w_text')

In [5]:
df.shape

(264, 40)

In [6]:
df2.shape

(208, 40)

In [7]:
# Drop out entries that are blank or NAN in the text field.

df=df.dropna(subset=['text'])

df=df.loc[df['text']!='']

df2=df2.dropna(subset=['text'])

df2=df2.loc[df2['text']!='']

In [8]:
df.shape

(247, 40)

In [9]:
df2.shape

(205, 40)

In [10]:
# Adapted from Matt Brems' lecture notes

def review_to_wordlist(sentence):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #    
    
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    text_processed=tokenizer.tokenize(sentence)
    
    legal_words=set(['sec', 'fed', 'reg', 'act', 'cir', 'cert', 'see', 'app', 'soc', 'stat'])
    stops=set(stopwords.words('english'))
    
    # removing any stopwords
    text_processed = [word for word in text_processed  if (word.lower() not in stops and 
                     word.lower() not in legal_words)]
          
    # make words lower-cased, unless it is an acronym
    text_processed = [word.lower() if (word.upper() != word or len(word)>4) else word for word in text_processed ]
    
    # return a list of words
    return (text_processed)
    

In [11]:
# Adapted from Matt Brems' lecture notes.

# Load the punkt tokenizer
tokenizer = data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( text, tokenizer):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the document into sentences
    raw_sentences = tokenizer.tokenize(text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [12]:
#Break each case down into a series of sentances, each of which is a list of words
# Assemble a list of all such sentences in the entire corpus


train_sentences = []  # Initialize an empty list of sentences
counter=0

print("Parsing sentences from training set")
for case_text in df['text']:
    print (counter)
    train_sentences += review_to_sentences(case_text, tokenizer)
    counter += 1

Parsing sentences from training set
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246


In [13]:
len(train_sentences)

109863

In [14]:
# Build a Word2Vec Model from the texts of all the cases


# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 40         # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

print("Training model...")
model = word2vec.Word2Vec(train_sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "100features_40minwords_40context"
model.save(model_name)

2018-07-10 16:16:55,528 : INFO : collecting all words and their counts
2018-07-10 16:16:55,529 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-07-10 16:16:55,570 : INFO : PROGRESS: at sentence #10000, processed 98026 words, keeping 7796 word types
2018-07-10 16:16:55,600 : INFO : PROGRESS: at sentence #20000, processed 194672 words, keeping 11582 word types
2018-07-10 16:16:55,633 : INFO : PROGRESS: at sentence #30000, processed 293803 words, keeping 13854 word types
2018-07-10 16:16:55,667 : INFO : PROGRESS: at sentence #40000, processed 395471 words, keeping 16096 word types
2018-07-10 16:16:55,701 : INFO : PROGRESS: at sentence #50000, processed 497056 words, keeping 17779 word types
2018-07-10 16:16:55,732 : INFO : PROGRESS: at sentence #60000, processed 600534 words, keeping 18974 word types


Training model...


2018-07-10 16:16:55,767 : INFO : PROGRESS: at sentence #70000, processed 699081 words, keeping 19963 word types
2018-07-10 16:16:55,792 : INFO : PROGRESS: at sentence #80000, processed 773462 words, keeping 20600 word types
2018-07-10 16:16:55,836 : INFO : PROGRESS: at sentence #90000, processed 879668 words, keeping 22230 word types
2018-07-10 16:16:55,868 : INFO : PROGRESS: at sentence #100000, processed 980906 words, keeping 23205 word types
2018-07-10 16:16:55,908 : INFO : collected 24830 word types from a corpus of 1080367 raw words and 109863 sentences
2018-07-10 16:16:55,910 : INFO : Loading a fresh vocabulary
2018-07-10 16:16:56,034 : INFO : min_count=40 retains 3619 unique words (14% of original 24830, drops 21211)
2018-07-10 16:16:56,035 : INFO : min_count=40 leaves 948364 word corpus (87% of original 1080367, drops 132003)
2018-07-10 16:16:56,050 : INFO : deleting the raw counts dictionary of 24830 items
2018-07-10 16:16:56,051 : INFO : sample=0.001 downsamples 36 most-commo

In [15]:
# Process each text field into a list of tokens

df['text2']=df['text'].progress_apply(review_to_wordlist)

100%|██████████| 247/247 [00:02<00:00, 112.28it/s]


In [16]:
# Process each text field into a list of tokens

df2['text2']=df2['text'].progress_apply(review_to_wordlist)

100%|██████████| 205/205 [00:02<00:00, 102.49it/s]


In [17]:
df.to_pickle('./processed_text')

In [18]:
df2.to_pickle('./dc_processed_text')

In [19]:
model.wv.vocab

{'envtl': <gensim.models.keyedvectors.Vocab at 0x1a2bba44e0>,
 'rep': <gensim.models.keyedvectors.Vocab at 0x1a2bba4f28>,
 'friends': <gensim.models.keyedvectors.Vocab at 0x1a2bba4e80>,
 'endangered': <gensim.models.keyedvectors.Vocab at 0x1a2bba42e8>,
 'species': <gensim.models.keyedvectors.Vocab at 0x1a2bba4ef0>,
 'INC': <gensim.models.keyedvectors.Vocab at 0x1a2bba4400>,
 'plaintiff': <gensim.models.keyedvectors.Vocab at 0x1a2bba4b00>,
 'appellant': <gensim.models.keyedvectors.Vocab at 0x1a2bba4eb8>,
 'robert': <gensim.models.keyedvectors.Vocab at 0x1a2bba4908>,
 'director': <gensim.models.keyedvectors.Vocab at 0x1a2bba49e8>,
 'united': <gensim.models.keyedvectors.Vocab at 0x1a2bba4588>,
 'states': <gensim.models.keyedvectors.Vocab at 0x1a2bba4898>,
 'fish': <gensim.models.keyedvectors.Vocab at 0x1a2bba45f8>,
 'county': <gensim.models.keyedvectors.Vocab at 0x1a2bba4e10>,
 'san': <gensim.models.keyedvectors.Vocab at 0x1a2bba4550>,
 'city': <gensim.models.keyedvectors.Vocab at 0x1a2bb

In [24]:
model.most_similar('salmon')

  """Entry point for launching an IPython kernel.


[('chinook', 0.8644458055496216),
 ('coho', 0.8242893218994141),
 ('steelhead', 0.8165823817253113),
 ('soncc', 0.7848293781280518),
 ('naturally', 0.7489623427391052),
 ('hatchery', 0.7464495301246643),
 ('spawning', 0.7408063411712646),
 ('snake', 0.7348811030387878),
 ('klamath', 0.6949190497398376),
 ('salmonid', 0.6867708563804626)]

In [21]:
model.most_similar('plaintiff')

  """Entry point for launching an IPython kernel.


[('defendant', 0.7601103186607361),
 ('redressability', 0.7488522529602051),
 ('establish', 0.7452912330627441),
 ('asserting', 0.7404433488845825),
 ('injury', 0.7400792241096497),
 ('concrete', 0.7307429909706116),
 ('redressed', 0.7150592803955078),
 ('redress', 0.7069644927978516),
 ('lujan', 0.7050991654396057),
 ('behalf', 0.7011703848838806)]

In [22]:
model.most_similar('violation')

  """Entry point for launching an IPython kernel.


[('alleged', 0.801295816898346),
 ('violations', 0.7797095775604248),
 ('violating', 0.7210453152656555),
 ('allege', 0.7047728896141052),
 ('alleges', 0.6512392163276672),
 ('violate', 0.6335650682449341),
 ('seek', 0.6252464056015015),
 ('redress', 0.6206746697425842),
 ('suits', 0.6201921701431274),
 ('knowing', 0.6188560128211975)]

In [23]:
model.most_similar('kill')

  """Entry point for launching an IPython kernel.


[('capture', 0.9585036635398865),
 ('trap', 0.9422041773796082),
 ('harass', 0.9336722493171692),
 ('hunt', 0.9275435209274292),
 ('wound', 0.9246350526809692),
 ('shoot', 0.904694139957428),
 ('collect', 0.8831580877304077),
 ('porpoises', 0.8724063634872437),
 ('migratory', 0.8606283664703369),
 ('harassment', 0.843824565410614)]