In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import data
from gensim.models import word2vec
from tqdm import tqdm
tqdm.pandas()

np.random.seed(42)

In [2]:
# This is the list of all the appellate cases from the 9th circuit

df=pd.read_pickle('./circuit_w_text')

In [3]:
df.loc[200]

absolute_url                   /opinion/3066157/san-luis-delta-mendota-water-...
attorney                                                                        
author_id                                                                   None
caseName                       San Luis & Delta-Mendota Water v. Natural Reso...
caseNameShort                                                                   
citation                                                                    None
citeCount                                                                      0
cites                          [78617, 104034, 104812, 106566, 108283, 108752...
cluster_id                                                               3066157
court                                     Court of Appeals for the Ninth Circuit
court_citation_string                                                   9th Cir.
court_exact                                                                  ca9
court_id                    

In [4]:
# This the list of all the appellate cases except the 9th circuit

df2=pd.read_pickle('./circuit_dc_w_text')

In [5]:
df.shape

(264, 40)

In [6]:
df2.shape

(208, 40)

In [7]:
# Drop out entries that are blank or NAN in the text field.

df=df.dropna(subset=['text'])

df=df.loc[df['text']!='']

df2=df2.dropna(subset=['text'])

df2=df2.loc[df2['text']!='']

In [8]:
df.shape

(247, 40)

In [9]:
df2.shape

(205, 40)

In [10]:
# Adapted from Matt Brems' lecture notes

def review_to_wordlist(sentence):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.
    #    
    
    # tokenizing and removing punctuation
    tokenizer = RegexpTokenizer(r'[a-zA-Z]{3,}')
    text_processed=tokenizer.tokenize(sentence)
    
    legal_words=set(['sec', 'fed', 'reg', 'act', 'cir', 'cert', 'see', 'app', 'soc', 'stat'])
    stops=set(stopwords.words('english'))
    
    # removing any stopwords
    text_processed = [word for word in text_processed  if (word.lower() not in stops and 
                     word.lower() not in legal_words)]
          
    # make words lower-cased, unless it is an acronym
    text_processed = [word.lower() if (word.upper() != word or len(word)>4) else word for word in text_processed ]
    
    # return a list of words
    return (text_processed)
    

In [11]:
# Adapted from Matt Brems' lecture notes.

# Load the punkt tokenizer
tokenizer = data.load('tokenizers/punkt/english.pickle')

# Define a function to split a review into parsed sentences
def review_to_sentences( text, tokenizer):
    # Function to split a review into parsed sentences. Returns a 
    # list of sentences, where each sentence is a list of words
    #
    # 1. Use the NLTK tokenizer to split the document into sentences
    raw_sentences = tokenizer.tokenize(text.strip())
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [None]:
#Break each case down into a series of sentances, each of which is a list of words
# Assemble a list of all such sentences in the entire corpus


train_sentences = []  # Initialize an empty list of sentences
counter=0

print("Parsing sentences from training set")
for case_text in df['text']:
    print (counter)
    train_sentences += review_to_sentences(case_text, tokenizer)
    counter += 1

Parsing sentences from training set
0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149


In [None]:
len(train_sentences)

In [None]:
# Build a Word2Vec Model from the texts of all the cases


# Import the built-in logging module and configure it so that Word2Vec 
# creates nice output messages
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
    level=logging.INFO)

# Set values for various parameters
num_features = 100    # Word vector dimensionality                      
min_word_count = 40   # Minimum word count                        
num_workers = 4       # Number of threads to run in parallel
context = 40          # Context window size                                                                                    
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)

print("Training model...")
model = word2vec.Word2Vec(train_sentences, workers=num_workers, \
            size=num_features, min_count = min_word_count, \
            window = context, sample = downsampling)

# If you don't plan to train the model any further, calling 
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and 
# save the model for later use. You can load it later using Word2Vec.load()
model_name = "100features_40minwords_40context"
model.save(model_name)

In [None]:
# Process each text field into a list of tokens

df['text2']=df['text'].progress_apply(review_to_wordlist)

In [None]:
# Process each text field into a list of tokens

df2['text2']=df2['text'].progress_apply(review_to_wordlist)

In [None]:
df.to_pickle('./processed_text')

In [None]:
df2.to_pickle('./dc_processed_text')

In [None]:
model.wv.vocab

In [None]:
model.most_similar('forest')