In [1]:
import re
from nltk import *
from bs4 import BeautifulSoup
import string
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

# borrowed heavily from: https://nlpforhackers.io/topic-modeling/

stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

def make_soup(url):
    r = requests.get(url).text
    soup = BeautifulSoup(r, 'html.parser')
    return soup


soup = make_soup('https://www.law.cornell.edu/supremecourt/text/60/393')


def get_text():
    text = soup.find(class_='bodytext')
    text = [t.text for t in text]
    return text


def clean(text):
    text = [txt.encode("ascii", "ignore").decode("ascii") for txt in text]
    # remove unicode characters
    # lower each string
    text = [txt.lower() for txt in text]
    # remove new line escape character
    text = [txt.replace('\n', '') for txt in text]
    # remove punctuation
    text = [re.sub('[%s]' % re.escape(string.punctuation), ' ', txt)
            for txt in text]
    # remove digits
    text = [re.sub('[%s]' % re.escape(string.digits), ' ', txt)
            for txt in text]

    # remove empty strings
    text = [txt.strip() for txt in text if txt]
    return text




In [2]:
tanney_opinion = clean(get_text()[13:-4])[:490]
wayne_concurrence = clean(get_text()[13:-4])[203:216]
nelson_concurrence = clean(get_text()[13:-4])[217:270]
grier_concurrence = clean(get_text()[13:-4])[271:273]
daniel_concurrence = clean(get_text()[13:-4])[274:356]
campbell_concurrence = clean(get_text()[13:-4])[357:427]
catron_concurrence = clean(get_text()[13:-4])[428:]

mclean_dissent = clean(get_text()[13:-4])[492:-1]

majority = [tanney_opinion, wayne_concurrence, nelson_concurrence, grier_concurrence, daniel_concurrence, campbell_concurrence, catron_concurrence]

In [4]:
# remove the stop words from the text
stopped_tanney = [word for word in word_tokenize((' '.join(tanney_opinion))) if word not in stop_words]
tanney_tagged = pos_tag(stopped_tanney)

In [11]:
Counter(tanney_tagged).most_common(50)

[(('states', 'NNS'), 526),
 (('court', 'NN'), 415),
 (('state', 'NN'), 415),
 (('upon', 'IN'), 292),
 (('power', 'NN'), 280),
 (('government', 'NN'), 262),
 (('constitution', 'NN'), 258),
 (('united', 'JJ'), 231),
 (('case', 'NN'), 218),
 (('law', 'NN'), 169),
 (('congress', 'NN'), 167),
 (('rights', 'NNS'), 163),
 (('jurisdiction', 'NN'), 157),
 (('property', 'NN'), 152),
 (('territory', 'NN'), 149),
 (('laws', 'NNS'), 146),
 (('within', 'IN'), 145),
 (('would', 'MD'), 139),
 (('one', 'CD'), 134),
 (('may', 'MD'), 126),
 (('opinion', 'NN'), 104),
 (('could', 'MD'), 100),
 (('question', 'NN'), 98),
 (('free', 'JJ'), 98),
 (('citizens', 'NNS'), 96),
 (('slave', 'VBP'), 96),
 (('authority', 'NN'), 95),
 (('new', 'JJ'), 93),
 (('powers', 'NNS'), 89),
 (('plaintiff', 'NN'), 88),
 (('judgment', 'NN'), 86),
 (('time', 'NN'), 84),
 (('shall', 'MD'), 81),
 (('people', 'NNS'), 79),
 (('must', 'MD'), 78),
 (('federal', 'JJ'), 77),
 (('said', 'VBD'), 75),
 (('persons', 'NNS'), 74),
 (('union', 'N

[('case', 'NN'),
 ('twice', 'RB'),
 ('argued', 'VBD'),
 ('argument', 'JJ'),
 ('last', 'JJ'),
 ('term', 'NN'),
 ('difference', 'NN'),
 ('opinion', 'NN'),
 ('found', 'VBN'),
 ('exist', 'VBP'),
 ('among', 'IN'),
 ('member', 'NN'),
 ('court', 'NN'),
 ('question', 'NN'),
 ('controversy', 'NN'),
 ('highest', 'JJS'),
 ('importance', 'NN'),
 ('court', 'NN'),
 ('time', 'NN'),
 ('much', 'JJ'),
 ('pressed', 'VBN'),
 ('ordinary', 'JJ'),
 ('business', 'NN'),
 ('term', 'NN'),
 ('deemed', 'VBD'),
 ('advisable', 'JJ'),
 ('continue', 'NN'),
 ('case', 'NN'),
 ('direct', 'JJ'),
 ('argument', 'NN'),
 ('point', 'NN'),
 ('order', 'NN'),
 ('might', 'MD'),
 ('opportunity', 'NN'),
 ('giving', 'VBG'),
 ('whole', 'JJ'),
 ('subject', 'JJ'),
 ('deliberate', 'NN'),
 ('consideration', 'NN'),
 ('accordingly', 'RB'),
 ('argued', 'VBD'),
 ('counsel', 'NN'),
 ('considered', 'VBN'),
 ('court', 'NN'),
 ('proceed', 'VBD'),
 ('deliver', 'JJ'),
 ('opinion', 'NN'),
 ('two', 'CD'),
 ('leading', 'VBG'),
 ('question', 'NN'),
 ('

In [14]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(stopped_tanney)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

(27426, 10)
(27426, 10)
(27426, 10)




In [22]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('court', 364.8667231372388), ('case', 203.5189004226234), ('words', 37.436586469277806), ('respect', 36.813606169687105), ('record', 32.31213699151465), ('provisions', 29.07466352287151), ('force', 28.98676118302845), ('family', 28.882694691984565), ('principle', 28.72006296227465), ('adopted', 27.127999400837464)]
Topic 1:
[('power', 294.67783851176733), ('laws', 138.87715337408216), ('question', 100.99543971408235), ('judgment', 86.85358413240564), ('slaves', 82.09730648552372), ('privileges', 44.41714289250514), ('claim', 40.10161352061264), ('treaty', 35.462642324094794), ('dispose', 30.902187415620062), ('relation', 27.483051251869338)]
Topic 2:
[('congress', 254.5406991347104), ('union', 88.48426966428353), ('subject', 87.33555102260404), ('master', 70.48651225602215), ('circuit', 64.10484218830008), ('general', 57.94757218201733), ('common', 49.52748403143306), ('exercise', 48.50259787420695), ('held', 48.11441737684822), ('virginia', 45.36450202305836)]
To



In [21]:
print("SVD Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

SVD Model:
Topic 0:
[('states', 0.9999999991115832), ('rights', 1.4536613794563821e-05), ('powers', 1.3280691941859458e-05), ('authority', 1.0929818164890444e-05), ('property', 5.246854917116058e-06), ('slave', 3.713019159675643e-06), ('subject', 3.4376051155525932e-06), ('opinion', 2.7934424681398334e-06), ('federal', 2.649987781202775e-06), ('judgment', 2.1581676842271556e-06)]
Topic 1:
[('state', 0.8007096821891441), ('court', 0.5990525239818364), ('authority', 7.37282124774566e-05), ('citizen', 6.386041323772978e-05), ('judgment', 5.844415299166624e-05), ('citizens', 5.698681304875949e-05), ('question', 4.819130484606348e-05), ('act', 4.3853372744220325e-05), ('opinion', 2.7047779161829657e-05), ('courts', 2.6660019129343745e-05)]
Topic 2:
[('court', 0.8007089545279722), ('laws', 0.0005303597621958765), ('question', 0.0005106015590287017), ('new', 0.00034724162205470064), ('law', 0.0003184970937287719), ('citizens', 0.00014605913279527375), ('time', 7.986595287647764e-05), ('shall'

In [23]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

NMF Model:
Topic 0:
[('states', 4.86676880122057), ('laws', 1.7437160898734626e-12), ('citizen', 7.269604672135916e-13), ('citizens', 2.288626264846022e-13), ('opinion', 1.7695721219062394e-13), ('question', 1.0718946643193454e-13), ('authority', 1.0194858449074589e-13), ('right', 8.999910294765196e-14), ('free', 5.994761370187768e-14), ('missouri', 5.208179736139971e-14)]
Topic 1:
[('court', 4.866249324592335), ('law', 7.909388960378394e-10), ('laws', 4.895438571997297e-10), ('jurisdiction', 2.7557524123676435e-10), ('missouri', 2.5755240047997015e-11), ('authority', 1.821376106397972e-11), ('judgment', 1.6091629234753846e-12), ('time', 4.51119793501672e-13), ('act', 3.641537989835481e-13), ('persons', 3.3392629570934414e-13)]
Topic 2:
[('state', 5.041641267903728), ('rights', 4.0084126909375005e-09), ('property', 1.0110671373098932e-09), ('slave', 5.898583384140941e-10), ('question', 2.311941394630069e-10), ('plaintiff', 1.7850062588943722e-10), ('opinion', 1.0169149746623556e-10), (

In [23]:
cvn = CountVectorizer(stop_words='english')
tanney_data = cvn.fit_transform(tanney_opinion)

In [24]:
pd.DataFrame(tanney_data.toarray(), columns=cvn.get_feature_names())



Unnamed: 0,ab,abandon,abandoned,abandonment,abasement,abate,abatement,abdicate,abercrombie,abiding,...,written,wrong,wrote,yards,year,years,yielded,york,young,zealously
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
