In [3]:
import re
from nltk import *
from bs4 import BeautifulSoup
import string
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

stop_words = set(stopwords.words('english'))
normalizer = WordNetLemmatizer()


def make_soup(url):
    r = requests.get(url).text
    soup = BeautifulSoup(r, 'html.parser')
    return soup


soup = make_soup('https://www.law.cornell.edu/supremecourt/text/60/393')


def get_text():
    text = soup.find(class_='bodytext')
    text = [t.text for t in text]
    return text


def clean(text):
    text = [txt.encode("ascii", "ignore").decode("ascii") for txt in text]
    # remove unicode characters
    # lower each string
    text = [txt.lower() for txt in text]
    # remove new line escape character
    text = [txt.replace('\n', '') for txt in text]
    # remove punctuation
    text = [re.sub('[%s]' % re.escape(string.punctuation), ' ', txt)
            for txt in text]
    # remove digits
    text = [re.sub('[%s]' % re.escape(string.digits), ' ', txt)
            for txt in text]

    # remove empty strings
    text = [txt.strip() for txt in text if txt]
    return text




In [4]:
tanney_opinion = clean(get_text()[13:-4])[:490]
wayne_concurrence = clean(get_text()[13:-4])[203:216]
nelson_concurrence = clean(get_text()[13:-4])[217:270]
grier_concurrence = clean(get_text()[13:-4])[271:273]
daniel_concurrence = clean(get_text()[13:-4])[274:356]
campbell_concurrence = clean(get_text()[13:-4])[357:427]
catron_concurrence = clean(get_text()[13:-4])[428:]
majority = [tanney_opinion, wayne_concurrence, nelson_concurrence, grier_concurrence, daniel_concurrence, campbell_concurrence, catron_concurrence]

In [5]:
counter = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
matrix = counter.fit_transform(tanney_opinion)
features = counter.get_feature_names_out()
freq_tanney_df = pd.DataFrame(matrix.todense(), columns=features)

In [6]:
# remove the stop words from the text
stopped_tanney = [word for word in word_tokenize((' '.join(tanney_opinion))) if word not in stop_words]

In [18]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(stopped_tanney)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
        for i in topic.argsort()[:-top_n - 1:-1]])



In [19]:
print("LDA Model:")
print_topics(nmf_model, nmf_Z)
print("=" * 20)

LDA Model:
Topic 0:


AttributeError: 'numpy.ndarray' object has no attribute 'get_feature_names'

In [11]:
print("SVD Model:")
print_topics(nmf_model, nmf_Z)
print("=" * 20)

SVD Model:
Topic 0:


AttributeError: 'numpy.ndarray' object has no attribute 'get_feature_names'

In [21]:
[word_tokenize(paragraph) for paragraph in tanney_opinion]

[['this',
  'case',
  'has',
  'been',
  'twice',
  'argued',
  'after',
  'the',
  'argument',
  'at',
  'the',
  'last',
  'term',
  'differences',
  'of',
  'opinion',
  'were',
  'found',
  'to',
  'exist',
  'among',
  'the',
  'members',
  'of',
  'the',
  'court',
  'and',
  'as',
  'the',
  'questions',
  'in',
  'controversy',
  'are',
  'of',
  'the',
  'highest',
  'importance',
  'and',
  'the',
  'court',
  'was',
  'at',
  'that',
  'time',
  'much',
  'pressed',
  'by',
  'the',
  'ordinary',
  'business',
  'of',
  'the',
  'term',
  'it',
  'was',
  'deemed',
  'advisable',
  'to',
  'continue',
  'the',
  'case',
  'and',
  'direct',
  'a',
  're',
  'argument',
  'on',
  'some',
  'of',
  'the',
  'points',
  'in',
  'order',
  'that',
  'we',
  'might',
  'have',
  'an',
  'opportunity',
  'of',
  'giving',
  'to',
  'the',
  'whole',
  'subject',
  'a',
  'more',
  'deliberate',
  'consideration',
  'it',
  'has',
  'accordingly',
  'been',
  'again',
  'argued',
 

In [22]:
mclean_dissent = clean(get_text()[13:-4])[492:-1]

In [23]:
cvn = CountVectorizer(stop_words='english')
tanney_data = cvn.fit_transform(tanney_opinion)

In [24]:
pd.DataFrame(tanney_data.toarray(), columns=cvn.get_feature_names())



Unnamed: 0,ab,abandon,abandoned,abandonment,abasement,abate,abatement,abdicate,abercrombie,abiding,...,written,wrong,wrote,yards,year,years,yielded,york,young,zealously
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
486,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
487,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
