In [18]:
import re
from nltk import *
from bs4 import BeautifulSoup
import string
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

In [2]:
dobbs = 'https://www.law.cornell.edu/supremecourt/text/19-1392'
r = requests.get(dobbs).text
soup = BeautifulSoup(r, 'html.parser')

In [3]:
# facts
def get_text():
    text = soup.find(class_='bodytext')        
    text = [t.text for t in text]
    return text

def get_opinion():
    text = soup.find(class_='opinion')        
    text = [t.text for t in text] 
    return text

def get_dissent():
    text = soup.find(class_='dissent')        
    text = [t.text for t in text]
    return text

def clean(text):
    # remove unicode characters
    text = [txt.encode("ascii", "ignore").decode("ascii") for txt in text]
    # lowercase each string
    text = [txt.lower() for txt in text]
    # remove new line escape character
    text = [txt.replace('\n', ' ') for txt in text]
    # remove punctuation
    text = [re.sub('[%s]' % re.escape(string.punctuation), ' ', txt) for txt in text]
    # remove digits
    text = [re.sub('[%s]' % re.escape(string.digits), ' ', txt) for txt in text]
    # remove empty strings
    text = [txt.strip() for txt in text if txt]
    return text

In [31]:
dobbs_opinion = clean(sent_tokenize(get_opinion()[6]))[5:-1]
dobbs_tokenized = [word_tokenize(sent) for sent in dobbs_opinion]
dobbs_flattened = [word for sent in dobbs_tokenized for word in sent]
dobbs_stopped = [word for word in dobbs_flattened if word not in stop_words]

In [27]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(dobbs_stopped)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

(14779, 10)
(14779, 10)
(14779, 10)




In [28]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('abortion', 221.21534938343726), ('person', 124.9613340515249), ('opinion', 61.862694646646524), ('history', 30.677747482571423), ('concurrence', 23.83542468534604), ('advise', 23.752995564683054), ('taken', 22.005343271734013), ('century', 19.269145277329052), ('noxious', 17.857179446764913), ('labor', 14.632067226418005)]
Topic 1:
[('roe', 117.01734114172912), ('court', 113.3207740496828), ('life', 111.56394961455166), ('sec', 99.08455327773302), ('necessary', 81.81930684548087), ('procure', 76.28308790693872), ('constitutional', 54.73710306993624), ('dollars', 52.1095289664137), ('mother', 37.89701113306212), ('used', 20.330013105026666)]
Topic 2:
[('right', 118.47671125912382), ('casey', 101.72191903716215), ('medicine', 68.03138909573744), ('rule', 48.733279904168064), ('cases', 36.55320412360614), ('new', 36.51895727509398), ('decisis', 30.490099147004347), ('county', 20.05720659015158), ('whatsoever', 18.665519661905982), ('undue', 18.200551904461825)]
Topi



In [29]:
print("SVD Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

SVD Model:
Topic 0:
[('shall', 0.9999999888406649), ('drug', 3.7628949142916294e-05), ('means', 3.583215267586471e-05), ('necessary', 3.311077407780562e-05), ('viability', 3.0403845771024627e-05), ('substance', 1.786542315841523e-05), ('constitution', 1.7584913002723452e-05), ('state', 1.446727051597463e-05), ('states', 1.2360504624884966e-05), ('use', 1.1323836596714242e-05)]
Topic 1:
[('abortion', 0.9999998628745639), ('state', 0.00021685359472297018), ('use', 0.0001707489430371714), ('administer', 0.00014852603346277818), ('constitution', 0.00010889848637088378), ('procure', 0.00010021691863242949), ('constitutional', 7.86135182007264e-05), ('imprisonment', 6.200889381073607e-05), ('id', 3.766767642190911e-05), ('unless', 2.052906908812904e-05)]
Topic 2:
[('woman', 0.9999925896927576), ('use', 0.00155093989708715), ('drug', 0.0010234685789123153), ('means', 0.0010066343781128444), ('state', 0.0008717528978541957), ('exceeding', 0.0008211955398893836), ('constitutional', 0.0005956490

In [30]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

NMF Model:
Topic 0:
[('shall', 4.105155240147702), ('states', 5.68529993805981e-24), ('state', 5.73354712046732e-25), ('years', 3.8908305604070007e-25), ('sec', 1.3034838464440478e-25), ('intent', 2.9476902271215286e-26), ('opinion', 7.663819534216433e-27), ('necessary', 1.6185096670628994e-27), ('means', 1.1331413693262177e-27), ('constitutional', 3.675739860179234e-28)]
Topic 1:
[('abortion', 3.9522777407225744), ('sec', 5.1924481236106496e-24), ('opinion', 2.0939294376481085e-24), ('necessary', 8.497993902709109e-25), ('procure', 1.2247265034777872e-25), ('viability', 5.993227359959461e-26), ('use', 4.585438868836612e-26), ('instrument', 1.0822340195957719e-26), ('substance', 2.7861400111466812e-27), ('id', 4.8241264636227735e-28)]
Topic 2:
[('woman', 3.616170671592573), ('states', 1.2948006159391912e-14), ('law', 4.74652452017304e-15), ('years', 8.472667680909442e-16), ('intent', 5.523614853507444e-17), ('necessary', 3.2311723610612473e-18), ('miscarriage', 7.853444285155321e-19), 