In [2]:
import re
from nltk import *
from bs4 import BeautifulSoup
import string
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()

In [3]:
obergefell = 'https://www.law.cornell.edu/supremecourt/text/14-556'
kennedy_opinion = 'https://www.law.cornell.edu/supremecourt/text/14-556#writing-14-556_OPINION_3'
roberts_dissent = 'https://www.law.cornell.edu/supremecourt/text/14-556#writing-14-556_DISSENT_4'
scalia_dissent = 'https://www.law.cornell.edu/supremecourt/text/14-556#writing-14-556_DISSENT_5'
thomas_dissent = 'https://www.law.cornell.edu/supremecourt/text/14-556#writing-14-556_DISSENT_6'
alito_dissent = 'https://www.law.cornell.edu/supremecourt/text/14-556#writing-14-556_DISSENT_7'

In [26]:
def make_soup(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    return soup

def get_text(soup):
    text = soup.find(class_='bodytext')        
    text = [t.text for t in text]
    return text

def get_dissent(soup):
    dissent = soup.find(class_='dissent').find_all('p')
    dissent = [t.text for t in dissent]
    return dissent

def clean(text):
    # remove unicode characters
    text = [txt.encode("ascii", "ignore").decode("ascii") for txt in text]
    # lower each string
    text = [txt.lower() for txt in text]
    # remove new line escape character
    # text = [txt.replace('\n', ' ') for txt in text]
    # remove punctuation
    text = [re.sub('[%s]' % re.escape(string.punctuation), ' ', txt)
            for txt in text]
    # remove digits
    text = [re.sub('[%s]' % re.escape(string.digits), ' ', txt)
            for txt in text]
    # remove empty strings
    # text = [txt.strip() for txt in text if txt]
    return text

In [33]:
sent_tokens = [clean(sent_tokenize(sent)) for sent in get_dissent(make_soup(scalia_dissent))[20:]]
tokens = [word_tokenize(sent) for array in sent_tokens for sent in array]

In [35]:
stopped_scalia = [word for array in tokens for word in array if word not in stop_words]

In [36]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(stopped_scalia)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

(4689, 10)
(4689, 10)
(4689, 10)




In [37]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('court', 54.324251624883516), ('laws', 30.394038553259158), ('constitutional', 21.330032850073646), ('new', 19.727213371528496), ('institution', 11.936615512132965), ('judicial', 11.690485697332942), ('individual', 9.078031519776557), ('loving', 6.354935099903142), ('precedent', 5.66747016090418), ('certainly', 5.094660835809555)]
Topic 1:
[('ante', 40.276036164347396), ('law', 31.04951269374435), ('states', 28.107782378038912), ('fundamental', 19.782473775977508), ('union', 12.695309816952863), ('id', 12.285219453853045), ('government', 7.944837098491277), ('freedom', 6.303272745424996), ('result', 6.07452411706732), ('requires', 5.913131131853405)]
Topic 2:
[('majority', 47.96670845027344), ('majoritys', 21.99216998513673), ('lochner', 14.588885625360906), ('question', 13.887665131381763), ('position', 9.906243568990126), ('public', 8.700163240406498), ('time', 8.652060720956518), ('privacy', 8.22127370817266), ('implied', 7.763856613200818), ('years', 7.3258704



In [38]:
print("SVD Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

SVD Model:
Topic 0:
[('marriage', 0.9999999994760768), ('history', 1.9076204238350446e-05), ('definition', 8.84169778706528e-06), ('state', 7.4044360722517765e-06), ('constitutional', 3.98935433012742e-06), ('majoritys', 3.5345568453375876e-06), ('rights', 3.1966367632376974e-06), ('couples', 2.988827140370863e-06), ('laws', 2.919726986100139e-06), ('lochner', 2.891289664311642e-06)]
Topic 1:
[('court', 0.9999722939885705), ('rights', 0.003827752176175454), ('petitioners', 0.002360658729013807), ('woman', 0.0015551325134436301), ('marry', 0.0011974920565742551), ('people', 0.001072731613739752), ('definition', 0.0009944189987220026), ('liberty', 0.0007987639035384114), ('ante', 0.0005783144416056934), ('man', 0.0004380091164168111)]
Topic 2:
[('sex', 0.9999748678602072), ('rights', 0.0014364366210240932), ('opinion', 0.0011769347440139834), ('people', 0.0009587357291246067), ('court', 0.0006406383436901388), ('courts', 0.0005828637889557452), ('issue', 0.0005530825045000208), ('process