In [1]:
import re
from nltk import *
from bs4 import BeautifulSoup
import string
import requests
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import RegexpParser, Tree
from nltk.util import ngrams

stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()



In [2]:
plessy = 'https://www.law.cornell.edu/supremecourt/text/163/537'
r = requests.get(plessy).text
soup = BeautifulSoup(r, 'html.parser')

In [33]:
def get_text():
    text = soup.find(class_='bodytext')        
    text = [t.text for t in text]
    return text

def clean(text):
    # remove unicode characters
    text = [txt.encode("ascii", "ignore").decode("ascii") for txt in text]
    # lowercase each string
    text = [txt.lower() for txt in text]
    # remove new line escape character
    text = [txt.replace('\n', '') for txt in text]
    # remove punctuation
    text = [re.sub('[%s]' % re.escape(string.punctuation), ' ', txt) for txt in text]
    # remove digits
    text = [re.sub('[%s]' % re.escape(string.digits), ' ', txt) for txt in text]
    # remove empty strings
    text = [txt.strip() for txt in text if txt]
    return text

In [51]:
plessy_text = get_text()
plessy_text = [sent_tokenize(sent) for sent in plessy_text]
plessy_text = [sent for sent in plessy_text if sent] 
opinion = plessy_text[6:34]
dissent = plessy_text[36:-1]

In [89]:
opinion_tokens = [clean(word_tokenize(word)) for sent in opinion for word in sent]
cleaned_tokens = [word for sent in opinion_tokens for word in sent if word]
plessy_stopped = [word for word in cleaned_tokens if word not in stop_words]

In [91]:
NUM_TOPICS = 10

vectorizer = CountVectorizer(min_df=5, max_df=0.9, stop_words='english', lowercase=True)
data_vectorized = vectorizer.fit_transform(plessy_stopped)

lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)

nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)

lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

(2686, 10)
(2686, 10)
(2686, 10)




In [92]:
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
[('race', 41.380909820678184), ('act', 18.131618884997806), ('question', 12.269996355467718), ('equal', 11.928383434935379), ('constitution', 7.001394029896492), ('exercise', 5.429997264891103), ('carrying', 5.367148677622111), ('shall', 0.1002343144373943), ('justice', 0.10021276709084843), ('general', 0.1001905459962534)]
Topic 1:
[('said', 16.05955976621757), ('commerce', 13.983654389292484), ('passengers', 12.244466660590884), ('railroad', 10.470137231047872), ('st', 10.180891138387937), ('children', 6.946577219239472), ('particular', 6.329224150994328), ('prohibition', 6.262348974351535), ('civil', 6.142518719420613), ('ohio', 5.977669146266599)]
Topic 2:
[('colored', 34.8374077696012), ('rights', 16.508793242207286), ('passenger', 16.44033335642596), ('separate', 15.4175586249693), ('petitioner', 11.807155187576699), ('parish', 5.068756721580701), ('carrying', 0.10003125012561528), ('statute', 0.10003087912790364), ('accommodations', 0.10003073055019034), ('di



In [93]:
print("SVD Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

SVD Model:
Topic 0:
[('state', 0.999999887768824), ('color', 0.000334374797524938), ('commerce', 0.00012671539305462597), ('said', 0.00012063042159822165), ('amendment', 0.00011964559268360472), ('railway', 0.00011120377340840466), ('passengers', 7.930581518739785e-05), ('question', 7.436836254608968e-05), ('petitioner', 6.938941127552435e-05), ('shall', 6.766521743007588e-05)]
Topic 1:
[('race', 0.9999990287290598), ('separate', 0.00023315141175294515), ('rights', 0.0002261691199049598), ('united', 0.00019787105303845093), ('passenger', 0.00019433410990208063), ('case', 0.00018297703841458187), ('coach', 0.0001802535549523719), ('accommodations', 0.00014153647121677523), ('sup', 0.00011500692580174148), ('equal', 8.188287714293317e-05)]
Topic 2:
[('colored', 0.9999979597827215), ('color', 0.0005519330936291733), ('railway', 0.000528330581513063), ('equal', 0.0005124449926329561), ('law', 0.00026169319314520617), ('question', 0.0002582559738800003), ('passenger', 0.0002152537775999223)

In [94]:
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)

NMF Model:
Topic 0:
[('state', 2.659147947363406), ('passenger', 1.8254273684374448e-26), ('said', 1.326958728113662e-26), ('races', 1.2883327986602069e-26), ('amendment', 7.777956876210687e-28), ('rights', 2.4066789185880586e-28), ('coach', 4.322057349265295e-29), ('railway', 6.395568284817204e-30), ('separate', 3.438709805670099e-30), ('law', 1.3088686256600636e-30)]
Topic 1:
[('race', 2.5457298392477563), ('passenger', 4.273306428939555e-22), ('races', 2.9388775146714284e-22), ('said', 6.027131563213107e-23), ('amendment', 9.230018920930518e-25), ('coach', 5.161792730076093e-25), ('law', 5.393184180498017e-26), ('separate', 2.3327719675217854e-26), ('color', 1.0594557751198074e-27), ('persons', 2.654054619884575e-29)]
Topic 2:
[('colored', 2.414735501050429), ('said', 3.131239954423774e-17), ('rights', 1.4618019053970783e-19), ('law', 1.2182134088147824e-20), ('coach', 6.783137320815167e-21), ('separate', 2.1157523485108534e-21), ('held', 4.456884472516113e-22), ('shall', 1.79741979