In [None]:
%%capture
# The packages we'll be using
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction import text
import pandas as pd
import os
import nltk
nltk.download(['stopwords','punkt','wordnet'])
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import warnings
warnings.filterwarnings("ignore")
import numpy as np
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.sklearn as LDAvis
import matplotlib.pyplot as plt

In [None]:
os.chdir('/content/drive/MyDrive/Graduate School/IST736 Text Mining/Week 8/110')
os.listdir()

['110-f-r', '110-m-d', '110-m-r', '110-f-d']

In [None]:
# import the data
corpora = []
tag = []
path = os.getcwd()
for subdir, dirs, files in os.walk(path):
  for file in files:
    with open(os.path.join(subdir,file), encoding='latin1') as f:
      corpora.append(f.read())
      folderTag = subdir.rfind('/')+1
      tag.append(subdir[folderTag:])

CPU times: user 939 ms, sys: 309 ms, total: 1.25 s
Wall time: 2min 57s


In [None]:
#checking the tag
tag[0]

'110-f-r'

In [None]:
#checking the corpus
corpora[1]



In [None]:
#Found on Github as sample to incorporate both
#nltk.word_tokenizer and Lemmatization for call in Vectorizer
#git location: https://gist.github.com/4OH4/f727af7dfc0e6bb0f26d2ea41d89ee55

#Removing the "#" and "@" sign will remove features specific to twitter.

class LemmaTokenizer:
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`',"!","?", "'", "#",
                     "@",">","<","[","]","(",")",]
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in nltk.wordpunct_tokenize(doc) if t not in self.ignore_tokens]

Tokenizer = LemmaTokenizer()

In [None]:
# Instatiation of the Vectorizer
vectorizer = TfidfVectorizer(
    tokenizer=Tokenizer,
    stop_words = 'english',
    ngram_range = (1,2),
    min_df = 0.45
)

In [None]:
X_corpora = vectorizer.fit_transform(corpora)
X_corpora

<429x2467 sparse matrix of type '<class 'numpy.float64'>'
	with 719432 stored elements in Compressed Sparse Row format>

In [None]:
model = LatentDirichletAllocation(n_components=15, max_iter=100, learning_offset=50, learning_method='online', random_state=0)
lda_matrix = model.fit(X_corpora)

In [None]:
terms = vectorizer.get_feature_names_out()
for index, component in enumerate(model.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['promote', '-', 'problem', '))</', 'really', 'house floor', 'text </']
Topic 1:  ['mr', 'south', 'pain', 'period', 'remains', 'importantly', 'lesson']
Topic 2:  ['mr', 'minimum', 'greater', 's', 'told', 'going', 'grant']
Topic 3:  ['mr', '</', 'support', 'representative', 'quick', 'stated', 'mr speaker']
Topic 4:  ['mr', 's', '-', '--', 'health', 'good', 'doc']
Topic 5:  ['common', '200', 'time congress', 'revenue', 'acknowledge', '09 2008', 'direction']
Topic 6:  ['mr', '-', 'text', '))</', 'ha worked', 'decrease', 'docno']
Topic 7:  ['mr', '-', 'mr speaker', 's', 'text', 'house', 'doc']
Topic 8:  ['acting', 'board', 'sought', 'amended', 'initiative', '21st century', 'look']
Topic 9:  ['mr', '-', 'doc', 's', 'text', 'docno', '</']
Topic 10:  ['-', 'ha', 'engage', '--', 'doc', 'america', 't']
Topic 11:  ['paying', 'statistic', 'owned', 'retired', 'half', 'reduction', 'bit']
Topic 12:  ['mr', '-', 's', 'text', 'doc', '</', 'docno']
Topic 13:  ['mr', 'text', 's', 'prohibits', 

In [None]:
#Expanding stop word list
stoplist = text.ENGLISH_STOP_WORDS.union(['house', 'representative','doc','docno',"'s",'--','/docno', 'mr', 'madam','-',
                                          '))</','text </','</', '))</','</','s','speaker','text'])
# Instatiation of the Vectorizer
vectorizer = TfidfVectorizer(
    tokenizer=Tokenizer,
    stop_words = stoplist,
    ngram_range = (1,4),
    min_df = 0.33
)
X_corpora = vectorizer.fit_transform(corpora)
X_corpora

<429x3936 sparse matrix of type '<class 'numpy.float64'>'
	with 950345 stored elements in Compressed Sparse Row format>

In [None]:
#Round 2 Adjusted Stopword list
#instantiate the LDA Model
model = LatentDirichletAllocation(n_components=15, max_iter=100, learning_offset=50, learning_method='online', random_state=0)
lda_matrix = model.fit(X_corpora)
#review model topics
terms = vectorizer.get_feature_names_out()
for index, component in enumerate(model.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['guard', 'grateful', 'unfortunate', 'occasion', 'washington d c', 'deny', 'health insurance']
Topic 1:  ['oberstar', 'july 30', 'smith', '17 2007', '2005', 'additional', 'judiciary']
Topic 2:  ['right', 'economic development', 'think', 'friendly', 'payment', 'public school', 't']
Topic 3:  ['currently', 'apparently', 'billion', 'urge passage', 'treat', 'profound', 'celebrate']
Topic 4:  ['year', '$', 'wa', 'state', 'ha', 'u', 'act']
Topic 5:  ['wa', 'ha', 't', 'american', 'time', 'year', 'u']
Topic 6:  ['state', 'act', 'chairman', 'ha', 'wa', 'today', 'act 2008']
Topic 7:  ['wa', 'act', 'ha', 'need', 'people', '2007', 't']
Topic 8:  ['member body', 'answer', '90', 'lending', 'just year', '$ 1 billion', 'officer']
Topic 9:  ['support', 'family', '2007', 'ha', 'korea', 'protect', 'security']
Topic 10:  ['threatens', 'fell', 'ground', 'ha', 'fortunately', 'grateful', 'leadership']
Topic 11:  ['neighbor', 'prior', 'american', '$ 2', 'package', 'chairman', '$ 1 billion']
Topic 12

In [None]:
#Round 3 reducing the number of topics
#instantiate the LDA Model
model = LatentDirichletAllocation(n_components=6, max_iter=100, learning_offset=50, learning_method='online', random_state=0)
lda_matrix = model.fit(X_corpora)
#review model topics
terms = vectorizer.get_feature_names_out()
for index, component in enumerate(model.components_):
    zipped = zip(terms, component)
    top_terms_key=sorted(zipped, key = lambda t: t[1], reverse=True)[:7]
    top_terms_list=list(dict(top_terms_key).keys())
    print("Topic "+str(index)+": ",top_terms_list)

Topic 0:  ['guard', 'grateful', 'washington d c', 'virginia', 'unfortunate', 'deny', 'occasion']
Topic 1:  ['wa', 'new', 'ha', 'american', 'act', 'program', 'u']
Topic 2:  ['ha', 'wa', 't', 'state', 'people', 'act', 'think']
Topic 3:  ['currently', 'apparently', 'billion', 'urge passage', 'profound', 'treat', 'wisely']
Topic 4:  ['wa', 'ha', 'year', 'time', 'american', 'act', 'state']
Topic 5:  ['ha', 'wa', 'time', 'u', 'american', 'act', '$']


In [None]:
#Rerun Round 2 before executing this
panel = LDAvis.prepare(model, X_corpora, vectorizer=vectorizer, mds='tsne')
pyLDAvis.display(panel)

In [None]:
#Topic Interpretation
#Manually went through each of the topics in the LDA Vis and Looked for keywords to transform the numbers into a topic
Topics = ['Iraq War', 'Iraq War Funding', 'Energy Production', 'Military Deploymnet', 'Fomer Military Protections', 'Military Spousal Assistance', 'Religious Freedom', 'School Vouchers', 
          'Civilians in Iraq', 'Energy Cost Domestically', 'VA Funding', 'Refugees', 'Child Healthcare', 'Health Insurance', 'Resource Development']
len(Topics)

15