In [317]:
import re
from string import punctuation
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

class SimpleSummarize:
    def __init__(self, filename=None, k=None):
        self.txt = None
        self.word_tokens = None
        self.sent_tokens = None
        self.word_freq = None
        self.freq_dist = {}
        self.sent_scores = {}
        self.top_sents = None
        self.max_len = 40
        self.summary = ''
        self.scores = []
        self.english_stopwords = set(stopwords.words('english')) | stop_words
        if filename and k:
            self.load_file_from_disk(filename)
            self.tokenize()
            self.word_freq_dist()
            self.score_sentences()
            self.summarize(k)
    
    def load_file_from_disk(self, filename):
        with open(filename, "r") as file:
            self.txt = file.read().replace("\n", " ")
            self.txt = self.txt.replace("\'","")
    
    def tokenize(self):
        self.word_tokens = self.tokenizer(self.txt)
        #self.sent_tokens = self.simple_sent_tokenizer(self.txt)
        self.sent_tokens = sent_tokenize(self.txt)

    def simple_sent_tokenizer(self, s):
        sents = []
        for sent in s.split('.'):
            sents.append(sent.strip())
        return sents
        
    def tokenizer(self,txt):
        txt = txt.lower()
        word_tokens = word_tokenize(txt.lower())
        word_tokens = [w for w in word_tokens if w not in self.english_stopwords and re.match('[a-zA-Z-][a-zA-Z-]{2,}', w)]
        return word_tokens
    
    def word_freq_dist(self):
        self.word_freq = nltk.FreqDist(self.word_tokens)
        most_freq_count = max(self.word_freq.values())
        for k,v in self.word_freq.items():
            self.freq_dist[k] = v/most_freq_count
    
    def score_sentences(self):
        for sent in self.sent_tokens:
            words = self.tokenizer(sent)
            for word in words:
                if word.lower() in self.freq_dist.keys():
                    if len(words) < self.max_len:
                        # if key does not exist add it and the freq_dist for the first word
                        if sent not in self.sent_scores.keys():
                            self.sent_scores[sent] = self.freq_dist[word.lower()]
                        else: 
                            # the key exists and we just add the freq_dist of the following words. 
                            # We are just summing up the freq_dists for the sentence
                            self.sent_scores[sent] += self.freq_dist[word.lower()]
    
    def summarize(self, k):
        self.top_sents = Counter(self.sent_scores)
        for t in self.top_sents.most_common(k):
            self.summary += t[0].strip()+'. '
            self.scores.append((t[1],t[0]))
    

In [318]:
# foo = SimpleSummarize()
# foo.load_file_from_disk("CNNImpeachmentArticle.txt")
# foo.tokenize()
# foo.word_freq_dist()
# foo.score_sentences()
# foo.summarize(3)
# foo.summary
foo = SimpleSummarize(filename="CNNImpeachmentArticle.txt", k=3)
foo.summary

'The Judiciary Committee hearing is the latest sign that House Democrats are moving forward with impeachment proceedings against the President following the two-month investigation led by the House Intelligence Committee into allegations that Trump pushed Ukraine to investigate his political rivals while a White House meeting and $400 million in security aid were withheld from Kiev.. The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panels first impeachment hearing next week as the House moves another step closer to impeaching the President.. READ: Judiciary Chairman&#39;s invite to Trump and his lawyers to take part in upcoming impeachment hearings The hearing announcement comes as the Intelligence Committee plans to release its report summarizing the findings of its investigation to the House Judiciary Committee soon after Congress returns from its Thanksgiving recess next week.. '

# Text Extraction from PDFs

In [324]:
import PyPDF2

In [413]:
pdf = 'impeachment_data/20191203_-_full_report___hpsci_impeachment_inquiry_-_20191203.pdf'
pd_file_obj = open(pdf, 'rb')
pdf_reader = PyPDF2.PdfFileReader(pd_file_obj)
num_pages = pdf_reader.getNumPages()
full_text = ''
for p in range(num_pages):
    full_text = full_text + pdf_reader.getPage(p).extractText().strip().replace('\n','')
with open('impeachment_data/trump_impeachment_inquiry.txt', 'w+') as file:
    file.write(full_text)
    file.close()

In [414]:
full_text[:1000]

'THE TRUMP-UKRAINE  IMPEACHMENT INQUIRY REPORT   Report of the House Permanent Select Committee on Intelligence, Pursuant to H. Res. 660 in Consultation with the  House Committee on Oversight and Reform and the  House Committee on Foreign Affairs   December 20192  House Permanent Select Committee on Intelligence  Rep. Adam B. Schiff (CA), Chairman  Rep. Jim Himes (CT) Rep. Terri Sewell (AL) Rep. André Carson (IN)  Rep. Jackie Speier (CA) Rep. Mike Quigley (IL)  Rep. Eric Swalwell (CA)  Rep. Joaquin Castro (TX)  Rep. Denny Heck (WA)  Rep. Peter Welch (VT)  Rep. Sean Patrick Maloney (NY)  Rep. Devin Nunes (CA), Ranking Member  Rep. Mike Conaway (TX)  Rep. Michael Turner (OH)  Rep. Brad Wenstrup (OH)  Rep. Chris Stewart (UT)  Rep. Elise Stefanik (NY)  Rep. Will Hurd (TX)  Rep. John Ratcliffe (TX) Rep. Jim Jordan (OH) Rep. Val Demings (FL) Rep. Raja Krishnamoorthi (IL)  Majority Staff Timothy S. Bergreen, Staff Director  Daniel S. Goldman, Director of Investigations Maher Bitar, General Co

In [415]:
impeachment_summary = SimpleSummarize(filename="impeachment_data/trump_impeachment_inquiry.txt", k=3)
impeachment_summary.summary

'E. Cummings, House Committee on Oversight and Reform, and Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, the same day).. 77 Letter from Pat A. Cipollone, Counsel to the President, The White House, to House Speaker Nancy Pelosi, Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, Chairman Eliot L. Engel, House Committee on Foreign Affairs Committee, and Chairman Elijah E. Cummings, House Committee on Oversight and Reform (Oct. 8, 2019) (online at www.whitehouse.gov/wp-content/uploads/2019/10/PAC-Letter-10.08.2019.pdf).. 88 Letter from Pat A. Cipollone, Counsel to the President, The White House, to House Speaker Nancy Pelosi, Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, Chairman Eliot L. Engel, House Committee on Foreign Affairs Committee, and Chairman Elijah E. Cummings, House Committee on Oversight and Reform (Oct. 8, 2019) (online at www.whitehouse.gov/wp-content/uploads/2019/10/PAC-Letter-10.08.2019.pdf

In [416]:
impeachment_summary.scores

[(24.507658643326028,
  'E. Cummings, House Committee on Oversight and Reform, and Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, the same day).'),
 (10.921225382932166,
  '77 Letter from Pat A. Cipollone, Counsel to the President, The White House, to House Speaker Nancy Pelosi, Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, Chairman Eliot L. Engel, House Committee on Foreign Affairs Committee, and Chairman Elijah E. Cummings, House Committee on Oversight and Reform (Oct. 8, 2019) (online at www.whitehouse.gov/wp-content/uploads/2019/10/PAC-Letter-10.08.2019.pdf).'),
 (10.921225382932166,
  '88 Letter from Pat A. Cipollone, Counsel to the President, The White House, to House Speaker Nancy Pelosi, Chairman Adam B. Schiff, House Permanent Select Committee on Intelligence, Chairman Eliot L. Engel, House Committee on Foreign Affairs Committee, and Chairman Elijah E. Cummings, House Committee on Oversight and Reform (Oct. 8, 2019) (onl

# Topic Modeling

https://nlpforhackers.io/topic-modeling/

# Topic Modeling Algorithms

There are several algorithms for doing topic modeling. The most popular ones include

LDA – Latent Dirichlet Allocation – The one we’ll be focusing in this tutorial. Its foundations are Probabilistic Graphical Models

LSA or LSI – Latent Semantic Analysis or Latent Semantic Indexing – Uses Singular Value Decomposition (SVD) on the Document-Term Matrix. Based on Linear Algebra

NMF – Non-Negative Matrix Factorization – Based on Linear Algebra

# Using Gensim for Topic Modeling

In [417]:
import re
from glob import glob
from gensim import models, corpora, similarities
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

In [433]:
NUM_TOPICS = 2
STOPWORDS = set(stopwords.words('english')) | stop_words

In [422]:
def load_files_from_disk(data_dir):
    text_data_list = []
    file_list = glob(pathname=data_dir + '/*txt')
    for file in file_list: 
        with open(file, "r") as f:
            text_data_list.append(f.read())
    return text_data_list

In [420]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [423]:
article_data = load_files_from_disk('impeachment_data/')
len(article_data)

2

In [424]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in article_data:
    tokenized_data.append(clean_text(text))

In [425]:
# Build a Gensim Dictionary - assocation word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
# for k,v in dictionary.items():
#     print((k, v))

In [426]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [427]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

#Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [435]:
print("=" * 115)
print("LDA Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lda_model.print_topic(idx,5))
print("=" * 115)
print("LSI Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lsi_model.print_topic(idx,5))
print("=" * 115)

LDA Model:
Topic #0: 0.034*"president" + 0.018*"house" + 0.016*"trump" + 0.013*"ambassador" + 0.011*"ukraine"
Topic #1: 0.025*"president" + 0.017*"house" + 0.016*"ambassador" + 0.013*"committee" + 0.012*"ukraine"
LSI Model:
Topic #0: 0.582*"president" + 0.295*"house" + 0.269*"ambassador" + 0.225*"trump" + 0.193*"ukraine"
Topic #1: -0.543*"president" + 0.286*"committee" + 0.246*"dep" + 0.232*"house" + 0.185*"chairman"


In [444]:
t = """
The report puts Trump personal lawyer Rudy Giuliani at the center of a scheme to force out the U.S. ambassador to Ukraine and pressure that country’s government to investigate Joe Biden’s family and a conspiracy theory that Ukraine interfered in the 2016 U.S. election.
The House obtained AT&T call records showing Giuliani in contact with phone numbers associated with the White House, the Office of Management and Budget, top Intelligence Committee Republican Devin Nunes, and Giuliani associate Lev Parnas. The report doesn’t say who in the White House or OMB participated in the calls.
The calls and texts were made during the time period when Giuliani was publicly discussing his efforts to pursue investigations into the Bidens and a conspiracy theory about Ukrainian interference in the 2016 election.
House Intelligence Chairman Adam Schiff said the call records show that “there was considerable coordination among the parties including the White House” in a smear campaign against then-U.S. Ambassador Marie Yovanovitch.
The committee also found Giuliani in contact on Aug. 8 with phone numbers associated with the White House amid negotiations with Ukrainian officials about announcing investigations. The records also showed European Union Ambassador Gordon Sondland in contact with White House and OMB phone numbers on Aug. 9.
One of the Sondland calls came minutes before a text message he sent saying that he thought Trump strongly wanted the “deliverable.” Sondland later said that referred to an announcement by Ukraine of investigations sought by Trump and Giuliani.
"""
bow = dictionary.doc2bow(clean_text(t))
print(lsi_model[bow])
print(lda_model[bow])

[(0, 8.212948210731929), (1, 3.293606890748997)]
[(6, 0.9864905)]


In [446]:
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

In [447]:
sims = lda_index[lda_model[bow]]

In [448]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [449]:
print(sims[:10])

[(0, 0.99974495), (1, 0.034893364)]


In [455]:
document_id, similarity = sims[1]
print(article_data[document_id][:1000])

HOME  
The Trump-Ukraine Impeachment Inquiry Report

The impeachment inquiry into Donald J. Trump, the 45th President of the United States, uncovered a months-long effort by President Trump to use the powers of his office to solicit foreign interference on his behalf in the 2020 election.  As described in this executive summary and the report that follows, President Trump’s scheme subverted U.S. foreign policy toward Ukraine and undermined our national security in favor of two politically motivated investigations that would help his presidential reelection campaign.  The President demanded that the newly-elected Ukrainian president, Volodymyr Zelensky, publicly announce investigations into a political rival that he apparently feared the most, former Vice President Joe Biden, and into a discredited theory that it was Ukraine, not Russia, that interfered in the 2016 presidential election.  To compel the Ukrainian President to do his political bidding, President Trump conditioned two offi

# Using Scikit-Learn for Topic Modeling

scikit-learn offers an NMF model in addition to LDA and LSI models