# Text Summarization using Python

In [34]:
import re
import json
import pandas as pd
import numpy as np
import networkx as nx
from flask import Flask
from flask import request, jsonify
from string import punctuation
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import logging
import urllib3
import requests
logging.basicConfig(level=logging.INFO)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [None]:
class SimpleSummarize:
    """Frequency-based extractive summarizer.

    Pipeline: load text -> tokenize into words and sentences -> compute the
    relative frequency of every kept word -> score each sentence as the sum
    of its words' relative frequencies -> keep the ``k`` best sentences.

    Passing both ``filename`` and ``k`` to the constructor runs the whole
    pipeline; otherwise call the steps manually in the order above.

    Attributes:
        txt: Raw text with newlines turned into spaces and apostrophes removed.
        word_tokens: Lower-cased, stopword-filtered word tokens of ``txt``.
        sent_tokens: Sentences of ``txt`` as returned by ``sent_tokenize``.
        word_freq: ``nltk.FreqDist`` over ``word_tokens``.
        freq_dist: word -> count divided by the highest word count.
        sent_scores: sentence -> summed ``freq_dist`` weight of its words.
        top_sents: ``Counter`` view of ``sent_scores`` used for ranking.
        max_len: Sentences with this many kept tokens or more are not scored.
        summary: The selected sentences joined by single spaces.
        scores: ``(score, sentence)`` pairs for the selected sentences.
    """

    def __init__(self, filename=None, k=None):
        self.txt = None
        self.word_tokens = None
        self.sent_tokens = None
        self.word_freq = None
        self.freq_dist = {}
        self.sent_scores = {}
        self.top_sents = None
        self.max_len = 40
        self.summary = ''
        self.scores = []
        self.english_stopwords = set(stopwords.words('english')) | stop_words
        if filename and k:
            self.load_file_from_disk(filename)
            self.tokenize()
            self.word_freq_dist()
            self.score_sentences()
            self.summarize(k)

    def load_file_from_disk(self, filename):
        """Read ``filename`` into ``self.txt``, flattening newlines and dropping apostrophes."""
        with open(filename, "r") as file:
            self.txt = file.read().replace("\n", " ")
        self.txt = self.txt.replace("'", "")

    def tokenize(self):
        """Populate ``word_tokens`` and ``sent_tokens`` from ``self.txt``."""
        self.word_tokens = self.tokenizer(self.txt)
        self.sent_tokens = sent_tokenize(self.txt)

    def simple_sent_tokenizer(self, s):
        """Naive alternative to ``sent_tokenize``: split on '.' and strip each piece."""
        return [sent.strip() for sent in s.split('.')]

    def tokenizer(self, txt):
        """Return lower-cased tokens of ``txt`` that are not stopwords.

        A token is kept only when it starts with at least three characters
        from letters/hyphen (``re.match`` anchors at the start only).
        """
        word_tokens = word_tokenize(txt.lower())
        return [w for w in word_tokens
                if w not in self.english_stopwords
                and re.match('[a-zA-Z-][a-zA-Z-]{2,}', w)]

    def word_freq_dist(self):
        """Compute ``word_freq`` and the normalised ``freq_dist`` (values in (0, 1])."""
        self.word_freq = nltk.FreqDist(self.word_tokens)
        # Rebuild from scratch so repeated calls never keep stale words.
        self.freq_dist = {}
        if not self.word_freq:
            # No usable tokens: max() would raise ValueError on an empty sequence.
            return
        most_freq_count = max(self.word_freq.values())
        for word, count in self.word_freq.items():
            self.freq_dist[word] = count / most_freq_count

    def score_sentences(self):
        """Score every sentence shorter than ``max_len`` kept tokens.

        A sentence gets an entry in ``sent_scores`` only if at least one of
        its tokens appears in ``freq_dist``. Identical sentence strings share
        one entry and their scores add up.
        """
        # Start clean so calling this twice does not double every score.
        self.sent_scores = {}
        for sent in self.sent_tokens:
            words = self.tokenizer(sent)
            if len(words) >= self.max_len:
                continue
            for word in words:
                key = word.lower()
                if key in self.freq_dist:
                    self.sent_scores[sent] = self.sent_scores.get(sent, 0) + self.freq_dist[key]

    def summarize(self, k):
        """Select the ``k`` highest scoring sentences into ``summary`` and ``scores``.

        Sentences are emitted in descending score order, separated by a
        single space. A period is appended only when a sentence does not
        already end in '.', '!' or '?' (optionally followed by closing quotes
        or brackets), which avoids the doubled '..' produced when sentences
        keep their own terminal punctuation.
        """
        self.top_sents = Counter(self.sent_scores)
        # Reset outputs so the method is idempotent.
        self.scores = []
        chosen = []
        for sent, score in self.top_sents.most_common(k):
            text = sent.strip()
            core = text.rstrip('"\'\u201d\u2019)]')
            if core and not core.endswith(('.', '!', '?')):
                text += '.'
            chosen.append(text)
            self.scores.append((score, sent))
        self.summary = ' '.join(chosen)
    

In [None]:
# Passing both `filename` and `k` makes the constructor run the whole pipeline:
# load_file_from_disk -> tokenize -> word_freq_dist -> score_sentences -> summarize(k).
# The same steps can be called one by one on an instance created without arguments.
foo = SimpleSummarize(filename="CNNImpeachmentArticle.txt", k=3)
foo.summary

## Gensim

https://towardsdatascience.com/text-summarization-in-python-76c0a41f0dc4

In [2]:
from gensim.summarization.summarizer import summarize
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

In [3]:
with open("CNNImpeachmentArticle.txt", "r") as file:
    text = file.read()
print(summarize(text))

House Judiciary Chairman Jerry Nadler sent a letter to Trump on Tuesday notifying him of the hearing and inviting the President or his counsel to participate, including asking questions of the witnesses.
The Judiciary Committee hearing is the latest sign that House Democrats are moving forward with impeachment proceedings against the President following the two-month investigation led by the House Intelligence Committee into allegations that Trump pushed Ukraine to investigate his political rivals while a White House meeting and $400 million in security aid were withheld from Kiev.
Nadler asked Trump to respond by Sunday on whether the White House wanted to participate in the hearings, as well as who would act as the President's counsel for the proceedings.


In [4]:
text2 = """We all know that music is a powerful influencer. A movie without a soundtrack doesn’t provoke the same emotional journey. A workout without a pump-up anthem can feel like a drag. But is there a way to quantify these reactions? And if so, could they be reverse-engineered and put to use?

In a new paper, researchers at the University of Southern California mapped out how things like  pitch, rhythm, and harmony induce different types of brain activity, physiological reactions (heat, sweat, and changes in electrical response), and emotions (happiness or sadness), and how machine learning could use those relationships to predict how people might respond to a new piece of music. The results, presented at a conference last week on the intersections of computer science and art, show how we may one day be able to engineer targeted musical experiences for purposes ranging from therapy to movies.

The research is part of the lab’s broader goal to understand how different forms of media, such as films and TV ads as well as music, affect people’s bodies and brains. “Once we understand how media can affect your various emotions, then we can try to productively use it for actually supporting or enhancing human experiences,” says Shrikanth Narayanan, a professor at USC and the principal investigator in the lab."""

In [5]:
summarize(text2)

'In a new paper, researchers at the University of Southern California mapped out how things like  pitch, rhythm, and harmony induce different types of brain activity, physiological reactions (heat, sweat, and changes in electrical response), and emotions (happiness or sadness), and how machine learning could use those relationships to predict how people might respond to a new piece of music.'

In [6]:
text3 = """Republican South Carolina Senator Lindsey Graham is statistically tied with Democratic challenger Jaime Harrison, with the staunch pro-Trump incumbent seeing his favorability ratings plummet among independent voters.

Graham, who was first elected to the U.S. Senate in 2002, is clinging to a 2-percentage point lead over Harrison, 47 to 45 percent, with nearly 10 percent of voters surveyed still remaining undecided about their 2020 vote. As the Change Research/Post and Courier newspaper poll notes, Graham performs poorly in hypothetical November election matchups as well as with voters who do not identify solely as Republicans. The statistical dead heat between Graham and Harrison, who has pulled in record fundraising in recent weeks, comes as the GOP senator has become one of President Donald Trump's most outspoken supporters during the ongoing impeachment inquiry.

Trump won the traditionally-Republican state of South Carolina in the 2016 presidential election by a margin of nearly 15 percentage points over Democrat Hillary Clinton.

"[Senator Lindsey Graham] looks extremely vulnerable against Democratic contender Jaime Harrison," the South Carolina pollsters noted among their key findings. "While South Carolina does not support impeaching President Trump, a majority of voters would like Senator Graham to approach the impeachment inquiry with an open mind, rather than leap to the president's defense before hearing evidence."

Graham has not always been such a staunch defender of Trump, with the longtime Republican senator infamously remarking during the 2016 primary, "If we nominate Trump, we will get destroyed......and we will deserve it."


Lindsey Graham
✔
@LindseyGrahamSC
If we nominate Trump, we will get destroyed.......and we will deserve it.

162K
1:03 PM - May 3, 2016
Twitter Ads info and privacy
143K people are talking about this
However, Graham made recent comments that he doubts the president so little that he doesn't intend on being a "fair juror," if and when the House votes to send the articles of impeachment over to the Senate.

Harrison told residents of Greenville, South Carolina on Saturday that Graham is "not worthy of this state ... the winds of change are blowing my friends," the Greenville News reported. Responding to the neck-and-neck South Carolina poll on Twitter, Harrison asked his supporters and potential voters to remember that any campaign is possible in the current political environment.

"Running against Senator Graham is a tough climb, but it's also a hill worth climbing. I've faced things folks have deemed impossible my entire life, and this is yet another journey where I prove that in America, the impossible is always possible," Harrison wrote.

Harrison declared his candidacy in June and has previously served as chair and senior counselor at the Democratic National Committee, as well as heading the Democratic Party of South Carolina. The Yale University graduate and South Carolina native was also an advisor to Congressman James Clyburn.

"It's an uphill battle, no question, but Jaime is uniquely qualified," said House Majority Whip Clyburn, told The Washington Post last week. "He has the kind of life experiences that allow him to really connect with ordinary people."

The Change Research surveyed 998 likely general election voters in the state of South Carolina between December 6-11. The poll found only 38 percent of likely voters said they will "definitely" or "probably" vote for Graham next November. Trump has higher favorability numbers than Graham, with the Republican senator holding onto the support of less than half of those surveyed. In terms of the presidential election, former Vice President Joe Biden leads the Democratic primary with 27 percent, followed by Vermont Senator Bernie Sanders with 20 percent of the vote."""

text3 = text3.strip().replace("\n"," ")

In [7]:
text3.replace("\'","")

'Republican South Carolina Senator Lindsey Graham is statistically tied with Democratic challenger Jaime Harrison, with the staunch pro-Trump incumbent seeing his favorability ratings plummet among independent voters.  Graham, who was first elected to the U.S. Senate in 2002, is clinging to a 2-percentage point lead over Harrison, 47 to 45 percent, with nearly 10 percent of voters surveyed still remaining undecided about their 2020 vote. As the Change Research/Post and Courier newspaper poll notes, Graham performs poorly in hypothetical November election matchups as well as with voters who do not identify solely as Republicans. The statistical dead heat between Graham and Harrison, who has pulled in record fundraising in recent weeks, comes as the GOP senator has become one of President Donald Trumps most outspoken supporters during the ongoing impeachment inquiry.  Trump won the traditionally-Republican state of South Carolina in the 2016 presidential election by a margin of nearly 15

## Another text summarizer

https://towardsdatascience.com/understand-text-summarization-and-create-your-own-summarizer-in-python-b26a9f09fc70

In [None]:
from nltk.corpus import stopwords
from nltk.cluster.util import cosine_distance
import numpy as np
import networkx as nx
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sw

In [None]:
def read_article(file_name):
    """Read ``file_name`` and return its contents split into sentences.

    The split is done by ``sent_tokenize``; the result is whatever that
    function returns for the file's full text.
    """
    with open(file_name, 'r') as handle:
        return sent_tokenize(handle.read())

In [None]:
def sentence_similarity(sent1, sent2, stopwords=None):
    """Cosine similarity between the bag-of-words vectors of two sentences.

    Args:
        sent1, sent2: Either a sentence string or an iterable of word tokens.
            Strings are split into lower-cased word tokens first; previously a
            string was iterated character by character, so the similarity was
            computed over letters instead of words.
        stopwords: Optional collection of lower-cased words to ignore.

    Returns:
        A float in [0, 1]. Returns 0.0 when either sentence has no
        non-stopword tokens (the cosine is undefined for a zero vector).
    """
    def _tokens(sentence):
        if isinstance(sentence, str):
            return re.findall(r"\w+(?:[-']\w+)*", sentence.lower())
        return [w.lower() for w in sentence]

    ignored = stopwords if stopwords is not None else ()
    counts1 = Counter(w for w in _tokens(sent1) if w not in ignored)
    counts2 = Counter(w for w in _tokens(sent2) if w not in ignored)

    # Counter lookups return 0 for missing words, so only shared words contribute.
    dot = sum(n * counts2[w] for w, n in counts1.items())
    norm1 = sum(n * n for n in counts1.values()) ** 0.5
    norm2 = sum(n * n for n in counts2.values()) ** 0.5
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)

In [None]:
def build_similarity_matrix(sentences, stop_words):
    """Return an n x n matrix of pairwise sentence similarities.

    Entry [i][j] holds ``sentence_similarity(sentences[i], sentences[j],
    stop_words)`` for i != j; the diagonal is left at zero.
    """
    count = len(sentences)
    similarity_matrix = np.zeros([count, count])
    for row, left in enumerate(sentences):
        for col, right in enumerate(sentences):
            if row != col:
                similarity_matrix[row][col] = sentence_similarity(left, right, stop_words)
    return similarity_matrix

In [None]:
def generate_summary(file_name, top_n=5):
    """Print and return a summary made of the top-ranked sentences of a file.

    Sentences are ranked by running PageRank over the graph built from the
    pairwise sentence-similarity matrix.

    Args:
        file_name: Path of the text file to summarise.
        top_n: Maximum number of sentences to keep. If the article has fewer
            sentences, all of them are used (previously this raised
            IndexError).

    Returns:
        The summary string (also printed, as before).
    """
    # Union of the NLTK and scikit-learn English stopword lists.
    stop_words = set(stopwords.words('english')) | sw
    sentences = read_article(file_name)
    sentence_similarity_matrix = build_similarity_matrix(sentences, stop_words)
    sentence_similarity_graph = nx.from_numpy_array(sentence_similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)
    ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    # Slicing caps at the number of available sentences; max() keeps a
    # negative top_n equivalent to "no sentences".
    summarize_text = [s.strip() for _, s in ranked_sentences[:max(top_n, 0)]]
    # Each sentence is used as read, so join with a space instead of ". ",
    # which would double the period on sentences that already end in one.
    summary = " ".join(summarize_text)
    print("Summarize Text: \n", summary)
    return summary

In [None]:
s1 = 'This is a sentence that I wrote.'
s2 = 'I wrote a sentence and drew a picture.'
s3 = 'Those are some green apples.'
print(sentence_similarity(s1, s2, stop_words))
print(sentence_similarity(s1, s3, stop_words))
print(sentence_similarity(s2, s3, stop_words))

In [None]:
sentences = read_article("impeachment_data/cnn_article.txt")
build_similarity_matrix(sentences, stop_words)

In [None]:
generate_summary("CNNImpeachmentArticle.txt", top_n=5)

# Text Extraction from PDFs

In [None]:
import PyPDF2

In [None]:
pdf = 'impeachment_data/20191203_-_full_report___hpsci_impeachment_inquiry_-_20191203.pdf'
# Open the PDF in a context manager so the handle is closed once every page
# has been read (it was previously left open).
with open(pdf, 'rb') as pd_file_obj:
    pdf_reader = PyPDF2.PdfFileReader(pd_file_obj)
    num_pages = pdf_reader.getNumPages()
    # Collect the per-page text and join once instead of growing a string in a loop.
    page_texts = [
        pdf_reader.getPage(p).extractText().strip().replace('\n', '')
        for p in range(num_pages)
    ]
full_text = ''.join(page_texts)
# The `with` block closes the output file; no explicit close() is needed.
with open('impeachment_data/trump_impeachment_inquiry.txt', 'w') as file:
    file.write(full_text)

In [None]:
full_text[:1000]

In [None]:
impeachment_summary = SimpleSummarize(filename="impeachment_data/trump_impeachment_inquiry.txt", k=3)
impeachment_summary.summary

In [None]:
impeachment_summary.scores

# Topic Modeling

https://nlpforhackers.io/topic-modeling/

# Topic Modeling Algorithms

There are several algorithms for doing topic modeling. The most popular ones include

LDA – Latent Dirichlet Allocation – The one we’ll be focusing on in this tutorial. Its foundations are Probabilistic Graphical Models

LSA or LSI – Latent Semantic Analysis or Latent Semantic Indexing – Uses Singular Value Decomposition (SVD) on the Document-Term Matrix. Based on Linear Algebra

NMF – Non-Negative Matrix Factorization – Based on Linear Algebra

# Using Gensim for Topic Modeling

In [17]:
import re
from glob import glob
from gensim import models, corpora, similarities
from gensim.summarization.summarizer import summarize
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

In [18]:
def load_files_from_disk(data_dir):
    """Return the contents of every file in ``data_dir`` whose name ends in 'txt'.

    Files are read in sorted path order. ``glob`` itself returns paths in
    arbitrary order, which made the position of each document in the result
    (used later as a document id) vary between runs and machines.

    Args:
        data_dir: Directory to scan, with or without a trailing slash.

    Returns:
        A list with one string per matching file; empty if nothing matches.
    """
    text_data_list = []
    for path in sorted(glob(pathname=data_dir + '/*txt')):
        with open(path, "r") as f:
            text_data_list.append(f.read())
    return text_data_list

In [236]:
class SummaryTopics:
    """Summarise a collection of documents and extract their topic keywords.

    Typical order of calls: ``tokenize_text`` -> ``build_dictionary`` ->
    ``build_corpus`` -> ``build_models`` -> ``get_topics``.

    Attributes:
        text_data: List of raw document strings.
        num_topics: Number of topics each model is trained with.
        tokenized_data: One list of cleaned tokens per document.
        all_topics: Topic words found by the last ``get_topics`` call
            (duplicates included).
        dictionary: gensim ``Dictionary`` built from ``tokenized_data``.
        corpus: Bag-of-words representation of ``tokenized_data``.
        lda_model, lsi_model: The trained gensim models.
        stopwords: Set of words removed by ``clean_text``.
    """

    def __init__(self, text_data, num_topics=4):
        self.text_data = text_data
        self.num_topics = num_topics
        self.tokenized_data = []
        self.all_topics = []
        self.dictionary = None
        self.corpus = None
        self.lda_model = None
        self.lsi_model = None
        self.stopwords = set(stopwords.words('english')) | stop_words

    def add_stop_words(self, stopword_list):
        """Add extra stopwords.

        Accepts any iterable of words (list, tuple, set, ...); a single
        string is treated as one word rather than as a sequence of letters.
        """
        if isinstance(stopword_list, str):
            stopword_list = [stopword_list]
        self.stopwords = set(self.stopwords) | set(stopword_list)

    def text_summarize(self):
        """Return a JSON-encoded list with one gensim summary per document."""
        summaries = []
        for text in self.text_data:
            text_data = text.strip().replace("\n", " ")
            summaries.append(summarize(text_data).strip().replace("\n", " "))
        return json.dumps(summaries)

    def clean_text(self, text):
        """Tokenize ``text`` in lower case and drop stopwords and short/non-word tokens.

        A token is kept when it starts with at least three letter/hyphen
        characters (``re.match`` only anchors at the start of the token).
        """
        tokenized_text = word_tokenize(text.lower())
        # Raw string: '\-' in a normal string literal is an invalid escape sequence.
        cleaned_text = [t for t in tokenized_text
                        if t not in self.stopwords and re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
        return cleaned_text

    def tokenize_text(self):
        """Rebuild and return ``tokenized_data`` (one token list per document).

        The list is rebuilt on every call, so calling this twice no longer
        duplicates every document.
        """
        self.tokenized_data = [self.clean_text(text) for text in self.text_data]
        return self.tokenized_data

    def build_dictionary(self):
        """Build the gensim Dictionary that maps each word to a numeric id."""
        self.dictionary = corpora.Dictionary(self.tokenized_data)
        return self.dictionary

    def build_corpus(self):
        """Convert every tokenized document to its bag-of-words form."""
        self.corpus = [self.dictionary.doc2bow(text) for text in self.tokenized_data]
        return self.corpus

    def build_models(self, random_state=None):
        """Train the LDA and LSI models on ``corpus``.

        Args:
            random_state: Optional seed forwarded to ``LdaModel`` so that the
                LDA topics are reproducible between runs. ``None`` keeps the
                previous unseeded behaviour.
        """
        self.lda_model = models.LdaModel(corpus=self.corpus,
                                         num_topics=self.num_topics,
                                         id2word=self.dictionary,
                                         random_state=random_state)
        self.lsi_model = models.LsiModel(corpus=self.corpus,
                                         num_topics=self.num_topics,
                                         id2word=self.dictionary)

    def get_topics(self, num_topics=5):
        """Return the distinct topic words of both models.

        Args:
            num_topics: Number of words requested per topic from
                ``print_topic`` (despite the name, this is not the number of
                topics, which comes from each model's ``num_topics``).

        Returns:
            Unique words in first-seen order (LDA topics first, then LSI).
            Results are recomputed on every call instead of being appended to
            the words found by earlier calls.
        """
        # print_topic() renders terms as weight*"word"; pull out the quoted words.
        pattern = r'"([A-Za-z0-9_\./\\-]*)"'
        collected = []
        for model in (self.lda_model, self.lsi_model):
            for idx in range(model.num_topics):
                collected += re.findall(pattern, model.print_topic(idx, num_topics))
        self.all_topics = collected
        # dict.fromkeys de-duplicates while keeping a deterministic order.
        return list(dict.fromkeys(collected))
    

In [238]:
# article_data = load_files_from_disk('articles/')
#t = SummaryTopics(article_data)
# tokens = t.tokenize_text()
# gensim_dict = t.build_dictionary()
# gensim_corpus = t.build_corpus()
# t.build_models()
# t.get_topics()
# set(list(t.stopwords) + ["bob", "jerry"])
# t.add_stop_words(['bab', 'bob', 'jerry', 'george', 'elaine'])
# t.stopwords

In [171]:
df = pd.read_csv("../data/dataframe_1572372176.7133894.csv")

In [177]:
df.head()

Unnamed: 0,title,excerpt,url,file_name,keyword,category
0,Elephants Under Attack Have An Unlikely Ally: ...,"A few years ago, Paul Allen, the co-founder of...",https://www.npr.org/2019/10/25/760487476/eleph...,2ffa358f-ee81-4df5-9444-33ef8beb8773.txt,Artificial Intelligence,information technology
1,Artificial Intelligence Research Needs Respons...,After nearly a year of suspense and controvers...,https://www.lawfareblog.com/artificial-intelli...,0efc8114-f8d2-4435-9352-798c49f5bba5.txt,Artificial Intelligence,information technology
2,Why We Shouldn’t Want Banks to Go All In on Ar...,Banks love to brag about how many data scienti...,https://slate.com/technology/2019/10/banks-art...,ee56040f-9325-4449-b3bc-62e7c824306d.txt,Artificial Intelligence,information technology
3,A face-scanning algorithm increasingly decides...,An artificial intelligence hiring system has b...,https://www.washingtonpost.com/technology/2019...,72f472de-e9f5-45b9-bc7b-49e2c8565e79.txt,Artificial Intelligence,information technology
4,Artificial intelligence as a tool for deliveri...,"Many of us are now familiar, as consumers, wit...",https://www.zdnet.com/article/artificial-intel...,be1dd15e-6c43-422e-97c9-bbb7f69c2cbb.txt,Artificial Intelligence,information technology


In [173]:
df['category'].unique()

array(['information technology', 'relationships', 'economics',
       'nature & ecology', 'family', 'beauty & fashion',
       'art & tradition', 'religion', 'sports', 'politics', 'astrology',
       'history', 'diy', 'science & technology', 'automobiles',
       'business', 'legal', 'cooking', 'medical', 'education',
       'meteorology', 'travel', 'health', 'celebrities', 'media'],
      dtype=object)

In [212]:
df_ai_articles = df[df['keyword']=='Artificial Intelligence'].head()

In [255]:
# /classify

# Read the raw text of the selected articles.
article_data = []
for article in df_ai_articles['file_name']:
    file = "../data/files/" + article
    with open(file, "r") as f:
        article_data.append(f.read())

api_endpoint = "/classify"
URI = "https://arcane-badlands-69055.herokuapp.com" + api_endpoint
header = {'content-type': 'application/json'}
# NOTE(review): verify=False turns off TLS certificate validation for this
# request; keep it only if the endpoint's certificate really cannot be verified.
# The timeout stops the cell from hanging forever if the service never answers.
r = requests.post(URI, headers=header, data=json.dumps(article_data[0]), verify=False, timeout=60) # <-- NOTE: Single doc
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
r.json()

{'prediction': 'information technology'}

In [260]:
# /topics

# Read the raw text of the selected articles.
article_data = []
for article in df_ai_articles['file_name']:
    file = "../data/files/" + article
    with open(file, "r") as f:
        article_data.append(f.read())

api_endpoint = "/topics"
URI = "https://arcane-badlands-69055.herokuapp.com" + api_endpoint
header = {'content-type': 'application/json'}
# NOTE(review): verify=False turns off TLS certificate validation for this
# request; keep it only if the endpoint's certificate really cannot be verified.
# The timeout stops the cell from hanging forever if the service never answers.
r = requests.post(URI, headers=header, data=json.dumps(article_data), verify=False, timeout=60) # <-- Note: list of single or multiple docs
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
r.json()

{'topics': ['park',
  'npr',
  'bank',
  'researchers',
  'learning',
  'machine',
  'research',
  'artificial',
  'harm',
  'said',
  'intelligence',
  'hirevue',
  'potential',
  'banks',
  'elephant',
  'like',
  'people',
  'company',
  'job',
  'openai',
  'chikondi']}

In [312]:
# /summarize

# Read the raw text of the selected articles.
article_data = []
for article in df_ai_articles['file_name']:
    file = "../data/files/" + article
    with open(file, "r") as f:
        article_data.append(f.read())

api_endpoint = "/summarize"
# URI = "https://arcane-badlands-69055.herokuapp.com" + api_endpoint
URI = "http://skynet-jr:5000" + api_endpoint
header = {'content-type': 'application/json'}
# verify=False has no effect on this plain-http URL; it only matters if the
# https URI above is re-enabled (NOTE(review): it disables TLS validation there).
# The timeout stops the cell from hanging forever if the service never answers.
r = requests.post(URI, headers=header, data=json.dumps(article_data[1]), verify=False, timeout=60) # <-- NOTE: Single doc
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
r.raise_for_status()
r.json()['summary']

'"After nearly a year of suspense and controversy, any day now the team of artificial intelligence (AI) researchers at OpenAI will release the full and final version of GPT-2, a language model that can \\u201cgenerate coherent paragraphs and perform rudimentary reading comprehension, machine translation, question answering, and summarization\\u2014all without task-specific training.\\u201d When OpenAI first unveiled the program in February, it was capable of impressive feats: Given a two-sentence prompt about unicorns living in the Andes Mountains, for example, the program produced a coherent nine-paragraph news article.\nAt the time, the technical achievement was newsworthy\\u2014but it was how OpenAI chose to release the new technology that really caused a firestorm.\\n\\nThere is a prevailing norm of openness in the machine learning research community, consciously created by early giants in the field: Advances are expected to be shared, so that they can be evaluated and so that the 

In [None]:
article_data = load_files_from_disk('impeachment_data/')
NUM_TOPICS = 5
STOPWORDS = set(stopwords.words('english')) | stop_words

In [None]:
def clean_text(text):
    """Tokenize ``text`` in lower case and drop stopwords and short/non-word tokens.

    A token is kept when it is not in ``STOPWORDS`` and starts with at least
    three letter/hyphen characters (``re.match`` anchors only at the start).
    """
    tokenized_text = word_tokenize(text.lower())
    # Raw string: '\-' in a normal string literal is an invalid escape sequence
    # that newer Python versions warn about; the pattern itself is unchanged.
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match(r'[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [None]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in article_data:
    tokenized_data.append(clean_text(text))

In [None]:
# Build a Gensim Dictionary - associates each word with a numeric id
dictionary = corpora.Dictionary(tokenized_data)
# for k,v in dictionary.items():
#     print((k, v))

In [None]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [None]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

#Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [None]:
lda_topics = lda_model.print_topics()
lsi_topics = lsi_model.print_topics()

In [None]:
t = lda_topics[0][1]
pattern = r'"([A-Za-z0-9_\./\\-]*)"'
m = re.findall(pattern, t)
m

In [None]:
for idx in range(lda_model.num_topics):
    print(lda_model.print_topic(idx,5))

In [None]:
# Show the top 5 terms of every LSI topic (this loop previously printed the
# LDA topics again because it called lda_model.print_topic).
for idx in range(lsi_model.num_topics):
    print(lsi_model.print_topic(idx,5))

In [None]:
print("=" * 115)
print("LDA Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lda_model.print_topic(idx,5))
print("=" * 115)
print("LSI Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lsi_model.print_topic(idx,5))
print("=" * 115)

In [None]:
t = """
The report puts Trump personal lawyer Rudy Giuliani at the center of a scheme to force out the U.S. ambassador to Ukraine and pressure that country’s government to investigate Joe Biden’s family and a conspiracy theory that Ukraine interfered in the 2016 U.S. election.
The House obtained AT&T call records showing Giuliani in contact with phone numbers associated with the White House, the Office of Management and Budget, top Intelligence Committee Republican Devin Nunes, and Giuliani associate Lev Parnas. The report doesn’t say who in the White House or OMB participated in the calls.
The calls and texts were made during the time period when Giuliani was publicly discussing his efforts to pursue investigations into the Bidens and a conspiracy theory about Ukrainian interference in the 2016 election.
House Intelligence Chairman Adam Schiff said the call records show that “there was considerable coordination among the parties including the White House” in a smear campaign against then-U.S. Ambassador Marie Yovanovitch.
The committee also found Giuliani in contact on Aug. 8 with phone numbers associated with the White House amid negotiations with Ukrainian officials about announcing investigations. The records also showed European Union Ambassador Gordon Sondland in contact with White House and OMB phone numbers on Aug. 9.
One of the Sondland calls came minutes before a text message he sent saying that he thought Trump strongly wanted the “deliverable.” Sondland later said that referred to an announcement by Ukraine of investigations sought by Trump and Giuliani.
"""
bow = dictionary.doc2bow(clean_text(t))
print(lsi_model[bow])
print(lda_model[bow])

In [None]:
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

In [None]:
sims = lda_index[lda_model[bow]]

In [None]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [None]:
print(sims[:10])

In [None]:
document_id, similarity = sims[2]
print(article_data[document_id][:1000])

# Using Scikit-Learn for Topic Modeling

scikit-learn offers an NMF model in addition to LDA and LSI models

In [None]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
NUM_TOPICS = 5

# Count-vectorize the articles: keep terms that appear in at least 3 documents
# and in at most 90% of them. The token pattern is a raw string because '\-'
# in a normal string literal is an invalid escape sequence; the regex is unchanged.
vectorizer = CountVectorizer(min_df=3, max_df=0.9, stop_words='english', lowercase=True,
                            token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(article_data)

In [None]:
# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html
# Fixed random_state so the fitted topics are the same on every
# Restart-and-Run-All; online LDA is otherwise randomly initialised.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                      random_state=0)
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape) # tuple (num_docs, num_topics)

In [None]:
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)

In [None]:
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)

In [None]:
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

In [None]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted decomposition model exposing ``components_``
            (one row of term weights per topic).
        vectorizer: Fitted vectorizer that produced the model's input.
        top_n: Number of (term, weight) pairs printed per topic.
    """
    # Fetch the vocabulary once instead of rebuilding it for every printed
    # term, and prefer get_feature_names_out, which replaces
    # get_feature_names in newer scikit-learn releases.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; walk it backwards to get the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
 
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

In [None]:
text = "The President is underfire for his corrupt behavior. Trump Zelensky impeachment democrats volker"
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)

In [None]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
 
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(article_data))
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)


In [None]:
# Project the terms (rows of the transposed document-term matrix) onto 2 components.
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
# Build the frame with exactly the columns that are plotted. It was previously
# declared with a 'document' column that was never filled while 'word' was
# added afterwards, leaving a stray all-NaN column in the plot's data source.
df = pd.DataFrame({
    'x': words_2d[:, 0],
    'y': words_2d[:, 1],
    'word': vectorizer.get_feature_names(),
})
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

## More about Latent Dirichlet Allocation


LDA is the most popular method for doing topic modeling in real-world applications because it provides accurate results, can be trained online (the model is updated incrementally instead of being retrained every time new data arrives) and can be run on multiple cores. Let’s repeat the process from the previous sections with sklearn and LatentDirichletAllocation.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

In [None]:
NUM_TOPICS = 5

# The token pattern is a raw string because '\-' in a normal string literal is
# an invalid escape sequence; the regex itself is unchanged.
vectorizer = CountVectorizer(min_df=3, max_df=0.9, stop_words='english', lowercase=True,
                             token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(article_data)

# Build a Latent Dirichlet Allocation Model.
# Fixed random_state so the fitted topics are reproducible between runs.
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                      random_state=0)
lda_Z = lda_model.fit_transform(data_vectorized)

text = "The President is underfire for his corrupt behavior. Trump Zelensky impeachment democrats volker"
x = lda_model.transform(vectorizer.transform([text]))[0]
# the largest topic contributes more to the document than the smaller topics
print(x, x.sum())

In [None]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel