In [317]:
import re
from string import punctuation
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

class SimpleSummarize:
    def __init__(self, filename=None, k=None):
        self.txt = None
        self.word_tokens = None
        self.sent_tokens = None
        self.word_freq = None
        self.freq_dist = {}
        self.sent_scores = {}
        self.top_sents = None
        self.max_len = 40
        self.summary = ''
        self.scores = []
        self.english_stopwords = set(stopwords.words('english')) | stop_words
        if filename and k:
            self.load_file_from_disk(filename)
            self.tokenize()
            self.word_freq_dist()
            self.score_sentences()
            self.summarize(k)
    
    def load_file_from_disk(self, filename):
        with open(filename, "r") as file:
            self.txt = file.read().replace("\n", " ")
            self.txt = self.txt.replace("\'","")
    
    def tokenize(self):
        self.word_tokens = self.tokenizer(self.txt)
        #self.sent_tokens = self.simple_sent_tokenizer(self.txt)
        self.sent_tokens = sent_tokenize(self.txt)

    def simple_sent_tokenizer(self, s):
        sents = []
        for sent in s.split('.'):
            sents.append(sent.strip())
        return sents
        
    def tokenizer(self,txt):
        txt = txt.lower()
        word_tokens = word_tokenize(txt.lower())
        word_tokens = [w for w in word_tokens if w not in self.english_stopwords and re.match('[a-zA-Z-][a-zA-Z-]{2,}', w)]
        return word_tokens
    
    def word_freq_dist(self):
        self.word_freq = nltk.FreqDist(self.word_tokens)
        most_freq_count = max(self.word_freq.values())
        for k,v in self.word_freq.items():
            self.freq_dist[k] = v/most_freq_count
    
    def score_sentences(self):
        for sent in self.sent_tokens:
            words = self.tokenizer(sent)
            for word in words:
                if word.lower() in self.freq_dist.keys():
                    if len(words) < self.max_len:
                        # if key does not exist add it and the freq_dist for the first word
                        if sent not in self.sent_scores.keys():
                            self.sent_scores[sent] = self.freq_dist[word.lower()]
                        else: 
                            # the key exists and we just add the freq_dist of the following words. 
                            # We are just summing up the freq_dists for the sentence
                            self.sent_scores[sent] += self.freq_dist[word.lower()]
    
    def summarize(self, k):
        self.top_sents = Counter(self.sent_scores)
        for t in self.top_sents.most_common(k):
            self.summary += t[0].strip()+'. '
            self.scores.append((t[1],t[0]))
    

In [318]:
# foo = SimpleSummarize()
# foo.load_file_from_disk("CNNImpeachmentArticle.txt")
# foo.tokenize()
# foo.word_freq_dist()
# foo.score_sentences()
# foo.summarize(3)
# foo.summary
foo = SimpleSummarize(filename="CNNImpeachmentArticle.txt", k=3)
foo.summary

'The Judiciary Committee hearing is the latest sign that House Democrats are moving forward with impeachment proceedings against the President following the two-month investigation led by the House Intelligence Committee into allegations that Trump pushed Ukraine to investigate his political rivals while a White House meeting and $400 million in security aid were withheld from Kiev.. The House Judiciary Committee has invited President Donald Trump or his counsel to participate in the panels first impeachment hearing next week as the House moves another step closer to impeaching the President.. READ: Judiciary Chairman&#39;s invite to Trump and his lawyers to take part in upcoming impeachment hearings The hearing announcement comes as the Intelligence Committee plans to release its report summarizing the findings of its investigation to the House Judiciary Committee soon after Congress returns from its Thanksgiving recess next week.. '

# Topic Modeling

https://nlpforhackers.io/topic-modeling/

# Topic Modeling Algorithms

There are several algorithms for doing topic modeling. The most popular ones include

LDA – Latent Dirichlet Allocation – The one we’ll be focusing in this tutorial. Its foundations are Probabilistic Graphical Models

LSA or LSI – Latent Semantic Analysis or Latent Semantic Indexing – Uses Singular Value Decomposition (SVD) on the Document-Term Matrix. Based on Linear Algebra

NMF – Non-Negative Matrix Factorization – Based on Linear Algebra

# Using Gensim for Topic Modeling

In [252]:
import re
from glob import glob
from gensim import models, corpora, similarities
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

In [253]:
NUM_TOPICS = 10
STOPWORDS = set(stopwords.words('english')) | stop_words

In [254]:
def load_files_from_disk(data_dir):
    text_data_list = []
    file_list = glob(pathname=data_dir + '/*')
    for file in file_list: 
        with open(file, "r") as f:
            text_data_list.append(f.read())
    return text_data_list

In [255]:
def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text

In [256]:
article_data = load_files_from_disk('articles/')
len(article_data)

111

In [257]:
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in article_data:
    tokenized_data.append(clean_text(text))

In [258]:
# Build a Gensim Dictionary - assocation word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
# for k,v in dictionary.items():
#     print((k, v))

In [259]:
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [260]:
print(len(corpus))
print(corpus[0][:5])
print(corpus[20][:10])

111
[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1)]
[(0, 1), (18, 1), (39, 1), (49, 1), (52, 2), (70, 2), (71, 1), (74, 1), (81, 1), (84, 1)]


In [261]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

#Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [262]:
print("=" * 115)
print("LDA Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lda_model.print_topic(idx,10))
print("=" * 115)
print("LSI Model:")
for idx in range(NUM_TOPICS):
    print("Topic #%s:"%idx, lsi_model.print_topic(idx,10))
print("=" * 115)

LDA Model:
Topic #0: 0.006*"like" + 0.004*"said" + 0.003*"time" + 0.003*"new" + 0.003*"people" + 0.003*"years" + 0.002*"way" + 0.002*"going" + 0.002*"know" + 0.002*"make"
Topic #1: 0.005*"new" + 0.005*"like" + 0.004*"people" + 0.003*"time" + 0.003*"game" + 0.002*"said" + 0.002*"make" + 0.002*"years" + 0.002*"data" + 0.002*"football"
Topic #2: 0.004*"like" + 0.004*"new" + 0.004*"people" + 0.003*"time" + 0.003*"way" + 0.003*"said" + 0.003*"work" + 0.003*"make" + 0.002*"years" + 0.002*"game"
Topic #3: 0.005*"like" + 0.004*"time" + 0.004*"people" + 0.004*"new" + 0.003*"said" + 0.003*"way" + 0.003*"work" + 0.003*"game" + 0.002*"know" + 0.002*"team"
Topic #4: 0.005*"people" + 0.005*"like" + 0.004*"new" + 0.003*"data" + 0.003*"way" + 0.003*"time" + 0.002*"use" + 0.002*"years" + 0.002*"know" + 0.002*"church"
Topic #5: 0.006*"like" + 0.005*"new" + 0.003*"people" + 0.003*"make" + 0.003*"time" + 0.002*"said" + 0.002*"team" + 0.002*"rice" + 0.002*"years" + 0.002*"know"
Topic #6: 0.006*"like" + 0.0

In [263]:
t = "I like football. It's cool to watch and I like the players and I like people. Football offense defense field goal"
bow = dictionary.doc2bow(clean_text(t))
print(lsi_model[bow])
print(lda_model[bow])

[(0, -1.461097125023915), (1, 0.5769466544499854), (2, -0.679132031595229), (3, 0.08706842861130795), (4, 0.22426330811234568), (5, 0.27258391940017535), (6, 0.7620872250218782), (7, 0.9151602360814599), (8, 0.10802040547313593), (9, -0.6742140436068658)]
[(9, 0.9356761)]


In [264]:
lda_index = similarities.MatrixSimilarity(lda_model[corpus])

In [265]:
sims = lda_index[lda_model[bow]]

In [266]:
sims = sorted(enumerate(sims), key=lambda item: -item[1])

In [267]:
print(sims[:10])

[(3, 0.999387), (13, 0.999387), (18, 0.999387), (22, 0.999387), (23, 0.999387), (24, 0.999387), (27, 0.999387), (35, 0.999387), (63, 0.999387), (72, 0.999387)]


In [269]:
document_id, similarity = sims[0]
print(article_data[document_id][:2000])

We Can't Even: Stories dedicated to fashion in 2009, when everything was "bananas." What we talk about when we talk about 2009. Photo-Illustration: by Stevie Remsberg; Photo by Dennis Valle

Your own time: You love to see it! Good luck trying. We’re suspended in it, swimming through Jell-O, with all the clarity that suggests. Sometimes you can only see where you are by looking at where you were. Ten years ago, when I was a young fashion writer, things looked different, but not dark-to-light different. “President Trump” existed — as a Simpsons character. I was sure we’d never give up BBM. My in-box, that searchable tomb, from September 2009: a birthday reminder (from Friendster), notice of a Netflix delivery (on DVD), a newsletter from Refinery 29, their list of the best models who blog. Recognizable, but off. It was today in embryo, today 1.0: a huge terrain to cover in a mere decade, but not an impossible one. Blink, and you’re here. Thus does the mold of history gel and set.

This we

# Using Scikit-Learn for Topic Modeling

scikit-learn offers an NMF model in addition to LDA and LSI models