In [9]:
import re
from string import punctuation
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

class SimpleSummarize:
    """Extractive summarizer that ranks sentences by normalized word frequency.

    Pipeline (each step stores its result on the instance):

    1. ``load_file_from_disk`` reads the text into ``self.txt``.
    2. ``tokenize`` fills ``self.word_tokens`` and ``self.sent_tokens``.
    3. ``word_freq_dist`` fills ``self.word_freq`` and ``self.freq_dist``
       (each count divided by the highest count).
    4. ``score_sentences`` fills ``self.sent_scores`` with the sum of the
       ``freq_dist`` weights of each sentence's tokens.
    5. ``summarize(k)`` builds ``self.summary`` and ``self.scores`` from the
       ``k`` highest-scoring sentences.

    Passing both ``filename`` and ``k`` to the constructor runs the whole
    pipeline immediately.
    """

    # Tokens must start with at least three letters/hyphens to be kept.
    _TOKEN_RE = re.compile(r'[a-zA-Z-][a-zA-Z-]{2,}')
    # A sentence ending in one of these does not get an extra '.' appended.
    _TERMINATORS = ('.', '!', '?')
    # Closing quotes/brackets that may follow the terminating punctuation.
    _CLOSERS = '"\')]\u201d\u2019'

    def __init__(self, filename=None, k=None):
        """Initialise empty state; run the full pipeline if both args are truthy.

        Args:
            filename: path of the text file to summarize (optional).
            k: number of sentences to put in the summary (optional).
        """
        self.txt = None
        self.word_tokens = None
        self.sent_tokens = None
        self.word_freq = None
        self.freq_dist = {}
        self.sent_scores = {}
        self.top_sents = None
        # Sentences with this many (filtered) tokens or more are never scored.
        self.max_len = 40
        self.summary = ''
        self.scores = []
        self.english_stopwords = set(stopwords.words('english')) | stop_words
        if filename and k:
            self.load_file_from_disk(filename)
            self.tokenize()
            self.word_freq_dist()
            self.score_sentences()
            self.summarize(k)

    def load_file_from_disk(self, filename, encoding="utf-8"):
        """Read ``filename`` into ``self.txt`` with newlines turned into spaces
        and apostrophes removed.

        Args:
            filename: path of the file to read.
            encoding: text encoding used to decode the file.
        """
        with open(filename, "r", encoding=encoding) as file:
            raw = file.read()
        self.txt = raw.replace("\n", " ").replace("\'", "")

    def tokenize(self):
        """Split ``self.txt`` into filtered word tokens and sentence tokens."""
        self.word_tokens = self.tokenizer(self.txt)
        # NOTE: the text is lower-cased before sentence splitting, so the
        # sentences (and therefore the summary) are all lower-case.
        self.sent_tokens = sent_tokenize(self.txt.lower())

    def tokenizer(self, txt):
        """Return the lower-cased word tokens of ``txt`` that are not stop
        words and that match the token pattern (3+ leading letters/hyphens)."""
        return [
            w for w in word_tokenize(txt.lower())
            if w not in self.english_stopwords and self._TOKEN_RE.match(w)
        ]

    def word_freq_dist(self):
        """Count ``self.word_tokens`` and store each count relative to the
        most frequent token in ``self.freq_dist``.

        ``self.freq_dist`` is rebuilt from scratch on every call; it is left
        empty when there are no tokens (instead of ``max()`` raising).
        """
        self.word_freq = nltk.FreqDist(self.word_tokens)
        self.freq_dist = {}
        if not self.word_freq:
            return
        most_freq_count = max(self.word_freq.values())
        for k, v in self.word_freq.items():
            self.freq_dist[k] = v / most_freq_count

    def score_sentences(self):
        """Score every sentence as the sum of the ``freq_dist`` weights of its
        tokens and store the result in ``self.sent_scores``.

        Sentences with ``max_len`` or more tokens, and sentences none of whose
        tokens appear in ``freq_dist``, get no entry. Scores are rebuilt from
        scratch on every call so repeated calls do not double them; identical
        sentences occurring more than once still add up under one key.
        """
        self.sent_scores = {}
        for sent in self.sent_tokens or []:
            words = self.tokenizer(sent)
            # Length check does not depend on the word, so do it once.
            if len(words) >= self.max_len:
                continue
            for word in words:
                # tokenizer() already lower-cases, matching the freq_dist keys.
                weight = self.freq_dist.get(word)
                if weight is None:
                    continue
                self.sent_scores[sent] = self.sent_scores.get(sent, 0) + weight

    def summarize(self, k):
        """Build ``self.summary`` from the ``k`` highest-scoring sentences.

        Each sentence is stripped and followed by a single space; a period is
        appended only when the sentence does not already end in ``.``, ``!``
        or ``?`` (ignoring trailing closing quotes/brackets). ``self.scores``
        receives ``(score, sentence)`` tuples in the same order. Both are
        reset on every call so repeated calls do not accumulate.

        Args:
            k: number of sentences to include.
        """
        self.top_sents = Counter(self.sent_scores)
        self.summary = ''
        self.scores = []
        for sent, score in self.top_sents.most_common(k):
            sentence = sent.strip()
            if not sentence.rstrip(self._CLOSERS).endswith(self._TERMINATORS):
                sentence += '.'
            self.summary += sentence + ' '
            self.scores.append((score, sent))
    

In [10]:
# Step-by-step equivalent of the one-line constructor call below
# (these are the same calls __init__ makes when given filename and k):
# foo = SimpleSummarize()
# foo.load_file_from_disk("CNNImpeachmentArticle.txt")
# foo.tokenize()
# foo.word_freq_dist()
# foo.score_sentences()
# foo.summarize(3)
# foo.summary
foo = SimpleSummarize(filename="CNNImpeachmentArticle.txt", k=3)
foo.summary

'the judiciary committee hearing is the latest sign that house democrats are moving forward with impeachment proceedings against the president following the two-month investigation led by the house intelligence committee into allegations that trump pushed ukraine to investigate his political rivals while a white house meeting and $400 million in security aid were withheld from kiev.. the house judiciary committee has invited president donald trump or his counsel to participate in the panels first impeachment hearing next week as the house moves another step closer to impeaching the president.. read: judiciary chairman&#39;s invite to trump and his lawyers to take part in upcoming impeachment hearings the hearing announcement comes as the intelligence committee plans to release its report summarizing the findings of its investigation to the house judiciary committee soon after congress returns from its thanksgiving recess next week.. '

# Using Gensim for Topic Modeling

In [19]:
import os
import re
from glob import glob

from gensim import models, corpora
from nltk.corpus import stopwords
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as stop_words

In [4]:
# Number of topics each model below is asked to produce.
NUM_TOPICS = 3
# Stop words to drop: NLTK's English list merged with scikit-learn's.
STOPWORDS = set(stopwords.words('english')).union(stop_words)

In [41]:
def load_files_from_disk(data_dir, encoding="utf-8"):
    """Return the text content of every regular file directly inside ``data_dir``.

    Args:
        data_dir: directory to read from; a trailing slash is accepted.
        encoding: text encoding used to decode each file.

    Returns:
        A list of strings, one per file, ordered by sorted file path so the
        result (and anything indexed from it) is stable across runs.
    """
    # os.path.join avoids the doubled separator that data_dir + '/*' produces
    # for inputs such as 'articles/'; sorting removes glob's arbitrary order.
    file_list = sorted(glob(pathname=os.path.join(data_dir, '*')))
    text_data_list = []
    for path in file_list:
        # The pattern also matches sub-directories, which open() cannot read.
        if not os.path.isfile(path):
            continue
        with open(path, "r", encoding=encoding) as f:
            text_data_list.append(f.read())
    return text_data_list

In [7]:
def clean_text(text):
    """Tokenize ``text`` and keep only the meaningful word tokens.

    The text is lower-cased and split with ``word_tokenize``; a token is kept
    when it is not in ``STOPWORDS`` and starts with at least three characters
    that are letters or hyphens.

    Args:
        text: raw document text.

    Returns:
        List of kept tokens, in their original order.
    """
    # Raw string: '\-' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions. The pattern is unchanged.
    token_pattern = re.compile(r'[a-zA-Z\-][a-zA-Z\-]{2,}')
    return [
        token
        for token in word_tokenize(text.lower())
        if token not in STOPWORDS and token_pattern.match(token)
    ]

In [43]:
# Read the text of every file under articles/ and display how many were loaded.
article_data = load_files_from_disk('articles/')
len(article_data)

49

In [46]:
# gensim works on token lists: tokenize each article and drop stop words.
tokenized_data = [clean_text(article) for article in article_data]

In [55]:
# Build a Gensim Dictionary - association of each word to a numeric id
dictionary = corpora.Dictionary(tokenized_data)
# Uncomment to inspect the dictionary's items:
# for k,v in dictionary.items():
#     print((k, v))

In [56]:
# Transform the collection of texts to a numerical form:
# doc2bow is applied to each tokenized document, giving one list of
# integer pairs per document (a sample is printed in the next cell).
corpus = [dictionary.doc2bow(text) for text in tokenized_data]

In [68]:
# Sanity check: number of documents, then the first few pairs of two of them.
print(len(corpus))
print(corpus[0][:5])
print(corpus[20][:10])

49
[(0, 1), (1, 1), (2, 2), (3, 2), (4, 1)]
[(0, 1), (18, 1), (39, 1), (49, 1), (52, 2), (70, 2), (71, 1), (74, 1), (81, 1), (84, 1)]


In [70]:
# Build the LDA model.
# LDA training is randomized; fixing the seed makes the topics printed below
# reproducible after Restart Kernel -> Run All.
RANDOM_SEED = 42
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary, random_state=RANDOM_SEED)

# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [87]:
# Print the top 10 weighted terms of every topic, one model after the other,
# with a separator line before each model and after the last one.
separator = "=" * 115
for heading, model in (("LDA Model:", lda_model), ("LSI Model:", lsi_model)):
    print(separator)
    print(heading)
    for idx in range(NUM_TOPICS):
        print("Topic #%s:" % idx, model.print_topic(idx, 10))
print(separator)

LDA Model:
Topic #0: 0.006*"like" + 0.004*"people" + 0.004*"new" + 0.004*"football" + 0.004*"time" + 0.004*"game" + 0.003*"way" + 0.003*"make" + 0.002*"data" + 0.002*"years"
Topic #1: 0.005*"like" + 0.005*"people" + 0.004*"data" + 0.003*"new" + 0.003*"handstand" + 0.003*"facebook" + 0.003*"time" + 0.003*"make" + 0.002*"use" + 0.002*"big"
Topic #2: 0.006*"new" + 0.005*"like" + 0.003*"time" + 0.003*"make" + 0.003*"pixels" + 0.003*"people" + 0.003*"buffer" + 0.003*"facebook" + 0.002*"use" + 0.002*"image"
LSI Model:
Topic #0: -0.315*"football" + -0.266*"like" + -0.263*"game" + -0.253*"people" + -0.203*"facebook" + -0.181*"new" + -0.159*"time" + -0.140*"targeting" + -0.132*"use" + -0.114*"make"
Topic #1: -0.418*"football" + -0.344*"game" + 0.319*"pixels" + 0.257*"image" + 0.241*"facebook" + 0.169*"sharing" + 0.166*"images" + -0.148*"players" + 0.122*"people" + 0.113*"targeting"
Topic #2: 0.443*"pixels" + 0.346*"image" + -0.248*"people" + 0.241*"sharing" + 0.233*"football" + 0.232*"images" +