In [6]:
import os 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Copy spaCy's English STOP_WORDS collection into a list; Summarizer uses it
# for membership tests when counting word frequencies.
stop_words = list(STOP_WORDS)
# Extend the punctuation characters with a newline so a lone '\n' token is
# filtered like punctuation. This rebinds the local name imported from
# `string`; `string.punctuation` itself is not modified.
punctuation += '\n'
from heapq import nlargest

In [7]:
class Summarizer():
    """Frequency-based extractive summarizer built on a spaCy pipeline.

    Words that are neither stop words nor punctuation are counted, the counts
    are scaled by the largest count, and every sentence is scored by the sum
    of the scaled counts of its words. The highest-scoring sentences form the
    summary.

    Relies on the module-level names ``spacy``, ``stop_words``,
    ``punctuation`` and ``nlargest``.

    Attributes:
        mCorpus: Raw text being summarized.
        mWordFrequencies: Lower-cased word -> count (scaled to (0, 1] after
            WordFreqNormalizer has run).
        mSentScore: Sentence -> summed word score.
        mNumSentences: Number of sentences found in the document.
        mNlp: The spaCy pipeline used to parse the corpus.
        mDoc: The parsed document.
    """

    # Pipeline shared by all instances so the model is loaded once per kernel
    # session instead of once per Summarizer.
    _sharedNlp = None

    def __init__(self, path=None, corpus=None):
        """Load the text and parse it.

        Args:
            path: Path of a text file to read. Takes precedence over
                ``corpus`` when both are given.
            corpus: Text to summarize, used only when ``path`` is None.

        If neither argument is given the corpus is the empty string.
        """
        self.mCorpus = ''
        self.mWordFrequencies = {}
        self.mSentScore = {}
        self.mNumSentences = 0

        if path is not None:
            # The context manager closes the handle even if read() raises.
            with open(path, 'r') as file:
                self.mCorpus = file.read()
        elif corpus is not None:
            self.mCorpus = corpus

        if Summarizer._sharedNlp is None:
            Summarizer._sharedNlp = spacy.load('en_core_web_sm')
        self.mNlp = Summarizer._sharedNlp
        self.mDoc = self.mNlp(self.mCorpus)


    def PrintCorpus(self):
        """Print the raw corpus text."""
        print(self.mCorpus)


    def WordFrequencyCalculator(self):
        """Count every token that is not a stop word, punctuation or whitespace.

        The result replaces the contents of ``mWordFrequencies``, so the
        method can safely be called more than once.
        """
        # Start from a clean slate: without this, a second call would add raw
        # counts on top of values that were already normalized.
        self.mWordFrequencies.clear()
        # Set lookup instead of scanning the stop-word list for every token.
        stopWordSet = set(stop_words)

        for word in self.mDoc:
            wordInLowerCase = word.text.lower()
            # Whitespace-only tokens such as '\n\n' are not a substring of
            # `punctuation` (it holds a single '\n'), so they would otherwise
            # be counted as words and could dominate the frequency table.
            if not wordInLowerCase.strip():
                continue
            if wordInLowerCase in stopWordSet or wordInLowerCase in punctuation:
                continue
            self.mWordFrequencies[wordInLowerCase] = (
                self.mWordFrequencies.get(wordInLowerCase, 0) + 1
            )
        return


    def WordFreqNormalizer(self):
        """Divide every word count by the largest count, in place.

        Does nothing when no words were counted (empty corpus, or a corpus
        made only of stop words and punctuation).
        """
        if not self.mWordFrequencies:
            # max() of an empty sequence raises ValueError.
            return

        maxFreq = max(self.mWordFrequencies.values())
        for word in self.mWordFrequencies:
            self.mWordFrequencies[word] = self.mWordFrequencies[word] / maxFreq
        return


    def CalculateSentenceScore(self):
        """Score each sentence as the sum of the frequencies of its words.

        Updates ``mNumSentences`` and replaces the contents of ``mSentScore``.
        Sentences containing no counted word get no entry.
        """
        sentences = list(self.mDoc.sents)
        self.mNumSentences = len(sentences)
        # Reset so repeated calls do not keep adding to earlier scores.
        self.mSentScore.clear()

        for sent in sentences:
            for word in sent:
                wordInLowerCase = word.text.lower()
                if wordInLowerCase in self.mWordFrequencies:
                    self.mSentScore[sent] = (
                        self.mSentScore.get(sent, 0)
                        + self.mWordFrequencies[wordInLowerCase]
                    )
        return


    def SummarizeText(self, fractionToReduce=0.25):
        """Run the full pipeline and print the summary.

        Args:
            fractionToReduce: Fraction of the document's sentences to keep;
                the count is truncated towards zero.

        Prints the sentence counts followed by the selected sentences in
        descending score order, separated by spaces. Returns None.
        """
        self.WordFrequencyCalculator()
        self.WordFreqNormalizer()
        self.CalculateSentenceScore()

        reducedSentNum = int(self.mNumSentences * fractionToReduce)
        print('Total Number of Sentences: {}'.format(self.mNumSentences))
        print('Number of sentences in Summary: {}'.format(reducedSentNum))
        print('Summary: \n')

        summaryList = nlargest(reducedSentNum, self.mSentScore, key=self.mSentScore.get)
        for sentence in summaryList:
            print(sentence, end=' ')


In [8]:
# File stem of the transcript to summarize. It drives both the path and the
# printed heading, so the label always names the file actually read.
speechName = 'BemidjiSep18_2020'
# Raw string keeps the Windows backslashes literal instead of depending on
# Python leaving unrecognized escapes such as '\M' and '\T' untouched.
path = r'C:\MyProjects\Trump_Rallies\Speeches\{}.txt'.format(speechName)
print('Printing Summary for {}\n'.format(speechName))
summarizer = Summarizer(path=path)
summarizer.SummarizeText(fractionToReduce=0.1)
print('\n')

Printing Summary for BattleCreekDec19

Total Number of Sentences: 2059
Number of sentences in Summary: 205
Summary: 

From St. Paul to St. Cloud, from Rochester to Duluth, and from Minneapolis, thank God we still have Minneapolis, to right here, right here with all of you great people, this state was pioneered by men and women who braved the wilderness and the winters to build a better life for themselves and for their families. All the guys that got it right, that really got it right, they all got it wrong, totally wrong. But they did it, and we've done a great job and the people of that great state, they understand it, and I just hope the people of your state of Minnesota understand it because we don't have too many more chances with this stuff. A friend of mine said, "You know, two years ago you said, next will be Robert E. Lee and then comes Washington. Minnesota is going to keep on winning and you're going to get tired of winning because Minnesota doesn't want to win all the time.