In [47]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from collections import defaultdict
from nltk.collocations import *
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from nltk.probability import FreqDist
from heapq import nlargest

In [24]:
def read_file(file_path, encoding=None):
    """Return the entire contents of a text file as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding handed to ``open``. The default ``None``
            keeps the previous behaviour (platform default encoding). Pass
            an explicit value such as ``'utf-8'`` when the file contains
            non-ASCII characters (for example curly quotes) so that decoding
            does not depend on the machine the notebook runs on.

    Returns:
        The file contents as ``str``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the bytes are not valid in the chosen encoding.
    """
    with open(file_path, 'r', encoding=encoding) as content_file:
        return content_file.read()

In [25]:
def preprocess(txt):
    """Replace typographic quotes and newlines in ``txt`` with single spaces.

    The characters ``’``, ``“``, ``”`` and the newline character are each
    mapped to one space; every other character is left untouched.
    """
    to_space = str.maketrans({"’": " ", "“": " ", "”": " ", "\n": " "})
    return txt.translate(to_space)

In [26]:
def get_stopwords():
    """Build the set of tokens that are ignored when scoring sentences.

    The set is the union of NLTK's English stop-word list, every character
    in ``string.punctuation``, and a few straight and curly quote characters.
    """
    ignored = set(stopwords.words('english'))
    # Updating a set from a string adds each character individually.
    ignored.update(punctuation)
    ignored.update(('\'', '’', '"', '“', '”'))
    return ignored

In [27]:
def create_sentences_map(txt):
    """Split ``txt`` into sentences and index them by position.

    Returns:
        A dict mapping each sentence's 0-based position, as produced by
        ``sent_tokenize``, to the sentence string.
    """
    return dict(enumerate(sent_tokenize(txt)))

In [28]:
def create_words_map(sentencesMap):
    """Tokenize every sentence of ``sentencesMap`` into words.

    Returns:
        A dict with the same keys as ``sentencesMap`` whose values are the
        token lists produced by ``word_tokenize`` for each sentence.
    """
    tokens_by_key = {}
    for key, sentence in sentencesMap.items():
        tokens_by_key[key] = word_tokenize(sentence)
    return tokens_by_key

In [29]:
def remove_stopwords(words):
    """Return the tokens of ``words`` that are not in the stop-word set.

    Order and duplicates of the remaining tokens are preserved.
    """
    excluded = get_stopwords()
    return list(filter(lambda token: token not in excluded, words))

In [30]:
def clean_words_map(wordsMap):
    """Strip stop words from every token list in ``wordsMap``.

    Args:
        wordsMap: Dict mapping a sentence key to its list of tokens.

    Returns:
        A new dict with the same keys whose values are the token lists with
        all stop words (as defined by ``get_stopwords``) removed. Token order
        and duplicates are preserved.
    """
    # Build the stop-word set once instead of once per sentence: it is
    # loop-invariant and each build re-reads the NLTK stop-word list.
    customStopWords = get_stopwords()

    return {
        key: [word for word in words if word not in customStopWords]
        for key, words in wordsMap.items()
    }

In [31]:
def create_raw_words(input):
    """Tokenize ``input`` into words and drop all stop words.

    Args:
        input: The text to tokenize. (The name shadows the ``input`` builtin
            inside this function; it is kept so keyword callers still work.)

    Returns:
        The list of tokens from ``word_tokenize(input)`` that are not in the
        set returned by ``get_stopwords``, in their original order.
    """
    customStopWords = get_stopwords()
    # Tokenize the argument. The previous body read the module-level ``txt``
    # and silently ignored whatever text the caller passed in.
    rawWords = [word for word in word_tokenize(input) if word not in customStopWords]
    return rawWords

In [32]:
def count_words(words):
    """Count how many times each item occurs in ``words``.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its number of
        occurrences.
    """
    tally = defaultdict(int)
    for token in words:
        tally[token] += 1
    return tally

In [33]:
def calculate_sentence_popularity_index(wordsCount, sentenceWords):
    """Score one sentence by summing the counts of its words.

    Each token in ``sentenceWords`` contributes ``wordsCount[token]`` when it
    is a key of ``wordsCount``; tokens that are not present contribute
    nothing. A token that appears several times is counted each time.
    """
    return sum(wordsCount[token] for token in sentenceWords if token in wordsCount)

In [34]:
def calculate_text_popularity_index(wordsCount, cleanWordsMap):
    """Compute the popularity score of every sentence.

    Returns:
        A dict with the keys of ``cleanWordsMap``; each value is the result
        of ``calculate_sentence_popularity_index`` for that key's tokens.
    """
    return {
        key: calculate_sentence_popularity_index(wordsCount, tokens)
        for key, tokens in cleanWordsMap.items()
    }

In [38]:
def create_abstract_from_text(inputText, sentencesCount):
    """Pick the highest-scoring sentences of ``inputText`` as an abstract.

    The text is preprocessed and split into sentences, each sentence is
    scored by the summed frequency of its non-stop-word tokens, and the
    sentences are ranked by descending score.

    Args:
        inputText: The text to summarize.
        sentencesCount: Maximum number of sentences to return.

    Returns:
        A list of at most ``sentencesCount`` sentence strings ordered from
        highest to lowest score (ties keep their order in the text). Fewer
        sentences are returned when the text has fewer than requested.
    """
    cleaned = preprocess(inputText)
    sentences_by_pos = create_sentences_map(cleaned)
    tokens_by_pos = clean_words_map(create_words_map(sentences_by_pos))
    frequencies = count_words(create_raw_words(cleaned))
    scores = calculate_text_popularity_index(frequencies, tokens_by_pos)

    # A stable sort with reverse=True keeps tied sentences in text order.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)

    # Clamp at zero so a negative count yields an empty list instead of a
    # "drop from the end" slice.
    top = ranked[:max(sentencesCount, 0)]
    return [sentences_by_pos[pos] for pos, _ in top]

In [70]:
def create_abstract_from_text_beta(inputText, sentencesCount):
    """Variant of the abstract builder based on ``FreqDist`` and ``nlargest``.

    Args:
        inputText: The text to summarize.
        sentencesCount: Number of sentences to return. It must be positive
            and no larger than the number of sentences found in the text;
            both conditions are enforced with ``assert``.

    Returns:
        A list of ``sentencesCount`` sentence strings, ordered from highest
        to lowest popularity score.
    """
    assert sentencesCount > 0

    cleaned = preprocess(inputText)
    sentences_by_pos = create_sentences_map(cleaned)

    assert sentencesCount <= len(sentences_by_pos)

    tokens_by_pos = clean_words_map(create_words_map(sentences_by_pos))
    frequencies = FreqDist(create_raw_words(cleaned))
    scores = calculate_text_popularity_index(frequencies, tokens_by_pos)

    # Keys of the best-scoring sentences, highest score first.
    best = nlargest(sentencesCount, scores, key=scores.get)
    return [sentences_by_pos[pos] for pos in best]

In [71]:
# Load the document to summarize into the module-level name `txt`.
# NOTE(review): hard-coded absolute path on drive d: — adjust for your environment.
txt = read_file('d:/sample_text.txt')

In [72]:
# Build a two-sentence abstract of the loaded text using the sorted-ranking implementation.
create_abstract_from_text(txt, 2)

['With a history of innovation spanning more than 130 years, ABB is today a leader in digital industries with four customer-focused, globally leading businesses: Electrification, Industrial Automation, Motion, and Robotics & Discrete Automation, supported by its common ABB Ability™ digital platform.',
 'They were shown how iconic, two-armed YuMi® robots together with single-armed YuMi® put together wristwatches individually with unparalleled precision, using the revolutionary versatile transport system from ABB s newly formed Robotics & Discrete Automation business.']

In [74]:
# Build a two-sentence abstract of the loaded text using the FreqDist/nlargest implementation.
create_abstract_from_text_beta(txt, 2)

['With a history of innovation spanning more than 130 years, ABB is today a leader in digital industries with four customer-focused, globally leading businesses: Electrification, Industrial Automation, Motion, and Robotics & Discrete Automation, supported by its common ABB Ability™ digital platform.',
 'They were shown how iconic, two-armed YuMi® robots together with single-armed YuMi® put together wristwatches individually with unparalleled precision, using the revolutionary versatile transport system from ABB s newly formed Robotics & Discrete Automation business.']