In [1]:
# gensim
from gensim.summarization import summarize, keywords
import requests

# sumy
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
from sumy.parsers.plaintext import PlaintextParser
from sumy.parsers.html import HtmlParser
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

# rake
import RAKE

# textacy & spacy
import textacy

## Sample Text

In [2]:
# Sample document used by every summarizer below: a short synopsis of
# "The Matrix". Adjacent string literals are concatenated by the parser,
# so each fragment must keep its own trailing space.
text = (
    "Thomas A. Anderson is a man living two lives. "
    "By day he is an average computer programmer and by night a hacker "
    "known as Neo. "
    "Neo has always questioned his reality, but the truth is far beyond "
    "his imagination. "
    "Neo finds himself targeted by the police when he is contacted by "
    "Morpheus, a legendary computer hacker branded a terrorist by the "
    "government. "
    "Morpheus awakens Neo to the real world, a ravaged wasteland where "
    "most of humanity have been captured by a race of machines that live "
    "off of the humans' body heat and electrochemical energy and who "
    "imprison their minds within an artificial reality known as the "
    "Matrix. "
    "As a rebel against the machines, Neo must return to the Matrix and "
    "confront the agents: super-powerful computer programs devoted to "
    "snuffing out Neo and the entire human rebellion. "
)
# Alternate sample text, disabled by wrapping it in a bare string literal.
# Being the cell's last expression, the string is echoed as the cell output.
'''
text = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility " \
       "of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. " \
       "Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating"\
       " sets of solutions for all types of systems are given. These criteria and the corresponding algorithms " \
       "for constructing a minimal supporting set of solutions can be used in solving all the considered types of " \
       "systems and systems of mixed types."
'''
#text = requests.get('http://rare-technologies.com/the_matrix_synopsis.txt').text

'\ntext = "Compatibility of systems of linear constraints over the set of natural numbers. Criteria of compatibility "        "of a system of linear Diophantine equations, strict inequations, and nonstrict inequations are considered. "        "Upper bounds for components of a minimal set of solutions and algorithms of construction of minimal generating"       " sets of solutions for all types of systems are given. These criteria and the corresponding algorithms "        "for constructing a minimal supporting set of solutions can be used in solving all the considered types of "        "systems and systems of mixed types."\n'

#### TextRank Performance with Gensim Package

In [4]:
# TextRank via gensim: an extractive sentence summary followed by keywords.
gensim_summary = summarize(text, ratio=0.5)
print('Summary by sentences: ' + '\n' + gensim_summary + '\n')

print('Summary by keywords: ')
# split=True yields a list of keywords; pos_filter restricts the candidate
# words to the listed part-of-speech tags.
gensim_keywords = keywords(
    text,
    ratio=0.3,
    pos_filter=['NN', 'VB', 'JJ'],
    lemmatize=True,
    split=True,
)
print(gensim_keywords)

Summary by sentences: 
By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.
Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix.
As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion.

Summary by keywords: 
['humanity', 'neo', 'super', 'reality', 'body', 'hacker known', 'real', 'world']


#### TextRank / LexRank Performance with Sumy Package (output sentences only)

In [5]:
LANG = 'english'
parser = PlaintextParser.from_string(text, Tokenizer(LANG))

# One stemmer instance is shared by both sumy summarizers; each summarizer
# gets its own stop-word set for the same language.
stemmer = Stemmer(LANG)

# TextRank (sumy implementation)
summarizer_text = TextRankSummarizer(stemmer)
summarizer_text.stop_words = get_stop_words(LANG)

# LexRank (sumy implementation)
summarizer_lex = LexRankSummarizer(stemmer)
summarizer_lex.stop_words = get_stop_words(LANG)

# Ask each summarizer for 3 sentences and print them joined by single spaces.
sentences = [str(picked) for picked in summarizer_text(parser.document, 3)]
text_summary = ' '.join(sentences)
print('Summary by sentences - Textrank: ' + '\n' + text_summary)

# `sentences` and `text_summary` are rebound here, so after the cell runs
# they hold the LexRank result.
sentences = [str(picked) for picked in summarizer_lex(parser.document, 3)]
text_summary = ' '.join(sentences)
print('Summary by sentences - Lexrank: ' + '\n' + text_summary)

Summary by sentences - Textrank: 
Neo finds himself targeted by the police when he is contacted by Morpheus, a legendary computer hacker branded a terrorist by the government. Morpheus awakens Neo to the real world, a ravaged wasteland where most of humanity have been captured by a race of machines that live off of the humans' body heat and electrochemical energy and who imprison their minds within an artificial reality known as the Matrix. As a rebel against the machines, Neo must return to the Matrix and confront the agents: super-powerful computer programs devoted to snuffing out Neo and the entire human rebellion.
Summary by sentences - Lexrank: 
Thomas A. Anderson is a man living two lives. By day he is an average computer programmer and by night a hacker known as Neo. Neo has always questioned his reality, but the truth is far beyond his imagination.


We could also run other keyword-extraction algorithms on this summary text rather than on the full text.

#### TextRank Performance with textacy (spaCy) Package

In [49]:
# TextRank once more, this time through textacy on top of spaCy.
doc = textacy.Doc(text, lang='en')

# Request ten key terms; the result is displayed as the cell output
# (a list of (term, score) pairs).
top_keyterms = textacy.keyterms.textrank(doc, n_keyterms=10)
top_keyterms

[('neo', 0.08317900318102962),
 ('computer', 0.04982129794755459),
 ('human', 0.03740628903358867),
 ('machine', 0.03417847716927454),
 ('reality', 0.03328732039735026),
 ('hacker', 0.032888080953631854),
 ('morpheus', 0.03267534012863888),
 ('matrix', 0.03259798518432622),
 ('a.', 0.02795078439275685),
 ('anderson', 0.026078648121652885)]

#### Performance with RAKE Package

In [56]:
# RAKE keyword extraction using the stop-word list returned by
# RAKE.SmartStopList().
rake = RAKE.Rake(RAKE.SmartStopList())
rake_phrases = rake.run(text)

# Display at most the first 20 (phrase, score) pairs.
rake_phrases[:20]
# Idea: also try running RAKE on top of the LexRank summary instead of `text`.

[('super-powerful computer programs devoted', 15.666666666666666),
 ('legendary computer hacker branded', 14.166666666666666),
 ('average computer programmer', 9.666666666666666),
 ('entire human rebellion', 9.0),
 ('morpheus awakens neo', 6.5),
 ('body heat', 4.0),
 ('ravaged wasteland', 4.0),
 ('electrochemical energy', 4.0),
 ('real world', 4.0),
 ('man living', 4.0),
 ('neo finds', 3.5),
 ('artificial reality', 3.5),
 ('hacker', 2.5),
 ('morpheus', 2.0),
 ('reality', 1.5),
 ('neo', 1.5),
 ('imprison', 1.0),
 ('lives', 1.0),
 ('rebel', 1.0),
 ('race', 1.0)]