In [10]:
# Import spaCy and textacy
import spacy
import textacy.ke
from textacy.ke.utils import aggregate_term_variants

In [3]:
# Load the spaCy language model through textacy's loader; the model name is
# kept in a constant so it is easy to spot and swap
SPACY_MODEL_NAME = 'en_core_web_sm'
en = textacy.load_spacy_lang(SPACY_MODEL_NAME)

In [4]:
# Let us use a sample text file, nlphistory.txt, which is the text from the
# history section of Wikipedia's page on Natural Language Processing.
# The context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
# NOTE(review): the platform default encoding is used, as before; pass an
# explicit encoding (e.g. encoding='utf-8') if the file requires it.
with open('./Data/nlphistory.txt') as text_file:
    mytext = text_file.read()

In [5]:
# Convert the raw text into a spaCy document using the model loaded above (`en`).
# The keyphrase-extraction cells that follow all take this `doc` as their input.
doc = textacy.make_spacy_doc(mytext, lang=en)

In [6]:
# Extract the 10 highest-ranked key phrases with TextRank; each result is a
# (phrase, score) pair. The bare name on the last line displays the list.
top_textrank_phrases = textacy.ke.textrank(doc, topn=10)
top_textrank_phrases

[('successful natural language processing system', 0.024747020006318884),
 ('statistical machine translation system', 0.024619543495193394),
 ('natural language system', 0.020510235424315678),
 ('statistical natural language processing', 0.01857077705295509),
 ('natural language task', 0.01578699892777252),
 ('machine learning algorithm', 0.015477893072643328),
 ('style machine learning method', 0.014926608302122429),
 ('term neural machine translation', 0.014748947460180904),
 ('speech recognition system', 0.012595485590133421),
 ('cache language model', 0.012583412698375781)]

In [7]:
# Print the key phrases found by the TextRank algorithm, as implemented in textacy.
# Terms are normalized to their lemmas, and only the phrase text of the top 5
# (phrase, score) results is kept.
textrank_results = textacy.ke.textrank(doc, normalize='lemma', topn=5)
textrank_phrases = [phrase for phrase, _score in textrank_results]
print('Textrank Output:', textrank_phrases)

Textrank Output: ['successful natural language processing system', 'statistical machine translation system', 'natural language system', 'statistical natural language processing', 'natural language task']


In [8]:
# Print the key words and phrases found by the SGRank algorithm, as implemented
# in textacy. Only the phrase text of the top 5 (phrase, score) results is kept.
sgrank_results = textacy.ke.sgrank(doc, topn=5)
sgrank_phrases = [phrase for phrase, _score in sgrank_results]
print('SGRank Output:', sgrank_phrases)

SGRank Output: ['natural language processing system', 'statistical machine translation', 'research', 'late 1980', 'early']


We can see there are overlapping key phrases. Let's pick one of the grouped terms per item to get a list of non-overlapping key phrases using textacy's **aggregate_term_variants**.

In [11]:
# Addressing the overlapping key phrases problem:
# collect the distinct SGRank phrases (scores dropped), then let
# aggregate_term_variants group them, and print the resulting groups.
terms = {phrase for phrase, _score in textacy.ke.sgrank(doc)}
grouped_terms = aggregate_term_variants(terms)
print(grouped_terms)

[{'natural language processing system'}, {'statistical machine translation'}, {'statistical model'}, {'late 1980'}, {'research'}, {'example'}, {'ELIZA'}, {'early'}, {'world'}, {'real'}]


All the noun chunks in the document are potential key phrases. Let's take a look at them.

In [13]:
# Print every noun chunk that textacy extracts from the document; these are the
# candidate key phrases
noun_chunks = list(textacy.extract.noun_chunks(doc))
print(noun_chunks)

[history, natural language processing, 1950s, work, earlier periods, Alan Turing, article, what, criterion, intelligence, Georgetown experiment, fully automatic translation, more than sixty Russian sentences, English, authors, three or five years, machine translation, real progress, ALPAC report, ten-year-long research, expectations, machine translation, Little further research, machine translation, late 1980s, first statistical machine translation systems, notably successful natural language processing systems, SHRDLU, natural language system, restricted "blocks worlds, restricted vocabularies, ELIZA, simulation, Rogerian psychotherapist, Joseph Weizenbaum, almost no information, human thought, emotion, ELIZA, startlingly human-like interaction, "patient, very small knowledge base, ELIZA, generic response, example, head, you, head, 1970s, many programmers, "conceptual ontologies, real-world information, computer-understandable data, Examples, MARGIE, Schank, Cullingford, (Wilensky, Le