In [1]:
# Import spacy and textacy
import spacy
import textacy
from textacy.ke import textrank, sgrank

In [2]:
# Load the 'en_core_web_sm' spaCy language pipeline through textacy.
# `en` is passed as the `lang` argument when the document is built later on.
en = textacy.load_spacy_lang('en_core_web_sm')

In [3]:
# Load in the text data.
# Read the whole article as UTF-8 inside a context manager so the file handle
# is closed deterministically, even if reading raises.
with open('data/myarticle.txt', encoding='utf8') as article_file:
    mytext = article_file.read()

In [4]:
# Convert the raw article text into a spaCy document, using the pipeline
# loaded into `en` above. `doc` is the input to every extraction cell below.
doc = textacy.make_spacy_doc(mytext, lang=en)

In [5]:
# Show the ten highest-scoring TextRank key terms as (term, score) pairs.
# Uses the `textrank` name imported at the top, as the later cells do.
textrank(doc, topn=10)

[('social medium company', 0.026812462366143444),
 ('outspoken black nationalist minister', 0.02455875080356081),
 ('large technology company', 0.022781479407385473),
 ('Silicon Valley company', 0.022750617448471778),
 ('Facebook spokeswoman', 0.020269002967446194),
 ('profile extremist', 0.014395433805769839),
 ('controversial user', 0.014223327119411107),
 ('social network', 0.014075256676270732),
 ('dangerous individual', 0.013191447061165594),
 ('Alex Jones', 0.010999816186346378)]

In [6]:
# Print only the top five TextRank key phrases (normalize='lemma'),
# discarding the score that comes with each phrase.
print(
    'Textrank Output:',
    [phrase for phrase, _ in textrank(doc, normalize='lemma', topn=5)],
)

Textrank Output: ['social medium company', 'outspoken black nationalist minister', 'large technology company', 'Silicon Valley company', 'Facebook spokeswoman']


In [7]:
# Print only the top five SGRank key words/phrases,
# discarding the score that comes with each phrase.
print(
    'SGRank Output:',
    [phrase for phrase, _ in sgrank(doc, topn=5)],
)

SGRank Output: ['outspoken black nationalist minister', 'large technology company', 'Silicon Valley company', 'SAN FRANCISCO', 'Facebook']


We can see that there are overlapping key phrases. Let's pick one of the grouped terms per item to get a list of non-overlapping key phrases, using textacy's **aggregate_term_variants**.

In [8]:
# Address the overlapping key phrases problem: collect the unique SGRank
# terms (scores dropped), then let textacy aggregate the term variants.
terms = {phrase for phrase, _ in sgrank(doc)}
print(textacy.ke.utils.aggregate_term_variants(terms))

[{'outspoken black nationalist minister'}, {'large technology company'}, {'Silicon Valley company'}, {'conspiracy theorist'}, {'controversial user'}, {'Louis Farrakhan'}, {'SAN FRANCISCO'}, {'extreme voice'}, {'Alex Jones'}, {'Facebook'}]


All the noun chunks are potential key phrases. Let's extract all the noun chunks. Note, however, that this approach does not rank them.

In [9]:
# Print every noun chunk in the document as an unranked candidate key phrase.
print(list(textacy.extract.noun_chunks(doc)))

[SAN FRANCISCO, years, extreme voices, platform, Facebook, Thursday, most controversial users, whom, conservatives, debate, power, accountability, large technology companies, social network, it, Alex Jones, conspiracy theorist, founder, Infowars, platform, handful, other extremists, Louis Farrakhan, outspoken black nationalist minister, who, anti-Semitic remarks, Silicon Valley company, users, Facebook, Instagram, policies, dangerous individuals, organizations, We, individuals, organizations, violence, hate, ideology, Facebook spokeswoman, statement, process, potential violators, it, what, us, decision, accounts, move, tech industry, broadest actions, high-profile extremists, time, social media companies, fire, hateful content, misinformation, services]
