In [4]:
# Sample passage (a short overview of NLP) that every keyphrase extractor
# in this notebook receives as input.
text = (
    "Natural language processing (NLP) is a subfield of linguistics, "
    "computer science, and artificial intelligence concerned with the "
    "interactions between computers and human language, in particular "
    "how to program computers to process and analyze large amounts of "
    "natural language data. "
    'The goal is a computer capable of "understanding" the contents of '
    "documents, including the contextual nuances of the language within "
    "them. "
    "The technology can then accurately extract information and insights "
    "contained in the documents as well as categorize and organize the "
    "documents themselves."
)

# textacy

In [9]:
!pip install textacy==0.9.1
!python -m spacy download en_core_web_sm

Collecting textacy==0.9.1
  Downloading textacy-0.9.1-py3-none-any.whl (203 kB)
[K     |████████████████████████████████| 203 kB 193 kB/s eta 0:00:01
[?25hCollecting pyemd>=0.5.0
  Downloading pyemd-0.5.1.tar.gz (91 kB)
[K     |████████████████████████████████| 91 kB 286 kB/s eta 0:00:01
Building wheels for collected packages: pyemd
  Building wheel for pyemd (setup.py) ... [?25ldone
[?25h  Created wheel for pyemd: filename=pyemd-0.5.1-cp39-cp39-macosx_10_9_x86_64.whl size=72367 sha256=e6a2681315ca568c3d435cacaab0c25e643155a3dc3ffd1f5425e1e097f644ab
  Stored in directory: /Users/jyotikasingh/Library/Caches/pip/wheels/64/bf/3e/0859be9a0108fc932a29b943792dcafb3b979555cf1bb5add6
Successfully built pyemd
Installing collected packages: pyemd, textacy
  Attempting uninstall: textacy
    Found existing installation: textacy 0.12.0
    Uninstalling textacy-0.12.0:
      Successfully uninstalled textacy-0.12.0
Successfully installed pyemd-0.5.1 textacy-0.9.1
Collecting en-core-web-sm==3.3.

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [13]:
from textacy import load_spacy_lang, make_spacy_doc
from textacy.ke import sgrank, textrank

# Load the en_core_web_sm spaCy pipeline with the parser component disabled,
# then turn the sample text into a spaCy doc for the extractors below.
en = load_spacy_lang(
    "en_core_web_sm",
    disable=("parser",)
)
doc = make_spacy_doc(text, lang=en)

# TextRank
# The result gets its own name: assigning it back to `textrank` would rebind
# the imported function's name to a list, so any later `textrank(...)` call
# in the notebook would fail with "'list' object is not callable".
# normalize="lemma" is why "data" is reported as "datum" in the output.
textrank_keyphrases = textrank(
    doc, normalize="lemma", topn=5
)

# SGRank
sg = sgrank(doc, topn=5)

# Each result item unpacks into (keyphrase, score); only the phrases are shown.
print(
    "\n\n TextRank keyphrases \n ",
    [kp for kp, _ in textrank_keyphrases]
)

print(
    "\n\n SGRank keyphrases \n ",
    [kp for kp, _ in sg]
)



 TextRank keyphrases 
  ['natural language processing', 'natural language datum', 'computer capable', 'computer science', 'human language']


 SGRank keyphrases 
  ['natural language datum', 'natural language processing', 'artificial intelligence', 'human language', 'computer science']


# KeyBERT

In [19]:
!pip install keybert==0.5.1
from keybert import KeyBERT

# any model from sbert.net/docs/pretrained_models.html
# can be specified below
# default model = all-MiniLM-L6-v2
kw_model = KeyBERT()
keywords = kw_model.extract_keywords(
    text, keyphrase_ngram_range=(1, 3),
    stop_words=None, highlight=True
)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
# Show the KeyBERT result: a list of (keyphrase, score) tuples.
print(keywords)

[('processing nlp', 0.7913), ('language processing nlp', 0.7629), ('processing nlp is', 0.7527), ('natural language processing', 0.7435), ('of natural language', 0.6745)]


# rake-nltk

In [22]:
!pip install rake-nltk

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rake-nltk
  Downloading rake_nltk-1.0.6-py3-none-any.whl (9.1 kB)
Installing collected packages: rake-nltk
Successfully installed rake-nltk-1.0.6


In [29]:
from rake_nltk import Rake

# A default Rake() uses NLTK's English stopwords and punctuation
r = Rake()
r.extract_keywords_from_text(text)

# keep only the first five ranked keyphrases
top_phrases = r.get_ranked_phrases()[:5]
print(top_phrases)

['artificial intelligence concerned', 'analyze large amounts', 'accurately extract information', 'natural language processing', 'natural language data']
