# Implementation of Keyword Extraction using NLTK Library


The input text is given below

In [1]:
from __future__ import division
from nltk import sent_tokenize

finaltext = open("F:/python progs/RAKE-tutorial-master/RAKE-tutorial-master/extract.txt", "r",encoding="utf-8")
Text = finaltext.read()


The raw input text is cleaned off non-printable characters (if any) and turned into lower case.
The processed input text is then tokenized using NLTK library functions. 

In [2]:
import nltk
from nltk import word_tokenize
import string
from collections import defaultdict


text = word_tokenize(Text)

print("Tokenized Text: \n")
print(text)

Tokenized Text: 

['Lorem', 'Ipsum', 'is', 'simply', 'dummy', 'text', 'of', 'the', 'printing', 'and', 'typesetting', 'industry', '.', 'Lorem', 'Ipsum', 'has', 'been', 'the', 'industry', "'s", 'standard', 'dummy', 'text', 'ever', 'since', 'the', '1500s', ',', 'when', 'an', 'unknown', 'printer', 'took', 'a', 'galley', 'of', 'type', 'and', 'scrambled', 'it', 'to', 'make', 'a', 'type', 'specimen', 'book', '.', 'It', 'has', 'survived', 'not', 'only', 'five', 'centuries', ',', 'but', 'also', 'the', 'leap', 'into', 'electronic', 'typesetting', ',', 'remaining', 'essentially', 'unchanged', '.', 'It', 'was', 'popularised', 'in', 'the', '1960s', 'with', 'the', 'release', 'of', 'Letraset', 'sheets', 'containing', 'Lorem', 'Ipsum', 'passages', ',', 'and', 'more', 'recently', 'with', 'desktop', 'publishing', 'software', 'like', 'Aldus', 'PageMaker', 'including', 'versions', 'of', 'Lorem', 'Ipsum', '.', 'It', 'is', 'a', 'long', 'established', 'fact', 'that', 'a', 'reader', 'will', 'be', 'distracted'

NLTK is again used for <b>POS tagging</b> the input text.


Description of POS tags: 
http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

In [3]:
#nltk.download('averaged_perceptron_tagger')
  
POS_tag = nltk.pos_tag(text)
print("Tokenized Text with POS tags: \n")
print(POS_tag)

Tokenized Text with POS tags: 

[('Lorem', 'NNP'), ('Ipsum', 'NNP'), ('is', 'VBZ'), ('simply', 'RB'), ('dummy', 'JJ'), ('text', 'NN'), ('of', 'IN'), ('the', 'DT'), ('printing', 'NN'), ('and', 'CC'), ('typesetting', 'NN'), ('industry', 'NN'), ('.', '.'), ('Lorem', 'NNP'), ('Ipsum', 'NNP'), ('has', 'VBZ'), ('been', 'VBN'), ('the', 'DT'), ('industry', 'NN'), ("'s", 'POS'), ('standard', 'JJ'), ('dummy', 'NN'), ('text', 'NN'), ('ever', 'RB'), ('since', 'IN'), ('the', 'DT'), ('1500s', 'CD'), (',', ','), ('when', 'WRB'), ('an', 'DT'), ('unknown', 'JJ'), ('printer', 'NN'), ('took', 'VBD'), ('a', 'DT'), ('galley', 'NN'), ('of', 'IN'), ('type', 'NN'), ('and', 'CC'), ('scrambled', 'VBD'), ('it', 'PRP'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('type', 'NN'), ('specimen', 'NNS'), ('book', 'NN'), ('.', '.'), ('It', 'PRP'), ('has', 'VBZ'), ('survived', 'VBN'), ('not', 'RB'), ('only', 'RB'), ('five', 'CD'), ('centuries', 'NNS'), (',', ','), ('but', 'CC'), ('also', 'RB'), ('the', 'DT'), ('leap', 'N

The tokenized text (mainly the nouns and adjectives) is normalized by <b>lemmatization</b>.
In lemmatization different grammatical counterparts of a word will be replaced by single
basic lemma. For example, 'glasses' may be replaced by 'glass'. 

Details about lemmatization: 
    
https://nlp.stanford.edu/IR-book/html/htmledition/stemming-and-lemmatization-1.html

In [4]:
#nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()

adjective_tags = ['JJ','JJR','JJS']

lemmatized_text = []

for word in POS_tag:
    if word[1] in adjective_tags:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0],pos="a")))
    else:
        lemmatized_text.append(str(wordnet_lemmatizer.lemmatize(word[0]))) #default POS = noun
        
print("Text tokens after lemmatization of adjectives and nouns: \n")
print(lemmatized_text)

Text tokens after lemmatization of adjectives and nouns: 

['Lorem', 'Ipsum', 'is', 'simply', 'dummy', 'text', 'of', 'the', 'printing', 'and', 'typesetting', 'industry', '.', 'Lorem', 'Ipsum', 'ha', 'been', 'the', 'industry', "'s", 'standard', 'dummy', 'text', 'ever', 'since', 'the', '1500s', ',', 'when', 'an', 'unknown', 'printer', 'took', 'a', 'galley', 'of', 'type', 'and', 'scrambled', 'it', 'to', 'make', 'a', 'type', 'specimen', 'book', '.', 'It', 'ha', 'survived', 'not', 'only', 'five', 'century', ',', 'but', 'also', 'the', 'leap', 'into', 'electronic', 'typesetting', ',', 'remaining', 'essentially', 'unchanged', '.', 'It', 'wa', 'popularised', 'in', 'the', '1960s', 'with', 'the', 'release', 'of', 'Letraset', 'sheet', 'containing', 'Lorem', 'Ipsum', 'passage', ',', 'and', 'more', 'recently', 'with', 'desktop', 'publishing', 'software', 'like', 'Aldus', 'PageMaker', 'including', 'version', 'of', 'Lorem', 'Ipsum', '.', 'It', 'is', 'a', 'long', 'established', 'fact', 'that', 'a', 're

The <b>lemmatized text</b> is <b>POS tagged</b> here.

In [5]:
POS_tag = nltk.pos_tag(lemmatized_text)

print("Lemmatized text with POS tags: \n")
print(POS_tag)

Lemmatized text with POS tags: 

[('Lorem', 'NNP'), ('Ipsum', 'NNP'), ('is', 'VBZ'), ('simply', 'RB'), ('dummy', 'JJ'), ('text', 'NN'), ('of', 'IN'), ('the', 'DT'), ('printing', 'NN'), ('and', 'CC'), ('typesetting', 'NN'), ('industry', 'NN'), ('.', '.'), ('Lorem', 'NNP'), ('Ipsum', 'NNP'), ('ha', 'VBD'), ('been', 'VBN'), ('the', 'DT'), ('industry', 'NN'), ("'s", 'POS'), ('standard', 'JJ'), ('dummy', 'NN'), ('text', 'NN'), ('ever', 'RB'), ('since', 'IN'), ('the', 'DT'), ('1500s', 'CD'), (',', ','), ('when', 'WRB'), ('an', 'DT'), ('unknown', 'JJ'), ('printer', 'NN'), ('took', 'VBD'), ('a', 'DT'), ('galley', 'NN'), ('of', 'IN'), ('type', 'NN'), ('and', 'CC'), ('scrambled', 'VBD'), ('it', 'PRP'), ('to', 'TO'), ('make', 'VB'), ('a', 'DT'), ('type', 'NN'), ('specimen', 'NNS'), ('book', 'NN'), ('.', '.'), ('It', 'PRP'), ('ha', 'VBZ'), ('survived', 'VBD'), ('not', 'RB'), ('only', 'RB'), ('five', 'CD'), ('century', 'NN'), (',', ','), ('but', 'CC'), ('also', 'RB'), ('the', 'DT'), ('leap', 'NN'),

Any word from the lemmatized text, which isn't a noun, adjective, or gerund (or a 'foreign word'), is here
considered as a <b>stopword</b> (non-content). This is based on the assumption that usually keywords are noun,
adjectives or gerunds. 

Punctuations are added to the stopword list too.

In [6]:
stopwords = []

wanted_POS = ['NN','NNS','NNP','NNPS','JJ','JJR','JJS','VBG','FW'] 

for word in POS_tag:
    if word[1] not in wanted_POS:
        stopwords.append(word[0])

punctuations = list(str(string.punctuation))

stopwords = stopwords + punctuations

Even if we remove the aforementioned stopwords, still some extremely common nouns, adjectives or gerunds may
remain which are very bad candidates for being keywords (or part of it). 

An external file constituting a long list of stopwords is loaded and all the words are added with the previous
stopwords to create the final list 'stopwords-plus' which is then converted into a set. 

(Source of stopwords data: https://www.ranks.nl/stopwords)

Stopwords-plus constitute the sum total of all stopwords and potential phrase-delimiters. The contents of this
set will be used to partition the lemmatized text into phrases. 

Phrases should constitute a group of consecutively occuring words that has no member from stopwords_plus in
between. Example: "Neural Network".
    
Each phrase is a <b>keyword candidate</b>. 
    
There are some exceptions, that is, there are some possible cases where a good keyword candidate may contain 
stopword in between. Example: "Word of Mouth". 
    
But, for simplicity's sake I will pretend here that such exceptions do not exist.

In [7]:
stopword_file = open("F:/python progs/RAKE-tutorial-master/RAKE-tutorial-master/stopwords.txt", "r")
#Source = https://www.ranks.nl/stopwords

lots_of_stopwords = []

for line in stopword_file.readlines():
    lots_of_stopwords.append(str(line.strip()))

stopwords_plus = []
stopwords_plus = stopwords + lots_of_stopwords
stopwords_plus = set(stopwords_plus)

#Stopwords_plus contain total set of all stopwords and phrase delimiters that
#will be used for partitioning the text into phrases (candidate keywords).

Phrases are generated by partitioning the lemmatized text using the members of stopwords_plus 
as delimiters.

In [8]:
phrases = []

phrase = " "
for word in lemmatized_text:
    
    if word in stopwords_plus:
        if phrase!= " ":
            phrases.append(str(phrase).split())
        phrase = " "
    elif word not in stopwords_plus:
        phrase+=str(word)
        phrase+=" "

print("Partitioned Phrases: \n")
print(phrases)

Partitioned Phrases: 

[['Lorem', 'Ipsum'], ['dummy', 'text'], ['printing'], ['typesetting', 'industry'], ['Lorem', 'Ipsum'], ['industry'], ['standard', 'dummy', 'text'], ['unknown', 'printer'], ['galley'], ['type'], ['type', 'specimen', 'book'], ['century'], ['leap'], ['electronic', 'typesetting'], ['remaining'], ['unchanged'], ['1960s'], ['release'], ['Letraset', 'sheet'], ['Lorem', 'Ipsum', 'passage'], ['desktop', 'publishing', 'software'], ['Aldus', 'PageMaker', 'including', 'version'], ['Lorem', 'Ipsum'], ['long'], ['fact'], ['reader'], ['readable', 'content'], ['point'], ['Lorem', 'Ipsum'], ['more-or-less', 'normal', 'distribution'], ['letter'], ["'Content"], ['content'], ['making'], ['readable', 'English'], ['Many', 'desktop', 'publishing', 'package'], ['web'], ['editor'], ['Lorem', 'Ipsum'], ['default', 'model', 'text'], ['search'], ["'lorem", 'ipsum'], ['web', 'site'], ['infancy'], ['Various', 'version'], ['year'], ['accident'], ['purpose'], ['humour'], ['variation'], ['passag

Frequency of each words in the list of phrases, are calculated here. 

The degree of each words are calculating by adding the length of all the
phrases where the word occurs.

Each word scores are caclulated by dividing degree of the word by its frequency.


In [9]:


frequency = defaultdict(int)
degree = defaultdict(int)
word_score = defaultdict(float)

vocabulary = []

for phrase in phrases:
    for word in phrase:
        frequency[word]+=1
        degree[word]+=len(phrase)
        if word not in vocabulary:
            vocabulary.append(word)
            
for word in vocabulary:
    word_score[word] = degree[word]/frequency[word]

print("Dictionary of degree scores for each words under candidate keywords (phrases): \n")
print(degree)
print("\nDictionary of frequencies for each words under candidate keywords (phrases): \n")
print(frequency)
print("\nDictionary of word scores for each words under candidate keywords (phrases): \n")
print(word_score)

Dictionary of degree scores for each words under candidate keywords (phrases): 

defaultdict(<class 'int'>, {'Lorem': 25, 'Ipsum': 25, 'dummy': 5, 'text': 9, 'printing': 1, 'typesetting': 4, 'industry': 3, 'standard': 3, 'unknown': 2, 'printer': 2, 'galley': 1, 'type': 4, 'specimen': 3, 'book': 3, 'century': 1, 'leap': 1, 'electronic': 2, 'remaining': 1, 'unchanged': 1, '1960s': 1, 'release': 1, 'Letraset': 2, 'sheet': 2, 'passage': 5, 'desktop': 7, 'publishing': 7, 'software': 3, 'Aldus': 4, 'PageMaker': 4, 'including': 4, 'version': 6, 'long': 1, 'fact': 1, 'reader': 1, 'readable': 4, 'content': 3, 'point': 1, 'more-or-less': 3, 'normal': 3, 'distribution': 3, 'letter': 1, "'Content": 1, 'making': 2, 'English': 2, 'Many': 4, 'package': 4, 'web': 3, 'editor': 1, 'default': 3, 'model': 6, 'search': 1, "'lorem": 2, 'ipsum': 2, 'site': 2, 'infancy': 1, 'Various': 2, 'year': 1, 'accident': 1, 'purpose': 1, 'humour': 3, 'variation': 1, 'majority': 1, 'alteration': 1, 'form': 1, 'word': 5, 

The phrase scores are calculated by adding individual scores of each of the words
which form the members of the phrase. 

In [10]:
import numpy as np

phrase_scores = []
keywords = []
phrase_vocabulary=[]

for phrase in phrases:
    if phrase not in phrase_vocabulary:
        phrase_score=0
        for word in phrase:
            phrase_score+= word_score[word]
        phrase_scores.append(phrase_score)
        phrase_vocabulary.append(phrase)

phrase_vocabulary = []
j=0
for phrase in phrases:
    
    if phrase not in phrase_vocabulary:
        keyword=''
        for word in phrase:
            keyword += str(word)+" "
        phrase_vocabulary.append(phrase)
        keyword = keyword.strip()
        keywords.append(keyword)
    
        print ("Score of candidate keyword '"+keywords[j]+"': "+str(phrase_scores[j]))
        
        j+=1

Score of candidate keyword 'Lorem Ipsum': 4.545454545454546
Score of candidate keyword 'dummy text': 4.75
Score of candidate keyword 'printing': 1.0
Score of candidate keyword 'typesetting industry': 3.5
Score of candidate keyword 'industry': 1.5
Score of candidate keyword 'standard dummy text': 7.75
Score of candidate keyword 'unknown printer': 4.0
Score of candidate keyword 'galley': 1.0
Score of candidate keyword 'type': 2.0
Score of candidate keyword 'type specimen book': 8.0
Score of candidate keyword 'century': 1.0
Score of candidate keyword 'leap': 1.0
Score of candidate keyword 'electronic typesetting': 4.0
Score of candidate keyword 'remaining': 1.0
Score of candidate keyword 'unchanged': 1.0
Score of candidate keyword '1960s': 1.0
Score of candidate keyword 'release': 1.0
Score of candidate keyword 'Letraset sheet': 4.0
Score of candidate keyword 'Lorem Ipsum passage': 6.212121212121213
Score of candidate keyword 'desktop publishing software': 10.0
Score of candidate keyword 

The index of the phrase score ndarray is then sorted in descending order in terms of
the score values.
The index corresponds to the location of the concerned phrase in phrases list.
So by getting the sorted order of the index, we also get the sorted order of the phrases.
Each phrase can be considered as a <b>candidate keyword</b>. 
We can then simply choose the top n highest scoring candidate keywords and present them as
the final exctracted keywords for the system. 

In [11]:
sorted_index = np.flip(np.argsort(phrase_scores),0)

keywords_num = 10

print("Keywords:\n")

for i in range(0,keywords_num):
    print(str(keywords[sorted_index[i]])+", ",)



Keywords:

Aldus PageMaker including version, 
Many desktop publishing package, 
desktop publishing software, 
model sentence structure, 
more-or-less normal distribution, 
default model text, 
type specimen book, 
standard dummy text, 
generated Lorem Ipsum, 
Lorem Ipsum generator, 
