## KEYWORD EXTRACTION CODE

In [75]:
import gensim
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')

import spacy
from spacy.tokens import Doc, Span, Token
nlp = spacy.load('en_core_web_sm')
print(nlp.pipe_names)

['tagger', 'parser', 'ner']


## Class to be added as the component to the pipeline

In [76]:
class KeyphraseExtraction(object):
    # component name, will show up in the pipeline
    name = 'Keyphrases' 

    
    def __init__(self, nlp):
           Doc.set_extension('score_keyphrases_by_tfidf', getter = self.score_keyphrases_by_tfidf ,  force = True)
        

    def __call__(self, doc):
           return doc  

    
    # helper function for score_keyphrases_by_tfidf
    def extract_candidate_chunks(self , text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
        import itertools, nltk, string
    
        # exclude candidates that are stop words or entirely punctuation
        punct = set(string.punctuation)
        stop_words = set(nltk.corpus.stopwords.words('english'))
    
        # tokenize, POS-tag, and chunk using regular expressions
        chunker = nltk.chunk.regexp.RegexpParser(grammar)
        tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
        all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in tagged_sents))
    
        # join constituent chunk words into a single chunked phrase
        candidates = [' '.join(word for word, pos, chunk in group).lower() for key, group in itertools.groupby(all_chunks, lambda word__pos__chunk : word__pos__chunk[2] != 'O') if key]

        return [cand for cand in candidates if cand not in stop_words and not all(char in punct for char in cand)]    
        
   

    def score_keyphrases_by_tfidf(self , doc , candidates='chunks'):
        import  nltk
        import gensim
        # sentence tokenising
        #texts = nltk.sent_tokenize(texts)
        texts_as_string = str(doc)
        list_of_string = []
        for text in nltk.sent_tokenize(texts_as_string):
            list_of_string.append(str(text))
    
        texts = list_of_string
    
        # extract candidates from each text in texts, either chunks or words
        if candidates == 'chunks':
            boc_texts = [self.extract_candidate_chunks(text) for text in texts]
        
        elif candidates == 'words':
            boc_texts = [self.extract_candidate_words(text) for text in texts]
   
        # make gensim dictionary and corpus
        dictionary = gensim.corpora.Dictionary(boc_texts)
    
        corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    
         # transform corpus with tf*idf model
        tfidf = gensim.models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
       
    
    
        # instead of returning
        list_tfidf = []
        for docs in corpus_tfidf:
            for d in docs:
                list_tfidf.append(d)
    
        list1 = []
        list2 = []
        for i in dictionary:
            list1.append(dictionary[i])
            list2.append(list_tfidf[i][1])
            
    
        answer_dictionary = dict(zip(list1 , list2))
        return answer_dictionary
    
    
    
    

## Text used in the algorithm
Everyone has an opinion about finding your passion. It's either the best piece of career advice you've ever heard or the worst. Bill Gates is all for it. He discovered a passion for writing software as a kid and kept at it. Seemed to work out pretty well for him.Mark Cuban is vehemently against hanging your success on finding your passion. Just because you're passionate about something doesn't mean you're good at it. He advises you find where you're putting in the most effort, then double down on that to achieve success.Stanford researchers recently decided to get to the bottom of the matter. They performed a series of experiments and published their findings in Psychological Science.



In [77]:
def main():
    
    nlp = spacy.load('en_core_web_sm')
    component = KeyphraseExtraction(nlp)  # initialise component
    nlp.add_pipe(component, last = True)  # add last to the pipeline
    
    text = "Everyone has an opinion about finding your passion. It's either the best piece of career advice you've ever heard or the worst. Bill Gates is all for it. He discovered a passion for writing software as a kid and kept at it. Seemed to work out pretty well for him. Mark Cuban is vehemently against hanging your success on finding your passion. Just because you're passionate about something doesn't mean you're good at it. He advises you find where you're putting in the most effort, then double down on that to achieve success. Stanford researchers recently decided to get to the bottom of the matter. They performed a series of experiments and published their findings in Psychological Science."  
    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)
    print() 
    print()
    dictionary_of_keyphrases = doc._.score_keyphrases_by_tfidf
    for key , value in dictionary_of_keyphrases.items():
           print("%-40s %4.3f" % ( key , value ))

In [78]:
if __name__ == '__main__':
    main()

Pipeline ['tagger', 'parser', 'ner', 'Keyphrases']


everyone                                 0.663
opinion                                  0.663
passion                                  0.347
piece of career advice                   1.000
bill gates                               1.000
kept                                     0.289
kid                                      0.553
software                                 0.553
mark cuban                               0.553
success                                  0.394
something                                0.753
effort                                   0.527
bottom                                   1.000
matter                                   0.573
stanford researchers                     0.820
findings in psychological science        0.577
series of experiments                    0.577
