In [11]:
#Required Libraries
from collections import OrderedDict
import numpy as np
import spacy
import wikipedia

In [12]:
#Additional stopwords required for each company
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')
nlp.Defaults.stop_words |= {"%"}

In [13]:
#TextRanking
class KeywordTextRank():
    """Extraction of keywords from text"""
    
    def __init__(self):
        self.d = 0.85 # damping coefficient, usually is .85
        self.min_diff = 1e-5 # convergence threshold
        self.steps = 10 # iteration steps
        self.node_weight = None # save keywords and its weight

    
    def set_stopwords(self, stopwords):  
        """Set stop words from spacy api"""
        for word in STOP_WORDS:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True
    
    def sentence_segment(self, doc, candidate_pos, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc.sents:
            selected_words = []
            for token in sent:
                # Store words only with cadidate POS tag
                if token.pos_ in candidate_pos and token.is_stop is False:
                    if lower is True:
                        selected_words.append(token.text.lower())
                    else:
                        selected_words.append(token.text)
            sentences.append(selected_words)
        return sentences
        
    def get_vocab(self, sentences):
        """Get all tokens"""
        vocab = OrderedDict()
        i = 0
        for sentence in sentences:
            for word in sentence:
                if word not in vocab:
                    vocab[word] = i
                    i += 1
        return vocab
    
    def get_token_pairs(self, window_size, sentences):
        """Build token_pairs from windows in sentences"""
        token_pairs = list()
        for sentence in sentences:
            for i, word in enumerate(sentence):
                for j in range(i+1, i+window_size):
                    if j >= len(sentence):
                        break
                    pair = (word, sentence[j])
                    if pair not in token_pairs:
                        token_pairs.append(pair)
        return token_pairs
        
    def symmetrize(self, a):
        return a + a.T - np.diag(a.diagonal())
    
    def get_matrix(self, vocab, token_pairs):
        """Get normalized matrix"""
        # Build matrix
        vocab_size = len(vocab)
        g = np.zeros((vocab_size, vocab_size), dtype='float')
        for word1, word2 in token_pairs:
            i, j = vocab[word1], vocab[word2]
            g[i][j] = 1
            
        # Get Symmeric matrix
        g = self.symmetrize(g)
        
        # Normalize matrix by column
        norm = np.sum(g, axis=0)
        g_norm = np.divide(g, norm, where=norm!=0) # this is ignore the 0 element in norm
        return g_norm
    
    def get_keywords(self, number=10):
        """Top n keywords"""
        node_weight = OrderedDict(sorted(self.node_weight.items(), key=lambda t: t[1], reverse=True))
        for i, (key, value) in enumerate(node_weight.items()):
            print(key + ' - ' + str(value))
            if i > number:
                break

    def analyze(self, text, 
                candidate_pos=['NOUN', 'PROPN'], 
                window_size=4, lower=False, stopwords=list()):
        """Main function to analyze text"""
        
        # Set stopwords
        self.set_stopwords(stopwords)
        
        # Pare text by spaCy
        doc = nlp(text)
        
        # Filter sentences
        sentences = self.sentence_segment(doc, candidate_pos, lower) # list of list of words
        
        # Build vocabulary
        vocab = self.get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = self.get_token_pairs(window_size, sentences)
        
        # Normalise matrix
        g = self.get_matrix(vocab, token_pairs)
        
        # Initialisation for weight
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr))  < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight

In [14]:
#Required to check selected page is correct
wikipedia.search('Pearsons')

['Morleys Stores',
 'Toronto Pearson International Airport',
 'Pearsons of Nottingham',
 'Pearson',
 'Pearson correlation coefficient',
 'Francis T. P. Plimpton',
 'Pearson plc',
 'Daniel Kimball Pearsons',
 'Pearsons Hall of Science',
 'List of This Is Us characters']

In [15]:
#Collect content from wikipedia page
text = wikipedia.page("GlaxoSmithKline").content

In [16]:
#Keywords
keywords = KeywordTextRank()
keywords.analyze(text, candidate_pos = ['NOUN','PROPN', 'VERB'], window_size=10,lower=False)
keywords.get_keywords(30)

GSK - 28.559574659434766
company - 17.00944724677334
drug - 9.885692049359823
Glaxo - 7.399347494134659
drugs - 6.612417735627308
year - 5.876079622479478
companies - 5.773254021489584
Wellcome - 5.575218543477434
products - 5.061602588260646
including - 4.454992352850071
research - 4.4418681150773125
FDA - 4.345118831113159
UK - 4.19653390395771
treatment - 4.118303619116857
Beecham - 4.106642790805072
vaccine - 4.081479292565653
rosiglitazone - 4.0333294123637105
Laboratories - 4.029558205565415
United - 4.00836229792651
Research - 3.9995156485109598
announced - 3.9508523854011552
sales - 3.8801251634191414
States - 3.8297851821861775
Health - 3.70104906774781
children - 3.5002123001793164
doctors - 3.4808396115177747
Paxil - 3.424954724291175
business - 3.2760919254057477
percent - 3.2704613515587986
opened - 3.2587673206193157
settlement - 3.24265039694049
risk - 3.205378626265681
