### Keyword Extraction

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#### Importing dataset


In [31]:
# Read the papers table and discard the metadata columns that the rest of
# the notebook never touches, leaving only the remaining columns.
df = pd.read_csv('papers.csv').drop(
    columns=['event_type', 'abstract', 'pdf_name', 'year', 'id']
)

# Preview of the trimmed frame (last expression of the cell is displayed).
df.head()

Unnamed: 0,title,paper_text
0,Self-Organization of Associative Database and ...,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,A Mean Field Theory of Layer IV of Visual Cort...,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,Storing Covariance by the Associative Long-Ter...,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,Bayesian Query Construction for Neural Network...,Bayesian Query Construction for Neural\nNetwor...
4,"Neural Network Ensembles, Cross Validation, an...","Neural Network Ensembles, Cross\nValidation, a..."


In [32]:
#print(df['paper_text'][0])

In [33]:
# Keep only the first ten papers so the later graph steps stay small.
title = list(df['title'])[:10]
text = list(df['paper_text'])[:10]

#### Preprocessing


In [34]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anshumanrajchauhan_1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [35]:
class Preprocess:
    """Clean a list of raw documents and convert them to word-count vectors.

    The pipeline in ``vectorise`` tokenises each document, removes stopwords
    and short words, lemmatises, and then drops every word whose count in a
    document is below that document's average occurrence frequency (AOF).
    """

    def __init__(self,text):
        """Store the documents and build the stopword list.

        Parameters
        ----------
        text : iterable of str
            Raw documents to process.
        """
        self.__text = text

        # NLTK English stopwords plus two extra words added by hand.
        self.__sw = stopwords.words('english')
        self.__sw.append('and')
        self.__sw.append('using')

    def stopwordRemoval(self,sen,sw):
        """Lower-case ``sen`` and drop short words and stopwords.

        Parameters
        ----------
        sen : iterable of str
            Tokens of one document.
        sw : iterable of str
            Words to remove (compared against the lower-cased token).

        Returns
        -------
        list of str
            Lower-cased tokens longer than two characters that are not in
            ``sw``, in their original order.
        """
        # Hash-based lookup: ``sw`` can be as large as the whole vocabulary,
        # which made the original list membership test quadratic.
        if not isinstance(sw, (set, frozenset)):
            sw = set(sw)
        lowered = (word.lower() for word in sen if len(word) > 2)
        return [word for word in lowered if word not in sw]

    def lemmatizeWords(self,text):
        """Lemmatise every whitespace-separated word of ``text``.

        Each word is POS-tagged with ``nltk.pos_tag``; the first letter of
        the tag selects the WordNet part of speech, falling back to noun.

        Returns
        -------
        str
            The lemmatised words joined by single spaces.
        """
        lemmatizer = WordNetLemmatizer()
        wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}
        pos_tagged_text = nltk.pos_tag(text.split())
        return " ".join([lemmatizer.lemmatize(word, wordnet_map.get(pos[0], wordnet.NOUN)) for word, pos in pos_tagged_text])

    def computeAOF(self,count):
        """Return the average occurrence frequency of each document.

        For one row of ``count`` the AOF is the sum of all counts divided by
        the number of entries that are greater than zero.

        Parameters
        ----------
        count : 2-D array-like
            One row per document, one column per vocabulary word.

        Returns
        -------
        list of float
            One AOF per row. A row without any positive entry yields ``0.0``
            instead of the NaN (and division warning) it produced before.
        """
        aof = []
        for row in np.asarray(count):
            nTokens = np.count_nonzero(row > 0)
            aof.append(np.sum(row)/nTokens if nTokens else 0.0)

        return aof

    def vectorise(self):
        """Run the full preprocessing pipeline on the stored documents.

        Returns
        -------
        tuple
            ``(cv, vectNew, count, vocab, invVocab, aof, corpusVect)`` where
            ``cv`` is the CountVectorizer fitted on the filtered corpus,
            ``vectNew`` its sparse count matrix, ``count`` the dense version,
            ``vocab`` the word -> column mapping, ``invVocab`` the inverse
            mapping, ``aof`` the per-document AOF computed on the unfiltered
            corpus, and ``corpusVect`` each filtered document as a list of
            vocabulary indices in word order.
        """
        stop_set = set(self.__sw)

        # First pass: tokenise (apostrophes allowed), remove stopwords, lemmatise.
        corpus = []
        tokenizer = RegexpTokenizer('[a-zA-Z\']+')
        for sen in self.__text:
            tokens = self.stopwordRemoval(tokenizer.tokenize(sen), stop_set)
            corpus.append(self.lemmatizeWords(' '.join(tokens)))

        cv = CountVectorizer()
        vect = cv.fit_transform(corpus)
        count = vect.toarray()
        vocab = cv.vocabulary_
        aof = self.computeAOF(count)

        # Second pass: per document, drop every vocabulary word whose count
        # in that document is below the document's AOF.
        tokenizer = RegexpTokenizer('[a-zA-Z]+')
        corpus_new = []
        filtered_docs = []
        for i, sen in enumerate(corpus):
            below_aof = count[i] < aof[i]
            sw_new = {key for (key, value) in vocab.items() if below_aof[value]}
            tokens = self.stopwordRemoval(tokenizer.tokenize(sen), sw_new)
            filtered_docs.append(tokens)
            corpus_new.append(' '.join(tokens))

        # Refit on the filtered corpus; the vocabulary shrinks accordingly.
        vectNew = cv.fit_transform(corpus_new)
        count = vectNew.toarray()
        vocab = cv.vocabulary_
        invVocab = dict(zip(vocab.values(),vocab.keys()))

        corpusVect = [[vocab[word] for word in sen] for sen in filtered_docs]

        return cv,vectNew,count,vocab,invVocab,aof,corpusVect

In [36]:
# Build the preprocessor for the selected papers and run the whole pipeline.
p = Preprocess(text)
(cv, vect, count, vocab, invVocab, aof, corpusVect) = p.vectorise()

#### Graph Formulation


In [37]:
class Graph:
    """Undirected weighted word graph with per-node importance weights.

    Nodes are vocabulary indices ``0 .. V-1``. Edge weights are stored in the
    symmetric matrix ``wt``; node weights in ``nodeWts``.
    """

    def __init__(self,V):
        """Allocate zero-filled storage for a graph with ``V`` nodes."""
        self.V = V
        self.nodeWts = np.zeros((V))   # final node weights, min-max scaled
        self.wt = np.zeros((V,V))      # symmetric edge-weight matrix
        self.F = np.zeros((V))         # first-word score per node
        self.L = np.zeros((V))         # last-word score per node
        self.SC = np.zeros((V))        # mean weight of a node's incident edges
        self.TF = np.zeros((V))        # mean count of the word per document
        self.D = np.zeros((V))         # scaled edge weight to the central node
        self.Neigh = np.zeros((V))     # allocated but not used in this class

    def addEdgeWt(self,a,b,weight):
        """Set the weight of the undirected edge between ``a`` and ``b``."""
        self.wt[a][b] = weight
        self.wt[b][a] = weight

    def computeEdgeWt(self,freqMat,a,b):
        """Compute and store the weight of the edge between words ``a`` and ``b``.

        The weight is ``fab / (fa + fb - fab)`` where ``fa`` and ``fb`` are
        the column sums of ``freqMat`` for the two words and ``fab`` is the
        number of rows in which both words have a positive count. A
        non-positive denominator yields a weight of ``0.0``.

        Parameters
        ----------
        freqMat : 2-D array-like
            Document-by-word count matrix.
        a, b : int
            Column indices of the two words.
        """
        freqMat = np.asarray(freqMat)
        col_a = freqMat[:, a]
        col_b = freqMat[:, b]
        fa = col_a.sum()
        fb = col_b.sum()
        fab = np.count_nonzero((col_a > 0) & (col_b > 0))
        denom = fa + fb - fab
        weight = fab/denom if denom > 0 else 0.0
        self.addEdgeWt(a,b,weight)

    def computeNodeWt(self,count,vocab,corpusVect):
        """Compute all node features and the final node weights.

        The features are recomputed from scratch on every call, so calling
        the method repeatedly gives the same result.

        Parameters
        ----------
        count : 2-D array-like
            Document-by-word count matrix with ``V`` columns.
        vocab : dict
            Word -> column index mapping; only its values are used.
        corpusVect : iterable of sequences of int
            Each document as vocabulary indices in word order. Empty
            documents are skipped.

        Notes
        -----
        Sets ``self.F``, ``self.L``, ``self.SC``, ``self.TF``, ``self.D`` and
        ``self.nodeWts``. Degenerate cases that used to produce NaN (nodes
        without edges, an all-zero distance vector, identical raw weights)
        now produce zeros for the affected values.
        """
        V = self.V
        if V == 0:
            return

        counts = np.asarray(count)
        wt = self.wt
        # Total occurrences of each word; computed once instead of per key.
        term_totals = counts.sum(axis=0)

        # F / L: how often a word opens / closes a document, relative to its
        # total number of occurrences.
        F = np.zeros(V)
        L = np.zeros(V)
        for vect in corpusVect:
            if len(vect) == 0:
                continue
            F[vect[0]] += 1
            L[vect[-1]] += 1
        for key in vocab.values():
            total = term_totals[key]
            if total > 0:
                F[key] /= total
                L[key] /= total

        # SC: mean weight over the positive off-diagonal edges of each node.
        off_diag = np.array(wt, dtype=float)
        np.fill_diagonal(off_diag, 0.0)
        linked = off_diag > 0
        degree = linked.sum(axis=1)
        strength = np.where(linked, off_diag, 0.0).sum(axis=1)
        SC = np.divide(strength, degree, out=np.zeros(V), where=degree > 0)

        # The central node is the first node with the highest degree.
        centralNode = int(np.argmax(degree))

        # TF: average count of each word per document.
        n_docs = counts.shape[0]
        TF = term_totals/n_docs if n_docs else np.zeros(V)

        # D: edge weight to the central node, scaled by the largest such
        # weight; the central node itself is fixed at 1.
        D = off_diag[:, centralNode].copy()
        peak = D.max()
        if peak > 0:
            D = D/peak
        D[centralNode] = 1

        self.F = F
        self.L = L
        self.SC = SC
        self.TF = TF
        self.D = D

        # Sum the features and min-max scale the result to [0, 1].
        nodeWts = F + L + SC + TF + D
        minWt = np.min(nodeWts)
        span = np.max(nodeWts) - minWt
        if span > 0:
            nodeWts = (nodeWts - minWt)/span
        else:
            nodeWts = np.zeros(V)

        self.nodeWts = nodeWts
        return

    def makeGraph(self,corpusVect,vocab,count):
        """Fill in all edge weights and node weights.

        Edge weights follow the formula of ``computeEdgeWt`` but are computed
        for all vocabulary pairs at once; the diagonal is set to zero.

        Parameters
        ----------
        corpusVect : iterable of sequences of int
            Each document as vocabulary indices in word order.
        vocab : dict
            Word -> column index mapping; only its values are used.
        count : 2-D array-like
            Document-by-word count matrix.

        Returns
        -------
        tuple
            ``(self.nodeWts, self.wt)``.
        """
        counts = np.asarray(count)
        idx = np.fromiter(vocab.values(), dtype=np.intp, count=len(vocab))
        if idx.size:
            sub = counts[:, idx]
            present = (sub > 0).astype(np.int64)
            # co_docs[a, b] = number of documents containing both a and b.
            co_docs = present.T @ present
            totals = sub.sum(axis=0)
            denom = totals[:, np.newaxis] + totals[np.newaxis, :] - co_docs
            weights = np.divide(co_docs, denom, out=np.zeros(denom.shape), where=denom > 0)
            np.fill_diagonal(weights, 0.0)
            self.wt[np.ix_(idx, idx)] = weights

        self.computeNodeWt(count,vocab,corpusVect)

        return self.nodeWts,self.wt

In [38]:
# One node per vocabulary word; makeGraph fills in edge and node weights.
G = Graph(len(vocab))
(nodeWts, edgeWts) = G.makeGraph(corpusVect, vocab, count)

#### NE Rank


In [39]:
def computeNERank(nodeWt,edgeWt,d,iterations=10):
    """Rank graph nodes from their node weights and edge weights.

    Each sweep updates the nodes in index order, in place, with

        R[i] = (1 - d) * nodeWt[i] + d * nodeWt[i] * sum_{j != i} edgeWt[i][j] * R[j] / sigma2[j]

    where ``sigma2[j]`` is the sum of row ``j`` of ``edgeWt``. Because the
    update is in place, nodes later in a sweep already see the new scores of
    earlier nodes.

    Parameters
    ----------
    nodeWt : 1-D array-like of length V
        Weight of every node.
    edgeWt : 2-D array-like of shape (V, V)
        Edge-weight matrix. It is not modified.
    d : float
        Damping factor.
    iterations : int, optional
        Number of sweeps over all nodes (default 10, the value that was
        previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Rank score of every node. Nodes whose row sum is zero contribute
        nothing to their neighbours instead of turning the result into NaN.
    """
    nodeWt = np.asarray(nodeWt, dtype=float)
    edgeWt = np.asarray(edgeWt, dtype=float)
    V = nodeWt.shape[0]
    R = np.zeros((V))
    if V == 0:
        return R

    sigma2 = np.sum(edgeWt,axis = 1)
    column_sums = sigma2[np.newaxis, :]
    # share[i, j] = edgeWt[i, j] / sigma2[j]; left at 0 where sigma2[j] == 0.
    share = np.divide(edgeWt, column_sums, out=np.zeros_like(edgeWt), where=column_sums != 0)
    # The sum skips j == i.
    np.fill_diagonal(share, 0.0)

    for _ in range(iterations):
        for i in range(V):
            sigma1 = share[i] @ R
            R[i] = (1-d)*nodeWt[i] + d*nodeWt[i]*sigma1

    return R

In [40]:
# Rank every vocabulary word with a damping factor of 0.85.
R = computeNERank(nodeWt=nodeWts, edgeWt=edgeWts, d=0.85)

#### Printing Top Keywords


In [41]:
# Number of keywords to show; the slice below simply returns fewer entries
# when the vocabulary is smaller than this.
TOP_K = 20

# Vocabulary indices sorted by rank score, highest first.
topKeywords = R.argsort()[::-1][:TOP_K]

# Iterate over the selected indices rather than a fixed range(20), which
# raised IndexError for vocabularies with fewer than 20 words.
for keyword_id in topKeywords:
    print(invVocab[keyword_id])

network
neural
input
use
show
error
model
set
control
figure
weight
time
function
plasma
test
data
cell
value
parameter
result
