In [3]:
import glob, os, re
from collections import defaultdict
from nltk import word_tokenize, pos_tag
from nltk.corpus import wordnet
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse.csr import csr_matrix
import Stemmer, string

train=glob.glob('./se_txt/train/dummy/*.txt.final')   

def load_files(path):
    raw=[]
    for file in path:
        d_all={'doc_id': None, 'title': None, 'abstract': None, 'full-text': None, 'candidates': None}
        file_id=os.path.basename(file).rstrip('.txt.final') #catch only file name  
        source=open(file,encoding='utf-8').readlines()
        
        d_all['doc_id']=file_id
        
        ##########detect title
        beginning=re.sub("\n", "", source[0]) #retrieve title
        candidate=re.sub("\n", "", source[1]) # retrieve title candidate
        h_candidate=word_tokenize(re.sub("-",' ',candidate)) #tokenize the candidate
        
        title=[]
        name=[]
        for word in h_candidate:
            if wordnet.synsets(word): #check if candidate exist on wordnet
                title.append(word)
            else:
                name.append(word)
            #if title>
            if len(title)>len(name): 
                newtitle=beginning+' '+candidate
            elif len(title)==len(name):
                newtitle=beginning
            else:
                newtitle=beginning

        d_all['title']=newtitle

        ##################################
        
        content=source[2:]
        ######check header, inconsistency all file
        r_intro=re.compile("^1\.?\s[A-Z]+")
        r_ref=re.compile("[0-9]{1,2}?\.?\s?R[EFERENCES|eferences]") #detect reference
        #r_header=re.compile("[0-9]{1,2}?\.?\s?[A-Z]")
        
        in_abstract=content.index('ABSTRACT\n')
        in_authorkey=content.index('Categories and Subject Descriptors\n')
        
        list_intro=[i for i, item in enumerate(content) if re.search(r_intro, item)]
        in_intro=list_intro[0]
        list_ref=[i for i, item in enumerate(content) if re.search(r_ref, item)]
        in_ref=list_ref[0]
        
        abstract=content[in_abstract+1:in_authorkey] #eliminate keyword and category
        body=content[in_intro+1:in_ref] #remove reference       
        
        list_title=[]
        list_title.append(newtitle)
        
        full_text=list(chain(list_title,abstract, body))
        #d_all['abstract']=clean_merge(abstract)
        #d_all['body']=clean_merge(body)
        d_all['full_text']=full_text
        
        raw.append(d_all)
    return raw


print(load_files(train))
#####pandas
#l=load_files(train)
#print(DataFrame(l).head())



In [1]:
from nltk import word_tokenize, sent_tokenize, pos_tag_sents

text="""Keyphrase extraction is the task of identifying single or
                 multi-word expressions that represent the main topics of a
                 document. In this paper we present TopicRank, a graph-based
                 keyphrase extraction method that relies on a topical
                 representation of the document."""
sents=[]

#extract tokens and create list per sentence
sentences=[word_tokenize(s) for s in sent_tokenize(text)]

#create tuples from each token on sentences list
tuples=pos_tag_sents(sentences)

#create list of dictionary contains of pos and word
for sentence in tuples:
    sents.append({
        "words":[x[0] for x in sentence],
        "POS":[x[1] for x in sentence]
    })
print('s',sentences)
print('t',tuples)
print('s2',sents)

s [['Keyphrase', 'extraction', 'is', 'the', 'task', 'of', 'identifying', 'single', 'or', 'multi-word', 'expressions', 'that', 'represent', 'the', 'main', 'topics', 'of', 'a', 'document', '.'], ['In', 'this', 'paper', 'we', 'present', 'TopicRank', ',', 'a', 'graph-based', 'keyphrase', 'extraction', 'method', 'that', 'relies', 'on', 'a', 'topical', 'representation', 'of', 'the', 'document', '.']]
t [[('Keyphrase', 'NNP'), ('extraction', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('task', 'NN'), ('of', 'IN'), ('identifying', 'VBG'), ('single', 'JJ'), ('or', 'CC'), ('multi-word', 'JJ'), ('expressions', 'NNS'), ('that', 'WDT'), ('represent', 'VBP'), ('the', 'DT'), ('main', 'JJ'), ('topics', 'NNS'), ('of', 'IN'), ('a', 'DT'), ('document', 'NN'), ('.', '.')], [('In', 'IN'), ('this', 'DT'), ('paper', 'NN'), ('we', 'PRP'), ('present', 'JJ'), ('TopicRank', 'NNP'), (',', ','), ('a', 'DT'), ('graph-based', 'JJ'), ('keyphrase', 'NN'), ('extraction', 'NN'), ('method', 'NN'), ('that', 'WDT'), ('relies', 'V

In [2]:
def extract_candidate_words(text, good_tags=set(['JJ','JJR','JJS','NN','NNP','NNS','NNPS'])):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize and POS-tag words
    tagged_words = itertools.chain.from_iterable(nltk.pos_tag_sents(nltk.word_tokenize(sent)
                                                                    for sent in nltk.sent_tokenize(text)))
    #print('tag',list(tagged_words))
    # filter on certain POS tags and lowercase all words
    candidates = [word.lower() for word, tag in tagged_words
                  if tag in good_tags and word.lower() not in stop_words
                  and not all(char in punct for char in word)]

    return candidates 
print(extract_candidate_words(text))

['keyphrase', 'extraction', 'task', 'single', 'multi-word', 'expressions', 'main', 'topics', 'document', 'paper', 'present', 'topicrank', 'graph-based', 'keyphrase', 'extraction', 'method', 'topical', 'representation', 'document']


In [6]:
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string
    
    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))
    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]

print(extract_candidate_words(text))

SyntaxError: invalid syntax (<ipython-input-6-bafde6c925e5>, line 14)

In [None]:
def score_keyphrases_by_tfidf(texts, candidates='chunks'):
    import gensim, nltk
    
    # extract candidates from each text in texts, either chunks or words
    if candidates == 'chunks':
        boc_texts = [extract_candidate_chunks(text) for text in texts]
    elif candidates == 'words':
        boc_texts = [extract_candidate_words(text) for text in texts]
    # make gensim dictionary and corpus
    dictionary = gensim.corpora.Dictionary(boc_texts)
    corpus = [dictionary.doc2bow(boc_text) for boc_text in boc_texts]
    # transform corpus with tf*idf model
    tfidf = gensim.models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    
    return corpus_tfidf, dictionary
