In [40]:
import glob, os, re, string, itertools
import logging
import numpy as np
from __future__ import print_function
from time import time
from nltk.stem.porter import *
from collections import defaultdict
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from scipy.sparse.csr import csr_matrix
from sklearn import svm                                       #library for creating the classifier, SVM
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [41]:
def clean(input_list):
    result=[]
    #remove unwanted character per line
    for line in input_list:
        clean=re.sub("(\.)?\n",'', line) #remove \n
        clean=re.sub("('s)",'', clean) #remove \n
        clean=re.sub("\[([0-9]{1,2}\,?\s?)+\]",'', clean) #remove [2]
        clean=re.sub("\(([0-9]{1,2}\,?\s?)+\)",'', clean) #remove (2)
            #remove fig. 2 etc, need improvement to catch the sentence after it
        clean=re.sub("([Ff]ig.|[Ff]igure|[Tt]ab.|[Tt]able)\s?[0-9]{1,2}",'', clean) #remove fig. 2 etc
        result.append(clean)
    return result

In [54]:
def load_files(path):
    raw=[]
    for file in path:
        dict_doc={'doc_id': None, 'title': None, 'abstract': None, 'full-text': None, 'candidates': None}
        file_id=os.path.basename(file).rstrip('.txt.final') #catch only file name  
        dict_doc['doc_id']=file_id
        
        source=open(file,encoding='utf-8').readlines()
        source=clean(source)
        
        ##########detect title
        beginning=re.sub("\n", "", source[0]) #retrieve title
        candidate=re.sub("\n", "", source[1]) # retrieve title candidate
        h_candidate=word_tokenize(re.sub("-",' ',candidate)) #tokenize the candidate
        
        title=[]
        name=[]
        for word in h_candidate:
            if wordnet.synsets(word): #check if title candidate exist on wordnet
                title.append(word)
            else:
                name.append(word)
            #if title>
            if len(title)>len(name): 
                newtitle=beginning+' '+candidate
            elif len(title)==len(name):
                newtitle=beginning
            else:
                newtitle=beginning

        dict_doc['title']=newtitle
        
        content=source[2:]
        ######check header, inconsistency all file
        r_intro=re.compile("^1\.?\s[A-Z]+")
        r_ref=re.compile("[0-9]{1,2}?\.?\s?R[EFERENCES|eferences]") #detect reference
        #r_header=re.compile("[0-9]{1,2}?\.?\s?[A-Z]")
        
        in_abstract=content.index('ABSTRACT')
        in_authorkey=content.index('Categories and Subject Descriptors')
        
        list_intro=[i for i, item in enumerate(content) if re.search(r_intro, item)]
        in_intro=list_intro[0]
        list_ref=[i for i, item in enumerate(content) if re.search(r_ref, item)]
        in_ref=list_ref[0]
        
        abstract=content[in_abstract+1:in_authorkey] #eliminate keyword and category
        body=content[in_intro+1:in_ref] #remove reference       
        
        list_title=[]
        list_title.append(newtitle)
        
        full_text=list(chain(list_title,abstract, body))
        #dict_doc['abstract']=abstract
        #dict_doc['body']=body
        #dict_doc['full_text']=full_text
        
        #per sentence in a document
        raw.append(dict_doc)
    return raw

train_directory=sorted(glob.glob('./se_txt/test/*.txt.final'), key=lambda name: int(name[17:-10]))
source=load_files(train_directory)
print(source)

[{'doc_id': 'C-1', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Scalable Grid Service Discovery Based on UDDI*'}, {'doc_id': 'I-1', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Aborting Tasks in BDI Agents'}, {'doc_id': 'J-1', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Generalized Trade Reduction Mechanisms'}, {'doc_id': 'H-2', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Personalized Query Expansion for the Web'}, {'doc_id': 'J-2', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Worst-Case Optimal Redistribution of VCG Payments'}, {'doc_id': 'C-3', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Self-Adaptive Applications on the Grid'}, {'doc_id': 'H-3', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Using Query Contexts in Information Retrieval'}, {'doc_id': 'J-3', 'abstract': None, 'full-text': None, 'candidates': None, 'title': 'Budget 

In [48]:
def tfidf_corpus(raw_data):
    train_data=[]
    for doc in raw_data:
        #add to list and join all element in full text into a text
        train_data.append(' '.join(doc['full_text']))
    return train_data

In [25]:
def calculate_ngram_tfidf(corpus):
    
    #porter stemmer
    stemmer=PorterStemmer()

    #eliminate ngram which starts or ends from stopwords
    #from https://stackoverflow.com/questions/49746555/sklearn-tfidfvectorizer-generate-custom
    #-ngrams-by-not-removing-stopword-in-the/49775000#49775000
    class NewTfidfVectorizer(TfidfVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # First get tokens without stop words
            tokens = super(TfidfVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is not None:
                new_tokens=[]
                for token in tokens:
                    split_words = token.split(' ')
                    # Only check the first and last word for stop words
                    if len(token)>2 and split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        if len(split_words)==1 and len(split_words[0])>2:
                            new_tokens.append(stemmer.stem(token))
                        elif len(split_words)==2 and split_words[-1]=="'":
                            del(token)
                        elif len(split_words[0])<3 and len(split_words[1])<3:
                            del(token)
                        elif split_words[1]=="'" and split_words[2]=="s":
                            new_tokens.append(stemmer.stem(split_words[0])+split_words[1]+split_words[2])
                        else:
                            new_tokens.append(' '.join(list(stemmer.stem(word) for word in word_tokenize(token))))
                    '''
                    if len(token)>2 and split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        token1=' '.join(list(stemmer.stem(word) for word in word_tokenize(token)))
                        new_tokens.append(token1)
                    '''
                return new_tokens
            return tokens
    
    stop_words=text.ENGLISH_STOP_WORDS
    
    #pattern="(?u)\\b[\\w-]+\\b"
    tfidf=NewTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words,
                                token_pattern=r"(?u)\b[A-Za-z-]+\b|'")
    
    #tfidf=NewTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words,
    #                            token_pattern=r"(?u)\b[A-Za-z-]+('s)?\b")
    
    matrix=tfidf.fit_transform(corpus)
    feature_names=tfidf.get_feature_names()

    #how to print tf-idf from https://stackoverflow.com/questions/34449127/
    #sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
    candidates=[]
    for doc in range(0,len(corpus)):
        feature_index=matrix[doc,:].nonzero()[1]
        tfidf_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_tfidf=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in tfidf_doc]]
        candidates.append(names_tfidf)
    
    #this is the candidates per document
    #vocab_perdoc=tfidf.inverse_transform(matrix)
    
    return candidates

In [26]:
def vocabulary_nounphrases(raw_data):
    
    #porter stemmer
    stemmer=PorterStemmer()
    
    #from http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
    grammar=r'NP: {(<JJ.*>* <NN.*>+ <IN>)? (<JJ.*>* <NN.*>+)+}' #only detect noun phrases that contain specific pattern, hypen word is counted as one NN
    
    #test new grammar
    #grammar=r'NP: {(<JJ>* <VBN>? <NN.*>+ <IN>)? <JJ>* <VBG>? <NN.*>+}' 
    
    punct = set(string.punctuation) #list of punctuation
    chunker = RegexpParser(grammar) #chunker from nltk
    
    def lambda_unpack(f):
        return lambda args:f(*args)
    
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data) #tokenize and create pos tag per sentence
    #list of IOB of noun phrases based on the specific grammar
    noun_phrases = list(chain.from_iterable(tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in postag_sents)) 
    
    #join B-NP and I-NP tags as one noun phrase excluding O tags
    #merged_nounphrase = [' '.join(stemmer.stem(word) for word, pos, chunk in group if re.search(r"(?u)\b[A-Za-z-]+\b", word)).lower() for key, group in
    #                itertools.groupby(noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O')) if key]
    
    merged_nounphrase = [' '.join(stemmer.stem(word) for word, pos, chunk in group).lower() for key, group in
                    itertools.groupby(noun_phrases, lambda_unpack(lambda word, pos, chunk: chunk != 'O')) if key]
    
    #filter noun phrases from stopwords and punctuation
    all_nounphrases=[cand for cand in merged_nounphrase
            if len(cand)>2 and not all(char in punct for char in cand)]
    
    #all_nounphrases=[cand for cand in merged_nounphrase
    #        if len(cand)>2 and cand not in stop_words and not all(char in punct for char in cand)]
    
    #select distinct noun phrases
    vocabulary=(list(set(all_nounphrases)))
    return vocabulary

In [27]:
def calculate_nounphrase_tfidf(corpus, voc):
    
    stemmer=PorterStemmer()
    
    class StemmedTfidfVectorizer(TfidfVectorizer):
        '''
        def build_analyzer(self):
            analyzer=super(TfidfVectorizer, self).build_analyzer()
            #doc=' '.join(list(stemmer.stem(word) for word in analyzer(word_tokenize(doc))))
            return lambda doc: (stemmer.stem(word) for word in analyzer(doc))
            #return lambda doc: (' '.join(list(stemmer.stem(word) for word in word_tokenize(analyzer(doc)))))
        '''
        def build_tokenizer(self):
            tokenizer=super(TfidfVectorizer, self).build_tokenizer()
            return lambda doc: (stemmer.stem(token) for token in tokenizer(doc) if token not in stop_words)

    stop_words=set(text.ENGLISH_STOP_WORDS)
    s=['of','in','on','for']
    stop_words=stop_words.difference(s)
    tfidf=StemmedTfidfVectorizer(ngram_range=(1,5), stop_words=stop_words, vocabulary=voc, token_pattern=r"(?u)\b[A-Za-z-]+\b")
    
    matrix=tfidf.fit_transform(corpus)
    feature_names=tfidf.get_feature_names()

    #how to print tf-idf from https://stackoverflow.com/questions/34449127/
    #sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-documen
    candidates=[]
    for doc in range(0,len(corpus)):
        feature_index=matrix[doc,:].nonzero()[1]
        tfidf_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_tfidf=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in tfidf_doc]]
        candidates.append(names_tfidf)
    
    return candidates

In [28]:
###sorting candidates based on 15 keywords
def get_topcandidates(candidates_list, number_keyphrases):
    best_candidates=[]
    for doc in candidates_list:
        #sort candidates by tf-idf value
        sorted_candidates=sorted(doc, key=lambda x: x[1], reverse=True)[:number_keyphrases]
        #best_candidates.append(sorted_candidates)
        best_candidates.append([x for x,_ in sorted_candidates])
        #remove overlapping keywords
    return best_candidates

In [29]:
###compare candidates to goldstandard
def extract_goldkeyphrases(gold_data):
    r_plus=re.compile("^.*\+.*$")
    r_slash=re.compile("^.*\s.*\/.*$")
    
    gold_standard=[]
    for line in gold_data.split('\n'):
        doc=[]      
        for key in line[6:].split(','):
            if key[0]==' ':
                doc.append(key[1:])
            elif re.search(r_plus, key):
                split=[]
                for element in key.split('+'):
                    doc.append(element)
            elif re.search(r_slash, key):
                split=[]
                for element in key.split('/'):
                    doc.append(element)
            else:
                doc.append(key)
        gold_standard.append(doc)
    return gold_standard

def calculate_fmeasure(candidates_list, gold_data):
    #true positive
    all_matches=[]
    for index in range(len(candidates_list)):
        #store all measure per document in dic
        value={'tp': None, 'fp': None, 'fn': None, 'gold': None}
        value['gold']=len(gold_data[index])
        #counter true positive per document
        true_positive=0
        #loop between elements
        for element_candidate in candidates_list[index]:                    
            for element_goldkeyphrase in gold_data[index]:
                #matched predicted keyword in gold keyphrase
                if element_candidate==element_goldkeyphrase:
                    #matches_perdoc.append(element_candidate)
                    true_positive+=1
            #if need the detail of evaluation
            value['tp']=int(true_positive) #matched pair
            value['fp']=int(15-true_positive) #depend how many keyword should we use
            value['fn']=int(value['gold']-value['tp'])
        #return all metrics per document
        all_matches.append(value)

    true_positive=sum(doc['tp'] for doc in all_matches)
    false_positive=sum(doc['fp'] for doc in all_matches)
    false_negative=sum(doc['fn'] for doc in all_matches)
    
    #matched/total top n
    precision=float(true_positive/(false_positive+true_positive))
    #matched/total gold standard
    recall=float(true_positive/(false_negative+true_positive))
    # calculate with micro averagedprecision
    f_measure=2*(precision*recall)/(precision+recall)
    return f_measure


In [30]:
#load directory of training data to feed to scikit learn
train_directory=glob.glob('./se_txt/test/*.txt.final')
gold_standard=open('./se_txt/test_answer/test.combined.stem.final', encoding='utf-8').read()
source=load_files(train_directory)
train_data=tfidf_corpus(source)
gold_data=extract_goldkeyphrases(gold_standard)
#tdidf value for ngram version
candidates_ngram=calculate_ngram_tfidf(train_data) #can be used for training data too
top_keyphrases_ngram=get_topcandidates(candidates_ngram, 15)
fmeasure_ngram=calculate_fmeasure(top_keyphrases_ngram, gold_data)
print(top_keyphrases_ngram)
#print("fmeasure testing ngram:", fmeasure_ngram)



[['registri', 'uddi', 'dht', 'servic', 'uddi registri', 'proxi', 'proxi registri', 'queri', 'hash', 'grid', 'key', 'discoveri', 'uddi key', 'publish', 'node'], ['sensor', 'target', 'detect', 'prob', 'path', 'fusion', 'target detect', 'exposur', 'deploy', 'travers', 'consensu', 'path exposur', 'sensor field', 'decis fusion', 'grid'], ['conferenc', 'confer', 'audio', 'client', 'mix', 'stream', 'voip', 'floor', 'internet', 'video', 'speaker', 'voip conferenc', 'floor control', 'particip', 'sip'], ['worm', 'swarm', 'infect', 'password', 'host', 'swarm worm', 'zachik', 'scan', 'target', 'behavior', 'vulner', 'slammer', 'propag', 'zachik worm', 'action'], ['protocol', 'modul', 'protocol modul', 'servic interfac', 'stack', 'interfac', 'protocol framework', 'repli', 'event', 'servic', 'event-bas', 'service-bas', 'framework', 'request', 'notif'], ['migrat', 'data center', 'server', 'center', 'replic', 'data', 'storag', 'network', 'server migrat', 'virtual server', 'outag', 'wan', 'virtual', 'se

In [31]:
#tdidf value for noun phrases version
voc_nounphrase=vocabulary_nounphrases(train_data)
candidates_nounphrase=calculate_nounphrase_tfidf(train_data, voc_nounphrase)
top_keyphrases_nounphrase=get_topcandidates(candidates_nounphrase, 15)
fmeasure_nounphrase=calculate_fmeasure(top_keyphrases_nounphrase, gold_data)
#print("fmeasure testing noun phrase:", fmeasure_nounphrase)
print(top_keyphrases_nounphrase)

[['registri', 'uddi', 'dht', 'servic', 'uddi registri', 'proxi', 'proxi registri', 'queri', 'hash', 'grid', 'key', 'discoveri', 'uddi key', 'publish', 'node'], ['sensor', 'target', 'detect', 'prob', 'target detect', 'path', 'exposur', 'deploy', 'consensu', 'path exposur', 'sensor field', 'decis fusion', 'grid', 'field', 'region'], ['conferenc', 'confer', 'audio', 'client', 'stream', 'voip', 'floor', 'internet', 'speaker', 'video', 'floor control', 'particip', 'sip', 'qualiti', 'number'], ['worm', 'swarm', 'infect', 'password', 'host', 'swarm worm', 'zachik', 'scan', 'target', 'behavior', 'vulner', 'slammer', 'propag', 'zachik worm', 'action'], ['protocol', 'modul', 'protocol modul', 'servic interfac', 'stack', 'interfac', 'protocol framework', 'repli', 'event', 'servic', 'framework', 'request', 'notif', 'listen', 'handler'], ['migrat', 'data center', 'server', 'center', 'replic', 'server migrat', 'data', 'storag', 'network', 'virtual server', 'outag', 'wan', 'virtual', 'servic', 'remot