In [53]:
import glob, os, re, string, itertools
import logging
import numpy as np
from __future__ import print_function
from time import time
from nltk.stem.porter import *
from collections import defaultdict
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from scipy.sparse.csr import csr_matrix
from sklearn import svm                                       #library for creating the classifier, SVM
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [19]:
import pickle
def create_pickle(data, name):
    """Serialize ``data`` to the file ``<name>.pickle``.

    The object is written with the highest available pickle protocol.

    Returns:
        The return value of ``pickle.dump``, which is always ``None``.
    """
    target = '%s.pickle' % name
    with open(target, 'wb') as sink:
        outcome = pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
    return outcome

def open_pickle(name):
    """Load and return the object stored in ``<name>.pickle``.

    NOTE: unpickling can execute arbitrary code, so only use this on
    files written by this notebook (or another trusted source).
    """
    source = '%s.pickle' % name
    with open(source, 'rb') as stream:
        return pickle.load(stream)

In [20]:
def clean(input_list):
    """Strip layout noise from every line of a document.

    The following fragments are removed from each line, in this order:
      1. the line break, together with a full stop directly before it
      2. every ``'s``
      3. bracketed numeric citations such as ``[2]`` or ``[3, 14]``
      4. parenthesised numeric citations such as ``(2)``
      5. figure/table references such as ``Fig. 2`` or ``Table 12``
      6. URLs starting with ``http://``, ``https://`` or ``www.``

    Args:
        input_list: iterable of strings (typically ``file.readlines()``).

    Returns:
        A new list with one cleaned string per input line.
    """
    # Raw strings keep the regex escapes (\., \[, \s ...) away from Python's
    # own string-escape processing, which warns about them in plain literals.
    removal_patterns = [re.compile(pattern) for pattern in (
        r"(\.)?\n",                                              # line break
        r"('s)",                                                 # 's
        r"\[([0-9]{1,2}\,?\s?)+\]",                              # [2]
        r"\(([0-9]{1,2}\,?\s?)+\)",                              # (2)
        # The dot after "ig"/"ab" is deliberately a wildcard so that
        # both "Fig. 2" and "Fig 2" are caught.
        r"([Ff]ig.|[Ff]igure|[Tt]ab.|[Tt]able)\s?[0-9]{1,2}",
        # The dot in "www." is escaped: a bare "." also matched ordinary
        # words that merely start with "www" and deleted them.
        r"\b((https?://|www\.)[^\s]+)",                          # URLs
    )]

    result = []
    for line in input_list:
        cleaned = line
        for pattern in removal_patterns:
            cleaned = pattern.sub('', cleaned)
        result.append(cleaned)
    return result

In [12]:
def load_files(path):
    """Parse every paper in ``path`` into its sections.

    Each file is read as UTF-8, passed through ``clean`` and split into
    title, abstract, introduction and body using the section headings.

    Args:
        path: iterable of file paths (e.g. the result of ``glob.glob``).

    Returns:
        A list with one dict per file holding the keys ``doc_id``,
        ``title``, ``abstract``, ``introduction``, ``body`` and
        ``full_text`` (title + abstract + introduction + body as a list of
        lines). The never-filled legacy key ``full-text`` is kept as
        ``None`` so the dict layout is unchanged.

    Raises:
        ValueError: if the ``ABSTRACT`` or ``Categories and Subject
            Descriptors`` line is missing.
        IndexError: if the file has fewer than two lines or a numbered
            section / references heading cannot be found.
    """
    suffix = '.txt.final'
    r_intro = re.compile(r"^1\.?\s[A-Z]+")
    r_after_intro = re.compile(r"^2\.?\s[A-Z]+")
    # "(?:EFERENCES|eferences)" is a real alternation. The previous
    # "[EFERENCES|eferences]" was a character class, so any line containing
    # e.g. "Results" or "Related" was taken as the references heading.
    r_ref = re.compile(r"[0-9]{1,2}?\.?\s?R(?:EFERENCES|eferences)")

    def first_match(pattern, lines, what, filename):
        """Return the index of the first line matching ``pattern``."""
        for position, item in enumerate(lines):
            if pattern.search(item):
                return position
        raise IndexError("no %s heading found in %s" % (what, filename))

    raw = []
    for file in path:
        # Remove the extension as a suffix; str.rstrip() would strip any
        # trailing run of the characters ".txfinal" from the name instead.
        file_id = os.path.basename(file)
        if file_id.endswith(suffix):
            file_id = file_id[:-len(suffix)]

        with open(file, encoding='utf-8') as handle:
            source = clean(handle.readlines())

        # ---- title detection ------------------------------------------
        # The first line is always part of the title. The second line is
        # appended when most of its tokens are dictionary words (WordNet
        # hits); otherwise it is assumed to be an author line.
        beginning = source[0].replace("\n", "")
        candidate = source[1].replace("\n", "")
        h_candidate = word_tokenize(candidate.replace("-", " "))
        known = sum(1 for word in h_candidate if wordnet.synsets(word))
        unknown = len(h_candidate) - known
        # Decided once after counting, so an empty candidate line falls
        # back to the first line instead of leaving the title undefined.
        if known > unknown:
            newtitle = beginning + ' ' + candidate
        else:
            newtitle = beginning

        # ---- section boundaries ---------------------------------------
        content = source[2:]
        in_abstract = content.index('ABSTRACT')
        in_authorkey = content.index('Categories and Subject Descriptors')
        in_intro = first_match(r_intro, content, 'introduction', file)
        in_after_intro = first_match(r_after_intro, content, 'second section', file)
        in_ref = first_match(r_ref, content, 'references', file)

        abstract = content[in_abstract+1:in_authorkey]  # drops keywords/categories
        intro = content[in_intro+1:in_after_intro]
        body = content[in_after_intro+1:in_ref]

        raw.append({
            'doc_id': file_id,
            'title': newtitle,
            'abstract': abstract,
            'introduction': intro,
            'full-text': None,  # legacy key, never populated
            'body': body,
            'full_text': [newtitle] + abstract + intro + body,
        })
    return raw

In [13]:
#convert data to tfidfvectorizer format
#corpus=['a','b','c']
#RENAME TO CREATE CORPUS
def create_corpus(raw_data):
    """Turn parsed documents into a list of plain-text strings.

    For every document dict the lines stored under ``'full_text'`` are
    joined with single spaces.
    """
    train_data = []
    for doc in raw_data:
        train_data.append(' '.join(doc['full_text']))
    return train_data

In [7]:
#generate noun phrases based on corpus
def extract_candidate(raw_data):
    """Collect the distinct stemmed noun phrases found in ``raw_data``.

    Every text is tokenized and POS-tagged, chunked with a noun-phrase
    grammar, and each chunk is stemmed word by word and lower-cased.

    NOTE: a later cell defines ``extract_candidate`` again with a different
    grammar; whichever definition was executed last is the one in effect.

    Args:
        raw_data: iterable of strings (one text per element).

    Returns:
        A list of unique phrases (order is not defined, it comes from a set).
    """
    # The stemmer has to be created here: the function used ``stemmer``
    # while its construction was commented out, which raised NameError
    # unless a global of that name happened to exist.
    stemmer = PorterStemmer()

    # from http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
    # A hyphenated word counts as a single NN token.
    grammar = r'NP: {(<JJ.*>* <NN.*>+ <IN>)? (<JJ.*>* <NN.*>+)+}'

    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    # Tokenize and POS-tag each text.
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)

    # Flat list of (word, pos, IOB-tag) triples over all texts.
    noun_phrases = list(itertools.chain.from_iterable(
        tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in postag_sents))

    # Consecutive triples whose tag is not 'O' form one phrase.
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for inside_chunk, group in itertools.groupby(noun_phrases, lambda triple: triple[2] != 'O')
        if inside_chunk
    ]

    # Drop very short phrases and phrases made only of punctuation.
    all_nounphrases = [cand for cand in merged_nounphrase
                       if len(cand) > 2 and not all(char in punct for char in cand)]

    return list(set(all_nounphrases))

In [6]:
def calculate_ngram_tfidf(corpus):
    """Score every 1- to 5-gram of each document with TF-IDF.

    N-grams that start or end with an English stop word are discarded and
    the remaining ones are Porter-stemmed word by word.

    Args:
        corpus: list of document strings.

    Returns:
        A list with one entry per document; each entry is a list of
        ``(ngram, tfidf_value)`` tuples for the n-grams with a non-zero
        weight in that document.
    """
    stemmer = PorterStemmer()

    # eliminate ngram which starts or ends from stopwords
    # from https://stackoverflow.com/questions/49746555/sklearn-tfidfvectorizer-generate-custom
    # -ngrams-by-not-removing-stopword-in-the/49775000#49775000
    class NewTfidfVectorizer(TfidfVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # Build the n-grams with stop words still in place ...
            tokens = super()._word_ngrams(tokens, None)
            if stop_words is None:
                return tokens
            # ... then keep only those not starting/ending with one.
            new_tokens = []
            for token in tokens:
                split_words = token.split(' ')
                if (len(token) <= 2 or split_words[0] in stop_words
                        or split_words[-1] in stop_words):
                    continue
                if len(split_words) == 1 and len(split_words[0]) > 2:
                    new_tokens.append(stemmer.stem(token))
                elif len(split_words) == 2 and split_words[-1] == "'":
                    continue  # dangling apostrophe
                elif len(split_words[0]) < 3 and len(split_words[1]) < 3:
                    continue  # starts with two very short words
                elif split_words[1] == "'" and split_words[2] == "s":
                    # possessive: stem the head word only
                    new_tokens.append(stemmer.stem(split_words[0]) + split_words[1] + split_words[2])
                else:
                    new_tokens.append(' '.join(stemmer.stem(word) for word in word_tokenize(token)))
            return new_tokens

    # Recent scikit-learn releases validate ``stop_words`` and only accept
    # a list (or 'english'/None), so the frozenset is converted.
    stop_words = sorted(text.ENGLISH_STOP_WORDS)

    tfidf = NewTfidfVectorizer(ngram_range=(1, 5), stop_words=stop_words,
                               token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = tfidf.fit_transform(corpus).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    # Read each CSR row directly instead of indexing matrix[doc, x] once per
    # stored value, which is very slow on a large n-gram vocabulary.
    candidates = []
    for doc in range(len(corpus)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        feature_index = matrix.indices[start:stop]
        values = matrix.data[start:stop]
        candidates.append([(feature_names[i], s)
                           for i, s in zip(feature_index, values) if s != 0])

    return candidates

In [8]:
def calculate_nounphrase_tfidf(corpus, voc):
    """Score the phrases of a fixed vocabulary with TF-IDF per document.

    Tokens are Porter-stemmed and English stop words are dropped, except
    'of', 'in', 'on' and 'for', which are kept so that phrases containing
    them can still be matched.

    Args:
        corpus: list of document strings.
        voc: vocabulary (iterable of phrases) restricting the features.

    Returns:
        A list with one entry per document; each entry is a list of
        ``(phrase, tfidf_value)`` tuples for the phrases with a non-zero
        weight in that document.
    """
    stemmer = PorterStemmer()

    # Kept as a set for fast membership tests inside the tokenizer.
    stop_words = set(text.ENGLISH_STOP_WORDS).difference(['of', 'in', 'on', 'for'])

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_tokenizer(self):
            tokenizer = super().build_tokenizer()
            # A list (not a generator) is returned so the tokens can be
            # measured and iterated more than once by the analyzer.
            return lambda doc: [stemmer.stem(token) for token in tokenizer(doc)
                                if token not in stop_words]

    # Recent scikit-learn releases validate ``stop_words`` and only accept
    # a list (or 'english'/None), so the set is passed as a sorted list.
    tfidf = StemmedTfidfVectorizer(ngram_range=(1, 5), stop_words=sorted(stop_words),
                                   vocabulary=voc, token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = tfidf.fit_transform(corpus).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()

    # Read each CSR row directly instead of indexing matrix[doc, x] once per
    # stored value.
    candidates = []
    for doc in range(len(corpus)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        feature_index = matrix.indices[start:stop]
        values = matrix.data[start:stop]
        candidates.append([(feature_names[i], s)
                           for i, s in zip(feature_index, values) if s != 0])

    return candidates

In [9]:
###sorting candidates based on 15 keywords
def get_top_candidates(candidates_list, number_keyphrases):
    """Keep the best-scored phrases of every document.

    Each document is a list of ``(phrase, score)`` pairs. The pairs are
    ordered by descending score (ties keep their original order) and the
    phrases of the first ``number_keyphrases`` pairs are returned.
    """
    def score_of(pair):
        return pair[1]

    best_candidates = []
    for doc in candidates_list:
        ranked = sorted(doc, key=score_of, reverse=True)
        top_phrases = []
        for phrase, _ in ranked[:number_keyphrases]:
            top_phrases.append(phrase)
        best_candidates.append(top_phrases)
    return best_candidates

In [10]:
###compare candidates to goldstandard
def extract_goldkeyphrase(gold_data):
    """Parse the gold-standard keyphrase file into one list per document.

    Each non-empty line is expected to look like ``<doc id> : key1,key2``.
    Keys are comma separated. A key containing ``+`` is split into its
    alternatives, and so is a key with a ``/`` that follows some whitespace
    (a key like ``a/v`` is kept whole).

    Args:
        gold_data: full content of the gold file as one string.

    Returns:
        A list of lists of keyphrase strings, in line order. Blank lines
        (such as the one after a trailing newline) are skipped.
    """
    r_plus = re.compile(r"^.*\+.*$")
    r_slash = re.compile(r"^.*\s.*\/.*$")

    gold_standard = []
    for line in gold_data.split('\n'):
        if not line.strip():
            # An empty line used to crash on key[0].
            continue

        # Take everything after the first ':' so ids of any length work;
        # fall back to the old fixed offset when there is no colon.
        _, separator, after_colon = line.partition(':')
        keys_field = after_colon if separator else line[6:]

        doc = []
        for key in keys_field.split(','):
            if key.startswith(' '):
                key = key[1:]
            if not key:
                continue
            # The alternative split is applied to every key, including one
            # that had a leading space (previously such keys skipped it).
            if r_plus.search(key):
                doc.extend(key.split('+'))
            elif r_slash.search(key):
                doc.extend(key.split('/'))
            else:
                doc.append(key)
        gold_standard.append(doc)
    return gold_standard

def calculate_fmeasure(candidates_list, gold_data, number_keyphrases=15):
    """Micro-averaged F-measure (in percent) of predicted vs. gold keyphrases.

    Args:
        candidates_list: per document, the list of predicted phrases.
        gold_data: per document, the list of gold phrases (same order).
        number_keyphrases: how many predictions are assumed per document;
            false positives are counted as ``number_keyphrases - matches``.
            Defaults to 15, the cutoff that was previously hard-coded.

    Returns:
        The F-measure times 100, rounded to two decimals. ``0.0`` is
        returned when nothing matches (previously a ZeroDivisionError).
    """
    all_matches = []
    for index in range(len(candidates_list)):
        gold = gold_data[index]
        # Every (candidate, gold) pair that is equal counts as one match.
        true_positive = sum(1 for element_candidate in candidates_list[index]
                            for element_goldkeyphrase in gold
                            if element_candidate == element_goldkeyphrase)
        # Filled in unconditionally so that a document without candidates
        # contributes zeros instead of None.
        all_matches.append({
            'tp': int(true_positive),
            'fp': int(number_keyphrases - true_positive),
            'fn': int(len(gold) - true_positive),
            'gold': len(gold),
        })

    true_positive = sum(doc['tp'] for doc in all_matches)
    false_positive = sum(doc['fp'] for doc in all_matches)
    false_negative = sum(doc['fn'] for doc in all_matches)

    predicted = false_positive + true_positive   # matched / total top n
    relevant = false_negative + true_positive    # matched / total gold standard
    precision = float(true_positive) / predicted if predicted else 0.0
    recall = float(true_positive) / relevant if relevant else 0.0

    if precision + recall == 0:
        return 0.0
    f_measure = float("{0:.2F}".format(2*(precision*recall)/(precision+recall)*100))
    return f_measure

In [11]:
def calculate_term_frequency(section):
    """Count every 1- to 5-gram of each text.

    N-grams that start or end with an English stop word are discarded and
    the remaining ones are Porter-stemmed word by word.

    Args:
        section: list of strings (whole documents or one section each).

    Returns:
        A list with one entry per input string; each entry is a list of
        ``(ngram, count)`` tuples for the n-grams occurring in that string.
    """
    stemmer = PorterStemmer()

    # eliminate ngram which starts or ends from stopwords
    class NewCountVectorizer(CountVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # Build the n-grams with stop words still in place ...
            tokens = super()._word_ngrams(tokens, None)
            if stop_words is None:
                return tokens
            # ... then keep only those not starting/ending with one.
            new_tokens = []
            for token in tokens:
                split_words = token.split(' ')
                if (len(token) <= 2 or split_words[0] in stop_words
                        or split_words[-1] in stop_words):
                    continue
                if len(split_words) == 1 and len(split_words[0]) > 2:
                    new_tokens.append(stemmer.stem(token))
                elif len(split_words) == 2 and split_words[-1] == "'":
                    continue  # dangling apostrophe
                elif len(split_words[0]) < 3 and len(split_words[1]) < 3:
                    continue  # starts with two very short words
                elif split_words[1] == "'" and split_words[2] == "s":
                    # possessive: stem the head word only
                    new_tokens.append(stemmer.stem(split_words[0]) + split_words[1] + split_words[2])
                else:
                    new_tokens.append(' '.join(stemmer.stem(word) for word in word_tokenize(token)))
            return new_tokens

    # Recent scikit-learn releases validate ``stop_words`` and only accept
    # a list (or 'english'/None), so the frozenset is converted.
    stop_words = sorted(text.ENGLISH_STOP_WORDS)

    count_vect = NewCountVectorizer(ngram_range=(1, 5), stop_words=stop_words,
                                    token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = count_vect.fit_transform(section).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = count_vect.get_feature_names_out()
    else:
        feature_names = count_vect.get_feature_names()

    # Read each CSR row directly instead of indexing matrix[doc, x] once per
    # stored value, which is very slow on a large n-gram vocabulary.
    ngrams = []
    for doc in range(len(section)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        feature_index = matrix.indices[start:stop]
        values = matrix.data[start:stop]
        ngrams.append([(feature_names[i], s)
                       for i, s in zip(feature_index, values) if s != 0])

    return ngrams

In [34]:
### Do not run this cell twice: it re-parses both corpora and overwrites every
### '*.pickle' file written below.
# NOTE: create_pickle() returns the result of pickle.dump(), i.e. None, so all
# pickle_* variables in this cell hold None; the data lives in the files only.
#----------------------------------------------------TF-IDF version
### Load training data: parse the papers, join them into strings, count n-grams
train_directory=glob.glob('./se_txt/train/*.txt.final')
train_raw=load_files(train_directory)
pickle_train_raw=create_pickle(train_raw,'txt train raw')
train_data=create_corpus(train_raw)
pickle_train_data=create_pickle(train_data,'txt train data')
train_tf_corpus=calculate_term_frequency(train_data)
pickle_train_tf_corpus=create_pickle(train_tf_corpus,'txt train tf corpus')


# Load gold keyphrases for training.
# Despite its name, train_label_directory holds the file *content* (a string).
# NOTE(review): the file object returned by open() is never closed explicitly.
train_label_directory=open('./se_txt/train/train.combined.stem.final', encoding='utf-8').read()
train_label=extract_goldkeyphrase(train_label_directory)
pickle_train_label=create_pickle(train_label, 'txt train label')

### Load testing data (same steps as for the training data)
test_directory=glob.glob('./se_txt/test/*.txt.final')
test_raw=load_files(test_directory)
pickle_test_raw=create_pickle(test_raw,'txt test raw')
test_data=create_corpus(test_raw)
pickle_test_data=create_pickle(test_data,'txt test data')
test_tf_corpus=calculate_term_frequency(test_data)
pickle_test_tf_corpus=create_pickle(test_tf_corpus,'txt test tf corpus')

# Gold keyphrases for testing (again the file content, not a directory).
test_label_directory=open('./se_txt/test_answer/test.combined.stem.final', encoding='utf-8').read()
test_label=extract_goldkeyphrase(test_label_directory)
pickle_test_label=create_pickle(test_label, 'txt test label')

#### N-gram version: every n-gram of a document is a candidate, scored by TF-IDF
print("N-gram TF-IDF version")
ngram_candidates=calculate_ngram_tfidf(train_data) 
pickle_ngram_candidates=create_pickle(ngram_candidates, 'txt ngram candidates')
# Unsupervised baseline (disabled): rank by TF-IDF and evaluate the top 15.
#ngram_top_keyphrases=get_top_candidates(ngram_candidates, 15)
#ngram_fmeasure=calculate_fmeasure(ngram_top_keyphrases, train_label)
#print("F-measure on training:", ngram_fmeasure)

test_ngram_candidates=calculate_ngram_tfidf(test_data)
pickle_test_ngram_candidates=create_pickle(test_ngram_candidates, 'txt test ngram candidates')
#test_ngram_top_candidates=get_top_candidates(test_ngram_candidates, 15)
#test_ngram_fmeasure=calculate_fmeasure(test_ngram_top_candidates, test_label)
#print("F-measure on testing:", test_ngram_fmeasure)


#### Noun phrase version: candidates are restricted to a noun-phrase vocabulary
print("Noun phrase TF-IDF version")
# NOTE(review): create_phrase_vocabulary is not defined in the cells shown in
# this notebook; extract_candidate(raw_data) returns a phrase vocabulary and is
# presumably the function meant here — confirm before a fresh "Run All".
nounphrase_vocabulary=create_phrase_vocabulary(train_data)
nounphrase_candidates=calculate_nounphrase_tfidf(train_data, nounphrase_vocabulary)
pickle_nounphrase_candidates=create_pickle(nounphrase_candidates, 'txt nounphrase candidates')
#nounphrase_top_keyphrases=get_top_candidates(nounphrase_candidates, 15)
#nounphrase_fmeasure=calculate_fmeasure(nounphrase_top_keyphrases, train_label)
#print("F-measure on training:", nounphrase_fmeasure)

# The test vocabulary is extracted from the test texts themselves.
test_nounphrase_vocabulary=create_phrase_vocabulary(test_data)
test_nounphrase_candidates=calculate_nounphrase_tfidf(test_data, test_nounphrase_vocabulary)
pickle_test_nounphrase_candidates=create_pickle(test_nounphrase_candidates, 'txt test nounphrase candidates')
#test_nounphrase_top_candidates=get_top_candidates(test_nounphrase_candidates, 15)
#test_nounphrase_fmeasure=calculate_fmeasure(test_nounphrase_top_candidates, test_label)
#print("F-measure on testing:", test_nounphrase_fmeasure)

N-gram TF-IDF version
Noun phrase TF-IDF version


In [12]:
def clean_corpus(corpus):
    """Normalize every document to a string of stemmed, lower-case words.

    A token is kept when it contains a run of letters/hyphens and is
    longer than two characters; the kept tokens are Porter-stemmed and
    joined with single spaces.
    """
    porter = PorterStemmer()
    wordlike = re.compile(r"\b[A-Za-z-]+\b")
    normalized = []
    for doc in corpus:
        kept = []
        for token in word_tokenize(doc.lower()):
            if wordlike.search(token) and len(token) > 2:
                kept.append(token)
        stems = [porter.stem(piece) for piece in " ".join(kept).split()]
        normalized.append(" ".join(stems))
    return normalized

In [13]:
def create_example(raw_data, corpus, candidates, label, tf_corpus):
    """Build the classifier feature matrix, one row per candidate phrase.

    Args:
        raw_data: parsed documents (dicts with 'title', 'abstract' and
            'introduction'), as returned by ``load_files``.
        corpus: list of full document strings, same order as ``raw_data``.
        candidates: per document, a list of tuples whose first item is the
            phrase and whose remaining items are existing scores
            (e.g. ``(phrase, tfidf)``).
        label: per document, the list of gold keyphrases.
        tf_corpus: per document, a list of ``(phrase, count)`` tuples.

    Returns:
        A flat list of rows (documents in order, candidates in order).
        Each row is the candidate's existing scores followed by:
        binary_title, frequency_title, binary_abstract, frequency_abstract,
        binary_introduction, frequency_introduction, length,
        term_frequency, supervised, first_occurrence, last_occurrence,
        distance, spread.

    ``candidates`` is left untouched. The features used to be appended to
    the caller's tuples in place, so a second call on the same list
    produced rows containing every feature twice.
    """

    def feature_structure(candidates, raw_data):
        """Presence (0/1) and count of each candidate in title/abstract/intro."""
        title = calculate_term_frequency([doc['title'] for doc in raw_data])
        abstract = calculate_term_frequency([' '.join(doc['abstract']) for doc in raw_data])
        introduction = calculate_term_frequency([' '.join(doc['introduction']) for doc in raw_data])
        feature = []
        for n_doc in range(len(candidates)):
            # One phrase -> count lookup per section, built once per document.
            sections = (dict(title[n_doc]), dict(abstract[n_doc]), dict(introduction[n_doc]))
            doc = []
            for cand in candidates[n_doc]:
                phrase = cand[0]
                row = []
                for counts in sections:
                    if phrase in counts:
                        # Exact lookup: the old substring test returned the
                        # count of the first n-gram contained in the phrase
                        # rather than the count of the phrase itself.
                        row.extend((1, counts[phrase]))
                    else:
                        row.extend((0, 0))
                doc.append(tuple(row))
            feature.append(doc)
        return feature

    def feature_candidate_length(candidates):
        """Length of each candidate phrase in characters."""
        return [[len(cand[0]) for cand in doc] for doc in candidates]

    def feature_frequency(candidates, label, tf_corpus):
        """Term frequency of each candidate, plus the same value gated on
        the phrase being a gold keyphrase of any document ('supervised')."""
        gold_phrases = set(itertools.chain.from_iterable(label))
        feature = []
        for n_doc in range(len(candidates)):
            # Looked up by phrase, so tf_corpus no longer has to list the
            # same phrases in the same order as ``candidates``.
            counts = dict(tf_corpus[n_doc])
            doc = []
            for cand in candidates[n_doc]:
                cand_freq = counts.get(cand[0], 0)
                supervised = cand_freq if cand[0] in gold_phrases else 0
                doc.append((cand_freq, supervised))
            feature.append(doc)
        return feature

    def feature_occurence(candidates, corpus):
        """First/last occurrence (in words), relative first position, spread."""
        feature = []
        cleaned_corpus = clean_corpus(corpus)
        for n_doc in range(len(candidates)):
            text_doc = cleaned_corpus[n_doc]
            lowered = text_doc.lower()
            corpus_words = len(text_doc.split(" "))
            doc = []
            for cand in candidates[n_doc]:
                # NOTE(review): find()/rfind() work on characters and return
                # -1 when the phrase is absent; the slices below then cover
                # almost the whole text. Behaviour kept as it was — confirm
                # the intended value for phrases that are not found.
                first_index = lowered.find(cand[0])
                last_index = lowered.rfind(cand[0])
                preceding_words = len(text_doc[:first_index].split(" ")) - 1
                following_words = len(text_doc[:last_index].split(" ")) - 1
                distance = float("{0:.2F}".format(preceding_words / corpus_words))
                spread = len(text_doc[first_index:last_index].split(" ")) - 1
                doc.append((preceding_words, following_words, distance, spread))
            feature.append(doc)
        return feature

    structure = feature_structure(candidates, raw_data)
    lengths = feature_candidate_length(candidates)
    frequency = feature_frequency(candidates, label, tf_corpus)
    occurrence = feature_occurence(candidates, corpus)

    # Assemble the rows: existing scores first (the phrase itself is left
    # out), then the features in the documented order.
    x_data = []
    for n_doc in range(len(candidates)):
        for n_candidate in range(len(candidates[n_doc])):
            row = list(candidates[n_doc][n_candidate][1:])
            row.extend(structure[n_doc][n_candidate])
            row.append(lengths[n_doc][n_candidate])
            row.extend(frequency[n_doc][n_candidate])
            row.extend(occurrence[n_doc][n_candidate])
            x_data.append(row)
    return x_data

In [14]:
##create label for training or testing
def create_label(candidates, label):
    """Produce the 0/1 target vector for the candidates of all documents.

    A candidate gets 1 when its phrase (first tuple item) is among the gold
    keyphrases of its own document, otherwise 0. The result is flat:
    documents in order, candidates in order.
    """
    return [
        1 if entry[0] in list(label[doc_index]) else 0
        for doc_index, doc_candidates in enumerate(candidates)
        for entry in doc_candidates
    ]

In [15]:
def probability_to_fmeasure(predict_proba, candidates, labels, models):
    """Print the F-measure reached by each model's predicted probabilities.

    ``predict_proba[m]`` is a flat sequence of scores, one per candidate in
    document order. The scores are paired back with the candidate phrases,
    the 15 best per document are selected and compared with ``labels``.

    Returns:
        The string ``'finish'``; the scores themselves are only printed.
    """
    for model_index, scores in enumerate(predict_proba):
        scored_docs = []
        offset = 0  # position of the current document's first score
        for doc_candidates in candidates:
            size = len(doc_candidates)
            scored_docs.append([(doc_candidates[k][0], scores[offset + k])
                                for k in range(size)])
            offset += size
        fmeasure = calculate_fmeasure(get_top_candidates(scored_docs, 15), labels)
        print("Model %s: %.3f" % (models[model_index][0], fmeasure))
    return 'finish'

In [32]:
def predict_data(x_train, y_train, x_test, y_test, candidates, labels):
    """Train the classifiers and report keyphrase F-measures.

    First a leave-one-feature-out study is printed: for every feature
    column the models are trained and evaluated without that column. Then
    the models are trained on all features.

    Args:
        x_train, x_test: feature rows (lists) as built by ``create_example``.
        y_train: 0/1 targets for ``x_train``.
        y_test: accepted for symmetry with the training arguments; unused.
        candidates: per test document, tuples whose first item is the phrase
            (same order as the rows of ``x_test``).
        labels: per test document, the gold keyphrases.

    Returns:
        A list of ``(model_name, f_measure)`` for the full feature set.
    """
    seed = 7  # arbitrary, fixed so that repeated runs give the same forest
    # Further classifiers can be added as (name, estimator) pairs; each
    # must provide predict_proba().
    models = [('RF', RF(n_estimators=20, max_depth=10, random_state=seed))]

    print("Take one feature out")
    for feature in range(len(x_train[0])):
        print("Remove feature number", feature+1)

        modified_train = [doc[:feature] + doc[feature+1:] for doc in x_train]
        modified_test = [doc[:feature] + doc[feature+1:] for doc in x_test]

        predict_proba = []
        for name, model in models:
            # Fit and predict on the reduced matrices. The full x_train and
            # x_test were used here before, so dropping a feature had no
            # effect on the reported numbers.
            predict_proba.append(
                model.fit(modified_train, y_train).predict_proba(modified_test)[:, 1])

        # Prints one line per model.
        probability_to_fmeasure(predict_proba, candidates, labels, models)

    all_predict_proba = []
    for name, model in models:
        all_predict_proba.append(model.fit(x_train, y_train).predict_proba(x_test)[:, 1])

    print("Fmeasure on full features:")
    all_fmeasure = []
    for model in range(len(all_predict_proba)):
        probability = []
        counter = 0  # running index into the flat probability vector
        for n_doc in range(len(candidates)):
            doc = []
            for n_cand in range(len(candidates[n_doc])):
                doc.append((candidates[n_doc][n_cand][0], all_predict_proba[model][counter]))
                counter += 1
            probability.append(doc)
        fmeasure = calculate_fmeasure(get_top_candidates(probability, 15), labels)
        all_fmeasure.append((models[model][0], fmeasure))
    return all_fmeasure

In [None]:
# Scratch check: run the noun-phrase extractor on the files of a dummy folder
# and print the resulting vocabulary.
# NOTE(review): this rebinds train_directory / train_raw / train_data, the same
# names the main TF-IDF pipeline cell assigns; running this cell afterwards
# replaces the real training data with the dummy data.
train_directory=glob.glob('./se_txt/train/dummy/dump/*.txt.final')
train_raw=load_files(train_directory)
train_data=create_corpus(train_raw)
print(extract_candidate(train_data))

In [21]:
#CREATE DUMMY FIRST
def extract_candidate(raw_data):
    """Return the distinct stemmed phrases chunked out of ``raw_data``.

    Each text is tokenized, POS-tagged and chunked with the three-rule
    grammar below. Every run of consecutive chunked tokens becomes one
    phrase: its words are Porter-stemmed, joined with spaces and
    lower-cased. Phrases of at most two characters, or consisting only of
    punctuation, are dropped. The order of the returned list is not
    defined (it comes from a set).
    """
    grammar=r'''CNP: {(((<JJ.*>|<NN.*>)+ | ((<JJ.*>|<NN.*>)* (<IN>)?) (<JJ.*>|<NN.*>)*) <NN.*>)}
            NN: {<NN.*>+ <NN.*>}
            NP: {(<JJ.*>|<NN.*>)+ <NN.*>}
        '''

    porter = PorterStemmer()
    punctuation_chars = set(string.punctuation)
    parser = RegexpParser(grammar)

    # POS-tag every tokenized text.
    tagged_texts = pos_tag_sents(word_tokenize(sent) for sent in raw_data)

    # One flat list of (word, pos, IOB-tag) rows over all texts.
    iob_rows = []
    for tagged in tagged_texts:
        iob_rows.extend(tree2conlltags(parser.parse(tagged)))

    def inside_chunk(row):
        word, pos, chunk = row
        return chunk != 'O'

    # Consecutive rows inside a chunk are merged into one phrase.
    phrases = []
    for is_chunk, rows in itertools.groupby(iob_rows, inside_chunk):
        if not is_chunk:
            continue
        phrases.append(' '.join(porter.stem(word) for word, pos, chunk in rows).lower())

    # Discard very short and punctuation-only phrases.
    kept = []
    for phrase in phrases:
        if len(phrase) > 2 and not all(char in punctuation_chars for char in phrase):
            kept.append(phrase)

    return list(set(kept))

    #return noun_phrases



['approach', 'power budget', 'ip core', 'point pi', 'direct of arriv differ', 'wireless network', 'from resourc monitor', 'weapon classif in addit', 'a/v', 'voltag requir', 'simpl high resolut grid search', 'combin', 'power at base station', 'sensor pair', 'true for constant projectil speed', 'signal process core', 'sensor platform', 'as corba servant', 'address', 'system perform', 'bound on applic resourc util', 'video captur', 'recommend frame rate', 'shockwav', 'system resourc util', 'jitter', '|ti − tj| < = |pi − pj |/c + ε where c', 'detect logic', 'tao', 'task of adapt resourc manag', 'second zc', 'differ sensor', 't_start', 'discret dynam', 'desir upper bound on resourc util', 'in pure java', 'thin adapt layer', 'ak-47 m240', 'v. sinc p2', 'within b', '+ tao', 'if calib avail comput weapon type', 'with ieee', 'resembl hardwar build block', 'by time of arriv t1 < t2 < t3', 'central scenario', 'hard qo requir', 'set point', 'line', 'at posit p1 with time t1', 'measur direct', 'ava

In [159]:
from itertools import chain

t=["machine learning","learning","bath","swim","swimming","swimming pool","very good view swimming pool",
   "good view swimming pool","high level machine learning","banana", "chocolate banana","cold chocolate banana milkshake",
  "good beautiful shape playing house","shape playing house","milkshake","banana milkshake", "green banana",
  "hot green banana","addictive machine learning","exciting machine learning", "shared playing house",
   "highly shared playing house"]
# Phrases with the most words come first; the sort is stable, so phrases with
# the same word count keep their original relative order. Every phrase is
# therefore visited after all phrases that have more words than it.
t.sort(key= lambda s: len(s.split(" ")), reverse=True)
print(t)

# Build one chain per "top-level" phrase: the phrase itself followed by each of
# its shorter word-suffixes, e.g.
#   'hot green banana' -> ['hot green banana', 'green banana', 'banana'].
# A phrase starts a chain only when it does not occur inside any phrase that
# precedes it in the sorted list. One rule now covers every phrase length
# instead of a separate copy-pasted branch per word count.
phrase_chains=[]
rest=[]  # not filled in this cell; kept so the name stays defined
for n_cand, candidate in enumerate(t):
    # NOTE: `in` is a character-level substring test, so e.g. 'swim' counts as
    # contained in 'very good view swimming pool' and gets no chain of its own.
    if any(candidate in earlier for earlier in t[:n_cand]):
        continue
    words = candidate.split(" ")
    # Deliberately NOT named `chain`: binding that name here would replace the
    # imported itertools.chain and break later `chain.from_iterable(...)` calls.
    suffixes = [' '.join(words[i:]) for i in range(len(words))]
    phrase_chains.append(suffixes)

print(phrase_chains)
[ 'high level machine learning',
 'cold chocolate banana milkshake', 'highly shared playing house', 'shape playing house', 'hot green banana', 
 'addictive machine learning', 'exciting machine learning', 'shared playing house', 'machine learning',
 'chocolate banana', 'banana milkshake', 'green banana', 'learning', 'bath', 'swim', 'swimming', 'banana', 'milkshake']

['very good view swimming pool', 'good beautiful shape playing house', 'good view swimming pool', 'high level machine learning', 'cold chocolate banana milkshake', 'highly shared playing house', 'shape playing house', 'hot green banana', 'addictive machine learning', 'exciting machine learning', 'shared playing house', 'machine learning', 'swimming pool', 'chocolate banana', 'banana milkshake', 'green banana', 'learning', 'bath', 'swim', 'swimming', 'banana', 'milkshake']
[['very good view swimming pool', 'good view swimming pool', 'view swimming pool', 'swimming pool', 'pool'], ['good beautiful shape playing house', 'beautiful shape playing house', 'shape playing house', 'playing house', 'house'], ['high level machine learning', 'level machine learning', 'machine learning', 'learning'], ['cold chocolate banana milkshake', 'chocolate banana milkshake', 'banana milkshake', 'milkshake'], ['highly shared playing house', 'shared playing house', 'playing house', 'house'], ['hot green banana

In [None]:
##merge some related phrases into chains
from itertools import chain

t=["machine learning","learning","bath","swim","swimming","swimming pool","very good view swimming pool",
   "good view swimming pool","high level machine learning","banana", "chocolate banana","cold chocolate banana milkshake",
  "good beautiful shape playing house","shape playing house","milkshake","banana milkshake", "green banana",
  "hot green banana","addictive machine learning","exciting machine learning", "shared playing house",
   "highly shared playing house"]
# Stable sort by word count, largest first, so a phrase is always processed
# after every phrase that has more words than it.
t.sort(key= lambda s: len(s.split(" ")), reverse=True)
print(t)


def suffix_chain(phrase):
    """Return ``phrase`` followed by each of its shorter word-suffixes.

    Example: 'hot green banana' -> ['hot green banana', 'green banana', 'banana'].
    The phrase is split on single spaces, matching the sort key used above.
    """
    words = phrase.split(" ")
    return [' '.join(words[start:]) for start in range(len(words))]


# A phrase opens a new chain unless it already occurs inside a phrase seen
# earlier in the sorted list. This single rule replaces the per-length
# branches, and nothing here is assigned to the name `chain`, so the imported
# itertools.chain stays usable after this cell has run.
phrase_chains=[]
rest=[]  # not filled in this cell; kept so the name stays defined
for position, phrase in enumerate(t):
    # NOTE: character-level substring test ('swim' matches inside 'swimming').
    already_nested = any(phrase in earlier for earlier in t[:position])
    if not already_nested:
        phrase_chains.append(suffix_chain(phrase))
print(phrase_chains)

In [4]:
#test dictionary
l=[
    ['very good view swimming pool', 'good view swimming pool', 'view swimming pool', 'swimming pool', 'pool'],
    ['good beautiful shape playing house', 'beautiful shape playing house', 'shape playing house',
     'playing house', 'house'],
    ['high level machine learning', 'level machine learning', 'machine learning', 'learning'],
    ['cold chocolate banana milkshake', 'chocolate banana milkshake', 'banana milkshake', 'milkshake'],
    ['highly shared playing house', 'shared playing house', 'playing house', 'house'],
    ['hot green banana', 'green banana', 'banana'],
    ['addictive machine learning', 'machine learning', 'learning'],
    ['exciting machine learning', 'machine learning', 'learning'],
    ['bath'],
]
# One dictionary per chain, mapping each phrase to its 1-based position in
# that chain (the first, longest phrase gets 1).
al=[{phrase: rank for rank, phrase in enumerate(el, 1)} for el in l]
print(al)

[{'swimming pool': 4, 'very good view swimming pool': 1, 'pool': 5, 'good view swimming pool': 2, 'view swimming pool': 3}, {'playing house': 4, 'beautiful shape playing house': 2, 'good beautiful shape playing house': 1, 'shape playing house': 3, 'house': 5}, {'learning': 4, 'machine learning': 3, 'high level machine learning': 1, 'level machine learning': 2}, {'chocolate banana milkshake': 2, 'milkshake': 4, 'banana milkshake': 3, 'cold chocolate banana milkshake': 1}, {'playing house': 3, 'house': 4, 'highly shared playing house': 1, 'shared playing house': 2}, {'green banana': 2, 'banana': 3, 'hot green banana': 1}, {'learning': 3, 'machine learning': 2, 'addictive machine learning': 1}, {'learning': 3, 'machine learning': 2, 'exciting machine learning': 1}, {'bath': 1}]


In [145]:
t=['very good view swimming pool', 'good beautiful shape playing house','playing house',"hehe"]
h=[3,4]
# Copy of the two phrases that come before the candidate t[2].
tt=list(t[:2])
print(tt)
# Print the candidate only if it is not a substring of any preceding phrase.
if not any(t[2] in word for word in t[:2]):
    print(t[2])

['very good view swimming pool', 'good beautiful shape playing house']


In [103]:
##calculate c_value DONE
import math

uniq_nested_number=0
freq_a=float(7)
# Base-2 logarithm of 4, computed by change of base from the natural log.
log2_a=math.log(float(4))/math.log(float(2))

if uniq_nested_number!=0:
    # Subtract the nested frequency scaled by 1/uniq_nested_number before
    # weighting by the logarithm.
    freq_nested=float(0)
    inv_uniq_nested=float(1)/float(uniq_nested_number)
    c_value=log2_a*(freq_a - inv_uniq_nested * freq_nested)
else:
    # No nested count: the value is just the logarithm times the frequency.
    c_value=log2_a*freq_a

print(c_value)

14.0
