In [None]:
from __future__ import print_function

import glob, os, re, string, itertools
import logging
from collections import defaultdict
from itertools import chain
from time import time

import numpy as np
from nltk.stem.porter import *
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from scipy.sparse.csr import csr_matrix
from sklearn import svm                                       #library for creating the classifier, SVM
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
def clean(input_list):
    """Remove citation markers, figure/table references and URLs from raw lines.

    Parameters
    ----------
    input_list : iterable of str
        Raw text lines, e.g. the result of ``file.readlines()``.

    Returns
    -------
    list of str
        One cleaned string per input line, in input order.
    """
    # Patterns are applied in order and every match is deleted. They are raw
    # strings so that regex escapes such as \. \[ \s are not parsed as
    # (invalid) Python string escapes.
    patterns = (
        r"(\.)?\n",                        # trailing newline, plus a period right before it
        r"('s)",                           # possessive 's
        r"\[([0-9]{1,2}\,?\s?)+\]",        # bracketed citations: [2], [2, 13]
        r"\(([0-9]{1,2}\,?\s?)+\)",        # parenthesised numbers: (2), (2, 13)
        # Figure/table references: "Fig. 2", "fig 2", "Figure 12", "Tab. 3", "Table 4".
        # \b keeps the pattern from eating the tail of other words
        # ("config 12" must not become "con"); the dot is literal and optional.
        r"\b([Ff]ig\.?|[Ff]igure|[Tt]ab\.?|[Tt]able)\s?[0-9]{1,2}",
        r"\b((https?://|www\.)[^\s]+)",    # URLs (http://, https://, www.)
    )
    compiled = [re.compile(pattern) for pattern in patterns]

    result = []
    for line in input_list:
        cleaned = line
        for pattern in compiled:
            cleaned = pattern.sub('', cleaned)
        result.append(cleaned)
    return result

In [None]:
def load_files(path):
    """Parse ``*.txt.final`` paper files into one dict per document.

    Each file is read, passed through ``clean`` and split into sections using
    the header lines 'ABSTRACT', 'Categories and Subject Descriptors',
    '1 <INTRO HEADER>', '2 <NEXT HEADER>' and the references header.

    Parameters
    ----------
    path : iterable of str
        File paths to parse (e.g. the list returned by ``glob.glob``).

    Returns
    -------
    list of dict
        One dict per file with the keys

        * ``doc_id``       - file name without the ``.txt.final`` suffix
        * ``title``        - first line, extended by the second line when most
                             of its words are known to WordNet
        * ``abstract``     - list of lines between the abstract header and the
                             category header
        * ``introduction`` - list of lines of section 1
        * ``full_text``    - title + abstract + introduction + body lines;
                             also stored under the legacy key ``full-text``

    Raises
    ------
    ValueError
        If the 'ABSTRACT' or 'Categories and Subject Descriptors' line is missing.
    IndexError
        If the file has fewer than two lines, or the introduction, second
        section or references header cannot be found.
    """
    suffix = '.txt.final'

    # Section headers; numbering style is inconsistent across files.
    r_intro = re.compile(r"^1\.?\s[A-Z]+")
    r_after_intro = re.compile(r"^2\.?\s[A-Z]+")
    # "8. REFERENCES", "8 References", "REFERENCES". The alternation must be a
    # group: a character class [EFERENCES|eferences] matches a single letter and
    # would also hit headers such as "3. RELATED WORK".
    r_ref = re.compile(r"^\s*(?:[0-9]{1,2}\.?\s?)?R(?:EFERENCES|eferences)\b")

    def first_index(pattern, lines, section, file_name):
        # Index of the first line matching pattern; IndexError names the file.
        for position, item in enumerate(lines):
            if pattern.search(item):
                return position
        raise IndexError("no %s header found in %s" % (section, file_name))

    raw = []
    for file in path:
        dict_doc = {'doc_id': None, 'title': None, 'abstract': None, 'introduction': None, 'full-text': None}

        # Remove the suffix itself; str.rstrip would strip any trailing run of
        # the characters ".txfinal" and mangle names such as "final.txt.final".
        base_name = os.path.basename(file)
        dict_doc['doc_id'] = base_name[:-len(suffix)] if base_name.endswith(suffix) else base_name

        with open(file, encoding='utf-8') as handle:
            source = clean(handle.readlines())

        # ---- title detection ------------------------------------------------
        # The title may wrap onto the second line; otherwise the second line
        # holds author names. Treat it as title text when most of its words
        # are found in WordNet.
        beginning = source[0].replace("\n", "")
        candidate = source[1].replace("\n", "")
        h_candidate = word_tokenize(candidate.replace("-", ' '))
        known_words = sum(1 for word in h_candidate if wordnet.synsets(word))
        unknown_words = len(h_candidate) - known_words
        if known_words > unknown_words:
            newtitle = beginning + ' ' + candidate
        else:
            newtitle = beginning
        dict_doc['title'] = newtitle

        # ---- section boundaries ---------------------------------------------
        content = source[2:]
        in_abstract = content.index('ABSTRACT')
        in_authorkey = content.index('Categories and Subject Descriptors')
        in_intro = first_index(r_intro, content, 'introduction', file)
        in_after_intro = first_index(r_after_intro, content, 'second section', file)
        in_ref = first_index(r_ref, content, 'references', file)

        abstract = content[in_abstract+1:in_authorkey]  # stops before categories/keywords
        intro = content[in_intro+1:in_after_intro]
        body = content[in_after_intro+1:in_ref]         # stops before the references

        full_text = list(chain([newtitle], abstract, intro, body))

        dict_doc['abstract'] = abstract
        dict_doc['introduction'] = intro
        dict_doc['full_text'] = full_text   # read by create_corpus
        dict_doc['full-text'] = full_text   # legacy spelling of the same key

        raw.append(dict_doc)
    return raw

# Smoke test of load_files on the files under './se_txt/train/dummy/'.
# NOTE(review): train_directory and train_raw are rebound by the later
# TF-IDF pipeline cell, which globs './se_txt/train/*.txt.final' instead.
train_directory=glob.glob('./se_txt/train/dummy/*.txt.final')
train_raw=load_files(train_directory)
# Dumps every parsed document dict to the cell output.
print(train_raw)

In [None]:
def create_corpus(raw_data):
    """Turn parsed documents into the list-of-strings corpus the vectorizers take.

    Each document's ``'full_text'`` entry (an iterable of strings) is joined
    with single spaces into one string; the output keeps the input order,
    e.g. ``[{'full_text': ['a', 'b']}]`` becomes ``['a b']``.
    """
    return [' '.join(document['full_text']) for document in raw_data]

In [None]:
def calculate_ngram_tfidf(corpus):
    """Score the stemmed 1- to 5-gram candidates of every document with TF-IDF.

    N-grams that start or end with an English stop word are discarded; stop
    words inside an n-gram are kept.

    Parameters
    ----------
    corpus : list of str
        One string per document.

    Returns
    -------
    list of list of (str, float)
        For each document, the ``(ngram, tfidf)`` pairs with a non-zero score,
        in vocabulary (column) order.
    """
    stemmer = PorterStemmer()

    # Adapted from https://stackoverflow.com/questions/49746555/sklearn-tfidfvectorizer-generate-custom
    # -ngrams-by-not-removing-stopword-in-the/49775000#49775000
    class NewTfidfVectorizer(TfidfVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # Build the n-grams from every token (stop words included) ...
            tokens = super(NewTfidfVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is None:
                return tokens
            # ... then drop n-grams whose first or last word is a stop word and
            # stem the survivors.
            new_tokens = []
            for token in tokens:
                split_words = token.split(' ')
                if len(token) <= 2 or split_words[0] in stop_words or split_words[-1] in stop_words:
                    continue
                if len(split_words) == 1:
                    # single word; len(token) > 2 is already guaranteed here
                    new_tokens.append(stemmer.stem(token))
                elif len(split_words) == 2 and split_words[-1] == "'":
                    continue  # dangling apostrophe
                elif len(split_words[0]) < 3 and len(split_words[1]) < 3:
                    continue  # starts with two very short words
                elif split_words[1] == "'" and split_words[2] == "s":
                    new_tokens.append(stemmer.stem(split_words[0]) + split_words[1] + split_words[2])
                else:
                    new_tokens.append(' '.join(stemmer.stem(word) for word in word_tokenize(token)))
            return new_tokens

    # Recent scikit-learn validates stop_words as 'english', a list or None,
    # so the frozenset is handed over as a (sorted, deterministic) list.
    stop_words = sorted(text.ENGLISH_STOP_WORDS)

    tfidf = NewTfidfVectorizer(ngram_range=(1, 5), stop_words=stop_words,
                               token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = tfidf.fit_transform(corpus).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on releases that predate it.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
    feature_names = get_names()

    # Read every row straight from the CSR buffers instead of indexing the
    # sparse matrix one element at a time.
    candidates = []
    for doc in range(len(corpus)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        candidates.append([
            (feature_names[column], score)
            for column, score in zip(matrix.indices[start:stop], matrix.data[start:stop])
            if score != 0
        ])

    return candidates

In [None]:
#generate noun phrases based on corpus
def create_phrase_vocabulary(raw_data):
    """Collect the distinct stemmed noun phrases found in ``raw_data``.

    Parameters
    ----------
    raw_data : iterable of str
        Texts to chunk; each element is tokenized and POS-tagged as one unit.

    Returns
    -------
    list of str
        Lower-cased, Porter-stemmed noun phrases, de-duplicated and sorted so
        that the vocabulary (and therefore the column order of any vectorizer
        built from it) is identical on every run.
    """
    stemmer = PorterStemmer()

    # Grammar from http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
    # (optional "adjectives nouns preposition" prefix, then adjectives + nouns).
    grammar = r'NP: {(<JJ.*>* <NN.*>+ <IN>)? (<JJ.*>* <NN.*>+)+}'

    punct = set(string.punctuation)
    chunker = RegexpParser(grammar)

    # Tokenize and POS-tag every element of raw_data.
    postag_sents = pos_tag_sents(word_tokenize(sent) for sent in raw_data)
    # Flat list of (word, pos, IOB-chunk-tag) triples over all inputs.
    noun_phrases = list(chain.from_iterable(
        tree2conlltags(chunker.parse(tagged_sent)) for tagged_sent in postag_sents))

    # Join each run of consecutive non-'O' triples into one stemmed phrase.
    # NOTE(review): two noun phrases that directly follow each other form a
    # single run here and are therefore merged into one phrase.
    merged_nounphrase = [
        ' '.join(stemmer.stem(word) for word, pos, chunk in group).lower()
        for inside_phrase, group in itertools.groupby(noun_phrases, lambda triple: triple[2] != 'O')
        if inside_phrase
    ]

    # Drop very short candidates and candidates made of punctuation only.
    all_nounphrases = [cand for cand in merged_nounphrase
                       if len(cand) > 2 and not all(char in punct for char in cand)]

    # Distinct phrases in a stable order (a bare set has no defined order).
    return sorted(set(all_nounphrases))

In [None]:
def calculate_nounphrase_tfidf(corpus, voc):
    """Score the phrases of a fixed vocabulary with TF-IDF for every document.

    Parameters
    ----------
    corpus : list of str
        One string per document.
    voc : list of str
        Stemmed phrases to score (e.g. from ``create_phrase_vocabulary``).

    Returns
    -------
    list of list of (str, float)
        For each document, the ``(phrase, tfidf)`` pairs with a non-zero score,
        in vocabulary (column) order.
    """
    stemmer = PorterStemmer()

    # English stop words, except the prepositions that may sit inside a phrase.
    stop_word_set = set(text.ENGLISH_STOP_WORDS).difference(['of', 'in', 'on', 'for'])

    class StemmedTfidfVectorizer(TfidfVectorizer):
        def build_tokenizer(self):
            # Default tokenizer, then stop-word removal and stemming per token.
            tokenizer = super(StemmedTfidfVectorizer, self).build_tokenizer()
            return lambda doc: (stemmer.stem(token) for token in tokenizer(doc)
                                if token not in stop_word_set)

    # Recent scikit-learn validates stop_words as 'english', a list or None,
    # so the set is handed over as a (sorted, deterministic) list.
    tfidf = StemmedTfidfVectorizer(ngram_range=(1, 5), stop_words=sorted(stop_word_set),
                                   vocabulary=voc, token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = tfidf.fit_transform(corpus).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on releases that predate it.
    get_names = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
    feature_names = get_names()

    # Read every row straight from the CSR buffers instead of indexing the
    # sparse matrix one element at a time.
    candidates = []
    for doc in range(len(corpus)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        candidates.append([
            (feature_names[column], score)
            for column, score in zip(matrix.indices[start:stop], matrix.data[start:stop])
            if score != 0
        ])

    return candidates

In [None]:
###sorting candidates based on 15 keywords
def get_top_candidates(candidates_list, number_keyphrases):
    """Keep the names of the highest-scoring candidates of every document.

    ``candidates_list`` holds, per document, a list of ``(name, score)`` pairs.
    Each list is ordered by descending score (ties keep their input order),
    cut to ``number_keyphrases`` entries and reduced to the names.
    """
    def score_of(pair):
        return pair[1]

    return [
        [name for name, _ in sorted(doc, key=score_of, reverse=True)[:number_keyphrases]]
        for doc in candidates_list
    ]

In [None]:
###compare candidates to goldstandard
def extract_goldkeyphrase(gold_data):
    """Parse the gold-standard keyphrase file content into per-document lists.

    Every line is expected to carry a 6-character prefix followed by a
    comma-separated list of keyphrases.
    NOTE(review): the prefix is presumably the document id plus " :" -
    confirm against the label files.

    Per keyphrase:

    * a single leading space is removed (the phrase is then kept as is);
    * otherwise a phrase containing '+' is split on '+';
    * otherwise a phrase containing whitespace followed later by '/' is
      split on '/';
    * empty entries (blank lines, trailing commas) are skipped.

    Parameters
    ----------
    gold_data : str
        Whole content of the label file.

    Returns
    -------
    list of list of str
        One list of keyphrases per line, in file order. A trailing newline
        does not produce an extra document.
    """
    r_plus = re.compile(r"^.*\+.*$")
    r_slash = re.compile(r"^.*\s.*\/.*$")

    gold_standard = []
    # splitlines() does not yield a final empty line for text ending in a
    # newline and also copes with "\r\n" line ends.
    for line in gold_data.splitlines():
        doc = []
        for key in line[6:].split(','):
            if not key:
                continue  # nothing between two commas / empty line
            if key[0] == ' ':
                doc.append(key[1:])
            elif r_plus.search(key):
                doc.extend(key.split('+'))
            elif r_slash.search(key):
                doc.extend(key.split('/'))
            else:
                doc.append(key)
        gold_standard.append(doc)
    return gold_standard

def calculate_fmeasure(candidates_list, gold_data, number_keyphrases=15):
    """Micro-averaged F1 (in percent, two decimals) of predicted keyphrases.

    Parameters
    ----------
    candidates_list : list of list of str
        Predicted keyphrases per document.
    gold_data : list of list of str
        Gold keyphrases per document, aligned with ``candidates_list``.
    number_keyphrases : int, optional
        Number of predictions each document is scored against; predictions
        missing from that budget count as false positives. Defaults to 15.

    Returns
    -------
    float
        ``100 * 2PR / (P + R)`` rounded to two decimals; ``0.0`` when nothing
        matches (including empty input).

    Raises
    ------
    IndexError
        If ``gold_data`` has fewer documents than ``candidates_list``.
    """
    true_positive = 0
    false_positive = 0
    false_negative = 0
    for index in range(len(candidates_list)):
        # Sets make every phrase count once, so a gold phrase listed twice
        # cannot be matched twice by the same prediction.
        predicted = set(candidates_list[index])
        gold = set(gold_data[index])
        matched = len(predicted & gold)

        true_positive += matched
        false_positive += max(number_keyphrases - matched, 0)
        false_negative += len(gold) - matched

    # No match at all: precision and recall are 0 (or undefined), F1 is 0.
    if true_positive == 0:
        return 0.0

    # matched / total predicted
    precision = float(true_positive) / (false_positive + true_positive)
    # matched / total gold standard
    recall = float(true_positive) / (false_negative + true_positive)
    f_measure = float("{0:.2F}".format(2 * (precision * recall) / (precision + recall) * 100))
    return f_measure

In [None]:
def calculate_term_frequency(section):
    """Count the stemmed 1- to 5-grams of every text in ``section``.

    N-grams that start or end with an English stop word are discarded; stop
    words inside an n-gram are kept.

    Parameters
    ----------
    section : list of str
        One string per document (title, abstract, full text, ...).

    Returns
    -------
    list of list of (str, int)
        For each document, the ``(ngram, count)`` pairs with a non-zero count,
        in vocabulary (column) order.
    """
    stemmer = PorterStemmer()

    class NewCountVectorizer(CountVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # Build the n-grams from every token (stop words included) ...
            tokens = super(NewCountVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is None:
                return tokens
            # ... then drop n-grams whose first or last word is a stop word and
            # stem the survivors.
            new_tokens = []
            for token in tokens:
                split_words = token.split(' ')
                if len(token) <= 2 or split_words[0] in stop_words or split_words[-1] in stop_words:
                    continue
                if len(split_words) == 1:
                    # single word; len(token) > 2 is already guaranteed here
                    new_tokens.append(stemmer.stem(token))
                elif len(split_words) == 2 and split_words[-1] == "'":
                    continue  # dangling apostrophe
                elif len(split_words[0]) < 3 and len(split_words[1]) < 3:
                    continue  # starts with two very short words
                elif split_words[1] == "'" and split_words[2] == "s":
                    new_tokens.append(stemmer.stem(split_words[0]) + split_words[1] + split_words[2])
                else:
                    new_tokens.append(' '.join(stemmer.stem(word) for word in word_tokenize(token)))
            return new_tokens

    # Recent scikit-learn validates stop_words as 'english', a list or None,
    # so the frozenset is handed over as a (sorted, deterministic) list.
    stop_words = sorted(text.ENGLISH_STOP_WORDS)

    count_vect = NewCountVectorizer(ngram_range=(1, 5), stop_words=stop_words,
                                    token_pattern=r"(?u)\b[A-Za-z-]+\b")

    matrix = count_vect.fit_transform(section).tocsr()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back on releases that predate it.
    get_names = getattr(count_vect, 'get_feature_names_out', None) or count_vect.get_feature_names
    feature_names = get_names()

    # Read every row straight from the CSR buffers instead of indexing the
    # sparse matrix one element at a time.
    ngrams = []
    for doc in range(len(section)):
        start, stop = matrix.indptr[doc], matrix.indptr[doc + 1]
        ngrams.append([
            (feature_names[column], count)
            for column, count in zip(matrix.indices[start:stop], matrix.data[start:stop])
            if count != 0
        ])

    return ngrams

In [None]:
#----------------------------------------------------TF-IDF version
# Parses the train/test papers, extracts TF-IDF candidates (n-gram and noun
# phrase variants) and stores every intermediate result as '<name>.pickle'.
# NOTE(review): create_pickle is defined in a later cell of this notebook, so
# this cell raises NameError unless that cell has been run first.
# NOTE(review): the pickle_* names hold create_pickle's return value, i.e. the
# result of pickle.dump (None per the pickle documentation), not the data.
###load training data
train_directory=glob.glob('./se_txt/train/*.txt.final')
train_raw=load_files(train_directory)
pickle_train_raw=create_pickle(train_raw,'txt train raw')
train_data=create_corpus(train_raw)
pickle_train_data=create_pickle(train_data,'txt train data')

#load gold keyphrase
# NOTE: despite the *_directory name this holds the file's text content.
train_label_directory=open('./se_txt/train/train.combined.stem.final', encoding='utf-8').read()
train_label=extract_goldkeyphrase(train_label_directory)
pickle_train_label=create_pickle(train_label, 'txt train label')

###Load testing data
test_directory=glob.glob('./se_txt/test/*.txt.final')
test_raw=load_files(test_directory)
pickle_test_raw=create_pickle(test_raw,'txt test raw')
test_data=create_corpus(test_raw)
pickle_test_data=create_pickle(test_data,'txt test data')

# NOTE: despite the *_directory name this holds the file's text content.
test_label_directory=open('./se_txt/test_answer/test.combined.stem.final', encoding='utf-8').read()
test_label=extract_goldkeyphrase(test_label_directory)
pickle_test_label=create_pickle(test_label, 'txt test label')

#### Ngram version
# TF-IDF scored 1- to 5-gram candidates for train and test; the unsupervised
# F-measure evaluation is left commented out.
print("N-gram TF-IDF version")
ngram_candidates=calculate_ngram_tfidf(train_data) 
pickle_ngram_candidates=create_pickle(ngram_candidates, 'txt ngram candidates')
#ngram_top_keyphrases=get_top_candidates(ngram_candidates, 15)
#ngram_fmeasure=calculate_fmeasure(ngram_top_keyphrases, train_label)
#print("F-measure on training:", ngram_fmeasure)

test_ngram_candidates=calculate_ngram_tfidf(test_data)
pickle_test_ngram_candidates=create_pickle(test_ngram_candidates, 'txt test ngram candidates')
#test_ngram_top_candidates=get_top_candidates(test_ngram_candidates, 15)
#test_ngram_fmeasure=calculate_fmeasure(test_ngram_top_candidates, test_label)
#print("F-measure on testing:", test_ngram_fmeasure)


#### Noun phrase version
# The noun-phrase vocabulary is built separately for train and for test, each
# from its own corpus, and then scored with TF-IDF.
print("Noun phrase TF-IDF version")
nounphrase_vocabulary=create_phrase_vocabulary(train_data)
nounphrase_candidates=calculate_nounphrase_tfidf(train_data, nounphrase_vocabulary)
pickle_nounphrase_candidates=create_pickle(nounphrase_candidates, 'txt nounphrase candidates')
#nounphrase_top_keyphrases=get_top_candidates(nounphrase_candidates, 15)
#nounphrase_fmeasure=calculate_fmeasure(nounphrase_top_keyphrases, train_label)
#print("F-measure on training:", nounphrase_fmeasure)

test_nounphrase_vocabulary=create_phrase_vocabulary(test_data)
test_nounphrase_candidates=calculate_nounphrase_tfidf(test_data, test_nounphrase_vocabulary)
pickle_test_nounphrase_candidates=create_pickle(test_nounphrase_candidates, 'txt test nounphrase candidates')
#test_nounphrase_top_candidates=get_top_candidates(test_nounphrase_candidates, 15)
#test_nounphrase_fmeasure=calculate_fmeasure(test_nounphrase_top_candidates, test_label)
#print("F-measure on testing:", test_nounphrase_fmeasure)

In [None]:
def clean_corpus(corpus):
    """Lower-case, filter and stem every document of ``corpus``.

    A token survives when it is longer than two characters and contains a run
    of letters/hyphens delimited by word boundaries. Surviving tokens are
    Porter-stemmed and re-joined with single spaces. Returns one cleaned
    string per input document, in input order.
    """
    stemmer = PorterStemmer()
    cleaned_docs = []
    for doc in corpus:
        kept = []
        for word in word_tokenize(doc.lower()):
            if len(word) > 2 and re.search(r"\b[A-Za-z-]+\b", word):
                kept.append(word)
        # Same join/split round trip as before, so tokens are re-split on
        # whitespace before they are stemmed.
        pieces = " ".join(kept).split()
        cleaned_docs.append(" ".join(stemmer.stem(piece) for piece in pieces))
    return cleaned_docs

In [None]:
def create_example(raw_data, corpus, candidates, label):
    """Build the classifier feature rows, one per candidate phrase.

    Parameters
    ----------
    raw_data : list of dict
        Parsed documents; the ``'title'`` (str) and ``'abstract'`` (list of
        str) entries are used.
    corpus : list of str
        Full text per document, aligned with ``raw_data``.
    candidates : list of list of tuple
        Per document, candidate tuples whose first element is the phrase and
        whose remaining elements (normally just the TF-IDF score) are copied
        into the feature row. The input is left unmodified, so calling this
        function repeatedly yields the same rows.
    label : list of list of str
        Gold keyphrases per document.

    Returns
    -------
    list of list
        Flat list over all documents and candidates. Row layout::

            [<candidate[1:]>..., is_title, is_abstract, phrase_length,
             term_frequency, supervised_keyphraseness, first_occurrence,
             relative_position]
    """
    # Term frequencies per section, computed once each.
    title_tf = calculate_term_frequency([doc['title'] for doc in raw_data])
    abstract_tf = calculate_term_frequency([' '.join(doc['abstract']) for doc in raw_data])
    corpus_tf = calculate_term_frequency(corpus)
    cleaned_corpus = clean_corpus(corpus)
    # Gold keyphrases of ALL documents, as in the keyphraseness definition used here.
    gold_phrases = set(chain.from_iterable(label))

    x_data = []
    for n_doc, doc_candidates in enumerate(candidates):
        title_terms = {term for term, _ in title_tf[n_doc]}
        abstract_terms = {term for term, _ in abstract_tf[n_doc]}
        # Look frequencies up by phrase rather than by position: the candidate
        # list and the n-gram count list only share their order when both come
        # from the same vocabulary, which is not true for noun-phrase candidates.
        term_counts = dict(corpus_tf[n_doc])
        doc_text = cleaned_corpus[n_doc]
        searchable_text = doc_text.lower()
        corpus_words = len(doc_text.split(" "))

        for candidate in doc_candidates:
            phrase = candidate[0]
            frequency = term_counts.get(phrase, 0)

            # Number of words before the first occurrence. find() returns -1
            # when the phrase is absent; the slice then covers everything but
            # the last character, i.e. the phrase is placed at the very end.
            index = searchable_text.find(phrase)
            preceding_words = len(doc_text[:index].split(" ")) - 1
            position = float("{0:.2F}".format(preceding_words / corpus_words))

            x_data.append(list(candidate[1:]) + [
                1 if phrase in title_terms else 0,             # is_title
                1 if phrase in abstract_terms else 0,          # is_abstract
                len(phrase),                                   # length in characters
                frequency,                                     # term frequency
                frequency if phrase in gold_phrases else 0,    # supervised keyphraseness
                preceding_words,                               # first occurrence
                position,                                      # relative position
            ])
    return x_data

In [None]:
##create label for training or testing
def create_label(candidates, label):
    """Build the flat 0/1 target vector matching the rows of ``create_example``.

    Parameters
    ----------
    candidates : list of list of tuple
        Per document, candidate tuples whose first element is the phrase.
    label : list of list of str
        Gold keyphrases per document, aligned with ``candidates``.

    Returns
    -------
    list of int
        1 when the candidate phrase is a gold keyphrase of its own document,
        otherwise 0; ordered by document, then by candidate.
    """
    y_label = []
    for n_doc, doc_candidates in enumerate(candidates):
        if not doc_candidates:
            continue  # nothing to label; do not touch label[n_doc]
        # Built once per document; set lookup instead of copying the gold list
        # for every single candidate.
        keyphrase_document = set(label[n_doc])
        for candidate in doc_candidates:
            y_label.append(1 if candidate[0] in keyphrase_document else 0)
    return y_label

In [None]:
def predict_data(x_train, y_train, x_test, y_test, candidates, labels):
    """Train several classifiers and score their top-15 keyphrases per document.

    Every model is fitted on the training rows, its probability for class 1 is
    taken for each test row, the test candidates are ranked by that
    probability and the 15 best per document are compared with ``labels``.

    Parameters
    ----------
    x_train, y_train : feature rows and 0/1 targets used for fitting.
    x_test : feature rows of the test candidates, ordered by document and then
        by candidate, i.e. like ``candidates`` flattened.
    y_test : unused; kept so existing calls keep working.
    candidates : list of list of tuple
        Test candidates per document; the first tuple element is the phrase.
    labels : list of list of str
        Gold keyphrases per test document.

    Returns
    -------
    list of (str, float)
        ``(model name, F-measure)`` in the order the models are listed below.
    """
    # Fixed seed handed to every estimator that accepts random_state, so that
    # repeated runs give the same scores.
    seed = 7
    models = [
        ('LR', LogisticRegression(random_state=seed)),
        ('LDA', LinearDiscriminantAnalysis()),
        ('NB', GaussianNB()),
        ('SVM', SVC(probability=True, random_state=seed)),
        ('RF', RF(n_estimators=10, max_depth=3, random_state=seed)),
        ('AdaBoost', AdaBoostClassifier(random_state=seed)),
        ('Bagging', BaggingClassifier(random_state=seed)),
        ('GradientBoosting', GradientBoostingClassifier(random_state=seed)),
        ('MLP', MLPClassifier(random_state=seed)),
        ('Multinomial', MultinomialNB()),
    ]

    all_fmeasure = []
    for name, model in models:
        # Column 1 of predict_proba is taken as the "is a keyphrase" probability.
        # NOTE(review): assumes both classes 0 and 1 occur in y_train.
        positive_proba = iter(model.fit(x_train, y_train).predict_proba(x_test)[:, 1])

        # Re-nest the flat probabilities into the per-document candidate layout.
        probability = [
            [(candidate[0], next(positive_proba)) for candidate in doc]
            for doc in candidates
        ]
        fmeasure = calculate_fmeasure(get_top_candidates(probability, 15), labels)
        all_fmeasure.append((name, fmeasure))
    return all_fmeasure

In [None]:
import pickle
def create_pickle(data, name):
    """Write ``data`` to the file ``<name>.pickle`` using the highest pickle protocol.

    Returns whatever ``pickle.dump`` returns (``None``), not the data or the path.
    """
    target = '%s.pickle' % name
    with open(target, 'wb') as handle:
        outcome = pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return outcome

def open_pickle(name):
    """Load and return the object stored in the file ``<name>.pickle``.

    NOTE: unpickling can execute arbitrary code; only open files that were
    produced locally (e.g. by ``create_pickle``).
    """
    source = '%s.pickle' % name
    with open(source, 'rb') as handle:
        return pickle.load(handle)

In [None]:
####with machine learning
# Supervised variant: reload the artefacts written by the TF-IDF cell, turn the
# candidates into feature rows / 0-1 targets and report the F-measure of every
# classifier on the test set, for n-gram and for noun-phrase candidates.
##NGRAM
#open all pickle
train_raw=open_pickle('txt train raw')
train_data=open_pickle('txt train data')
train_label=open_pickle('txt train label')

test_raw=open_pickle('txt test raw')
test_data=open_pickle('txt test data')
test_label=open_pickle('txt test label')

ngram_candidates=open_pickle('txt ngram candidates')
test_ngram_candidates=open_pickle('txt test ngram candidates')
nounphrase_candidates=open_pickle('txt nounphrase candidates')
test_nounphrase_candidates=open_pickle('txt test nounphrase candidates')


# Feature rows (x) and targets (y) are flat over all candidates of all
# documents; x and y of the same split are built from the same candidate lists.
ngram_x_train=create_example(train_raw, train_data, ngram_candidates, train_label)
ngram_y_train=create_label(ngram_candidates, train_label)
ngram_x_test=create_example(test_raw, test_data, test_ngram_candidates, test_label)
ngram_y_test=create_label(test_ngram_candidates, test_label)

nounphrase_x_train=create_example(train_raw, train_data, nounphrase_candidates, train_label)
nounphrase_y_train=create_label(nounphrase_candidates, train_label)
nounphrase_x_test=create_example(test_raw, test_data, test_nounphrase_candidates, test_label)
nounphrase_y_test=create_label(test_nounphrase_candidates, test_label)


# predict_data returns a list of (model name, F-measure) pairs.
print("F-measure with machine learning (testing)")
ngram_prediction=predict_data(ngram_x_train, ngram_y_train, ngram_x_test, ngram_y_test, test_ngram_candidates, test_label)
print('F-measure on ngram', ngram_prediction)
nounphrase_prediction=predict_data(nounphrase_x_train, nounphrase_y_train, nounphrase_x_test, nounphrase_y_test, test_nounphrase_candidates, test_label)
print('F-measure on noun phrase', nounphrase_prediction)

#print(len(x_train_ngram))#print(len(y_train_ngram))#print(len(x_test_ngram))#print(len(y_test_ngram))

In [None]:
# Hand-made fixture: three documents with 15 predicted keyphrases each ...
# NOTE(review): the names match the parameters of calculate_fmeasure, so this
# is presumably meant for checking that function by hand - nothing in this
# cell calls it.
candidates_list=[['a','b','c','d','e','a1','b1','c1','d1','e1','a2','b2','c2','d2','e2'],
                 ['a3','b3','c3','d3','e3','a31','b31','c31','d31','e31','a32','b32','c32','d32','e32'],
                 ['a4','b4','c4','d4','e4','a41','b41','c41','d41','e41','a42','b42','c42','d42','e42']]

# ... and the matching gold keyphrases (12, 13 and 13 entries). The first
# list contains 'c1' twice.
gold_data=[['a1','b1','c1','d','e','a12','b12','c1','d12','e12','a22','b22'],
                 ['a33','b33','c33','d33','e33','a313','b313','c313','a323','b32','c323','d323','e32'],
                 ['a44','b44','c44','d44','e44','a441','d441','e441','a442','b442','c442','d442','e442']]


In [None]:
#testing how to compare tf-idf values with one feature


#feature phrase length
#feature1=[]
#for n_doc in range(len(tfidf)):
#    doc=[]
#    for n_feature in range(len(tfidf[n_doc])):
#        doc.append(len(tfidf[n_doc][n_feature][0]))
#    feature1.append(doc)
#print(feature1)

# Toy fixtures: per document, (phrase, tf-idf) candidates ...
tfidf=[[('dog',1),('swimming',4),('car',7)],
      [('air',11),('bowl',14),('cone',17),('done',17)],
       [('air of water',21),('chocolate biscuit',24)],
      [('air conditioner',21),('hot white chocolate',24)],]

# ... and the (phrase, count) n-grams extracted from the document's title.
title=[[('dog',0),('rabbit',0),('snake',0),('car',0)],
      [('bowl',0),('dog',0),('rabbit',0)],
      [('chocolate biscuits',0),('a lot air of water',0),('rabbit',0),('snake',0)],
      [('air conditioner',0),('hot white',0)]]

# is_title flag (same idea applies to is_abstract etc.): 1 when the candidate
# phrase is exactly one of the title n-grams of the same document, else 0.
feature2=[]
for scored_terms, title_terms in zip(tfidf, title):
    title_feature=[name for name, _ in title_terms]
    feature2.append([1 if phrase in title_feature else 0 for phrase, _ in scored_terms])
print(feature2)

#is_abstract


In [None]:
# Inert snippet: the triple-quoted string below is an expression, not code that
# runs. It refers to names (models, seed, scoring, results, names,
# x_train_ngram, y_train_ngram) that are not defined at the top level of the
# cells shown in this notebook, so it cannot be pasted back in unchanged.
'''If need cross validation per model
###measure accuracy with k-fold
print("Accuracy on training data with Cross-validation:")
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train_ngram, y_train_ngram, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %.3f (%.3f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
'''


In [None]:
# Demo: extend every (phrase, score) tuple with one value from each feature table.
candidates=[[('aa',1),('a',5),('a3',5),('a6',7)],
            [('aq',3),('aw',4),('ag',2),('ar',8)]]

feature1=[[3,4,5,6],
            [7,9,6,5]]
feature2=[[1,2,7,8],
            [9,90,4,3]]

for doc_candidates, doc_feature1, doc_feature2 in zip(candidates, feature1, feature2):
    for position, entry in enumerate(doc_candidates):
        # tuples are immutable, so the slot is replaced by a longer tuple
        doc_candidates[position]=entry+(doc_feature1[position], doc_feature2[position])
print(candidates)