In [1]:
import glob, os, re, string, itertools
import logging
import numpy as np
from __future__ import print_function
from time import time
from nltk.stem.porter import *
from collections import defaultdict
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import wordnet, stopwords
from nltk.chunk.regexp import RegexpParser
from nltk.chunk import tree2conlltags
from pandas import DataFrame
from itertools import chain
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction import text
from scipy.sparse.csr import csr_matrix
from sklearn import svm                                       #library for creating the classifier, SVM
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
###sorting candidates based on 15 keywords
def get_top_candidates(candidates_list, number_keyphrases):
    best_candidates=[]
    for doc in candidates_list:
        #sort candidates by tf-idf value
        sorted_candidates=sorted(doc, key=lambda x: x[1], reverse=True)[:number_keyphrases]
        #best_candidates.append(sorted_candidates)
        best_candidates.append([x for x,_ in sorted_candidates])
        #remove overlapping keywords
    return best_candidates

In [3]:
###compare candidates to goldstandard
def extract_goldkeyphrase(gold_data):
    
    r_plus=re.compile("^.*\+.*$")
    r_slash=re.compile("^.*\s.*\/.*$")
    
    gold_standard=[]
    for line in gold_data.split('\n'):
        doc=[]      
        for key in line[6:].split(','):
            if key[0]==' ':
                doc.append(key[1:])
            elif re.search(r_plus, key):
                split=[]
                for element in key.split('+'):
                    doc.append(element)
            elif re.search(r_slash, key):
                split=[]
                for element in key.split('/'):
                    doc.append(element)
            else:
                doc.append(key)
        gold_standard.append(doc)
    return gold_standard

def calculate_fmeasure(candidates_list, gold_data):
    #true positive
    all_matches=[]
    for index in range(len(candidates_list)):
        #store all measure per document in dic
        value={'tp': None, 'fp': None, 'fn': None, 'gold': None}
        value['gold']=len(gold_data[index])
        #counter true positive per document
        true_positive=0
        #loop between elements
        for element_candidate in candidates_list[index]:                    
            for element_goldkeyphrase in gold_data[index]:
                #matched predicted keyword in gold keyphrase
                if element_candidate==element_goldkeyphrase:
                    #matches_perdoc.append(element_candidate)
                    true_positive+=1
            #if need the detail of evaluation
            value['tp']=int(true_positive) #matched pair
            value['fp']=int(15-true_positive) #depend how many keyword should we use
            value['fn']=int(value['gold']-value['tp'])
        #return all metrics per document
        all_matches.append(value)

    true_positive=sum(doc['tp'] for doc in all_matches)
    false_positive=sum(doc['fp'] for doc in all_matches)
    false_negative=sum(doc['fn'] for doc in all_matches)
    
    #matched/total top n
    precision=float(true_positive/(false_positive+true_positive))
    #matched/total gold standard
    recall=float(true_positive/(false_negative+true_positive))
    # calculate with micro averagedprecision
    f_measure=float("{0:.2F}".format(2*(precision*recall)/(precision+recall)*100))
    return f_measure

In [4]:
def calculate_term_frequency(section):
     #porter stemmer
    stemmer=PorterStemmer()
    
    #eliminate ngram which starts or ends from stopwords
    class NewCountVectorizer(CountVectorizer):
        def _word_ngrams(self, tokens, stop_words=None):
            # First get tokens without stop words
            tokens = super(CountVectorizer, self)._word_ngrams(tokens, None)
            if stop_words is not None:
                new_tokens=[]
                for token in tokens:
                    split_words = token.split(' ')
                    # Only check the first and last word for stop words
                    if len(token)>2 and split_words[0] not in stop_words and split_words[-1] not in stop_words:
                        #stem every word in token
                        if len(split_words)==1 and len(split_words[0])>2:
                            new_tokens.append(stemmer.stem(token))
                        elif len(split_words)==2 and split_words[-1]=="'":
                            del(token)
                        elif len(split_words[0])<3 and len(split_words[1])<3:
                            del(token)
                        elif split_words[1]=="'" and split_words[2]=="s":
                            new_tokens.append(stemmer.stem(split_words[0])+split_words[1]+split_words[2])
                        else:
                            new_tokens.append(' '.join(list(stemmer.stem(word) for word in word_tokenize(token))))
                return new_tokens
            return tokens
    
    stop_words=text.ENGLISH_STOP_WORDS
    
    count_vect=NewCountVectorizer(ngram_range=(1,5), stop_words=stop_words,
                                token_pattern=r"(?u)\b[A-Za-z-]+\b")
    
    matrix=count_vect.fit_transform(section)
    feature_names=count_vect.get_feature_names()

    #how to print tf-idf from https://stackoverflow.com/questions/34449127/
    #sklearn-tfidf-transformer-how-to-get-tf-idf-values-of-given-words-in-document
    ngrams=[]
    for doc in range(0,len(section)):
        feature_index=matrix[doc,:].nonzero()[1]
        count_vect_doc=zip(feature_index, [matrix[doc, x] for x in feature_index])
        names_count_vect=[(w, s) for w, s in [(feature_names[i], s) for (i, s) in count_vect_doc]]
        ngrams.append(names_count_vect)
    
    return ngrams

In [5]:
def create_example(candidates):
    #convert the format from candidate from tuple to list
    def feature_candidate_length(candidates):
        feature4=[]
        for n_doc in range(len(candidates)):
            doc=[]
            for n_feature in range(len(candidates[n_doc])):
                doc.append(len(candidates[n_doc][n_feature][0]))
            feature4.append(doc)
        return feature4
    
    feature4=feature_candidate_length(candidates)
    
    for n_doc in range(len(candidates)):
        for n_candidate in range(len(candidates[n_doc])):
            candidates[n_doc][n_candidate]=candidates[n_doc][n_candidate]+(feature4[n_doc][n_candidate],)  
   
    x_data=[]
    for n_doc in range(len(candidates)):
        for n_candidate in range(len(candidates[n_doc])):
            #append only values of features. without word
            x_data.append(list(candidates[n_doc][n_candidate][1:]))
    return x_data     

In [6]:
##create label for training or testing
def create_label(candidates, label):        
    y_label=[]
    for n_doc in range(len(candidates)):
        for n_cand in range(len(candidates[n_doc])):
            keyphrase_document=list(label[n_doc])
            if candidates[n_doc][n_cand][0] not in keyphrase_document:
                y_label.append(0)
            else:
                y_label.append(1)
    return y_label

In [22]:
def probability_to_fmeasure(predict_proba, candidates, labels, models):
    #all_fmeasure=[]
    for model in range(0, len(predict_proba)):
        probability=[]
        counter=0
        for n_doc in range(len(candidates)):
            doc=[]
            for n_cand in range(len(candidates[n_doc])):
                doc.append((candidates[n_doc][n_cand][0], predict_proba[model][counter]))
                counter+=1
            probability.append(doc)
        fmeasure=calculate_fmeasure(get_top_candidates(probability, 15), labels)
        print("Model %s: %.3f" % (models[model][0], fmeasure))
        #all_fmeasure.append((models[model][0], fmeasure))
    return 'finish'

def predict_data(x_train, y_train, x_test, y_test, candidates, labels):
    seed = 7 #just randomly select the number
    models = []
    models.append(('LR', LogisticRegression(C=10))) 
    #models.append(('LDA', LinearDiscriminantAnalysis()))
    #models.append(('NB', GaussianNB()))
    models.append(('SVM', SVC(probability=True, C=10, gamma=0.1)))
    models.append(('RF', RF(n_estimators=15, max_depth=4)))
    models.append(('AdaBoost', AdaBoostClassifier(n_estimators=60, learning_rate=2.0)))
    models.append(('Bagging', BaggingClassifier(n_estimators=15)))
    models.append(('GradientBoosting', (GradientBoostingClassifier(n_estimators=120, learning_rate=0.2))))
    models.append(('MLP', (MLPClassifier(learning_rate_init=0.002))))
    models.append(('Multinomial', (MultinomialNB(alpha=2.0))))
    #models.append(('', ()))
    #models.append(('', ()))
    #models.append(('', ()))
    results = []
    names = []
    scoring='accuracy'
    #print("\nAccuracy on testing data:")
    
    print("Full features:")
    all_predict_proba=[]
    for name, model in models:
        #accuracy
        #print("%s: %.3f" % (name, accuracy_score(model.fit(x_train, y_train).predict(x_test), y_test)))
        all_predict_proba.append(model.fit(x_train, y_train).predict_proba(x_test)[:,1])
    print("Measuring fscore")
    all_fmeasure=probability_to_fmeasure(all_predict_proba, candidates, labels, models)
  
    return 'all predictions have been completed'

In [23]:
import pickle
def create_pickle(data, name):
    with open('%s.pickle' % name,'wb') as handle:
        result=pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return result

def open_pickle(name):
    with open('%s.pickle' % name,'rb') as handle:
        result=pickle.load(handle)
    return result

In [24]:
####with machine learning
##NGRAM
#open all pickle
print("opening all pickles")
train_raw=open_pickle('txt train raw')
train_data=open_pickle('txt train data')
train_label=open_pickle('txt train label')

test_raw=open_pickle('txt test raw')
test_data=open_pickle('txt test data')
test_label=open_pickle('txt test label')

ngram_candidates=open_pickle('txt ngram candidates')
test_ngram_candidates=open_pickle('txt test ngram candidates')
#nounphrase_candidates=open_pickle('txt nounphrase candidates')
#test_nounphrase_candidates=open_pickle('txt test nounphrase candidates')

print("creating example on training..")
ngram_x_train=create_example(ngram_candidates)
print("creating label on training..")
ngram_y_train=create_label(ngram_candidates, train_label)
print("creating example on testing..")
ngram_x_test=create_example(test_ngram_candidates)
print("creating label on testing..")
ngram_y_test=create_label(test_ngram_candidates, test_label)

#create pickle for training data


#nounphrase_x_train=create_example(train_raw, train_data, nounphrase_candidates, train_label)
#nounphrase_y_train=create_label(nounphrase_candidates, train_label)
#nounphrase_x_test=create_example(test_raw, test_data, test_nounphrase_candidates, test_label)
#nounphrase_y_test=create_label(test_nounphrase_candidates, test_label)

#next time if features hsve been complete, put in the pickle, test in here
print("F-measure with machine learning (only TF-IDF feature)")
ngram_prediction=predict_data(ngram_x_train, ngram_y_train, ngram_x_test, ngram_y_test, test_ngram_candidates, test_label)
print('F-measure on ngram', ngram_prediction)
#nounphrase_prediction=predict_data(nounphrase_x_train, nounphrase_y_train, nounphrase_x_test, nounphrase_y_test, test_nounphrase_candidates, test_label)
#print('F-measure on noun phrase', nounphrase_prediction)

#print(len(x_train_ngram))#print(len(y_train_ngram))#print(len(x_test_ngram))#print(len(y_test_ngram))

opening all pickles
creating example on training..
creating label on training..
creating example on testing..
creating label on testing..
F-measure with machine learning (only TF-IDF feature)
Full features:
Measuring fscore
Model LR: 14.740
Model SVM: 0.070
Model RF: 20.190


ZeroDivisionError: float division by zero

In [None]:
####setting for searching best parameter
from sklearn import svm, grid_search
def svc_param_selection(X, y, nfolds):
    Cs = [0.001, 0.01, 0.1, 1, 10]
    gammas = [0.001, 0.01, 0.1, 1]
    param_grid = {'C': Cs, 'gamma' : gammas}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    grid_search.best_params_
    return grid_search.best_params_

In [None]:
candidates_list=[['a','b','c','d','e','a1','b1','c1','d1','e1','a2','b2','c2','d2','e2'],
                 ['a3','b3','c3','d3','e3','a31','b31','c31','d31','e31','a32','b32','c32','d32','e32'],
                 ['a4','b4','c4','d4','e4','a41','b41','c41','d41','e41','a42','b42','c42','d42','e42']]

gold_data=[['a1','b1','c1','d','e','a12','b12','c1','d12','e12','a22','b22'],
                 ['a33','b33','c33','d33','e33','a313','b313','c313','a323','b32','c323','d323','e32'],
                 ['a44','b44','c44','d44','e44','a441','d441','e441','a442','b442','c442','d442','e442']]


In [None]:
#testing o compare tfidfvalue with one feature


#feature phrase length
#feature1=[]
#for n_doc in range(len(tfidf)):
#    doc=[]
#    for n_feature in range(len(tfidf[n_doc])):
#        doc.append(len(tfidf[n_doc][n_feature][0]))
#    feature1.append(doc)
#print(feature1)

tfidf=[[('dog',1),('swimming',4),('car',7)],
      [('air',11),('bowl',14),('cone',17),('done',17)],
       [('air of water',21),('chocolate biscuit',24)],
      [('air conditioner',21),('hot white chocolate',24)],]

title=[[('dog',0),('rabbit',0),('snake',0),('car',0)],
      [('bowl',0),('dog',0),('rabbit',0)],
      [('chocolate biscuits',0),('a lot air of water',0),('rabbit',0),('snake',0)],
      [('air conditioner',0),('hot white',0)]]

#is_title, is_abstract, is etc, but extract section with ngram(1,5)
feature2=[]
for n_doc in range(len(tfidf)):
    doc=[]
    for n_feature in range(len(tfidf[n_doc])):
        #title_feature=[feature for feature in title[n_doc]]
        title_feature=[feature for feature, value in title[n_doc]]
        if tfidf[n_doc][n_feature][0] not in title_feature:
            doc.append(0)
        else:
            doc.append(1)
    feature2.append(doc)
print(feature2)

#is_abstract


In [None]:
'''If need cross validation per model
###measure accuracy with k-fold
print("Accuracy on training data with Cross-validation:")
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, x_train_ngram, y_train_ngram, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %.3f (%.3f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
'''


In [None]:
candidates=[[('aa',1),('a',5),('a3',5),('a6',7)],
            [('aq',3),('aw',4),('ag',2),('ar',8)]]

feature1=[[3,4,5,6],
            [7,9,6,5]]
feature2=[[1,2,7,8],
            [9,90,4,3]]

for n_doc in range(len(candidates)):
    for n_candidate in range(len(candidates[n_doc])):
        candidates[n_doc][n_candidate]=candidates[n_doc][n_candidate]+(feature1[n_doc][n_candidate],)
        candidates[n_doc][n_candidate]=candidates[n_doc][n_candidate]+(feature2[n_doc][n_candidate],)
print(candidates)