In [4]:
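#This notebook applies the noun phrase and n-gram candidate filters to the SemEval dataset (XML version).
#NOTE(review): every path below reads ./data/se_txt and writes 'txt ...' pickles -- confirm that "XML version" is still accurate.
#The models are evaluated on the combined label only.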

import glob, utils, preprocessing, generate_candidate 
import feature_extraction, generate_keyphrase

#default keyphrase cutoff: passed as n_keyphrase to feature creation,
#prediction and keyphrase export in the cells below
number_keyphrase = 15

In [None]:
#If the label pickles already exist, skip this cell and go to the next step.

#Create the gold keyphrases for the training set from the combined label file.
#The file is read inside a context manager so the handle is closed right after
#reading instead of being left open until garbage collection.
#Note: despite its name, train_label_directory holds the file's text content.
with open('./data/se_txt/train/train.combined.stem.final',
          encoding='utf-8') as train_label_file:
    train_label_directory = train_label_file.read()
train_label = preprocessing.extract_keyphrase(train_label_directory)
pickle_train_label = utils.create_pickle(train_label, './pickle/semeval/train label')

#Create the gold keyphrases for the test set in the same way.
with open('./data/se_txt/test_answer/test.combined.stem.final',
          encoding='utf-8') as test_label_file:
    test_label_directory = test_label_file.read()
test_label = preprocessing.extract_keyphrase(test_label_directory)
pickle_test_label = utils.create_pickle(test_label, './pickle/semeval/test label')

In [None]:
#Run this cell only once; if the pickles are already available, skip it.
#It builds the corpora, the candidate lists and the supervised keyphraseness
#dictionaries, and stores each of them as a pickle.
#It reads train_label, so the label cell above must have been run first.

#--- training data: raw files -> corpus -> n-gram term frequencies ---
train_directory = glob.glob('./data/se_txt/train/*.txt.final')
train_raw = preprocessing.load_files(train_directory)
pickle_train_raw = utils.create_pickle(
    train_raw,
    './pickle/semeval/txt train raw')
train_data = preprocessing.create_corpus(train_raw)
pickle_train_data = utils.create_pickle(
    train_data,
    './pickle/semeval/txt train data')
train_tf_corpus = feature_extraction.calculate_tf(
    train_data,
    vocab=None,
    type='ngram')
pickle_train_tf_corpus = utils.create_pickle(
    train_tf_corpus,
    './pickle/semeval/txt train tf corpus')

#--- testing data: raw files -> corpus -> n-gram term frequencies ---
test_directory = glob.glob('./data/se_txt/test/*.txt.final')
test_raw = preprocessing.load_files(test_directory)
pickle_test_raw = utils.create_pickle(
    test_raw,
    './pickle/semeval/txt test raw')
test_data = preprocessing.create_corpus(test_raw)
pickle_test_data = utils.create_pickle(
    test_data,
    './pickle/semeval/txt test data')
test_tf_corpus = feature_extraction.calculate_tf(
    test_data,
    vocab=None,
    type='ngram')
pickle_test_tf_corpus = utils.create_pickle(
    test_tf_corpus,
    './pickle/semeval/txt test tf corpus')


#--- n-gram candidates (training data) ---
print("Generating n-gram candidates..")
ngram_candidates = generate_candidate.calculate_tfidf(
    train_data,
    vocab=None,
    type='ngram')
pickle_ngram_candidates = utils.create_pickle(
    ngram_candidates,
    './pickle/semeval/txt ngram candidates')

#--- n-gram candidates (testing data) ---
test_ngram_candidates = generate_candidate.calculate_tfidf(
    test_data,
    vocab=None,
    type='ngram')
pickle_test_ngram_candidates = utils.create_pickle(
    test_ngram_candidates,
    './pickle/semeval/txt test ngram candidates')


#--- noun phrase candidates (training data) ---
print("Generating noun phrase candidates..")
nounphrase_vocabulary = generate_candidate.create_phrase_vocabulary(train_data)
train_tf_nounphrase_corpus = feature_extraction.calculate_tf(
    train_data,
    vocab=nounphrase_vocabulary,
    type='np')
pickle_train_tf_nounphrase_corpus = utils.create_pickle(
    train_tf_nounphrase_corpus,
    './pickle/semeval/txt train tf nounphrase corpus')
nounphrase_candidates = generate_candidate.calculate_tfidf(
    train_data,
    nounphrase_vocabulary,
    type='np')
pickle_nounphrase_candidates = utils.create_pickle(
    nounphrase_candidates,
    './pickle/semeval/txt nounphrase candidates')

#--- noun phrase candidates (testing data) ---
#the test set gets its own phrase vocabulary, built from test_data
test_nounphrase_vocabulary = generate_candidate.create_phrase_vocabulary(test_data)
test_tf_nounphrase_corpus = feature_extraction.calculate_tf(
    test_data,
    vocab=test_nounphrase_vocabulary,
    type='np')
pickle_test_tf_nounphrase_corpus = utils.create_pickle(
    test_tf_nounphrase_corpus,
    './pickle/semeval/txt test tf nounphrase corpus')
test_nounphrase_candidates = generate_candidate.calculate_tfidf(
    test_data,
    test_nounphrase_vocabulary,
    type='np')
pickle_test_nounphrase_candidates = utils.create_pickle(
    test_nounphrase_candidates,
    './pickle/semeval/txt test nounphrase candidates')

#--- supervised keyphraseness dictionary, n-gram filter, combined label ---
supervised_key = feature_extraction.create_supervised_list(
    train_label,
    train_tf_corpus)
supervised_corpus = utils.create_pickle(
    supervised_key,
    './pickle/semeval/txt ngram supervised keyphraseness')

#--- supervised keyphraseness dictionary, noun phrase filter, combined label ---
np_supervised_key = feature_extraction.create_supervised_list(
    train_label,
    train_tf_nounphrase_corpus)
np_supervised_corpus = utils.create_pickle(
    np_supervised_key,
    './pickle/semeval/txt np supervised keyphraseness')

In [5]:
#Reload every artefact the later cells need from its pickle,
#grouped as train/test pairs.

print("Opening all pickles")

#raw documents and processed corpora
train_raw = utils.open_pickle('./pickle/semeval/txt train raw')
test_raw = utils.open_pickle('./pickle/semeval/txt test raw')
train_data = utils.open_pickle('./pickle/semeval/txt train data')
test_data = utils.open_pickle('./pickle/semeval/txt test data')

#gold keyphrase labels
train_label = utils.open_pickle('./pickle/semeval/train label')
test_label = utils.open_pickle('./pickle/semeval/test label')

#term-frequency corpora (n-gram and noun phrase)
train_tf_corpus = utils.open_pickle('./pickle/semeval/txt train tf corpus')
test_tf_corpus = utils.open_pickle('./pickle/semeval/txt test tf corpus')
train_tf_nounphrase_corpus = utils.open_pickle('./pickle/semeval/txt train tf nounphrase corpus')
test_tf_nounphrase_corpus = utils.open_pickle('./pickle/semeval/txt test tf nounphrase corpus')

#topics
#NOTE(review): no visible cell of this notebook creates the two topic pickles;
#presumably they are produced elsewhere -- confirm before a fresh run.
train_topics = utils.open_pickle('./pickle/semeval/txt train topics')
test_topics = utils.open_pickle('./pickle/semeval/txt test topics')

#candidate lists (n-gram and noun phrase)
ngram_candidates = utils.open_pickle('./pickle/semeval/txt ngram candidates')
test_ngram_candidates = utils.open_pickle('./pickle/semeval/txt test ngram candidates')
nounphrase_candidates = utils.open_pickle('./pickle/semeval/txt nounphrase candidates')
test_nounphrase_candidates = utils.open_pickle('./pickle/semeval/txt test nounphrase candidates')

#supervised keyphraseness dictionaries
supervised_key = utils.open_pickle('./pickle/semeval/txt ngram supervised keyphraseness')
np_supervised_key = utils.open_pickle('./pickle/semeval/txt np supervised keyphraseness')

Opening all pickles


In [None]:
#Build the feature tables for the training and testing data.
#If the csv files are already available, skip to the next step.
#Both test calls reuse the supervised keyphraseness dictionaries
#(supervised_key / np_supervised_key) that the train calls use.

#n-gram filter, combined label
print("Creating examples of n-gram on combined label..")
ngram_train = feature_extraction.create_features(
    train_data,
    ngram_candidates,
    train_label,
    supervised_key,
    train_tf_corpus,
    train_topics,
    name='./csv/semeval/txt_train_ngram',
    n_keyphrase=number_keyphrase,
)

ngram_test = feature_extraction.create_features(
    test_data,
    test_ngram_candidates,
    test_label,
    supervised_key,
    test_tf_corpus,
    test_topics,
    name='./csv/semeval/txt_test_ngram',
    n_keyphrase=number_keyphrase,
)

#noun phrase filter, combined label
print("Creating examples of noun phrase on combined label..")
nounphrase_train = feature_extraction.create_features(
    train_data,
    nounphrase_candidates,
    train_label,
    np_supervised_key,
    train_tf_nounphrase_corpus,
    train_topics,
    name='./csv/semeval/txt_train_nounphrase',
    n_keyphrase=number_keyphrase,
)

nounphrase_test = feature_extraction.create_features(
    test_data,
    test_nounphrase_candidates,
    test_label,
    np_supervised_key,
    test_tf_nounphrase_corpus,
    test_topics,
    name='./csv/semeval/txt_test_nounphrase',
    n_keyphrase=number_keyphrase,
)


In [6]:
#Evaluation: score each candidate filter on the test set, then export the
#predicted keyphrases.

#The reported cutoff is taken from number_keyphrase rather than a hard-coded
#"15", so the message stays correct when the cutoff is changed.
score_message = 'Precision, recall, f-measure on top {} candidates:'.format(number_keyphrase)

print('Evaluation on ngram filter:')
ngram_prediction = generate_keyphrase.predict_data(
    test_ngram_candidates,
    test_label,
    train_data='./csv/semeval/txt_train_ngram',
    test_data='./csv/semeval/txt_test_ngram',
    n_keyphrase=number_keyphrase)
print(score_message, ngram_prediction)

#export the predicted n-gram keyphrases under csv_name
#(presumably a csv file -- confirm against generate_keyphrase)
ngram_prediction_keyphrase = generate_keyphrase.get_predicted_keyphrases(
    test_ngram_candidates,
    train_data='./csv/semeval/txt_train_ngram',
    test_data='./csv/semeval/txt_test_ngram',
    csv_name='./csv/semeval/txt predicted ngram keyphrases',
    n_keyphrase=number_keyphrase)

print('Evaluation on nounphrase filter:')
nounphrase_prediction = generate_keyphrase.predict_data(
    test_nounphrase_candidates,
    test_label,
    train_data='./csv/semeval/txt_train_nounphrase',
    test_data='./csv/semeval/txt_test_nounphrase',
    n_keyphrase=number_keyphrase)
print(score_message, nounphrase_prediction)

#export the predicted noun phrase keyphrases under csv_name
#(presumably a csv file -- confirm against generate_keyphrase)
nounphrase_prediction_keyphrase = generate_keyphrase.get_predicted_keyphrases(
    test_nounphrase_candidates,
    train_data='./csv/semeval/txt_train_nounphrase',
    test_data='./csv/semeval/txt_test_nounphrase',
    csv_name='./csv/semeval/txt predicted nounphrase keyphrases',
    n_keyphrase=number_keyphrase)


Evaluation on ngram filter:
Precision, recall, f-measure on top 15 candidates: [('LR', (18.73, 18.58, 18.65)), ('NB', (5.67, 5.62, 5.64)), ('RF', (12.53, 12.43, 12.48)), ('AdaBoost', (11.67, 11.57, 11.62)), ('Bagging', (10.93, 10.85, 10.89))]
Evaluation on nounphrase filter:
Precision, recall, f-measure on top 15 candidates: [('LR', (19.87, 19.71, 19.79)), ('NB', (15.2, 15.08, 15.14)), ('RF', (12.73, 12.63, 12.68)), ('AdaBoost', (10.93, 10.85, 10.89)), ('Bagging', (10.27, 10.19, 10.23))]
