In [1]:
# This notebook applies the noun-phrase and n-gram candidate filters to the text-file (.txt) version of the data.

import glob, utils, preprocessing, generate_candidate, generate_keyphrase
import feature_extraction  # needed by the feature_extraction.calculate_tf calls in the data-preparation cell

# Number of keyphrases to extract; the default is 15.
number_keyphrase = 15

In [None]:
# Run this cell only once; if the pickles already exist, skip it.

# Training split: glob the *.txt.final files, run them through
# preprocessing.load_files and preprocessing.create_corpus, then build the
# n-gram term-frequency corpus. Every intermediate result is handed to
# utils.create_pickle together with a path under ./pickle/semeval/.
train_directory = glob.glob('./data/se_txt/train/*.txt.final')

train_raw = preprocessing.load_files(train_directory)
pickle_train_raw = utils.create_pickle(
    train_raw,
    './pickle/semeval/txt train raw',
)

train_data = preprocessing.create_corpus(train_raw)
pickle_train_data = utils.create_pickle(
    train_data,
    './pickle/semeval/txt train data',
)

train_tf_corpus = feature_extraction.calculate_tf(
    train_data,
    vocab=None,
    type='ngram',
)
pickle_train_tf_corpus = utils.create_pickle(
    train_tf_corpus,
    './pickle/semeval/txt train tf corpus',
)

# Test split: same pipeline as the training split, applied to the files
# matched under ./data/se_txt/test/ and stored under the "txt test ..." paths.
test_directory = glob.glob('./data/se_txt/test/*.txt.final')

test_raw = preprocessing.load_files(test_directory)
pickle_test_raw = utils.create_pickle(
    test_raw,
    './pickle/semeval/txt test raw',
)

test_data = preprocessing.create_corpus(test_raw)
pickle_test_data = utils.create_pickle(
    test_data,
    './pickle/semeval/txt test data',
)

test_tf_corpus = feature_extraction.calculate_tf(
    test_data,
    vocab=None,
    type='ngram',
)
pickle_test_tf_corpus = utils.create_pickle(
    test_tf_corpus,
    './pickle/semeval/txt test tf corpus',
)

# n-gram version: TF-IDF candidates are computed independently for the
# training corpus and the test corpus (vocab=None, type='ngram'); each result
# is passed to utils.create_pickle right after it is produced.
print("Generating TF-IDF n-gram candidates..")

ngram_candidates = generate_candidate.calculate_tfidf(
    train_data,
    vocab=None,
    type='ngram',
)
pickle_ngram_candidates = utils.create_pickle(
    ngram_candidates,
    './pickle/semeval/txt ngram candidates',
)

test_ngram_candidates = generate_candidate.calculate_tfidf(
    test_data,
    vocab=None,
    type='ngram',
)
pickle_test_ngram_candidates = utils.create_pickle(
    test_ngram_candidates,
    './pickle/semeval/txt test ngram candidates',
)

# Noun-phrase version (training corpus): build a phrase vocabulary from
# train_data, then use that vocabulary for both the term-frequency corpus and
# the TF-IDF candidates (type='np').
print("Generating TF-IDF noun phrase candidates..")

nounphrase_vocabulary = generate_candidate.create_phrase_vocabulary(train_data)

train_tf_nounphrase_corpus = feature_extraction.calculate_tf(
    train_data,
    vocab=nounphrase_vocabulary,
    type='np',
)
pickle_train_tf_nounphrase_corpus = utils.create_pickle(
    train_tf_nounphrase_corpus,
    './pickle/semeval/txt train tf new nounphrase corpus',
)

nounphrase_candidates = generate_candidate.calculate_tfidf(
    train_data,
    nounphrase_vocabulary,
    type='np',
)
pickle_nounphrase_candidates = utils.create_pickle(
    nounphrase_candidates,
    './pickle/semeval/txt nounphrase candidates',
)

# Noun-phrase version (test corpus): the vocabulary is built from test_data
# itself and then reused for the term-frequency corpus and the TF-IDF
# candidates (type='np').
test_nounphrase_vocabulary = generate_candidate.create_phrase_vocabulary(test_data)

test_tf_nounphrase_corpus = feature_extraction.calculate_tf(
    test_data,
    vocab=test_nounphrase_vocabulary,
    type='np',
)
pickle_test_tf_nounphrase_corpus = utils.create_pickle(
    test_tf_nounphrase_corpus,
    './pickle/semeval/txt test tf new nounphrase corpus',
)

test_nounphrase_candidates = generate_candidate.calculate_tfidf(
    test_data,
    test_nounphrase_vocabulary,
    type='np',
)
pickle_test_nounphrase_candidates = utils.create_pickle(
    test_nounphrase_candidates,
    './pickle/semeval/txt test nounphrase candidates',
)

In [None]:
# If these files already exist as pickles, skip to the next step.

# Load the gold-standard keyphrases for the training and test splits.
# Each answer file is read in full inside a `with` block so the file handle is
# closed as soon as the text has been read (the bare open(...).read() form
# left the handle open until garbage collection).
# Note: despite their names, the *_label_directory variables hold the complete
# text of the answer file, not a directory path.
with open('./data/se_txt/train/train.combined.stem.final',
          encoding='utf-8') as train_label_file:
    train_label_directory = train_label_file.read()
train_label = preprocessing.extract_keyphrase(train_label_directory)
pickle_train_label = utils.create_pickle(train_label, './pickle/semeval/train label')

with open('./data/se_txt/test_answer/test.combined.stem.final',
          encoding='utf-8') as test_label_file:
    test_label_directory = test_label_file.read()
test_label = preprocessing.extract_keyphrase(test_label_directory)
pickle_test_label = utils.create_pickle(test_label, './pickle/semeval/test label')

In [2]:
# This section evaluates the TF-IDF keyphrase extraction.
# The number of keyphrases comes from `number_keyphrase` (set in the first
# cell) instead of a literal 15 repeated in every call, so changing the
# configuration value now actually changes the evaluation.


def _evaluate_tfidf(candidates, labels, csv_name, top_n):
    """Select the top keyphrases from `candidates` and score them against `labels`.

    Args:
        candidates: TF-IDF candidates, as loaded from the candidate pickles.
        labels: gold-standard keyphrases for the same split.
        csv_name: forwarded unchanged to generate_keyphrase.get_tf_keyphrase
            (presumably the path of the CSV it writes -- confirm in that module).
        top_n: number of keyphrases, passed to both get_tf_keyphrase and
            calculate_fmeasure.

    Returns:
        A pair (top_keyphrases, fmeasure): the result of get_tf_keyphrase and
        the result of calculate_fmeasure, which is printed below as
        "precision, recall, f-measure".
    """
    top_keyphrases = generate_keyphrase.get_tf_keyphrase(
        candidates, top_n, csv_name=csv_name)
    fmeasure = generate_keyphrase.calculate_fmeasure(
        top_keyphrases, labels, top_n)
    return top_keyphrases, fmeasure


# Load all relevant pickles.
train_label = utils.open_pickle('./pickle/semeval/train label')
test_label = utils.open_pickle('./pickle/semeval/test label')

ngram_candidates = utils.open_pickle('./pickle/semeval/txt ngram candidates')
test_ngram_candidates = utils.open_pickle('./pickle/semeval/txt test ngram candidates')

nounphrase_candidates = utils.open_pickle('./pickle/semeval/txt nounphrase candidates')
test_nounphrase_candidates = utils.open_pickle('./pickle/semeval/txt test nounphrase candidates')


# Evaluate TF-IDF extraction: n-gram candidates first, then noun phrases,
# each on the training split followed by the test split.
print("TF-IDF n-gram version:")
ngram_top_keyphrases, ngram_fmeasure = _evaluate_tfidf(
    ngram_candidates, train_label,
    './csv/tfidf train ngram', number_keyphrase)
print("Precision, recall, f-measure on ngram training:", ngram_fmeasure)

test_ngram_top_candidates, test_ngram_fmeasure = _evaluate_tfidf(
    test_ngram_candidates, test_label,
    './csv/tfidf test ngram', number_keyphrase)
print("Precision, recall, f-measure on ngram testing:", test_ngram_fmeasure)

print("TF-IDF noun phrase version:")
nounphrase_top_keyphrases, nounphrase_fmeasure = _evaluate_tfidf(
    nounphrase_candidates, train_label,
    './csv/tfidf train noun phrase', number_keyphrase)
print("Precision, recall, f-measure on noun phrase training:", nounphrase_fmeasure)

test_nounphrase_top_candidates, test_nounphrase_fmeasure = _evaluate_tfidf(
    test_nounphrase_candidates, test_label,
    './csv/tfidf test nounphrase', number_keyphrase)
print("Precision, recall, f-measure on noun phrase testing:", test_nounphrase_fmeasure)


TF-IDF n-gram version:
Precision, recall, f-measure on ngram training: (16.34, 15.61, 15.97)
Precision, recall, f-measure on ngram testing: (16.13, 16.01, 16.07)
TF-IDF noun phrase version:
Precision, recall, f-measure on noun phrase training: (16.62, 15.87, 16.24)
Precision, recall, f-measure on noun phrase testing: (16.93, 16.8, 16.86)
