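"""NLP4Types: predict DBpedia types for resources from their long abstracts.

Pipeline steps (selected via the first/last step arguments):
  1. Merge DBpedia instance types with long abstracts.
  2. Named-entity recognition via DBpedia Spotlight.
  3. Text preprocessing (stemming, lemmatization, stop words).
  4. Vectorization of the preprocessed text.
  5. Training a linear SVC classifier.
  6. Prediction on a held-out split or on unseen data.
  7. Hierarchical evaluation against the DBpedia ontology tree.
"""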
import logging
import pickle
import pandas as pd
import joblib  # sklearn.externals.joblib was removed in modern scikit-learn
import utils.dbpedia as dbp
import utils.filesutils as fu
import utils.stats as stats
import utils.stringutils as su
import utils.spotlight as sl
import utils.features as feat
import utils.preprocess as pp
import utils.classifiers as cls
import utils.utils as ut
import utils.args as uarg
import utils.measurements as meas
import utils.ne as ne

logger = logging.getLogger(__name__)
COL_INDIVIDUAL_NAME = 'individual'
COL_ABSTRACT_NAME = 'abstract'
COL_TYPE_NAME = 'type'
COL_NE_TYPE_NAME = 'ne_types'
COL_PREDICTIONS_NAME = 'predictions'
COL_LABELS_NAME = 'labels'
COL_UNSEEN_NAME = 'unseen'
def add_named_entities(une, df, confidence, support, dbonly, reuse_ne_path=""):
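    """Annotate each row's abstract with DBpedia Spotlight named-entity types.

    Fills the COL_NE_TYPE_NAME column in place; when `une` is False the
    column is left empty. `confidence` and `support` are forwarded to the
    Spotlight service, and `dbonly` is forwarded as well (presumably
    restricting annotations to DBpedia types). If `reuse_ne_path` names an
    earlier NE result file, entries found there are reused instead of
    re-querying the service.
    """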
logger.info("Starting add_named_entities")
if reuse_ne_path:
rne_dict = ne.get_ne_dict(reuse_ne_path, COL_INDIVIDUAL_NAME, COL_NE_TYPE_NAME)
    # add a new column for NE types; when une is False the column stays empty
if une:
slservice = sl.SpotLightNER()
net_list = [] # list for NE types
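        # +1 offsets below: itertuples() prepends the DataFrame index as
        # the first element of each row tuple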
abs_index = df.columns.get_loc(COL_ABSTRACT_NAME) + 1
ind_index = df.columns.get_loc(COL_INDIVIDUAL_NAME) + 1
for (tupin, tup) in enumerate(df.itertuples()):
logger.debug("NER %s/%s" % (tupin + 1, len(df)))
individual = tup[ind_index]
rne_found = False
if reuse_ne_path:
logger.debug("Checkin if reusable NE for individual %s" % individual)
if individual in rne_dict:
# there is an existing entry
rne_found = True
net = rne_dict[individual]
logger.debug("Found entities for individual %s, NEs=%s" % (individual,net))
if not rne_found:
# if it was not available for reuse, we have to parse it
abstract = tup[abs_index]
net = su.to_string(slservice.get_annotations(abstract, confidence, support, dbonly), " ")
net_list.append(net)
net_series = pd.Series(net_list)
df[COL_NE_TYPE_NAME] = net_series.values
else:
df[COL_NE_TYPE_NAME] = ""
logger.info("End add_named_entities")
def get_types_and_abstracts(types_path, abstract_path, res_path, usn=False, upa="", enctyp=False, encabs=False, tmp=False):
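    """Merge DBpedia type and abstract dumps into one DataFrame saved at `res_path`.

    When `usn` is True, individuals listed in the unseen file `upa` are
    dropped first so they cannot leak into training. `enctyp` and `encabs`
    are encoding flags forwarded to the dump readers; `tmp` triggers a
    temporary debug dump of type entries that did not survive the join.
    """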
logger.info("Starting get_types_and_abstracts")
# get types df
dft = dbp.get_resources_from_types([], types_path, enctyp)
if usn:
logger.info("Removing unseen data from the %s entries found" % len(dft))
# get types df from unseen
dfut = dbp.get_resources_from_types([], upa)
# https://stackoverflow.com/questions/27965295/dropping-rows-from-dataframe-based-on-a-not-in-condition
        unseen_ind_list = dfut[COL_INDIVIDUAL_NAME].tolist()
        logger.info("Found %s unseen individuals to remove" % len(unseen_ind_list))
        # save length for logging
        pre_len = len(dft)
        # remove individuals that are in unseen
        dft = dft[~dft[COL_INDIVIDUAL_NAME].isin(unseen_ind_list)]
        logger.info("Removed %s unseen entries from the total of %s" % (pre_len - len(dft), pre_len))
# get abstract df
dfa = dbp.get_resource_abstracts([], abstract_path, encabs)
# full df types and abstract
df_full = fu.join_dataframes(dft, dfa)
#### TEMP
if tmp:
gs_list = df_full[COL_INDIVIDUAL_NAME].tolist()
        logger.debug(gs_list)
df_gs_excluded = dft[~dft[COL_INDIVIDUAL_NAME].isin(gs_list)]
fu.df_to_csv("./tmp_df_gs_excluded", df_gs_excluded)
##### TEMP
logger.info("Got %s type entries, %s abstracts. Combined size=%s" % (len(dft), len(dfa), len(df_full)))
# store types to disk
fu.df_to_csv(res_path, df_full)
logger.debug("Got finally %s entries combining abstract and NEs" % len(df_full))
logger.info("End of get_types_and_abstracts")
def main():
"""
# Entry method of the NLP4Types method
"""
logger.info("Starting NLP 4 Types")
    # TODO: expose the hard-coded data paths below as CLI parameters
# Files
prefix = args.prefix
uprefix = args.uprefix
input_base = args.ibase
output_base = args.obase
file_dbo_tree = args.dbtree
unseen_path = args.upath
rne_path = args.rnepath
types_path = '../../data/' + prefix + '_instance_types_en.ttl'
abstract_path = '../../data/' + prefix + '_long_abstracts_en.ttl'
unseen_abstract_path = '../../data/' + uprefix + '_long_abstracts_en.ttl'
res_path = '../../data/results/' + prefix + '_merged_types_abstract.csv'
res_unseen_path = '../../data/results/' + prefix + '_merged_types_abstract_unseen.csv'
ne_res_path = '../../data/results/' + prefix + '_ne_types_abstract.csv'
pred_res_path = '../../data/results/' + prefix + '_pred_and_labels.csv'
file_pp_text_list = '../../data/results/' + prefix + '_pp_text_list.p'
file_vec_data = '../../data/results/' + prefix + '_vec_data.p'
file_vectorizer = '../../data/results/' + prefix + '_vectorizer.p'
file_classifier = '../../data/results/' + prefix + '_classifier.p'
file_train_index = '../../data/results/' + prefix + '_train_index.p'
file_args = '../../data/results/' + prefix + '_args.p'
file_unseen_df = '../../data/results/' + prefix + '_unseen_df.p'
file_unseen_data_vec = '../../data/results/' + prefix + '_unseen_data_vec.p'
# Parameters
first_step = args.fstep
last_step = args.lstep
get_stats = args.stats
use_abstract = args.abstract
use_ne_types = args.ner
use_stemm = args.stemm
use_lemma = args.lemma
use_sw = args.sw
dbonly = args.dbonly
confidence = args.confidence
support = args.support
train_size = args.tsize
ne_weight = args.neweight
unseen = args.unseen
st = ut.Steps(first_step, last_step)
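    # Steps gates each stage below: isstep(n, name) presumably returns True
    # only when n lies within [first_step, last_step], so stages can run
    # individually or as a contiguous range, with intermediate results
    # exchanged through the pickled/CSV files above.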
if st.isstep(1, "Generate types and abstracts"):
# generate data for types and abstracts
        get_types_and_abstracts(types_path, abstract_path, res_path, unseen, unseen_path, enctyp=True, encabs=True)
if st.isstep(2, "NER"): # generate NE types as words
# load data from disk
df_types_abs = fu.csv_to_df(res_path)
logger.info("Combined seen data for a total of %s entries" % len(df_types_abs))
# generate stats if necessary
if get_stats:
stats.column_count(df_types_abs, COL_TYPE_NAME)
add_named_entities(use_ne_types, df_types_abs, confidence, support, dbonly, reuse_ne_path=rne_path)
logger.debug("Writing df_types_abs with NE to % s" % ne_res_path)
fu.df_to_csv(ne_res_path, df_types_abs)
if st.isstep(3, "Preprocessing"): # start preprocessing text
# load data from disk
df_pre = fu.csv_to_df(ne_res_path)
        logger.debug(df_pre.columns)
# preprocess data
pp_text_list = pp.process_data_frame(df_pre, use_abstract, COL_ABSTRACT_NAME,
use_ne_types, COL_NE_TYPE_NAME, ne_weight,
use_stemm=use_stemm, use_lemma=use_lemma,use_sw=use_sw)
        # persist the preprocessed text list for later steps
        with open(file_pp_text_list, "wb") as f:
            pickle.dump(pp_text_list, f)
if st.isstep(4, "Vectorization"): # start vectorization
        # reload the preprocessed text list
        with open(file_pp_text_list, "rb") as f:
            pp_text_list = pickle.load(f)
# calculate features
# using training size to calculate the final index
if not unseen:
train_index = int(train_size * len(pp_text_list))
else:
            train_index = len(pp_text_list)  # with unseen data, train on everything
logger.info("Training rows [%s-%s], testing rows [%s-%s]" % (0, train_index, train_index, len(pp_text_list)))
# vectorize data
vec_data, vectorizer = feat.vectorize_data(pp_text_list, 0, len(pp_text_list))
        # persist vectorized data, the fitted vectorizer and the split index
        with open(file_vec_data, "wb") as f:
            pickle.dump(vec_data, f)
        with open(file_vectorizer, "wb") as f:
            pickle.dump(vectorizer, f)
        with open(file_train_index, "wb") as f:
            pickle.dump(train_index, f)
if st.isstep(5, "Training"): # start training
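        # The `not in locals()` checks reuse in-memory results when earlier
        # steps ran in the same invocation, falling back to the pickled
        # artifacts when this step runs on its own.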
        if 'vec_data' not in locals():
            with open(file_vec_data, "rb") as f:
                vec_data = pickle.load(f)
            logger.debug("Unpickled %s" % file_vec_data)
        if 'train_index' not in locals():
            with open(file_train_index, "rb") as f:
                train_index = pickle.load(f)
            logger.debug("Unpickled %s, value=%s" % (file_train_index, train_index))
# get training labels
if 'df_pre' not in locals():
# load data from disk
df_pre = fu.csv_to_df(ne_res_path)
logger.debug("Df pre rescued from %s" % ne_res_path)
        training_labels = df_pre[COL_TYPE_NAME].tolist()[:train_index]
        train_data = vec_data[:train_index]
        logger.debug("Training labels ready %s" % len(training_labels))
        # train the classifier
        logger.debug("Start training")
        classifier = cls.train_linear_svc(train_data, training_labels)
        logger.debug("End training")
        # persist the trained classifier
        logger.debug("Saving classifier to %s" % file_classifier)
        joblib.dump(classifier, file_classifier)
if st.isstep(6, "Prediction/testing"): # start prediction
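        # Two prediction modes: when `unseen` is set, a separate unseen
        # dataset is built and vectorized with the training-time vectorizer;
        # otherwise the held-out tail of the training split is predicted.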
if unseen:
            if 'vectorizer' not in locals():
                with open(file_vectorizer, "rb") as f:
                    vectorizer = pickle.load(f)
                logger.debug("Unpickled %s" % file_vectorizer)
logger.info("Getting unseen data for prediction")
logger.debug("Mind that data from %s should have been previously removed from training set" % res_unseen_path)
# generate type and abstract for unseen data
get_types_and_abstracts(unseen_path, unseen_abstract_path, res_unseen_path, enctyp=False, encabs=True, tmp=True)
# get unseen data from disk
df_types_abs_unseen = fu.csv_to_df(res_unseen_path)
logger.info("Got %s entries of unseen data " % len(df_types_abs_unseen))
add_named_entities(use_ne_types, df_types_abs_unseen, confidence, support, dbonly)
            # preprocess unseen data with the same settings used for training
            pp_unseen_text_list = pp.process_data_frame(df_types_abs_unseen, use_abstract, COL_ABSTRACT_NAME,
                                                        use_ne_types, COL_NE_TYPE_NAME, ne_weight,
                                                        use_stemm=use_stemm, use_lemma=use_lemma, use_sw=use_sw)
# use the vectorizer from training data to vectorize unseen data
prediction_data = feat.vectorize_data_unseen(vectorizer, pp_unseen_text_list)
else:
            if 'vec_data' not in locals():
                with open(file_vec_data, "rb") as f:
                    vec_data = pickle.load(f)
                logger.debug("Unpickled %s" % file_vec_data)
            if 'train_index' not in locals():
                with open(file_train_index, "rb") as f:
                    train_index = pickle.load(f)
                logger.debug("Unpickled %s, value=%s" % (file_train_index, train_index))
            if 'df_pre' not in locals():
                df_pre = fu.csv_to_df(ne_res_path)
                logger.debug("Reloaded df_pre from %s" % ne_res_path)
logger.info("train_index=%s" % train_index)
prediction_data = vec_data[train_index:]
        if 'classifier' not in locals():
            logger.debug("Loading classifier from %s" % file_classifier)
            classifier = joblib.load(file_classifier)
        # nothing to predict on
        if prediction_data.shape[0] <= 0:
            logger.info("Data for prediction is empty; nothing to predict. Stopping")
            return
predictions = []
try:
predictions = classifier.predict(prediction_data)
except Exception as e:
logger.error('Failed to classify: ' + str(e))
logger.error('Exiting')
return
# save predictions and labels
if unseen:
labels = df_types_abs_unseen[COL_TYPE_NAME].tolist()
print("Predictions %s vs. labels %s" % (len(predictions), len(labels)))
df_pred = pd.DataFrame({COL_PREDICTIONS_NAME: predictions, COL_LABELS_NAME: labels})
fu.df_to_csv(pred_res_path, df_pred)
else:
labels = df_pre[COL_TYPE_NAME].tolist()[train_index:]
df_pred = pd.DataFrame({COL_PREDICTIONS_NAME: predictions, COL_LABELS_NAME: labels})
fu.df_to_csv(pred_res_path, df_pred)
if st.isstep(7, "Evaluation"): # start measuring performance
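        # Score predictions against labels with hierarchical measures over
        # the pickled DBpedia ontology tree (args.dbtree).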
# load data from df
df_pred_label = fu.csv_to_df(pred_res_path)
        with open(file_dbo_tree, "rb") as f:
            dbtree = pickle.load(f)
        logger.debug("Unpickled dbtree from %s" % file_dbo_tree)
general_accuracy, hier_prec, hier_recall, hier_f_meas = meas.get_hierarchical_measurements(df_pred_label, COL_PREDICTIONS_NAME, COL_LABELS_NAME, dbtree)
logger.info("**** Results *****")
logger.info("* General accuracy=%s" % general_accuracy)
logger.info("* Hierachical precission=%s" % hier_prec)
logger.info("* Hierachical recall=%s" % hier_recall)
logger.info("* Hierachical F-measure=%s" % hier_f_meas)
st.endsteps()
logger.info(st.get_print_times())
logger.info("Arguments %s" % args)
    with open(file_args, "wb") as f:
        pickle.dump(args, f)
logger.info("End of NLP 4 Types")
# TODO: move logging setup to a logging config file
if __name__ == '__main__':
    ps = uarg.Args()
    args = ps.get_args()
    logfile = 'log_nlp4types.log'
    if args.log:
        logfile = args.log
    logging.basicConfig(filename=logfile, format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.DEBUG)
    logger.info("Arguments %s" % args)
    main()