# load_data_eraser_fever.py
# Loads the ERASER FEVER benchmark dataset into pandas DataFrames with
# evidence spans annotated inline in each passage.
import tensorflow as tf
import pandas as pd
import os
import json
def annotated_doc(doc_fname, evidences, classification):
    """Return the document text with its evidence spans wrapped in tags.

    Args:
        doc_fname: Path to a whitespace-tokenized plain-text document.
        evidences: List of evidence groups, each a list of dicts carrying
            `start_token`, `end_token`, and `text`. With exactly one group,
            every evidence in it is used; with several groups, only the
            first evidence of each group is kept (mirrors the upstream
            ERASER data layout).
        classification: 1 for SUPPORTS; any other value yields NEG tags.

    Returns:
        The document as one space-joined string, with each evidence span
        surrounded by <POS>...</POS> or <NEG>...</NEG> markers.

    Raises:
        AssertionError: if an evidence's recorded `text` does not match the
            document tokens at its `start_token`:`end_token` offsets. The
            offending pieces are printed first to aid debugging.
    """
    with open(doc_fname, 'r') as fin:
        # read().split() tokenizes on any whitespace, collapsing newlines —
        # equivalent to the line-strip/join/split dance but in one pass.
        doc_content = fin.read().split()
    spans = []
    evi_texts = []
    if evidences:
        if len(evidences) == 1:
            flat = evidences[0]
        else:
            flat = [group[0] for group in evidences]
        flat = sorted(flat, key=lambda e: e['start_token'])
        evi_texts = [e['text'] for e in flat]
        spans = [(e['start_token'], e['end_token']) for e in flat]
    # Sentinel span at EOF so the loop below also copies the tail of the
    # document after the last evidence.
    spans.append((len(doc_content), -1))
    tag = 'POS' if classification == 1 else 'NEG'
    ret = doc_content[:spans[0][0]]
    for i in range(len(spans) - 1):
        start, end = spans[i]
        evi = doc_content[start:end]
        # Explicit check instead of `assert`: asserts are stripped under
        # python -O, and we want this validation to always run.
        if ' '.join(evi) != evi_texts[i]:
            print(doc_fname)
            print(evi)
            print(evi_texts[i])
            raise AssertionError  # same type the original raised; callers catch it
        ret.append('<{}>'.format(tag))
        ret += evi
        ret.append('</{}>'.format(tag))
        ret += doc_content[end:spans[i + 1][0]]
    return ' '.join(ret)
# Load one FEVER split (.jsonl of annotations) into a DataFrame with the
# evidence spans marked inline in each passage.
def load_dataset(docs_folder, dataset_fname):
    """Parse a FEVER .jsonl annotation file into a pandas DataFrame.

    Args:
        docs_folder: Directory holding the raw document files referenced by
            each annotation's `docids`.
        dataset_fname: Path to the .jsonl file; each line is a JSON object
            with keys `annotation_id`, `classification`, `docids`,
            `evidences`, and `query`.

    Returns:
        DataFrame with columns `passage`, `query`, `classification`
        (1 for SUPPORTS, 0 otherwise), `annotation_id`, and `docids`.
        Records whose evidence text fails to align with the document are
        reported to stdout and skipped entirely.

    Bug fix: the original appended `annotation_id`/`classification`/`docids`
    (and `query`) even when `annotated_doc` raised, but never `passage`,
    leaving the column lists with unequal lengths — `pd.DataFrame(ret)`
    then raises ValueError. All fields are now appended only on success.
    """
    ret = {'passage': [], 'query': [],
           'classification': [], 'annotation_id': [], "docids": []}
    with open(dataset_fname, 'r') as fin:
        lines = fin.readlines()
    for line in lines:
        annotation = json.loads(line)
        classification = 1 if annotation['classification'] == 'SUPPORTS' else 0
        doc_fname = os.path.join(docs_folder, annotation['docids'][0])
        try:
            passage = annotated_doc(
                doc_fname, annotation['evidences'], classification)
        except AssertionError:
            # Evidence text did not match the document tokens; report the
            # record and drop it so every column stays the same length.
            print(line)
            print(doc_fname)
            print('-------------------------------------')
            continue
        ret['annotation_id'].append(annotation['annotation_id'])
        ret['classification'].append(classification)
        ret["docids"].append(annotation['docids'])
        ret['passage'].append(passage)
        ret['query'].append(annotation['query'])
    return pd.DataFrame(ret)
# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the ERASER FEVER archive and load its three splits.

    Downloads (via the Keras file cache) and extracts the fever.tar.gz
    archive, then parses the train/val/test .jsonl files.

    NOTE(review): `force_download` is accepted but currently unused — TODO
    either wire it into the cache logic or remove it.

    Returns:
        A (df_train, df_val, df_test) tuple of pandas DataFrames, one per
        split, as produced by `load_dataset`.
    """
    archive_path = tf.keras.utils.get_file(
        fname="fever.tar.gz",
        origin="http://www.eraserbenchmark.com/zipped/fever.tar.gz",
        extract=True)
    # The archive extracts into a 'fever' directory next to the cached file.
    fever_dir = os.path.join(os.path.dirname(archive_path), 'fever')
    docs_dir = os.path.join(fever_dir, 'docs')
    splits = tuple(
        load_dataset(docs_dir, os.path.join(fever_dir, split + '.jsonl'))
        for split in ('train', 'val', 'test'))
    return splits