/
wikihow.py
204 lines (175 loc) 路 8.09 KB
/
wikihow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
# coding=utf-8
# Copyright 2020 The TensorFlow Datasets Authors and the HuggingFace Datasets Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Lint as: python3
"""WikiHow Datasets."""
from __future__ import absolute_import, division, print_function
import csv
import os
import re
import datasets
# BibTeX entry for the WikiHow paper; passed as `citation` to DatasetInfo.
_CITATION = """
@misc{koupaee2018wikihow,
title={WikiHow: A Large Scale Text Summarization Dataset},
author={Mahnaz Koupaee and William Yang Wang},
year={2018},
eprint={1810.09305},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
"""
# Human-readable dataset description; passed as `description` to DatasetInfo.
_DESCRIPTION = """
WikiHow is a new large-scale dataset using the online WikiHow
(http://www.wikihow.com/) knowledge base.
There are two features:
- text: wikihow answers texts.
- headline: bold lines as summary.
There are two separate versions:
- all: consisting of the concatenation of all paragraphs as the articles and
the bold lines as the reference summaries.
- sep: consisting of each paragraph and its summary.
Download "wikihowAll.csv" and "wikihowSep.csv" from
https://github.com/mahnazkoupaee/WikiHow-Dataset and place them in manual folder
https://www.tensorflow.org/datasets/api_docs/python/tfds/download/DownloadConfig.
Train/validation/test splits are provided by the authors.
Preprocessing is applied to remove short articles
(abstract length < 0.75 article length) and clean up extra commas.
"""
# CSV column / feature name holding the article text.
_DOCUMENT = "text"
# CSV column / feature name holding the reference summary.
_SUMMARY = "headline"
# Per-split files downloaded by the builder; every line is read as one title
# and the resulting sets decide which CSV rows belong to which split.
_URLS = {
    "train": "https://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_train.txt",
    "validation": "https://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_val.txt",
    "test": "https://raw.githubusercontent.com/mahnazkoupaee/WikiHow-Dataset/master/all_test.txt",
}
class WikihowConfig(datasets.BuilderConfig):
    """Builder configuration describing one variant of the WikiHow dataset."""

    def __init__(self, filename=None, **kwargs):
        """Create a WikiHow builder configuration.

        Args:
          filename: name of the CSV file that backs this configuration.
          **kwargs: keyword arguments forwarded to the parent BuilderConfig.
        """
        # Version history:
        #   1.1.0 - remove empty document and summary strings.
        #   1.2.0 - add train/validation/test split, add cleaning & filtering.
        current_version = datasets.Version("1.2.0")
        super().__init__(version=current_version, **kwargs)
        self.filename = filename
class Wikihow(datasets.GeneratorBasedBuilder):
    """WikiHow: A Large Scale Text Summarization Dataset.

    The article CSV for the selected config must be placed in the manual
    directory by the user (see `manual_download_instructions`); the per-split
    title lists in `_URLS` are downloaded automatically.
    """

    BUILDER_CONFIGS = [
        WikihowConfig(
            name="all",
            filename="wikihowAll.csv",
            description="Use the concatenation of all paragraphs as the articles"
            " and the bold lines as the reference summaries",
        ),
        WikihowConfig(name="sep", filename="wikihowSep.csv", description="use each paragraph and its summary."),
    ]

    @property
    def manual_download_instructions(self):
        """Return the text explaining how to obtain the manual CSV files."""
        return """\
You need to manually download two wikihow files. An overview of which files to download can be seen at https://github.com/mahnazkoupaee/WikiHow-Dataset.
You need to download the following two files manually:
1) https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 and save the file under <path/to/folder>/wikihowAll.csv
2) https://ucsb.app.box.com/s/7yq601ijl1lzvlfu4rjdbbxforzd2oag and save the file under <path/to/folder>/wikihowSep.csv
The <path/to/folder> can e.g. be "~/manual_wikihow_data".
Wikihow can then be loaded using the following command `datasets.load_dataset("wikihow", data_dir="<path/to/folder>")`.
"""

    def _info(self):
        """Return the DatasetInfo; the "sep" config exposes two extra string columns."""
        feature_names = [_DOCUMENT, _SUMMARY, "title"]
        if self.config.name == "sep":
            feature_names.extend(["overview", "sectionLabel"])
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features({k: datasets.Value("string") for k in feature_names}),
            supervised_keys=None,
            homepage="https://github.com/mahnazkoupaee/WikiHow-Dataset",
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        Args:
          dl_manager: download manager providing `manual_dir` and
            `download_and_extract`.

        Raises:
          FileNotFoundError: if the CSV file of the selected config is not
            present in the manual directory.
        """
        path_to_manual_file = os.path.join(
            os.path.abspath(os.path.expanduser(dl_manager.manual_dir)), self.config.filename
        )
        # Validate the manual file first so a missing file is reported before
        # any title list is downloaded.
        if not os.path.exists(path_to_manual_file):
            raise FileNotFoundError(
                "{} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('wikihow', data_dir=...)` that includes a file named {}. Manual download instructions: {}".format(
                    path_to_manual_file, self.config.filename, self.manual_download_instructions
                )
            )

        # Each downloaded file lists one title per line; collect them per split.
        dl_path = dl_manager.download_and_extract(_URLS)
        titles = {}
        for split_key, title_path in dl_path.items():
            with open(title_path, encoding="utf-8") as f:
                titles[split_key] = {line.strip() for line in f}

        # All splits read the same CSV; the title set selects the rows.
        split_names = (
            (datasets.Split.TRAIN, "train"),
            (datasets.Split.VALIDATION, "validation"),
            (datasets.Split.TEST, "test"),
        )
        return [
            datasets.SplitGenerator(
                name=split,
                gen_kwargs={
                    "path": path_to_manual_file,
                    "title_set": titles[split_key],
                },
            )
            for split, split_key in split_names
        ]

    def _generate_examples(self, path=None, title_set=None):
        """Yields examples.

        Args:
          path: path to the CSV file of the selected config.
          title_set: set of titles for the split being generated; a row is
            kept when its title, with spaces removed, is in this set.

        Yields:
          Tuples of (row index, example dict). The index counts every data
          row of the CSV, including rows that are skipped.

        Raises:
          ValueError: if the CSV file is empty or its header row does not
            match the columns expected for the "all" / "sep" config.
        """
        # Only the two known configs have a fixed header; other names skip the check.
        expected_headers = {
            "all": ["headline", "title", "text"],
            "sep": ["overview", "headline", "text", "sectionLabel", "title"],
        }.get(self.config.name)

        with open(path, encoding="utf-8") as f:
            reader = csv.reader(f)
            # A bare next() would leak StopIteration out of this generator
            # (turned into RuntimeError by Python); report an empty file clearly.
            headers = next(reader, None)
            if headers is None:
                raise ValueError("{} is empty: no header row found".format(path))
            if expected_headers is not None and headers != expected_headers:
                raise ValueError(
                    "Mismatched header in {}: expected {}, got {}".format(
                        self.config.filename, expected_headers, headers
                    )
                )

            key2id = {key: i for i, key in enumerate(headers)}
            extra_keys = [k for k in key2id if k not in (_SUMMARY, _DOCUMENT)]
            for i, line in enumerate(reader):
                # skip empty line or insufficient line.
                if len(line) != len(key2id):
                    continue
                summary = line[key2id[_SUMMARY]].strip()
                document = line[key2id[_DOCUMENT]].strip()
                summary, document = _filter_and_clean(summary, document)
                # Filtered-out pairs come back as empty strings.
                if not (summary and document):
                    continue
                # Title lists are matched against the title with spaces removed.
                if line[key2id["title"]].strip().replace(" ", "") not in title_set:
                    continue
                d = {k: line[key2id[k]].strip() for k in extra_keys}
                d[_DOCUMENT] = document
                d[_SUMMARY] = summary
                yield i, d
# This function follows the data processing of the original paper; see
# https://github.com/mahnazkoupaee/WikiHow-Dataset/blob/master/process.py
def _filter_and_clean(abstract, article):
"""Remove short article and clean up commas in abstract and article."""
# a threshold is used to remove short articles with long summaries
# as well as articles with no summary
if len(abstract) < (0.75 * len(article)):
# remove extra commas in abstracts
abstract = abstract.replace(".,", ".")
# remove extra commas in articles
article = re.sub(r"[.]+[\n]+[,]", ".\n", article)
return abstract, article
else:
return "", ""