fix wiki
zheyuye committed Jun 30, 2020
1 parent 9773efd commit 6cc5ccd
Showing 3 changed files with 26 additions and 6 deletions.
7 changes: 6 additions & 1 deletion scripts/datasets/pretrain_corpus/README.md
@@ -25,13 +25,18 @@ Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:T

Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) to prepare the data.

```
```bash
# Download
python prepare_wikipedia.py --mode download --lang en --date latest -o ./

# Properly format the text files
python prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./

```
Downloading and formatting the dump is time-consuming, so we offer an alternative: download the prepared raw text file from an S3 bucket. This raw text file is in English, was dumped on 2020-06-20, and was formatted with the process above (`--lang en --date 20200620`).

```bash
python prepare_wikipedia.py --mode download_prepared -o ./
```
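As the name of the S3 object suggests, the prepared corpus stores one article per line. Below is a minimal sketch of how the downloaded file could be consumed; the path assumes the default download location with `-o ./`, and everything else is illustrative rather than part of the script.

```python
# Minimal sketch: iterate over the prepared corpus, which stores one article per line.
corpus_path = './wikicorpus_one_article_per_line_20200620.txt'  # default location when using `-o ./`

with open(corpus_path, 'r', encoding='utf-8') as f:
    for line in f:
        article = line.strip()
        if not article:
            continue
        # ... feed `article` into your pretraining pipeline
```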
### References
- [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)
24 changes: 19 additions & 5 deletions scripts/datasets/pretrain_corpus/prepare_wikipedia.py
@@ -3,7 +3,7 @@
import sys
import argparse
import glob
from gluonnlp.utils.misc import download
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY

_CITATION = """\
@@ -47,22 +47,29 @@
_BASE_URL_TMPL\
= "https://dumps.wikimedia.org/{lang}wiki/{date}/{lang}wiki-{date}-pages-articles.xml.bz2"
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'wikipedia.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_URLS = {
'wikipedia-en-20200620':
'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikicorpus_one_article_per_line_20200620.txt',
}

def get_url(lang, date):
return _BASE_URL_TMPL.format(lang=lang, date=date)


def try_import_wikiextractor():
try:
sys.path.append(_CURR_DIR)
import WikiExtractor
except ImportError:
try:
download(
'https://raw.githubusercontent.com/attardi/wikiextractor/'
'16186e290d9eb0eb3a3784c6c0635a9ed7e855c3/WikiExtractor.py',
'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
sys.path.append(_CURR_DIR)
import WikiExtractor
except:
raise ImportError('Cannot import WikiExtractor! You can download the "WikiExtractor.py"'
@@ -107,12 +114,13 @@ def get_parser():
parser = argparse.ArgumentParser(description='Download and Prepare the Wikipedia')
parser.add_argument('--mode', type=str,
default='download+format',
choices=['download', 'format', 'download+format'],
choices=['download', 'format', 'download+format', 'download_prepared'],
help='Specify the action you want the app to take. '
'"download" means to download the Wikipedia dump. '
'"format" means to extract the content and '
'format it for pretraining. "download+format" means to combine '
'these two options')
'these two options'
'"download_prepared" downloads the prepared txt from S3 directly')
parser.add_argument('--lang', type=str, default='en',
help='Language of the wikipedia dump file.'
'We only support English and Chinese for current version')
@@ -171,6 +179,12 @@ def main(args):
elif args.mode == 'download+format':
downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
format_wikicorpus(downloaded_file, args.output, args.bytes)
elif args.mode == 'download_prepared':
url = _URLS['wikipedia-en-20200620']
file_hash = _URL_FILE_STATS[url]
target_download_location = os.path.join(args.output,
os.path.basename(url))
download(url, target_download_location, sha1_hash=file_hash)
else:
raise NotImplementedError

1 change: 1 addition & 0 deletions scripts/datasets/url_checksums/wikipedia.txt
@@ -0,0 +1 @@
https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikicorpus_one_article_per_line_20200620.txt 67825b9c721192acbf385816984ac8a250cf5216 13538212348
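For reference, each line of this checksum file records the download URL, its SHA-1 hash, and the file size in bytes. Below is a minimal sketch of how such a line could be parsed and a downloaded file verified; the helper names are illustrative, not the actual `load_checksum_stats` implementation in gluonnlp.

```python
import hashlib

def parse_checksum_file(path):
    """Parse lines of the form '<url> <sha1-hash> <size-in-bytes>' into a dict keyed by URL."""
    stats = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            url, sha1, size = line.split()
            stats[url] = (sha1, int(size))
    return stats

def sha1_of_file(path, chunk_size=1024 * 1024):
    """Compute the SHA-1 hash of a local file, reading it in chunks."""
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# Example usage (paths and variables are illustrative):
# stats = parse_checksum_file('scripts/datasets/url_checksums/wikipedia.txt')
# expected_sha1, expected_size = stats[url]
# assert sha1_of_file(local_path) == expected_sha1
```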
