fix wiki
zheyuye committed Jun 30, 2020
1 parent 9773efd commit 6cc5ccd
Showing 3 changed files with 26 additions and 6 deletions.
7 changes: 6 additions & 1 deletion scripts/datasets/pretrain_corpus/README.md
@@ -25,13 +25,18 @@ Also, you should follow the [license](https://www.gutenberg.org/wiki/Gutenberg:T

Please install [attardi/wikiextractor](https://github.com/attardi/wikiextractor) to prepare the data.

```
```bash
# Download
python prepare_wikipedia.py --mode download --lang en --date latest -o ./

# Properly format the text files
python prepare_wikipedia.py --mode format -i [path-to-wiki.xml.bz2] -o ./

```
Downloading and formatting the dump is time-consuming, so we offer an alternative: download the prepared raw text file from an S3 bucket. This raw text file is in English, was dumped on 2020-06-20, and was formatted with the process above (`--lang en --date 20200620`).

```bash
python prepare_wikipedia.py --mode download_prepared -o ./
```
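As the name of the S3 object suggests, the prepared corpus stores one article per line. Below is a minimal sketch of how the downloaded file could be consumed; the path assumes the default download location with `-o ./`, and everything else is illustrative rather than part of the script.

```python
# Minimal sketch: iterate over the prepared corpus, which stores one article per line.
corpus_path = './wikicorpus_one_article_per_line_20200620.txt'  # default location when using `-o ./`

with open(corpus_path, 'r', encoding='utf-8') as f:
    for line in f:
        article = line.strip()
        if not article:
            continue
        # ... feed `article` into your pretraining pipeline
```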
### References
- [NVIDIA/DeepLearningExamples](https://github.com/NVIDIA/DeepLearningExamples/tree/master/PyTorch/LanguageModeling/BERT)
24 changes: 19 additions & 5 deletions scripts/datasets/pretrain_corpus/prepare_wikipedia.py
@@ -3,7 +3,7 @@
import sys
import argparse
import glob
from gluonnlp.utils.misc import download
from gluonnlp.utils.misc import download, load_checksum_stats
from gluonnlp.registry import DATA_PARSER_REGISTRY, DATA_MAIN_REGISTRY

_CITATION = """\
@@ -47,22 +47,29 @@
_BASE_URL_TMPL\
= "https://dumps.wikimedia.org/{lang}wiki/{date}/{lang}wiki-{date}-pages-articles.xml.bz2"
_CURR_DIR = os.path.realpath(os.path.dirname(os.path.realpath(__file__)))
_URL_FILE_STATS_PATH = os.path.join(_CURR_DIR, '..', 'url_checksums', 'wikipedia.txt')
_URL_FILE_STATS = load_checksum_stats(_URL_FILE_STATS_PATH)

_URLS = {
'wikipedia-en-20200620':
'https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikicorpus_one_article_per_line_20200620.txt',
}

def get_url(lang, date):
return _BASE_URL_TMPL.format(lang=lang, date=date)


def try_import_wikiextractor():
try:
sys.path.append(_CURR_DIR)
import WikiExtractor
except ImportError:
try:
download(
'https://raw.githubusercontent.com/attardi/wikiextractor/'
'16186e290d9eb0eb3a3784c6c0635a9ed7e855c3/WikiExtractor.py',
'https://raw.githubusercontent.com/attardi/wikiextractor/master/WikiExtractor.py',
path=os.path.join(_CURR_DIR, 'WikiExtractor.py'),
sha1_hash='3c4896a837b75c476d23c037e8d6c7fdfd9a29eb')
sys.path.append(_CURR_DIR)
import WikiExtractor
except:
raise ImportError('Cannot import WikiExtractor! You can download the "WikiExtractor.py"'
@@ -107,12 +114,13 @@ def get_parser():
parser = argparse.ArgumentParser(description='Download and Prepare the Wikipedia')
parser.add_argument('--mode', type=str,
default='download+format',
choices=['download', 'format', 'download+format'],
choices=['download', 'format', 'download+format', 'download_prepared'],
help='Specify the action you want the app to take. '
'"download" means to download the Wikipedia dump. '
'"format" means to extract the content and '
'format it for pretraining. "download+format" means to combine '
'these two options')
'these two options'
'"download_prepared" downloads the prepared txt from S3 directly')
parser.add_argument('--lang', type=str, default='en',
help='Language of the wikipedia dump file.'
'We only support English and Chinese for current version')
@@ -171,6 +179,12 @@ def main(args):
elif args.mode == 'download+format':
downloaded_file = download_wikicorpus(args.lang, args.date, args.output)
format_wikicorpus(downloaded_file, args.output, args.bytes)
elif args.mode == 'download_prepared':
url = _URLS['wikipedia-en-20200620']
file_hash = _URL_FILE_STATS[url]
target_download_location = os.path.join(args.output,
os.path.basename(url))
download(url, target_download_location, sha1_hash=file_hash)
else:
raise NotImplementedError

1 change: 1 addition & 0 deletions scripts/datasets/url_checksums/wikipedia.txt
@@ -0,0 +1 @@
https://gluonnlp-numpy-data.s3-us-west-2.amazonaws.com/pretrain_corpus/wikicorpus_one_article_per_line_20200620.txt 67825b9c721192acbf385816984ac8a250cf5216 13538212348
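For reference, each line of this checksum file records the download URL, its SHA-1 hash, and the file size in bytes. Below is a minimal sketch of how such a line could be parsed and a downloaded file verified; the helper names are illustrative, not the actual `load_checksum_stats` implementation in gluonnlp.

```python
import hashlib

def parse_checksum_file(path):
    """Parse lines of the form '<url> <sha1-hash> <size-in-bytes>' into a dict keyed by URL."""
    stats = {}
    with open(path, 'r') as f:
        for line in f:
            if not line.strip():
                continue
            url, sha1, size = line.split()
            stats[url] = (sha1, int(size))
    return stats

def sha1_of_file(path, chunk_size=1024 * 1024):
    """Compute the SHA-1 hash of a local file, reading it in chunks."""
    h = hashlib.sha1()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

# Example usage (paths and variables are illustrative):
# stats = parse_checksum_file('scripts/datasets/url_checksums/wikipedia.txt')
# expected_sha1, expected_size = stats[url]
# assert sha1_of_file(local_path) == expected_sha1
```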
