From ae9d0a2805384f0e5c37ffdb51d698bf9954ed8c Mon Sep 17 00:00:00 2001
From: Chen Qian
Date: Mon, 9 May 2022 16:40:47 -0700
Subject: [PATCH 1/3] Modify README of bert example

---
 examples/bert/README.md | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/examples/bert/README.md b/examples/bert/README.md
index d6f6db7d2d..491e9e9f06 100644
--- a/examples/bert/README.md
+++ b/examples/bert/README.md
@@ -79,7 +79,7 @@ The latest wikipedia dump can be downloaded [at this link](https://dumps.wikimed
 or via command line:
 
 ```shell
-curl https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
+curl -O https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
 ```
 
 The dump can be extracted with the `wikiextractor` tool.
@@ -126,7 +126,12 @@ The `create_vocabulary.py` script allows you to compute your own WordPiece
 vocabulary for use with BERT. In most cases however, it is desirable to use the
 standard BERT vocabularies from the original models.
 You can download the English uncased vocabulary
-[here](https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt).
+[here](https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt),
+or in your terminal run:
+```shell
+curl https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt \
+  -o bert_vocab_uncased.txt
+```
 
 ### Tokenize, mask, and combine sentences into training examples
 
@@ -169,7 +174,7 @@ for file in path/to/sentence-split-data/*; do
   output="path/to/pretraining-data/$(basename -- "$file" .txt).tfrecord"
   python examples/bert/create_pretraining_data.py \
     --input_files ${file} \
-    --vocab_file vocab.txt \
+    --vocab_file bert_vocab_uncased.txt \
     --output_file ${output}
 done
 ```
@@ -183,7 +188,7 @@ for file in path/to/sentence-split-data/*; do
   output="path/to/pretraining-data/$(basename -- "$file" .txt).tfrecord"
   echo python examples/bert/create_pretraining_data.py \
     --input_files ${file} \
-    --vocab_file vocab.txt \
+    --vocab_file bert_vocab_uncased.txt \
     --output_file ${output}
 done | parallel -j ${NUM_JOBS}
 ```
@@ -220,7 +225,8 @@ training for a few epochs to finetune the model.
 
 ```shell
 python3 examples/bert/run_glue_finetuning.py \
-    --saved_model_input path/to/model/ \
+    --saved_model_input path/to/input/model/ \
+    --saved_model_output path/to/output/model/ \
     --vocab_file path/to/bert_vocab_uncased.txt \
     --bert_config_file examples/bert/configs/bert_tiny.json \
 ```
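The first hunk above switches the download command to `curl -O`, so the dump is saved as `enwiki-latest-pages-articles.xml.bz2` instead of being streamed to stdout. The surrounding context lines then hand the archive to the `wikiextractor` tool. A rough sketch of that extraction step, assuming `wikiextractor` is installed from PyPI; the output directory name is illustrative, not taken from the README:

```shell
# Sketch only: extract plain article text from the dump fetched above.
pip install wikiextractor
python -m wikiextractor.WikiExtractor \
  enwiki-latest-pages-articles.xml.bz2 \
  -o path/to/raw-wiki-data  # illustrative output directory, not from the README
```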
From 6efd5b1458040577c1761144a15af1a39c98e11c Mon Sep 17 00:00:00 2001
From: Chen Qian
Date: Tue, 10 May 2022 10:57:39 -0700
Subject: [PATCH 2/3] fix some comments

---
 examples/bert/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/bert/README.md b/examples/bert/README.md
index 491e9e9f06..9bd3e92471 100644
--- a/examples/bert/README.md
+++ b/examples/bert/README.md
@@ -128,6 +128,7 @@ standard BERT vocabularies from the original models.
 You can download the English uncased vocabulary
 [here](https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt),
 or in your terminal run:
+
 ```shell
 curl https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt \
   -o bert_vocab_uncased.txt

From d836c2879f43a25e1b85ec7a203518ac396e7080 Mon Sep 17 00:00:00 2001
From: Chen Qian
Date: Tue, 10 May 2022 11:13:12 -0700
Subject: [PATCH 3/3] fix some comments

---
 examples/bert/README.md | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/examples/bert/README.md b/examples/bert/README.md
index 9bd3e92471..7913ea68a1 100644
--- a/examples/bert/README.md
+++ b/examples/bert/README.md
@@ -79,7 +79,7 @@ The latest wikipedia dump can be downloaded [at this link](https://dumps.wikimed
 or via command line:
 
 ```shell
-curl -O https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2 
+curl -O https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
 ```
 
 The dump can be extracted with the `wikiextractor` tool.
@@ -130,8 +130,7 @@ English uncased vocabulary
 or in your terminal run:
 
 ```shell
-curl https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt \
-  -o bert_vocab_uncased.txt
+curl -O https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt
 ```
 
 ### Tokenize, mask, and combine sentences into training examples
@@ -226,8 +225,7 @@ training for a few epochs to finetune the model.
 
 ```shell
 python3 examples/bert/run_glue_finetuning.py \
-    --saved_model_input path/to/input/model/ \
-    --saved_model_output path/to/output/model/ \
+    --saved_model_input path/to/model/ \
     --vocab_file path/to/bert_vocab_uncased.txt \
     --bert_config_file examples/bert/configs/bert_tiny.json \
 ```
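After all three patches, the README's workflow is: fetch the vocabulary with a single `curl -O`, shard the sentence-split data into TFRecords, then finetune on GLUE with only `--saved_model_input` (the `--saved_model_output` flag added in PATCH 1/3 is reverted in PATCH 3/3). A minimal sketch of the resulting commands, run from the repository root; the `path/to/...` placeholders come straight from the README, while the `mkdir -p` line and the assumption that the shards already exist are added here:

```shell
# Sketch of the end-to-end flow the patched README describes. Assumes the
# keras-nlp repository root as the working directory and that
# path/to/sentence-split-data/ already contains one sentence-split .txt file
# per shard, produced by the earlier data-preparation steps.

# Fetch the English uncased WordPiece vocabulary (final form after PATCH 3/3).
curl -O https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt

# Convert each shard into masked pretraining examples.
mkdir -p path/to/pretraining-data  # assumption: output directory must exist
for file in path/to/sentence-split-data/*; do
  output="path/to/pretraining-data/$(basename -- "$file" .txt).tfrecord"
  python examples/bert/create_pretraining_data.py \
    --input_files ${file} \
    --vocab_file bert_vocab_uncased.txt \
    --output_file ${output}
done

# Finetune on GLUE, passing only the input model path, matching the flags as
# they stand after PATCH 3/3.
python3 examples/bert/run_glue_finetuning.py \
  --saved_model_input path/to/model/ \
  --vocab_file path/to/bert_vocab_uncased.txt \
  --bert_config_file examples/bert/configs/bert_tiny.json
```

Note that the README snippet leaves a dangling `\` after the final `--bert_config_file` flag; it is dropped in this sketch so the command runs as written.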