-
Notifications
You must be signed in to change notification settings - Fork 803
/
train_bert_wordpiece.py
52 lines (44 loc) · 1.16 KB
/
train_bert_wordpiece.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import argparse
import glob
from tokenizers import BertWordPieceTokenizer
parser = argparse.ArgumentParser()
parser.add_argument(
"--files",
default=None,
metavar="path",
type=str,
required=True,
help="The files to use as training; accept '**/*.txt' type of patterns \
if enclosed in quotes",
)
parser.add_argument(
"--out",
default="./",
type=str,
help="Path to the output directory, where the files will be saved",
)
parser.add_argument("--name", default="bert-wordpiece", type=str, help="The name of the output vocab files")
args = parser.parse_args()
files = glob.glob(args.files)
if not files:
print(f"File does not exist: {args.files}")
exit(1)
# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
clean_text=True,
handle_chinese_chars=True,
strip_accents=True,
lowercase=True,
)
# And then train
tokenizer.train(
files,
vocab_size=10000,
min_frequency=2,
show_progress=True,
special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
limit_alphabet=1000,
wordpieces_prefix="##",
)
# Save the files
tokenizer.save_model(args.out, args.name)