# bert_wordpiece.py
from typing import Dict, Iterator, List, Optional, Union

from tokenizers import AddedToken, Tokenizer, decoders, trainers
from tokenizers.models import WordPiece
from tokenizers.normalizers import BertNormalizer
from tokenizers.pre_tokenizers import BertPreTokenizer
from tokenizers.processors import BertProcessing

from .base_tokenizer import BaseTokenizer


class BertWordPieceTokenizer(BaseTokenizer):
    """Bert WordPiece Tokenizer"""

    def __init__(
        self,
        vocab: Optional[Union[str, Dict[str, int]]] = None,
        unk_token: Union[str, AddedToken] = "[UNK]",
        sep_token: Union[str, AddedToken] = "[SEP]",
        cls_token: Union[str, AddedToken] = "[CLS]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        mask_token: Union[str, AddedToken] = "[MASK]",
        clean_text: bool = True,
        handle_chinese_chars: bool = True,
        strip_accents: Optional[bool] = None,
        lowercase: bool = True,
        wordpieces_prefix: str = "##",
    ):
        # The WordPiece model does the actual sub-word splitting and must know
        # which token to emit for out-of-vocabulary pieces.
        if vocab is not None:
            tokenizer = Tokenizer(WordPiece(vocab, unk_token=str(unk_token)))
        else:
            tokenizer = Tokenizer(WordPiece(unk_token=str(unk_token)))

        # Let the tokenizer know about special tokens if they are part of the vocab
        if tokenizer.token_to_id(str(unk_token)) is not None:
            tokenizer.add_special_tokens([str(unk_token)])
        if tokenizer.token_to_id(str(sep_token)) is not None:
            tokenizer.add_special_tokens([str(sep_token)])
        if tokenizer.token_to_id(str(cls_token)) is not None:
            tokenizer.add_special_tokens([str(cls_token)])
        if tokenizer.token_to_id(str(pad_token)) is not None:
            tokenizer.add_special_tokens([str(pad_token)])
        if tokenizer.token_to_id(str(mask_token)) is not None:
            tokenizer.add_special_tokens([str(mask_token)])

        # Normalization and pre-tokenization reproduce BERT's original text
        # cleanup (optional lowercasing, accent stripping, Chinese character
        # handling) and whitespace/punctuation splitting.
        tokenizer.normalizer = BertNormalizer(
            clean_text=clean_text,
            handle_chinese_chars=handle_chinese_chars,
            strip_accents=strip_accents,
            lowercase=lowercase,
        )
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # The [CLS] ... [SEP] post-processing can only be attached when the ids
        # of the special tokens are known, i.e. when a vocabulary was provided.
        if vocab is not None:
            sep_token_id = tokenizer.token_to_id(str(sep_token))
            if sep_token_id is None:
                raise TypeError("sep_token not found in the vocabulary")
            cls_token_id = tokenizer.token_to_id(str(cls_token))
            if cls_token_id is None:
                raise TypeError("cls_token not found in the vocabulary")

            tokenizer.post_processor = BertProcessing((str(sep_token), sep_token_id), (str(cls_token), cls_token_id))
        tokenizer.decoder = decoders.WordPiece(prefix=wordpieces_prefix)

        parameters = {
            "model": "BertWordPiece",
            "unk_token": unk_token,
            "sep_token": sep_token,
            "cls_token": cls_token,
            "pad_token": pad_token,
            "mask_token": mask_token,
            "clean_text": clean_text,
            "handle_chinese_chars": handle_chinese_chars,
            "strip_accents": strip_accents,
            "lowercase": lowercase,
            "wordpieces_prefix": wordpieces_prefix,
        }

        super().__init__(tokenizer, parameters)

    @staticmethod
    def from_file(vocab: str, **kwargs):
        vocab = WordPiece.read_file(vocab)
        return BertWordPieceTokenizer(vocab, **kwargs)

    def train(
        self,
        files: Union[str, List[str]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
    ):
        """Train the model using the given files"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        if isinstance(files, str):
            files = [files]
        self._tokenizer.train(files, trainer=trainer)

    def train_from_iterator(
        self,
        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
        vocab_size: int = 30000,
        min_frequency: int = 2,
        limit_alphabet: int = 1000,
        initial_alphabet: List[str] = [],
        special_tokens: List[Union[str, AddedToken]] = [
            "[PAD]",
            "[UNK]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
        ],
        show_progress: bool = True,
        wordpieces_prefix: str = "##",
        length: Optional[int] = None,
    ):
        """Train the model using the given iterator"""

        trainer = trainers.WordPieceTrainer(
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            limit_alphabet=limit_alphabet,
            initial_alphabet=initial_alphabet,
            special_tokens=special_tokens,
            show_progress=show_progress,
            continuing_subword_prefix=wordpieces_prefix,
        )
        self._tokenizer.train_from_iterator(
            iterator,
            trainer=trainer,
            length=length,
        )
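

# ---------------------------------------------------------------------------
# Usage sketch (not part of the library code above): a minimal example of
# training and using this tokenizer. The file paths "corpus.txt" and
# "bert-vocab.txt" are hypothetical placeholders, not files shipped with the
# repository.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Train a fresh WordPiece vocabulary from a plain-text corpus.
    tokenizer = BertWordPieceTokenizer(lowercase=True)
    tokenizer.train(files="corpus.txt", vocab_size=30000, min_frequency=2)

    # Encode a sentence. Note that the [CLS]/[SEP] post-processing is only
    # attached when a vocabulary is supplied at construction time, e.g. via
    # from_file(), so a freshly trained instance returns plain word pieces.
    encoding = tokenizer.encode("Hello, how are you?")
    print(encoding.tokens)
    print(encoding.ids)

    # Loading a pre-existing vocab file (one token per line) instead:
    # tokenizer = BertWordPieceTokenizer.from_file("bert-vocab.txt")
    # print(tokenizer.encode("Hello, how are you?").tokens)

    # Training from an in-memory iterator works the same way:
    # tokenizer.train_from_iterator(["some text", "more text"], vocab_size=30000)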