Copy tokenizer files in each of their repo #10624

Merged: 5 commits, Mar 10, 2021
Changes from 4 commits
51 changes: 33 additions & 18 deletions src/transformers/models/bart/tokenization_bart.py
@@ -20,18 +20,36 @@
logger = logging.get_logger(__name__)


# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
_all_bart_models = [
"facebook/bart-base",
"facebook/bart-large",
"facebook/bart-large-mnli",
"facebook/bart-large-cnn",
"facebook/bart-large-xsum",
"yjernite/bart_eli5",
# This is not exhaustive: see https://huggingface.co/models?filter=bart
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}

# See all BART models at https://huggingface.co/models?filter=bart
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
},
"merges_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/bart-base": 1024,
"facebook/bart-large": 1024,
"facebook/bart-large-mnli": 1024,
"facebook/bart-large-cnn": 1024,
"facebook/bart-large-xsum": 1024,
"yjernite/bart_eli5": 1024,
}


class BartTokenizer(RobertaTokenizer):
@@ -42,9 +60,6 @@ class BartTokenizer(RobertaTokenizer):
:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models},
}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
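
For context, here is a minimal sketch (mine, not part of this diff) of how these class attributes are consumed; it assumes a transformers install that includes this change and network access to the Hub.

from transformers import BartTokenizer

# The class-level map now points each BART checkpoint at files hosted in its own
# repo instead of reusing the roberta-large files.
print(BartTokenizer.pretrained_vocab_files_map["vocab_file"]["facebook/bart-large"])
# expected: https://huggingface.co/facebook/bart-large/resolve/main/vocab.json

tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")
print(tokenizer.model_max_length)  # expected: 1024, from max_model_input_sizes
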
61 changes: 41 additions & 20 deletions src/transformers/models/bart/tokenization_bart_fast.py
@@ -21,19 +21,44 @@
logger = logging.get_logger(__name__)


# vocab and merges same as roberta
vocab_url = "https://huggingface.co/roberta-large/resolve/main/vocab.json"
merges_url = "https://huggingface.co/roberta-large/resolve/main/merges.txt"
tokenizer_url = "https://huggingface.co/roberta-large/resolve/main/tokenizer.json"
_all_bart_models = [
"facebook/bart-base",
"facebook/bart-large",
"facebook/bart-large-mnli",
"facebook/bart-large-cnn",
"facebook/bart-large-xsum",
"yjernite/bart_eli5",
# This is not exhaustive: see https://huggingface.co/models?filter=bart
]
VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}

# See all BART models at https://huggingface.co/models?filter=bart
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/vocab.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/vocab.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/vocab.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/vocab.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/vocab.json",
},
"merges_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/merges.txt",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/merges.txt",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/merges.txt",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/merges.txt",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/merges.txt",
},
"tokenizer_file": {
"facebook/bart-base": "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json",
"facebook/bart-large": "https://huggingface.co/facebook/bart-large/resolve/main/tokenizer.json",
"facebook/bart-large-mnli": "https://huggingface.co/facebook/bart-large-mnli/resolve/main/tokenizer.json",
"facebook/bart-large-cnn": "https://huggingface.co/facebook/bart-large-cnn/resolve/main/tokenizer.json",
"facebook/bart-large-xsum": "https://huggingface.co/facebook/bart-large-xsum/resolve/main/tokenizer.json",
"yjernite/bart_eli5": "https://huggingface.co/yjernite/bart_eli5/resolve/main/tokenizer.json",
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/bart-base": 1024,
"facebook/bart-large": 1024,
"facebook/bart-large-mnli": 1024,
"facebook/bart-large-cnn": 1024,
"facebook/bart-large-xsum": 1024,
"yjernite/bart_eli5": 1024,
}


class BartTokenizerFast(RobertaTokenizerFast):
@@ -44,11 +69,7 @@ class BartTokenizerFast(RobertaTokenizerFast):
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
initialization parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {
"vocab_file": {m: vocab_url for m in _all_bart_models},
"merges_file": {m: merges_url for m in _all_bart_models},
"tokenizer_file": {m: tokenizer_url for m in _all_bart_models},
}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
slow_tokenizer_class = BartTokenizer
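
As a quick sanity check (my own, not part of the PR) that the copied files actually exist in each repo, the URLs from the map above can be probed directly; plain requests is assumed to be available.

import requests

urls = [
    "https://huggingface.co/facebook/bart-base/resolve/main/tokenizer.json",
    "https://huggingface.co/facebook/bart-large/resolve/main/vocab.json",
    "https://huggingface.co/facebook/bart-large-cnn/resolve/main/merges.txt",
]
for url in urls:
    # A 200 status (after redirects) means the file is present in that model repo.
    status = requests.head(url, allow_redirects=True).status_code
    print(status, url)
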
src/transformers/models/bert_generation/tokenization_bert_generation.py
@@ -29,7 +29,13 @@

VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}

tokenizer_url = "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model"
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"bert_for_seq_generation": "https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder/resolve/main/spiece.model",
}
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"bert_for_seq_generation": 512}


class BertGenerationTokenizer(PreTrainedTokenizer):
@@ -55,8 +61,8 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
"""

vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = {"vocab_file": {"bert_for_seq_generation": tokenizer_url}}
max_model_input_sizes = {"bert_for_seq_generation": 512}
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
prefix_tokens: List[int] = []
model_input_names = ["input_ids", "attention_mask"]

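A small illustration (mine, not from the diff): BertGeneration ships a single SentencePiece model, so the map only has a vocab_file entry; the short key bert_for_seq_generation is just the table key, while loading uses the full repo id. Assumes sentencepiece is installed.

from transformers import BertGenerationTokenizer

# Load via the full Hub repo id; the class attributes above only affect how the
# spiece.model file is located and the default model_max_length (512).
tok = BertGenerationTokenizer.from_pretrained(
    "google/bert_for_seq_generation_L-24_bbc_encoder"
)
print(tok.tokenize("SentencePiece marks word boundaries with a meta symbol."))
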
29 changes: 14 additions & 15 deletions src/transformers/models/blenderbot/tokenization_blenderbot.py
@@ -29,9 +29,18 @@
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
# "tokenizer_config_file": "tokenizer_config.json",
"tokenizer_config_file": "tokenizer_config.json",
}
CKPT_3B = "facebook/blenderbot-3B"

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
"merges_file": {"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
"tokenizer_config_file": {
"facebook/blenderbot-3B": "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot-3B": 128}


class BlenderbotTokenizer(RobertaTokenizer):
@@ -45,19 +54,9 @@ class BlenderbotTokenizer(RobertaTokenizer):
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
parameters.
"""
vocab_files_names = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}
pretrained_vocab_files_map = {
"vocab_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/vocab.json"},
"merges_file": {CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/merges.txt"},
"tokenizer_config_file": {
CKPT_3B: "https://huggingface.co/facebook/blenderbot-3B/resolve/main/tokenizer_config.json"
},
}
max_model_input_sizes = {"facebook/blenderbot-3B": 128}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES

def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: List[int] = None):
"""
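A hedged sketch (not part of the diff) of what the 128 in PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES means in practice: it becomes the tokenizer's model_max_length and drives default truncation.

from transformers import BlenderbotTokenizer

tok = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B")
print(tok.model_max_length)  # expected: 128
# With truncation=True and no explicit max_length, model_max_length is the cap.
ids = tok("Hello there, how are you doing today?", truncation=True)["input_ids"]
assert len(ids) <= tok.model_max_length
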
src/transformers/models/blenderbot_small/tokenization_blenderbot_small.py
@@ -33,6 +33,20 @@
"tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"facebook/blenderbot_small-90M": 512}


def get_pairs(word):
"""
@@ -75,23 +89,9 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
"""

vocab_files_names = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config": "tokenizer_config.json",
}
pretrained_vocab_files_map = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer.json"
},
}
max_model_input_sizes = {"facebook/blenderbot_small-90M": 512}
vocab_files_names = VOCAB_FILES_NAMES
pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"]

def __init__(
src/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py
@@ -24,9 +24,23 @@

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {}
VOCAB_FILES_NAMES = {
"vocab_file": "vocab.json",
"merges_file": "merges.txt",
"tokenizer_config_file": "tokenizer_config.json",
}

PRETRAINED_VOCAB_FILES_MAP = {}
PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/vocab.json"
},
"merges_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/merges.txt"
},
"tokenizer_config_file": {
"facebook/blenderbot_small-90M": "https://huggingface.co/facebook/blenderbot_small-90M/resolve/main/tokenizer_config.json"
},
}

PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"facebook/blenderbot_small-90M": 512,
7 changes: 0 additions & 7 deletions src/transformers/models/camembert/tokenization_camembert.py
@@ -39,13 +39,6 @@
"camembert-base": 512,
}

SHARED_MODEL_IDENTIFIERS = [
# Load with
# `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]

SPIECE_UNDERLINE = "▁"


src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -48,13 +48,6 @@
"camembert-base": 512,
}

SHARED_MODEL_IDENTIFIERS = [
# Load with
# `tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")`
"Musixmatch/umberto-commoncrawl-cased-v1",
"Musixmatch/umberto-wikipedia-uncased-v1",
]

SPIECE_UNDERLINE = "▁"


10 changes: 5 additions & 5 deletions src/transformers/models/distilbert/tokenization_distilbert.py
@@ -24,12 +24,12 @@

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
}
}

20 changes: 10 additions & 10 deletions src/transformers/models/distilbert/tokenization_distilbert_fast.py
@@ -25,20 +25,20 @@

PRETRAINED_VOCAB_FILES_MAP = {
"vocab_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/vocab.txt",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/vocab.txt",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/vocab.txt",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/vocab.txt",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/vocab.txt",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/vocab.txt",
},
"tokenizer_file": {
"distilbert-base-uncased": "https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/bert-large-uncased/resolve/main/tokenizer.json",
"distilbert-base-cased": "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/bert-large-cased/resolve/main/tokenizer.json",
"distilbert-base-uncased": "https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer.json",
"distilbert-base-uncased-distilled-squad": "https://huggingface.co/distilbert-base-uncased-distilled-squad/resolve/main/tokenizer.json",
"distilbert-base-cased": "https://huggingface.co/distilbert-base-cased/resolve/main/tokenizer.json",
"distilbert-base-cased-distilled-squad": "https://huggingface.co/distilbert-base-cased-distilled-squad/resolve/main/tokenizer.json",
"distilbert-base-german-cased": "https://huggingface.co/distilbert-base-german-cased/resolve/main/tokenizer.json",
"distilbert-base-multilingual-cased": "https://huggingface.co/bert-base-multilingual-cased/resolve/main/tokenizer.json",
"distilbert-base-multilingual-cased": "https://huggingface.co/distilbert-base-multilingual-cased/resolve/main/tokenizer.json",
},
}

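Closing sanity check (my own, assuming this branch is installed): the DistilBERT checkpoints should now resolve their tokenizer files from their own repos rather than from the BERT repos they previously pointed at.

from transformers import DistilBertTokenizerFast

url = DistilBertTokenizerFast.pretrained_vocab_files_map["tokenizer_file"]["distilbert-base-uncased"]
# After this change the URL points at the distilbert repo itself, not at bert-base-uncased.
assert url.startswith("https://huggingface.co/distilbert-base-uncased/")

tok = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
print(tok("tokenizer files now live in each model's own repo")["input_ids"])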