22 changes: 0 additions & 22 deletions tests/models/albert/test_tokenization_albert.py
@@ -65,28 +65,6 @@ def test_get_vocab(self):
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 30_000)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé."

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_full_tokenizer(self):
tokenizer = AlbertTokenizer(SAMPLE_VOCAB, keep_accents=True)

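All of the removed methods follow the same slow/fast parity pattern, differing only in the sample sequence and any prefix-space handling. Below is a minimal sketch of that shared shape, reconstructed from the deleted code; it assumes the get_tokenizer/get_rust_tokenizer helpers provided by the per-model test classes, and the idea that an equivalent check is consolidated elsewhere (e.g. in tests/test_tokenization_common.py) is an assumption, not something this diff confirms.

def test_rust_and_python_full_tokenizers(self):
    # Sketch only: mirrors the per-model copies removed in this diff.
    # Skip when the model ships no fast (Rust-backed) tokenizer.
    if not self.test_rust_tokenizer:
        self.skipTest(reason="test_rust_tokenizer is set to False")

    tokenizer = self.get_tokenizer()            # slow (Python) tokenizer
    rust_tokenizer = self.get_rust_tokenizer()  # fast (Rust) tokenizer

    sequence = "I was born in 92000, and this is falsé."

    # Token strings produced by both implementations must match.
    self.assertListEqual(tokenizer.tokenize(sequence), rust_tokenizer.tokenize(sequence))

    # Ids without special tokens must match.
    self.assertListEqual(
        tokenizer.encode(sequence, add_special_tokens=False),
        rust_tokenizer.encode(sequence, add_special_tokens=False),
    )

    # Ids with special tokens must match as well.
    self.assertListEqual(tokenizer.encode(sequence), rust_tokenizer.encode(sequence))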
23 changes: 0 additions & 23 deletions tests/models/barthez/test_tokenization_barthez.py
@@ -73,29 +73,6 @@ def test_prepare_batch(self):
result = batch.input_ids.tolist()[0]
self.assertListEqual(expected_src_tokens, result)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé."

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

@slow
def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[0, 490, 14328, 4507, 354, 47, 43669, 95, 25, 78117, 20215, 19779, 190, 22, 400, 4, 35343, 80310, 603, 86, 24937, 105, 33438, 94762, 196, 39642, 7, 15, 15933, 173, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [0, 10534, 87, 25, 66, 3358, 196, 55289, 8, 82961, 81, 2204, 75203, 7, 15, 763, 12956, 216, 178, 14328, 9595, 1377, 69693, 7, 448, 71021, 196, 18106, 1437, 13974, 108, 9083, 4, 49315, 7, 39, 86, 1326, 2793, 46333, 4, 448, 196, 74588, 7, 49315, 7, 39, 21, 822, 38470, 74, 21, 66723, 62480, 8, 22050, 5, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip

41 changes: 0 additions & 41 deletions tests/models/bert/test_tokenization_bert.py
@@ -77,47 +77,6 @@ def test_full_tokenizer(self):
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "UNwant\u00e9d,running"

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# With lower casing
tokenizer = self.get_tokenizer(do_lower_case=True)
rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

sequence = "UNwant\u00e9d,running"

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_chinese(self):
tokenizer = BasicTokenizer()

22 changes: 0 additions & 22 deletions tests/models/big_bird/test_tokenization_big_bird.py
@@ -61,28 +61,6 @@ def test_get_vocab(self):
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 1_000)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé."

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_full_tokenizer(self):
tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)

22 changes: 0 additions & 22 deletions tests/models/camembert/test_tokenization_camembert.py
@@ -94,28 +94,6 @@ def test_rust_and_python_bpe_tokenizers(self):
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé."

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

@slow
def test_tokenizer_integration(self):
expected_encoding = {'input_ids': [[5, 54, 7196, 297, 30, 23, 776, 18, 11, 3215, 3705, 8252, 22, 3164, 1181, 2116, 29, 16, 813, 25, 791, 3314, 20, 3446, 38, 27575, 120, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [5, 468, 17, 11, 9088, 20, 1517, 8, 22804, 18818, 10, 38, 629, 607, 607, 142, 19, 7196, 867, 56, 10326, 24, 2267, 20, 416, 5072, 15612, 233, 734, 7, 2399, 27, 16, 3015, 1649, 7, 24, 20, 4338, 2399, 27, 13, 3400, 14, 13, 6189, 8, 930, 9, 6]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # fmt: skip
17 changes: 2 additions & 15 deletions tests/models/clvp/test_tokenization_clvp.py
@@ -103,31 +103,18 @@ def test_add_special_tokens(self):
decoded = tokenizer.decode(encoded_special_token, skip_special_tokens=True)
self.assertTrue(special_token not in decoded)

# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.test_rust_and_python_full_tokenizers
def test_rust_and_python_full_tokenizers(self):
def test_unkown_token(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)

sequence = "lower newer"
self._current_sequence = sequence

# Testing tokenization
tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

# Testing conversion to ids without special tokens
ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

# Testing conversion to ids with special tokens
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
ids = tokenizer.encode(sequence, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# Testing the unknown token
input_tokens = tokens + [rust_tokenizer.unk_token]
15 changes: 1 addition & 14 deletions tests/models/codegen/test_tokenization_codegen.py
@@ -106,7 +106,7 @@ def test_full_tokenizer(self):
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

def test_rust_and_python_full_tokenizers(self):
def test_lower_case(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

@@ -117,19 +117,6 @@ def test_rust_and_python_full_tokenizers(self):

# Testing tokenization
tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

# Testing conversion to ids without special tokens
ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

# Testing conversion to ids with special tokens
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
ids = tokenizer.encode(sequence, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# Testing the unknown token
input_tokens = tokens + [rust_tokenizer.unk_token]
19 changes: 0 additions & 19 deletions tests/models/deberta_v2/test_tokenization_deberta_v2.py
@@ -173,25 +173,6 @@ def test_do_lower_case_false_split_by_punct_false(self):

self.assertListEqual(rust_tokens, tokens_target)

def test_rust_and_python_full_tokenizers(self):
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé!"

tokens = tokenizer.convert_ids_to_tokens(tokenizer.encode(sequence, add_special_tokens=False))
rust_tokens = rust_tokenizer.convert_ids_to_tokens(rust_tokenizer.encode(sequence, add_special_tokens=False))
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_full_tokenizer(self):
sequence = "This is a test"
ids_target = [13, 1, 4398, 25, 21, 1289]
41 changes: 0 additions & 41 deletions tests/models/electra/test_tokenization_electra.py
@@ -76,47 +76,6 @@ def test_full_tokenizer(self):
self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [9, 6, 7, 12, 10, 11])

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "UNwant\u00e9d,running"

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# With lower casing
tokenizer = self.get_tokenizer(do_lower_case=True)
rust_tokenizer = self.get_rust_tokenizer(do_lower_case=True)

sequence = "UNwant\u00e9d,running"

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_chinese(self):
tokenizer = BasicTokenizer()

22 changes: 0 additions & 22 deletions tests/models/fnet/test_tokenization_fnet.py
@@ -67,28 +67,6 @@ def test_get_vocab(self):
def test_vocab_size(self):
self.assertEqual(self.get_tokenizer().vocab_size, 30_000)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "I was born in 92000, and this is falsé."

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

def test_full_tokenizer(self):
tokenizer = FNetTokenizer(SAMPLE_VOCAB, keep_accents=True)

15 changes: 1 addition & 14 deletions tests/models/gpt2/test_tokenization_gpt2.py
@@ -105,7 +105,7 @@ def test_full_tokenizer(self):
input_bpe_tokens = [14, 15, 10, 9, 3, 2, 15, 19]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

def test_rust_and_python_full_tokenizers(self):
def test_rust_unkown_token(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

@@ -116,19 +116,6 @@ def test_rust_and_python_full_tokenizers(self):

# Testing tokenization
tokens = tokenizer.tokenize(sequence, add_prefix_space=True)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

# Testing conversion to ids without special tokens
ids = tokenizer.encode(sequence, add_special_tokens=False, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

# Testing conversion to ids with special tokens
rust_tokenizer = self.get_rust_tokenizer(add_prefix_space=True)
ids = tokenizer.encode(sequence, add_prefix_space=True)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

# Testing the unknown token
input_tokens = tokens + [rust_tokenizer.unk_token]
22 changes: 0 additions & 22 deletions tests/models/herbert/test_tokenization_herbert.py
@@ -93,28 +93,6 @@ def test_full_tokenizer(self):
input_bpe_tokens = [16, 17, 23]
self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)

def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
self.skipTest(reason="test_rust_tokenizer is set to False")

tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()

sequence = "lower,newer"

tokens = tokenizer.tokenize(sequence)
rust_tokens = rust_tokenizer.tokenize(sequence)
self.assertListEqual(tokens, rust_tokens)

ids = tokenizer.encode(sequence, add_special_tokens=False)
rust_ids = rust_tokenizer.encode(sequence, add_special_tokens=False)
self.assertListEqual(ids, rust_ids)

rust_tokenizer = self.get_rust_tokenizer()
ids = tokenizer.encode(sequence)
rust_ids = rust_tokenizer.encode(sequence)
self.assertListEqual(ids, rust_ids)

@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("allegro/herbert-base-cased")