diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index be48958e6..14c7b9584 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -1,6 +1,6 @@
 import pickle
 import pytest
-from ..utils import data_dir, roberta_files, bert_files
+from ..utils import data_dir, roberta_files, bert_files, encode_decode_in_subprocess
 
 from tokenizers import AddedToken, Tokenizer, Encoding
 from tokenizers.models import Model, BPE, WordPiece
@@ -125,18 +125,54 @@ def test_encode_formats(self, bert_files):
         output = tokenizer.encode("my name is john")
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
         output = tokenizer.encode("my name is john", "pair")
-        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        assert output.tokens == [
+            "[CLS]",
+            "my",
+            "name",
+            "is",
+            "john",
+            "[SEP]",
+            "pair",
+            "[SEP]",
+        ]
         output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
         output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
-        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        assert output.tokens == [
+            "[CLS]",
+            "my",
+            "name",
+            "is",
+            "john",
+            "[SEP]",
+            "pair",
+            "[SEP]",
+        ]
 
         output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
         assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
         assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
         output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
-        assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
-        assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        assert output[0].tokens == [
+            "[CLS]",
+            "my",
+            "name",
+            "is",
+            "john",
+            "[SEP]",
+            "pair",
+            "[SEP]",
+        ]
+        assert output[1].tokens == [
+            "[CLS]",
+            "my",
+            "name",
+            "is",
+            "john",
+            "[SEP]",
+            "pair",
+            "[SEP]",
+        ]
 
         output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
         assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
@@ -162,7 +198,14 @@ def test_encode_add_special_tokens(self, roberta_files):
 
         # Can encode with special tokens
         output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
-        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
+        assert output_with_specials.tokens == [
+            "<s>",
+            "ĠMy",
+            "Ġname",
+            "Ġis",
+            "ĠJohn",
+            "</s>",
+        ]
 
         # Can encode without special tokens
         output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
@@ -269,3 +312,7 @@ def test_post_process(self):
         # Can post process a pair of encodings
         output = tokenizer.post_process(encoding, pair_encoding)
         assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
+
+    def test_encode_decode_in_subprocess(self):
+        tokenizer = Tokenizer(BPE())
+        encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index fb241cd10..1dba0d5c4 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,4 +1,4 @@
-from ..utils import data_dir, bert_files
+from ..utils import data_dir, bert_files, encode_decode_in_subprocess
 
 from tokenizers import BertWordPieceTokenizer
 
@@ -9,8 +9,26 @@ def test_basic_encode(self, bert_files):
         # Encode with special tokens by default
         output = tokenizer.encode("My name is John", "pair")
         assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
-        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
-        assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
+        assert output.tokens == [
+            "[CLS]",
+            "my",
+            "name",
+            "is",
+            "john",
+            "[SEP]",
+            "pair",
+            "[SEP]",
+        ]
+        assert output.offsets == [
+            (0, 0),
+            (0, 2),
+            (3, 7),
+            (8, 10),
+            (11, 15),
+            (0, 0),
+            (0, 4),
+            (0, 0),
+        ]
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
 
         # Can encode without the special tokens
@@ -19,3 +37,7 @@ def test_basic_encode(self, bert_files):
         assert output.tokens == ["my", "name", "is", "john", "pair"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
         assert output.type_ids == [0, 0, 0, 0, 1]
+
+    def test_encode_decode_in_subprocess(self, bert_files):
+        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
+        encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index d5a4673e0..e45ce44ff 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,4 +1,8 @@
-from ..utils import data_dir, roberta_files
+from ..utils import (
+    data_dir,
+    roberta_files,
+    encode_decode_in_subprocess,
+)
 
 from tokenizers import ByteLevelBPETokenizer
 
@@ -63,7 +67,7 @@ def test_add_prefix_space(self, roberta_files):
 
     def test_lowerspace(self, roberta_files):
         tokenizer = ByteLevelBPETokenizer(
-            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
+            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
         )
 
         output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
@@ -79,3 +83,9 @@ def test_lowerspace(self, roberta_files):
             "Ġlazy",
             "Ġdog",
         ]
+
+    def test_encode_decode_in_subprocess(self, roberta_files):
+        tokenizer = ByteLevelBPETokenizer(
+            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+        )
+        encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 66b45f43a..fd8c96652 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -1,4 +1,4 @@
-from ..utils import data_dir, openai_files
+from ..utils import data_dir, openai_files, encode_decode_in_subprocess
 
 from tokenizers import CharBPETokenizer
 
@@ -42,3 +42,7 @@ def test_decoding(self, openai_files):
         tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
         decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
         assert decoded == "my name is john"
+
+    def test_encode_decode_in_subprocess(self, openai_files):
+        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
+        encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/utils.py b/bindings/python/tests/utils.py
index cdf52fcb2..e9cdcc0c2 100644
--- a/bindings/python/tests/utils.py
+++ b/bindings/python/tests/utils.py
@@ -1,3 +1,4 @@
+from multiprocessing import Process
 import os
 import requests
 import pytest
@@ -56,3 +57,28 @@ def openai_files(data_dir):
             "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
         ),
     }
+
+
+def encode_decode_in_subprocess(tokenizer):
+    # It's essential to this test that we call 'encode' or 'encode_batch'
+    # before the fork. This causes the main process to "lock" some resources
+    # provided by the Rust "rayon" crate that are needed for parallel processing.
+    tokenizer.encode("Hi")
+    tokenizer.encode_batch(["hi", "there"])
+
+    def encode():
+        encoding = tokenizer.encode("Hi")
+        tokenizer.decode(encoding.ids)
+
+    p = Process(target=encode)
+    p.start()
+    p.join(timeout=1)
+
+    # At this point the process should have successfully exited.
+    # If the subprocess is still alive, the test has failed.
+    # But we want to terminate that process anyway, otherwise pytest might hang forever.
+    if p.is_alive():
+        p.terminate()
+        assert False, "tokenizer in subprocess caused a deadlock"
+
+    assert p.exitcode == 0
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index ecd95f687..ba0daebab 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -2,7 +2,6 @@ use crate::tokenizer::{
     Decoder, Encoding, NormalizedString, Offsets, PostProcessor, PreTokenizer, Result,
 };
 use onig::Regex;
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 use std::collections::{HashMap, HashSet};
 
@@ -97,7 +96,7 @@ impl PreTokenizer for ByteLevel {
             .collect::<Vec<_>>();
 
         let splits = positions
-            .into_par_iter()
+            .into_iter()
             .map(|range| {
                 // Process one of the splits
                 let slice = &normalized.get()[range];
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index 390d864bb..d52d50ce3 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -1,6 +1,5 @@
 use crate::tokenizer::{Offsets, Token};
 use crate::utils::padding::PaddingDirection;
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
 /// Represents the output of a `Tokenizer`.
@@ -362,7 +361,7 @@ impl Encoding {
         direction: PaddingDirection,
     ) {
         // Dispatch call to all the overflowings first
-        self.overflowing.par_iter_mut().for_each(|encoding| {
+        self.overflowing.iter_mut().for_each(|encoding| {
             encoding.pad(target_length, pad_id, pad_type_id, pad_token, direction)
         });
 
diff --git a/tokenizers/src/utils/padding.rs b/tokenizers/src/utils/padding.rs
index 9d03df10c..fc5272e4e 100644
--- a/tokenizers/src/utils/padding.rs
+++ b/tokenizers/src/utils/padding.rs
@@ -1,5 +1,4 @@
 use crate::tokenizer::{Encoding, Result};
-use rayon::prelude::*;
 use serde::{Deserialize, Serialize};
 
 /// The various possible padding directions.
@@ -54,11 +53,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
 
     let mut pad_length = match params.strategy {
         PaddingStrategy::Fixed(size) => size,
-        PaddingStrategy::BatchLongest => encodings
-            .par_iter()
-            .map(|e| e.get_ids().len())
-            .max()
-            .unwrap(),
+        PaddingStrategy::BatchLongest => encodings.iter().map(|e| e.get_ids().len()).max().unwrap(),
     };
 
     if let Some(multiple) = params.pad_to_multiple_of {
@@ -67,7 +62,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
         }
     }
 
-    encodings.par_iter_mut().for_each(|encoding| {
+    encodings.iter_mut().for_each(|encoding| {
         encoding.pad(
             pad_length,
             params.pad_id,
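
Note (not part of the patch above): encode_decode_in_subprocess exercises the platform's default multiprocessing start method, which is fork on Linux and is exactly where inherited rayon thread-pool state used to deadlock. As a rough standalone sketch, under the assumption that re-creating the tokenizer inside a child started with the "spawn" method avoids inheriting any native thread state from the parent, an application-level round trip could look like the snippet below; the bare Tokenizer(BPE()) setup only mirrors the test above and is illustrative, not a recommendation from this patch.

# Standalone sketch, not part of this diff: encode/decode in a child process
# started with "spawn", so nothing is inherited from the parent's thread pool.
from multiprocessing import get_context

from tokenizers import Tokenizer
from tokenizers.models import BPE


def child_encode_decode(text):
    # Build the tokenizer inside the child process itself.
    tokenizer = Tokenizer(BPE())
    encoding = tokenizer.encode(text)
    return tokenizer.decode(encoding.ids)


if __name__ == "__main__":
    ctx = get_context("spawn")
    p = ctx.Process(target=child_encode_decode, args=("Hi",))
    p.start()
    p.join(timeout=5)
    assert p.exitcode == 0, "child process did not exit cleanly"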