diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index be48958e6..14c7b9584 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -1,6 +1,6 @@
import pickle
import pytest
-from ..utils import data_dir, roberta_files, bert_files
+from ..utils import data_dir, roberta_files, bert_files, encode_decode_in_subprocess
from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
@@ -125,18 +125,54 @@ def test_encode_formats(self, bert_files):
output = tokenizer.encode("my name is john")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode("my name is john", "pair")
- assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+ assert output.tokens == [
+ "[CLS]",
+ "my",
+ "name",
+ "is",
+ "john",
+ "[SEP]",
+ "pair",
+ "[SEP]",
+ ]
output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
- assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+ assert output.tokens == [
+ "[CLS]",
+ "my",
+ "name",
+ "is",
+ "john",
+ "[SEP]",
+ "pair",
+ "[SEP]",
+ ]
output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
- assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
- assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+ assert output[0].tokens == [
+ "[CLS]",
+ "my",
+ "name",
+ "is",
+ "john",
+ "[SEP]",
+ "pair",
+ "[SEP]",
+ ]
+ assert output[1].tokens == [
+ "[CLS]",
+ "my",
+ "name",
+ "is",
+ "john",
+ "[SEP]",
+ "pair",
+ "[SEP]",
+ ]
output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
@@ -162,7 +198,14 @@ def test_encode_add_special_tokens(self, roberta_files):
# Can encode with special tokens
output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
- assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
+ assert output_with_specials.tokens == [
+ "",
+ "ĠMy",
+ "Ġname",
+ "Ġis",
+ "ĠJohn",
+ "",
+ ]
# Can encode without special tokens
output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
@@ -269,3 +312,7 @@ def test_post_process(self):
# Can post process a pair of encodings
output = tokenizer.post_process(encoding, pair_encoding)
assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
+
+ def test_encode_decode_in_subprocess(self):
+ tokenizer = Tokenizer(BPE())
+ encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_bert_wordpiece.py b/bindings/python/tests/implementations/test_bert_wordpiece.py
index fb241cd10..1dba0d5c4 100644
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,4 +1,4 @@
-from ..utils import data_dir, bert_files
+from ..utils import data_dir, bert_files, encode_decode_in_subprocess
from tokenizers import BertWordPieceTokenizer
@@ -9,8 +9,26 @@ def test_basic_encode(self, bert_files):
# Encode with special tokens by default
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
- assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
- assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
+ assert output.tokens == [
+ "[CLS]",
+ "my",
+ "name",
+ "is",
+ "john",
+ "[SEP]",
+ "pair",
+ "[SEP]",
+ ]
+ assert output.offsets == [
+ (0, 0),
+ (0, 2),
+ (3, 7),
+ (8, 10),
+ (11, 15),
+ (0, 0),
+ (0, 4),
+ (0, 0),
+ ]
assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
# Can encode without the special tokens
@@ -19,3 +37,7 @@ def test_basic_encode(self, bert_files):
assert output.tokens == ["my", "name", "is", "john", "pair"]
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
assert output.type_ids == [0, 0, 0, 0, 1]
+
+ def test_encode_decode_in_subprocess(self, bert_files):
+ tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
+ encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_byte_level_bpe.py b/bindings/python/tests/implementations/test_byte_level_bpe.py
index d5a4673e0..e45ce44ff 100644
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,4 +1,8 @@
-from ..utils import data_dir, roberta_files
+from ..utils import (
+ data_dir,
+ roberta_files,
+ encode_decode_in_subprocess,
+)
from tokenizers import ByteLevelBPETokenizer
@@ -63,7 +67,7 @@ def test_add_prefix_space(self, roberta_files):
def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
- roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
+ roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
@@ -79,3 +83,9 @@ def test_lowerspace(self, roberta_files):
"Ġlazy",
"Ġdog",
]
+
+ def test_encode_decode_in_subprocess(self, roberta_files):
+ tokenizer = ByteLevelBPETokenizer(
+ roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
+ )
+ encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/implementations/test_char_bpe.py b/bindings/python/tests/implementations/test_char_bpe.py
index 66b45f43a..fd8c96652 100644
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@@ -1,4 +1,4 @@
-from ..utils import data_dir, openai_files
+from ..utils import data_dir, openai_files, encode_decode_in_subprocess
from tokenizers import CharBPETokenizer
@@ -42,3 +42,7 @@ def test_decoding(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
assert decoded == "my name is john"
+
+ def test_encode_decode_in_subprocess(self, openai_files):
+ tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
+ encode_decode_in_subprocess(tokenizer)
diff --git a/bindings/python/tests/utils.py b/bindings/python/tests/utils.py
index cdf52fcb2..e9cdcc0c2 100644
--- a/bindings/python/tests/utils.py
+++ b/bindings/python/tests/utils.py
@@ -1,3 +1,4 @@
+from multiprocessing import Process
import os
import requests
import pytest
@@ -56,3 +57,28 @@ def openai_files(data_dir):
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
),
}
+
+
+def encode_decode_in_subprocess(tokenizer):
+ # It's essential to this test that we call 'encode' or 'encode_batch'
+ # before the fork. This causes the main process to "lock" some resources
+ # provided by the Rust "rayon" crate that are needed for parallel processing.
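+ # (Background note: rayon keeps a global worker thread pool, and worker
+ # threads are not duplicated by fork(), so a child process that tries to
+ # reuse an already-initialized pool can block forever. That hang is what
+ # this helper is meant to detect.)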
+ tokenizer.encode("Hi")
+ tokenizer.encode_batch(["hi", "there"])
+
+ def encode():
+ encoding = tokenizer.encode("Hi")
+ tokenizer.decode(encoding.ids)
+
+ p = Process(target=encode)
+ p.start()
+ p.join(timeout=1)
+
+ # At this point the subprocess should have exited successfully.
+ # If it is still alive, the test has failed, but we terminate it anyway,
+ # otherwise pytest might hang forever.
+ if p.is_alive():
+ p.terminate()
+ assert False, "tokenizer in subprocess caused a deadlock"
+
+ assert p.exitcode == 0
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index ecd95f687..ba0daebab 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -2,7 +2,6 @@ use crate::tokenizer::{
Decoder, Encoding, NormalizedString, Offsets, PostProcessor, PreTokenizer, Result,
};
use onig::Regex;
-use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};
@@ -97,7 +96,7 @@ impl PreTokenizer for ByteLevel {
.collect::<Vec<_>>();
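+ // Note: this iteration is serial on purpose; the rayon parallel version relies
+ // on a global thread pool whose worker threads do not survive a fork, which
+ // can leave a forked child process (e.g. Python multiprocessing) hanging.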
let splits = positions
- .into_par_iter()
+ .into_iter()
.map(|range| {
// Process one of the splits
let slice = &normalized.get()[range];
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index 390d864bb..d52d50ce3 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -1,6 +1,5 @@
use crate::tokenizer::{Offsets, Token};
use crate::utils::padding::PaddingDirection;
-use rayon::prelude::*;
use serde::{Deserialize, Serialize};
/// Represents the output of a `Tokenizer`.
@@ -362,7 +361,7 @@ impl Encoding {
direction: PaddingDirection,
) {
// Dispatch call to all the overflowings first
- self.overflowing.par_iter_mut().for_each(|encoding| {
+ self.overflowing.iter_mut().for_each(|encoding| {
encoding.pad(target_length, pad_id, pad_type_id, pad_token, direction)
});
diff --git a/tokenizers/src/utils/padding.rs b/tokenizers/src/utils/padding.rs
index 9d03df10c..fc5272e4e 100644
--- a/tokenizers/src/utils/padding.rs
+++ b/tokenizers/src/utils/padding.rs
@@ -1,5 +1,4 @@
use crate::tokenizer::{Encoding, Result};
-use rayon::prelude::*;
use serde::{Deserialize, Serialize};
/// The various possible padding directions.
@@ -54,11 +53,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
let mut pad_length = match params.strategy {
PaddingStrategy::Fixed(size) => size,
- PaddingStrategy::BatchLongest => encodings
- .par_iter()
- .map(|e| e.get_ids().len())
- .max()
- .unwrap(),
+ PaddingStrategy::BatchLongest => encodings.iter().map(|e| e.get_ids().len()).max().unwrap(),
};
if let Some(multiple) = params.pad_to_multiple_of {
@@ -67,7 +62,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
}
}
- encodings.par_iter_mut().for_each(|encoding| {
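+ // Pad each encoding sequentially; going through rayon's global thread pool
+ // here can hang when the tokenizer is used from a forked child process.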
+ encodings.iter_mut().for_each(|encoding| {
encoding.pad(
pad_length,
params.pad_id,