remove use of parallel iterators except in batch methods #308

Closed
wants to merge 2 commits
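
The change itself is mechanical: the code paths shown below (byte-level pre-tokenization, padding of a single encoding and its overflowing parts, and the padding utility) swap rayon's parallel iterators for the standard sequential ones, so that, per the title, parallelism stays confined to the batch methods. Judging by the new subprocess tests, the motivation is that a Python process which forks after rayon's thread pool has been used in the parent can otherwise deadlock on inherited pool state. A minimal sketch of the intended split, with hypothetical encode_one / encode_batch helpers rather than the real Tokenizer API (assumes the rayon crate as a dependency):

use rayon::prelude::*;

// Per-item path: plain sequential iteration, no thread-pool involvement,
// so a forked child process never needs the parent's rayon state for it.
fn encode_one(input: &str) -> Vec<String> {
    input.split_whitespace().map(str::to_owned).collect()
}

// Batch path: parallelism is confined to the batch entry point.
fn encode_batch(inputs: &[String]) -> Vec<Vec<String>> {
    inputs.par_iter().map(|s| encode_one(s)).collect()
}

fn main() {
    let batch = vec!["my name is john".to_string(), "hi there".to_string()];
    println!("{:?}", encode_one(&batch[0]));
    println!("{:?}", encode_batch(&batch));
}

Single-item calls made in a forked child no longer depend on thread-pool state they could not have inherited safely, while batch calls keep their parallel speed-up.
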
59 changes: 53 additions & 6 deletions bindings/python/tests/bindings/test_tokenizer.py
@@ -1,6 +1,6 @@
import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files
from ..utils import data_dir, roberta_files, bert_files, encode_decode_in_subprocess

from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
@@ -125,18 +125,54 @@ def test_encode_formats(self, bert_files):
output = tokenizer.encode("my name is john")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode("my name is john", "pair")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]

output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output[0].tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
assert output[1].tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]

@@ -162,7 +198,14 @@ def test_encode_add_special_tokens(self, roberta_files):

# Can encode with special tokens
output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
assert output_with_specials.tokens == [
"<s>",
"ĠMy",
"Ġname",
"Ġis",
"ĠJohn",
"</s>",
]

# Can encode without special tokens
output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
@@ -269,3 +312,7 @@ def test_post_process(self):
# Can post process a pair of encodings
output = tokenizer.post_process(encoding, pair_encoding)
assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]

def test_encode_decode_in_subprocess(self):
tokenizer = Tokenizer(BPE())
encode_decode_in_subprocess(tokenizer)
28 changes: 25 additions & 3 deletions bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,4 +1,4 @@
from ..utils import data_dir, bert_files
from ..utils import data_dir, bert_files, encode_decode_in_subprocess
from tokenizers import BertWordPieceTokenizer


@@ -9,8 +9,26 @@ def test_basic_encode(self, bert_files):
# Encode with special tokens by default
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
assert output.offsets == [
(0, 0),
(0, 2),
(3, 7),
(8, 10),
(11, 15),
(0, 0),
(0, 4),
(0, 0),
]
assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

# Can encode without the special tokens
@@ -19,3 +37,7 @@ def test_basic_encode(self, bert_files):
assert output.tokens == ["my", "name", "is", "john", "pair"]
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
assert output.type_ids == [0, 0, 0, 0, 1]

def test_encode_decode_in_subprocess(self, bert_files):
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
encode_decode_in_subprocess(tokenizer)
14 changes: 12 additions & 2 deletions bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,4 +1,8 @@
from ..utils import data_dir, roberta_files
from ..utils import (
data_dir,
roberta_files,
encode_decode_in_subprocess,
)
from tokenizers import ByteLevelBPETokenizer


@@ -63,7 +67,7 @@ def test_add_prefix_space(self, roberta_files):

def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

@@ -79,3 +83,9 @@ def test_lowerspace(self, roberta_files):
"Ġlazy",
"Ġdog",
]

def test_encode_decode_in_subprocess(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
encode_decode_in_subprocess(tokenizer)
6 changes: 5 additions & 1 deletion bindings/python/tests/implementations/test_char_bpe.py
@@ -1,4 +1,4 @@
from ..utils import data_dir, openai_files
from ..utils import data_dir, openai_files, encode_decode_in_subprocess
from tokenizers import CharBPETokenizer


@@ -42,3 +42,7 @@ def test_decoding(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
assert decoded == "my name is john"

def test_encode_decode_in_subprocess(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
encode_decode_in_subprocess(tokenizer)
26 changes: 26 additions & 0 deletions bindings/python/tests/utils.py
@@ -1,3 +1,4 @@
from multiprocessing import Process
import os
import requests
import pytest
@@ -56,3 +57,28 @@ def openai_files(data_dir):
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
),
}


def encode_decode_in_subprocess(tokenizer):
# It's essential to this test that we call 'encode' or 'encode_batch'
# before the fork. This causes the main process to "lock" some resources
# provided by the Rust "rayon" crate that are needed for parallel processing.
tokenizer.encode("Hi")
tokenizer.encode_batch(["hi", "there"])

def encode():
encoding = tokenizer.encode("Hi")
tokenizer.decode(encoding.ids)

p = Process(target=encode)
p.start()
p.join(timeout=1)

# At this point the process should have successfully exited.
# If the subprocess is still alive, the test has failed.
# But we want to terminate that process anyway, otherwise pytest might hang forever.
if p.is_alive():
p.terminate()
assert False, "tokenizer in subprocess caused a deadlock"

assert p.exitcode == 0
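
Note that this regression test relies on fork-based process creation, the default multiprocessing start method on Linux: only a forked child inherits the parent's rayon state. On platforms whose default start method is "spawn", the child starts from a fresh interpreter, and the original deadlock would not be expected to reproduce there.
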
3 changes: 1 addition & 2 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -2,7 +2,6 @@ use crate::tokenizer::{
Decoder, Encoding, NormalizedString, Offsets, PostProcessor, PreTokenizer, Result,
};
use onig::Regex;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

@@ -97,7 +96,7 @@ impl PreTokenizer for ByteLevel {
.collect::<Vec<_>>();

let splits = positions
.into_par_iter()
.into_iter()
.map(|range| {
// Process one of the splits
let slice = &normalized.get()[range];
3 changes: 1 addition & 2 deletions tokenizers/src/tokenizer/encoding.rs
@@ -1,6 +1,5 @@
use crate::tokenizer::{Offsets, Token};
use crate::utils::padding::PaddingDirection;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

/// Represents the output of a `Tokenizer`.
@@ -362,7 +361,7 @@ impl Encoding {
direction: PaddingDirection,
) {
// Dispatch call to all the overflowings first
self.overflowing.par_iter_mut().for_each(|encoding| {
self.overflowing.iter_mut().for_each(|encoding| {
encoding.pad(target_length, pad_id, pad_type_id, pad_token, direction)
});

9 changes: 2 additions & 7 deletions tokenizers/src/utils/padding.rs
@@ -1,5 +1,4 @@
use crate::tokenizer::{Encoding, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

/// The various possible padding directions.
@@ -54,11 +53,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu

let mut pad_length = match params.strategy {
PaddingStrategy::Fixed(size) => size,
PaddingStrategy::BatchLongest => encodings
.par_iter()
.map(|e| e.get_ids().len())
.max()
.unwrap(),
PaddingStrategy::BatchLongest => encodings.iter().map(|e| e.get_ids().len()).max().unwrap(),
};

if let Some(multiple) = params.pad_to_multiple_of {
@@ -67,7 +62,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
}
}

encodings.par_iter_mut().for_each(|encoding| {
encodings.iter_mut().for_each(|encoding| {
encoding.pad(
pad_length,
params.pad_id,