remove use of parallel iterators except in batch methods #308

Closed
wants to merge 2 commits
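
The change itself is mechanical: the code paths shown below (byte-level pre-tokenization, padding of a single encoding and its overflowing parts, and the padding utility) swap rayon's parallel iterators for the standard sequential ones, so that, per the title, parallelism stays confined to the batch methods. Judging by the new subprocess tests, the motivation is that a Python process which forks after rayon's thread pool has been used in the parent can otherwise deadlock on inherited pool state. A minimal sketch of the intended split, with hypothetical encode_one / encode_batch helpers rather than the real Tokenizer API (assumes the rayon crate as a dependency):

use rayon::prelude::*;

// Per-item path: plain sequential iteration, no thread-pool involvement,
// so a forked child process never needs the parent's rayon state for it.
fn encode_one(input: &str) -> Vec<String> {
    input.split_whitespace().map(str::to_owned).collect()
}

// Batch path: parallelism is confined to the batch entry point.
fn encode_batch(inputs: &[String]) -> Vec<Vec<String>> {
    inputs.par_iter().map(|s| encode_one(s)).collect()
}

fn main() {
    let batch = vec!["my name is john".to_string(), "hi there".to_string()];
    println!("{:?}", encode_one(&batch[0]));
    println!("{:?}", encode_batch(&batch));
}

Single-item calls made in a forked child no longer depend on thread-pool state they could not have inherited safely, while batch calls keep their parallel speed-up.
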
59 changes: 53 additions & 6 deletions bindings/python/tests/bindings/test_tokenizer.py
@@ -1,6 +1,6 @@
import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files
from ..utils import data_dir, roberta_files, bert_files, encode_decode_in_subprocess

from tokenizers import AddedToken, Tokenizer, Encoding
from tokenizers.models import Model, BPE, WordPiece
@@ -125,18 +125,54 @@ def test_encode_formats(self, bert_files):
output = tokenizer.encode("my name is john")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode("my name is john", "pair")
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]

output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output[0].tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
assert output[1].tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]

@@ -162,7 +198,14 @@ def test_encode_add_special_tokens(self, roberta_files):

# Can encode with special tokens
output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
assert output_with_specials.tokens == [
"<s>",
"ĠMy",
"Ġname",
"Ġis",
"ĠJohn",
"</s>",
]

# Can encode without special tokens
output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
@@ -269,3 +312,7 @@ def test_post_process(self):
# Can post process a pair of encodings
output = tokenizer.post_process(encoding, pair_encoding)
assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]

def test_encode_decode_in_subprocess(self):
tokenizer = Tokenizer(BPE())
encode_decode_in_subprocess(tokenizer)
28 changes: 25 additions & 3 deletions bindings/python/tests/implementations/test_bert_wordpiece.py
@@ -1,4 +1,4 @@
from ..utils import data_dir, bert_files
from ..utils import data_dir, bert_files, encode_decode_in_subprocess
from tokenizers import BertWordPieceTokenizer


@@ -9,8 +9,26 @@ def test_basic_encode(self, bert_files):
# Encode with special tokens by default
output = tokenizer.encode("My name is John", "pair")
assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
assert output.tokens == [
"[CLS]",
"my",
"name",
"is",
"john",
"[SEP]",
"pair",
"[SEP]",
]
assert output.offsets == [
(0, 0),
(0, 2),
(3, 7),
(8, 10),
(11, 15),
(0, 0),
(0, 4),
(0, 0),
]
assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

# Can encode without the special tokens
@@ -19,3 +37,7 @@ def test_basic_encode(self, bert_files):
assert output.tokens == ["my", "name", "is", "john", "pair"]
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
assert output.type_ids == [0, 0, 0, 0, 1]

def test_encode_decode_in_subprocess(self, bert_files):
tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
encode_decode_in_subprocess(tokenizer)
14 changes: 12 additions & 2 deletions bindings/python/tests/implementations/test_byte_level_bpe.py
@@ -1,4 +1,8 @@
from ..utils import data_dir, roberta_files
from ..utils import (
data_dir,
roberta_files,
encode_decode_in_subprocess,
)
from tokenizers import ByteLevelBPETokenizer


@@ -63,7 +67,7 @@ def test_add_prefix_space(self, roberta_files):

def test_lowerspace(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")

@@ -79,3 +83,9 @@ def test_lowerspace(self, roberta_files):
"Ġlazy",
"Ġdog",
]

def test_encode_decode_in_subprocess(self, roberta_files):
tokenizer = ByteLevelBPETokenizer(
roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True,
)
encode_decode_in_subprocess(tokenizer)
6 changes: 5 additions & 1 deletion bindings/python/tests/implementations/test_char_bpe.py
@@ -1,4 +1,4 @@
from ..utils import data_dir, openai_files
from ..utils import data_dir, openai_files, encode_decode_in_subprocess
from tokenizers import CharBPETokenizer


@@ -42,3 +42,7 @@ def test_decoding(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
assert decoded == "my name is john"

def test_encode_decode_in_subprocess(self, openai_files):
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
encode_decode_in_subprocess(tokenizer)
26 changes: 26 additions & 0 deletions bindings/python/tests/utils.py
@@ -1,3 +1,4 @@
from multiprocessing import Process
import os
import requests
import pytest
@@ -56,3 +57,28 @@ def openai_files(data_dir):
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
),
}


def encode_decode_in_subprocess(tokenizer):
# It's essential to this test that we call 'encode' or 'encode_batch'
# before the fork. This causes the main process to "lock" some resources
# provided by the Rust "rayon" crate that are needed for parallel processing.
tokenizer.encode("Hi")
tokenizer.encode_batch(["hi", "there"])

def encode():
encoding = tokenizer.encode("Hi")
tokenizer.decode(encoding.ids)

p = Process(target=encode)
p.start()
p.join(timeout=1)

# At this point the process should have successfully exited.
# If the subprocess is still alive, the test has failed.
# But we want to terminate that process anyway, otherwise pytest might hang forever.
if p.is_alive():
p.terminate()
assert False, "tokenizer in subprocess caused a deadlock"

assert p.exitcode == 0
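
Note that this regression test relies on fork-based process creation, the default multiprocessing start method on Linux: only a forked child inherits the parent's rayon state. On platforms whose default start method is "spawn", the child starts from a fresh interpreter, and the original deadlock would not be expected to reproduce there.
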
3 changes: 1 addition & 2 deletions tokenizers/src/pre_tokenizers/byte_level.rs
@@ -2,7 +2,6 @@ use crate::tokenizer::{
Decoder, Encoding, NormalizedString, Offsets, PostProcessor, PreTokenizer, Result,
};
use onig::Regex;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::collections::{HashMap, HashSet};

@@ -97,7 +96,7 @@ impl PreTokenizer for ByteLevel {
.collect::<Vec<_>>();

let splits = positions
.into_par_iter()
.into_iter()
.map(|range| {
// Process one of the splits
let slice = &normalized.get()[range];
3 changes: 1 addition & 2 deletions tokenizers/src/tokenizer/encoding.rs
@@ -1,6 +1,5 @@
use crate::tokenizer::{Offsets, Token};
use crate::utils::padding::PaddingDirection;
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

/// Represents the output of a `Tokenizer`.
@@ -362,7 +361,7 @@ impl Encoding {
direction: PaddingDirection,
) {
// Dispatch call to all the overflowings first
self.overflowing.par_iter_mut().for_each(|encoding| {
self.overflowing.iter_mut().for_each(|encoding| {
encoding.pad(target_length, pad_id, pad_type_id, pad_token, direction)
});

9 changes: 2 additions & 7 deletions tokenizers/src/utils/padding.rs
@@ -1,5 +1,4 @@
use crate::tokenizer::{Encoding, Result};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};

/// The various possible padding directions.
@@ -54,11 +53,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu

let mut pad_length = match params.strategy {
PaddingStrategy::Fixed(size) => size,
PaddingStrategy::BatchLongest => encodings
.par_iter()
.map(|e| e.get_ids().len())
.max()
.unwrap(),
PaddingStrategy::BatchLongest => encodings.iter().map(|e| e.get_ids().len()).max().unwrap(),
};

if let Some(multiple) = params.pad_to_multiple_of {
@@ -67,7 +62,7 @@ pub fn pad_encodings(encodings: &mut [Encoding], params: &PaddingParams) -> Resu
}
}

encodings.par_iter_mut().for_each(|encoding| {
encodings.iter_mut().for_each(|encoding| {
encoding.pad(
pad_length,
params.pad_id,