Skip to content

Commit

Permalink
make sure we don't warn on empty tokens
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Jun 17, 2024
1 parent 1ff56c0 commit ecc61ff
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 10 deletions.
2 changes: 2 additions & 0 deletions tokenizers/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ unstable_wasm = ["fancy-regex", "getrandom/js"]
criterion = "0.5"
tempfile = "3.10"
assert_approx_eq = "1.1"
tracing-test = "0.2"
tracing = "0.1"

[profile.release]
lto = "fat"
27 changes: 17 additions & 10 deletions tokenizers/src/tokenizer/serialization.rs
Original file line number Diff line number Diff line change
Expand Up @@ -155,17 +155,15 @@ where
for token in &tokens {
// Warn the user if the id is different than expected
let received_id = tokenizer.token_to_id(&token.token.content);
if received_id != Some(token.id) {
warn!(
"Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
token.token.content,
token.id,
if let Some(rid) = received_id {
if let Some(rid) = received_id {
if rid != token.id {
warn!(
"Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
token.token.content,
token.id,
rid.to_string()
} else {
"None".to_string()
}
);
);
}
}
}
let added_tokens: Vec<_> = tokens.into_iter().map(|token| token.token).collect();
Expand All @@ -179,6 +177,7 @@ where
mod tests {
use crate::tokenizer::Tokenizer;
use std::str::FromStr;
use tracing_test::traced_test;

#[test]
fn test_deserialization_serialization_invariant() {
Expand Down Expand Up @@ -233,4 +232,12 @@ mod tests {
// It should be exactly the same as above
assert_eq!(tok_str, tok_json);
}

/// Loads the pretrained `Qwen/Qwen2-7B-Instruct` tokenizer and checks that
/// no warning-level log line was captured while doing so.
///
/// Only compiled when the `http` feature is enabled.
#[cfg(feature = "http")]
#[traced_test]
#[test]
fn test_from_pretrained() {
    // NOTE(review): the load result is deliberately discarded, so if loading
    // fails (e.g. no network access) nothing is logged and the assertion
    // below passes vacuously. Confirm this best-effort behaviour is intended.
    let _ = Tokenizer::from_pretrained("Qwen/Qwen2-7B-Instruct", None);
    // The second argument of `assert!` is the panic message, not a pattern to
    // search for, so it describes the failure instead of repeating a fragment
    // of the warning text.
    assert!(
        !logs_contain("WARN"),
        "unexpected warning logged while loading the tokenizer \
         (e.g. \"Warning: Token '...' was expected to have ID ...\")"
    );
}
}

0 comments on commit ecc61ff

Please sign in to comment.