Skip to content

Commit

Permalink
something that works
Browse files Browse the repository at this point in the history
  • Loading branch information
ArthurZucker committed Jun 18, 2024
1 parent d59973a commit 34d7cf8
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 5 deletions.
3 changes: 2 additions & 1 deletion tokenizers/src/tokenizer/added_vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,8 @@ impl AddedVocabulary {
}

pub fn simple_id_to_token(&self, id: u32) -> Option<String> {
self.added_tokens_map_r.get(&id).map(|t| t.content.clone())
let token = self.added_tokens_map_r.get(&id).map(|t| t.content.clone());
token
}

//
Expand Down
15 changes: 11 additions & 4 deletions tokenizers/src/tokenizer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -856,14 +856,17 @@ where
} else if let Some(pre_tok) = &self.pre_tokenizer {
let mut string = PreTokenizedString::from(token);
pre_tok.pre_tokenize(&mut string);
println!("Pre-tok String: {}", string.original);
Some(string.original)
} else {
println!("String: {}", token);
Some(token)
}
})
.or_else(|| self.model.id_to_token(*id))
})
.collect::<Vec<_>>();

println!("This should print: {:?}", tokens);
if let Some(decoder) = &self.decoder {
decoder.decode(tokens)
} else {
Expand Down Expand Up @@ -1304,11 +1307,15 @@ where
mod test {

    use crate::pre_tokenizers::byte_level;
    use crate::AddedToken;
    use crate::Tokenizer;

    /// Decodes a short id sequence after registering one extra added token.
    ///
    /// The test only checks that decoding succeeds; the decoded text is
    /// printed for manual inspection (run with `--nocapture` to see it).
    // NOTE(review): `from_pretrained` presumably needs the `http` feature and
    // network access, hence the gate below — confirm against the crate features.
    #[cfg(feature = "http")]
    #[test]
    fn test_decoding_with_added_bpe() {
        let mut tokenizer = Tokenizer::from_pretrained("gpt2", None)
            .expect("loading the pretrained `gpt2` tokenizer should succeed");
        // The returned count is not asserted on: whether the token is new to
        // the vocabulary cannot be determined from this test alone.
        let _added = tokenizer.add_tokens(&[AddedToken::from("ĠåĹİ", false)]);
        let decoded = tokenizer
            .decode(&[0, 1, 3512, 50257], false)
            .expect("decoding ids that include an added token should succeed");
        println!("Fully decoded text{:?}", decoded);
    }
}

0 comments on commit 34d7cf8

Please sign in to comment.