AddedVocabulary - Add tests, update bindings + various tweaks
n1t0 committed Jun 18, 2020
1 parent c6f633e commit 7cedb13
Showing 12 changed files with 327 additions and 91 deletions.
12 changes: 11 additions & 1 deletion bindings/node/lib/bindings/tokenizer.d.ts
@@ -392,6 +392,15 @@ export interface AddedTokenOptions {
* @default False
*/
singleWord?: boolean;
/**
* Whether this token should match against the normalized version of the text. For example,
* with the added token `yesterday` and a normalizer in charge of lowercasing the text,
* the input `I saw a lion Yesterday` would match the token.
* This defaults to False for special tokens, and True otherwise.
* @default True
*/
normalized?: boolean;

}

/**
@@ -404,9 +413,10 @@ export class AddedToken {
/**
* Instantiate a new AddedToken
* @param content The content of the token
* @param special Whether this is a special token
* @param [options] Options for the token
*/
constructor(content: string, options?: AddedTokenOptions);
constructor(content: string, special: boolean, options?: AddedTokenOptions);

/**
* Get the content of the AddedToken
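These option flags map directly onto builder methods of the core crate's `AddedToken`, as the native binding further down in this diff shows (`single_word`, `rstrip` and `normalized` appear there; `lstrip` is assumed to be their left-hand counterpart). A minimal Rust sketch of the equivalent construction, with example token contents:

```rust
use tokenizers::tokenizer::AddedToken;

fn main() {
    // Plain added token: `normalized` defaults to true, so with a lowercasing
    // normalizer the input `Yesterday` would still match `yesterday`.
    let yesterday = AddedToken::from("yesterday", false).normalized(true);

    // Special token: `normalized` defaults to false, so it must appear
    // verbatim in the raw input. `lstrip(true)` also lets it consume
    // whitespace on its left, e.g. ` [MASK]` in `I saw a [MASK]`.
    let mask = AddedToken::from("[MASK]", true)
        .single_word(true)
        .lstrip(true)
        .rstrip(false);

    let _ = (yesterday, mask);
}
```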
12 changes: 6 additions & 6 deletions bindings/node/lib/bindings/tokenizer.test.ts
@@ -32,17 +32,17 @@ import {

describe("AddedToken", () => {
it("instantiates with only content", () => {
const addToken = new AddedToken("test");
const addToken = new AddedToken("test", false);
expect(addToken.constructor.name).toEqual("AddedToken");
});

it("instantiates with empty options", () => {
const addToken = new AddedToken("test", {});
const addToken = new AddedToken("test", false, {});
expect(addToken.constructor.name).toEqual("AddedToken");
});

it("instantiates with options", () => {
const addToken = new AddedToken("test", {
const addToken = new AddedToken("test", false, {
leftStrip: true,
rightStrip: true,
singleWord: true
@@ -52,7 +52,7 @@ describe("AddedToken", () => {

describe("getContent", () => {
it("returns the string content of AddedToken", () => {
const addedToken = new AddedToken("test");
const addedToken = new AddedToken("test", false);
expect(addedToken.getContent()).toEqual("test");
});
});
@@ -107,7 +107,7 @@ describe("Tokenizer", () => {
it("accepts a list of AddedToken as new tokens when initial model is empty", () => {
const model = BPE.empty();
const tokenizer = new Tokenizer(model);
const addedToken = new AddedToken("test");
const addedToken = new AddedToken("test", false);

const nbAdd = tokenizer.addTokens([addedToken]);
expect(nbAdd).toBe(1);
Expand All @@ -132,7 +132,7 @@ describe("Tokenizer", () => {

const model = BPE.empty();
tokenizer = new Tokenizer(model);
tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair")]);
tokenizer.addTokens(["my", "name", "is", "john", new AddedToken("pair", false)]);

encode = promisify(tokenizer.encode.bind(tokenizer));
encodeBatch = promisify(tokenizer.encodeBatch.bind(tokenizer));
41 changes: 31 additions & 10 deletions bindings/node/native/src/tokenizer.rs
@@ -30,10 +30,11 @@ struct AddedTokenOptions {
singleWord: Option<bool>,
leftStrip: Option<bool>,
rightStrip: Option<bool>,
normalized: Option<bool>,
}
impl AddedTokenOptions {
fn into_added_token(self, content: String) -> tk::AddedToken {
let mut token = tk::AddedToken::from(content);
fn into_added_token(self, content: String, special: bool) -> tk::AddedToken {
let mut token = tk::AddedToken::from(content, special);
if let Some(sw) = self.singleWord {
token = token.single_word(sw);
}
@@ -43,6 +44,9 @@ impl AddedTokenOptions {
if let Some(rs) = self.rightStrip {
token = token.rstrip(rs);
}
if let Some(n) = self.normalized {
token = token.normalized(n);
}
token
}
}
@@ -52,18 +56,20 @@ declare_types! {
init(mut cx) {
// init(
// content: string,
// special: boolean,
// options?: {
// singleWord?: boolean = false,
// leftStrip?: boolean = false,
// rightStrip?: boolean = false
// normalized?: boolean = true,
// }
// )

let content = cx.extract::<String>(0)
.map_err(|_| Error("First argument must be string".into()))?;
let token = cx.extract_opt::<AddedTokenOptions>(1)?
let content = cx.extract::<String>(0)?;
let special = cx.extract::<bool>(1)?;
let token = cx.extract_opt::<AddedTokenOptions>(2)?
.unwrap_or_else(AddedTokenOptions::default)
.into_added_token(content);
.into_added_token(content, special);

Ok(AddedToken { token })
}
@@ -87,7 +93,7 @@ impl FromJsValue for AddedToken {
fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
if let Ok(token) = from.downcast::<JsString>() {
Ok(AddedToken {
token: tk::AddedToken::from(token.value()),
token: tk::AddedToken::from(token.value(), false),
})
} else if let Ok(token) = from.downcast::<JsAddedToken>() {
let guard = cx.lock();
@@ -99,6 +105,21 @@ }
}
}

struct SpecialToken(tk::AddedToken);
impl FromJsValue for SpecialToken {
fn from_value<'c, C: Context<'c>>(from: Handle<'c, JsValue>, cx: &mut C) -> LibResult<Self> {
if let Ok(token) = from.downcast::<JsString>() {
Ok(SpecialToken(tk::AddedToken::from(token.value(), true)))
} else if let Ok(token) = from.downcast::<JsAddedToken>() {
let guard = cx.lock();
let token = token.borrow(&guard);
Ok(SpecialToken(token.token.clone()))
} else {
Err(Error("Expected `string | AddedToken`".into()))
}
}
}

// encode & encodeBatch types

struct TextInputSequence(tk::InputSequence);
@@ -623,7 +644,7 @@

let this = cx.this();
let guard = cx.lock();
let token = this.borrow(&guard).tokenizer.id_to_token(id);
let token = this.borrow(&guard).tokenizer.id_to_token(id).map(|t| t.to_owned());

if let Some(token) = token {
Ok(cx.string(token).upcast())
@@ -650,9 +671,9 @@
method addSpecialTokens(mut cx) {
// addSpecialTokens(tokens: (string | AddedToken)[]): number

let tokens = cx.extract_vec::<AddedToken>(0)?
let tokens = cx.extract_vec::<SpecialToken>(0)?
.into_iter()
.map(|token| token.into())
.map(|token| token.0)
.collect::<Vec<_>>();

let mut this = cx.this();
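The `SpecialToken` newtype above exists only to flip the coercion default: a bare string handed to `addSpecialTokens` becomes an `AddedToken` with `special = true`, while the `FromJsValue` impl for `AddedToken` keeps `special = false` for `addTokens`. A sketch of that dispatch, with hypothetical free functions standing in for the two JS entry points:

```rust
use tokenizers::tokenizer::AddedToken;

// Hypothetical stand-ins for the coercion performed by the two JS methods.
fn coerce_for_add_tokens(raw: &str) -> AddedToken {
    // addTokens: a plain added token; `normalized` defaults to true.
    AddedToken::from(raw, false)
}

fn coerce_for_add_special_tokens(raw: &str) -> AddedToken {
    // addSpecialTokens: a special token; `normalized` defaults to false.
    AddedToken::from(raw, true)
}

fn main() {
    let _pair = coerce_for_add_tokens("pair");
    let _ent = coerce_for_add_special_tokens("[ENT]");
}
```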
2 changes: 1 addition & 1 deletion bindings/python/src/tokenizer.rs
@@ -29,7 +29,7 @@ impl AddedToken {
#[new]
#[args(kwargs = "**")]
fn new(content: &str, is_special_token: bool, kwargs: Option<&PyDict>) -> PyResult<Self> {
let mut token = tk::tokenizer::AddedToken::from(content.to_owned(), is_special_token);
let mut token = tk::tokenizer::AddedToken::from(content, is_special_token);

if let Some(kwargs) = kwargs {
for (key, value) in kwargs {
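Note the dropped `.to_owned()`: within this commit the constructor is called with both a `String` (the node binding passes `token.value()`) and a `&str` (here), which suggests `AddedToken::from` is generic over the content type. A quick sketch of both call shapes, assumed to compile on that basis:

```rust
use tokenizers::tokenizer::AddedToken;

fn main() {
    let from_str = AddedToken::from("<unk>", true); // &str, as in the Python binding
    let from_string = AddedToken::from(String::from("<unk>"), true); // String, as in the node binding
    let _ = (from_str, from_string);
}
```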
31 changes: 24 additions & 7 deletions bindings/python/tokenizers/__init__.pyi
@@ -200,27 +200,44 @@ class AddedToken:
"""

def __new__(
cls, content: str, single_word: bool = False, lstrip: bool = False, rstrip: bool = False,
cls,
content: str,
is_special_token: bool,
single_word: bool = False,
lstrip: bool = False,
rstrip: bool = False,
normalized: bool = True,
) -> AddedToken:
""" Instantiate a new AddedToken
Args:
content: str:
The content of the token
is_special_token: bool:
Whether this token is a special token. This has an impact on the default value for
`normalized`, which is False for special tokens, but True for others.
single_word: bool
Whether this token should only match against single word. If True,
this token will never match inside of a word.
Whether this token should only match against single words. If True,
this token will never match inside of a word. For example, the token `ing` would
match on `tokenizing` if this option is False, but not if it is True.
lstrip: bool
Whether this token should strip all potential whitespaces on the left side.
If True, this token will greedily match any whitespace on the left and then strip
them out.
If True, this token will greedily match any whitespace on the left. For example,
if we try to match the token `[MASK]` with lstrip=True, in the text `I saw a [MASK]`
we will match on ` [MASK]`.
rstrip: bool
Whether this token should strip all potential whitespaces on the right side.
If True, this token will greedily match any whitespace on the right and then strip
them out.
If True, this token will greedily match any whitespace on the right. It works just
like lstrip, but on the right.
normalized: bool:
Whether this token should match the normalized version of the input text. For
example, with the added token `yesterday` and a normalizer in charge of lowercasing
the text, the token could be extracted from the input `I saw a lion Yesterday`.
"""
pass

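Taken together, a sketch of how these flags are typically applied when registering tokens on a tokenizer, mirroring the Rust benchmark below (crate paths assumed from the repo layout; `BPE::default()` stands in for a trained model):

```rust
use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::{AddedToken, Tokenizer};

fn main() {
    let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));

    // A sub-word token allowed to match inside words, e.g. within `tokenizing`.
    tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);

    // A special token restricted to whole words and matched on the raw input.
    tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
}
```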
18 changes: 9 additions & 9 deletions tokenizers/README.md
@@ -9,7 +9,7 @@
<img alt="GitHub" src="https://img.shields.io/github/license/huggingface/tokenizers.svg?color=blue">
</a>
<a href="https://docs.rs/tokenizers/">
<img alt="Doc" src="https://docs.rs/tokenizers/badge.svg">
</a>
</p>
<br>
@@ -56,22 +56,22 @@ fn main() -> Result<()>{
.vocab_size(vocab_size)
.min_frequency(0)
.special_tokens(vec![
AddedToken::from("<s>".into()),
AddedToken::from("<pad>".into()),
AddedToken::from("</s>".into()),
AddedToken::from("<unk>".into()),
AddedToken::from("<mask>".into()),
AddedToken::from("<s>", true),
AddedToken::from("<pad>", true),
AddedToken::from("</s>", true),
AddedToken::from("<unk>", true),
AddedToken::from("<mask>", true),
])
.build(),
);

let mut tokenizer = Tokenizer::new(Box::new(BPE::default()));
tokenizer.with_normalizer(Box::new(Sequence::new(vec![
Box::new(Strip::new(true, true)),
Box::new(NFC),
])));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));

tokenizer.train(&trainer, vec!["/path/to/train.txt".to_string()])?;
tokenizer.save("/path/to/trained_tokenizer", true)?;

@@ -86,7 +86,7 @@ use tokenizers::Result;
use tokenizers::tokenizer::Tokenizer;

fn main() -> Result<()>{

let tokenizer = Tokenizer::from_file("/path/to/trained_tokenizer")?;

let sample_encoding = tokenizer.encode("Huggingface", false)?;
5 changes: 2 additions & 3 deletions tokenizers/benches/bpe_benchmark.rs
@@ -17,9 +17,8 @@ fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
let mut tokenizer = Tokenizer::new(Box::new(bpe));
tokenizer.with_pre_tokenizer(Box::new(ByteLevel::default()));
tokenizer.with_decoder(Box::new(ByteLevel::default()));
tokenizer.add_tokens(&[AddedToken::from(String::from("ing"), false).single_word(false)]);
tokenizer
.add_special_tokens(&[AddedToken::from(String::from("[ENT]"), true).single_word(true)]);
tokenizer.add_tokens(&[AddedToken::from("ing", false).single_word(false)]);
tokenizer.add_special_tokens(&[AddedToken::from("[ENT]", true).single_word(true)]);
tokenizer
}

