-
Notifications
You must be signed in to change notification settings - Fork 759
/
trainers.rs
104 lines (98 loc) · 4.03 KB
/
trainers.rs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
extern crate tokenizers as tk;
use super::utils::Container;
use pyo3::prelude::*;
use pyo3::types::*;
/// Python-exposed wrapper around a `tk::tokenizer::Trainer` trait object.
///
/// This is the type returned by the `new` constructors of `BpeTrainer` and
/// `WordPieceTrainer` in this file.
#[pyclass]
pub struct Trainer {
/// The wrapped trainer implementation, stored in a `Container`.
pub trainer: Container<dyn tk::tokenizer::Trainer>,
}
/// Field-less Python class used as a namespace for the static `new`
/// constructor, which builds a `tk::models::bpe::BpeTrainer` and returns it
/// wrapped in a `Trainer`.
#[pyclass]
pub struct BpeTrainer {}
#[pymethods]
impl BpeTrainer {
    /// new(/ vocab_size, min_frequency)
    /// --
    ///
    /// Create a new BpeTrainer with the given configuration
    ///
    /// Recognized keyword arguments: `vocab_size`, `min_frequency`,
    /// `show_progress`, `special_tokens`, `limit_alphabet`,
    /// `initial_alphabet`, `continuing_subword_prefix` and
    /// `end_of_word_suffix`. Any other keyword is reported on stdout and
    /// ignored. An error is raised when a value cannot be converted to the
    /// type the corresponding builder option expects.
    #[staticmethod]
    #[args(kwargs = "**")]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
        // Options that are not supplied keep whatever the builder starts with.
        let mut builder = tk::models::bpe::BpeTrainer::builder();
        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "vocab_size" => builder = builder.vocab_size(val.extract()?),
                    "min_frequency" => builder = builder.min_frequency(val.extract()?),
                    "show_progress" => builder = builder.show_progress(val.extract()?),
                    "special_tokens" => builder = builder.special_tokens(val.extract()?),
                    "limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
                    "initial_alphabet" => {
                        let alphabet: Vec<String> = val.extract()?;
                        // Only the first character of each entry is kept;
                        // empty strings contribute nothing.
                        builder = builder.initial_alphabet(
                            alphabet
                                .into_iter()
                                .filter_map(|s| s.chars().next())
                                .collect(),
                        );
                    }
                    "continuing_subword_prefix" => {
                        builder = builder.continuing_subword_prefix(val.extract()?)
                    }
                    "end_of_word_suffix" => builder = builder.end_of_word_suffix(val.extract()?),
                    // Best-effort: unknown options are reported, not rejected.
                    _ => println!("Ignored unknown kwargs option {}", key),
                };
            }
        }
        Ok(Trainer {
            trainer: Container::Owned(Box::new(builder.build())),
        })
    }
}
/// Field-less Python class used as a namespace for the static `new`
/// constructor, which builds a `tk::models::wordpiece::WordPieceTrainer` and
/// returns it wrapped in a `Trainer`.
#[pyclass]
pub struct WordPieceTrainer {}
#[pymethods]
impl WordPieceTrainer {
    /// new(/ vocab_size, min_frequency)
    /// --
    ///
    /// Create a new WordPieceTrainer with the given configuration
    ///
    /// Recognized keyword arguments: `vocab_size`, `min_frequency`,
    /// `show_progress`, `special_tokens`, `limit_alphabet`,
    /// `initial_alphabet`, `continuing_subword_prefix` and
    /// `end_of_word_suffix`. Any other keyword is reported on stdout and
    /// ignored. An error is raised when a value cannot be converted to the
    /// type the corresponding builder option expects.
    #[staticmethod]
    #[args(kwargs = "**")]
    pub fn new(kwargs: Option<&PyDict>) -> PyResult<Trainer> {
        // Options that are not supplied keep whatever the builder starts with.
        let mut builder = tk::models::wordpiece::WordPieceTrainer::builder();
        if let Some(kwargs) = kwargs {
            for (key, val) in kwargs {
                let key: &str = key.extract()?;
                match key {
                    "vocab_size" => builder = builder.vocab_size(val.extract()?),
                    "min_frequency" => builder = builder.min_frequency(val.extract()?),
                    "show_progress" => builder = builder.show_progress(val.extract()?),
                    "special_tokens" => builder = builder.special_tokens(val.extract()?),
                    "limit_alphabet" => builder = builder.limit_alphabet(val.extract()?),
                    "initial_alphabet" => {
                        let alphabet: Vec<String> = val.extract()?;
                        // Only the first character of each entry is kept;
                        // empty strings contribute nothing.
                        builder = builder.initial_alphabet(
                            alphabet
                                .into_iter()
                                .filter_map(|s| s.chars().next())
                                .collect(),
                        );
                    }
                    "continuing_subword_prefix" => {
                        builder = builder.continuing_subword_prefix(val.extract()?)
                    }
                    "end_of_word_suffix" => builder = builder.end_of_word_suffix(val.extract()?),
                    // Best-effort: unknown options are reported, not rejected.
                    _ => println!("Ignored unknown kwargs option {}", key),
                };
            }
        }
        Ok(Trainer {
            trainer: Container::Owned(Box::new(builder.build())),
        })
    }
}