Skip to content

Commit

Permalink
Switch FST to DoubleArrayTrie (#76)
Browse files Browse the repository at this point in the history
* Switch FST library to yada (Double Array Trie) in PrefixDict
Requires Rust >= 1.46.0 to build

* Remove lindera-fst from Cargo.toml

* Upgrade yada-0.3.1

* Change .fst to .da
and remove the FST name from this repository

* Add long text test case

* Upgrade yada 0.3.2
Needed so that Tokenizer can be Cloneable
  • Loading branch information
johtani committed Oct 6, 2020
1 parent 0f94a38 commit 858b4ea
Show file tree
Hide file tree
Showing 19 changed files with 73 additions and 129 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Expand Up @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased
- Switch FST to Double Array #76 @johtani
- Add long-text benchmark #74 @johtani
- Update modules to 2018 #73 @johtani
- Use new method instead of default_normal #72 @johtani
Expand Down
42 changes: 9 additions & 33 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -10,7 +10,7 @@ Lindera aims to build a library which is easy to install and provides concise AP

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81
- jq

Expand Down
2 changes: 1 addition & 1 deletion lindera-cli/README.md
Expand Up @@ -14,7 +14,7 @@ A morphological analysis command-line interface for [Lindera](https://github.com

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
2 changes: 1 addition & 1 deletion lindera-core/Cargo.toml
Expand Up @@ -22,4 +22,4 @@ byteorder = "1.3.4"
encoding = "0.2.33"
serde = {version="1.0.106", features = ["derive"] }

lindera-fst = "0.1.1"
yada = "0.3.2"
2 changes: 1 addition & 1 deletion lindera-core/README.md
Expand Up @@ -10,7 +10,7 @@ This package contains dictionary structures and the viterbi algorithm.

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```text
Expand Down
77 changes: 11 additions & 66 deletions lindera-core/src/core/prefix_dict.rs
@@ -1,22 +1,20 @@
use std::ops::Deref;

use lindera_fst;
use lindera_fst::raw::Output;

use crate::core::word_entry::WordEntry;
use yada::DoubleArray;

#[derive(Clone)]
pub struct PrefixDict<Data = Vec<u8>> {
pub fst: lindera_fst::raw::Fst<Data>,
pub da: DoubleArray<Vec<u8>>,
pub vals_data: Data,
pub is_system: bool,
}

impl PrefixDict<&[u8]> {
pub fn from_static_slice(fst_data: &[u8], vals_data: &[u8]) -> lindera_fst::Result<PrefixDict> {
let fst = lindera_fst::raw::Fst::new(fst_data.to_vec())?;
pub fn from_static_slice(da_data: &[u8], vals_data: &[u8]) -> Result<PrefixDict, String> {
let da = DoubleArray::new(da_data.to_vec());
Ok(PrefixDict {
fst,
da,
vals_data: vals_data.to_vec(),
is_system: true,
})
Expand All @@ -25,31 +23,11 @@ impl PrefixDict<&[u8]> {

impl<D: Deref<Target = [u8]>> PrefixDict<D> {
pub fn prefix<'a>(&'a self, s: &'a str) -> impl Iterator<Item = (usize, WordEntry)> + 'a {
s.as_bytes()
.iter()
.scan(
(0, self.fst.root(), Output::zero()),
move |(prefix_len, node, output), &byte| {
if let Some(b_index) = node.find_input(byte) {
let transition = node.transition(b_index);
*prefix_len += 1;
*output = output.cat(transition.out);
*node = self.fst.node(transition.addr);
return Some((node.is_final(), *prefix_len, output.value()));
}
None
},
)
.filter_map(|(is_final, prefix_len, offset_len)| {
if is_final {
Some((prefix_len, offset_len))
} else {
None
}
})
.flat_map(move |(prefix_len, offset_len)| {
let len = offset_len & ((1u64 << 5) - 1u64);
let offset = offset_len >> 5u64;
self.da
.common_prefix_search(s)
.flat_map(move |(offset_len, prefix_len)| {
let len = offset_len & ((1u32 << 5) - 1u32);
let offset = offset_len >> 5u32;
let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
let data: &[u8] = &self.vals_data[offset_bytes..];
(0..len as usize).map(move |i| {
Expand All @@ -66,37 +44,4 @@ impl<D: Deref<Target = [u8]>> PrefixDict<D> {
}

#[cfg(test)]
mod tests {
// use crate::core::prefix_dict::PrefixDict;
//
// #[test]
// fn test_fst_prefix_2() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("—でも").count();
// assert_eq!(count_prefix, 1);
// }
//
// #[test]
// fn test_fst_prefix_tilde() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("〜").count();
// assert_eq!(count_prefix, 2);
// }
//
// #[test]
// fn test_fst_ikkagetsu() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("ー").count();
// assert_eq!(count_prefix, 0);
//
// let count_prefix = prefix_dict.prefix("ヶ月").count();
// assert_eq!(count_prefix, 1);
// }
//
// #[test]
// fn test_fst_prefix_asterisk_symbol() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("※").count();
// assert_eq!(count_prefix, 1);
// }
}
mod tests {}
2 changes: 1 addition & 1 deletion lindera-core/src/core/viterbi.rs
Expand Up @@ -194,7 +194,7 @@ impl Lattice {
}
}

// we check all word starting at start, using the fst, like we would use
// we check all words starting at start, using the double array, like we would use
// a prefix trie, and populate the lattice with as many edges
for (prefix_len, word_entry) in dict.prefix(suffix) {
let edge = Edge {
Expand Down
2 changes: 1 addition & 1 deletion lindera-dictionary/README.md
Expand Up @@ -8,7 +8,7 @@ A morphological dictionary loader for [Lindera](https://github.com/lindera-morph

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
2 changes: 1 addition & 1 deletion lindera-dictionary/src/lib.rs
Expand Up @@ -29,7 +29,7 @@ pub fn connection(dir: &str) -> ConnectionCostMatrix {
}

pub fn prefix_dict(dir: &str) -> PrefixDict {
let unidic_data_path = Path::new(dir).join("dict.fst");
let unidic_data_path = Path::new(dir).join("dict.da");
let unidic_data = read_file(unidic_data_path.to_str().unwrap());

let unidic_vals_path = Path::new(dir).join("dict.vals");
Expand Down
3 changes: 1 addition & 2 deletions lindera-ipadic-builder/Cargo.toml
Expand Up @@ -22,8 +22,7 @@ byteorder = "1.3.4"
clap = "2.33.0"
encoding = "0.2.33"
glob = "0.3.0"

lindera-fst = "0.1.0"
yada = "0.3.2"

lindera-core = { version = "0.5.1", path = "../lindera-core" }

Expand Down
2 changes: 1 addition & 1 deletion lindera-ipadic-builder/README.md
Expand Up @@ -14,7 +14,7 @@ IPADIC dictionary builder for [Lindera](https://github.com/lindera-morphology/li

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
45 changes: 28 additions & 17 deletions lindera-ipadic-builder/src/lib.rs
Expand Up @@ -19,7 +19,8 @@ use lindera_core::core::character_definition::{
use lindera_core::core::prefix_dict::PrefixDict;
use lindera_core::core::unknown_dictionary::UnknownDictionary;
use lindera_core::core::word_entry::{WordEntry, WordId};
use lindera_fst::MapBuilder;
use yada::builder::DoubleArrayBuilder;
use yada::DoubleArray;

#[derive(Debug)]
pub enum ParsingError {
Expand Down Expand Up @@ -177,9 +178,8 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
println!("sorting entries");
rows.sort_by_key(|row| row.surface_form.clone());

let wtr_fst = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.fst"))).unwrap(),
);
let mut wtr_da =
io::BufWriter::new(File::create(Path::new(output_dir).join(Path::new("dict.da"))).unwrap());
let mut wtr_vals = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.vals"))).unwrap(),
);
Expand Down Expand Up @@ -232,21 +232,27 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
wtr_words.flush()?;
wtr_words_idx.flush()?;

let mut id = 0u64;
let mut id = 0u32;

println!("building fst");
let mut fst_build = MapBuilder::new(wtr_fst).unwrap();
println!("building da");
let mut keyset: Vec<(&[u8], u32)> = vec![];
let mut lastlen = 0;
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
assert!(
len < (1 << 5),
format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
);
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
lastlen += len;
}
fst_build.finish().unwrap();
let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(da_bytes.is_some(), "DoubleArray build error. ");
wtr_da.write_all(&da_bytes.unwrap()[..])?;

println!("Last len is {}", lastlen);

println!("building values");
for word_entries in word_entry_map.values() {
Expand Down Expand Up @@ -309,21 +315,26 @@ pub fn build_user_dict(
bincode::serialize_into(&mut words_data, &word).unwrap();
}

let mut id = 0u64;
let mut id = 0u32;

// building fst
let mut fst_build = MapBuilder::<Vec<u8>>::memory();
// building da
let mut keyset: Vec<(&[u8], u32)> = vec![];
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
assert!(
len < (1 << 5),
format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
);
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
}
let fst_bytes = fst_build.into_inner().unwrap();

let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(
da_bytes.is_some(),
"DoubleArray build error for user dict. "
);

// building values
let mut vals_data = Vec::<u8>::new();
Expand All @@ -334,7 +345,7 @@ pub fn build_user_dict(
}

let dict = PrefixDict {
fst: lindera_fst::raw::Fst::new(fst_bytes).unwrap(),
da: DoubleArray::new(da_bytes.unwrap()),
vals_data: vals_data,
is_system: false,
};
Expand Down
2 changes: 1 addition & 1 deletion lindera-ipadic/README.md
Expand Up @@ -8,7 +8,7 @@ IPADIC dictionary loader for [Lindera](https://github.com/lindera-morphology/lin

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
Binary file added lindera-ipadic/lindera-ipadic/dict.da
Binary file not shown.
Binary file removed lindera-ipadic/lindera-ipadic/dict.fst
Binary file not shown.

0 comments on commit 858b4ea

Please sign in to comment.