Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch FST to DoubleArrayTrie #76

Merged
merged 6 commits into from Oct 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.md
Expand Up @@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
This project adheres to [Semantic Versioning](http://semver.org/).

## Unreleased
- Switch from FST to Double Array Trie #76 @johtani
- Add long-text benchmark #74 @johtani
- Update modules to 2018 #73 @johtani
- Use new method instead of default_normal #72 @johtani
Expand Down
42 changes: 9 additions & 33 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -10,7 +10,7 @@ Lindera aims to build a library which is easy to install and provides concise AP

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81
- jq

Expand Down
2 changes: 1 addition & 1 deletion lindera-cli/README.md
Expand Up @@ -14,7 +14,7 @@ A morphological analysis command-line interface for [Lindera](https://github.com

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
2 changes: 1 addition & 1 deletion lindera-core/Cargo.toml
Expand Up @@ -22,4 +22,4 @@ byteorder = "1.3.4"
encoding = "0.2.33"
serde = {version="1.0.106", features = ["derive"] }

lindera-fst = "0.1.1"
yada = "0.3.2"
2 changes: 1 addition & 1 deletion lindera-core/README.md
Expand Up @@ -10,7 +10,7 @@ This package contains dictionary structures and the viterbi algorithm.

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```text
Expand Down
77 changes: 11 additions & 66 deletions lindera-core/src/core/prefix_dict.rs
@@ -1,22 +1,20 @@
use std::ops::Deref;

use lindera_fst;
use lindera_fst::raw::Output;

use crate::core::word_entry::WordEntry;
use yada::DoubleArray;

#[derive(Clone)]
pub struct PrefixDict<Data = Vec<u8>> {
pub fst: lindera_fst::raw::Fst<Data>,
pub da: DoubleArray<Vec<u8>>,
pub vals_data: Data,
pub is_system: bool,
}

impl PrefixDict<&[u8]> {
pub fn from_static_slice(fst_data: &[u8], vals_data: &[u8]) -> lindera_fst::Result<PrefixDict> {
let fst = lindera_fst::raw::Fst::new(fst_data.to_vec())?;
pub fn from_static_slice(da_data: &[u8], vals_data: &[u8]) -> Result<PrefixDict, String> {
let da = DoubleArray::new(da_data.to_vec());
Ok(PrefixDict {
fst,
da,
vals_data: vals_data.to_vec(),
is_system: true,
})
Expand All @@ -25,31 +23,11 @@ impl PrefixDict<&[u8]> {

impl<D: Deref<Target = [u8]>> PrefixDict<D> {
pub fn prefix<'a>(&'a self, s: &'a str) -> impl Iterator<Item = (usize, WordEntry)> + 'a {
s.as_bytes()
.iter()
.scan(
(0, self.fst.root(), Output::zero()),
move |(prefix_len, node, output), &byte| {
if let Some(b_index) = node.find_input(byte) {
let transition = node.transition(b_index);
*prefix_len += 1;
*output = output.cat(transition.out);
*node = self.fst.node(transition.addr);
return Some((node.is_final(), *prefix_len, output.value()));
}
None
},
)
.filter_map(|(is_final, prefix_len, offset_len)| {
if is_final {
Some((prefix_len, offset_len))
} else {
None
}
})
.flat_map(move |(prefix_len, offset_len)| {
let len = offset_len & ((1u64 << 5) - 1u64);
let offset = offset_len >> 5u64;
self.da
.common_prefix_search(s)
.flat_map(move |(offset_len, prefix_len)| {
let len = offset_len & ((1u32 << 5) - 1u32);
let offset = offset_len >> 5u32;
let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
let data: &[u8] = &self.vals_data[offset_bytes..];
(0..len as usize).map(move |i| {
Expand All @@ -66,37 +44,4 @@ impl<D: Deref<Target = [u8]>> PrefixDict<D> {
}

#[cfg(test)]
mod tests {
// use crate::core::prefix_dict::PrefixDict;
//
// #[test]
// fn test_fst_prefix_2() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("—でも").count();
// assert_eq!(count_prefix, 1);
// }
//
// #[test]
// fn test_fst_prefix_tilde() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("〜").count();
// assert_eq!(count_prefix, 2);
// }
//
// #[test]
// fn test_fst_ikkagetsu() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("ー").count();
// assert_eq!(count_prefix, 0);
//
// let count_prefix = prefix_dict.prefix("ヶ月").count();
// assert_eq!(count_prefix, 1);
// }
//
// #[test]
// fn test_fst_prefix_asterisk_symbol() {
// let prefix_dict = PrefixDict::default();
// let count_prefix = prefix_dict.prefix("※").count();
// assert_eq!(count_prefix, 1);
// }
}
mod tests {}
2 changes: 1 addition & 1 deletion lindera-core/src/core/viterbi.rs
Expand Up @@ -194,7 +194,7 @@ impl Lattice {
}
}

// we check all word starting at start, using the fst, like we would use
// we check all words starting at start, using the double array, like we would use
// a prefix trie, and populate the lattice with as many edges
for (prefix_len, word_entry) in dict.prefix(suffix) {
let edge = Edge {
Expand Down
2 changes: 1 addition & 1 deletion lindera-dictionary/README.md
Expand Up @@ -8,7 +8,7 @@ A morphological dictionary loader for [Lindera](https://github.com/lindera-morph

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
2 changes: 1 addition & 1 deletion lindera-dictionary/src/lib.rs
Expand Up @@ -29,7 +29,7 @@ pub fn connection(dir: &str) -> ConnectionCostMatrix {
}

pub fn prefix_dict(dir: &str) -> PrefixDict {
let unidic_data_path = Path::new(dir).join("dict.fst");
let unidic_data_path = Path::new(dir).join("dict.da");
let unidic_data = read_file(unidic_data_path.to_str().unwrap());

let unidic_vals_path = Path::new(dir).join("dict.vals");
Expand Down
3 changes: 1 addition & 2 deletions lindera-ipadic-builder/Cargo.toml
Expand Up @@ -22,8 +22,7 @@ byteorder = "1.3.4"
clap = "2.33.0"
encoding = "0.2.33"
glob = "0.3.0"

lindera-fst = "0.1.0"
yada = "0.3.2"

lindera-core = { version = "0.5.1", path = "../lindera-core" }

Expand Down
2 changes: 1 addition & 1 deletion lindera-ipadic-builder/README.md
Expand Up @@ -14,7 +14,7 @@ IPADIC dictionary builder for [Lindera](https://github.com/lindera-morphology/li

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
45 changes: 28 additions & 17 deletions lindera-ipadic-builder/src/lib.rs
Expand Up @@ -19,7 +19,8 @@ use lindera_core::core::character_definition::{
use lindera_core::core::prefix_dict::PrefixDict;
use lindera_core::core::unknown_dictionary::UnknownDictionary;
use lindera_core::core::word_entry::{WordEntry, WordId};
use lindera_fst::MapBuilder;
use yada::builder::DoubleArrayBuilder;
use yada::DoubleArray;

#[derive(Debug)]
pub enum ParsingError {
Expand Down Expand Up @@ -177,9 +178,8 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
println!("sorting entries");
rows.sort_by_key(|row| row.surface_form.clone());

let wtr_fst = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.fst"))).unwrap(),
);
let mut wtr_da =
io::BufWriter::new(File::create(Path::new(output_dir).join(Path::new("dict.da"))).unwrap());
let mut wtr_vals = io::BufWriter::new(
File::create(Path::new(output_dir).join(Path::new("dict.vals"))).unwrap(),
);
Expand Down Expand Up @@ -232,21 +232,27 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
wtr_words.flush()?;
wtr_words_idx.flush()?;

let mut id = 0u64;
let mut id = 0u32;

println!("building fst");
let mut fst_build = MapBuilder::new(wtr_fst).unwrap();
println!("building da");
let mut keyset: Vec<(&[u8], u32)> = vec![];
let mut lastlen = 0;
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
assert!(
len < (1 << 5),
format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
);
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
lastlen += len;
}
fst_build.finish().unwrap();
let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(da_bytes.is_some(), "DoubleArray build error. ");
wtr_da.write_all(&da_bytes.unwrap()[..])?;

println!("Last len is {}", lastlen);

println!("building values");
for word_entries in word_entry_map.values() {
Expand Down Expand Up @@ -309,21 +315,26 @@ pub fn build_user_dict(
bincode::serialize_into(&mut words_data, &word).unwrap();
}

let mut id = 0u64;
let mut id = 0u32;

// building fst
let mut fst_build = MapBuilder::<Vec<u8>>::memory();
// building da
let mut keyset: Vec<(&[u8], u32)> = vec![];
for (key, word_entries) in &word_entry_map {
let len = word_entries.len() as u64;
let len = word_entries.len() as u32;
assert!(
len < (1 << 5),
format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
);
let val = (id << 5) | len;
fst_build.insert(&key, val).unwrap();
keyset.push((key.as_bytes(), val));
id += len;
}
let fst_bytes = fst_build.into_inner().unwrap();

let da_bytes = DoubleArrayBuilder::build(&keyset);
assert!(
da_bytes.is_some(),
"DoubleArray build error for user dict. "
);

// building values
let mut vals_data = Vec::<u8>::new();
Expand All @@ -334,7 +345,7 @@ pub fn build_user_dict(
}

let dict = PrefixDict {
fst: lindera_fst::raw::Fst::new(fst_bytes).unwrap(),
da: DoubleArray::new(da_bytes.unwrap()),
vals_data: vals_data,
is_system: false,
};
Expand Down
2 changes: 1 addition & 1 deletion lindera-ipadic/README.md
Expand Up @@ -8,7 +8,7 @@ IPADIC dictionary loader for [Lindera](https://github.com/lindera-morphology/lin

The following products are required to build:

- Rust >= 1.39.0
- Rust >= 1.46.0
- make >= 3.81

```shell script
Expand Down
Binary file added lindera-ipadic/lindera-ipadic/dict.da
Binary file not shown.
Binary file removed lindera-ipadic/lindera-ipadic/dict.fst
Binary file not shown.