Switch FST to DoubleArrayTrie (#76)

* Switch FST library to yada (Double Array Trie) in PrefixDict.
  Need Rust version >= 1.46.0 to build.
* Remove lindera-fst from Cargo.toml
* Upgrade yada-0.3.1
* Change .fst to .da, and remove the FST name in this repository
* Add long text test case
* Upgrade yada 0.3.2.
  Need Tokenizer as a Cloneable.
lindera-morphology · Oct 6, 2020 · 858b4ea · 858b4ea
1 parent 0f94a38
commit 858b4ea
Show file tree

Hide file tree

Showing 19 changed files with 73 additions and 129 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -3,6 +3,7 @@ All notable changes to this project will be documented in this file.
 This project adheres to [Semantic Versioning](http://semver.org/).
 
 ## Unreleased
+- Switch FST to Double Array #76 @johtani
 - Add long-text benchmark #74 @johtani
 - Update modules to 2018 #73 @johtani
 - Use new method instead of default_normal #72 @johtani

diff --git a/Cargo.lock b/Cargo.lock
diff --git a/README.md b/README.md
@@ -10,7 +10,7 @@ Lindera aims to build a library which is easy to install and provides concise AP
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 - jq
 

diff --git a/lindera-cli/README.md b/lindera-cli/README.md
@@ -14,7 +14,7 @@ A morphological analysis command-line interface for [Lindera](https://github.com
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 
 ```shell script

diff --git a/lindera-core/Cargo.toml b/lindera-core/Cargo.toml
@@ -22,4 +22,4 @@ byteorder = "1.3.4"
 encoding = "0.2.33"
 serde = {version="1.0.106", features = ["derive"] }
 
-lindera-fst = "0.1.1"
+yada = "0.3.2"
diff --git a/lindera-core/README.md b/lindera-core/README.md
@@ -10,7 +10,7 @@ This package contains dictionary structures and the viterbi algorithm.
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 
 ```text

diff --git a/lindera-core/src/core/prefix_dict.rs b/lindera-core/src/core/prefix_dict.rs
@@ -1,22 +1,20 @@
 use std::ops::Deref;
 
-use lindera_fst;
-use lindera_fst::raw::Output;
-
 use crate::core::word_entry::WordEntry;
+use yada::DoubleArray;
 
 #[derive(Clone)]
 pub struct PrefixDict<Data = Vec<u8>> {
-    pub fst: lindera_fst::raw::Fst<Data>,
+    pub da: DoubleArray<Vec<u8>>,
     pub vals_data: Data,
     pub is_system: bool,
 }
 
 impl PrefixDict<&[u8]> {
-    pub fn from_static_slice(fst_data: &[u8], vals_data: &[u8]) -> lindera_fst::Result<PrefixDict> {
-        let fst = lindera_fst::raw::Fst::new(fst_data.to_vec())?;
+    pub fn from_static_slice(da_data: &[u8], vals_data: &[u8]) -> Result<PrefixDict, String> {
+        let da = DoubleArray::new(da_data.to_vec());
         Ok(PrefixDict {
-            fst,
+            da,
             vals_data: vals_data.to_vec(),
             is_system: true,
         })
@@ -25,31 +23,11 @@ impl PrefixDict<&[u8]> {
 
 impl<D: Deref<Target = [u8]>> PrefixDict<D> {
     pub fn prefix<'a>(&'a self, s: &'a str) -> impl Iterator<Item = (usize, WordEntry)> + 'a {
-        s.as_bytes()
-            .iter()
-            .scan(
-                (0, self.fst.root(), Output::zero()),
-                move |(prefix_len, node, output), &byte| {
-                    if let Some(b_index) = node.find_input(byte) {
-                        let transition = node.transition(b_index);
-                        *prefix_len += 1;
-                        *output = output.cat(transition.out);
-                        *node = self.fst.node(transition.addr);
-                        return Some((node.is_final(), *prefix_len, output.value()));
-                    }
-                    None
-                },
-            )
-            .filter_map(|(is_final, prefix_len, offset_len)| {
-                if is_final {
-                    Some((prefix_len, offset_len))
-                } else {
-                    None
-                }
-            })
-            .flat_map(move |(prefix_len, offset_len)| {
-                let len = offset_len & ((1u64 << 5) - 1u64);
-                let offset = offset_len >> 5u64;
+        self.da
+            .common_prefix_search(s)
+            .flat_map(move |(offset_len, prefix_len)| {
+                let len = offset_len & ((1u32 << 5) - 1u32);
+                let offset = offset_len >> 5u32;
                 let offset_bytes = (offset as usize) * WordEntry::SERIALIZED_LEN;
                 let data: &[u8] = &self.vals_data[offset_bytes..];
                 (0..len as usize).map(move |i| {
@@ -66,37 +44,4 @@ impl<D: Deref<Target = [u8]>> PrefixDict<D> {
 }
 
 #[cfg(test)]
-mod tests {
-    //    use crate::core::prefix_dict::PrefixDict;
-    //
-    //    #[test]
-    //    fn test_fst_prefix_2() {
-    //        let prefix_dict = PrefixDict::default();
-    //        let count_prefix = prefix_dict.prefix("—でも").count();
-    //        assert_eq!(count_prefix, 1);
-    //    }
-    //
-    //    #[test]
-    //    fn test_fst_prefix_tilde() {
-    //        let prefix_dict = PrefixDict::default();
-    //        let count_prefix = prefix_dict.prefix("〜").count();
-    //        assert_eq!(count_prefix, 2);
-    //    }
-    //
-    //    #[test]
-    //    fn test_fst_ikkagetsu() {
-    //        let prefix_dict = PrefixDict::default();
-    //        let count_prefix = prefix_dict.prefix("ー").count();
-    //        assert_eq!(count_prefix, 0);
-    //
-    //        let count_prefix = prefix_dict.prefix("ヶ月").count();
-    //        assert_eq!(count_prefix, 1);
-    //    }
-    //
-    //    #[test]
-    //    fn test_fst_prefix_asterisk_symbol() {
-    //        let prefix_dict = PrefixDict::default();
-    //        let count_prefix = prefix_dict.prefix("※").count();
-    //        assert_eq!(count_prefix, 1);
-    //    }
-}
+mod tests {}
diff --git a/lindera-core/src/core/viterbi.rs b/lindera-core/src/core/viterbi.rs
@@ -194,7 +194,7 @@ impl Lattice {
                 }
             }
 
-            // we check all word starting at start, using the fst, like we would use
+            // we check all word starting at start, using the double array, like we would use
             // a prefix trie, and populate the lattice with as many edges
             for (prefix_len, word_entry) in dict.prefix(suffix) {
                 let edge = Edge {

diff --git a/lindera-dictionary/README.md b/lindera-dictionary/README.md
@@ -8,7 +8,7 @@ A morphological dictionary loader for [Lindera](https://github.com/lindera-morph
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 
 ```shell script

diff --git a/lindera-dictionary/src/lib.rs b/lindera-dictionary/src/lib.rs
@@ -29,7 +29,7 @@ pub fn connection(dir: &str) -> ConnectionCostMatrix {
 }
 
 pub fn prefix_dict(dir: &str) -> PrefixDict {
-    let unidic_data_path = Path::new(dir).join("dict.fst");
+    let unidic_data_path = Path::new(dir).join("dict.da");
     let unidic_data = read_file(unidic_data_path.to_str().unwrap());
 
     let unidic_vals_path = Path::new(dir).join("dict.vals");

diff --git a/lindera-ipadic-builder/Cargo.toml b/lindera-ipadic-builder/Cargo.toml
@@ -22,8 +22,7 @@ byteorder = "1.3.4"
 clap = "2.33.0"
 encoding = "0.2.33"
 glob = "0.3.0"
-
-lindera-fst = "0.1.0"
+yada = "0.3.2"
 
 lindera-core = { version = "0.5.1", path = "../lindera-core" }
 

diff --git a/lindera-ipadic-builder/README.md b/lindera-ipadic-builder/README.md
@@ -14,7 +14,7 @@ IPADIC dictionary builder for [Lindera](https://github.com/lindera-morphology/li
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 
 ```shell script

diff --git a/lindera-ipadic-builder/src/lib.rs b/lindera-ipadic-builder/src/lib.rs
@@ -19,7 +19,8 @@ use lindera_core::core::character_definition::{
 use lindera_core::core::prefix_dict::PrefixDict;
 use lindera_core::core::unknown_dictionary::UnknownDictionary;
 use lindera_core::core::word_entry::{WordEntry, WordId};
-use lindera_fst::MapBuilder;
+use yada::builder::DoubleArrayBuilder;
+use yada::DoubleArray;
 
 #[derive(Debug)]
 pub enum ParsingError {
@@ -177,9 +178,8 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
     println!("sorting entries");
     rows.sort_by_key(|row| row.surface_form.clone());
 
-    let wtr_fst = io::BufWriter::new(
-        File::create(Path::new(output_dir).join(Path::new("dict.fst"))).unwrap(),
-    );
+    let mut wtr_da =
+        io::BufWriter::new(File::create(Path::new(output_dir).join(Path::new("dict.da"))).unwrap());
     let mut wtr_vals = io::BufWriter::new(
         File::create(Path::new(output_dir).join(Path::new("dict.vals"))).unwrap(),
     );
@@ -232,21 +232,27 @@ fn build_dict(input_dir: &str, output_dir: &str) -> Result<(), ParsingError> {
     wtr_words.flush()?;
     wtr_words_idx.flush()?;
 
-    let mut id = 0u64;
+    let mut id = 0u32;
 
-    println!("building fst");
-    let mut fst_build = MapBuilder::new(wtr_fst).unwrap();
+    println!("building da");
+    let mut keyset: Vec<(&[u8], u32)> = vec![];
+    let mut lastlen = 0;
     for (key, word_entries) in &word_entry_map {
-        let len = word_entries.len() as u64;
+        let len = word_entries.len() as u32;
         assert!(
             len < (1 << 5),
             format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
         );
         let val = (id << 5) | len;
-        fst_build.insert(&key, val).unwrap();
+        keyset.push((key.as_bytes(), val));
         id += len;
+        lastlen += len;
     }
-    fst_build.finish().unwrap();
+    let da_bytes = DoubleArrayBuilder::build(&keyset);
+    assert!(da_bytes.is_some(), "DoubleArray build error. ");
+    wtr_da.write_all(&da_bytes.unwrap()[..])?;
+
+    println!("Last len is {}", lastlen);
 
     println!("building values");
     for word_entries in word_entry_map.values() {
@@ -309,21 +315,26 @@ pub fn build_user_dict(
         bincode::serialize_into(&mut words_data, &word).unwrap();
     }
 
-    let mut id = 0u64;
+    let mut id = 0u32;
 
-    // building fst
-    let mut fst_build = MapBuilder::<Vec<u8>>::memory();
+    // building da
+    let mut keyset: Vec<(&[u8], u32)> = vec![];
     for (key, word_entries) in &word_entry_map {
-        let len = word_entries.len() as u64;
+        let len = word_entries.len() as u32;
         assert!(
             len < (1 << 5),
             format!("{} is {} length. Too long. [{}]", key, len, (1 << 5))
         );
         let val = (id << 5) | len;
-        fst_build.insert(&key, val).unwrap();
+        keyset.push((key.as_bytes(), val));
         id += len;
     }
-    let fst_bytes = fst_build.into_inner().unwrap();
+
+    let da_bytes = DoubleArrayBuilder::build(&keyset);
+    assert!(
+        da_bytes.is_some(),
+        "DoubleArray build error for user dict. "
+    );
 
     // building values
     let mut vals_data = Vec::<u8>::new();
@@ -334,7 +345,7 @@ pub fn build_user_dict(
     }
 
     let dict = PrefixDict {
-        fst: lindera_fst::raw::Fst::new(fst_bytes).unwrap(),
+        da: DoubleArray::new(da_bytes.unwrap()),
         vals_data: vals_data,
         is_system: false,
     };

diff --git a/lindera-ipadic/README.md b/lindera-ipadic/README.md
@@ -8,7 +8,7 @@ IPADIC dictionary loader for [Lindera](https://github.com/lindera-morphology/lin
 
 The following products are required to build:
 
-- Rust >= 1.39.0
+- Rust >= 1.46.0
 - make >= 3.81
 
 ```shell script

diff --git a/lindera-ipadic/lindera-ipadic/dict.da b/lindera-ipadic/lindera-ipadic/dict.da
diff --git a/lindera-ipadic/lindera-ipadic/dict.fst b/lindera-ipadic/lindera-ipadic/dict.fst