Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

initial import

  • Loading branch information...
commit 737bf7ec7c2e116657284a8d5be6fb63bc89605b 0 parents
@kmike authored
Showing with 18,013 additions and 0 deletions.
  1. +19 −0 .hgignore
  2. 0  CHANGES.rst
  3. +18 −0 LICENSE
  4. +9 −0 MANIFEST.in
  5. +77 −0 README.rst
  6. +1 −0  lib/AUTHORS
  7. +10 −0 lib/COPYING
  8. +23 −0 lib/dawgdic/base-types.h
  9. +43 −0 lib/dawgdic/base-unit.h
  10. +70 −0 lib/dawgdic/bit-pool.h
  11. +149 −0 lib/dawgdic/completer.h
  12. +383 −0 lib/dawgdic/dawg-builder.h
  13. +84 −0 lib/dawgdic/dawg-unit.h
  14. +137 −0 lib/dawgdic/dawg.h
  15. +332 −0 lib/dawgdic/dictionary-builder.h
  16. +60 −0 lib/dawgdic/dictionary-extra-unit.h
  17. +71 −0 lib/dawgdic/dictionary-unit.h
  18. +229 −0 lib/dawgdic/dictionary.h
  19. +103 −0 lib/dawgdic/guide-builder.h
  20. +35 −0 lib/dawgdic/guide-unit.h
  21. +124 −0 lib/dawgdic/guide.h
  22. +71 −0 lib/dawgdic/link-table.h
  23. +69 −0 lib/dawgdic/object-pool.h
  24. +61 −0 lib/dawgdic/ranked-completer-candidate.h
  25. +58 −0 lib/dawgdic/ranked-completer-node.h
  26. +222 −0 lib/dawgdic/ranked-completer.h
  27. +182 −0 lib/dawgdic/ranked-guide-builder.h
  28. +62 −0 lib/dawgdic/ranked-guide-link.h
  29. +35 −0 lib/dawgdic/ranked-guide-unit.h
  30. +124 −0 lib/dawgdic/ranked-guide.h
  31. +42 −0 setup.py
  32. +1,345 −0 src/_base_types.cpp
  33. +14 −0 src/_base_types.pxd
  34. +1,348 −0 src/_dawg.cpp
  35. +46 −0 src/_dawg.pxd
  36. +1,351 −0 src/_dawg_builder.cpp
  37. +36 −0 src/_dawg_builder.pxd
  38. +1,365 −0 src/_dictionary.cpp
  39. +61 −0 src/_dictionary.pxd
  40. +1,371 −0 src/_dictionary_builder.cpp
  41. +12 −0 src/_dictionary_builder.pxd
  42. +1,348 −0 src/_dictionary_unit.cpp
  43. +31 −0 src/_dictionary_unit.pxd
  44. +5,221 −0 src/dawg.cpp
  45. +111 −0 src/dawg.pyx
  46. +1,356 −0 src/iostream.cpp
  47. +20 −0 src/iostream.pxd
  48. +2 −0  tests/__init__.py
  49. +62 −0 tests/test_dawg.py
  50. +8 −0 tox.ini
  51. +2 −0  update_cpp.sh
19 .hgignore
@@ -0,0 +1,19 @@
+^build
+^MANIFEST$
+^dist
+\.so$
+\.o$
+\.lo$
+
+\.svn
+\.cvsignore
+
+^src/.*\.html$
+
+^stuff/
+\.rej$
+\.pyc$
+^.tox
+\.orig$
+\.prof$
+\.coverage$
0  CHANGES.rst
No changes.
18 LICENSE
@@ -0,0 +1,18 @@
+Copyright (c) Mikhail Korobov, 2012
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is furnished
+to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
+INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR
+A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF
+CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
+OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
9 MANIFEST.in
@@ -0,0 +1,9 @@
+include README.rst
+include CHANGES.rst
+include LICENSE
+include tox.ini
+include update_cpp.sh
+include lib/COPYING
+
+recursive-include src *.cpp *.pxd *.pyx
+recursive-include lib/dawgdic *.h
77 README.rst
@@ -0,0 +1,77 @@
+DAWG
+====
+
+This package provides a DAWG-based, dictionary-like,
+read-only object for Python (2.x and 3.x).
+
+Based on the `dawgdic`_ C++ library.
+
+.. _dawgdic: https://code.google.com/p/dawgdic/
+
+Installation
+============
+
+TODO
+
+Usage
+=====
+
+Create a new DAWG::
+
+ >>> import dawg
+ >>> d = dawg.IntDict({u'key1': value1, u'key2': value2, u'key3': value3})
+
+TODO
+
+Contributing
+============
+
+Development happens on GitHub and Bitbucket:
+
+* https://github.com/kmike/DAWG
+* https://bitbucket.org/kmike/DAWG
+
+The main issue tracker is on GitHub: https://github.com/kmike/DAWG/issues
+
+Feel free to submit ideas, bugs, pull requests (git or hg) or
+regular patches.
+
+If you find a bug in the C++ part, please report it to the original
+`bug tracker <https://code.google.com/p/dawgdic/issues/list>`_.
+
+
+Running tests and benchmarks
+----------------------------
+
+Make sure `tox`_ is installed and run
+
+::
+
+ $ tox
+
+from the source checkout. Tests should pass under Python 2.6, 2.7, 3.2 and 3.3.
+
+.. note::
+
+ At the time of writing, the latest pip release (1.1) does not
+ support Python 3.3; in order to run the tox tests under Python 3.3,
+ find the "virtualenv_support" directory in site-packages
+ (of the env you run tox from) and place an sdist zip/tarball of a newer
+ pip (from GitHub) there.
+
+.. _cython: http://cython.org
+.. _tox: http://tox.testrun.org
+
+Authors & Contributors
+----------------------
+
+* Mikhail Korobov <kmike84@gmail.com>
+
+This module is based on the `dawgdic`_ C++ library by
+Susumu Yata & contributors.
+
+License
+=======
+
+The wrapper code is licensed under the MIT License.
+The bundled `dawgdic`_ C++ library is licensed under the BSD license.
1  lib/AUTHORS
@@ -0,0 +1 @@
+Susumu Yata <syata@acm.org>
10 lib/COPYING
@@ -0,0 +1,10 @@
+Copyright (c) 2009-2012, Susumu Yata
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+- Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+- Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+- Neither the name of the University of Tokushima nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23 lib/dawgdic/base-types.h
@@ -0,0 +1,23 @@
+#ifndef DAWGDIC_BASE_TYPES_H
+#define DAWGDIC_BASE_TYPES_H
+
+#include <cstddef>
+
+namespace dawgdic {
+
+// Character types, assumed to be 8 bits wide: plain and unsigned.
+typedef char CharType;
+typedef unsigned char UCharType;
+
+// Signed value type; the library assumes int is 32 bits wide.
+typedef int ValueType;
+
+// Unsigned unit/index type; the library assumes unsigned int is 32 bits wide.
+typedef unsigned int BaseType;
+
+// Size type: as wide as std::size_t on the target (typically 32 or 64 bits).
+typedef std::size_t SizeType;
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_BASE_TYPES_H
43 lib/dawgdic/base-unit.h
@@ -0,0 +1,43 @@
+#ifndef DAWGDIC_BASE_UNIT_H
+#define DAWGDIC_BASE_UNIT_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// One packed base value of a list-form dawg.
+//
+// The same word can be decoded in two ways; the caller decides which applies:
+//   - transition: bit 0 = has_sibling, bit 1 = is_state, upper bits = child.
+//   - value:      bit 0 = has_sibling, upper bits = value.
+class BaseUnit {
+ public:
+  BaseUnit() : base_(0) {}
+
+  // Raw packed word.
+  BaseType base() const { return base_; }
+  void set_base(BaseType base) { base_ = base; }
+
+  // Flag bits stored in the two lowest bits.
+  bool has_sibling() const { return (base_ & 1) != 0; }
+  bool is_state() const { return (base_ & 2) != 0; }
+
+  // Payload decoded as a child index (everything above the two flag bits).
+  BaseType child() const { return base_ >> 2; }
+
+  // Payload decoded as a value (everything above the sibling bit).
+  ValueType value() const { return static_cast<ValueType>(base_ >> 1); }
+
+ private:
+  BaseType base_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_BASE_UNIT_H
70 lib/dawgdic/bit-pool.h
@@ -0,0 +1,70 @@
+#ifndef DAWGDIC_BIT_POOL_H
+#define DAWGDIC_BIT_POOL_H
+
+#include "object-pool.h"
+
+namespace dawgdic {
+
+// An array of bit flags with compact memory management: eight flags are
+// packed into each byte of an underlying ObjectPool.
+template <SizeType BLOCK_SIZE = 1 << 10>
+class BitPool {
+ public:
+  BitPool() : pool_(), size_(0) {}
+
+  // Writes the flag at `index`.
+  // This function itself performs no bounds check; the bit is expected to
+  // have been handed out by Allocate().
+  void set(SizeType index, bool bit) {
+    SizeType pool_index = PoolIndex(index);
+    UCharType bit_flag = BitFlag(index);
+    if (bit) {
+      pool_[pool_index] |= bit_flag;
+    } else {
+      pool_[pool_index] &= ~bit_flag;
+    }
+  }
+
+  // Reads the flag at `index` (same precondition as set()).
+  bool get(SizeType index) const {
+    SizeType pool_index = PoolIndex(index);
+    UCharType bit_flag = BitFlag(index);
+    return (pool_[pool_index] & bit_flag) ? true : false;
+  }
+
+  // Deletes all bits and frees memory.
+  void Clear() {
+    pool_.Clear();
+    size_ = 0;
+  }
+
+  // Swaps bit pools.
+  // Both the byte storage and the bit count are exchanged; swapping only the
+  // storage would leave each pool with the other's stale size, so a later
+  // Allocate() would index the wrong byte.
+  void Swap(BitPool *bit_pool) {
+    pool_.Swap(&bit_pool->pool_);
+    const SizeType previous_size = size_;
+    size_ = bit_pool->size_;
+    bit_pool->size_ = previous_size;
+  }
+
+  // Allocates memory for a new bit and returns its ID.
+  // Note: Allocated bits are filled with false.
+  SizeType Allocate() {
+    SizeType pool_index = PoolIndex(size_);
+    // A new byte is needed only when the next bit starts a fresh byte.
+    if (pool_index == pool_.size()) {
+      pool_.Allocate();
+      pool_[pool_index] = '\0';
+    }
+    return size_++;
+  }
+
+ private:
+  ObjectPool<UCharType> pool_;  // Packed flags, eight per byte.
+  SizeType size_;               // Number of bits handed out so far.
+
+  // Disallows copies.
+  BitPool(const BitPool &);
+  BitPool &operator=(const BitPool &);
+
+  // Index of the byte that holds bit `index`.
+  static SizeType PoolIndex(SizeType index) {
+    return index / 8;
+  }
+  // Mask selecting bit `index` inside its byte.
+  // Takes SizeType so callers' indices are not implicitly narrowed.
+  static UCharType BitFlag(SizeType index) {
+    return static_cast<UCharType>(1) << (index % 8);
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_BIT_POOL_H
149 lib/dawgdic/completer.h
@@ -0,0 +1,149 @@
+#ifndef DAWGDIC_COMPLETER_H
+#define DAWGDIC_COMPLETER_H
+
+#include "dictionary.h"
+#include "guide.h"
+
+#include <vector>
+
+namespace dawgdic {
+
+// Enumerates dictionary keys one at a time, using a Guide to choose which
+// child and sibling labels to follow.
+//
+// Usage: call Start() with a dictionary index and the prefix that belongs to
+// it (the prefix is only copied into the key buffer, it is not followed
+// here), then call Next() repeatedly. While Next() returns true, key(),
+// length() and value() describe the current key.
+class Completer {
+ public:
+  Completer()
+      : dic_(NULL), guide_(NULL), key_(), index_stack_(), last_index_(0) {}
+  Completer(const Dictionary &dic, const Guide &guide)
+      : dic_(&dic), guide_(&guide), key_(), index_stack_(), last_index_(0) {}
+
+  void set_dic(const Dictionary &dic) { dic_ = &dic; }
+  void set_guide(const Guide &guide) { guide_ = &guide; }
+
+  const Dictionary &dic() const { return *dic_; }
+  const Guide &guide() const { return *guide_; }
+
+  // These member functions are available only when Next() returns true.
+  const char *key() const {
+    return reinterpret_cast<const char *>(&key_[0]);
+  }
+  SizeType length() const {
+    // The key buffer always carries one trailing '\0'.
+    return key_.size() - 1;
+  }
+  ValueType value() const {
+    return dic_->value(last_index_);
+  }
+
+  // Starts completing keys from given index and a null-terminated prefix.
+  void Start(BaseType index, const char *prefix = "") {
+    SizeType prefix_length = 0;
+    while (prefix[prefix_length] != '\0') {
+      ++prefix_length;
+    }
+    Start(index, prefix, prefix_length);
+  }
+
+  // Starts completing keys from given index and a prefix of `length` bytes.
+  void Start(BaseType index, const char *prefix, SizeType length) {
+    // Key buffer = prefix bytes followed by a terminating '\0'.
+    key_.assign(prefix, prefix + length);
+    key_.push_back('\0');
+
+    index_stack_.clear();
+    index_stack_.push_back(index);
+    // Root as the last index marks "no key reported yet" for Next().
+    last_index_ = dic_->root();
+  }
+
+  // Gets the next key. Returns false when there is none.
+  bool Next() {
+    if (index_stack_.empty()) {
+      return false;
+    }
+    BaseType index = index_stack_.back();
+
+    // On the first call after Start() there is nothing to move away from.
+    if (last_index_ != dic_->root() && !Advance(&index)) {
+      return false;
+    }
+
+    // Finds a terminal.
+    return FindTerminal(index);
+  }
+
+ private:
+  const Dictionary *dic_;
+  const Guide *guide_;
+  std::vector<UCharType> key_;
+  std::vector<BaseType> index_stack_;
+  BaseType last_index_;
+
+  // Disallows copies.
+  Completer(const Completer &);
+  Completer &operator=(const Completer &);
+
+  // Leaves the previously reported key: goes to its first child if the guide
+  // names one, otherwise backtracks until a node with a sibling label is
+  // found and follows that sibling. Returns false when the stack runs out or
+  // a transition cannot be followed.
+  bool Advance(BaseType *index) {
+    const UCharType child_label = guide_->child(*index);
+    if (child_label != '\0') {
+      // Follows a transition to the first child.
+      return Follow(child_label, index);
+    }
+
+    for ( ; ; ) {
+      const UCharType sibling_label = guide_->sibling(*index);
+
+      // Moves to the previous node.
+      if (key_.size() > 1) {
+        key_.pop_back();
+        key_.back() = '\0';
+      }
+      index_stack_.pop_back();
+      if (index_stack_.empty()) {
+        return false;
+      }
+
+      *index = index_stack_.back();
+      if (sibling_label != '\0') {
+        // Follows a transition to the next sibling.
+        return Follow(sibling_label, index);
+      }
+    }
+  }
+
+  // Follows a transition and records it in the key buffer and index stack.
+  bool Follow(UCharType label, BaseType *index) {
+    if (!dic_->Follow(label, index)) {
+      return false;
+    }
+
+    // Overwrite the trailing '\0' with the label, then re-terminate.
+    key_.back() = label;
+    key_.push_back('\0');
+    index_stack_.push_back(*index);
+    return true;
+  }
+
+  // Finds a terminal by following the guide's child labels until a unit
+  // with a value is reached.
+  bool FindTerminal(BaseType index) {
+    while (!dic_->has_value(index)) {
+      if (!Follow(guide_->child(index), &index)) {
+        return false;
+      }
+    }
+
+    last_index_ = index;
+    return true;
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_COMPLETER_H
383 lib/dawgdic/dawg-builder.h
@@ -0,0 +1,383 @@
+#ifndef DAWGDIC_DAWG_BUILDER_H
+#define DAWGDIC_DAWG_BUILDER_H
+
+#include <algorithm>
+#include <stack>
+#include <vector>
+
+#include "dawg.h"
+#include "dawg-unit.h"
+
+namespace dawgdic {
+
+// DAWG builder.
+//
+// Usage: call Insert() once per key, then Finish() to move the result into a
+// Dawg. Keys must arrive in non-descending order of their unsigned bytes;
+// Insert() returns false for a key that sorts before the previous one.
+class DawgBuilder {
+ public:
+  explicit DawgBuilder(SizeType initial_hash_table_size =
+                       DEFAULT_INITIAL_HASH_TABLE_SIZE)
+      : initial_hash_table_size_(initial_hash_table_size),
+        base_pool_(), label_pool_(), flag_pool_(), unit_pool_(),
+        hash_table_(), unfixed_units_(), unused_units_(), num_of_states_(1),
+        num_of_merged_transitions_(0), num_of_merging_states_(0) {}
+
+  // Number of units.
+  SizeType size() const {
+    return base_pool_.size();
+  }
+  // Number of transitions.
+  SizeType num_of_transitions() const {
+    return base_pool_.size() - 1;
+  }
+  // Number of states.
+  SizeType num_of_states() const {
+    return num_of_states_;
+  }
+  // Number of merged transitions.
+  SizeType num_of_merged_transitions() const {
+    return num_of_merged_transitions_;
+  }
+  // Number of merged states.
+  SizeType num_of_merged_states() const {
+    return num_of_transitions()
+        + num_of_merged_transitions() + 1 - num_of_states();
+  }
+  // Number of merging states.
+  SizeType num_of_merging_states() const {
+    return num_of_merging_states_;
+  }
+
+  // Initializes a builder: releases all pools and resets the counters.
+  void Clear() {
+    base_pool_.Clear();
+    label_pool_.Clear();
+    flag_pool_.Clear();
+    unit_pool_.Clear();
+
+    // Swapping with an empty temporary also releases the table's memory.
+    std::vector<BaseType>(0).swap(hash_table_);
+    while (!unfixed_units_.empty()) {
+      unfixed_units_.pop();
+    }
+    while (!unused_units_.empty()) {
+      unused_units_.pop();
+    }
+
+    num_of_states_ = 1;
+    num_of_merged_transitions_ = 0;
+    num_of_merging_states_ = 0;
+  }
+
+  // Inserts a null-terminated key.
+  // Returns false if `key` is null or the 3-argument overload rejects it.
+  bool Insert(const CharType *key, ValueType value = 0) {
+    if (!key) {
+      return false;
+    }
+    SizeType length = 0;
+    while (key[length]) {
+      ++length;
+    }
+    return Insert(key, length, value);
+  }
+
+  // Inserts a key of `length` bytes with the given value.
+  // Returns false, without modifying the builder, when:
+  //   - `key` is null, `length` is zero or `value` is negative,
+  //   - the key contains a '\0' byte.
+  // Also returns false when the key sorts before the previously inserted
+  // key.
+  bool Insert(const CharType *key, SizeType length, ValueType value) {
+    if (!key || value < 0 || length <= 0) {
+      return false;
+    }
+
+    // The label '\0' is reserved for the terminal unit that stores a value,
+    // so a key with an embedded null byte cannot be represented and would
+    // corrupt the structure if it were inserted.
+    for (SizeType i = 0; i < length; ++i) {
+      if (key[i] == '\0') {
+        return false;
+      }
+    }
+
+    // Initializes a builder if not initialized.
+    if (hash_table_.empty()) {
+      Init();
+    }
+
+    BaseType index = 0;
+    SizeType key_pos = 0;
+
+    // Finds a separate unit.
+    // Position `length` stands for the terminating '\0' label.
+    for ( ; key_pos <= length; ++key_pos) {
+      BaseType child_index = unit_pool_[index].child();
+      if (!child_index) {
+        break;
+      }
+
+      UCharType key_label = static_cast<UCharType>(
+          (key_pos < length) ? key[key_pos] : '\0');
+      UCharType unit_label = unit_pool_[child_index].label();
+
+      // Checks the order of keys.
+      if (key_label < unit_label) {
+        return false;
+      } else if (key_label > unit_label) {
+        // The new key branches off here: the old branch is complete and can
+        // be fixed (and possibly merged).
+        unit_pool_[child_index].set_has_sibling(true);
+        FixUnits(child_index);
+        break;
+      }
+
+      index = child_index;
+    }
+
+    // Adds new units.
+    for ( ; key_pos <= length; ++key_pos) {
+      UCharType key_label = static_cast<UCharType>(
+          (key_pos < length) ? key[key_pos] : '\0');
+      BaseType child_index = AllocateUnit();
+
+      if (!unit_pool_[index].child()) {
+        unit_pool_[child_index].set_is_state(true);
+      }
+      // The new unit becomes the first child; the former first child, if
+      // any, becomes its sibling.
+      unit_pool_[child_index].set_sibling(unit_pool_[index].child());
+      unit_pool_[child_index].set_label(key_label);
+      unit_pool_[index].set_child(child_index);
+      unfixed_units_.push(child_index);
+
+      index = child_index;
+    }
+    unit_pool_[index].set_value(value);
+    return true;
+  }
+
+  // Finishes building a dawg: fixes the remaining units, moves the pools and
+  // statistics into `dawg` and resets the builder. Always returns true.
+  bool Finish(Dawg *dawg) {
+    // Initializes a builder if not initialized.
+    if (hash_table_.empty()) {
+      Init();
+    }
+
+    FixUnits(0);
+    base_pool_[0].set_base(unit_pool_[0].base());
+    label_pool_[0] = unit_pool_[0].label();
+
+    dawg->set_num_of_states(num_of_states_);
+    dawg->set_num_of_merged_transitions(num_of_merged_transitions_);
+    dawg->set_num_of_merged_states(num_of_merged_states());
+    dawg->set_num_of_merging_states(num_of_merging_states_);
+
+    dawg->SwapBasePool(&base_pool_);
+    dawg->SwapLabelPool(&label_pool_);
+    dawg->SwapFlagPool(&flag_pool_);
+
+    Clear();
+    return true;
+  }
+
+ private:
+  enum {
+    DEFAULT_INITIAL_HASH_TABLE_SIZE = 1 << 8
+  };
+
+  const SizeType initial_hash_table_size_;
+  ObjectPool<BaseUnit> base_pool_;     // Fixed transitions: base values.
+  ObjectPool<UCharType> label_pool_;   // Fixed transitions: labels.
+  BitPool<> flag_pool_;                // Fixed transitions: merging flags.
+  ObjectPool<DawgUnit> unit_pool_;     // Units that are not fixed yet.
+  std::vector<BaseType> hash_table_;   // Open-addressing table, 0 = empty.
+  std::stack<BaseType> unfixed_units_;
+  std::stack<BaseType> unused_units_;  // Free list for unit_pool_.
+  SizeType num_of_states_;
+  SizeType num_of_merged_transitions_;
+  SizeType num_of_merging_states_;
+
+  // Disallows copies.
+  DawgBuilder(const DawgBuilder &);
+  DawgBuilder &operator=(const DawgBuilder &);
+
+  // Initializes an object: allocates the hash table, the root unit (index 0)
+  // and the transition slot with index 0.
+  void Init() {
+    hash_table_.resize(initial_hash_table_size_, 0);
+    AllocateUnit();
+    AllocateTransition();
+    unit_pool_[0].set_label(0xFF);
+    unfixed_units_.push(0);
+  }
+
+  // Fixes units corresponding to the last inserted key.
+  // Also, some of units are merged into equivalent transitions.
+  // Units are popped from the unfixed stack until `index` is on top; `index`
+  // itself is popped last without being fixed.
+  void FixUnits(BaseType index) {
+    while (unfixed_units_.top() != index) {
+      BaseType unfixed_index = unfixed_units_.top();
+      unfixed_units_.pop();
+
+      // Keeps the table at most three-quarters full.
+      if (num_of_states_ >= hash_table_.size() - (hash_table_.size() >> 2)) {
+        ExpandHashTable();
+      }
+
+      BaseType num_of_siblings = 0;
+      for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {
+        ++num_of_siblings;
+      }
+
+      BaseType hash_id;
+      BaseType matched_index = FindUnit(unfixed_index, &hash_id);
+      if (matched_index != 0) {
+        num_of_merged_transitions_ += num_of_siblings;
+
+        // Records a merging state.
+        if (flag_pool_.get(matched_index) == false) {
+          ++num_of_merging_states_;
+          flag_pool_.set(matched_index, true);
+        }
+      } else {
+        // Fixes units into pairs of base values and labels.
+        // Transitions are allocated first, then filled from the last slot
+        // backwards while walking the sibling list forwards.
+        BaseType transition_index = 0;
+        for (BaseType i = 0; i < num_of_siblings; ++i) {
+          transition_index = AllocateTransition();
+        }
+        for (BaseType i = unfixed_index; i != 0; i = unit_pool_[i].sibling()) {
+          base_pool_[transition_index].set_base(unit_pool_[i].base());
+          label_pool_[transition_index] = unit_pool_[i].label();
+          --transition_index;
+        }
+        matched_index = transition_index + 1;
+        hash_table_[hash_id] = matched_index;
+        ++num_of_states_;
+      }
+
+      // Deletes fixed units.
+      for (BaseType current = unfixed_index, next;
+           current != 0; current = next) {
+        next = unit_pool_[current].sibling();
+        FreeUnit(current);
+      }
+
+      unit_pool_[unfixed_units_.top()].set_child(matched_index);
+    }
+    unfixed_units_.pop();
+  }
+
+  // Expands a hash table to twice its size and re-inserts every fixed
+  // transition that starts a sibling group.
+  void ExpandHashTable() {
+    SizeType hash_table_size = hash_table_.size() << 1;
+    std::vector<BaseType>(0).swap(hash_table_);
+    hash_table_.resize(hash_table_size, 0);
+
+    // Builds a new hash table.
+    for (SizeType i = 1; i < base_pool_.size(); ++i) {
+      BaseType index = static_cast<BaseType>(i);
+      if (label_pool_[index] == '\0' || base_pool_[index].is_state()) {
+        BaseType hash_id;
+        FindTransition(index, &hash_id);
+        hash_table_[hash_id] = index;
+      }
+    }
+  }
+
+  // Finds a transition from a hash table.
+  // Only locates the first free slot for `index` (stored in *hash_id) by
+  // linear probing; always returns 0.
+  BaseType FindTransition(BaseType index, BaseType *hash_id) const {
+    *hash_id = HashTransition(index) % hash_table_.size();
+    for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) {
+      BaseType transition_id = hash_table_[*hash_id];
+      if (transition_id == 0) {
+        break;
+      }
+
+      // There must not be the same base value.
+    }
+    return 0;
+  }
+
+  // Finds a unit from a hash table.
+  // Returns the index of an equal fixed transition, or 0 if there is none;
+  // in the latter case *hash_id is the free slot where it can be stored.
+  BaseType FindUnit(BaseType unit_index, BaseType *hash_id) const {
+    *hash_id = HashUnit(unit_index) % hash_table_.size();
+    for ( ; ; *hash_id = (*hash_id + 1) % hash_table_.size()) {
+      BaseType transition_id = hash_table_[*hash_id];
+      if (transition_id == 0) {
+        break;
+      }
+
+      if (AreEqual(unit_index, transition_id)) {
+        return transition_id;
+      }
+    }
+    return 0;
+  }
+
+  // Compares a unit and a transition.
+  bool AreEqual(BaseType unit_index, BaseType transition_index) const {
+    // Compares the numbers of transitions.
+    // On success transition_index ends on the last transition of the group.
+    for (BaseType i = unit_pool_[unit_index].sibling(); i != 0;
+         i = unit_pool_[i].sibling()) {
+      if (base_pool_[transition_index].has_sibling() == false) {
+        return false;
+      }
+      ++transition_index;
+    }
+    if (base_pool_[transition_index].has_sibling() == true) {
+      return false;
+    }
+
+    // Compares out-transitions.
+    // Units are walked forwards while transitions are walked backwards,
+    // matching the order in which FixUnits() stores them.
+    for (BaseType i = unit_index; i;
+         i = unit_pool_[i].sibling(), --transition_index) {
+      if (unit_pool_[i].base() != base_pool_[transition_index].base() ||
+          unit_pool_[i].label() != label_pool_[transition_index]) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Calculates a hash value from a transition.
+  BaseType HashTransition(BaseType index) const {
+    BaseType hash_value = 0;
+    for ( ; index != 0; ++index) {
+      BaseType base = base_pool_[index].base();
+      UCharType label = label_pool_[index];
+      // The label is widened to BaseType before shifting so the shift is
+      // done on an unsigned value; shifting the promoted int would overflow
+      // for labels >= 0x80.
+      hash_value ^= Hash((static_cast<BaseType>(label) << 24) ^ base);
+
+      if (base_pool_[index].has_sibling() == false) {
+        break;
+      }
+    }
+    return hash_value;
+  }
+
+  // Calculates a hash value from a unit.
+  // XOR makes the result independent of the sibling order, so it agrees with
+  // HashTransition() for the same group.
+  BaseType HashUnit(BaseType index) const {
+    BaseType hash_value = 0;
+    for ( ; index != 0; index = unit_pool_[index].sibling()) {
+      BaseType base = unit_pool_[index].base();
+      UCharType label = unit_pool_[index].label();
+      // Same unsigned widening as in HashTransition().
+      hash_value ^= Hash((static_cast<BaseType>(label) << 24) ^ base);
+    }
+    return hash_value;
+  }
+
+  // 32-bit mix function.
+  // http://www.concentric.net/~Ttwang/tech/inthash.htm
+  static BaseType Hash(BaseType key) {
+    key = ~key + (key << 15);  // key = (key << 15) - key - 1;
+    key = key ^ (key >> 12);
+    key = key + (key << 2);
+    key = key ^ (key >> 4);
+    key = key * 2057;  // key = (key + (key << 3)) + (key << 11);
+    key = key ^ (key >> 16);
+    return key;
+  }
+
+  // Gets a transition from object pools.
+  // One slot is allocated in each of the flag, base and label pools; the
+  // index returned is the one from the label pool.
+  BaseType AllocateTransition() {
+    flag_pool_.Allocate();
+    base_pool_.Allocate();
+    return static_cast<BaseType>(label_pool_.Allocate());
+  }
+
+  // Gets a unit from an object pool, reusing a freed unit when available.
+  // The returned unit is cleared.
+  BaseType AllocateUnit() {
+    BaseType index = 0;
+    if (unused_units_.empty()) {
+      index = static_cast<BaseType>(unit_pool_.Allocate());
+    } else {
+      index = unused_units_.top();
+      unused_units_.pop();
+    }
+    unit_pool_[index].Clear();
+    return index;
+  }
+
+  // Returns a unit to an object pool.
+  void FreeUnit(BaseType index) {
+    unused_units_.push(index);
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DAWG_BUILDER_H
84 lib/dawgdic/dawg-unit.h
@@ -0,0 +1,84 @@
+#ifndef DAWGDIC_DAWG_UNIT_H
+#define DAWGDIC_DAWG_UNIT_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Mutable unit used while a dawg is being built. It links to its first child
+// and next sibling by index and can be packed into a base value with base().
+class DawgUnit {
+ public:
+  DawgUnit()
+      : child_(0), sibling_(0), label_('\0'),
+        is_state_(false), has_sibling_(false) {}
+
+  // Setters.
+  void set_child(BaseType child) { child_ = child; }
+  void set_sibling(BaseType sibling) { sibling_ = sibling; }
+  // A value is stored in the same field as the child index.
+  void set_value(ValueType value) { child_ = value; }
+  void set_label(UCharType label) { label_ = label; }
+  void set_is_state(bool is_state) { is_state_ = is_state; }
+  void set_has_sibling(bool has_sibling) { has_sibling_ = has_sibling; }
+
+  // Getters.
+  BaseType child() const { return child_; }
+  BaseType sibling() const { return sibling_; }
+  ValueType value() const { return static_cast<ValueType>(child_); }
+  UCharType label() const { return label_; }
+  bool is_state() const { return is_state_; }
+  bool has_sibling() const { return has_sibling_; }
+
+  // Calculates a base value of a unit.
+  // A unit labelled '\0' packs (value, has_sibling); any other unit packs
+  // (child, is_state, has_sibling).
+  BaseType base() const {
+    const BaseType sibling_bit = has_sibling_ ? 1 : 0;
+    if (label_ == '\0') {
+      return (child_ << 1) | sibling_bit;
+    }
+    const BaseType state_bit = is_state_ ? 2 : 0;
+    return (child_ << 2) | state_bit | sibling_bit;
+  }
+
+  // Initializes a unit: every field goes back to its constructed state.
+  void Clear() {
+    child_ = sibling_ = 0;
+    label_ = '\0';
+    is_state_ = has_sibling_ = false;
+  }
+
+ private:
+  BaseType child_;
+  BaseType sibling_;
+  UCharType label_;
+  bool is_state_;
+  bool has_sibling_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DAWG_UNIT_H
137 lib/dawgdic/dawg.h
@@ -0,0 +1,137 @@
+#ifndef DAWGDIC_DAWG_H
+#define DAWGDIC_DAWG_H
+
+#include "base-unit.h"
+#include "bit-pool.h"
+#include "object-pool.h"
+
+namespace dawgdic {
+
+// List-form dawg filled in by DawgBuilder.
+//
+// Units live in three pools that share the same unit index: packed base
+// values, labels and "merging" flags. The remaining members are statistics
+// set by the builder.
+class Dawg {
+ public:
+  Dawg()
+      : base_pool_(), label_pool_(), flag_pool_(),
+        num_of_states_(0), num_of_merged_transitions_(0),
+        num_of_merged_states_(0), num_of_merging_states_(0) {}
+
+  // The root index.
+  BaseType root() const {
+    return 0;
+  }
+
+  // Number of units.
+  SizeType size() const {
+    return base_pool_.size();
+  }
+  // Number of transitions.
+  SizeType num_of_transitions() const {
+    return base_pool_.size() - 1;
+  }
+  // Number of states.
+  SizeType num_of_states() const {
+    return num_of_states_;
+  }
+  // Number of merged transitions.
+  SizeType num_of_merged_transitions() const {
+    return num_of_merged_transitions_;
+  }
+  // Number of merged states.
+  SizeType num_of_merged_states() const {
+    return num_of_merged_states_;
+  }
+  // Number of merging states.
+  SizeType num_of_merging_states() const {
+    return num_of_merging_states_;
+  }
+
+  // Reads values.
+  BaseType child(BaseType index) const {
+    return base_pool_[index].child();
+  }
+  // Siblings are stored contiguously: the next sibling, if any, is the next
+  // unit. Returns 0 when there is no sibling.
+  BaseType sibling(BaseType index) const {
+    return base_pool_[index].has_sibling() ? (index + 1) : 0;
+  }
+  ValueType value(BaseType index) const {
+    return base_pool_[index].value();
+  }
+
+  // A unit labelled '\0' is a leaf.
+  bool is_leaf(BaseType index) const {
+    return label(index) == '\0';
+  }
+  UCharType label(BaseType index) const {
+    return label_pool_[index];
+  }
+  bool is_merging(BaseType index) const {
+    return flag_pool_.get(index);
+  }
+
+  // Clears object pools and resets every statistics counter, so a cleared
+  // dawg reports the same numbers as a newly constructed one.
+  void Clear() {
+    base_pool_.Clear();
+    label_pool_.Clear();
+    flag_pool_.Clear();
+    num_of_states_ = 0;
+    num_of_merged_transitions_ = 0;
+    num_of_merged_states_ = 0;
+    num_of_merging_states_ = 0;
+  }
+
+  // Swaps dawgs.
+  void Swap(Dawg *dawg) {
+    base_pool_.Swap(&dawg->base_pool_);
+    label_pool_.Swap(&dawg->label_pool_);
+    flag_pool_.Swap(&dawg->flag_pool_);
+    std::swap(num_of_states_, dawg->num_of_states_);
+    std::swap(num_of_merged_transitions_, dawg->num_of_merged_transitions_);
+    std::swap(num_of_merged_states_, dawg->num_of_merged_states_);
+    std::swap(num_of_merging_states_, dawg->num_of_merging_states_);
+  }
+
+ public:
+  // Following member functions are called from DawgBuilder.
+
+  // Sets the number of states.
+  void set_num_of_states(SizeType num_of_states) {
+    num_of_states_ = num_of_states;
+  }
+  // Sets the number of merged transitions.
+  void set_num_of_merged_transitions(SizeType num_of_merged_transitions) {
+    num_of_merged_transitions_ = num_of_merged_transitions;
+  }
+  // Sets the number of merged states.
+  void set_num_of_merged_states(SizeType num_of_merged_states) {
+    num_of_merged_states_ = num_of_merged_states;
+  }
+  // Sets the number of merging states.
+  void set_num_of_merging_states(SizeType num_of_merging_states) {
+    num_of_merging_states_ = num_of_merging_states;
+  }
+
+  // Swaps base pools.
+  void SwapBasePool(ObjectPool<BaseUnit> *base_pool) {
+    base_pool_.Swap(base_pool);
+  }
+  // Swaps label pools.
+  void SwapLabelPool(ObjectPool<UCharType> *label_pool) {
+    label_pool_.Swap(label_pool);
+  }
+  // Swaps flag pools.
+  void SwapFlagPool(BitPool<> *flag_pool) {
+    flag_pool_.Swap(flag_pool);
+  }
+
+ private:
+  ObjectPool<BaseUnit> base_pool_;
+  ObjectPool<UCharType> label_pool_;
+  BitPool<> flag_pool_;
+  SizeType num_of_states_;
+  SizeType num_of_merged_transitions_;
+  SizeType num_of_merged_states_;
+  SizeType num_of_merging_states_;
+
+  // Disallows copies.
+  Dawg(const Dawg &);
+  Dawg &operator=(const Dawg &);
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DAWG_H
332 lib/dawgdic/dictionary-builder.h
@@ -0,0 +1,332 @@
+#ifndef DAWGDIC_DICTIONARY_BUILDER_H
+#define DAWGDIC_DICTIONARY_BUILDER_H
+
+#include <vector>
+
+#include "dawg.h"
+#include "dictionary.h"
+#include "dictionary-extra-unit.h"
+#include "link-table.h"
+
+namespace dawgdic {
+
+// Turns a list-form Dawg into the unit array of a Dictionary.
+// Units are allocated in blocks of BLOCK_SIZE. Units that are not reserved yet
+// are chained in a circular doubly linked list (stored in DictionaryExtraUnit)
+// whose head is unfixed_index_. A block is fixed when the block
+// NUM_OF_UNFIXED_BLOCKS positions after it is allocated.
+class DictionaryBuilder {
+ public:
+  enum {
+    // Number of units in a block.
+    BLOCK_SIZE = 256,
+    // Number of blocks kept unfixed.
+    NUM_OF_UNFIXED_BLOCKS = 16,
+    // Number of units kept unfixed.
+    UNFIXED_SIZE = BLOCK_SIZE * NUM_OF_UNFIXED_BLOCKS
+  };
+
+  // Builds a dictionary from a list-form dawg.
+  // On success the built units are swapped into *dic and, when
+  // num_of_unused_units is not NULL, the number of units that FixBlock() had
+  // to reserve is stored through it. Returns false when an offset could not
+  // be stored in a unit; *dic is not touched in that case.
+  static bool Build(const Dawg &dawg, Dictionary *dic,
+                    BaseType *num_of_unused_units = NULL) {
+    DictionaryBuilder builder(dawg, dic);
+    if (!builder.BuildDictionary()) {
+      return false;
+    }
+    if (num_of_unused_units != NULL) {
+      *num_of_unused_units = builder.num_of_unused_units_;
+    }
+    return true;
+  }
+
+ private:
+  const Dawg &dawg_;
+  Dictionary *dic_;
+
+  // Units under construction; swapped into *dic_ at the end.
+  std::vector<DictionaryUnit> units_;
+  // One array of BLOCK_SIZE extra units per block. The entry of a fixed block
+  // is 0 because its array is recycled for the newest block.
+  std::vector<DictionaryExtraUnit *> extras_;
+  // Labels of the children currently being arranged.
+  std::vector<UCharType> labels_;
+  // Maps merging dawg states to the offset already assigned to them.
+  LinkTable link_table_;
+  // Head of the circular list of unreserved units; equal to num_of_units()
+  // when the list is empty.
+  BaseType unfixed_index_;
+  BaseType num_of_unused_units_;
+
+  // Masks for offsets.
+  static const BaseType UPPER_MASK = ~(DictionaryUnit::OFFSET_MAX - 1);
+  static const BaseType LOWER_MASK = 0xFF;
+
+  // Disallows copies.
+  DictionaryBuilder(const DictionaryBuilder &);
+  DictionaryBuilder &operator=(const DictionaryBuilder &);
+
+  DictionaryBuilder(const Dawg &dawg, Dictionary *dic)
+      : dawg_(dawg), dic_(dic), units_(), extras_(), labels_(),
+        link_table_(), unfixed_index_(), num_of_unused_units_(0) {}
+  ~DictionaryBuilder() {
+    // Entries of fixed blocks are 0, which delete [] accepts.
+    for (SizeType i = 0; i < extras_.size(); ++i) {
+      delete [] extras_[i];
+    }
+  }
+
+  // Accesses units.
+  DictionaryUnit &units(BaseType index) {
+    return units_[index];
+  }
+  const DictionaryUnit &units(BaseType index) const {
+    return units_[index];
+  }
+  // Accesses extra units; index is a unit index, not a block index.
+  DictionaryExtraUnit &extras(BaseType index) {
+    return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
+  }
+  const DictionaryExtraUnit &extras(BaseType index) const {
+    return extras_[index / BLOCK_SIZE][index % BLOCK_SIZE];
+  }
+
+  // Number of units.
+  BaseType num_of_units() const {
+    return static_cast<BaseType>(units_.size());
+  }
+  // Number of blocks.
+  BaseType num_of_blocks() const {
+    return static_cast<BaseType>(extras_.size());
+  }
+
+  // Builds a dictionary from a list-form dawg.
+  bool BuildDictionary() {
+    // The link table gets 1.5 slots per merging state.
+    link_table_.Init(dawg_.num_of_merging_states() +
+                     (dawg_.num_of_merging_states() >> 1));
+
+    // Unit 0 is the root; index 0 is also marked as used as an offset, so
+    // that 0 can serve as the "no offset" result of ArrangeChildNodes().
+    ReserveUnit(0);
+    extras(0).set_is_used();
+    units(0).set_offset(1);
+    units(0).set_label('\0');
+
+    if (dawg_.size() > 1) {
+      if (!BuildDictionary(dawg_.root(), 0)) {
+        return false;
+      }
+    }
+
+    FixAllBlocks();
+
+    dic_->SwapUnitsBuf(&units_);
+    return true;
+  }
+
+  // Builds a dictionary from a dawg.
+  // Places the children of dawg_index below the unit dic_index and recurses
+  // into them. Returns false when no offset could be stored for a unit.
+  bool BuildDictionary(BaseType dawg_index, BaseType dic_index) {
+    if (dawg_.is_leaf(dawg_index)) {
+      return true;
+    }
+
+    // Uses an existing offset if available.
+    BaseType dawg_child_index = dawg_.child(dawg_index);
+    if (dawg_.is_merging(dawg_child_index)) {
+      BaseType offset = link_table_.Find(dawg_child_index);
+      if (offset != 0) {
+        // Units store the offset relative to (XOR-ed with) their own index.
+        offset ^= dic_index;
+        if (!(offset & UPPER_MASK) || !(offset & LOWER_MASK)) {
+          if (dawg_.is_leaf(dawg_child_index)) {
+            units(dic_index).set_has_leaf();
+          }
+          units(dic_index).set_offset(offset);
+          return true;
+        }
+      }
+    }
+
+    // Finds a good offset and arranges child nodes.
+    BaseType offset = ArrangeChildNodes(dawg_index, dic_index);
+    if (offset == 0) {
+      return false;
+    }
+
+    // Remembers the offset so that other parents of the same children can
+    // reuse it.
+    if (dawg_.is_merging(dawg_child_index)) {
+      link_table_.Insert(dawg_child_index, offset);
+    }
+
+    // Builds a double-array in depth-first order.
+    do {
+      BaseType dic_child_index = offset ^ dawg_.label(dawg_child_index);
+      if (!BuildDictionary(dawg_child_index, dic_child_index)) {
+        return false;
+      }
+      dawg_child_index = dawg_.sibling(dawg_child_index);
+    } while (dawg_child_index != 0);
+
+    return true;
+  }
+
+  // Arranges child nodes.
+  // Returns the absolute offset chosen for the children of dawg_index, or 0
+  // when the relative offset does not fit into the unit dic_index.
+  BaseType ArrangeChildNodes(BaseType dawg_index, BaseType dic_index) {
+    labels_.clear();
+
+    BaseType dawg_child_index = dawg_.child(dawg_index);
+    while (dawg_child_index != 0) {
+      labels_.push_back(dawg_.label(dawg_child_index));
+      dawg_child_index = dawg_.sibling(dawg_child_index);
+    }
+
+    // Finds a good offset.
+    BaseType offset = FindGoodOffset(dic_index);
+    if (!units(dic_index).set_offset(dic_index ^ offset)) {
+      return 0;
+    }
+
+    dawg_child_index = dawg_.child(dawg_index);
+    for (SizeType i = 0; i < labels_.size(); ++i) {
+      BaseType dic_child_index = offset ^ labels_[i];
+      ReserveUnit(dic_child_index);
+
+      if (dawg_.is_leaf(dawg_child_index)) {
+        units(dic_index).set_has_leaf();
+        units(dic_child_index).set_value(dawg_.value(dawg_child_index));
+      } else {
+        units(dic_child_index).set_label(labels_[i]);
+      }
+
+      dawg_child_index = dawg_.sibling(dawg_child_index);
+    }
+    extras(offset).set_is_used();
+
+    return offset;
+  }
+
+  // Finds a good offset.
+  // Falls back to an offset in the block that the next expansion creates
+  // when no unreserved unit yields an acceptable one.
+  BaseType FindGoodOffset(BaseType index) const {
+    if (unfixed_index_ >= num_of_units()) {
+      return num_of_units() | (index & 0xFF);
+    }
+
+    // Scans unused units to find a good offset.
+    BaseType unfixed_index = unfixed_index_;
+    do {
+      BaseType offset = unfixed_index ^ labels_[0];
+      if (IsGoodOffset(index, offset)) {
+        return offset;
+      }
+      unfixed_index = extras(unfixed_index).next();
+    } while (unfixed_index != unfixed_index_);
+
+    return num_of_units() | (index & 0xFF);
+  }
+
+  // Checks if a given offset is valid or not.
+  bool IsGoodOffset(BaseType index, BaseType offset) const {
+    if (extras(offset).is_used()) {
+      return false;
+    }
+
+    // A relative offset with bits in UPPER_MASK must have its low 8 bits
+    // clear.
+    BaseType relative_offset = index ^ offset;
+    if ((relative_offset & LOWER_MASK) && (relative_offset & UPPER_MASK)) {
+      return false;
+    }
+
+    // Finds a collision.
+    for (SizeType i = 1; i < labels_.size(); ++i) {
+      if (extras(offset ^ labels_[i]).is_fixed()) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  // Reserves an unused unit.
+  void ReserveUnit(BaseType index) {
+    if (index >= num_of_units()) {
+      ExpandDictionary();
+    }
+
+    // Removes an unused unit from a circular linked list.
+    if (index == unfixed_index_) {
+      unfixed_index_ = extras(index).next();
+      if (unfixed_index_ == index) {
+        // The list held only this unit and is empty now.
+        unfixed_index_ = num_of_units();
+      }
+    }
+    extras(extras(index).prev()).set_next(extras(index).next());
+    extras(extras(index).next()).set_prev(extras(index).prev());
+    extras(index).set_is_fixed();
+  }
+
+  // Expands a dictionary.
+  // Appends one block of BLOCK_SIZE units and links its units into the list
+  // of unreserved units.
+  void ExpandDictionary() {
+    BaseType src_num_of_units = num_of_units();
+    BaseType src_num_of_blocks = num_of_blocks();
+
+    BaseType dest_num_of_units = src_num_of_units + BLOCK_SIZE;
+    BaseType dest_num_of_blocks = src_num_of_blocks + 1;
+
+    // Fixes an old block.
+    if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
+      FixBlock(src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS);
+    }
+
+    units_.resize(dest_num_of_units);
+    extras_.resize(dest_num_of_blocks, 0);
+
+    // Allocates memory to a new block.
+    if (dest_num_of_blocks > NUM_OF_UNFIXED_BLOCKS) {
+      // Recycles the extra units of the block fixed above; that block's own
+      // entry becomes 0.
+      BaseType block_id = src_num_of_blocks - NUM_OF_UNFIXED_BLOCKS;
+      std::swap(extras_[block_id], extras_.back());
+      for (BaseType i = src_num_of_units; i < dest_num_of_units; ++i) {
+        extras(i).clear();
+      }
+    } else {
+      extras_.back() = new DictionaryExtraUnit[BLOCK_SIZE];
+    }
+
+    // Creates a circular linked list for a new block.
+    for (BaseType i = src_num_of_units + 1; i < dest_num_of_units; ++i) {
+      extras(i - 1).set_next(i);
+      extras(i).set_prev(i - 1);
+    }
+
+    extras(src_num_of_units).set_prev(dest_num_of_units - 1);
+    extras(dest_num_of_units - 1).set_next(src_num_of_units);
+
+    // Merges 2 circular linked lists.
+    extras(src_num_of_units).set_prev(extras(unfixed_index_).prev());
+    extras(dest_num_of_units - 1).set_next(unfixed_index_);
+
+    extras(extras(unfixed_index_).prev()).set_next(src_num_of_units);
+    extras(unfixed_index_).set_prev(dest_num_of_units - 1);
+  }
+
+  // Fixes all blocks to avoid invalid transitions.
+  // Only the last NUM_OF_UNFIXED_BLOCKS blocks are visited; earlier blocks
+  // were fixed by ExpandDictionary().
+  void FixAllBlocks() {
+    BaseType begin = 0;
+    if (num_of_blocks() > NUM_OF_UNFIXED_BLOCKS) {
+      begin = num_of_blocks() - NUM_OF_UNFIXED_BLOCKS;
+    }
+    BaseType end = num_of_blocks();
+
+    for (BaseType block_id = begin; block_id != end; ++block_id) {
+      FixBlock(block_id);
+    }
+  }
+
+  // Adjusts labels of unused units in a given block.
+  void FixBlock(BaseType block_id) {
+    BaseType begin = block_id * BLOCK_SIZE;
+    BaseType end = begin + BLOCK_SIZE;
+
+    // Finds an unused offset.
+    BaseType unused_offset_for_label = 0;
+    for (BaseType offset = begin; offset != end; ++offset) {
+      if (!extras(offset).is_used()) {
+        unused_offset_for_label = offset;
+        break;
+      }
+    }
+
+    // Labels of unused units are modified.
+    for (BaseType index = begin; index != end; ++index) {
+      if (!extras(index).is_fixed()) {
+        ReserveUnit(index);
+        units(index).set_label(
+            static_cast<UCharType>(index ^ unused_offset_for_label));
+        ++num_of_unused_units_;
+      }
+    }
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DICTIONARY_BUILDER_H
60 lib/dawgdic/dictionary-extra-unit.h
@@ -0,0 +1,60 @@
+#ifndef DAWGDIC_DICTIONARY_EXTRA_UNIT_H
+#define DAWGDIC_DICTIONARY_EXTRA_UNIT_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Extra unit for building a dictionary.
+// Each instance packs two (flag, index) pairs into two words: bit 0 of a word
+// holds the flag and the remaining bits hold the index.
+class DictionaryExtraUnit {
+ public:
+  DictionaryExtraUnit() : lo_values_(0), hi_values_(0) {}
+
+  // Zeroes both words, i.e. both flags and both links.
+  void clear() {
+    hi_values_ = 0;
+    lo_values_ = 0;
+  }
+
+  // Marks the unit as fixed.
+  void set_is_fixed() {
+    lo_values_ = lo_values_ | 1;
+  }
+  // Stores the index of the next unused unit, keeping the fixed flag.
+  void set_next(BaseType next) {
+    const BaseType fixed_flag = lo_values_ & 1;
+    lo_values_ = (next << 1) | fixed_flag;
+  }
+  // Marks the index as used as an offset.
+  void set_is_used() {
+    hi_values_ = hi_values_ | 1;
+  }
+  // Stores the index of the previous unused unit, keeping the used flag.
+  void set_prev(BaseType prev) {
+    const BaseType used_flag = hi_values_ & 1;
+    hi_values_ = (prev << 1) | used_flag;
+  }
+
+  // Tells whether the unit is fixed.
+  bool is_fixed() const {
+    return (lo_values_ & 1) != 0;
+  }
+  // Index of the next unused unit.
+  BaseType next() const {
+    const BaseType link = lo_values_ >> 1;
+    return link;
+  }
+  // Tells whether the index is used as an offset.
+  bool is_used() const {
+    return (hi_values_ & 1) != 0;
+  }
+  // Index of the previous unused unit.
+  BaseType prev() const {
+    const BaseType link = hi_values_ >> 1;
+    return link;
+  }
+
+ private:
+  // Bit 0: fixed flag. Upper bits: next link.
+  BaseType lo_values_;
+  // Bit 0: used flag. Upper bits: previous link.
+  BaseType hi_values_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DICTIONARY_EXTRA_UNIT_H
71 lib/dawgdic/dictionary-unit.h
@@ -0,0 +1,71 @@
+#ifndef DAWGDIC_DICTIONARY_UNIT_H
+#define DAWGDIC_DICTIONARY_UNIT_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Unit of a dictionary.
+class DictionaryUnit {
+ public:
+  // Bit layout of a non-leaf unit: bits 0-7 label, bit 8 has-leaf flag,
+  // bit 9 extension flag, bits 10 and up offset. A leaf unit has bit 31 set
+  // and keeps its value in the remaining bits.
+  static const BaseType OFFSET_MAX = static_cast<BaseType>(1) << 21;
+  static const BaseType IS_LEAF_BIT = static_cast<BaseType>(1) << 31;
+  static const BaseType HAS_LEAF_BIT = static_cast<BaseType>(1) << 8;
+  static const BaseType EXTENSION_BIT = static_cast<BaseType>(1) << 9;
+
+  DictionaryUnit() : base_(0) {}
+
+  // Sets a flag to show that a unit has a leaf as a child.
+  void set_has_leaf() {
+    base_ |= HAS_LEAF_BIT;
+  }
+  // Sets a value to a leaf unit. Replaces the whole unit.
+  void set_value(ValueType value) {
+    base_ = static_cast<BaseType>(value) | IS_LEAF_BIT;
+  }
+  // Sets a label to a non-leaf unit.
+  void set_label(UCharType label) {
+    base_ = (base_ & ~static_cast<BaseType>(0xFF)) | label;
+  }
+  // Sets an offset to a non-leaf unit.
+  // Offsets below OFFSET_MAX are stored exactly. Larger offsets are stored
+  // without their low 8 bits, so they are accepted only when those bits are
+  // zero and the offset is below OFFSET_MAX << 8. Returns false, leaving the
+  // unit unchanged, when the offset cannot be represented.
+  bool set_offset(BaseType offset) {
+    if (offset >= (OFFSET_MAX << 8)) {
+      return false;
+    }
+    // In the extended form the low 8 bits of the offset would land on the
+    // label and has-leaf bits, and offset() could not return them anyway.
+    if (offset >= OFFSET_MAX && (offset & 0xFF) != 0) {
+      return false;
+    }
+
+    base_ &= IS_LEAF_BIT | HAS_LEAF_BIT | 0xFF;
+    if (offset < OFFSET_MAX) {
+      base_ |= (offset << 10);
+    } else {
+      base_ |= (offset << 2) | EXTENSION_BIT;
+    }
+    return true;
+  }
+
+  // Checks if a unit has a leaf as a child or not.
+  bool has_leaf() const {
+    return (base_ & HAS_LEAF_BIT) != 0;
+  }
+  // Reads a value from a leaf unit.
+  ValueType value() const {
+    return static_cast<ValueType>(base_ & ~IS_LEAF_BIT);
+  }
+  // Reads a label with a leaf flag from a non-leaf unit.
+  BaseType label() const {
+    return base_ & (IS_LEAF_BIT | 0xFF);
+  }
+  // Reads an offset to child units from a non-leaf unit.
+  BaseType offset() const {
+    // (base_ & EXTENSION_BIT) >> 6 is 8 when the extension flag is set and 0
+    // otherwise, which undoes the storage shift chosen in set_offset().
+    return (base_ >> 10) << ((base_ & EXTENSION_BIT) >> 6);
+  }
+
+ private:
+  BaseType base_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DICTIONARY_UNIT_H
229 lib/dawgdic/dictionary.h
@@ -0,0 +1,229 @@
+#ifndef DAWGDIC_DICTIONARY_H
+#define DAWGDIC_DICTIONARY_H
+
+#include <iostream>
+#include <vector>
+
+#include "base-types.h"
+#include "dictionary-unit.h"
+
+namespace dawgdic {
+
+// Dictionary class for retrieval and binary I/O.
+class Dictionary {
+ public:
+  Dictionary() : units_(NULL), size_(0), units_buf_() {}
+
+  // Pointer to the first unit, or NULL when there are no units.
+  const DictionaryUnit *units() const {
+    return units_;
+  }
+  // Number of units.
+  SizeType size() const {
+    return size_;
+  }
+  // Size of the unit array in bytes.
+  SizeType total_size() const {
+    return sizeof(DictionaryUnit) * size_;
+  }
+  // Number of bytes Write() emits: the size header plus the unit array.
+  SizeType file_size() const {
+    return sizeof(BaseType) + total_size();
+  }
+
+  // Root index.
+  BaseType root() const {
+    return 0;
+  }
+
+  // Checks if a given index is related to the end of a key.
+  bool has_value(BaseType index) const {
+    return units_[index].has_leaf();
+  }
+  // Gets a value from a given index.
+  // Only meaningful when has_value(index) is true.
+  ValueType value(BaseType index) const {
+    return units_[index ^ units_[index].offset()].value();
+  }
+
+  // Reads a dictionary from an input stream.
+  // The stream holds the number of units as a BaseType followed by the raw
+  // units. Returns false, leaving the dictionary unchanged, when either part
+  // cannot be read.
+  bool Read(std::istream *input) {
+    BaseType base_size;
+    if (!input->read(reinterpret_cast<char *>(&base_size), sizeof(BaseType))) {
+      return false;
+    }
+
+    SizeType size = static_cast<SizeType>(base_size);
+    std::vector<DictionaryUnit> units_buf(size);
+    // An empty vector has no element 0 to take the address of, so the units
+    // are read only when the header announces at least one.
+    if (size != 0 &&
+        !input->read(reinterpret_cast<char *>(&units_buf[0]),
+                     sizeof(DictionaryUnit) * size)) {
+      return false;
+    }
+
+    SwapUnitsBuf(&units_buf);
+    return true;
+  }
+
+  // Writes a dictionary to an output stream.
+  // Emits the format that Read() expects.
+  bool Write(std::ostream *output) const {
+    BaseType base_size = static_cast<BaseType>(size_);
+    if (!output->write(reinterpret_cast<const char *>(&base_size),
+                       sizeof(BaseType))) {
+      return false;
+    }
+
+    if (!output->write(reinterpret_cast<const char *>(units_),
+                       sizeof(DictionaryUnit) * size_)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Exact matching.
+  bool Contains(const CharType *key) const {
+    BaseType index = root();
+    if (!Follow(key, &index)) {
+      return false;
+    }
+    return has_value(index);
+  }
+  bool Contains(const CharType *key, SizeType length) const {
+    BaseType index = root();
+    if (!Follow(key, length, &index)) {
+      return false;
+    }
+    return has_value(index);
+  }
+
+  // Exact matching.
+  // The overloads returning ValueType report a missing key as -1.
+  ValueType Find(const CharType *key) const {
+    BaseType index = root();
+    if (!Follow(key, &index)) {
+      return -1;
+    }
+    return has_value(index) ? value(index) : -1;
+  }
+  ValueType Find(const CharType *key, SizeType length) const {
+    BaseType index = root();
+    if (!Follow(key, length, &index)) {
+      return -1;
+    }
+    return has_value(index) ? value(index) : -1;
+  }
+  // The overloads returning bool store the value through *value only when
+  // the key is found.
+  bool Find(const CharType *key, ValueType *value) const {
+    BaseType index = root();
+    if (!Follow(key, &index) || !has_value(index)) {
+      return false;
+    }
+    *value = this->value(index);
+    return true;
+  }
+  bool Find(const CharType *key, SizeType length, ValueType *value) const {
+    BaseType index = root();
+    if (!Follow(key, length, &index) || !has_value(index)) {
+      return false;
+    }
+    *value = this->value(index);
+    return true;
+  }
+
+  // Follows a transition.
+  // *index is updated only when the transition exists.
+  bool Follow(CharType label, BaseType *index) const {
+    BaseType next_index =
+        *index ^ units_[*index].offset() ^ static_cast<UCharType>(label);
+    // label() includes the leaf flag, so a leaf unit never matches here.
+    if (units_[next_index].label() != static_cast<UCharType>(label)) {
+      return false;
+    }
+    *index = next_index;
+    return true;
+  }
+
+  // Follows transitions.
+  // Stops at the first missing transition; *index then refers to the last
+  // unit reached. *count, when given, is incremented once per followed label.
+  bool Follow(const CharType *s, BaseType *index) const {
+    while (*s != '\0' && Follow(*s, index)) {
+      ++s;
+    }
+    return *s == '\0';
+  }
+  bool Follow(const CharType *s, BaseType *index, SizeType *count) const {
+    while (*s != '\0' && Follow(*s, index)) {
+      ++s, ++*count;
+    }
+    return *s == '\0';
+  }
+
+  // Follows transitions.
+  bool Follow(const CharType *s, SizeType length, BaseType *index) const {
+    for (SizeType i = 0; i < length; ++i) {
+      if (!Follow(s[i], index)) {
+        return false;
+      }
+    }
+    return true;
+  }
+  bool Follow(const CharType *s, SizeType length, BaseType *index,
+              SizeType *count) const {
+    for (SizeType i = 0; i < length; ++i, ++*count) {
+      if (!Follow(s[i], index)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  // Maps memory with its size.
+  // The first overload expects the layout written by Write(): a BaseType
+  // unit count followed by the units. The second takes the units and their
+  // count separately. The memory is referenced, not copied.
+  void Map(const void *address) {
+    Clear();
+    units_ = reinterpret_cast<const DictionaryUnit *>(
+        static_cast<const BaseType *>(address) + 1);
+    size_ = *static_cast<const BaseType *>(address);
+  }
+  void Map(const void *address, SizeType size) {
+    Clear();
+    units_ = static_cast<const DictionaryUnit *>(address);
+    size_ = size;
+  }
+
+  // Initializes a dictionary.
+  void Clear() {
+    units_ = NULL;
+    size_ = 0;
+    // Swapping with a temporary releases the buffer's memory.
+    std::vector<DictionaryUnit>(0).swap(units_buf_);
+  }
+
+  // Swaps dictionaries.
+  void Swap(Dictionary *dic) {
+    std::swap(units_, dic->units_);
+    std::swap(size_, dic->size_);
+    units_buf_.swap(dic->units_buf_);
+  }
+
+  // Shrinks a vector.
+  // Replaces the owned buffer by a copy of itself when the buffer holds
+  // spare capacity.
+  void Shrink() {
+    if (units_buf_.size() == units_buf_.capacity()) {
+      return;
+    }
+
+    std::vector<DictionaryUnit> units_buf(units_buf_);
+    SwapUnitsBuf(&units_buf);
+  }
+
+ public:
+  // Following member function is called from DictionaryBuilder.
+
+  // Swaps buffers for units.
+  // Takes over the contents of *units_buf and leaves the previously owned
+  // buffer in *units_buf.
+  void SwapUnitsBuf(std::vector<DictionaryUnit> *units_buf) {
+    // An empty vector has no element 0 to take the address of.
+    units_ = NULL;
+    if (!units_buf->empty()) {
+      units_ = &(*units_buf)[0];
+    }
+    size_ = static_cast<BaseType>(units_buf->size());
+    units_buf_.swap(*units_buf);
+  }
+
+ private:
+  // Points into units_buf_ or into memory handed to Map().
+  const DictionaryUnit *units_;
+  SizeType size_;
+  std::vector<DictionaryUnit> units_buf_;
+
+  // Disallows copies.
+  Dictionary(const Dictionary &);
+  Dictionary &operator=(const Dictionary &);
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_DICTIONARY_H
103 lib/dawgdic/guide-builder.h
@@ -0,0 +1,103 @@
+#ifndef DAWGDIC_GUIDE_BUILDER_H
+#define DAWGDIC_GUIDE_BUILDER_H
+
+#include "guide.h"
+#include "dawg.h"
+#include "dictionary.h"
+
+#include <vector>
+
+namespace dawgdic {
+
+// Fills a Guide with, for every dictionary unit reached from the root, the
+// label of its first non-terminal child and the label of its next sibling,
+// as found in the dawg.
+class GuideBuilder {
+ public:
+  // Builds a dictionary for completing keys.
+  // dic must be the dictionary built from dawg. Returns false when a dawg
+  // transition cannot be followed in dic.
+  static bool Build(const Dawg &dawg, const Dictionary &dic, Guide *guide) {
+    GuideBuilder builder(dawg, dic, guide);
+    return builder.BuildGuide();
+  }
+
+ private:
+  const Dawg &dawg_;
+  const Dictionary &dic_;
+  Guide *guide_;
+
+  std::vector<GuideUnit> units_;
+  // One bit per dictionary unit; set once the unit has been visited.
+  std::vector<UCharType> is_fixed_table_;
+
+  // Disallows copies.
+  GuideBuilder(const GuideBuilder &);
+  GuideBuilder &operator=(const GuideBuilder &);
+
+  GuideBuilder(const Dawg &dawg, const Dictionary &dic, Guide *guide)
+      : dawg_(dawg), dic_(dic), guide_(guide), units_(), is_fixed_table_() {}
+
+  bool BuildGuide() {
+    // Initializes units and flags.
+    units_.resize(dic_.size());
+    // Rounded up so that every unit has a bit even when the number of units
+    // is not a multiple of 8.
+    is_fixed_table_.resize((dic_.size() + 7) / 8, '\0');
+
+    // A dawg without transitions needs no guide; *guide_ is left untouched.
+    if (dawg_.size() <= 1) {
+      return true;
+    }
+
+    if (!BuildGuide(dawg_.root(), dic_.root())) {
+      return false;
+    }
+
+    guide_->SwapUnitsBuf(&units_);
+    return true;
+  }
+
+  // Builds a guide recursively.
+  bool BuildGuide(BaseType dawg_index, BaseType dic_index) {
+    // A dictionary unit can be reached through several paths; it is handled
+    // once only.
+    if (is_fixed(dic_index)) {
+      return true;
+    }
+    set_is_fixed(dic_index);
+
+    // Finds the first non-terminal child.
+    BaseType dawg_child_index = dawg_.child(dawg_index);
+    if (dawg_.label(dawg_child_index) == '\0') {
+      dawg_child_index = dawg_.sibling(dawg_child_index);
+      if (dawg_child_index == 0) {
+        return true;
+      }
+    }
+    units_[dic_index].set_child(dawg_.label(dawg_child_index));
+
+    do {
+      UCharType child_label = dawg_.label(dawg_child_index);
+      BaseType dic_child_index = dic_index;
+      if (!dic_.Follow(child_label, &dic_child_index)) {
+        return false;
+      }
+
+      if (!BuildGuide(dawg_child_index, dic_child_index)) {
+        return false;
+      }
+
+      BaseType dawg_sibling_index = dawg_.sibling(dawg_child_index);
+      UCharType sibling_label = dawg_.label(dawg_sibling_index);
+      if (dawg_sibling_index != 0) {
+        units_[dic_child_index].set_sibling(sibling_label);
+      }
+
+      dawg_child_index = dawg_sibling_index;
+    } while (dawg_child_index != 0);
+
+    return true;
+  }
+
+  // Marks a dictionary unit as visited.
+  void set_is_fixed(BaseType index) {
+    is_fixed_table_[index / 8] |= 1 << (index % 8);
+  }
+
+  // Tells whether a dictionary unit has been visited.
+  bool is_fixed(BaseType index) const {
+    return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0;
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_GUIDE_BUILDER_H
35 lib/dawgdic/guide-unit.h
@@ -0,0 +1,35 @@
+#ifndef DAWGDIC_GUIDE_UNIT_H
+#define DAWGDIC_GUIDE_UNIT_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Pair of labels stored per dictionary unit: one for a child and one for a
+// sibling. Both start out as '\0'.
+class GuideUnit {
+ public:
+  GuideUnit() : child_('\0'), sibling_('\0') {}
+
+  // Stores the child label.
+  void set_child(UCharType child) {
+    this->child_ = child;
+  }
+  // Stores the sibling label.
+  void set_sibling(UCharType sibling) {
+    this->sibling_ = sibling;
+  }
+
+  // Reads the child label.
+  UCharType child() const {
+    return this->child_;
+  }
+  // Reads the sibling label.
+  UCharType sibling() const {
+    return this->sibling_;
+  }
+
+ private:
+  UCharType child_;
+  UCharType sibling_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_GUIDE_UNIT_H
124 lib/dawgdic/guide.h
@@ -0,0 +1,124 @@
+#ifndef DAWGDIC_GUIDE_H
+#define DAWGDIC_GUIDE_H
+
+#include "dictionary.h"
+#include "guide-unit.h"
+
+#include <iostream>
+#include <vector>
+
+namespace dawgdic {
+
+// Array of GuideUnit with binary I/O and memory mapping, indexed by the same
+// indexes as the units of a Dictionary.
+class Guide {
+ public:
+  Guide() : units_(NULL), size_(0), units_buf_() {}
+
+  // Pointer to the first unit, or NULL when there are no units.
+  const GuideUnit *units() const {
+    return units_;
+  }
+  // Number of units.
+  SizeType size() const {
+    return size_;
+  }
+  // Size of the unit array in bytes.
+  SizeType total_size() const {
+    return sizeof(GuideUnit) * size_;
+  }
+  // Number of bytes Write() emits: the size header plus the unit array.
+  SizeType file_size() const {
+    return sizeof(BaseType) + total_size();
+  }
+
+  // The root index.
+  BaseType root() const {
+    return 0;
+  }
+
+  // Child label stored for a given index.
+  UCharType child(BaseType index) const {
+    return units_[index].child();
+  }
+  // Sibling label stored for a given index.
+  UCharType sibling(BaseType index) const {
+    return units_[index].sibling();
+  }
+
+  // Reads a guide from an input stream.
+  // The stream holds the number of units as a BaseType followed by the raw
+  // units. Returns false, leaving the guide unchanged, when either part
+  // cannot be read.
+  bool Read(std::istream *input) {
+    BaseType base_size;
+    if (!input->read(reinterpret_cast<char *>(&base_size), sizeof(BaseType))) {
+      return false;
+    }
+
+    SizeType size = static_cast<SizeType>(base_size);
+    std::vector<GuideUnit> units_buf(size);
+    // An empty vector has no element 0 to take the address of, so the units
+    // are read only when the header announces at least one.
+    if (size != 0 &&
+        !input->read(reinterpret_cast<char *>(&units_buf[0]),
+                     sizeof(GuideUnit) * size)) {
+      return false;
+    }
+
+    SwapUnitsBuf(&units_buf);
+    return true;
+  }
+
+  // Writes a guide to an output stream.
+  // Emits the format that Read() expects.
+  bool Write(std::ostream *output) const {
+    BaseType base_size = static_cast<BaseType>(size_);
+    if (!output->write(reinterpret_cast<const char *>(&base_size),
+                       sizeof(BaseType))) {
+      return false;
+    }
+
+    if (!output->write(reinterpret_cast<const char *>(units_),
+                       sizeof(GuideUnit) * size_)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Maps memory with its size.
+  // The first overload expects the layout written by Write(): a BaseType
+  // unit count followed by the units. The second takes the units and their
+  // count separately. The memory is referenced, not copied.
+  void Map(const void *address) {
+    Clear();
+    units_ = reinterpret_cast<const GuideUnit *>(
+        static_cast<const BaseType *>(address) + 1);
+    size_ = *static_cast<const BaseType *>(address);
+  }
+  void Map(const void *address, SizeType size) {
+    Clear();
+    units_ = static_cast<const GuideUnit *>(address);
+    size_ = size;
+  }
+
+  // Swaps guides.
+  void Swap(Guide *guide) {
+    std::swap(units_, guide->units_);
+    std::swap(size_, guide->size_);
+    units_buf_.swap(guide->units_buf_);
+  }
+
+  // Initializes a guide.
+  void Clear() {
+    units_ = NULL;
+    size_ = 0;
+    // Swapping with a temporary releases the buffer's memory.
+    std::vector<GuideUnit>(0).swap(units_buf_);
+  }
+
+ public:
+  // Following member function is called from GuideBuilder.
+
+  // Swaps buffers for units.
+  // Takes over the contents of *units_buf and leaves the previously owned
+  // buffer in *units_buf.
+  void SwapUnitsBuf(std::vector<GuideUnit> *units_buf) {
+    // An empty vector has no element 0 to take the address of.
+    units_ = NULL;
+    if (!units_buf->empty()) {
+      units_ = &(*units_buf)[0];
+    }
+    size_ = static_cast<BaseType>(units_buf->size());
+    units_buf_.swap(*units_buf);
+  }
+
+ private:
+  // Points into units_buf_ or into memory handed to Map().
+  const GuideUnit *units_;
+  SizeType size_;
+  std::vector<GuideUnit> units_buf_;
+
+  // Disables copies.
+  Guide(const Guide &);
+  Guide &operator=(const Guide &);
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_GUIDE_H
71 lib/dawgdic/link-table.h
@@ -0,0 +1,71 @@
+#ifndef DAWGDIC_LINK_TABLE_H
+#define DAWGDIC_LINK_TABLE_H
+
+#include "base-types.h"
+#include "dictionary-unit.h"
+
+#include <vector>
+
+namespace dawgdic {
+
+// Open-addressing hash table with linear probing that maps an index to an
+// offset. A stored index of 0 marks an empty slot, so index 0 cannot be kept
+// as an entry of its own, and Find() reports a missing index as offset 0.
+class LinkTable {
+ public:
+  explicit LinkTable() : hash_table_() {}
+
+  // Initializes a hash table.
+  // Discards all entries and creates table_size empty slots.
+  void Init(SizeType table_size) {
+    PairType initial_pair(0, 0);
+    std::vector<PairType> table(table_size, initial_pair);
+    hash_table_.swap(table);
+  }
+
+  // Finds an offset that corresponds to a given index.
+  // Returns 0 when the index has not been inserted or the table has no
+  // slots.
+  BaseType Find(BaseType index) const {
+    // Probing an empty table would compute a remainder by zero.
+    if (hash_table_.empty()) {
+      return 0;
+    }
+    BaseType hash_id = FindId(index);
+    return hash_table_[hash_id].second;
+  }
+
+  // Inserts an index with its offset.
+  // Overwrites the offset when the index is already present. Does nothing
+  // when the table has no slots.
+  void Insert(BaseType index, BaseType offset) {
+    // Probing an empty table would compute a remainder by zero.
+    if (hash_table_.empty()) {
+      return;
+    }
+    BaseType hash_id = FindId(index);
+    hash_table_[hash_id].first = index;
+    hash_table_[hash_id].second = offset;
+  }
+
+ private:
+  // (index, offset); index 0 marks an empty slot.
+  typedef std::pair<BaseType, BaseType> PairType;
+
+  std::vector<PairType> hash_table_;
+
+  // Disallows copies.
+  LinkTable(const LinkTable &);
+  LinkTable &operator=(const LinkTable &);
+
+  // Finds an Id from an upper table.
+  // Returns the slot holding index, or the first empty slot met while
+  // probing. Requires a non-empty table. Probing stops only at the index or
+  // at an empty slot, so looking up an absent index in a table without empty
+  // slots does not terminate.
+  BaseType FindId(BaseType index) const {
+    BaseType hash_id = Hash(index) % hash_table_.size();
+    while (hash_table_[hash_id].first != 0) {
+      if (index == hash_table_[hash_id].first) {
+        return hash_id;
+      }
+      hash_id = (hash_id + 1) % hash_table_.size();
+    }
+    return hash_id;
+  }
+
+  // 32-bit mix function.
+  // http://www.concentric.net/~Ttwang/tech/inthash.htm
+  static BaseType Hash(BaseType key) {
+    key = ~key + (key << 15);  // key = (key << 15) - key - 1;
+    key = key ^ (key >> 12);
+    key = key + (key << 2);
+    key = key ^ (key >> 4);
+    key = key * 2057;  // key = (key + (key << 3)) + (key << 11);
+    key = key ^ (key >> 16);
+    return key;
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_LINK_TABLE_H
69 lib/dawgdic/object-pool.h
@@ -0,0 +1,69 @@
+#ifndef DAWGDIC_OBJECT_POOL_H
+#define DAWGDIC_OBJECT_POOL_H
+
+#include <vector>
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// This class works like an array of objects with compact memory management.
+template <typename OBJECT_TYPE, SizeType BLOCK_SIZE = 1 << 10>
+class ObjectPool {
+ public:
+  typedef OBJECT_TYPE ObjectType;
+
+  ObjectPool() : blocks_(), size_(0) {}
+  ~ObjectPool() {
+    Clear();
+  }
+
+  // Accessors. The index is split into a block number and a position inside
+  // that block; no range check is made.
+  ObjectType &operator[](SizeType index) {
+    ObjectType *block = blocks_[index / BLOCK_SIZE];
+    return block[index % BLOCK_SIZE];
+  }
+  const ObjectType &operator[](SizeType index) const {
+    const ObjectType *block = blocks_[index / BLOCK_SIZE];
+    return block[index % BLOCK_SIZE];
+  }
+
+  // Number of allocated objects.
+  SizeType size() const {
+    return size_;
+  }
+
+  // Deletes all objects and frees memory.
+  void Clear() {
+    typedef typename std::vector<ObjectType *>::iterator BlockIterator;
+    for (BlockIterator it = blocks_.begin(); it != blocks_.end(); ++it) {
+      delete [] *it;
+    }
+
+    // Swapping with a temporary also releases the block list itself.
+    std::vector<ObjectType *> empty_blocks(0);
+    empty_blocks.swap(blocks_);
+    size_ = 0;
+  }
+
+  // Swaps object pools.
+  void Swap(ObjectPool *pool) {
+    std::swap(size_, pool->size_);
+    blocks_.swap(pool->blocks_);
+  }
+
+  // Allocates memory for a new object and returns its ID.
+  // A new block of BLOCK_SIZE objects is created when all existing blocks
+  // are in use.
+  SizeType Allocate() {
+    const SizeType capacity = BLOCK_SIZE * blocks_.size();
+    if (size_ == capacity) {
+      ObjectType *fresh_block = new ObjectType[BLOCK_SIZE];
+      blocks_.push_back(fresh_block);
+    }
+    const SizeType id = size_;
+    ++size_;
+    return id;
+  }
+
+ private:
+  // Owned arrays of BLOCK_SIZE objects each.
+  std::vector<ObjectType *> blocks_;
+  SizeType size_;
+
+  // Disallows copies.
+  ObjectPool(const ObjectPool &);
+  ObjectPool &operator=(const ObjectPool &);
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_OBJECT_POOL_H
61 lib/dawgdic/ranked-completer-candidate.h
@@ -0,0 +1,61 @@
+#ifndef DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
+#define DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Pairs a node index with a value. Comparer orders candidates by value first
+// and by node index second.
+class RankedCompleterCandidate {
+ public:
+  RankedCompleterCandidate() : node_index_(0), value_(-1) {}
+
+  // Stores the node index.
+  void set_node_index(BaseType node_index) {
+    this->node_index_ = node_index;
+  }
+  // Stores the value.
+  void set_value(ValueType value) {
+    this->value_ = value;
+  }
+
+  // Reads the node index.
+  BaseType node_index() const {
+    return this->node_index_;
+  }
+  // Reads the value.
+  ValueType value() const {
+    return this->value_;
+  }
+
+  // Strict ordering of candidates built on top of a value comparer.
+  template <typename VALUE_COMPARER_TYPE>
+  class Comparer {
+   public:
+    typedef VALUE_COMPARER_TYPE ValueComparerType;
+
+    explicit Comparer(ValueComparerType value_comparer)
+        : value_comparer_(value_comparer) {}
+
+    // Candidates with equal values are ordered by descending node index;
+    // otherwise the value comparer decides.
+    bool operator()(const RankedCompleterCandidate &lhs,
+                    const RankedCompleterCandidate &rhs) const {
+      if (lhs.value() == rhs.value()) {
+        return rhs.node_index() < lhs.node_index();
+      }
+      return value_comparer_(lhs.value(), rhs.value());
+    }
+
+   private:
+    ValueComparerType value_comparer_;
+  };
+
+  // Wraps a value comparer into a Comparer, deducing its type.
+  template <typename VALUE_COMPARER_TYPE>
+  static Comparer<VALUE_COMPARER_TYPE> MakeComparer(
+      VALUE_COMPARER_TYPE value_comparer) {
+    Comparer<VALUE_COMPARER_TYPE> comparer(value_comparer);
+    return comparer;
+  }
+
+ private:
+  BaseType node_index_;
+  ValueType value_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_RANKED_COMPLETER_CANDIDATE_H
58 lib/dawgdic/ranked-completer-node.h
@@ -0,0 +1,58 @@
+#ifndef DAWGDIC_RANKED_COMPLETER_NODE_H
+#define DAWGDIC_RANKED_COMPLETER_NODE_H
+
+#include "base-types.h"
+
+namespace dawgdic {
+
+// Plain record with a dictionary index, the index of a previous node, a
+// label and two flags. The queued flag can be set but not cleared.
+class RankedCompleterNode {
+ public:
+  RankedCompleterNode()
+      : dic_index_(0),
+        prev_node_index_(0),
+        label_('\0'),
+        is_queued_(false),
+        has_terminal_(false) {}
+
+  // Setters.
+  void set_dic_index(BaseType dic_index) {
+    this->dic_index_ = dic_index;
+  }
+  void set_prev_node_index(BaseType prev_node_index) {
+    this->prev_node_index_ = prev_node_index;
+  }
+  void set_label(UCharType label) {
+    this->label_ = label;
+  }
+  void set_is_queued() {
+    this->is_queued_ = true;
+  }
+  void set_has_terminal(bool has_terminal) {
+    this->has_terminal_ = has_terminal;
+  }
+
+  // Getters.
+  BaseType dic_index() const {
+    return this->dic_index_;
+  }
+  BaseType prev_node_index() const {
+    return this->prev_node_index_;
+  }
+  UCharType label() const {
+    return this->label_;
+  }
+  bool is_queued() const {
+    return this->is_queued_;
+  }
+  bool has_terminal() const {
+    return this->has_terminal_;
+  }
+
+ private:
+  BaseType dic_index_;
+  BaseType prev_node_index_;
+  UCharType label_;
+  bool is_queued_;
+  bool has_terminal_;
+
+  // Copyable.
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_RANKED_COMPLETER_NODE_H
222 lib/dawgdic/ranked-completer.h
@@ -0,0 +1,222 @@
+#ifndef DAWGDIC_RANKED_COMPLETER_H
+#define DAWGDIC_RANKED_COMPLETER_H
+
+#include "dictionary.h"
+#include "ranked-completer-candidate.h"
+#include "ranked-completer-node.h"
+#include "ranked-guide.h"
+
+#include <algorithm>
+#include <functional>
+#include <queue>
+#include <vector>
+
+namespace dawgdic {
+
+template <typename VALUE_COMPARER_TYPE = std::less<ValueType> >
+class RankedCompleterBase {  // Yields the completions of a prefix, one key per Next() call.
+ public:
+ typedef VALUE_COMPARER_TYPE ValueComparerType;  // Compares values when candidates are ranked.
+ // Both constructors start with an empty key and value -1; the first one leaves dic and guide NULL until set_dic() and set_guide() are called.
+ explicit RankedCompleterBase(
+ ValueComparerType value_comparer = ValueComparerType())
+ : dic_(NULL), guide_(NULL), key_(), prefix_length_(0), value_(-1),
+ nodes_(), node_queue_(), candidate_queue_(
+ RankedCompleterCandidate::MakeComparer(value_comparer)) {}
+ RankedCompleterBase(const Dictionary &dic, const RankedGuide &guide,
+ ValueComparerType value_comparer = ValueComparerType())
+ : dic_(&dic), guide_(&guide), key_(), prefix_length_(0), value_(-1),
+ nodes_(), node_queue_(), candidate_queue_(
+ RankedCompleterCandidate::MakeComparer(value_comparer)) {}
+ // The setters store the address of the argument; no copy of the dictionary or guide is made.
+ void set_dic(const Dictionary &dic) {
+ dic_ = &dic;
+ }
+ void set_guide(const RankedGuide &guide) {
+ guide_ = &guide;
+ }
+ // The getters dereference the stored pointers without a NULL check.
+ const Dictionary &dic() const {
+ return *dic_;
+ }
+ const RankedGuide &guide() const {
+ return *guide_;
+ }
+
+ // These member functions are available only when Next() returns true.
+ const char *key() const {  // '\0'-terminated, because Next() appends '\0' to key_.
+ return reinterpret_cast<const char *>(&key_[0]);
+ }
+ SizeType length() const {  // Size of key_ minus the trailing '\0'.
+ return key_.size() - 1;
+ }
+ ValueType value() const {
+ return value_;
+ }
+
+ // Starts completing keys from given index and prefix. This overload counts the bytes before the first '\0' and forwards to the overload below.
+ void Start(BaseType index, const char *prefix = "") {
+ SizeType length = 0;
+ for (const char *p = prefix; *p != '\0'; ++p) {
+ ++length;
+ }
+
+ Start(index, prefix, length);
+ }
+ void Start(BaseType index, const char *prefix, SizeType length) {  // Copies length bytes of prefix into key_ and resets all traversal state.
+ key_.resize(length);
+ for (SizeType i = 0; i < length; ++i) {
+ key_[i] = prefix[i];
+ }
+ prefix_length_ = length;
+ value_ = -1;
+ // Node 0 is the starting node; its label is a non-'\0' placeholder so that FindTerminal() descends from it.
+ nodes_.clear();
+ CreateNode(index, 0, 'X');
+ // The starting node is the only queued node before the first call to Next().
+ node_queue_.clear();
+ EnqueueNode(0);
+ // std::priority_queue has no clear(), so leftover candidates are popped one by one.
+ while (!candidate_queue_.empty()) {
+ candidate_queue_.pop();
+ }
+ }
+
+ // Gets the next key. Returns false once no candidate is left; otherwise key(), length() and value() describe the key that was found.
+ bool Next() {
+ for (SizeType i = 0; i < node_queue_.size(); ++i) {
+ BaseType node_index = node_queue_[i];
+ if (value_ != -1 && !FindSibling(&node_index)) {  // value_ is -1 until a key has been returned since Start(); only later calls move to a sibling first.
+ continue;
+ }
+ node_index = FindTerminal(node_index);
+ EnqueueCandidate(node_index);
+ }
+ node_queue_.clear();
+
+ // Returns false if there is no candidate.
+ if (candidate_queue_.empty()) {
+ return false;
+ }
+ // top() is the greatest candidate under the queue comparer.
+ const RankedCompleterCandidate &candidate = candidate_queue_.top();
+ // Queue the chosen node so that the next call looks for its sibling.
+ BaseType node_index = candidate.node_index();
+ EnqueueNode(node_index);
+ node_index = nodes_[node_index].prev_node_index();
+ // Walk the prev_node_index links back to node 0, collecting labels in reverse order and queueing each node on the way.
+ key_.resize(prefix_length_);
+ while (node_index != 0) {
+ key_.push_back(nodes_[node_index].label());
+ EnqueueNode(node_index);
+ node_index = nodes_[node_index].prev_node_index();
+ }
+ std::reverse(key_.begin() + prefix_length_, key_.end());
+ key_.push_back('\0');
+ // The value is copied before pop() because candidate refers to the top element of the queue.
+ value_ = candidate.value();
+ candidate_queue_.pop();
+
+ return true;
+ }
+
+ private:
+ const Dictionary *dic_;
+ const RankedGuide *guide_;
+ std::vector<UCharType> key_;  // The prefix, then the completed part, then '\0' after a successful Next().
+ SizeType prefix_length_;  // Number of leading bytes of key_ that were copied from the prefix.
+ ValueType value_;  // Value of the last returned key; -1 after Start().
+
+ std::vector<RankedCompleterNode> nodes_;  // Every node created since Start(); indices come from CreateNode().
+ std::vector<BaseType> node_queue_;  // Indices of the nodes that the next call to Next() will process.
+ std::priority_queue<RankedCompleterCandidate,
+ std::vector<RankedCompleterCandidate>,
+ RankedCompleterCandidate::Comparer<ValueComparerType> >
+ candidate_queue_;
+
+ // Disallows copies.
+ RankedCompleterBase(const RankedCompleterBase &);
+ RankedCompleterBase &operator=(const RankedCompleterBase &);
+
+ // Pushes a node to queue. A node whose is_queued flag is already set is skipped, so each node is queued at most once.
+ void EnqueueNode(BaseType node_index) {
+ if (nodes_[node_index].is_queued()) {
+ return;
+ }
+
+ node_queue_.push_back(node_index);
+ nodes_[node_index].set_is_queued();
+ }
+
+ // Pushes a candidate to priority queue. The candidate carries the node index and the value read from the dictionary unit of that node.
+ void EnqueueCandidate(BaseType node_index) {
+ RankedCompleterCandidate candidate;
+ candidate.set_node_index(node_index);
+ candidate.set_value(
+ dic_->units()[nodes_[node_index].dic_index()].value());
+ candidate_queue_.push(candidate);
+ }
+
+ // Finds a sibling of a given node. On success *node_index is replaced by the index of a newly created sibling node; returns false if there is none.
+ bool FindSibling(BaseType *node_index) {
+ BaseType prev_node_index = nodes_[*node_index].prev_node_index();
+ BaseType dic_index = nodes_[*node_index].dic_index();
+ // A '\0' sibling label is followed only while the has_terminal flag of the previous node is set; the flag is cleared so that this happens once.
+ UCharType sibling_label = guide_->sibling(dic_index);
+ if (sibling_label == '\0') {
+ if (!nodes_[prev_node_index].has_terminal()) {
+ return false;
+ }
+ nodes_[prev_node_index].set_has_terminal(false);
+ }
+
+ // Follows a transition to sibling and creates a node for the sibling.
+ BaseType dic_prev_index = nodes_[prev_node_index].dic_index();
+ dic_index = FollowWithoutCheck(dic_prev_index, sibling_label);
+ *node_index = CreateNode(dic_index, prev_node_index, sibling_label);
+
+ return true;
+ }
+
+ // Follows transitions and finds a terminal. Creates a node for every step and returns the index of the first node whose label is '\0'.
+ BaseType FindTerminal(BaseType node_index) {
+ while (nodes_[node_index].label() != '\0') {
+ BaseType dic_index = nodes_[node_index].dic_index();
+ UCharType child_label = guide_->child(dic_index);
+ if (child_label == '\0') {  // The '\0' transition is taken from this node now, so its has_terminal flag is cleared.
+ nodes_[node_index].set_has_terminal(false);
+ }
+
+ // Follows a transition to child and creates a node for the child.
+ dic_index = FollowWithoutCheck(dic_index, child_label);
+ node_index = CreateNode(dic_index, node_index, child_label);
+ }
+ return node_index;
+ }
+
+ // Follows a transition without any check. The target is index XOR the offset of the unit at index XOR label.
+ BaseType FollowWithoutCheck(BaseType index, UCharType label) const {
+ return index ^ dic_->units()[index].offset() ^ label;
+ }
+
+ // Creates a node. Appends it to nodes_ and returns its index; has_terminal is looked up in the dictionary only when the label is not '\0'.
+ BaseType CreateNode(BaseType dic_index, BaseType prev_node_index,
+ UCharType label) {
+ RankedCompleterNode node;
+ node.set_dic_index(dic_index);
+ node.set_prev_node_index(prev_node_index);
+ node.set_label(label);
+ if (node.label() != '\0') {
+ node.set_has_terminal(dic_->has_value(node.dic_index()));
+ }
+ nodes_.push_back(node);
+
+ return static_cast<BaseType>(nodes_.size() - 1);
+ }
+};
+
+typedef RankedCompleterBase<> RankedCompleter;
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_RANKED_COMPLETER_H
182 lib/dawgdic/ranked-guide-builder.h
@@ -0,0 +1,182 @@
+#ifndef DAWGDIC_RANKED_GUIDE_BUILDER_H
+#define DAWGDIC_RANKED_GUIDE_BUILDER_H
+
+#include "dawg.h"
+#include "dictionary.h"
+#include "ranked-guide.h"
+#include "ranked-guide-link.h"
+
+#include <algorithm>
+#include <functional>
+#include <vector>
+
+namespace dawgdic {
+
+// Builds a RankedGuide for a dictionary.  For every state reached from the
+// root, the outgoing links are sorted with the comparer derived from the
+// value comparer, and the resulting order is stored in the guide units as a
+// first-child label plus a chain of sibling labels.
+class RankedGuideBuilder {
+ public:
+  // Builds a ranked guide for completing keys, ordering values with
+  // std::less<ValueType>.  Returns false when a transition or a value that
+  // the dawg announces cannot be found in the dictionary.
+  static bool Build(const Dawg &dawg, const Dictionary &dic,
+                    RankedGuide *guide) {
+    return Build(dawg, dic, guide, std::less<ValueType>());
+  }
+
+  // Builds a ranked guide for completing keys, ordering values with the
+  // given comparer.  On success the units are swapped into *guide.
+  template <typename VALUE_COMPARER_TYPE>
+  static bool Build(const Dawg &dawg, const Dictionary &dic,
+                    RankedGuide *guide, VALUE_COMPARER_TYPE value_comparer) {
+    RankedGuideBuilder builder(dawg, dic, guide);
+    return builder.BuildRankedGuide(value_comparer);
+  }
+
+ private:
+  const Dawg &dawg_;
+  const Dictionary &dic_;
+  RankedGuide *guide_;
+
+  std::vector<RankedGuideUnit> units_;
+  // Stack of links; each recursion level owns the tail it appended.
+  std::vector<RankedGuideLink> links_;
+  // One bit per dictionary unit, set once the unit has been processed.
+  std::vector<UCharType> is_fixed_table_;
+
+  // Disallows copies.
+  RankedGuideBuilder(const RankedGuideBuilder &);
+  RankedGuideBuilder &operator=(const RankedGuideBuilder &);
+
+  RankedGuideBuilder(const Dawg &dawg, const Dictionary &dic,
+                     RankedGuide *guide)
+      : dawg_(dawg), dic_(dic), guide_(guide),
+        units_(), links_(), is_fixed_table_() {}
+
+  // Entry point: sizes the work tables, runs the recursive build from the
+  // roots and hands the finished units over to the guide.
+  template <typename VALUE_COMPARER_TYPE>
+  bool BuildRankedGuide(VALUE_COMPARER_TYPE value_comparer) {
+    // Initializes units and flags.  The flag table needs one bit per
+    // dictionary unit, so the byte count is rounded up; rounding down would
+    // leave the last units without a bit when the size is not a multiple
+    // of 8.
+    units_.resize(dic_.size());
+    is_fixed_table_.resize((dic_.size() + 7) / 8, '\0');
+
+    // Nothing to build; the guide is left untouched in this case.
+    if (dawg_.size() <= 1) {
+      return true;
+    }
+
+    ValueType max_value = -1;
+    if (!BuildRankedGuide(dawg_.root(), dic_.root(),
+                          &max_value, value_comparer)) {
+      return false;
+    }
+
+    guide_->SwapUnitsBuf(&units_);
+    return true;
+  }
+
+  // Builds a guide recursively.  *max_value receives the value of the link
+  // that sorts first among the links of this state.
+  template <typename VALUE_COMPARER_TYPE>
+  bool BuildRankedGuide(BaseType dawg_index, BaseType dic_index,
+                        ValueType *max_value,
+                        VALUE_COMPARER_TYPE value_comparer) {
+    // A unit that was processed before only needs its value looked up.
+    if (is_fixed(dic_index)) {
+      return FindMaxValue(dic_index, max_value);
+    }
+    set_is_fixed(dic_index);
+
+    SizeType initial_num_links = links_.size();
+
+    // Enumerates links to the next states.
+    if (!EnumerateLinks(dawg_index, dic_index, value_comparer)) {
+      return false;
+    }
+
+    // A state without any outgoing link cannot be reflected into units, and
+    // the reads of links_[initial_num_links] below would be out of range.
+    if (links_.size() == initial_num_links) {
+      return false;
+    }
+
+    std::stable_sort(links_.begin() + initial_num_links, links_.end(),
+                     RankedGuideLink::MakeComparer(value_comparer));
+
+    // Reflects links into units.
+    if (!TurnLinksToUnits(dic_index, initial_num_links)) {
+      return false;
+    }
+
+    // Reports the first link after sorting and drops the links of this level.
+    *max_value = links_[initial_num_links].value();
+    links_.resize(initial_num_links);
+
+    return true;
+  }
+
+  // Finds the maximum value by using fixed units: follows the recorded
+  // first-child labels until a unit without a child label is reached and
+  // reads the value stored there.
+  bool FindMaxValue(BaseType dic_index, ValueType *max_value) const {
+    while (units_[dic_index].child() != '\0') {
+      UCharType child_label = units_[dic_index].child();
+      if (!dic_.Follow(child_label, &dic_index)) {
+        return false;
+      }
+    }
+    if (!dic_.has_value(dic_index)) {
+      return false;
+    }
+    *max_value = dic_.value(dic_index);
+    return true;
+  }
+
+  // Enumerates links to the next states.  Appends one link per child of the
+  // dawg state; a '\0' child takes the value stored at dic_index, any other
+  // child takes the value produced by the recursive build of its subtree.
+  template <typename VALUE_COMPARER_TYPE>
+  bool EnumerateLinks(BaseType dawg_index, BaseType dic_index,
+                      VALUE_COMPARER_TYPE value_comparer) {
+    for (BaseType dawg_child_index = dawg_.child(dawg_index);
+         dawg_child_index != 0;
+         dawg_child_index = dawg_.sibling(dawg_child_index)) {
+      ValueType value = -1;
+      UCharType child_label = dawg_.label(dawg_child_index);
+      if (child_label == '\0') {
+        if (!dic_.has_value(dic_index)) {
+          return false;
+        }
+        value = dic_.value(dic_index);
+      } else {
+        BaseType dic_child_index = dic_index;
+        if (!dic_.Follow(child_label, &dic_child_index)) {
+          return false;
+        }
+
+        if (!BuildRankedGuide(dawg_child_index, dic_child_index,
+                              &value, value_comparer)) {
+          return false;
+        }
+      }
+      links_.push_back(RankedGuideLink(child_label, value));
+    }
+
+    return true;
+  }
+
+  // Modifies units.  The first link becomes the child label of dic_index;
+  // every following link becomes the sibling label of the unit reached by
+  // the link before it.  Requires at least one link at links_begin.
+  bool TurnLinksToUnits(BaseType dic_index, SizeType links_begin) {
+    // The first child.
+    UCharType first_label = links_[links_begin].label();
+    units_[dic_index].set_child(first_label);
+    BaseType dic_child_index = FollowWithoutCheck(dic_index, first_label);
+
+    // Other children.
+    for (SizeType i = links_begin + 1; i < links_.size(); ++i) {
+      UCharType sibling_label = links_[i].label();
+
+      BaseType dic_sibling_index =
+          FollowWithoutCheck(dic_index, sibling_label);
+      units_[dic_child_index].set_sibling(sibling_label);
+      dic_child_index = dic_sibling_index;
+    }
+
+    return true;
+  }
+
+  // Follows a transition without any check.
+  BaseType FollowWithoutCheck(BaseType index, UCharType label) const {
+    return index ^ dic_.units()[index].offset() ^ label;
+  }
+
+  // Marks the unit at index as processed.
+  void set_is_fixed(BaseType index) {
+    is_fixed_table_[index / 8] |= 1 << (index % 8);
+  }
+
+  // Tells whether the unit at index has been processed.
+  bool is_fixed(BaseType index) const {
+    return (is_fixed_table_[index / 8] & (1 << (index % 8))) != 0;
+  }
+};
+
+} // namespace dawgdic
+
+#endif // DAWGDIC_RANKED_GUIDE_BUILDER_H
62 lib/dawgdic/ranked-guide-link.h
@@ -0,0 +1,62 @@
+#ifndef DAWGDIC_RANKED_GUIDE_LINK_H
+#define DAWGDIC_RANKED_GUIDE_LINK_H
+
+namespace dawgdic {
+
+class RankedGuideLink {
+ public:
+ RankedGuideLink() : label_('\0'), value_(-1) {}
+ RankedGuideLink(UCharType label, ValueType value)
+ : label_(label), value_(value) {}
+
+ void set_label(UCharType label) {
+ label_ = label;
+ }
+ void set_value(ValueType value) {
+ value_ = value;