Skip to content

Commit

Permalink
Merge pull request cliqz-oss#118 from hendrik-cliqz/api-cleanup
Browse files Browse the repository at this point in the history
Api cleanup + SetManifest for Merger
  • Loading branch information
hendrikmuhs committed Aug 23, 2016
2 parents 65a7035 + 814c051 commit 382f88f
Show file tree
Hide file tree
Showing 21 changed files with 5,941 additions and 5,101 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ final {
};

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

// get query length
size_t query_length = strlen(query);
const size_t query_length = query.size();

// get tokens
std::vector<std::string> strs;
Expand Down
2 changes: 1 addition & 1 deletion keyvi/src/cpp/dictionary/completion/multiword_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class MultiWordCompletion final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) const {

uint64_t state = fsa_->GetStartState();
size_t number_of_tokens;
Expand Down
14 changes: 7 additions & 7 deletions keyvi/src/cpp/dictionary/completion/prefix_completion.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ final {
}

MatchIterator::MatchIteratorPair GetCompletions(
const char* query, int number_of_results = 10) {
const std::string& query, int number_of_results = 10) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;

std::vector<unsigned char> traversal_stack;
Expand Down Expand Up @@ -93,7 +93,7 @@ final {
if (fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(reinterpret_cast<char*> (&data->traversal_stack[0]), query_length + data->traverser.GetDepth()).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
Expand Down Expand Up @@ -137,16 +137,16 @@ final {
}

MatchIterator::MatchIteratorPair GetFuzzyCompletions(
const char* query, int max_edit_distance) {
const std::string& query, int max_edit_distance) {

uint64_t state = fsa_->GetStartState();
size_t query_length = strlen(query);
const size_t query_length = query.size();
size_t depth = 0;
const size_t minimum_exact_prefix = 2;
size_t exact_prefix = std::min(query_length, minimum_exact_prefix);
std::vector<int> codepoints;

utf8::unchecked::utf8to32(query, query + query_length,
utf8::unchecked::utf8to32(query.c_str(), query.c_str() + query_length,
back_inserter(codepoints));

stringdistance::Levenshtein metric(codepoints, 20, 3);
Expand Down Expand Up @@ -185,7 +185,7 @@ final {
if (depth == query_length && fsa_->IsFinalState(state)) {
TRACE("prefix matched depth %d %s", query_length + data->traverser.GetDepth(), std::string(query, query_length).c_str());
first_match = Match(
0, query_length, std::string(query, query_length), 0, fsa_, fsa_->GetStateValue(state));
0, query_length, query, 0, fsa_, fsa_->GetStateValue(state));
}

auto tfunc =
Expand Down
30 changes: 14 additions & 16 deletions keyvi/src/cpp/dictionary/dictionary.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ final {
* @param filename the filename
* @param load_lazy whether to load lazy.
*/
Dictionary(const char* filename, bool load_lazy)
Dictionary(const std::string& filename, bool load_lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, load_lazy)) {
TRACE("Dictionary from file %s", filename);
}
Expand All @@ -60,7 +60,7 @@ final {
* @param filename filename to load keyvi file from.
* @param loading_strategy optional: Loading strategy to use.
*/
explicit Dictionary(const char* filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
explicit Dictionary(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy)
: fsa_(std::make_shared<fsa::Automata>(filename, loading_strategy)) {
TRACE("Dictionary from file %s", filename);
}
Expand All @@ -69,8 +69,7 @@ final {
: fsa_(f) {
}

// temporary implementation
fsa::automata_t GetFsa() {
fsa::automata_t GetFsa() const {
return fsa_;
}

Expand All @@ -88,9 +87,9 @@ final {
* @param key The key
* @return True if key is in the dictionary, False otherwise.
*/
bool Contains(const char* key) const {
bool Contains(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t key_length = strlen(key);
const size_t key_length = key.size();

TRACE("Contains for %s", key);
for (size_t i = 0; i < key_length; ++i) {
Expand All @@ -111,9 +110,9 @@ final {
return false;
}

Match operator[](const char* key) const {
Match operator[](const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -142,9 +141,9 @@ final {
* @param key the key to lookup.
* @return a match iterator
*/
MatchIterator::MatchIteratorPair Get(const char* key) const {
MatchIterator::MatchIteratorPair Get(const std::string& key) const {
uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(key);
const size_t text_length = key.size();

for (size_t i = 0; i < text_length; ++i) {
state = fsa_->TryWalkTransition(state, key[i]);
Expand Down Expand Up @@ -256,11 +255,11 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair Lookup(const char* text,
MatchIterator::MatchIteratorPair Lookup(const std::string& text,
size_t offset = 0) {

uint64_t state = fsa_->GetStartState();
size_t text_length = strlen(text);
const size_t text_length = text.size();
uint64_t last_final_state = 0;
size_t last_final_state_position = 0;

Expand All @@ -287,8 +286,7 @@ final {
m = Match(
offset,
last_final_state_position,
/*text.substr(0, last_final_state_position),*/
std::string(text + offset, last_final_state_position - offset),
text.substr(offset, last_final_state_position - offset),
0,
fsa_,
fsa_->GetStateValue(last_final_state));
Expand All @@ -313,9 +311,9 @@ final {
* @param text the input
* @return a match iterator.
*/
MatchIterator::MatchIteratorPair LookupText(const char* text) {
MatchIterator::MatchIteratorPair LookupText(const std::string& text) {

size_t text_length = strlen(text);
const size_t text_length = text.size();
std::queue<MatchIterator> iterators;

TRACE("LookupText, 1st lookup for: %s", text);
Expand Down
1 change: 0 additions & 1 deletion keyvi/src/cpp/dictionary/dictionary_compiler.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,6 @@ class DictionaryCompiler
if (params_.count(TEMPORARY_PATH_KEY) == 0) {
params_[TEMPORARY_PATH_KEY] =
boost::filesystem::temp_directory_path().string();

}

TRACE("tmp path set to %s", params_[TEMPORARY_PATH_KEY].c_str());
Expand Down
11 changes: 11 additions & 0 deletions keyvi/src/cpp/dictionary/dictionary_merger.h
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,15 @@ final {
dicts_to_merge_.push_back(fsa);
}

/**
 * Remember a caller-supplied manifest for the merged dictionary.
 *
 * The stored text is kept in manifest_ and later forwarded to the generator
 * before the merged file is written.
 *
 * @param manifest manifest content, given as a JSON string
 */
void SetManifestFromString(const std::string& manifest){
  manifest_.assign(manifest);
}

void Merge(const std::string& filename){
std::priority_queue<SegmentIterator> pqueue;

Expand Down Expand Up @@ -165,13 +174,15 @@ final {

generator.CloseFeeding();

generator.SetManifestFromString(manifest_);
generator.WriteToFile(filename);
}

private:
std::vector<fsa::automata_t> dicts_to_merge_;
size_t memory_limit_;
fsa::internal::IValueStoreWriter::vs_param_t params_;
std::string manifest_ = std::string();
};

} /* namespace dictionary */
Expand Down
8 changes: 4 additions & 4 deletions keyvi/src/cpp/dictionary/fsa/automata.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ class Automata
final {

public:
Automata(const char * filename, bool load_lazy):
Automata(const std::string& filename, bool load_lazy):
Automata(filename, load_lazy ? loading_strategy_types::default_os : loading_strategy_types::populate) {}

explicit Automata(const char * filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
explicit Automata(const std::string& filename, loading_strategy_types loading_strategy = loading_strategy_types::lazy) {
std::ifstream in_stream(filename, std::ios::binary);

if (!in_stream.good()) {
Expand Down Expand Up @@ -85,7 +85,7 @@ final {
size_t offset = in_stream.tellg();

file_mapping_ = new boost::interprocess::file_mapping(
filename, boost::interprocess::read_only);
filename.c_str(), boost::interprocess::read_only);
size_t array_size = boost::lexical_cast<size_t>(sparse_array_properties_.get<std::string>("size"));

in_stream.seekg(offset + array_size + bucket_size * array_size - 1);
Expand Down Expand Up @@ -533,7 +533,7 @@ final {
};

// shared pointer
typedef std::shared_ptr<Automata> automata_t;
typedef std::shared_ptr<const Automata> automata_t;

} /* namespace fsa */
} /* namespace dictionary */
Expand Down
29 changes: 12 additions & 17 deletions keyvi/src/cpp/dictionary/fsa/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,12 +51,11 @@ namespace fsa {
* @returns length of the longest common prefix of given strings
*/

inline size_t get_common_prefix_length(const std::string& first, const std::string& second) {
  // The common prefix can never be longer than the shorter of the two strings.
  const size_t limit = first.size() < second.size() ? first.size() : second.size();

  size_t common_prefix_length = 0;

  // Check the bound before indexing: keys may contain embedded '\0' bytes, so
  // the terminator cannot be relied on to stop the scan, and reading past
  // size() of either string is undefined behaviour.
  while (common_prefix_length < limit
      && first[common_prefix_length] == second[common_prefix_length]) {
    ++common_prefix_length;
  }
  return common_prefix_length;
}
Expand Down Expand Up @@ -190,9 +189,7 @@ final {
void Add(const std::string& input_key, typename ValueStoreT::value_t value =
ValueStoreT::no_value) {

const char* key = input_key.c_str();

size_t commonPrefixLength = get_common_prefix_length(last_key_.c_str(), key);
const size_t commonPrefixLength = get_common_prefix_length(last_key_, input_key);

// keys are equal, just return
if (commonPrefixLength == input_key.size() && last_key_.size() == input_key.size()) {
Expand All @@ -203,7 +200,7 @@ final {
ConsumeStack(commonPrefixLength);

// put everything that is not common between the two strings (the suffix) into the stack
FeedStack(commonPrefixLength, input_key.size(), key);
FeedStack(commonPrefixLength, input_key);

// get value and mark final state
bool no_minimization = false;
Expand All @@ -220,7 +217,7 @@ final {
stack_->UpdateWeights(0, input_key.size() + 1, weight);
}

last_key_ = key;
last_key_ = input_key;
state_ = generator_state::FEEDING;
}

Expand All @@ -231,9 +228,7 @@ final {
*/
void Add(const std::string& input_key, const ValueHandle& handle) {

const char* key = input_key.c_str();

size_t commonPrefixLength = get_common_prefix_length(last_key_.c_str(), key);
const size_t commonPrefixLength = get_common_prefix_length(last_key_, input_key);

// keys are equal, just return
if (commonPrefixLength == input_key.size() && last_key_.size() == input_key.size()) {
Expand All @@ -244,7 +239,7 @@ final {
ConsumeStack(commonPrefixLength);

// put everything that is not common between the two strings (the suffix) into the stack
FeedStack(commonPrefixLength, input_key.size(), key);
FeedStack(commonPrefixLength, input_key);

stack_->InsertFinalState(input_key.size(), handle.value_idx, handle.no_minimization);

Expand All @@ -256,7 +251,7 @@ final {
stack_->UpdateWeights(0, input_key.size() + 1, handle.weight);
}

last_key_ = std::move(input_key);
last_key_ = input_key;
state_ = generator_state::FEEDING;
}

Expand Down Expand Up @@ -358,16 +353,16 @@ final {
internal::SerializationUtils::WriteJsonRecord(stream, pt);
}

inline void FeedStack(const size_t start, const size_t end, const char* key) {
for (size_t i = start; i < end; ++i) {
inline void FeedStack(const size_t start, const std::string& key) {
for (size_t i = start; i < key.size(); ++i) {
uint32_t ukey =
static_cast<uint32_t>(static_cast<unsigned char>(key[i]));
stack_->Insert(i, ukey, 0);
}

// remember highest stack
if (end > highest_stack_) {
highest_stack_ = end;
if (key.size() > highest_stack_) {
highest_stack_ = key.size();
}
}

Expand Down
4 changes: 2 additions & 2 deletions pykeyvi/src/addons/Dictionary.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
key = key.encode('utf-8')
assert isinstance(key, bytes), 'arg in_0 wrong type'

cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<const_char *>key)]))
cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<libcpp_string>key)]))

if _r.get().IsEmpty():
return default
Expand All @@ -30,7 +30,7 @@

assert isinstance(key, bytes), 'arg in_0 wrong type'

cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<const_char *>key)]))
cdef shared_ptr[_Match] _r = shared_ptr[_Match](new _Match(deref(self.inst.get())[(<libcpp_string>key)]))

if _r.get().IsEmpty():
raise KeyError(key)
Expand Down
4 changes: 2 additions & 2 deletions pykeyvi/src/addons/JsonDictionaryCompiler.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@

if isinstance(key, unicode):
key = key.encode('UTF-8')
cdef const_char * input_in_0 = <const_char *> key
cdef libcpp_string input_in_0 = <libcpp_string> key

if isinstance(value, unicode):
value = value.encode('UTF-8')
cdef const_char * input_in_1 = <const_char *> value
cdef libcpp_string input_in_1 = <libcpp_string> value

self.inst.get().Add(input_in_0, input_in_1)

Expand Down
5 changes: 5 additions & 0 deletions pykeyvi/src/addons/JsonDictionaryMerger.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@


def SetManifest(self, manifest):
    """Serialize ``manifest`` to JSON and pass it to the wrapped object.

    :param manifest: any object that ``json.dumps`` can serialize
    """
    m = json.dumps(manifest)
    # json.dumps yields text (unicode) on Python 3, while the C++ setter takes a
    # byte string; on Python 2 the result is already bytes and is left untouched.
    if not isinstance(m, bytes):
        m = m.encode('utf-8')
    self.inst.get().SetManifestFromString(m)
14 changes: 7 additions & 7 deletions pykeyvi/src/pxds/dictionary.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,16 @@ cdef extern from "dictionary/dictionary.h" namespace "keyvi::dictionary":
populate_key_part_no_readahead_value_part # populate the key part, but disable read ahead value part

cdef cppclass Dictionary:
Dictionary (const_char* filename) except +
Dictionary (const_char* filename, loading_strategy_types) except +
bool Contains (const_char*) # wrap-ignore
Match operator[](const_char*) # wrap-ignore
_MatchIteratorPair Get (const_char*)
Dictionary (libcpp_string filename) except +
Dictionary (libcpp_string filename, loading_strategy_types) except +
bool Contains (libcpp_string) # wrap-ignore
Match operator[](libcpp_string) # wrap-ignore
_MatchIteratorPair Get (libcpp_string)
_MatchIteratorPair GetNear (libcpp_string, size_t minimum_prefix_length) except +
_MatchIteratorPair GetNear (libcpp_string, size_t minimum_prefix_length, bool greedy) except +
_MatchIteratorPair GetAllItems () # wrap-ignore
_MatchIteratorPair Lookup(const_char*)
_MatchIteratorPair LookupText(const_char*)
_MatchIteratorPair Lookup(libcpp_string)
_MatchIteratorPair LookupText(libcpp_string)
libcpp_string GetManifestAsString() except + # wrap-ignore
libcpp_string GetStatistics() # wrap-ignore
uint32_t GetSize() # wrap-ignore
Loading

0 comments on commit 382f88f

Please sign in to comment.