Skip to content

Commit

Permalink
support morpheme n-gram in mecab plugin (fix #1054)
Browse files Browse the repository at this point in the history
  • Loading branch information
kmaehashi committed Dec 14, 2015
1 parent c704d3a commit ba174c0
Show file tree
Hide file tree
Showing 3 changed files with 135 additions and 49 deletions.
51 changes: 44 additions & 7 deletions plugin/src/fv_converter/mecab_splitter.cpp
Expand Up @@ -28,6 +28,7 @@ namespace plugin {
namespace fv_converter {

using core::fv_converter::converter_exception;
using core::fv_converter::string_feature_element;

static MeCab::Model* create_mecab_model(const char* arg) {
MeCab::Model* t = MeCab::createModel(arg);
Expand All @@ -41,16 +42,22 @@ static MeCab::Model* create_mecab_model(const char* arg) {
}

// Default constructor. Both initializer-list variants shown in this hunk build
// the MeCab model from an empty argument string; the longer variant also
// initializes ngram_ to 1.
mecab_splitter::mecab_splitter()
: model_(create_mecab_model("")) {
: model_(create_mecab_model("")),
ngram_(1) {
}

// Constructs a splitter whose MeCab model is created from `arg`. The
// two-parameter signature additionally stores `ngram` in ngram_ and throws a
// converter_exception (via JUBATUS_EXCEPTION) when `ngram` is 0.
mecab_splitter::mecab_splitter(const char* arg)
: model_(create_mecab_model(arg)) {
mecab_splitter::mecab_splitter(const char* arg, size_t ngram)
: model_(create_mecab_model(arg)),
ngram_(ngram) {
// This check is in the constructor body, so it runs after model_ has already
// been initialized from `arg`.
if (ngram == 0) {
throw JUBATUS_EXCEPTION(
converter_exception("ngram must be a positive number"));
}
}

void mecab_splitter::split(
void mecab_splitter::extract(
const std::string& string,
std::vector<std::pair<size_t, size_t> >& ret_boundaries) const {
std::vector<string_feature_element>& result) const {
jubatus::util::lang::scoped_ptr<MeCab::Tagger> tagger(model_->createTagger());
if (!tagger) {
// cannot create tagger
Expand Down Expand Up @@ -82,7 +89,34 @@ void mecab_splitter::split(
p += node->length;
}

bounds.swap(ret_boundaries);
// Need at least N surfaces to extract N-gram.
if (bounds.size() < ngram_) {
return;
}

// Number of features: e.g., 2-gram for 4 surfaces (bounds) like "a,b,c,d"
// will produce 3 features ({"a,b", "b,c", "c,d"}.)
size_t num_features = bounds.size() - ngram_ + 1;

std::vector<string_feature_element> feature_elems;
feature_elems.reserve(num_features);
for (size_t i = 0; i < num_features; ++i) {
size_t begin = bounds[i].first;
size_t length = bounds[i].second;
std::string feature = std::string(string, begin, length);

for (size_t j = 1; j < ngram_; ++j) {
size_t begin_j = bounds[i+j].first;
size_t length_j = bounds[i+j].second;
length += length_j;
feature += "," + std::string(string, begin_j, length_j);
}

feature_elems.push_back(
string_feature_element(begin, length, feature, 1.0));
}

feature_elems.swap(result);
}

} // namespace fv_converter
Expand All @@ -94,7 +128,10 @@ jubatus::plugin::fv_converter::mecab_splitter* create(
const std::map<std::string, std::string>& params) {
std::string param =
jubatus::core::fv_converter::get_with_default(params, "arg", "");
return new jubatus::plugin::fv_converter::mecab_splitter(param.c_str());
size_t ngram = jubatus::util::lang::lexical_cast<size_t>(
jubatus::core::fv_converter::get_with_default(params, "ngram", "1"));
return new jubatus::plugin::fv_converter::mecab_splitter(
param.c_str(), ngram);
}

std::string version() {
Expand Down
14 changes: 9 additions & 5 deletions plugin/src/fv_converter/mecab_splitter.hpp
Expand Up @@ -24,22 +24,26 @@
#include <mecab.h>
#include "jubatus/util/lang/scoped_ptr.h"

#include "jubatus/core/fv_converter/word_splitter.hpp"
#include "jubatus/core/fv_converter/string_feature.hpp"

namespace jubatus {
namespace plugin {
namespace fv_converter {

// MeCab-backed plugin class. This hunk shows two base-class lines
// (word_splitter and string_feature) and both the split() and extract()
// declarations side by side.
class mecab_splitter : public jubatus::core::fv_converter::word_splitter {
// NOTE(review): this using-declaration sits at namespace scope in a header
// (mecab_splitter.hpp), so the name becomes visible to every file that
// includes it; confirm that is intended.
using core::fv_converter::string_feature_element;

class mecab_splitter : public jubatus::core::fv_converter::string_feature {
public:
mecab_splitter();
explicit mecab_splitter(const char* arg);
// `arg` is the MeCab argument string; `ngram` is the number of consecutive
// surfaces joined into one feature.
explicit mecab_splitter(const char* arg, size_t ngram);

void split(const std::string& string,
std::vector<std::pair<size_t, size_t> >& ret_boundaries) const;
// Extracts features from `string` and stores them in `result`.
void extract(
const std::string& string,
std::vector<string_feature_element>& result) const;

private:
// MeCab model held through a scoped_ptr.
jubatus::util::lang::scoped_ptr<MeCab::Model> model_;
// N-gram size.
size_t ngram_;
};

} // namespace fv_converter
Expand Down
119 changes: 82 additions & 37 deletions plugin/src/fv_converter/mecab_splitter_test.cpp
Expand Up @@ -29,73 +29,118 @@ namespace jubatus {
namespace plugin {
namespace fv_converter {

using core::fv_converter::word_splitter;
using core::fv_converter::string_feature;
using core::fv_converter::string_feature_element;
using core::fv_converter::converter_exception;

void assert_elements_eq(std::vector<string_feature_element> expected,
std::vector<string_feature_element> actual) {
ASSERT_EQ(expected.size(), actual.size());
for (size_t i = 0; i < expected.size(); ++i) {
ASSERT_EQ(expected[i].begin, actual[i].begin);
ASSERT_EQ(expected[i].length, actual[i].length);
ASSERT_EQ(expected[i].value, actual[i].value);
ASSERT_EQ(expected[i].score, actual[i].score);
}
}

// Unigram extraction with the default-constructed splitter. Expected elements
// are given as (begin, length, value, score). Every character of this input
// occupies 3 bytes, so the element at (6, 3) is the single character "は".
TEST(mecab_splitter, trivial) {
mecab_splitter m;
std::vector<std::pair<size_t, size_t> > bs;
m.split("本日は晴天なり", bs);
std::vector<std::pair<size_t, size_t> > exp;
std::vector<string_feature_element> elems;
m.extract("本日は晴天なり", elems);
std::vector<string_feature_element> exp;

exp.push_back(string_feature_element(0, 6, "本日", 1.0));
// The value must equal the input substring at bytes [6, 9), not an empty
// string.
exp.push_back(string_feature_element(6, 3, "は", 1.0));
exp.push_back(string_feature_element(9, 6, "晴天", 1.0));
exp.push_back(string_feature_element(15, 6, "なり", 1.0));

exp.push_back(std::make_pair(0, 6));
exp.push_back(std::make_pair(6, 3));
exp.push_back(std::make_pair(9, 6));
exp.push_back(std::make_pair(15, 6));
assert_elements_eq(exp, elems);
}

// Bigram extraction (ngram = 2). Each expected value joins two adjacent
// surfaces with "," and each expected length covers both surfaces, e.g.
// (0, 9) for "本日" (6 bytes) followed by "は" (3 bytes).
TEST(mecab_splitter, bigram) {
mecab_splitter m("", 2);
std::vector<string_feature_element> elems;
m.extract("本日は晴天なり", elems);
std::vector<string_feature_element> exp;

ASSERT_EQ(exp, bs);
// Four surfaces yield three bigrams.
exp.push_back(string_feature_element(0, 9, "本日,は", 1.0));
exp.push_back(string_feature_element(6, 9, "は,晴天", 1.0));
exp.push_back(string_feature_element(9, 12, "晴天,なり", 1.0));

assert_elements_eq(exp, elems);
}

// A bigram splitter given the input "本日" is expected to produce no
// features at all.
TEST(mecab_splitter, bigram_with_single_surface) {
  mecab_splitter splitter("", 2);
  std::vector<string_feature_element> features;
  splitter.extract("本日", features);

  ASSERT_EQ(0u, features.size());
}

// Constructing a splitter with invalid arguments must throw
// converter_exception.
TEST(mecab_splitter, illegal_argument) {
EXPECT_THROW(mecab_splitter("-r unknown_file"), converter_exception);
// Invalid MeCab argument
EXPECT_THROW(mecab_splitter("-r unknown_file", 1), converter_exception);

// Invalid N-gram
EXPECT_THROW(mecab_splitter("", 0), converter_exception);
}

// Builds a splitter through create() with an empty parameter map and checks
// the begin/length of the three elements produced for "東京へ行く".
TEST(mecab_splitter_create, trivial) {
std::map<std::string, std::string> param;
jubatus::util::lang::scoped_ptr<word_splitter> s(create(param));
// scoped_ptr holds the object returned by create().
jubatus::util::lang::scoped_ptr<string_feature> s(create(param));
std::string d("東京へ行く");
std::vector<std::pair<size_t, size_t> > bs;
s->split(d, bs);
ASSERT_EQ(3u, bs.size());
ASSERT_EQ(0u, bs[0].first);
ASSERT_EQ(6u, bs[0].second);
ASSERT_EQ(6u, bs[1].first);
ASSERT_EQ(3u, bs[1].second);
ASSERT_EQ(9u, bs[2].first);
ASSERT_EQ(6u, bs[2].second);
std::vector<string_feature_element> elems;
s->extract(d, elems);
// Only positions are checked here; value and score are not compared.
ASSERT_EQ(3u, elems.size());
ASSERT_EQ(0u, elems[0].begin);
ASSERT_EQ(6u, elems[0].length);
ASSERT_EQ(6u, elems[1].begin);
ASSERT_EQ(3u, elems[1].length);
ASSERT_EQ(9u, elems[2].begin);
ASSERT_EQ(6u, elems[2].length);
}

// create() must throw converter_exception for an invalid "arg" parameter and
// for an "ngram" parameter of "0".
TEST(mecab_splitter_create, illegal_argument) {
std::map<std::string, std::string> param;
param["arg"] = "-r unknown_file";
// Invalid MeCab argument
std::map<std::string, std::string> param1;
param1["arg"] = "-r unknown_file";
EXPECT_THROW(
jubatus::util::lang::scoped_ptr<string_feature>(create(param1)),
converter_exception);

// Invalid N-gram
std::map<std::string, std::string> param2;
param2["ngram"] = "0";
EXPECT_THROW(
jubatus::util::lang::scoped_ptr<word_splitter>(create(param)),
jubatus::util::lang::scoped_ptr<string_feature>(create(param2)),
converter_exception);
}

// Input containing spaces before, between and after the words. The expected
// elements start at offsets 1 and 11 with length 9 each, so none of the
// spaces belongs to an expected element.
TEST(mecab_splitter, with_space) {
mecab_splitter m;
std::vector<std::pair<size_t, size_t> > bs;
m.split(" テスト テスト ", bs);
std::vector<std::pair<size_t, size_t> > exp;
std::vector<string_feature_element> elems;
m.extract(" テスト テスト ", elems);
std::vector<string_feature_element> exp;

exp.push_back(std::make_pair(1, 9));
exp.push_back(std::make_pair(11, 9));
exp.push_back(string_feature_element(1, 9, "テスト", 1.0));
exp.push_back(string_feature_element(11, 9, "テスト", 1.0));

ASSERT_EQ(exp, bs);
assert_elements_eq(exp, elems);
}

// Calls extract() on `m` 1000 times with the same input and checks the result
// of every call against the expected unigram elements, given as
// (begin, length, value, score).
void run(mecab_splitter* m) {
std::vector<std::pair<size_t, size_t> > exp;
exp.push_back(std::make_pair(0, 6));
exp.push_back(std::make_pair(6, 3));
exp.push_back(std::make_pair(9, 6));
exp.push_back(std::make_pair(15, 6));
std::vector<string_feature_element> exp;
exp.push_back(string_feature_element(0, 6, "本日", 1.0));
// Every character of the input occupies 3 bytes, so the element at (6, 3) is
// "は"; an empty expected value would never match the extracted feature.
exp.push_back(string_feature_element(6, 3, "は", 1.0));
exp.push_back(string_feature_element(9, 6, "晴天", 1.0));
exp.push_back(string_feature_element(15, 6, "なり", 1.0));

for (int i = 0; i < 1000; ++i) {
std::vector<std::pair<size_t, size_t> > bs;
m->split("本日は晴天なり", bs);
ASSERT_EQ(exp, bs);
// A fresh output vector is used for each iteration.
std::vector<string_feature_element> elems;
m->extract("本日は晴天なり", elems);
assert_elements_eq(exp, elems);
}
}

Expand Down

0 comments on commit ba174c0

Please sign in to comment.