diff --git a/learn.c b/learn.c index 3356ba1..fd14b64 100644 --- a/learn.c +++ b/learn.c @@ -690,6 +690,7 @@ varnam_is_known_word(varnam* handle, const char* word) return 0; } +/* Gets the stem rule for a particular ending from the database */ int varnam_get_stem(varnam* handle, const char* old_ending, const char *new_ending) { @@ -730,6 +731,7 @@ varnam_get_stem(varnam* handle, const char* old_ending, const char *new_ending) } +/* Stems the supplied word; a NULL word sets the last error and returns VARNAM_ERROR */ int varnam_stem(varnam *handle, char *word) { @@ -738,6 +740,13 @@ varnam_stem(varnam *handle, char *word) char *ending,*new_ending; char *p; + + if(word == NULL) /* NOTE(review): only NULL is rejected by this guard; a zero-length word is not checked here */ + { + set_last_error(handle, "Cannot stem empty word"); + return VARNAM_ERROR; + } + word_buf = strbuf_init(strlen(word)); strbuf_add(word_buf, word); new_ending = strbuf_init(15); @@ -766,6 +775,4 @@ varnam_stem(varnam *handle, char *word) else return VARNAM_ERROR; - - } \ No newline at end of file diff --git a/strbuf.c b/strbuf.c index c5359b4..fdc6b3d 100644 --- a/strbuf.c +++ b/strbuf.c @@ -178,7 +178,30 @@ int strbuf_addvf(struct strbuf *string, const char *format, va_list args) } /* - Returns the last unicode character of the word + * Gets each unicode character in this string as a separate entry of a new varray + * returned result should be destroyed by the caller + * */ +varray* +strbuf_chars(strbuf *b) +{ + const unsigned char *ustring; const char *inputcopy; + int bytes_read; + varray *chars; + strbuf *tmp; + + inputcopy = b->buffer; + chars = varray_init(); + while (*inputcopy != '\0') { + READ_A_UTF8_CHAR (ustring, inputcopy, bytes_read); + tmp = strbuf_init(8); + strbuf_add_bytes (tmp, inputcopy - bytes_read, bytes_read); /* presumably READ_A_UTF8_CHAR advanced inputcopy by bytes_read, so this steps back to the start of the character - TODO confirm */ + varray_push (chars, strbuf_detach(tmp)); /* the array stores the detached string, not the strbuf wrapper */ + bytes_read = 0; + } + return chars; +} + + /* Returns the last unicode character of the word */ char* strbuf_get_ending(strbuf *word) @@ -192,11 +215,6 @@ strbuf_get_ending(strbuf *word) return (char*)characters->memory[characters->index]; } -/* - decreases the index of the word by - the length of its ending -*/ - void
strbuf_destroy(void *s) { strbuf *string; diff --git a/symbol-table.c b/symbol-table.c index 1afeece..5f039eb 100644 --- a/symbol-table.c +++ b/symbol-table.c @@ -324,6 +324,8 @@ vst_persist_token( return VARNAM_SUCCESS; } +/* Stores a stem rule (old ending, new ending, level) into the database */ + int vst_persist_stemrule(varnam *handle, const char* old_ending, const char* new_ending, int level) { sqlite3 *db; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a1b0e04..a5dfac5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -27,6 +27,7 @@ list (APPEND TEST_FILES export-words.c strbuftest.c varnamc_tests.c + stemmer_tests.c ) set(test_executable_name runtests) diff --git a/tests/stemmer_tests.c b/tests/stemmer_tests.c new file mode 100644 index 0000000..b2ed366 --- /dev/null +++ b/tests/stemmer_tests.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) Navaneeth.K.N + * + * This is part of libvarnam. See LICENSE.txt for the license + */ + +#include +#include +#include +#include "testcases.h" +#include "../varnam.h" + +void setup_test_data() +{ + int rc; + char **msg; /* NOTE(review): handed to ensure_schema_exists below without ever being assigned - confirm this is intended */ + char *filename = get_unique_filename(); + + reinitialize_varnam_instance(filename); + ensure_schema_exists(varnam_instance, msg); +} + +START_TEST (insert_stemrule) +{ + int rc; + char *empty_word=""; + + rc = varnam_create_stemrule(varnam_instance, empty_word, "ല", 1); /* zero-length old ending */ + assert_error(rc); + + rc = varnam_create_stemrule(varnam_instance, "ക", "ല", 0); /* level below the accepted 1 to 3 range */ + assert_error(rc); + + rc = varnam_create_stemrule(varnam_instance, "ക", "ല", 4); /* level above the accepted 1 to 3 range */ + assert_error(rc); + + rc = varnam_create_stemrule(varnam_instance, "ക", "ല", 1); /* non-empty ending with a level inside the range */ + assert_success(rc); +} +END_TEST + +START_TEST (stem_word) +{ + int rc; + char *word;; + char *empty_word=NULL; + + word = (char*)malloc(sizeof("പലക")); /* sizeof of the literal includes the terminating NUL; NOTE(review): word is not freed in this test */ + strcpy(word, "പലക"); + + rc = varnam_create_stemrule(varnam_instance, "ക", "ല", 1); + assert_success(rc); + + rc = varnam_stem(varnam_instance, word); + assert_success(rc); + ck_assert_str_eq(word, "പലല"); /* expects the ending to have been replaced inside the buffer passed in */ + + rc = varnam_stem(varnam_instance,
empty_word); /* NULL word must be reported as an error */ + assert_error(rc); +} +END_TEST + +TCase* get_stemmer_tests() +{ + TCase* tcase = tcase_create("stemmer"); + tcase_add_checked_fixture(tcase, setup, teardown); + tcase_add_checked_fixture(tcase, setup_test_data, NULL); /* second fixture, registered without a teardown function */ + tcase_add_test(tcase, insert_stemrule); + tcase_add_test(tcase, stem_word); + return tcase; +} \ No newline at end of file diff --git a/tests/test-runner.c b/tests/test-runner.c index 6567e4a..4dd8f98 100644 --- a/tests/test-runner.c +++ b/tests/test-runner.c @@ -31,6 +31,7 @@ int main(int argc, char **argv) suite_add_tcase (suite, get_learning_tests()); suite_add_tcase (suite, get_export_tests()); suite_add_tcase (suite, get_token_creation_tests()); + suite_add_tcase (suite, get_stemmer_tests()); util = suite_create ("util"); suite_add_tcase (util, get_strbuf_tests()); @@ -45,6 +46,7 @@ int main(int argc, char **argv) srunner_set_xml (runner, "testrun.xml"); srunner_set_fork_status (runner, CK_NOFORK); srunner_run_all (runner, CK_NORMAL); + failed = srunner_ntests_failed (runner); srunner_free (runner); diff --git a/varnam.c b/varnam.c index 5540a8a..f23830c 100644 --- a/varnam.c +++ b/varnam.c @@ -675,6 +675,18 @@ int varnam_create_stemrule(varnam* handle, const char* old_ending, const char* n { int rc; + if(old_ending == NULL || strlen(old_ending) == 0) /* reject a missing or zero-length old ending before touching the database */ + { + set_last_error(handle, "No ending supplied"); + return VARNAM_ERROR; + } + + if(level < 1 || level > 3) /* only levels 1 to 3 are accepted */ + { + set_last_error(handle, "Invalid level"); + return VARNAM_ERROR; + } + rc = vst_persist_stemrule(handle, old_ending, new_ending, level); if(rc != VARNAM_SUCCESS)