From 5bda7fd45c12473ea05be64111cb94b20a771480 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Mon, 27 Feb 2023 17:47:37 +0400 Subject: [PATCH 1/6] Support for multilang ZIMs --- src/library.cpp | 20 ++++++++++++++++++-- test/data/library.xml | 2 +- test/library.cpp | 6 +++--- test/library_server.cpp | 30 +++++++++++++++++------------- 4 files changed, 39 insertions(+), 19 deletions(-) diff --git a/src/library.cpp b/src/library.cpp index fb9fc035b..632b86e3a 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -373,12 +373,28 @@ std::vector Library::getBookPropValueSet(BookStrPropMemFn p) const std::vector Library::getBooksLanguages() const { - return getBookPropValueSet(&Book::getLanguage); + std::vector langs; + for ( const auto& langAndCount : getBooksLanguagesWithCounts() ) { + langs.push_back(langAndCount.first); + } + return langs; } Library::AttributeCounts Library::getBooksLanguagesWithCounts() const { - return getBookAttributeCounts(&Book::getLanguage); + std::lock_guard lock(m_mutex); + AttributeCounts langsWithCounts; + + for (const auto& pair: mp_impl->m_books) { + const auto& book = pair.second; + if (book.getOrigId().empty()) { + const std::string commaSeparatedLangList = book.getLanguage(); + for ( const auto& lang : kiwix::split(commaSeparatedLangList, ",") ) { + ++langsWithCounts[lang]; + } + } + } + return langsWithCounts; } std::vector Library::getBooksCategories() const diff --git a/test/data/library.xml b/test/data/library.xml index d1b15118a..1f569f09e 100644 --- a/test/data/library.xml +++ b/test/data/library.xml @@ -23,7 +23,7 @@ url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim" title="Ray (uncategorized) Charles" description="No category is assigned to this library entry." - language="rus" + language="rus,eng" creator="Wikipedia" publisher="Kiwix" date="2020-03-31" diff --git a/test/library.cpp b/test/library.cpp index 60762cb51..f0100db19 100644 --- a/test/library.cpp +++ b/test/library.cpp @@ -69,7 +69,7 @@ const char * sampleOpdsStream = R"( urn:uuid:0ea1cde6-441d-6c58-f2c7-21c2838e659f /meta?name=favicon&content=wikiquote_fr_all_nopic_2019-06 2019-06-05T00:00::00:Z - fra + fra,ita Une page de Wikiquote, le recueil des citations libres. category_defined_via_category_element_only wikiquote;nopic @@ -199,7 +199,7 @@ const char sampleLibraryXML[] = R"( url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim" title="Ray Charles" description="Wikipedia articles about Ray Charles" - language="eng" + language="eng,spa" creator="Wikipedia" publisher="Kiwix" date="2020-03-31" @@ -344,7 +344,7 @@ TEST_F(LibraryTest, sanityCheck) { EXPECT_EQ(lib.getBookCount(true, true), 12U); EXPECT_EQ(lib.getBooksLanguages(), - std::vector({"deu", "eng", "fra"}) + std::vector({"deu", "eng", "fra", "ita", "spa"}) ); EXPECT_EQ(lib.getBooksCreators(), std::vector({ "Islam Stack Exchange", diff --git a/test/library_server.cpp b/test/library_server.cpp index 723d0c60c..d5270a200 100644 --- a/test/library_server.cpp +++ b/test/library_server.cpp @@ -140,7 +140,7 @@ std::string maskVariableOPDSFeedData(std::string s) "raycharles_uncategorized",\ "Ray (uncategorized) Charles",\ "No category is assigned to this library entry.",\ - "rus",\ + "rus,eng",\ "wikipedia_ru_ray_charles",\ "",\ "public_tag_with_a_value:value_of_a_public_tag;_private_tag_with_a_value:value_of_a_private_tag;wikipedia;_pictures:no;_videos:no;_details:no",\ @@ -327,10 +327,11 @@ TEST_F(LibraryServerTest, catalog_search_by_language) " 12345678-90ab-cdef-1234-567890abcdef\n" " Filtered zims (lang=eng)\n" " YYYY-MM-DDThh:mm:ssZ\n" - " 1\n" + " 2\n" " 0\n" - " 1\n" + " 2\n" CATALOG_LINK_TAGS + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY RAY_CHARLES_CATALOG_ENTRY "\n" ); @@ -344,12 +345,13 @@ TEST_F(LibraryServerTest, catalog_search_by_language) " 12345678-90ab-cdef-1234-567890abcdef\n" " Filtered zims (lang=eng%2Cfra)\n" " YYYY-MM-DDThh:mm:ssZ\n" - " 2\n" + " 3\n" " 0\n" - " 2\n" + " 3\n" CATALOG_LINK_TAGS - RAY_CHARLES_CATALOG_ENTRY CHARLES_RAY_CATALOG_ENTRY + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + RAY_CHARLES_CATALOG_ENTRY "\n" ); } @@ -582,7 +584,7 @@ TEST_F(LibraryServerTest, catalog_v2_languages) English eng - 1 + 2 @@ -764,9 +766,10 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_language) CATALOG_V2_ENTRIES_PREAMBLE("?lang=eng") " Filtered Entries (lang=eng)\n" " YYYY-MM-DDThh:mm:ssZ\n" - " 1\n" + " 2\n" " 0\n" - " 1\n" + " 2\n" + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY RAY_CHARLES_CATALOG_ENTRY "\n" ); @@ -779,11 +782,12 @@ TEST_F(LibraryServerTest, catalog_v2_entries_filtered_by_language) CATALOG_V2_ENTRIES_PREAMBLE("?lang=eng%2Cfra") " Filtered Entries (lang=eng%2Cfra)\n" " YYYY-MM-DDThh:mm:ssZ\n" - " 2\n" + " 3\n" " 0\n" - " 2\n" - RAY_CHARLES_CATALOG_ENTRY + " 3\n" CHARLES_RAY_CATALOG_ENTRY + UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + RAY_CHARLES_CATALOG_ENTRY "\n" ); } @@ -874,8 +878,8 @@ TEST_F(LibraryServerTest, catalog_search_includes_public_tags) // prefix search works on tag names EXPECT_SEARCH_RESULTS("public_tag", 2, - RAY_CHARLES_CATALOG_ENTRY UNCATEGORIZED_RAY_CHARLES_CATALOG_ENTRY + RAY_CHARLES_CATALOG_ENTRY ); EXPECT_SEARCH_RESULTS("value_of_a_public_tag", From 12826a57bdb57c6b1b7e4342e8d0033fda02ae91 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 1 Mar 2023 16:28:44 +0400 Subject: [PATCH 2/6] Less verbose book creation in unit-tests --- test/book.cpp | 89 +++++++++++++++++++++------------------------------ 1 file changed, 37 insertions(+), 52 deletions(-) diff --git a/test/book.cpp b/test/book.cpp index 4ed2df77a..8b1a29ee6 100644 --- a/test/book.cpp +++ b/test/book.cpp @@ -58,60 +58,53 @@ TEST(BookTest, updateFromXMLTest) EXPECT_EQ(defaultIllustration->url, "http://who.org/zara.fav"); } +namespace +{ + +kiwix::Book makeBook(const std::string& attr, const std::string& baseDir="") +{ + const XMLDoc xml(""); + kiwix::Book book; + book.updateFromXml(xml.child("book"), baseDir); + return book; +} + +} // unnamed namespace + TEST(BookTest, updateFromXMLCategoryHandlingTest) { { - const XMLDoc xml(R"( - - + const kiwix::Book book = makeBook(R"( + id="abcd" + tags="_category:category_defined_via_tags_only" )"); - kiwix::Book book; - book.updateFromXml(xml.child("book"), ""); - EXPECT_EQ(book.getCategory(), "category_defined_via_tags_only"); } { - const XMLDoc xml(R"( - - + const kiwix::Book book = makeBook(R"( + id="abcd" + category="category_defined_via_attribute_only" )"); - kiwix::Book book; - book.updateFromXml(xml.child("book"), ""); - EXPECT_EQ(book.getCategory(), "category_defined_via_attribute_only"); } { - const XMLDoc xml(R"( - - + const kiwix::Book book = makeBook(R"( + id="abcd" + category="category_attribute_overrides_tags" + tags="_category:tags_override_category_attribute" )"); - kiwix::Book book; - book.updateFromXml(xml.child("book"), ""); - EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags"); } { - const XMLDoc xml(R"( - - + const kiwix::Book book = makeBook(R"( + id="abcd" + tags="_category:tags_override_category_attribute" + category="category_attribute_overrides_tags" )"); - kiwix::Book book; - book.updateFromXml(xml.child("book"), ""); - EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags"); } } @@ -126,10 +119,7 @@ TEST(BookTest, setTagsDoesntAffectCategory) TEST(BookTest, updateCopiesCategory) { - const XMLDoc xml(R"()"); - - kiwix::Book book; - book.updateFromXml(xml.child("book"), ""); + const kiwix::Book book = makeBook(R"(id="abcd" category="ted")"); kiwix::Book newBook; newBook.setId("abcd"); @@ -140,20 +130,15 @@ TEST(BookTest, updateCopiesCategory) TEST(BookTest, updateTest) { - const XMLDoc xml(R"( - - - )"); - - kiwix::Book book; - book.updateFromXml(xml.child("book"), "/data/zim"); + kiwix::Book book = makeBook(R"( + id="xyz" + path="/home/user/Downloads/skin-of-color-society_en_all_2019-11.zim" + url="book-url" + name="skin-of-color-society_en_all" + tags="youtube;_videos:yes;_ftindex:yes;_ftindex:yes;_pictures:yes;_details:yes" + favicon="Ym9vay1mYXZpY29u" + faviconMimeType="book-favicon-mimetype" + )", "/data/zim"); book.setReadOnly(false); book.setPathValid(true); From b1ad319d525490012109d9b4c08d240df52261cf Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 1 Mar 2023 16:38:13 +0400 Subject: [PATCH 3/6] Enter Book::getLanguages() --- include/book.h | 1 + src/book.cpp | 5 +++++ src/library.cpp | 3 +-- test/book.cpp | 19 +++++++++++++++++++ 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/include/book.h b/include/book.h index e3cb750ad..dacab0776 100644 --- a/include/book.h +++ b/include/book.h @@ -80,6 +80,7 @@ class Book const std::string& getTitle() const { return m_title; } const std::string& getDescription() const { return m_description; } const std::string& getLanguage() const { return m_language; } + const std::vector getLanguages() const; const std::string& getCreator() const { return m_creator; } const std::string& getPublisher() const { return m_publisher; } const std::string& getDate() const { return m_date; } diff --git a/src/book.cpp b/src/book.cpp index 56f96c738..316645ccf 100644 --- a/src/book.cpp +++ b/src/book.cpp @@ -286,4 +286,9 @@ std::string Book::getCategoryFromTags() const } } +const std::vector Book::getLanguages() const +{ + return kiwix::split(m_language, ","); +} + } diff --git a/src/library.cpp b/src/library.cpp index 632b86e3a..1ca875cfb 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -388,8 +388,7 @@ Library::AttributeCounts Library::getBooksLanguagesWithCounts() const for (const auto& pair: mp_impl->m_books) { const auto& book = pair.second; if (book.getOrigId().empty()) { - const std::string commaSeparatedLangList = book.getLanguage(); - for ( const auto& lang : kiwix::split(commaSeparatedLangList, ",") ) { + for ( const auto& lang : book.getLanguages() ) { ++langsWithCounts[lang]; } } diff --git a/test/book.cpp b/test/book.cpp index 8b1a29ee6..9b7200ae2 100644 --- a/test/book.cpp +++ b/test/book.cpp @@ -195,3 +195,22 @@ TEST(BookTest, getHumanReadableIdFromPath) #endif EXPECT_EQ("3plus2", path2HumanReadableId("3+2.zim")); } + +TEST(BookTest, getLanguages) +{ + typedef std::vector Langs; + + { + const kiwix::Book book = makeBook(R"(id="abcd" language="fra")"); + + EXPECT_EQ(book.getLanguage(), "fra"); + EXPECT_EQ(book.getLanguages(), Langs{ "fra" }); + } + + { + const kiwix::Book book = makeBook(R"(id="abcd" language="eng,ong,ing")"); + + EXPECT_EQ(book.getLanguage(), "eng,ong,ing"); + EXPECT_EQ(book.getLanguages(), Langs({ "eng", "ong", "ing" })); + } +} From 51fcb90dc0abad93e768190ca305142cb61e1f50 Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Wed, 1 Mar 2023 16:46:30 +0400 Subject: [PATCH 4/6] Library::updateBookDB() uses Book::getLanguages() --- src/library.cpp | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/library.cpp b/src/library.cpp index 1ca875cfb..4982a70bd 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -455,12 +455,14 @@ void Library::updateBookDB(const Book& book) { Xapian::Stem stemmer; Xapian::TermGenerator indexer; - const std::string lang = book.getLanguage(); - try { - stemmer = Xapian::Stem(iso639_3ToXapian(lang)); - indexer.set_stemmer(stemmer); - indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); - } catch (...) {} + const auto langs = book.getLanguages(); + if ( langs.size() == 1 ) { + try { + stemmer = Xapian::Stem(iso639_3ToXapian(langs[0])); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); + } catch (...) {} + } Xapian::Document doc; indexer.set_document(doc); @@ -475,7 +477,9 @@ void Library::updateBookDB(const Book& book) // Index all fields for field-based search indexer.index_text(title, 1, "S"); indexer.index_text(desc, 1, "XD"); - indexer.index_text(lang, 1, "L"); + for ( const auto& lang : langs ) { + indexer.index_text(lang, 1, "L"); + } indexer.index_text(normalizeText(book.getCreator()), 1, "A"); indexer.index_text(normalizeText(book.getPublisher()), 1, "XP"); indexer.index_text(normalizeText(book.getName()), 1, "XN"); From 25503060527108b31fe297e2465d7d215cf5410b Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Mon, 6 Mar 2023 15:41:01 +0400 Subject: [PATCH 5/6] One more usage of Book::getLanguages() `Book::getLanguages()` is used instead of `Book::getLanguage()` when determining the set of languages for a collection of books. --- src/server/internalServer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/server/internalServer.cpp b/src/server/internalServer.cpp index 76d1494f7..8db0f81f3 100644 --- a/src/server/internalServer.cpp +++ b/src/server/internalServer.cpp @@ -223,7 +223,8 @@ typedef std::set Languages; Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) { Languages langs; for ( const auto& b : bookIds ) { - langs.insert(lib.getBookById(b).getLanguage()); + const auto bookLangs = lib.getBookById(b).getLanguages(); + langs.insert(bookLangs.begin(), bookLangs.end()); } return langs; } From eb002ae30633c08d560e7833b8096967c4ef4d5e Mon Sep 17 00:00:00 2001 From: Veloman Yunkan Date: Thu, 2 Mar 2023 17:12:36 +0400 Subject: [PATCH 6/6] Deprecated Book::getLanguage() Introduced `Book::getCommaSeparatedLanguages()` instead. --- include/book.h | 3 ++- src/libxml_dumper.cpp | 4 ++-- src/manager.cpp | 2 +- src/opds_dumper.cpp | 2 +- test/book.cpp | 4 ++-- test/library.cpp | 8 ++++++-- test/manager.cpp | 2 +- 7 files changed, 15 insertions(+), 10 deletions(-) diff --git a/include/book.h b/include/book.h index dacab0776..70c3bae55 100644 --- a/include/book.h +++ b/include/book.h @@ -79,7 +79,8 @@ class Book bool isPathValid() const { return m_pathValid; } const std::string& getTitle() const { return m_title; } const std::string& getDescription() const { return m_description; } - const std::string& getLanguage() const { return m_language; } + DEPRECATED const std::string& getLanguage() const { return m_language; } + const std::string& getCommaSeparatedLanguages() const { return m_language; } const std::vector getLanguages() const; const std::string& getCreator() const { return m_creator; } const std::string& getPublisher() const { return m_publisher; } diff --git a/src/libxml_dumper.cpp b/src/libxml_dumper.cpp index 6dc884d24..722b9dac9 100644 --- a/src/libxml_dumper.cpp +++ b/src/libxml_dumper.cpp @@ -54,7 +54,7 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) { if (book.getOrigId().empty()) { ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle()); ADD_ATTR_NOT_EMPTY(entry_node, "description", book.getDescription()); - ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getLanguage()); + ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getCommaSeparatedLanguages()); ADD_ATTR_NOT_EMPTY(entry_node, "creator", book.getCreator()); ADD_ATTR_NOT_EMPTY(entry_node, "publisher", book.getPublisher()); ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName()); @@ -97,7 +97,7 @@ void LibXMLDumper::handleBookmark(Bookmark bookmark, pugi::xml_node root_node) { auto book = library->getBookByIdThreadSafe(bookmark.getBookId()); ADD_TEXT_ENTRY(book_node, "id", book.getId()); ADD_TEXT_ENTRY(book_node, "title", book.getTitle()); - ADD_TEXT_ENTRY(book_node, "language", book.getLanguage()); + ADD_TEXT_ENTRY(book_node, "language", book.getCommaSeparatedLanguages()); ADD_TEXT_ENTRY(book_node, "date", book.getDate()); } catch (...) { ADD_TEXT_ENTRY(book_node, "id", bookmark.getBookId()); diff --git a/src/manager.cpp b/src/manager.cpp index 3f24fad17..55d9eff70 100644 --- a/src/manager.cpp +++ b/src/manager.cpp @@ -238,7 +238,7 @@ std::string Manager::addBookFromPathAndGetId(const std::string& pathToOpen, } if (!checkMetaData - || (checkMetaData && !book.getTitle().empty() && !book.getLanguage().empty() + || (!book.getTitle().empty() && !book.getLanguages().empty() && !book.getDate().empty())) { book.setUrl(url); manipulator->addBookToLibrary(book); diff --git a/src/opds_dumper.cpp b/src/opds_dumper.cpp index 98b233be7..1104029ac 100644 --- a/src/opds_dumper.cpp +++ b/src/opds_dumper.cpp @@ -81,7 +81,7 @@ std::string fullEntryXML(const Book& book, const std::string& rootLocation, cons {"name", book.getName()}, {"title", book.getTitle()}, {"description", book.getDescription()}, - {"language", book.getLanguage()}, + {"language", book.getCommaSeparatedLanguages()}, {"content_id", urlEncode(contentId)}, {"updated", bookDate}, // XXX: this should be the entry update datetime {"book_date", bookDate}, diff --git a/test/book.cpp b/test/book.cpp index 9b7200ae2..55de4e2df 100644 --- a/test/book.cpp +++ b/test/book.cpp @@ -203,14 +203,14 @@ TEST(BookTest, getLanguages) { const kiwix::Book book = makeBook(R"(id="abcd" language="fra")"); - EXPECT_EQ(book.getLanguage(), "fra"); + EXPECT_EQ(book.getCommaSeparatedLanguages(), "fra"); EXPECT_EQ(book.getLanguages(), Langs{ "fra" }); } { const kiwix::Book book = makeBook(R"(id="abcd" language="eng,ong,ing")"); - EXPECT_EQ(book.getLanguage(), "eng,ong,ing"); + EXPECT_EQ(book.getCommaSeparatedLanguages(), "eng,ong,ing"); EXPECT_EQ(book.getLanguages(), Langs({ "eng", "ong", "ing" })); } } diff --git a/test/library.cpp b/test/library.cpp index f0100db19..4fb4b7dd1 100644 --- a/test/library.cpp +++ b/test/library.cpp @@ -234,6 +234,8 @@ const char sampleLibraryXML[] = R"( namespace { +typedef std::vector Langs; + TEST(LibraryOpdsImportTest, allInOne) { kiwix::Library lib; @@ -248,7 +250,8 @@ TEST(LibraryOpdsImportTest, allInOne) EXPECT_EQ(book1.getTitle(), "Encyclopédie de la Tunisie"); EXPECT_EQ(book1.getName(), "wikipedia_fr_tunisie_novid_2018-10"); EXPECT_EQ(book1.getFlavour(), "unforgettable"); - EXPECT_EQ(book1.getLanguage(), "fra"); + EXPECT_EQ(book1.getLanguages(), Langs{ "fra" }); + EXPECT_EQ(book1.getCommaSeparatedLanguages(), "fra"); EXPECT_EQ(book1.getDate(), "8 Oct 2018"); EXPECT_EQ(book1.getDescription(), "Le meilleur de Wikipédia sur la Tunisie"); EXPECT_EQ(book1.getCreator(), "Wikipedia"); @@ -272,7 +275,8 @@ TEST(LibraryOpdsImportTest, allInOne) EXPECT_EQ(book2.getTitle(), "TED talks - Business"); EXPECT_EQ(book2.getName(), ""); EXPECT_EQ(book2.getFlavour(), ""); - EXPECT_EQ(book2.getLanguage(), "eng"); + EXPECT_EQ(book2.getLanguages(), Langs{ "eng" }); + EXPECT_EQ(book2.getCommaSeparatedLanguages(), "eng"); EXPECT_EQ(book2.getDate(), "2018-07-23"); EXPECT_EQ(book2.getDescription(), "Ideas worth spreading"); EXPECT_EQ(book2.getCreator(), "TED"); diff --git a/test/manager.cpp b/test/manager.cpp index 34774f630..e2b600644 100644 --- a/test/manager.cpp +++ b/test/manager.cpp @@ -57,7 +57,7 @@ TEST(ManagerTest, readXml) EXPECT_EQ("https://example.com/zimfiles/unittest.zim", book.getUrl()); EXPECT_EQ("Unit Test", book.getTitle()); EXPECT_EQ("Wikipedia articles about unit testing", book.getDescription()); - EXPECT_EQ("eng", book.getLanguage()); + EXPECT_EQ("eng", book.getCommaSeparatedLanguages()); EXPECT_EQ("Wikipedia", book.getCreator()); EXPECT_EQ("Kiwix", book.getPublisher()); EXPECT_EQ("2020-03-31", book.getDate());