Skip to content

Commit

Permalink
Merge pull request #904 from kiwix/support_for_multilang_zims
Browse files Browse the repository at this point in the history
  • Loading branch information
mgautierfr committed Mar 8, 2023
2 parents 3072513 + eb002ae commit 88de978
Show file tree
Hide file tree
Showing 12 changed files with 126 additions and 87 deletions.
4 changes: 3 additions & 1 deletion include/book.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ class Book
bool isPathValid() const { return m_pathValid; }
const std::string& getTitle() const { return m_title; }
const std::string& getDescription() const { return m_description; }
const std::string& getLanguage() const { return m_language; }
DEPRECATED const std::string& getLanguage() const { return m_language; }
/// Returns the stored language attribute (m_language) unchanged, i.e. as one
/// string in which several languages, if present, are separated by commas.
const std::string& getCommaSeparatedLanguages() const { return m_language; }
/// Returns the book's languages as a list with one entry per language
/// (see getCommaSeparatedLanguages() for the unsplit string).
const std::vector<std::string> getLanguages() const;
const std::string& getCreator() const { return m_creator; }
const std::string& getPublisher() const { return m_publisher; }
const std::string& getDate() const { return m_date; }
Expand Down
5 changes: 5 additions & 0 deletions src/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,4 +286,9 @@ std::string Book::getCategoryFromTags() const
}
}

const std::vector<std::string> Book::getLanguages() const
{
return kiwix::split(m_language, ",");
}

}
37 changes: 28 additions & 9 deletions src/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,27 @@ std::vector<std::string> Library::getBookPropValueSet(BookStrPropMemFn p) const

std::vector<std::string> Library::getBooksLanguages() const
{
return getBookPropValueSet(&Book::getLanguage);
std::vector<std::string> langs;
for ( const auto& langAndCount : getBooksLanguagesWithCounts() ) {
langs.push_back(langAndCount.first);
}
return langs;
}

Library::AttributeCounts Library::getBooksLanguagesWithCounts() const
{
return getBookAttributeCounts(&Book::getLanguage);
std::lock_guard<std::mutex> lock(m_mutex);
AttributeCounts langsWithCounts;

for (const auto& pair: mp_impl->m_books) {
const auto& book = pair.second;
if (book.getOrigId().empty()) {
for ( const auto& lang : book.getLanguages() ) {
++langsWithCounts[lang];
}
}
}
return langsWithCounts;
}

std::vector<std::string> Library::getBooksCategories() const
Expand Down Expand Up @@ -440,12 +455,14 @@ void Library::updateBookDB(const Book& book)
{
Xapian::Stem stemmer;
Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage();
try {
stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
const auto langs = book.getLanguages();
if ( langs.size() == 1 ) {
try {
stemmer = Xapian::Stem(iso639_3ToXapian(langs[0]));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
}
Xapian::Document doc;
indexer.set_document(doc);

Expand All @@ -460,7 +477,9 @@ void Library::updateBookDB(const Book& book)
// Index all fields for field-based search
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");
indexer.index_text(lang, 1, "L");
for ( const auto& lang : langs ) {
indexer.index_text(lang, 1, "L");
}
indexer.index_text(normalizeText(book.getCreator()), 1, "A");
indexer.index_text(normalizeText(book.getPublisher()), 1, "XP");
indexer.index_text(normalizeText(book.getName()), 1, "XN");
Expand Down
4 changes: 2 additions & 2 deletions src/libxml_dumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) {
if (book.getOrigId().empty()) {
ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
ADD_ATTR_NOT_EMPTY(entry_node, "description", book.getDescription());
ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getLanguage());
ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getCommaSeparatedLanguages());
ADD_ATTR_NOT_EMPTY(entry_node, "creator", book.getCreator());
ADD_ATTR_NOT_EMPTY(entry_node, "publisher", book.getPublisher());
ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
Expand Down Expand Up @@ -97,7 +97,7 @@ void LibXMLDumper::handleBookmark(Bookmark bookmark, pugi::xml_node root_node) {
auto book = library->getBookByIdThreadSafe(bookmark.getBookId());
ADD_TEXT_ENTRY(book_node, "id", book.getId());
ADD_TEXT_ENTRY(book_node, "title", book.getTitle());
ADD_TEXT_ENTRY(book_node, "language", book.getLanguage());
ADD_TEXT_ENTRY(book_node, "language", book.getCommaSeparatedLanguages());
ADD_TEXT_ENTRY(book_node, "date", book.getDate());
} catch (...) {
ADD_TEXT_ENTRY(book_node, "id", bookmark.getBookId());
Expand Down
2 changes: 1 addition & 1 deletion src/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ std::string Manager::addBookFromPathAndGetId(const std::string& pathToOpen,
}

if (!checkMetaData
|| (checkMetaData && !book.getTitle().empty() && !book.getLanguage().empty()
|| (!book.getTitle().empty() && !book.getLanguages().empty()
&& !book.getDate().empty())) {
book.setUrl(url);
manipulator->addBookToLibrary(book);
Expand Down
2 changes: 1 addition & 1 deletion src/opds_dumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ std::string fullEntryXML(const Book& book, const std::string& rootLocation, cons
{"name", book.getName()},
{"title", book.getTitle()},
{"description", book.getDescription()},
{"language", book.getLanguage()},
{"language", book.getCommaSeparatedLanguages()},
{"content_id", urlEncode(contentId)},
{"updated", bookDate}, // XXX: this should be the entry update datetime
{"book_date", bookDate},
Expand Down
3 changes: 2 additions & 1 deletion src/server/internalServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ typedef std::set<std::string> Languages;
Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) {
Languages langs;
for ( const auto& b : bookIds ) {
langs.insert(lib.getBookById(b).getLanguage());
const auto bookLangs = lib.getBookById(b).getLanguages();
langs.insert(bookLangs.begin(), bookLangs.end());
}
return langs;
}
Expand Down
108 changes: 56 additions & 52 deletions test/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,60 +58,53 @@ TEST(BookTest, updateFromXMLTest)
EXPECT_EQ(defaultIllustration->url, "http://who.org/zara.fav");
}

namespace
{

// Test helper: wraps the given attribute text in a <book> element, builds an
// XMLDoc from that markup and returns a kiwix::Book filled in from the
// element via updateFromXml(). baseDir is forwarded unchanged as the second
// argument of updateFromXml().
kiwix::Book makeBook(const std::string& attr, const std::string& baseDir="")
{
  const std::string bookMarkup = "<book " + attr + "></book>";
  const XMLDoc parsedDoc(bookMarkup);

  kiwix::Book result;
  result.updateFromXml(parsedDoc.child("book"), baseDir);
  return result;
}

} // unnamed namespace

TEST(BookTest, updateFromXMLCategoryHandlingTest)
{
{
const XMLDoc xml(R"(
<book id="abcd"
tags="_category:category_defined_via_tags_only"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
tags="_category:category_defined_via_tags_only"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_defined_via_tags_only");
}
{
const XMLDoc xml(R"(
<book id="abcd"
category="category_defined_via_attribute_only"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
category="category_defined_via_attribute_only"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_defined_via_attribute_only");
}
{
const XMLDoc xml(R"(
<book id="abcd"
category="category_attribute_overrides_tags"
tags="_category:tags_override_category_attribute"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
category="category_attribute_overrides_tags"
tags="_category:tags_override_category_attribute"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags");
}
{
const XMLDoc xml(R"(
<book id="abcd"
tags="_category:tags_override_category_attribute"
category="category_attribute_overrides_tags"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
tags="_category:tags_override_category_attribute"
category="category_attribute_overrides_tags"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags");
}
}
Expand All @@ -126,10 +119,7 @@ TEST(BookTest, setTagsDoesntAffectCategory)

TEST(BookTest, updateCopiesCategory)
{
const XMLDoc xml(R"(<book id="abcd" category="ted"></book>)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");
const kiwix::Book book = makeBook(R"(id="abcd" category="ted")");

kiwix::Book newBook;
newBook.setId("abcd");
Expand All @@ -140,20 +130,15 @@ TEST(BookTest, updateCopiesCategory)

TEST(BookTest, updateTest)
{
const XMLDoc xml(R"(
<book id="xyz"
path="/home/user/Downloads/skin-of-color-society_en_all_2019-11.zim"
url="book-url"
name="skin-of-color-society_en_all"
tags="youtube;_videos:yes;_ftindex:yes;_ftindex:yes;_pictures:yes;_details:yes"
favicon="Ym9vay1mYXZpY29u"
faviconMimeType="book-favicon-mimetype"
>
</book>
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "/data/zim");
kiwix::Book book = makeBook(R"(
id="xyz"
path="/home/user/Downloads/skin-of-color-society_en_all_2019-11.zim"
url="book-url"
name="skin-of-color-society_en_all"
tags="youtube;_videos:yes;_ftindex:yes;_ftindex:yes;_pictures:yes;_details:yes"
favicon="Ym9vay1mYXZpY29u"
faviconMimeType="book-favicon-mimetype"
)", "/data/zim");

book.setReadOnly(false);
book.setPathValid(true);
Expand Down Expand Up @@ -210,3 +195,22 @@ TEST(BookTest, getHumanReadableIdFromPath)
#endif
EXPECT_EQ("3plus2", path2HumanReadableId("3+2.zim"));
}

// Checks that the language attribute is exposed both verbatim
// (getCommaSeparatedLanguages) and as a list of entries (getLanguages),
// for a single-language and for a multi-language book.
TEST(BookTest, getLanguages)
{
  using Langs = std::vector<std::string>;

  {
    const kiwix::Book singleLangBook = makeBook(R"(id="abcd" language="fra")");
    const Langs expectedLangs{ "fra" };

    EXPECT_EQ(singleLangBook.getCommaSeparatedLanguages(), "fra");
    EXPECT_EQ(singleLangBook.getLanguages(), expectedLangs);
  }

  {
    const kiwix::Book multiLangBook = makeBook(R"(id="abcd" language="eng,ong,ing")");
    // Built outside the macro call: a bare braced list containing commas
    // would be split into several macro arguments.
    const Langs expectedLangs{ "eng", "ong", "ing" };

    EXPECT_EQ(multiLangBook.getCommaSeparatedLanguages(), "eng,ong,ing");
    EXPECT_EQ(multiLangBook.getLanguages(), expectedLangs);
  }
}
2 changes: 1 addition & 1 deletion test/data/library.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim"
title="Ray (uncategorized) Charles"
description="No category is assigned to this library entry."
language="rus"
language="rus,eng"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
Expand Down
14 changes: 9 additions & 5 deletions test/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ const char * sampleOpdsStream = R"(
<id>urn:uuid:0ea1cde6-441d-6c58-f2c7-21c2838e659f</id>
<icon>/meta?name=favicon&amp;content=wikiquote_fr_all_nopic_2019-06</icon>
<updated>2019-06-05T00:00::00:Z</updated>
<language>fra</language>
<language>fra,ita</language>
<summary>Une page de Wikiquote, le recueil des citations libres.</summary>
<category>category_defined_via_category_element_only</category>
<tags>wikiquote;nopic</tags>
Expand Down Expand Up @@ -199,7 +199,7 @@ const char sampleLibraryXML[] = R"(
url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim"
title="Ray Charles"
description="Wikipedia articles about Ray Charles"
language="eng"
language="eng,spa"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
Expand Down Expand Up @@ -234,6 +234,8 @@ const char sampleLibraryXML[] = R"(
namespace
{

typedef std::vector<std::string> Langs;

TEST(LibraryOpdsImportTest, allInOne)
{
kiwix::Library lib;
Expand All @@ -248,7 +250,8 @@ TEST(LibraryOpdsImportTest, allInOne)
EXPECT_EQ(book1.getTitle(), "Encyclopédie de la Tunisie");
EXPECT_EQ(book1.getName(), "wikipedia_fr_tunisie_novid_2018-10");
EXPECT_EQ(book1.getFlavour(), "unforgettable");
EXPECT_EQ(book1.getLanguage(), "fra");
EXPECT_EQ(book1.getLanguages(), Langs{ "fra" });
EXPECT_EQ(book1.getCommaSeparatedLanguages(), "fra");
EXPECT_EQ(book1.getDate(), "8 Oct 2018");
EXPECT_EQ(book1.getDescription(), "Le meilleur de Wikipédia sur la Tunisie");
EXPECT_EQ(book1.getCreator(), "Wikipedia");
Expand All @@ -272,7 +275,8 @@ TEST(LibraryOpdsImportTest, allInOne)
EXPECT_EQ(book2.getTitle(), "TED talks - Business");
EXPECT_EQ(book2.getName(), "");
EXPECT_EQ(book2.getFlavour(), "");
EXPECT_EQ(book2.getLanguage(), "eng");
EXPECT_EQ(book2.getLanguages(), Langs{ "eng" });
EXPECT_EQ(book2.getCommaSeparatedLanguages(), "eng");
EXPECT_EQ(book2.getDate(), "2018-07-23");
EXPECT_EQ(book2.getDescription(), "Ideas worth spreading");
EXPECT_EQ(book2.getCreator(), "TED");
Expand Down Expand Up @@ -344,7 +348,7 @@ TEST_F(LibraryTest, sanityCheck)
{
EXPECT_EQ(lib.getBookCount(true, true), 12U);
EXPECT_EQ(lib.getBooksLanguages(),
std::vector<std::string>({"deu", "eng", "fra"})
std::vector<std::string>({"deu", "eng", "fra", "ita", "spa"})
);
EXPECT_EQ(lib.getBooksCreators(), std::vector<std::string>({
"Islam Stack Exchange",
Expand Down
Loading

0 comments on commit 88de978

Please sign in to comment.