Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for multilang ZIMs #904

Merged
merged 6 commits into from
Mar 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/book.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,9 @@ class Book
bool isPathValid() const { return m_pathValid; }
const std::string& getTitle() const { return m_title; }
const std::string& getDescription() const { return m_description; }
const std::string& getLanguage() const { return m_language; }
DEPRECATED const std::string& getLanguage() const { return m_language; }
const std::string& getCommaSeparatedLanguages() const { return m_language; }
const std::vector<std::string> getLanguages() const;
const std::string& getCreator() const { return m_creator; }
const std::string& getPublisher() const { return m_publisher; }
const std::string& getDate() const { return m_date; }
Expand Down
5 changes: 5 additions & 0 deletions src/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,4 +286,9 @@ std::string Book::getCategoryFromTags() const
}
}

const std::vector<std::string> Book::getLanguages() const
{
return kiwix::split(m_language, ",");
}

}
37 changes: 28 additions & 9 deletions src/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,12 +373,27 @@ std::vector<std::string> Library::getBookPropValueSet(BookStrPropMemFn p) const

std::vector<std::string> Library::getBooksLanguages() const
{
return getBookPropValueSet(&Book::getLanguage);
std::vector<std::string> langs;
for ( const auto& langAndCount : getBooksLanguagesWithCounts() ) {
langs.push_back(langAndCount.first);
}
return langs;
}

Library::AttributeCounts Library::getBooksLanguagesWithCounts() const
{
return getBookAttributeCounts(&Book::getLanguage);
std::lock_guard<std::mutex> lock(m_mutex);
AttributeCounts langsWithCounts;

for (const auto& pair: mp_impl->m_books) {
const auto& book = pair.second;
if (book.getOrigId().empty()) {
for ( const auto& lang : book.getLanguages() ) {
++langsWithCounts[lang];
}
}
}
return langsWithCounts;
}

std::vector<std::string> Library::getBooksCategories() const
Expand Down Expand Up @@ -440,12 +455,14 @@ void Library::updateBookDB(const Book& book)
{
Xapian::Stem stemmer;
Xapian::TermGenerator indexer;
const std::string lang = book.getLanguage();
try {
stemmer = Xapian::Stem(iso639_3ToXapian(lang));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
const auto langs = book.getLanguages();
if ( langs.size() == 1 ) {
try {
stemmer = Xapian::Stem(iso639_3ToXapian(langs[0]));
indexer.set_stemmer(stemmer);
indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
} catch (...) {}
}
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
Xapian::Document doc;
indexer.set_document(doc);

Expand All @@ -460,7 +477,9 @@ void Library::updateBookDB(const Book& book)
// Index all fields for field-based search
indexer.index_text(title, 1, "S");
indexer.index_text(desc, 1, "XD");
indexer.index_text(lang, 1, "L");
for ( const auto& lang : langs ) {
indexer.index_text(lang, 1, "L");
}
indexer.index_text(normalizeText(book.getCreator()), 1, "A");
indexer.index_text(normalizeText(book.getPublisher()), 1, "XP");
indexer.index_text(normalizeText(book.getName()), 1, "XN");
Expand Down
4 changes: 2 additions & 2 deletions src/libxml_dumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void LibXMLDumper::handleBook(Book book, pugi::xml_node root_node) {
if (book.getOrigId().empty()) {
ADD_ATTR_NOT_EMPTY(entry_node, "title", book.getTitle());
ADD_ATTR_NOT_EMPTY(entry_node, "description", book.getDescription());
ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getLanguage());
ADD_ATTR_NOT_EMPTY(entry_node, "language", book.getCommaSeparatedLanguages());
ADD_ATTR_NOT_EMPTY(entry_node, "creator", book.getCreator());
ADD_ATTR_NOT_EMPTY(entry_node, "publisher", book.getPublisher());
ADD_ATTR_NOT_EMPTY(entry_node, "name", book.getName());
Expand Down Expand Up @@ -97,7 +97,7 @@ void LibXMLDumper::handleBookmark(Bookmark bookmark, pugi::xml_node root_node) {
auto book = library->getBookByIdThreadSafe(bookmark.getBookId());
ADD_TEXT_ENTRY(book_node, "id", book.getId());
ADD_TEXT_ENTRY(book_node, "title", book.getTitle());
ADD_TEXT_ENTRY(book_node, "language", book.getLanguage());
ADD_TEXT_ENTRY(book_node, "language", book.getCommaSeparatedLanguages());
ADD_TEXT_ENTRY(book_node, "date", book.getDate());
} catch (...) {
ADD_TEXT_ENTRY(book_node, "id", bookmark.getBookId());
Expand Down
2 changes: 1 addition & 1 deletion src/manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ std::string Manager::addBookFromPathAndGetId(const std::string& pathToOpen,
}

if (!checkMetaData
|| (checkMetaData && !book.getTitle().empty() && !book.getLanguage().empty()
|| (!book.getTitle().empty() && !book.getLanguages().empty()
&& !book.getDate().empty())) {
book.setUrl(url);
manipulator->addBookToLibrary(book);
Expand Down
2 changes: 1 addition & 1 deletion src/opds_dumper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ std::string fullEntryXML(const Book& book, const std::string& rootLocation, cons
{"name", book.getName()},
{"title", book.getTitle()},
{"description", book.getDescription()},
{"language", book.getLanguage()},
{"language", book.getCommaSeparatedLanguages()},
{"content_id", urlEncode(contentId)},
{"updated", bookDate}, // XXX: this should be the entry update datetime
{"book_date", bookDate},
Expand Down
3 changes: 2 additions & 1 deletion src/server/internalServer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,8 @@ typedef std::set<std::string> Languages;
Languages getLanguages(const Library& lib, const Library::BookIdSet& bookIds) {
Languages langs;
for ( const auto& b : bookIds ) {
langs.insert(lib.getBookById(b).getLanguage());
const auto bookLangs = lib.getBookById(b).getLanguages();
langs.insert(bookLangs.begin(), bookLangs.end());
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
}
return langs;
}
Expand Down
108 changes: 56 additions & 52 deletions test/book.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,60 +58,53 @@ TEST(BookTest, updateFromXMLTest)
EXPECT_EQ(defaultIllustration->url, "http://who.org/zara.fav");
}

namespace
{

// Test helper: wraps the given attribute text in a <book ...></book> element,
// parses it, and returns a Book populated from that element.
//
// attr    - raw XML attribute text placed inside the opening <book> tag
// baseDir - forwarded unchanged to Book::updateFromXml (defaults to "")
kiwix::Book makeBook(const std::string& attr, const std::string& baseDir="")
{
  const std::string markup = "<book " + attr + "></book>";
  const XMLDoc parsed(markup);

  kiwix::Book result;
  result.updateFromXml(parsed.child("book"), baseDir);
  return result;
}

} // unnamed namespace

TEST(BookTest, updateFromXMLCategoryHandlingTest)
{
{
const XMLDoc xml(R"(
<book id="abcd"
tags="_category:category_defined_via_tags_only"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
tags="_category:category_defined_via_tags_only"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_defined_via_tags_only");
}
{
const XMLDoc xml(R"(
<book id="abcd"
category="category_defined_via_attribute_only"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
category="category_defined_via_attribute_only"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_defined_via_attribute_only");
}
{
const XMLDoc xml(R"(
<book id="abcd"
category="category_attribute_overrides_tags"
tags="_category:tags_override_category_attribute"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
category="category_attribute_overrides_tags"
tags="_category:tags_override_category_attribute"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags");
}
{
const XMLDoc xml(R"(
<book id="abcd"
tags="_category:tags_override_category_attribute"
category="category_attribute_overrides_tags"
>
</book>
const kiwix::Book book = makeBook(R"(
id="abcd"
tags="_category:tags_override_category_attribute"
category="category_attribute_overrides_tags"
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");

EXPECT_EQ(book.getCategory(), "category_attribute_overrides_tags");
}
}
Expand All @@ -126,10 +119,7 @@ TEST(BookTest, setTagsDoesntAffectCategory)

TEST(BookTest, updateCopiesCategory)
{
const XMLDoc xml(R"(<book id="abcd" category="ted"></book>)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "");
const kiwix::Book book = makeBook(R"(id="abcd" category="ted")");

kiwix::Book newBook;
newBook.setId("abcd");
Expand All @@ -140,20 +130,15 @@ TEST(BookTest, updateCopiesCategory)

TEST(BookTest, updateTest)
{
const XMLDoc xml(R"(
<book id="xyz"
path="/home/user/Downloads/skin-of-color-society_en_all_2019-11.zim"
url="book-url"
name="skin-of-color-society_en_all"
tags="youtube;_videos:yes;_ftindex:yes;_ftindex:yes;_pictures:yes;_details:yes"
favicon="Ym9vay1mYXZpY29u"
faviconMimeType="book-favicon-mimetype"
>
</book>
)");

kiwix::Book book;
book.updateFromXml(xml.child("book"), "/data/zim");
kiwix::Book book = makeBook(R"(
id="xyz"
path="/home/user/Downloads/skin-of-color-society_en_all_2019-11.zim"
url="book-url"
name="skin-of-color-society_en_all"
tags="youtube;_videos:yes;_ftindex:yes;_ftindex:yes;_pictures:yes;_details:yes"
favicon="Ym9vay1mYXZpY29u"
faviconMimeType="book-favicon-mimetype"
)", "/data/zim");

book.setReadOnly(false);
book.setPathValid(true);
Expand Down Expand Up @@ -210,3 +195,22 @@ TEST(BookTest, getHumanReadableIdFromPath)
#endif
EXPECT_EQ("3plus2", path2HumanReadableId("3+2.zim"));
}

// Verifies that the "language" attribute of a book is exposed both verbatim
// (getCommaSeparatedLanguages) and as a list of entries (getLanguages).
TEST(BookTest, getLanguages)
{
typedef std::vector<std::string> Langs;

// Case 1: a single language yields a one-element list.
{
const kiwix::Book book = makeBook(R"(id="abcd" language="fra")");

EXPECT_EQ(book.getCommaSeparatedLanguages(), "fra");
EXPECT_EQ(book.getLanguages(), Langs{ "fra" });
}

// Case 2: a comma-separated value yields one entry per language, in the
// order written in the attribute.
{
const kiwix::Book book = makeBook(R"(id="abcd" language="eng,ong,ing")");

EXPECT_EQ(book.getCommaSeparatedLanguages(), "eng,ong,ing");
// The expected value is wrapped in parentheses so that the commas inside
// the braces are not parsed as extra macro arguments.
EXPECT_EQ(book.getLanguages(), Langs({ "eng", "ong", "ing" }));
}
}
2 changes: 1 addition & 1 deletion test/data/library.xml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim"
title="Ray (uncategorized) Charles"
description="No category is assigned to this library entry."
language="rus"
language="rus,eng"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
Expand Down
14 changes: 9 additions & 5 deletions test/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ const char * sampleOpdsStream = R"(
<id>urn:uuid:0ea1cde6-441d-6c58-f2c7-21c2838e659f</id>
<icon>/meta?name=favicon&amp;content=wikiquote_fr_all_nopic_2019-06</icon>
<updated>2019-06-05T00:00::00:Z</updated>
<language>fra</language>
<language>fra,ita</language>
<summary>Une page de Wikiquote, le recueil des citations libres.</summary>
<category>category_defined_via_category_element_only</category>
<tags>wikiquote;nopic</tags>
Expand Down Expand Up @@ -199,7 +199,7 @@ const char sampleLibraryXML[] = R"(
url="https://github.com/kiwix/libkiwix/raw/master/test/data/zimfile.zim"
title="Ray Charles"
description="Wikipedia articles about Ray Charles"
language="eng"
language="eng,spa"
creator="Wikipedia"
publisher="Kiwix"
date="2020-03-31"
Expand Down Expand Up @@ -234,6 +234,8 @@ const char sampleLibraryXML[] = R"(
namespace
{

typedef std::vector<std::string> Langs;

TEST(LibraryOpdsImportTest, allInOne)
{
kiwix::Library lib;
Expand All @@ -248,7 +250,8 @@ TEST(LibraryOpdsImportTest, allInOne)
EXPECT_EQ(book1.getTitle(), "Encyclopédie de la Tunisie");
EXPECT_EQ(book1.getName(), "wikipedia_fr_tunisie_novid_2018-10");
EXPECT_EQ(book1.getFlavour(), "unforgettable");
EXPECT_EQ(book1.getLanguage(), "fra");
EXPECT_EQ(book1.getLanguages(), Langs{ "fra" });
EXPECT_EQ(book1.getCommaSeparatedLanguages(), "fra");
EXPECT_EQ(book1.getDate(), "8 Oct 2018");
EXPECT_EQ(book1.getDescription(), "Le meilleur de Wikipédia sur la Tunisie");
EXPECT_EQ(book1.getCreator(), "Wikipedia");
Expand All @@ -272,7 +275,8 @@ TEST(LibraryOpdsImportTest, allInOne)
EXPECT_EQ(book2.getTitle(), "TED talks - Business");
EXPECT_EQ(book2.getName(), "");
EXPECT_EQ(book2.getFlavour(), "");
EXPECT_EQ(book2.getLanguage(), "eng");
EXPECT_EQ(book2.getLanguages(), Langs{ "eng" });
EXPECT_EQ(book2.getCommaSeparatedLanguages(), "eng");
EXPECT_EQ(book2.getDate(), "2018-07-23");
EXPECT_EQ(book2.getDescription(), "Ideas worth spreading");
EXPECT_EQ(book2.getCreator(), "TED");
Expand Down Expand Up @@ -344,7 +348,7 @@ TEST_F(LibraryTest, sanityCheck)
{
EXPECT_EQ(lib.getBookCount(true, true), 12U);
EXPECT_EQ(lib.getBooksLanguages(),
std::vector<std::string>({"deu", "eng", "fra"})
std::vector<std::string>({"deu", "eng", "fra", "ita", "spa"})
);
EXPECT_EQ(lib.getBooksCreators(), std::vector<std::string>({
"Islam Stack Exchange",
Expand Down