diff --git a/include/library.h b/include/library.h index 1763de7d5..ef43f2a7e 100644 --- a/include/library.h +++ b/include/library.h @@ -26,6 +26,7 @@ #include #include #include +#include #include "book.h" #include "bookmark.h" @@ -140,6 +141,22 @@ class Filter { bool accept(const Book& book) const; }; + +class ZimSearcher : public zim::Searcher +{ + public: + explicit ZimSearcher(zim::Searcher&& searcher) + : zim::Searcher(searcher) + {} + + std::unique_lock getLock() { + return std::unique_lock(m_mutex); + } + virtual ~ZimSearcher() = default; + private: + std::mutex m_mutex; +}; + /** * A Library store several books. */ @@ -152,6 +169,7 @@ class Library typedef uint64_t Revision; typedef std::vector BookIdCollection; typedef std::map AttributeCounts; + typedef std::set BookIdSet; public: Library(); @@ -207,6 +225,10 @@ class Library DEPRECATED std::shared_ptr getReaderById(const std::string& id); std::shared_ptr getArchiveById(const std::string& id); + std::shared_ptr getSearcherById(const std::string& id) { + return getSearcherByIds(BookIdSet{id}); + } + std::shared_ptr getSearcherByIds(const BookIdSet& ids); /** * Remove a book from the library. @@ -338,7 +360,7 @@ class Library std::vector getBookPropValueSet(BookStrPropMemFn p) const; BookIdCollection filterViaBookDB(const Filter& filter) const; void updateBookDB(const Book& book); - void dropReader(const std::string& bookId); + void dropCache(const std::string& bookId); private: //data std::unique_ptr mp_impl; diff --git a/include/search_renderer.h b/include/search_renderer.h index cd499d13d..22dccef1f 100644 --- a/include/search_renderer.h +++ b/include/search_renderer.h @@ -81,9 +81,9 @@ class SearchRenderer void setSearchPattern(const std::string& pattern); /** - * Set the search content id. + * Set the querystring used to select books */ - void setSearchContent(const std::string& name); + void setSearchBookQuery(const std::string& bookQuery); /** * Set protocol prefix. @@ -112,7 +112,7 @@ class SearchRenderer zim::SearchResultSet m_srs; NameMapper* mp_nameMapper; Library* mp_library; - std::string searchContent; + std::string searchBookQuery; std::string searchPattern; std::string protocolPrefix; std::string searchProtocolPrefix; diff --git a/include/server.h b/include/server.h index b3cd240fd..0cd579c57 100644 --- a/include/server.h +++ b/include/server.h @@ -54,6 +54,7 @@ namespace kiwix void setAddress(const std::string& addr) { m_addr = addr; } void setPort(int port) { m_port = port; } void setNbThreads(int threads) { m_nbThreads = threads; } + void setMultiZimSearchLimit(unsigned int limit) { m_multizimSearchLimit = limit; } void setIpConnectionLimit(int limit) { m_ipConnectionLimit = limit; } void setVerbose(bool verbose) { m_verbose = verbose; } void setIndexTemplateString(const std::string& indexTemplateString) { m_indexTemplateString = indexTemplateString; } @@ -63,7 +64,7 @@ namespace kiwix { m_blockExternalLinks = blockExternalLinks; } int getPort(); std::string getAddress(); - + protected: Library* mp_library; NameMapper* mp_nameMapper; @@ -72,6 +73,7 @@ namespace kiwix std::string m_indexTemplateString = ""; int m_port = 80; int m_nbThreads = 1; + unsigned int m_multizimSearchLimit = 0; bool m_verbose = false; bool m_withTaskbar = true; bool m_withLibraryButton = true; diff --git a/src/library.cpp b/src/library.cpp index f77059404..ad97a0c38 100644 --- a/src/library.cpp +++ b/src/library.cpp @@ -27,10 +27,13 @@ #include "tools/regexTools.h" #include "tools/pathTools.h" #include "tools/stringTools.h" +#include "tools/otherTools.h" +#include "tools/concurrent_cache.h" #include #include #include +#include #include #include @@ -56,6 +59,27 @@ bool booksReferToTheSameArchive(const Book& book1, const Book& book2) && book1.getPath() == book2.getPath(); } +template +class MultiKeyCache: public ConcurrentCache, Value> +{ + public: + explicit MultiKeyCache(size_t maxEntries) + : ConcurrentCache, Value>(maxEntries) + {} + + bool drop(const Key& key) + { + std::unique_lock l(this->lock_); + bool removed = false; + for(auto& cache_key: this->impl_.keys()) { + if(cache_key.find(key)!=cache_key.end()) { + removed |= this->impl_.drop(cache_key); + } + } + return removed; + } +}; + } // unnamed namespace struct Library::Impl @@ -63,15 +87,14 @@ struct Library::Impl struct Entry : Book { Library::Revision lastUpdatedRevision = 0; - - // May also keep the Archive and Reader pointers here and get - // rid of the m_readers and m_archives data members in Library }; Library::Revision m_revision; std::map m_books; - std::map> m_readers; - std::map> m_archives; + using ArchiveCache = ConcurrentCache>; + std::unique_ptr mp_archiveCache; + using SearcherCache = MultiKeyCache>; + std::unique_ptr mp_searcherCache; std::vector m_bookmarks; Xapian::WritableDatabase m_bookDB; @@ -85,7 +108,9 @@ struct Library::Impl }; Library::Impl::Impl() - : m_bookDB("", Xapian::DB_BACKEND_INMEMORY) + : mp_archiveCache(new ArchiveCache(std::max(getEnvVar("KIWIX_ARCHIVE_CACHE_SIZE", 1), 1))), + mp_searcherCache(new SearcherCache(std::max(getEnvVar("KIWIX_SEARCHER_CACHE_SIZE", 1), 1))), + m_bookDB("", Xapian::DB_BACKEND_INMEMORY) { } @@ -139,7 +164,7 @@ bool Library::addBook(const Book& book) try { auto& oldbook = mp_impl->m_books.at(book.getId()); if ( ! booksReferToTheSameArchive(oldbook, book) ) { - dropReader(book.getId()); + dropCache(book.getId()); } oldbook.update(book); // XXX: This may have no effect if oldbook is readonly // XXX: Then m_bookDB will become out-of-sync with @@ -150,6 +175,13 @@ bool Library::addBook(const Book& book) auto& newEntry = mp_impl->m_books[book.getId()]; static_cast(newEntry) = book; newEntry.lastUpdatedRevision = mp_impl->m_revision; + size_t new_cache_size = std::ceil(mp_impl->getBookCount(true, true)*0.1); + if (getEnvVar("KIWIX_ARCHIVE_CACHE_SIZE", -1) <= 0) { + mp_impl->mp_archiveCache->setMaxSize(new_cache_size); + } + if (getEnvVar("KIWIX_SEARCHER_CACHE_SIZE", -1) <= 0) { + mp_impl->mp_searcherCache->setMaxSize(new_cache_size); + } return true; } } @@ -173,17 +205,23 @@ bool Library::removeBookmark(const std::string& zimId, const std::string& url) } -void Library::dropReader(const std::string& id) +void Library::dropCache(const std::string& id) { - mp_impl->m_readers.erase(id); - mp_impl->m_archives.erase(id); + mp_impl->mp_archiveCache->drop(id); + mp_impl->mp_searcherCache->drop(id); } bool Library::removeBookById(const std::string& id) { std::lock_guard lock(m_mutex); mp_impl->m_bookDB.delete_document("Q" + id); - dropReader(id); + dropCache(id); + // We do not change the cache size here + // Most of the time, the book is remove in case of library refresh, it is + // often associated with addBook calls (which will properly set the cache size) + // Having a too big cache is not a problem here (or it would have been before) + // (And setMaxSize doesn't actually reduce the cache size, extra cached items + // will be removed in put or getOrPut). return mp_impl->m_books.erase(id) == 1; } @@ -242,35 +280,49 @@ const Book& Library::getBookByPath(const std::string& path) const std::shared_ptr Library::getReaderById(const std::string& id) { - try { - std::lock_guard lock(m_mutex); - return mp_impl->m_readers.at(id); - } catch (std::out_of_range& e) {} - - const auto archive = getArchiveById(id); - if ( !archive ) + auto archive = getArchiveById(id); + if(archive) { + return std::make_shared(archive); + } else { return nullptr; - - const shared_ptr reader(new Reader(archive, true)); - std::lock_guard lock(m_mutex); - mp_impl->m_readers[id] = reader; - return reader; + } } std::shared_ptr Library::getArchiveById(const std::string& id) { - std::lock_guard lock(m_mutex); try { - return mp_impl->m_archives.at(id); - } catch (std::out_of_range& e) {} - - auto book = getBookById(id); - if (!book.isPathValid()) + return mp_impl->mp_archiveCache->getOrPut(id, + [&](){ + auto book = getBookById(id); + if (!book.isPathValid()) { + throw std::invalid_argument(""); + } + return std::make_shared(book.getPath()); + }); + } catch (std::invalid_argument&) { return nullptr; + } +} - auto sptr = make_shared(book.getPath()); - mp_impl->m_archives[id] = sptr; - return sptr; +std::shared_ptr Library::getSearcherByIds(const BookIdSet& ids) +{ + assert(!ids.empty()); + try { + return mp_impl->mp_searcherCache->getOrPut(ids, + [&](){ + std::vector archives; + for(auto& id:ids) { + auto archive = getArchiveById(id); + if(!archive) { + throw std::invalid_argument(""); + } + archives.push_back(*archive); + } + return std::make_shared(zim::Searcher(archives)); + }); + } catch (std::invalid_argument&) { + return nullptr; + } } unsigned int Library::getBookCount(const bool localBooks, diff --git a/src/search_renderer.cpp b/src/search_renderer.cpp index 8f9685563..8878ead80 100644 --- a/src/search_renderer.cpp +++ b/src/search_renderer.cpp @@ -71,9 +71,9 @@ void SearchRenderer::setSearchPattern(const std::string& pattern) searchPattern = pattern; } -void SearchRenderer::setSearchContent(const std::string& content) +void SearchRenderer::setSearchBookQuery(const std::string& bookQuery) { - searchContent = content; + searchBookQuery = bookQuery; } void SearchRenderer::setProtocolPrefix(const std::string& prefix) @@ -90,13 +90,13 @@ kainjow::mustache::data buildQueryData ( const std::string& searchProtocolPrefix, const std::string& pattern, - const std::string& searchContent + const std::string& bookQuery ) { kainjow::mustache::data query; query.set("pattern", kiwix::encodeDiples(pattern)); std::ostringstream ss; ss << searchProtocolPrefix << "?pattern=" << urlEncode(pattern, true); - ss << "&content=" << urlEncode(searchContent, true); + ss << "&" << bookQuery; query.set("unpaginatedQuery", ss.str()); return query; } @@ -197,7 +197,7 @@ std::string SearchRenderer::getHtml() kainjow::mustache::data query = buildQueryData( searchProtocolPrefix, searchPattern, - searchContent + searchBookQuery ); std::string template_str = RESOURCE::templates::search_result_html; diff --git a/src/server.cpp b/src/server.cpp index 52355afe7..bd478c526 100644 --- a/src/server.cpp +++ b/src/server.cpp @@ -45,6 +45,7 @@ bool Server::start() { m_port, m_root, m_nbThreads, + m_multizimSearchLimit, m_verbose, m_withTaskbar, m_withLibraryButton, diff --git a/src/server/internalServer.cpp b/src/server/internalServer.cpp index 0290d79a7..772f6a4da 100644 --- a/src/server/internalServer.cpp +++ b/src/server/internalServer.cpp @@ -95,35 +95,197 @@ inline std::string normalizeRootUrl(std::string rootUrl) return rootUrl.empty() ? rootUrl : "/" + rootUrl; } -// Returns the value of env var `name` if found, otherwise returns defaultVal -unsigned int getCacheLength(const char* name, unsigned int defaultVal) { - try { - const char* envString = std::getenv(name); - if (envString == nullptr) { - throw std::runtime_error("Environment variable not set"); - } - return extractFromString(envString); - } catch (...) {} +Filter get_search_filter(const RequestContext& request, const std::string& prefix="") +{ + auto filter = kiwix::Filter().valid(true).local(true); + try { + filter.query(request.get_argument(prefix+"q")); + } catch (const std::out_of_range&) {} + try { + filter.maxSize(request.get_argument(prefix+"maxsize")); + } catch (...) {} + try { + filter.name(request.get_argument(prefix+"name")); + } catch (const std::out_of_range&) {} + try { + filter.category(request.get_argument(prefix+"category")); + } catch (const std::out_of_range&) {} + try { + filter.lang(request.get_argument(prefix+"lang")); + } catch (const std::out_of_range&) {} + try { + filter.acceptTags(kiwix::split(request.get_argument(prefix+"tag"), ";")); + } catch (...) {} + try { + filter.rejectTags(kiwix::split(request.get_argument(prefix+"notag"), ";")); + } catch (...) {} + return filter; +} - return defaultVal; +template +std::vector subrange(const std::vector& v, size_t s, size_t n) +{ + const size_t e = std::min(v.size(), s+n); + return std::vector(v.begin()+std::min(v.size(), s), v.begin()+e); } + +std::string renderUrl(const std::string& root, const std::string& urlTemplate) +{ + MustacheData data; + data.set("root", root); + auto url = kainjow::mustache::mustache(urlTemplate).render(data); + if ( url.back() == '\n' ) + url.pop_back(); + return url; +} + +std::string makeFulltextSearchSuggestion(const std::string& lang, const std::string& queryString) +{ + return i18n::expandParameterizedString(lang, "suggest-full-text-search", + { + {"SEARCH_TERMS", queryString} + } + ); +} + +ParameterizedMessage noSuchBookErrorMsg(const std::string& bookName) +{ + return ParameterizedMessage("no-such-book", { {"BOOK_NAME", bookName} }); +} + +ParameterizedMessage invalidRawAccessMsg(const std::string& dt) +{ + return ParameterizedMessage("invalid-raw-data-type", { {"DATATYPE", dt} }); +} + +ParameterizedMessage noValueForArgMsg(const std::string& argument) +{ + return ParameterizedMessage("no-value-for-arg", { {"ARGUMENT", argument} }); +} + +ParameterizedMessage rawEntryNotFoundMsg(const std::string& dt, const std::string& entry) +{ + return ParameterizedMessage("raw-entry-not-found", + { + {"DATATYPE", dt}, + {"ENTRY", entry}, + } + ); +} + +ParameterizedMessage tooManyBooksMsg(size_t nbBooks, size_t limit) +{ + return ParameterizedMessage("too-many-books", + { + {"NB_BOOKS", beautifyInteger(nbBooks)}, + {"LIMIT", beautifyInteger(limit)}, + } + ); +} + +ParameterizedMessage nonParameterizedMessage(const std::string& msgId) +{ + const ParameterizedMessage::Parameters noParams; + return ParameterizedMessage(msgId, noParams); +} + +struct Error : public std::runtime_error { + explicit Error(const ParameterizedMessage& message) + : std::runtime_error("Error while handling request"), + _message(message) + {} + + const ParameterizedMessage& message() const + { + return _message; + } + + const ParameterizedMessage _message; +}; + +void checkBookNumber(const Library::BookIdSet& bookIds, size_t limit) { + if (bookIds.empty()) { + throw Error(nonParameterizedMessage("no-book-found")); + } + if (limit > 0 && bookIds.size() > limit) { + throw Error(tooManyBooksMsg(bookIds.size(), limit)); + } +} + } // unnamed namespace -SearchInfo::SearchInfo(const std::string& pattern) - : pattern(pattern), - geoQuery() -{} +std::pair InternalServer::selectBooks(const RequestContext& request) const +{ + // Try old API + try { + auto bookName = request.get_argument("content"); + try { + const auto bookIds = Library::BookIdSet{mp_nameMapper->getIdForName(bookName)}; + const auto queryString = request.get_query([&](const std::string& key){return key == "content";}, true); + return {queryString, bookIds}; + } catch (const std::out_of_range&) { + throw Error(noSuchBookErrorMsg(bookName)); + } + } catch(const std::out_of_range&) { + // We've catch the out_of_range of get_argument + // continue + } -SearchInfo::SearchInfo(const std::string& pattern, GeoQuery geoQuery) - : pattern(pattern), - geoQuery(geoQuery) -{} + // Does user directly gives us ids ? + try { + auto id_vec = request.get_arguments("books.id"); + if (id_vec.empty()) { + throw Error(noValueForArgMsg("books.id")); + } + for(const auto& bookId: id_vec) { + try { + // This is a silly way to check that bookId exists + mp_nameMapper->getNameForId(bookId); + } catch (const std::out_of_range&) { + throw Error(noSuchBookErrorMsg(bookId)); + } + } + const auto bookIds = Library::BookIdSet(id_vec.begin(), id_vec.end()); + const auto queryString = request.get_query([&](const std::string& key){return key == "books.id";}, true); + return {queryString, bookIds}; + } catch(const std::out_of_range&) {} + + // Use the names + try { + auto name_vec = request.get_arguments("books.name"); + if (name_vec.empty()) { + throw Error(noValueForArgMsg("books.name")); + } + Library::BookIdSet bookIds; + for(const auto& bookName: name_vec) { + try { + bookIds.insert(mp_nameMapper->getIdForName(bookName)); + } catch(const std::out_of_range&) { + throw Error(noSuchBookErrorMsg(bookName)); + } + } + const auto queryString = request.get_query([&](const std::string& key){return key == "books.name";}, true); + return {queryString, bookIds}; + } catch(const std::out_of_range&) {} + + // Check for filtering + Filter filter = get_search_filter(request, "books.filter."); + auto id_vec = mp_library->filter(filter); + if (id_vec.empty()) { + throw Error(nonParameterizedMessage("no-book-found")); + } + const auto bookIds = Library::BookIdSet(id_vec.begin(), id_vec.end()); + const auto queryString = request.get_query([&](const std::string& key){return startsWith(key, "books.filter.");}, true); + return {queryString, bookIds}; +} -SearchInfo::SearchInfo(const RequestContext& request) - : pattern(request.get_optional_param("pattern", "")), - geoQuery(), - bookName(request.get_optional_param("content", "")) +SearchInfo InternalServer::getSearchInfo(const RequestContext& request) const { + auto bookIds = selectBooks(request); + checkBookNumber(bookIds.second, m_multizimSearchLimit); + auto pattern = request.get_optional_param("pattern", ""); + GeoQuery geoQuery; + /* Retrive geo search */ try { auto latitude = request.get_argument("latitude"); @@ -134,10 +296,19 @@ SearchInfo::SearchInfo(const RequestContext& request) catch(const std::invalid_argument&) {} if (!geoQuery && pattern.empty()) { - throw std::invalid_argument("No query provided."); + throw Error(nonParameterizedMessage("no-query")); } + + return SearchInfo(pattern, geoQuery, bookIds.second, bookIds.first); } +SearchInfo::SearchInfo(const std::string& pattern, GeoQuery geoQuery, const Library::BookIdSet& bookIds, const std::string& bookFilterQuery) + : pattern(pattern), + geoQuery(geoQuery), + bookIds(bookIds), + bookFilterQuery(bookFilterQuery) +{} + zim::Query SearchInfo::getZimQuery(bool verbose) const { zim::Query query; if (verbose) { @@ -175,6 +346,7 @@ InternalServer::InternalServer(Library* library, int port, std::string root, int nbThreads, + unsigned int multizimSearchLimit, bool verbose, bool withTaskbar, bool withLibraryButton, @@ -185,6 +357,7 @@ InternalServer::InternalServer(Library* library, m_port(port), m_root(normalizeRootUrl(root)), m_nbThreads(nbThreads), + m_multizimSearchLimit(multizimSearchLimit), m_verbose(verbose), m_withTaskbar(withTaskbar), m_withLibraryButton(withLibraryButton), @@ -194,9 +367,8 @@ InternalServer::InternalServer(Library* library, mp_daemon(nullptr), mp_library(library), mp_nameMapper(nameMapper ? nameMapper : &defaultNameMapper), - searcherCache(getCacheLength("SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))), - searchCache(getCacheLength("SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)), - suggestionSearcherCache(getCacheLength("SUGGESTION_SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))) + searchCache(getEnvVar("KIWIX_SEARCH_CACHE_SIZE", DEFAULT_CACHE_SIZE)), + suggestionSearcherCache(getEnvVar("KIWIX_SUGGESTION_SEARCHER_CACHE_SIZE", std::max((unsigned int) (mp_library->getBookCount(true, true)*0.1), 1U))) {} bool InternalServer::start() { @@ -439,56 +611,6 @@ SuggestionsList_t getSuggestions(SuggestionSearcherCache& cache, const zim::Arch return suggestions; } -namespace -{ - -std::string renderUrl(const std::string& root, const std::string& urlTemplate) -{ - MustacheData data; - data.set("root", root); - auto url = kainjow::mustache::mustache(urlTemplate).render(data); - if ( url.back() == '\n' ) - url.pop_back(); - return url; -} - -std::string makeFulltextSearchSuggestion(const std::string& lang, const std::string& queryString) -{ - return i18n::expandParameterizedString(lang, "suggest-full-text-search", - { - {"SEARCH_TERMS", queryString} - } - ); -} - -ParameterizedMessage noSuchBookErrorMsg(const std::string& bookName) -{ - return ParameterizedMessage("no-such-book", { {"BOOK_NAME", bookName} }); -} - -ParameterizedMessage invalidRawAccessMsg(const std::string& dt) -{ - return ParameterizedMessage("invalid-raw-data-type", { {"DATATYPE", dt} }); -} - -ParameterizedMessage rawEntryNotFoundMsg(const std::string& dt, const std::string& entry) -{ - return ParameterizedMessage("raw-entry-not-found", - { - {"DATATYPE", dt}, - {"ENTRY", entry}, - } - ); -} - -ParameterizedMessage nonParameterizedMessage(const std::string& msgId) -{ - const ParameterizedMessage::Parameters noParams; - return ParameterizedMessage(msgId, noParams); -} - -} // unnamed namespace - std::unique_ptr InternalServer::handle_suggest(const RequestContext& request) { if (m_verbose.load()) { @@ -591,41 +713,18 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re } try { - auto searchInfo = SearchInfo(request); - - std::string bookId; - std::shared_ptr archive; - if (!searchInfo.bookName.empty()) { - try { - bookId = mp_nameMapper->getIdForName(searchInfo.bookName); - archive = mp_library->getArchiveById(bookId); - } catch (const std::out_of_range&) { - throw std::invalid_argument("The requested book doesn't exist."); - } - } - + auto searchInfo = getSearchInfo(request); + auto bookIds = searchInfo.getBookIds(); /* Make the search */ // Try to get a search from the searchInfo, else build it + auto searcher = mp_library->getSearcherByIds(bookIds); + auto lock(searcher->getLock()); + std::shared_ptr search; try { search = searchCache.getOrPut(searchInfo, [=](){ - std::shared_ptr searcher; - if (archive) { - searcher = searcherCache.getOrPut(bookId, [=](){ return std::make_shared(*archive);}); - } else { - for (auto& bookId: mp_library->filter(kiwix::Filter().local(true).valid(true))) { - auto currentArchive = mp_library->getArchiveById(bookId); - if (currentArchive) { - if (! searcher) { - searcher = std::make_shared(*currentArchive); - } else { - searcher->addArchive(*currentArchive); - } - } - } - } return make_shared(searcher->search(searchInfo.getZimQuery(m_verbose.load()))); } ); @@ -638,11 +737,14 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re "404-page-heading", cssUrl); response += nonParameterizedMessage("no-search-results"); - response += TaskbarInfo(searchInfo.bookName, archive.get()); + if(bookIds.size() == 1) { + auto bookId = *bookIds.begin(); + auto bookName = mp_nameMapper->getNameForId(bookId); + response += TaskbarInfo(bookName, mp_library->getArchiveById(bookId).get()); + } return response; } - auto start = 0; try { start = request.get_argument("start"); @@ -663,17 +765,21 @@ std::unique_ptr InternalServer::handle_search(const RequestContext& re SearchRenderer renderer(search->getResults(start, pageLength), mp_nameMapper, mp_library, start, search->getEstimatedMatches()); renderer.setSearchPattern(searchInfo.pattern); - renderer.setSearchContent(searchInfo.bookName); + renderer.setSearchBookQuery(searchInfo.bookFilterQuery); renderer.setProtocolPrefix(m_root + "/"); renderer.setSearchProtocolPrefix(m_root + "/search"); renderer.setPageLength(pageLength); auto response = ContentResponse::build(*this, renderer.getHtml(), "text/html; charset=utf-8"); - response->set_taskbar(searchInfo.bookName, archive.get()); + if(bookIds.size() == 1) { + auto bookId = *bookIds.begin(); + auto bookName = mp_nameMapper->getNameForId(bookId); + response->set_taskbar(bookName, mp_library->getArchiveById(bookId).get()); + } return std::move(response); - } catch (const std::invalid_argument& e) { + } catch (const Error& e) { return HTTP400HtmlResponse(*this, request) + invalidUrlMsg - + std::string(e.what()); + + e.message(); } } @@ -776,45 +882,6 @@ std::unique_ptr InternalServer::handle_catalog(const RequestContext& r return std::move(response); } -namespace -{ - -Filter get_search_filter(const RequestContext& request) -{ - auto filter = kiwix::Filter().valid(true).local(true); - try { - filter.query(request.get_argument("q")); - } catch (const std::out_of_range&) {} - try { - filter.maxSize(request.get_argument("maxsize")); - } catch (...) {} - try { - filter.name(request.get_argument("name")); - } catch (const std::out_of_range&) {} - try { - filter.category(request.get_argument("category")); - } catch (const std::out_of_range&) {} - try { - filter.lang(request.get_argument("lang")); - } catch (const std::out_of_range&) {} - try { - filter.acceptTags(kiwix::split(request.get_argument("tag"), ";")); - } catch (...) {} - try { - filter.rejectTags(kiwix::split(request.get_argument("notag"), ";")); - } catch (...) {} - return filter; -} - -template -std::vector subrange(const std::vector& v, size_t s, size_t n) -{ - const size_t e = std::min(v.size(), s+n); - return std::vector(v.begin()+std::min(v.size(), s), v.begin()+e); -} - -} // unnamed namespace - std::vector InternalServer::search_catalog(const RequestContext& request, kiwix::OPDSDumper& opdsDumper) diff --git a/src/server/internalServer.h b/src/server/internalServer.h index 69e8166c4..a0cffa95b 100644 --- a/src/server/internalServer.h +++ b/src/server/internalServer.h @@ -68,27 +68,26 @@ struct GeoQuery { class SearchInfo { public: - SearchInfo(const std::string& pattern); - SearchInfo(const std::string& pattern, GeoQuery geoQuery); - SearchInfo(const RequestContext& request); + SearchInfo(const std::string& pattern, GeoQuery geoQuery, const Library::BookIdSet& bookIds, const std::string& bookFilterString); zim::Query getZimQuery(bool verbose) const; + const Library::BookIdSet& getBookIds() const { return bookIds; } friend bool operator<(const SearchInfo& l, const SearchInfo& r) { - return std::tie(l.bookName, l.pattern, l.geoQuery) - < std::tie(r.bookName, r.pattern, r.geoQuery); // keep the same order + return std::tie(l.bookIds, l.pattern, l.geoQuery) + < std::tie(r.bookIds, r.pattern, r.geoQuery); // keep the same order } public: //data std::string pattern; GeoQuery geoQuery; - std::string bookName; + Library::BookIdSet bookIds; + std::string bookFilterQuery; }; typedef kainjow::mustache::data MustacheData; -typedef ConcurrentCache> SearcherCache; typedef ConcurrentCache> SearchCache; typedef ConcurrentCache> SuggestionSearcherCache; @@ -103,6 +102,7 @@ class InternalServer { int port, std::string root, int nbThreads, + unsigned int multizimSearchLimit, bool verbose, bool withTaskbar, bool withLibraryButton, @@ -150,12 +150,15 @@ class InternalServer { bool etag_not_needed(const RequestContext& r) const; ETag get_matching_if_none_match_etag(const RequestContext& request) const; + std::pair selectBooks(const RequestContext& r) const; + SearchInfo getSearchInfo(const RequestContext& r) const; private: // data std::string m_addr; int m_port; std::string m_root; int m_nbThreads; + unsigned int m_multizimSearchLimit; std::atomic_bool m_verbose; bool m_withTaskbar; bool m_withLibraryButton; @@ -167,7 +170,6 @@ class InternalServer { Library* mp_library; NameMapper* mp_nameMapper; - SearcherCache searcherCache; SearchCache searchCache; SuggestionSearcherCache suggestionSearcherCache; diff --git a/src/server/request_context.cpp b/src/server/request_context.cpp index 6b435e5ad..53a5cde21 100644 --- a/src/server/request_context.cpp +++ b/src/server/request_context.cpp @@ -106,7 +106,7 @@ MHD_Result RequestContext::fill_argument(void *__this, enum MHD_ValueKind kind, const char *key, const char* value) { RequestContext *_this = static_cast(__this); - _this->arguments[key] = value == nullptr ? "" : value; + _this->arguments[key].push_back(value == nullptr ? "" : value); return MHD_YES; } @@ -121,8 +121,14 @@ void RequestContext::print_debug_info() const { printf(" - %s : '%s'\n", it->first.c_str(), it->second.c_str()); } printf("arguments :\n"); - for (auto it=arguments.begin(); it!=arguments.end(); it++) { - printf(" - %s : '%s'\n", it->first.c_str(), it->second.c_str()); + for (auto& pair:arguments) { + printf(" - %s :", pair.first.c_str()); + bool first = true; + for (auto& v: pair.second) { + printf("%s %s", first?"":",", v.c_str()); + first = false; + } + printf("\n"); } printf("Parsed : \n"); printf("full_url: %s\n", full_url.c_str()); @@ -176,23 +182,13 @@ ByteRange RequestContext::get_range() const { template<> std::string RequestContext::get_argument(const std::string& name) const { - return arguments.at(name); + return arguments.at(name)[0]; } std::string RequestContext::get_header(const std::string& name) const { return headers.at(lcAll(name)); } -std::string RequestContext::get_query() const { - std::string q; - const char* sep = ""; - for ( const auto& a : arguments ) { - q += sep + a.first + '=' + a.second; - sep = "&"; - } - return q; -} - std::string RequestContext::get_user_language() const { try { diff --git a/src/server/request_context.h b/src/server/request_context.h index 2c4c8902e..f0b79b58e 100644 --- a/src/server/request_context.h +++ b/src/server/request_context.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include "byte_range.h" @@ -69,7 +70,11 @@ class RequestContext { std::string get_header(const std::string& name) const; template T get_argument(const std::string& name) const { - return extractFromString(arguments.at(name)); + return extractFromString(get_argument(name)); + } + + std::vector get_arguments(const std::string& name) const { + return arguments.at(name); } template @@ -86,7 +91,27 @@ class RequestContext { std::string get_url() const; std::string get_url_part(int part) const; std::string get_full_url() const; - std::string get_query() const; + + std::string get_query(bool mustEncode = false) const { + return get_query([](const std::string& key) {return true;}, mustEncode); + } + + template + std::string get_query(F filter, bool mustEncode) const { + std::string q; + const char* sep = ""; + auto encode = [=](const std::string& value) { return mustEncode?urlEncode(value, true):value; }; + for ( const auto& a : arguments ) { + if (!filter(a.first)) { + continue; + } + for (const auto& v: a.second) { + q += sep + encode(a.first) + '=' + encode(v); + sep = "&"; + } + } + return q; + } ByteRange get_range() const; @@ -105,7 +130,7 @@ class RequestContext { ByteRange byteRange_; std::map headers; - std::map arguments; + std::map> arguments; private: // functions static MHD_Result fill_header(void *, enum MHD_ValueKind, const char*, const char*); diff --git a/src/tools/concurrent_cache.h b/src/tools/concurrent_cache.h index 1b5175b2e..5195430c5 100644 --- a/src/tools/concurrent_cache.h +++ b/src/tools/concurrent_cache.h @@ -1,3 +1,4 @@ + /* * Copyright (C) 2021 Matthieu Gautier * Copyright (C) 2020 Veloman Yunkan @@ -84,11 +85,123 @@ class ConcurrentCache return impl_.drop(key); } -private: // data + size_t setMaxSize(size_t new_size) { + std::unique_lock l(lock_); + return impl_.setMaxSize(new_size); + } + +protected: // data Impl impl_; std::mutex lock_; }; + +/** + WeakStore represent a thread safe store (map) of weak ptr. + It allows to store weak_ptr from shared_ptr and retrieve shared_ptr from + potential non expired weak_ptr. + It is not limited in size. +*/ +template +class WeakStore { + private: // types + typedef std::weak_ptr WeakValue; + + public: + explicit WeakStore() = default; + + std::shared_ptr get(const Key& key) + { + std::lock_guard l(m_lock); + auto it = m_weakMap.find(key); + if (it != m_weakMap.end()) { + auto shared = it->second.lock(); + if (shared) { + return shared; + } else { + m_weakMap.erase(it); + } + } + throw std::runtime_error("No weak ptr"); + } + + void add(const Key& key, std::shared_ptr shared) + { + std::lock_guard l(m_lock); + m_weakMap[key] = WeakValue(shared); + } + + + private: //data + std::map m_weakMap; + std::mutex m_lock; +}; + +template +class ConcurrentCache> +{ +private: // types + typedef std::shared_ptr Value; + typedef std::shared_future ValuePlaceholder; + typedef lru_cache Impl; + +public: // types + explicit ConcurrentCache(size_t maxEntries) + : impl_(maxEntries) + {} + + // Gets the entry corresponding to the given key. If the entry is not in the + // cache, it is obtained by calling f() (without any arguments) and the + // result is put into the cache. + // + // The cache as a whole is locked only for the duration of accessing + // the respective slot. If, in the case of the a cache miss, the generation + // of the missing element takes a long time, only attempts to access that + // element will block - the rest of the cache remains open to concurrent + // access. + template + Value getOrPut(const Key& key, F f) + { + std::promise valuePromise; + std::unique_lock l(lock_); + const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); + l.unlock(); + if ( x.miss() ) { + // Try to get back the shared_ptr from the weak_ptr first. + try { + valuePromise.set_value(m_weakStore.get(key)); + } catch(const std::runtime_error& e) { + try { + const auto value = f(); + valuePromise.set_value(value); + m_weakStore.add(key, value); + } catch (std::exception& e) { + drop(key); + throw; + } + } + } + + return x.value().get(); + } + + bool drop(const Key& key) + { + std::unique_lock l(lock_); + return impl_.drop(key); + } + + size_t setMaxSize(size_t new_size) { + std::unique_lock l(lock_); + return impl_.setMaxSize(new_size); + } + +protected: // data + std::mutex lock_; + Impl impl_; + WeakStore m_weakStore; +}; + } // namespace kiwix #endif // ZIM_CONCURRENT_CACHE_H diff --git a/src/tools/lrucache.h b/src/tools/lrucache.h index bd90c3128..dd740a644 100644 --- a/src/tools/lrucache.h +++ b/src/tools/lrucache.h @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -138,12 +139,26 @@ class lru_cache { return _cache_items_map.size(); } + size_t setMaxSize(size_t new_size) { + size_t previous = _max_size; + _max_size = new_size; + return previous; + } + + std::set keys() const { + std::set keys; + for(auto& item:_cache_items_map) { + keys.insert(item.first); + } + return keys; + } + private: // functions void putMissing(const key_t& key, const value_t& value) { assert(_cache_items_map.find(key) == _cache_items_map.end()); _cache_items_list.push_front(key_value_pair_t(key, value)); _cache_items_map[key] = _cache_items_list.begin(); - if (_cache_items_map.size() > _max_size) { + while (_cache_items_map.size() > _max_size) { _cache_items_map.erase(_cache_items_list.back().first); _cache_items_list.pop_back(); } diff --git a/src/tools/otherTools.h b/src/tools/otherTools.h index b1297798e..c0920d7bf 100644 --- a/src/tools/otherTools.h +++ b/src/tools/otherTools.h @@ -23,9 +23,12 @@ #include #include #include +#include #include #include +#include "stringTools.h" + namespace pugi { class xml_node; } @@ -53,6 +56,20 @@ namespace kiwix kainjow::mustache::data onlyAsNonEmptyMustacheValue(const std::string& s); std::string render_template(const std::string& template_str, kainjow::mustache::data data); + + template + T getEnvVar(const char* name, const T& defaultValue) + { + try { + const char* envString = std::getenv(name); + if (envString == nullptr) { + throw std::runtime_error("Environment variable not set"); + } + return extractFromString(envString); + } catch (...) {} + + return defaultValue; + } } #endif diff --git a/static/i18n/en.json b/static/i18n/en.json index 183f8b9f4..89c83c915 100644 --- a/static/i18n/en.json +++ b/static/i18n/en.json @@ -4,12 +4,16 @@ ] }, "name":"English", - "suggest-full-text-search": "containing '{{{SEARCH_TERMS}}}'..." - , "no-such-book": "No such book: {{BOOK_NAME}}" + "suggest-full-text-search" : "containing '{{{SEARCH_TERMS}}}'..." + , "no-such-book" : "No such book: {{BOOK_NAME}}" + , "too-many-books" : "Too many books requested ({{NB_BOOKS}}) where limit is {{LIMIT}}" + , "no-book-found" : "No book matches selection criteria" , "url-not-found" : "The requested URL \"{{url}}\" was not found on this server." , "suggest-search" : "Make a full text search for {{PATTERN}}" , "random-article-failure" : "Oops! Failed to pick a random article :(" , "invalid-raw-data-type" : "{{DATATYPE}} is not a valid request for raw content." + , "no-value-for-arg": "No value provided for argument {{ARGUMENT}}" + , "no-query" : "No query provided." , "raw-entry-not-found" : "Cannot find {{DATATYPE}} entry {{ENTRY}}" , "400-page-title" : "Invalid request" , "400-page-heading" : "Invalid request" diff --git a/static/i18n/qqq.json b/static/i18n/qqq.json index edebb31dc..895cd120f 100644 --- a/static/i18n/qqq.json +++ b/static/i18n/qqq.json @@ -2,16 +2,21 @@ "@metadata": { "authors": [ "Veloman Yunkan", - "Verdy p" + "Verdy p", + "Matthieu Gautier" ] }, "name": "{{Doc-important|Don't write \"English\" in your language!}}\n\n'''Write the name of ''your'' language in its native script.'''\n\nCurrent language to which the string is being translated to.\n\nFor example, write \"français\" when translating to French, or \"Deutsch\" when translating to German.\n\n'''Important:''' Do not use your language’s word for “English”. Use the word that your language uses to refer to itself. If you translate this message to mean “English” in your language, your change will be reverted.", "suggest-full-text-search": "Text appearing in the suggestion list that, when selected, runs a full text search instead of the title search", "no-such-book": "Error text when the requested book is not found in the library", + "too-many-books":"Error text when user request more books than the limit set by the administrator", "url-not-found": "Error text about wrong URL for an HTTP 404 error", + "no-book-found": "Error text when no book matches the selection criteria", "suggest-search": "Suggest a search when the URL points to a non existing article", "random-article-failure": "Failure of the random article selection procedure", "invalid-raw-data-type": "Invalid DATATYPE was used with the /raw endpoint (/raw//DATATYPE/...); allowed values are 'meta' and 'content'", + "no-value-for-arg" : "Error text when no value has been provided for ARGUMENT in the request's query string", + "no-query" : "Error text when no query has been provided for fulltext search", "raw-entry-not-found": "Entry requested via the /raw endpoint was not found", "400-page-title": "Title of the 400 error page", "400-page-heading": "Heading of the 400 error page", diff --git a/test/lrucache.cpp b/test/lrucache.cpp new file mode 100644 index 000000000..ef3aa8789 --- /dev/null +++ b/test/lrucache.cpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014, lamerman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of lamerman nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#include "../src/tools/lrucache.h" +#include "../src/tools/concurrent_cache.h" +#include "gtest/gtest.h" + +const unsigned int NUM_OF_TEST2_RECORDS = 100; +const unsigned int TEST2_CACHE_CAPACITY = 50; + +TEST(CacheTest, SimplePut) { + kiwix::lru_cache cache_lru(1); + cache_lru.put(7, 777); + EXPECT_TRUE(cache_lru.exists(7)); + EXPECT_EQ(777, cache_lru.get(7)); + EXPECT_EQ(1U, cache_lru.size()); +} + +TEST(CacheTest, OverwritingPut) { + kiwix::lru_cache cache_lru(1); + cache_lru.put(7, 777); + cache_lru.put(7, 222); + EXPECT_TRUE(cache_lru.exists(7)); + EXPECT_EQ(222, cache_lru.get(7)); + EXPECT_EQ(1U, cache_lru.size()); +} + +TEST(CacheTest, MissingValue) { + kiwix::lru_cache cache_lru(1); + EXPECT_TRUE(cache_lru.get(7).miss()); + EXPECT_FALSE(cache_lru.get(7).hit()); + EXPECT_THROW(cache_lru.get(7).value(), std::range_error); +} + +TEST(CacheTest, DropValue) { + kiwix::lru_cache cache_lru(3); + cache_lru.put(7, 777); + cache_lru.put(8, 888); + cache_lru.put(9, 999); + EXPECT_EQ(3U, cache_lru.size()); + EXPECT_TRUE(cache_lru.exists(7)); + EXPECT_EQ(777, cache_lru.get(7)); + + EXPECT_TRUE(cache_lru.drop(7)); + + EXPECT_EQ(2U, cache_lru.size()); + EXPECT_FALSE(cache_lru.exists(7)); + EXPECT_THROW(cache_lru.get(7).value(), std::range_error); + + EXPECT_FALSE(cache_lru.drop(7)); +} + +TEST(CacheTest1, KeepsAllValuesWithinCapacity) { + kiwix::lru_cache cache_lru(TEST2_CACHE_CAPACITY); + + for (uint i = 0; i < NUM_OF_TEST2_RECORDS; ++i) { + cache_lru.put(i, i); + } + + for (uint i = 0; i < NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY; ++i) { + EXPECT_FALSE(cache_lru.exists(i)); + } + + for (uint i = NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY; i < NUM_OF_TEST2_RECORDS; ++i) { + EXPECT_TRUE(cache_lru.exists(i)); + EXPECT_EQ((int)i, cache_lru.get(i)); + } + + size_t size = cache_lru.size(); + EXPECT_EQ(TEST2_CACHE_CAPACITY, size); +} + +TEST(ConcurrentCacheTest, handleException) { + kiwix::ConcurrentCache cache(1); + auto val = cache.getOrPut(7, []() { return 777; }); + EXPECT_EQ(val, 777); + EXPECT_THROW(cache.getOrPut(8, []() { throw std::runtime_error("oups"); return 0; }), std::runtime_error); + val = cache.getOrPut(8, []() { return 888; }); + EXPECT_EQ(val, 888); +} + +TEST(ConcurrentCacheTest, weakPtr) { + kiwix::ConcurrentCache> cache(1); + auto refValue = cache.getOrPut(7, []() { return std::make_shared(777); }); + EXPECT_EQ(*refValue, 777); + EXPECT_EQ(refValue.use_count(), 2); + + // This will drop shared(777) from the cache + cache.getOrPut(8, []() { return std::make_shared(888); }); + EXPECT_EQ(refValue.use_count(), 1); + + // We must get the shared value from the weakPtr we have + EXPECT_NO_THROW(cache.getOrPut(7, []() { throw std::runtime_error("oups"); return nullptr; })); + EXPECT_EQ(refValue.use_count(), 2); + + // Drop all ref + cache.getOrPut(8, []() { return std::make_shared(888); }); + refValue.reset(); + + // Be sure we call the construction function + EXPECT_THROW(cache.getOrPut(7, []() { throw std::runtime_error("oups"); return nullptr; }), std::runtime_error); +} diff --git a/test/meson.build b/test/meson.build index 368144c57..4ce7d286b 100644 --- a/test/meson.build +++ b/test/meson.build @@ -10,7 +10,8 @@ tests = [ 'manager', 'name_mapper', 'opds_catalog', - 'server_helper' + 'server_helper', + 'lrucache' ] if build_machine.system() != 'windows' diff --git a/test/server.cpp b/test/server.cpp index 4054a49ba..94db3c944 100644 --- a/test/server.cpp +++ b/test/server.cpp @@ -133,6 +133,7 @@ void ZimFileServer::run(int serverPort, std::string indexTemplateString) server->setNbThreads(2); server->setVerbose(false); server->setTaskbar(withTaskbar, withTaskbar); + server->setMultiZimSearchLimit(3); if (!indexTemplateString.empty()) { server->setIndexTemplateString(indexTemplateString); } @@ -156,6 +157,7 @@ class ServerTest : public ::testing::Test const int PORT = 8001; const ZimFileServer::FilePathCollection ZIMFILES { "./test/zimfile.zim", + "./test/example.zim", "./test/poor.zim", "./test/corner_cases.zim" }; @@ -394,6 +396,10 @@ const char* urls400[] = { "/ROOT/search?content=zimfile", "/ROOT/search?content=non-existing-book&pattern=asdfqwerty", "/ROOT/search?content=non-existing-book&pattern=asd

- No query provided. + Too many books requested (4) where limit is 3

)" }, { /* url */ "/ROOT/search?content=zimfile", @@ -919,7 +925,7 @@ TEST_F(ServerTest, 400WithBodyTesting) The requested URL "/ROOT/search?content=non-existing-book&pattern=asdfqwerty" is not a valid request.

- The requested book doesn't exist. + No such book: non-existing-book

)" }, { /* url */ "/ROOT/search?content=non-existing-book&pattern=a\"