Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Better handling of multi search #729

Merged
merged 29 commits into from
Jun 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
7c688a4
Move `getCacheLength` to a generic helper function `getEnvVar`
mgautierfr Mar 21, 2022
28fb76b
Remove m_readers in `Library::impl`
mgautierfr Apr 26, 2022
582e3ec
Use a concurrent cache to store Archive cache.
mgautierfr Apr 26, 2022
740581c
Link the cache size to the book count.
mgautierfr Apr 26, 2022
f5af063
Move the searcher cache into the Library
mgautierfr Apr 27, 2022
fd0edbb
Use a set of id as key for a the searcher Cache.
mgautierfr Mar 22, 2022
8546236
Use the newly introduced searcherCache for multizim searcher.
mgautierfr Mar 22, 2022
98c54b2
Handle multiple arguments in RequestContext.
mgautierfr Mar 22, 2022
22996e4
Allow user to select multiple books when doing search.
mgautierfr Apr 20, 2022
76ebfd7
Move get_search_filter and subrange.
mgautierfr Mar 30, 2022
4438106
Add a prefix in get_search_filter
mgautierfr Apr 20, 2022
76d5faf
Introduce `selectBooks`
mgautierfr Mar 30, 2022
39d0a56
Use selectBooks in handle_search
mgautierfr Apr 20, 2022
077ceac
Make the search_rendered handle multizim search.
mgautierfr Mar 22, 2022
c721320
Move i18n helper functions
mgautierfr Apr 21, 2022
f0065fd
Introduce Error exception to do i18n
mgautierfr Apr 21, 2022
cf30233
Prefix env variable name with `KIWIX_`
mgautierfr Apr 27, 2022
b74910b
Limit the number of zim in multizim fulltext search.
mgautierfr May 23, 2022
0081b4d
Make the limit of zim files per search configurable.
mgautierfr May 23, 2022
2b38d2c
Copy the lrucache test from libzim.
mgautierfr May 24, 2022
e5ea210
Add a template specialization for ConcurrentCache storing shared_ptr
mgautierfr May 24, 2022
1514661
Protect search from multi threading race condition.
mgautierfr May 24, 2022
3b3d7ad
Preparing to enhance the search results testsuite
veloman-yunkan May 30, 2022
f45962c
First test case for multizim search
veloman-yunkan May 30, 2022
e2ab7fd
Add some more testing.
mgautierfr May 31, 2022
b483a8e
Make the request_context be able to generate a querystring for a subset.
mgautierfr May 30, 2022
b857293
Build the bookSelection query string when we parse the query.
mgautierfr May 30, 2022
3bca433
Correctly url encode querystring
mgautierfr May 31, 2022
a7651d0
Check early that provided bookIds are valid
mgautierfr May 30, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 23 additions & 1 deletion include/library.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <memory>
#include <mutex>
#include <zim/archive.h>
#include <zim/search.h>

#include "book.h"
#include "bookmark.h"
Expand Down Expand Up @@ -140,6 +141,22 @@ class Filter {
bool accept(const Book& book) const;
};


class ZimSearcher : public zim::Searcher
{
public:
explicit ZimSearcher(zim::Searcher&& searcher)
: zim::Searcher(searcher)
{}

std::unique_lock<std::mutex> getLock() {
return std::unique_lock<std::mutex>(m_mutex);
}
virtual ~ZimSearcher() = default;
private:
std::mutex m_mutex;
};

/**
* A Library store several books.
*/
Expand All @@ -152,6 +169,7 @@ class Library
typedef uint64_t Revision;
typedef std::vector<std::string> BookIdCollection;
typedef std::map<std::string, int> AttributeCounts;
typedef std::set<std::string> BookIdSet;

public:
Library();
Expand Down Expand Up @@ -207,6 +225,10 @@ class Library

DEPRECATED std::shared_ptr<Reader> getReaderById(const std::string& id);
std::shared_ptr<zim::Archive> getArchiveById(const std::string& id);
std::shared_ptr<ZimSearcher> getSearcherById(const std::string& id) {
return getSearcherByIds(BookIdSet{id});
}
std::shared_ptr<ZimSearcher> getSearcherByIds(const BookIdSet& ids);

/**
* Remove a book from the library.
Expand Down Expand Up @@ -338,7 +360,7 @@ class Library
std::vector<std::string> getBookPropValueSet(BookStrPropMemFn p) const;
BookIdCollection filterViaBookDB(const Filter& filter) const;
void updateBookDB(const Book& book);
void dropReader(const std::string& bookId);
void dropCache(const std::string& bookId);
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved

private: //data
std::unique_ptr<Impl> mp_impl;
Expand Down
6 changes: 3 additions & 3 deletions include/search_renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,9 @@ class SearchRenderer
void setSearchPattern(const std::string& pattern);

/**
* Set the search content id.
* Set the querystring used to select books
*/
void setSearchContent(const std::string& name);
void setSearchBookQuery(const std::string& bookQuery);

/**
* Set protocol prefix.
Expand Down Expand Up @@ -112,7 +112,7 @@ class SearchRenderer
zim::SearchResultSet m_srs;
NameMapper* mp_nameMapper;
Library* mp_library;
std::string searchContent;
std::string searchBookQuery;
std::string searchPattern;
std::string protocolPrefix;
std::string searchProtocolPrefix;
Expand Down
4 changes: 3 additions & 1 deletion include/server.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ namespace kiwix
void setAddress(const std::string& addr) { m_addr = addr; }
void setPort(int port) { m_port = port; }
void setNbThreads(int threads) { m_nbThreads = threads; }
void setMultiZimSearchLimit(unsigned int limit) { m_multizimSearchLimit = limit; }
void setIpConnectionLimit(int limit) { m_ipConnectionLimit = limit; }
void setVerbose(bool verbose) { m_verbose = verbose; }
void setIndexTemplateString(const std::string& indexTemplateString) { m_indexTemplateString = indexTemplateString; }
Expand All @@ -63,7 +64,7 @@ namespace kiwix
{ m_blockExternalLinks = blockExternalLinks; }
int getPort();
std::string getAddress();

protected:
Library* mp_library;
NameMapper* mp_nameMapper;
Expand All @@ -72,6 +73,7 @@ namespace kiwix
std::string m_indexTemplateString = "";
int m_port = 80;
int m_nbThreads = 1;
unsigned int m_multizimSearchLimit = 0;
bool m_verbose = false;
bool m_withTaskbar = true;
bool m_withLibraryButton = true;
Expand Down
116 changes: 84 additions & 32 deletions src/library.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@
#include "tools/regexTools.h"
#include "tools/pathTools.h"
#include "tools/stringTools.h"
#include "tools/otherTools.h"
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
#include "tools/concurrent_cache.h"

#include <pugixml.hpp>
#include <algorithm>
#include <set>
#include <cmath>
#include <unicode/locid.h>
#include <xapian.h>

Expand All @@ -56,22 +59,42 @@ bool booksReferToTheSameArchive(const Book& book1, const Book& book2)
&& book1.getPath() == book2.getPath();
}

template<typename Key, typename Value>
class MultiKeyCache: public ConcurrentCache<std::set<Key>, Value>
mgautierfr marked this conversation as resolved.
Show resolved Hide resolved
{
public:
explicit MultiKeyCache(size_t maxEntries)
: ConcurrentCache<std::set<Key>, Value>(maxEntries)
{}

bool drop(const Key& key)
{
std::unique_lock<std::mutex> l(this->lock_);
bool removed = false;
for(auto& cache_key: this->impl_.keys()) {
if(cache_key.find(key)!=cache_key.end()) {
removed |= this->impl_.drop(cache_key);
}
}
return removed;
}
};

} // unnamed namespace

struct Library::Impl
{
struct Entry : Book
{
Library::Revision lastUpdatedRevision = 0;

// May also keep the Archive and Reader pointers here and get
// rid of the m_readers and m_archives data members in Library
};

Library::Revision m_revision;
std::map<std::string, Entry> m_books;
std::map<std::string, std::shared_ptr<Reader>> m_readers;
std::map<std::string, std::shared_ptr<zim::Archive>> m_archives;
using ArchiveCache = ConcurrentCache<std::string, std::shared_ptr<zim::Archive>>;
std::unique_ptr<ArchiveCache> mp_archiveCache;
using SearcherCache = MultiKeyCache<std::string, std::shared_ptr<ZimSearcher>>;
std::unique_ptr<SearcherCache> mp_searcherCache;
std::vector<kiwix::Bookmark> m_bookmarks;
Xapian::WritableDatabase m_bookDB;

Expand All @@ -85,7 +108,9 @@ struct Library::Impl
};

Library::Impl::Impl()
: m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
: mp_archiveCache(new ArchiveCache(std::max(getEnvVar<int>("KIWIX_ARCHIVE_CACHE_SIZE", 1), 1))),
mp_searcherCache(new SearcherCache(std::max(getEnvVar<int>("KIWIX_SEARCHER_CACHE_SIZE", 1), 1))),
m_bookDB("", Xapian::DB_BACKEND_INMEMORY)
{
}

Expand Down Expand Up @@ -139,7 +164,7 @@ bool Library::addBook(const Book& book)
try {
auto& oldbook = mp_impl->m_books.at(book.getId());
if ( ! booksReferToTheSameArchive(oldbook, book) ) {
dropReader(book.getId());
dropCache(book.getId());
}
oldbook.update(book); // XXX: This may have no effect if oldbook is readonly
// XXX: Then m_bookDB will become out-of-sync with
Expand All @@ -150,6 +175,13 @@ bool Library::addBook(const Book& book)
auto& newEntry = mp_impl->m_books[book.getId()];
static_cast<Book&>(newEntry) = book;
newEntry.lastUpdatedRevision = mp_impl->m_revision;
size_t new_cache_size = std::ceil(mp_impl->getBookCount(true, true)*0.1);
if (getEnvVar<int>("KIWIX_ARCHIVE_CACHE_SIZE", -1) <= 0) {
mp_impl->mp_archiveCache->setMaxSize(new_cache_size);
}
if (getEnvVar<int>("KIWIX_SEARCHER_CACHE_SIZE", -1) <= 0) {
mp_impl->mp_searcherCache->setMaxSize(new_cache_size);
}
return true;
}
}
Expand All @@ -173,17 +205,23 @@ bool Library::removeBookmark(const std::string& zimId, const std::string& url)
}


void Library::dropReader(const std::string& id)
void Library::dropCache(const std::string& id)
{
mp_impl->m_readers.erase(id);
mp_impl->m_archives.erase(id);
mp_impl->mp_archiveCache->drop(id);
mp_impl->mp_searcherCache->drop(id);
}

bool Library::removeBookById(const std::string& id)
{
std::lock_guard<std::mutex> lock(m_mutex);
mp_impl->m_bookDB.delete_document("Q" + id);
dropReader(id);
dropCache(id);
// We do not change the cache size here
// Most of the time, the book is remove in case of library refresh, it is
// often associated with addBook calls (which will properly set the cache size)
// Having a too big cache is not a problem here (or it would have been before)
// (And setMaxSize doesn't actually reduce the cache size, extra cached items
// will be removed in put or getOrPut).
return mp_impl->m_books.erase(id) == 1;
}

Expand Down Expand Up @@ -242,35 +280,49 @@ const Book& Library::getBookByPath(const std::string& path) const

std::shared_ptr<Reader> Library::getReaderById(const std::string& id)
{
try {
std::lock_guard<std::mutex> lock(m_mutex);
return mp_impl->m_readers.at(id);
} catch (std::out_of_range& e) {}

const auto archive = getArchiveById(id);
if ( !archive )
auto archive = getArchiveById(id);
if(archive) {
return std::make_shared<Reader>(archive);
} else {
return nullptr;

const shared_ptr<Reader> reader(new Reader(archive, true));
std::lock_guard<std::mutex> lock(m_mutex);
mp_impl->m_readers[id] = reader;
return reader;
}
}

std::shared_ptr<zim::Archive> Library::getArchiveById(const std::string& id)
{
std::lock_guard<std::mutex> lock(m_mutex);
try {
return mp_impl->m_archives.at(id);
} catch (std::out_of_range& e) {}

auto book = getBookById(id);
if (!book.isPathValid())
return mp_impl->mp_archiveCache->getOrPut(id,
[&](){
auto book = getBookById(id);
if (!book.isPathValid()) {
throw std::invalid_argument("");
}
return std::make_shared<zim::Archive>(book.getPath());
});
} catch (std::invalid_argument&) {
return nullptr;
}
}

auto sptr = make_shared<zim::Archive>(book.getPath());
mp_impl->m_archives[id] = sptr;
return sptr;
std::shared_ptr<ZimSearcher> Library::getSearcherByIds(const BookIdSet& ids)
{
assert(!ids.empty());
try {
return mp_impl->mp_searcherCache->getOrPut(ids,
[&](){
std::vector<zim::Archive> archives;
for(auto& id:ids) {
auto archive = getArchiveById(id);
if(!archive) {
throw std::invalid_argument("");
}
archives.push_back(*archive);
}
return std::make_shared<ZimSearcher>(zim::Searcher(archives));
});
} catch (std::invalid_argument&) {
return nullptr;
}
}

unsigned int Library::getBookCount(const bool localBooks,
Expand Down
10 changes: 5 additions & 5 deletions src/search_renderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,9 +71,9 @@ void SearchRenderer::setSearchPattern(const std::string& pattern)
searchPattern = pattern;
}

void SearchRenderer::setSearchContent(const std::string& content)
void SearchRenderer::setSearchBookQuery(const std::string& bookQuery)
{
searchContent = content;
searchBookQuery = bookQuery;
}

void SearchRenderer::setProtocolPrefix(const std::string& prefix)
Expand All @@ -90,13 +90,13 @@ kainjow::mustache::data buildQueryData
(
const std::string& searchProtocolPrefix,
const std::string& pattern,
const std::string& searchContent
const std::string& bookQuery
) {
kainjow::mustache::data query;
query.set("pattern", kiwix::encodeDiples(pattern));
std::ostringstream ss;
ss << searchProtocolPrefix << "?pattern=" << urlEncode(pattern, true);
ss << "&content=" << urlEncode(searchContent, true);
ss << "&" << bookQuery;
query.set("unpaginatedQuery", ss.str());
return query;
}
Expand Down Expand Up @@ -197,7 +197,7 @@ std::string SearchRenderer::getHtml()
kainjow::mustache::data query = buildQueryData(
searchProtocolPrefix,
searchPattern,
searchContent
searchBookQuery
);

std::string template_str = RESOURCE::templates::search_result_html;
Expand Down
1 change: 1 addition & 0 deletions src/server.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ bool Server::start() {
m_port,
m_root,
m_nbThreads,
m_multizimSearchLimit,
m_verbose,
m_withTaskbar,
m_withLibraryButton,
Expand Down
Loading