Refine search result by customizing analyzer #4456

Merged
3 commits merged on Aug 25, 2023
api/build.gradle · 2 changes: 1 addition & 1 deletion
@@ -42,7 +42,7 @@ dependencies {
     api "org.apache.lucene:lucene-queryparser"
     api "org.apache.lucene:lucene-highlighter"
     api "org.apache.lucene:lucene-backward-codecs"
-    api 'cn.shenyanchao.ik-analyzer:ik-analyzer'
+    api 'org.apache.lucene:lucene-analysis-common'
 
     api "org.apache.commons:commons-lang3"
     api "io.seruco.encoding:base62"
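Note: `lucene-analysis-common` is the Lucene module that provides the building blocks the new analyzer is assembled from below (`CustomAnalyzer`, `StandardTokenizerFactory`, `HTMLStripCharFilterFactory`, `LowerCaseFilterFactory`), which is why it replaces the third-party `ik-analyzer` artifact.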
…/run/halo/app/search/post/LucenePostSearchService.java
@@ -1,18 +1,21 @@
 package run.halo.app.search.post;
 
-import static org.apache.commons.lang3.StringUtils.stripToEmpty;
-import static org.apache.lucene.document.Field.Store.NO;
 import static org.apache.lucene.document.Field.Store.YES;
 import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;
 
 import java.io.IOException;
+import java.text.NumberFormat;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.commons.lang3.StringUtils;
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
+import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
+import org.apache.lucene.analysis.custom.CustomAnalyzer;
+import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.LongPoint;
 import org.apache.lucene.document.StoredField;
@@ -22,24 +25,21 @@
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.queryparser.classic.ParseException;
-import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
+import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
+import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.highlight.DefaultEncoder;
 import org.apache.lucene.search.highlight.Highlighter;
-import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
 import org.apache.lucene.search.highlight.QueryScorer;
-import org.apache.lucene.search.highlight.SimpleFragmenter;
 import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
-import org.jsoup.Jsoup;
-import org.jsoup.safety.Safelist;
 import org.springframework.beans.factory.DisposableBean;
 import org.springframework.stereotype.Service;
 import org.springframework.util.StopWatch;
-import org.wltea.analyzer.lucene.IKAnalyzer;
 import reactor.core.Exceptions;
 import run.halo.app.infra.properties.HaloProperties;
 import run.halo.app.search.SearchParam;
@@ -49,15 +49,18 @@
 @Slf4j
 public class LucenePostSearchService implements PostSearchService, DisposableBean {
 
-    public static final int MAX_FRAGMENT_SIZE = 100;
-
     private final Analyzer analyzer;
 
     private final Directory postIndexDir;
 
     public LucenePostSearchService(HaloProperties haloProperties)
         throws IOException {
-        analyzer = new IKAnalyzer(true);
+        analyzer = CustomAnalyzer.builder()
+            .withTokenizer(StandardTokenizerFactory.class)
+            .addCharFilter(HTMLStripCharFilterFactory.NAME)
+            .addTokenFilter(LowerCaseFilterFactory.NAME)
+            .build();
+
         var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
         postIndexDir = FSDirectory.open(postIdxPath);
     }
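To see what the new chain does end to end, here is a minimal, self-contained sketch (not part of this PR; the sample text and field name are made up) that runs the same `CustomAnalyzer` over an HTML snippet:

```java
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class AnalyzerDemo {
    public static void main(String[] args) throws Exception {
        // Same chain as the constructor above: strip HTML, tokenize, lower-case.
        Analyzer analyzer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .addCharFilter(HTMLStripCharFilterFactory.NAME)
            .addTokenFilter(LowerCaseFilterFactory.NAME)
            .build();
        try (var tokens = analyzer.tokenStream("content", "<p>Hello <b>Lucene</b> Search</p>")) {
            var term = tokens.addAttribute(CharTermAttribute.class);
            tokens.reset();
            while (tokens.incrementToken()) {
                System.out.println(term); // prints: hello, lucene, search
            }
            tokens.end();
        }
    }
}
```

One behavioral consequence worth flagging: `IKAnalyzer` performed dictionary-based Chinese word segmentation, whereas `StandardTokenizer` emits individual CJK characters, so Chinese keyword matching becomes character-based after this change.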
@@ -72,14 +75,35 @@ public SearchResult<PostHit> search(SearchParam param) throws Exception {
         var query = buildQuery(keyword);
         var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
         watch.stop();
-        var highlighter = new Highlighter(
-            new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()),
-            new QueryScorer(query));
-        highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE));
 
+        var formatter =
+            new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag());
+        var scorer = new QueryScorer(query);
+        var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer);
         var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
         for (var scoreDoc : topDocs.scoreDocs) {
-            hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter));
+            var doc = searcher.storedFields().document(scoreDoc.doc);
+
+            var title = doc.get("title");
+            var titleFragment = highlighter.getBestFragment(analyzer, "title", title);
+            if (titleFragment != null) {
+                title = titleFragment;
+            }
+
+            var content = doc.get("content");
+            var contentFragment = highlighter.getBestFragment(analyzer, "content", content);
+            if (contentFragment != null) {
+                content = contentFragment;
+            }
+
+            var post = new PostHit();
+            post.setName(doc.get("name"));
+            post.setTitle(title);
+            post.setContent(content);
+            var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
+            post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp));
+            post.setPermalink(doc.get("permalink"));
+            hits.add(post);
         }
 
         var result = new SearchResult<PostHit>();
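The `!= null` checks above are load-bearing: `Highlighter.getBestFragment` returns `null` when none of the scored query terms occur in the given text, so the stored field value is shown unhighlighted instead. A runnable sketch (illustrative only, not from this PR):

```java
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;

public class HighlightDemo {
    public static void main(String[] args) throws Exception {
        var analyzer = CustomAnalyzer.builder()
            .withTokenizer(StandardTokenizerFactory.class)
            .build();
        var query = new StandardQueryParser(analyzer).parse("lucene", "content");
        var highlighter = new Highlighter(
            new SimpleHTMLFormatter("<B>", "</B>"), new DefaultEncoder(), new QueryScorer(query));
        // Term occurs: prints "hello <B>lucene</B> search"
        System.out.println(highlighter.getBestFragment(analyzer, "content", "hello lucene search"));
        // Term absent: prints null, which is why the loop above falls back to doc.get(...)
        System.out.println(highlighter.getBestFragment(analyzer, "content", "nothing relevant"));
    }
}
```

(A design note: `DefaultEncoder` passes fragment text through unescaped; `SimpleHTMLEncoder` is the escaping alternative.)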
@@ -141,72 +165,29 @@ public void destroy() throws Exception {
     }
 
 
-    private Query buildQuery(String keyword) throws ParseException {
+    private Query buildQuery(String keyword) throws QueryNodeException {
         if (log.isDebugEnabled()) {
             log.debug("Trying to search for keyword: {}", keyword);
         }
-        return new QueryParser("searchable", analyzer).parse(keyword);
+        var parser = new StandardQueryParser(analyzer);
+        parser.setPointsConfigMap(Map.of(
+            "publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)
+        ));
+        return parser.parse(keyword, "content");
     }
 
     private Document convert(PostDoc post) {
         var doc = new Document();
         doc.add(new StringField("name", post.name(), YES));
-        doc.add(new StoredField("title", post.title()));
-
-        var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none());
-        var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none());
-
-        var contentBuilder = new StringBuilder(cleanExcerpt);
-        if (!contentBuilder.isEmpty()) {
-            contentBuilder.append(' ');
-        }
-        contentBuilder.append(cleanContent);
-
-        var content = contentBuilder.toString();
+        doc.add(new TextField("title", post.title(), YES));
+        doc.add(new TextField("excerpt", post.excerpt(), YES));
+        doc.add(new TextField("content", post.content(), YES));
 
-        doc.add(new StoredField("content", content));
-        doc.add(new TextField("searchable", post.title() + " " + content, NO));
-
-        long publishTimestamp = post.publishTimestamp().toEpochMilli();
+        var publishTimestamp = post.publishTimestamp().getEpochSecond();
         doc.add(new LongPoint("publishTimestamp", publishTimestamp));
         doc.add(new StoredField("publishTimestamp", publishTimestamp));
         doc.add(new StoredField("permalink", post.permalink()));
         return doc;
     }
 
-    private PostHit convert(Document doc, Highlighter highlighter)
-        throws IOException, InvalidTokenOffsetsException {
-        var post = new PostHit();
-        post.setName(doc.get("name"));
-
-        var title = getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE);
-        post.setTitle(title);
-
-        var content = getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE);
-        post.setContent(content);
-
-        var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
-        post.setPublishTimestamp(Instant.ofEpochMilli(publishTimestamp));
-        post.setPermalink(doc.get("permalink"));
-        return post;
-    }
-
-    private String getHighlightedText(Document doc, String field, Highlighter highlighter,
-        int maxLength)
-        throws InvalidTokenOffsetsException, IOException {
-        try {
-            var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field));
-            if (highlightedText != null) {
-                return highlightedText;
-            }
-        } catch (IllegalArgumentException iae) {
-            // TODO we have to ignore the error currently due to no solution about the error.
-            if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) {
-                throw iae;
-            }
-        }
-        // handle if there is not highlighted text
-        var fieldValue = doc.get(field);
-        return StringUtils.substring(fieldValue, 0, maxLength);
-    }
 }
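Why the `PointsConfig` map matters: `publishTimestamp` is indexed as a `LongPoint`, which ordinary term or range syntax cannot match; registering a `PointsConfig` makes `StandardQueryParser` rewrite numeric clauses into point queries. A minimal sketch (the epoch-second values are made up):

```java
import java.text.NumberFormat;
import java.util.Map;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;

public class PointsQueryDemo {
    public static void main(String[] args) throws Exception {
        var parser = new StandardQueryParser(new StandardAnalyzer());
        parser.setPointsConfigMap(Map.of(
            "publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)));
        // Users can now combine full-text terms with numeric ranges, e.g.:
        var query = parser.parse("halo AND publishTimestamp:[1692000000 TO 1692950400]", "content");
        System.out.println(query); // a BooleanQuery wrapping a point range query on publishTimestamp
    }
}
```

Separately, note the unit change in `convert`: `publishTimestamp` is now stored in epoch seconds and read back with `Instant.ofEpochSecond`, so both sides stay consistent; posts indexed before this change would presumably need reindexing to report correct publish times.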
platform/application/build.gradle · 2 changes: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ dependencies {
     api "org.apache.lucene:lucene-queryparser:$lucene"
     api "org.apache.lucene:lucene-highlighter:$lucene"
     api "org.apache.lucene:lucene-backward-codecs:$lucene"
-    api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0'
+    api "org.apache.lucene:lucene-analysis-common:$lucene"
 
     api "org.apache.commons:commons-lang3:$commonsLang3"
     api "io.seruco.encoding:base62:$base62"