Skip to content

Commit

Permalink
Refine search result by customizing analyzer (#4456)
Browse files Browse the repository at this point in the history
#### What type of PR is this?

/kind improvement
/area core
/milestone 2.9.x

#### What this PR does / why we need it:

- Removes the dependency `cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0` because it had no significant effect on search results.
- Customizes our own analyzer built from StandardTokenizer, HTMLStripCharFilter, and LowerCaseFilterFactory.

Please be aware that the default search field is now `content` instead of `title` + `excerpt` + `content`. If you want to search titles only, use a query string such as `title: halo`. For more details, please refer to <https://lucene.apache.org/core/9_5_0/queryparser/org/apache/lucene/queryparser/flexible/standard/StandardQueryParser.html>.

#### Which issue(s) this PR fixes:

Fixes #4455

#### Special notes for your reviewer:

#### Does this PR introduce a user-facing change?

```release-note
优化本地搜索引擎
```
  • Loading branch information
JohnNiang committed Aug 25, 2023
1 parent 138ffde commit e40b5d2
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 73 deletions.
2 changes: 1 addition & 1 deletion api/build.gradle
Expand Up @@ -42,7 +42,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser"
api "org.apache.lucene:lucene-highlighter"
api "org.apache.lucene:lucene-backward-codecs"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer'
api 'org.apache.lucene:lucene-analysis-common'

api "org.apache.commons:commons-lang3"
api "io.seruco.encoding:base62"
Expand Down
@@ -1,18 +1,21 @@
package run.halo.app.search.post;

import static org.apache.commons.lang3.StringUtils.stripToEmpty;
import static org.apache.lucene.document.Field.Store.NO;
import static org.apache.lucene.document.Field.Store.YES;
import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND;

import java.io.IOException;
import java.text.NumberFormat;
import java.time.Instant;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizerFactory;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongPoint;
import org.apache.lucene.document.StoredField;
Expand All @@ -22,24 +25,21 @@
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.highlight.DefaultEncoder;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleFragmenter;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.jsoup.Jsoup;
import org.jsoup.safety.Safelist;
import org.springframework.beans.factory.DisposableBean;
import org.springframework.stereotype.Service;
import org.springframework.util.StopWatch;
import org.wltea.analyzer.lucene.IKAnalyzer;
import reactor.core.Exceptions;
import run.halo.app.infra.properties.HaloProperties;
import run.halo.app.search.SearchParam;
Expand All @@ -49,15 +49,18 @@
@Slf4j
public class LucenePostSearchService implements PostSearchService, DisposableBean {

public static final int MAX_FRAGMENT_SIZE = 100;

private final Analyzer analyzer;

private final Directory postIndexDir;

public LucenePostSearchService(HaloProperties haloProperties)
throws IOException {
analyzer = new IKAnalyzer(true);
analyzer = CustomAnalyzer.builder()
.withTokenizer(StandardTokenizerFactory.class)
.addCharFilter(HTMLStripCharFilterFactory.NAME)
.addTokenFilter(LowerCaseFilterFactory.NAME)
.build();

var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts");
postIndexDir = FSDirectory.open(postIdxPath);
}
Expand All @@ -72,14 +75,35 @@ public SearchResult<PostHit> search(SearchParam param) throws Exception {
var query = buildQuery(keyword);
var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE);
watch.stop();
var highlighter = new Highlighter(
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()),
new QueryScorer(query));
highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE));

var formatter =
new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag());
var scorer = new QueryScorer(query);
var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer);
var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length);
for (var scoreDoc : topDocs.scoreDocs) {
hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter));
var doc = searcher.storedFields().document(scoreDoc.doc);

var title = doc.get("title");
var titleFragment = highlighter.getBestFragment(analyzer, "title", title);
if (titleFragment != null) {
title = titleFragment;
}

var content = doc.get("content");
var contentFragment = highlighter.getBestFragment(analyzer, "content", content);
if (contentFragment != null) {
content = contentFragment;
}

var post = new PostHit();
post.setName(doc.get("name"));
post.setTitle(title);
post.setContent(content);
var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue();
post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp));
post.setPermalink(doc.get("permalink"));
hits.add(post);
}

var result = new SearchResult<PostHit>();
Expand Down Expand Up @@ -141,72 +165,29 @@ public void destroy() throws Exception {
}


private Query buildQuery(String keyword) throws ParseException {
private Query buildQuery(String keyword) throws QueryNodeException {
if (log.isDebugEnabled()) {
log.debug("Trying to search for keyword: {}", keyword);
}
return new QueryParser("searchable", analyzer).parse(keyword);
var parser = new StandardQueryParser(analyzer);
parser.setPointsConfigMap(Map.of(
"publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class)
));
return parser.parse(keyword, "content");
}

private Document convert(PostDoc post) {
var doc = new Document();
doc.add(new StringField("name", post.name(), YES));
doc.add(new StoredField("title", post.title()));

var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none());
var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none());

var contentBuilder = new StringBuilder(cleanExcerpt);
if (!contentBuilder.isEmpty()) {
contentBuilder.append(' ');
}
contentBuilder.append(cleanContent);

var content = contentBuilder.toString();
doc.add(new TextField("title", post.title(), YES));
doc.add(new TextField("excerpt", post.excerpt(), YES));
doc.add(new TextField("content", post.content(), YES));

doc.add(new StoredField("content", content));
doc.add(new TextField("searchable", post.title() + " " + content, NO));

long publishTimestamp = post.publishTimestamp().toEpochMilli();
var publishTimestamp = post.publishTimestamp().getEpochSecond();
doc.add(new LongPoint("publishTimestamp", publishTimestamp));
doc.add(new StoredField("publishTimestamp", publishTimestamp));
doc.add(new StoredField("permalink", post.permalink()));
return doc;
}

/**
 * Maps a stored Lucene {@link Document} onto a {@link PostHit}, applying highlight
 * markup to the title and content fields.
 *
 * @param doc         stored document fetched from the index
 * @param highlighter highlighter used to decorate matched terms
 * @return the populated search hit
 * @throws IOException                  if reading token streams fails
 * @throws InvalidTokenOffsetsException if highlighting encounters invalid offsets
 */
private PostHit convert(Document doc, Highlighter highlighter)
    throws IOException, InvalidTokenOffsetsException {
    var hit = new PostHit();
    hit.setName(doc.get("name"));
    hit.setTitle(getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE));
    hit.setContent(getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE));
    // The stored numeric field is interpreted as epoch milliseconds here.
    long publishedAt = doc.getField("publishTimestamp").numericValue().longValue();
    hit.setPublishTimestamp(Instant.ofEpochMilli(publishedAt));
    hit.setPermalink(doc.get("permalink"));
    return hit;
}

/**
 * Returns the best highlighted fragment for {@code field} of {@code doc}; when
 * highlighting yields nothing (or hits the known boost error below), falls back to
 * the raw stored field value truncated to {@code maxLength} characters.
 *
 * @param doc         stored document whose field is highlighted
 * @param field       name of the field to highlight
 * @param highlighter highlighter configured with the current query
 * @param maxLength   maximum length of the fallback (non-highlighted) text
 * @return highlighted fragment, or the truncated raw field value; may be null only
 *     if the stored field itself is absent
 * @throws InvalidTokenOffsetsException if highlighting encounters invalid offsets
 * @throws IOException if reading token streams fails
 */
private String getHighlightedText(Document doc, String field, Highlighter highlighter,
    int maxLength)
    throws InvalidTokenOffsetsException, IOException {
    try {
        var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field));
        if (highlightedText != null) {
            return highlightedText;
        }
    } catch (IllegalArgumentException iae) {
        // TODO we have to ignore the error currently due to no solution about the error.
        // NOTE(review): this matches an exact Lucene exception message, which is fragile —
        // a Lucene upgrade that rewords the message would make this rethrow instead of
        // falling back. Confirm against the Lucene version in use.
        if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) {
            throw iae;
        }
    }
    // handle if there is not highlighted text
    // No fragment produced (no query match in this field, or the swallowed error above):
    // return the raw stored value, truncated. StringUtils.substring is null-safe.
    var fieldValue = doc.get(field);
    return StringUtils.substring(fieldValue, 0, maxLength);
}
}
2 changes: 1 addition & 1 deletion platform/application/build.gradle
Expand Up @@ -39,7 +39,7 @@ dependencies {
api "org.apache.lucene:lucene-queryparser:$lucene"
api "org.apache.lucene:lucene-highlighter:$lucene"
api "org.apache.lucene:lucene-backward-codecs:$lucene"
api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0'
api "org.apache.lucene:lucene-analysis-common:$lucene"

api "org.apache.commons:commons-lang3:$commonsLang3"
api "io.seruco.encoding:base62:$base62"
Expand Down

0 comments on commit e40b5d2

Please sign in to comment.