diff --git a/api/build.gradle b/api/build.gradle index 7d9f4fc50e..4e8a5c6cd0 100644 --- a/api/build.gradle +++ b/api/build.gradle @@ -42,7 +42,7 @@ dependencies { api "org.apache.lucene:lucene-queryparser" api "org.apache.lucene:lucene-highlighter" api "org.apache.lucene:lucene-backward-codecs" - api 'cn.shenyanchao.ik-analyzer:ik-analyzer' + api 'org.apache.lucene:lucene-analysis-common' api "org.apache.commons:commons-lang3" api "io.seruco.encoding:base62" diff --git a/application/src/main/java/run/halo/app/search/post/LucenePostSearchService.java b/application/src/main/java/run/halo/app/search/post/LucenePostSearchService.java index f95bed73af..9c31c6ec37 100644 --- a/application/src/main/java/run/halo/app/search/post/LucenePostSearchService.java +++ b/application/src/main/java/run/halo/app/search/post/LucenePostSearchService.java @@ -1,18 +1,21 @@ package run.halo.app.search.post; -import static org.apache.commons.lang3.StringUtils.stripToEmpty; -import static org.apache.lucene.document.Field.Store.NO; import static org.apache.lucene.document.Field.Store.YES; import static org.apache.lucene.index.IndexWriterConfig.OpenMode.CREATE_OR_APPEND; import java.io.IOException; +import java.text.NumberFormat; import java.time.Instant; import java.util.ArrayList; import java.util.List; +import java.util.Map; import java.util.Set; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.lang3.StringUtils; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory; +import org.apache.lucene.analysis.core.LowerCaseFilterFactory; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.standard.StandardTokenizerFactory; import org.apache.lucene.document.Document; import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StoredField; @@ -22,24 +25,21 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import 
org.apache.lucene.index.Term; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; +import org.apache.lucene.queryparser.flexible.core.QueryNodeException; +import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; +import org.apache.lucene.queryparser.flexible.standard.config.PointsConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.Sort; +import org.apache.lucene.search.highlight.DefaultEncoder; import org.apache.lucene.search.highlight.Highlighter; -import org.apache.lucene.search.highlight.InvalidTokenOffsetsException; import org.apache.lucene.search.highlight.QueryScorer; -import org.apache.lucene.search.highlight.SimpleFragmenter; import org.apache.lucene.search.highlight.SimpleHTMLFormatter; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.jsoup.Jsoup; -import org.jsoup.safety.Safelist; import org.springframework.beans.factory.DisposableBean; import org.springframework.stereotype.Service; import org.springframework.util.StopWatch; -import org.wltea.analyzer.lucene.IKAnalyzer; import reactor.core.Exceptions; import run.halo.app.infra.properties.HaloProperties; import run.halo.app.search.SearchParam; @@ -49,15 +49,18 @@ @Slf4j public class LucenePostSearchService implements PostSearchService, DisposableBean { - public static final int MAX_FRAGMENT_SIZE = 100; - private final Analyzer analyzer; private final Directory postIndexDir; public LucenePostSearchService(HaloProperties haloProperties) throws IOException { - analyzer = new IKAnalyzer(true); + analyzer = CustomAnalyzer.builder() + .withTokenizer(StandardTokenizerFactory.class) + .addCharFilter(HTMLStripCharFilterFactory.NAME) + .addTokenFilter(LowerCaseFilterFactory.NAME) + .build(); + var postIdxPath = haloProperties.getWorkDir().resolve("indices/posts"); postIndexDir = FSDirectory.open(postIdxPath); 
} @@ -72,14 +75,35 @@ public SearchResult search(SearchParam param) throws Exception { var query = buildQuery(keyword); var topDocs = searcher.search(query, param.getLimit(), Sort.RELEVANCE); watch.stop(); - var highlighter = new Highlighter( - new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()), - new QueryScorer(query)); - highlighter.setTextFragmenter(new SimpleFragmenter(MAX_FRAGMENT_SIZE)); + var formatter = + new SimpleHTMLFormatter(param.getHighlightPreTag(), param.getHighlightPostTag()); + var scorer = new QueryScorer(query); + var highlighter = new Highlighter(formatter, new DefaultEncoder(), scorer); var hits = new ArrayList<PostHit>(topDocs.scoreDocs.length); for (var scoreDoc : topDocs.scoreDocs) { - hits.add(convert(searcher.storedFields().document(scoreDoc.doc), highlighter)); + var doc = searcher.storedFields().document(scoreDoc.doc); + + var title = doc.get("title"); + var titleFragment = highlighter.getBestFragment(analyzer, "title", title); + if (titleFragment != null) { + title = titleFragment; + } + + var content = doc.get("content"); + var contentFragment = highlighter.getBestFragment(analyzer, "content", content); + if (contentFragment != null) { + content = contentFragment; + } + + var post = new PostHit(); + post.setName(doc.get("name")); + post.setTitle(title); + post.setContent(content); + var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue(); + post.setPublishTimestamp(Instant.ofEpochSecond(publishTimestamp)); + post.setPermalink(doc.get("permalink")); + hits.add(post); } var result = new SearchResult(); @@ -141,72 +165,29 @@ public void destroy() throws Exception { } - private Query buildQuery(String keyword) throws ParseException { + private Query buildQuery(String keyword) throws QueryNodeException { if (log.isDebugEnabled()) { log.debug("Trying to search for keyword: {}", keyword); } - return new QueryParser("searchable", analyzer).parse(keyword); + var parser = new 
StandardQueryParser(analyzer); + parser.setPointsConfigMap(Map.of( + "publishTimestamp", new PointsConfig(NumberFormat.getNumberInstance(), Long.class) + )); + return parser.parse(keyword, "content"); } private Document convert(PostDoc post) { var doc = new Document(); doc.add(new StringField("name", post.name(), YES)); - doc.add(new StoredField("title", post.title())); - - var cleanExcerpt = Jsoup.clean(stripToEmpty(post.excerpt()), Safelist.none()); - var cleanContent = Jsoup.clean(stripToEmpty(post.content()), Safelist.none()); - - var contentBuilder = new StringBuilder(cleanExcerpt); - if (!contentBuilder.isEmpty()) { - contentBuilder.append(' '); - } - contentBuilder.append(cleanContent); - - var content = contentBuilder.toString(); + doc.add(new TextField("title", post.title(), YES)); + doc.add(new TextField("excerpt", post.excerpt(), YES)); + doc.add(new TextField("content", post.content(), YES)); - doc.add(new StoredField("content", content)); - doc.add(new TextField("searchable", post.title() + " " + content, NO)); - - long publishTimestamp = post.publishTimestamp().toEpochMilli(); + var publishTimestamp = post.publishTimestamp().getEpochSecond(); doc.add(new LongPoint("publishTimestamp", publishTimestamp)); doc.add(new StoredField("publishTimestamp", publishTimestamp)); doc.add(new StoredField("permalink", post.permalink())); return doc; } - private PostHit convert(Document doc, Highlighter highlighter) - throws IOException, InvalidTokenOffsetsException { - var post = new PostHit(); - post.setName(doc.get("name")); - - var title = getHighlightedText(doc, "title", highlighter, MAX_FRAGMENT_SIZE); - post.setTitle(title); - - var content = getHighlightedText(doc, "content", highlighter, MAX_FRAGMENT_SIZE); - post.setContent(content); - - var publishTimestamp = doc.getField("publishTimestamp").numericValue().longValue(); - post.setPublishTimestamp(Instant.ofEpochMilli(publishTimestamp)); - post.setPermalink(doc.get("permalink")); - return post; - } - - 
private String getHighlightedText(Document doc, String field, Highlighter highlighter, - int maxLength) - throws InvalidTokenOffsetsException, IOException { - try { - var highlightedText = highlighter.getBestFragment(analyzer, field, doc.get(field)); - if (highlightedText != null) { - return highlightedText; - } - } catch (IllegalArgumentException iae) { - // TODO we have to ignore the error currently due to no solution about the error. - if (!"boost must be a positive float, got -1.0".equals(iae.getMessage())) { - throw iae; - } - } - // handle if there is not highlighted text - var fieldValue = doc.get(field); - return StringUtils.substring(fieldValue, 0, maxLength); - } } diff --git a/platform/application/build.gradle b/platform/application/build.gradle index b4f752da89..fa1b40d6df 100644 --- a/platform/application/build.gradle +++ b/platform/application/build.gradle @@ -39,7 +39,7 @@ dependencies { api "org.apache.lucene:lucene-queryparser:$lucene" api "org.apache.lucene:lucene-highlighter:$lucene" api "org.apache.lucene:lucene-backward-codecs:$lucene" - api 'cn.shenyanchao.ik-analyzer:ik-analyzer:9.0.0' + api "org.apache.lucene:lucene-analysis-common:$lucene" api "org.apache.commons:commons-lang3:$commonsLang3" api "io.seruco.encoding:base62:$base62"