From f375fba542f91d428c6cb48a68a6d37d5716ca99 Mon Sep 17 00:00:00 2001 From: Emmanuel Keller Date: Mon, 29 Feb 2016 23:58:41 +0100 Subject: [PATCH] Prepare v1.5.14 release --- CHANGELOG.txt | 16 ++++ CHANGES.txt | 38 +++------ NOTICE.txt | 2 +- README.md | 6 +- pom.xml | 32 ++++---- shell/start.bat | 4 +- shell/start.sh | 4 +- src/deb/init.d/opensearchserver | 2 +- .../java/com/jaeksoft/searchlib/Server.java | 15 ++++ .../jaeksoft/searchlib/parser/PptParser.java | 77 ++++++++++--------- 10 files changed, 105 insertions(+), 91 deletions(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index 9a466b645..7bd85ff7a 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -8,6 +8,22 @@ https://github.com/jaeksoft/opensearchserver/issues (GH) http://sourceforge.net/p/opensearchserve/feature-request/ (SF) http://sourceforge.net/p/opensearchserve/bug-report/ (SF) +OpenSearchServer 1.5.14 + +New features: +- GH-1706: Autocompletion thread should be interruptible +- GH-1700: Tomcat version update 7.0.68 +- GH-1689: Disabling link detection +- GH-1676: Support of PhantomJS in the HTMLParser +- GH-1671: Add a customizable char tokenizer +- GH-1669: An XML/JSON API for field terms extractions + +Bug fixes: +- GH-1729: org.apache.cxf.interceptor.Fault, Tomcat 7.0.53, when starting crawl +- GH-1697: Ignored Number of suggestion for auto-completion +- GH-1696: Scheduler URL database error +- GH-1659: Crawler deadlocks after running while serving requests + OpenSearchServer 1.5.13 New features: diff --git a/CHANGES.txt b/CHANGES.txt index bb5374841..4c7f06366 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,27 +1,11 @@ -release date=12:00 05.05.2015,version=1.5.13,urgency=low,by=Emmanuel Keller ,distribution=unknown - * GH-1653: REST Crawler - HTTP Header authentication method - * GH-1652: REST Crawler - an integrated sequence for paginated APIs support - * GH-1648: File event scheduler task - * GH-1635: For SMB crawling and AD login SID should also be extracted - * GH-1624: Add the field 
host in the field mapping of the file crawler - * GH-1617: The REST crawler should support local files indexation - * GH-1611: Support of multiple indexes - * GH-1601: Use compressed bit set for less memory consumption - * GH-1598: Facets can be limited in number - * GH-1597: Facets can be sorted by term or count - * GH-1595: Parallel sorting to better use multi-core system - * GH-1592: The pattern inclusion API (v2) failed to inject the URLs in the URL database - * GH-1582: Programmatically retrieving hosts from Web index - * GH-1562: Fault tolerancy in the database crawler - * GH-1559: Handle short date format in sitemap (yyyy-mm-dd) - * GH-1541: Ability to disable obeying robots.txt using the API - * GH-1537: Collapsing size to 0 merges the returned field - * GH-1524: Add fetch size parameter in the database crawler - * GH-1509: Hunspell native implementation is slow - * GH-1502: A scheduler task able to launch a crawl script - * GH-1494: Error HttpHostConnectException should not abort the crawl session - * GH-1480: Multithreaded OCR - * GH-1476: New index template for credentials storing - * GH-1475: Add an encryption filter - * GH-1471: Authentication based on external index - * GH-1470: Renderer: Add range to date facet +release date=12:00 04.04.2016,version=1.5.14,urgency=low,by=Emmanuel Keller ,distribution=unknown + * GH-1729: org.apache.cxf.interceptor.Fault, Tomcat 7.0.53, when starting crawl + * GH-1706: Autocompletion thread should be interruptible + * GH-1700: Tomcat version update 7.0.68 + * GH-1697: Ignored Number of suggestion for auto-completion + * GH-1696: Scheduler URL database error + * GH-1689: Disabling link detection + * GH-1676: Support of PhantomJS in the HTMLParser + * GH-1671: Add a customizable char tokenizer + * GH-1669: An XML/JSON API for field terms extractions + * GH-1659: Crawler deadlocks after running while serving requests \ No newline at end of file diff --git a/NOTICE.txt b/NOTICE.txt index 73e42233d..86c5f2447 100644 --- 
a/NOTICE.txt +++ b/NOTICE.txt @@ -1,5 +1,5 @@ OpenSearchServer -Copyright 2008-2014 Emmanuel Keller / Jaeksoft +Copyright 2008-2016 Emmanuel Keller / Jaeksoft http://www.open-search-server.com OpenSearchServer is free software: you can redistribute it and/or diff --git a/README.md b/README.md index 734c32ed5..931baaae2 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ OpenSearchServer ================ http://www.opensearchserver.com -Copyright Emmanuel Keller / Jaeksoft (2008-2015) +Copyright Emmanuel Keller / Jaeksoft (2008-2016) This software is licensed under the GPL v3. OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Linux/Unix/BSD/Windows. @@ -90,8 +90,6 @@ Features ### General - REST API (XML and JSON) -- SOAP Web Service - Monitoring module - Index replication -- Scheduler for management of periodic tasks -- WordPress plugin and Drupal module +- Scheduler for management of periodic tasks \ No newline at end of file diff --git a/pom.xml b/pom.xml index 3cab5ac96..039cb7a80 100644 --- a/pom.xml +++ b/pom.xml @@ -117,12 +117,12 @@ it.unimi.dsi fastutil - 7.0.9 + 7.0.10 com.google.code.gson gson - 2.5 + 2.6.2 com.ibm.icu @@ -147,7 +147,7 @@ args4j args4j - 2.32 + 2.33 org.quartz-scheduler @@ -217,7 +217,7 @@ org.apache.pdfbox pdfbox-ant - 1.8.10 + 1.8.11 org.icepdf @@ -247,7 +247,7 @@ org.apache.hadoop hadoop-client - 2.7.1 + 2.7.2 commons-codec @@ -333,7 +333,7 @@ org.slf4j slf4j-log4j12 - 1.7.13 + 1.7.18 org.zkoss.zk @@ -446,7 +446,7 @@ org.antlr antlr4-runtime - 4.5.1-1 + 4.5.2-1 net.sf.opencsv @@ -456,12 +456,12 @@ org.codehaus.groovy groovy-all - 2.4.5 + 2.4.6 org.mongodb mongo-java-driver - 3.2.0 + 3.2.2 net.sf.jmimemagic @@ -476,7 +476,7 @@ org.roaringbitmap RoaringBitmap - 0.5.15 + 0.6.3 mysql @@ 
-491,7 +491,7 @@ org.postgresql postgresql - 9.4.1207 + 9.4.1208.jre7 org.hsqldb @@ -554,13 +554,13 @@ UTF-8 2.7.18 3.6.2 - 4.5.1 + 4.5.2 4.4.4 - 3.12 - 2.48.2 + 3.13 + 2.52.0 6.5.4 - 2.6.4 - 7.0.67 + 2.6.5 + 7.0.68 OpenSearchServer is a powerful, enterprise-class, search engine program. Using the web user interface, the crawlers (web, file, database, ...) and the REST/RESTFul API you will be able to integrate quickly and easily advanced full-text search capabilities in your application. OpenSearchServer runs on Windows and Linux/Unix/BSD. diff --git a/shell/start.bat b/shell/start.bat index af07568c0..0def86f6a 100644 --- a/shell/start.bat +++ b/shell/start.bat @@ -6,7 +6,7 @@ rem Move to the directory containing this script cd %cd% set LANG=en_US.UTF-8 -set JAVA_OPTS=%JAVA_OPTS% -Dfile.encoding=UTF-8 +set JAVA_OPTS=%JAVA_OPTS% -Dfile.encoding=UTF-8 -Djava.protocol.handler.pkgs=jcifs rem The directory containing the indexes set OPENSEARCHSERVER_DATA=%cd%\data @@ -18,4 +18,4 @@ rem Any JAVA option. Often used to allocate more memory. 
Uncomment this line to rem set JAVA_OPTS=%JAVA_OPTS% -Xms1G -Xmx1G rem Starting the server -java %JAVA_OPTS% -jar opensearchserver.jar -extractDirectory server -httpPort %SERVER_PORT% -uriEncoding UTF-8 -Doss.externalparser.classpath=%cd%/lib/ext/* \ No newline at end of file +java %JAVA_OPTS% -jar opensearchserver.jar -extractDirectory server -httpPort %SERVER_PORT% -uriEncoding UTF-8 \ No newline at end of file diff --git a/shell/start.sh b/shell/start.sh index 8b4abb27f..f23b62be8 100644 --- a/shell/start.sh +++ b/shell/start.sh @@ -6,7 +6,7 @@ cd `dirname "$0"` # LANG=en_US.UTF-8 export LANG -JAVA_OPTS="$JAVA_OPTS -Dfile.encoding=UTF-8" +JAVA_OPTS="$JAVA_OPTS -Dfile.encoding=UTF-8 -Djava.protocol.handler.pkgs=jcifs" # The directory containing the indexes (must be exported) OPENSEARCHSERVER_DATA=data @@ -22,8 +22,6 @@ SERVER_PORT=9090 eval java $JAVA_OPTS -jar opensearchserver.jar \ -extractDirectory server \ -httpPort ${SERVER_PORT} \ - -Djava.protocol.handler.pkgs=jcifs \ - -Doss.externalparser.classpath=lib/ext/* \ -uriEncoding UTF-8 \ >> "logs/oss.log" 2>&1 "&" diff --git a/src/deb/init.d/opensearchserver b/src/deb/init.d/opensearchserver index 91600a78b..c3b824347 100644 --- a/src/deb/init.d/opensearchserver +++ b/src/deb/init.d/opensearchserver @@ -24,7 +24,7 @@ OPENSEARCHSERVER_DIR=/var/lib/opensearchserver OPENSEARCHSERVER_SHARE=/usr/share/opensearchserver OPENSEARCHSERVER_JAR=$OPENSEARCHSERVER_SHARE/opensearchserver.jar SERVER_DIR=$OPENSEARCHSERVER_DIR/server -SERVER_OPTS="$JAVA_OPTS -Dfile.encoding=UTF-8 -jar $OPENSEARCHSERVER_JAR -extractDirectory $SERVER_DIR -httpPort $SERVER_PORT -uriEncoding UTF-8" +SERVER_OPTS="$JAVA_OPTS -Djava.protocol.handler.pkgs=jcifs -Dfile.encoding=UTF-8 -jar $OPENSEARCHSERVER_JAR -extractDirectory $SERVER_DIR -httpPort $SERVER_PORT -uriEncoding UTF-8" SERVER_LOG=/var/log/opensearchserver/server.out SERVER_PID="/var/run/opensearchserver.pid" export SERVER_USER=opensearchserver diff --git 
a/src/main/java/com/jaeksoft/searchlib/Server.java b/src/main/java/com/jaeksoft/searchlib/Server.java index e73002aa1..26699e92b 100644 --- a/src/main/java/com/jaeksoft/searchlib/Server.java +++ b/src/main/java/com/jaeksoft/searchlib/Server.java @@ -16,6 +16,8 @@ import com.github.jankroken.commandline.annotations.Option; import com.github.jankroken.commandline.annotations.ShortSwitch; import com.github.jankroken.commandline.annotations.SingleArgument; +import com.github.jankroken.commandline.annotations.Toggle; +import com.jaeksoft.searchlib.util.FileUtils; import com.jaeksoft.searchlib.util.ThreadUtils; import com.jaeksoft.searchlib.web.StartStopListener; @@ -25,9 +27,13 @@ public class Server { private Server(Arguments arguments) { File baseDir = new File(arguments.extractDirectory == null ? "server" : arguments.extractDirectory); + if (baseDir.exists()) + if (arguments.resetExtract && baseDir.isDirectory()) + FileUtils.deleteDirectoryQuietly(baseDir); if (!baseDir.exists()) baseDir.mkdir(); tomcat = new Tomcat(); + tomcat.noDefaultWebXmlPath(); tomcat.setPort(arguments.httpPort == null ? 
9090 : arguments.httpPort); tomcat.setBaseDir(baseDir.getAbsolutePath()); tomcat.getHost().setAppBase(baseDir.getAbsolutePath()); @@ -41,6 +47,7 @@ public static class Arguments { private String extractDirectory = null; private Integer httpPort = null; private String uriEncoding = null; + private boolean resetExtract = false; @Option @LongSwitch("extractDirectory") @@ -66,6 +73,14 @@ public void setUriEncoding(String uriEncoding) { this.uriEncoding = uriEncoding; } + @Option + @LongSwitch("resetExtract") + @ShortSwitch("r") + @Toggle(true) + public void setResetExtract(boolean resetExtract) { + this.resetExtract = resetExtract; + } + } private void start(boolean await) throws IOException, URISyntaxException { diff --git a/src/main/java/com/jaeksoft/searchlib/parser/PptParser.java b/src/main/java/com/jaeksoft/searchlib/parser/PptParser.java index bb6183350..d8018b4f5 100644 --- a/src/main/java/com/jaeksoft/searchlib/parser/PptParser.java +++ b/src/main/java/com/jaeksoft/searchlib/parser/PptParser.java @@ -25,11 +25,13 @@ package com.jaeksoft.searchlib.parser; import java.io.IOException; +import java.util.List; -import org.apache.poi.hslf.model.Slide; -import org.apache.poi.hslf.model.TextRun; import org.apache.poi.hslf.record.TextHeaderAtom; -import org.apache.poi.hslf.usermodel.SlideShow; +import org.apache.poi.hslf.usermodel.HSLFSlide; +import org.apache.poi.hslf.usermodel.HSLFSlideShow; +import org.apache.poi.hslf.usermodel.HSLFTextParagraph; +import org.apache.poi.hslf.usermodel.HSLFTextRun; import com.jaeksoft.searchlib.SearchLibException; import com.jaeksoft.searchlib.analysis.ClassPropertyEnum; @@ -43,9 +45,8 @@ public class PptParser extends Parser { public static final String[] DEFAULT_EXTENSIONS = { "ppt" }; - private static ParserFieldEnum[] fl = { ParserFieldEnum.parser_name, - ParserFieldEnum.title, ParserFieldEnum.note, ParserFieldEnum.body, - ParserFieldEnum.other }; + private static ParserFieldEnum[] fl = { ParserFieldEnum.parser_name, 
ParserFieldEnum.title, ParserFieldEnum.note, + ParserFieldEnum.body, ParserFieldEnum.other }; public PptParser() { super(fl); @@ -58,43 +59,45 @@ public void initProperties() throws SearchLibException { } @Override - protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) - throws IOException { + protected void parseContent(StreamLimiter streamLimiter, LanguageEnum lang) throws IOException { - SlideShow ppt = new SlideShow(streamLimiter.getNewInputStream()); - Slide[] slides = ppt.getSlides(); + HSLFSlideShow ppt = new HSLFSlideShow(streamLimiter.getNewInputStream()); + List<HSLFSlide> slides = ppt.getSlides(); ParserResultItem result = getNewParserResultItem(); - for (Slide slide : slides) { - TextRun[] textRuns = slide.getTextRuns(); - for (TextRun textRun : textRuns) { - ParserFieldEnum field; - switch (textRun.getRunType()) { - case TextHeaderAtom.TITLE_TYPE: - case TextHeaderAtom.CENTER_TITLE_TYPE: - field = ParserFieldEnum.title; - break; - case TextHeaderAtom.NOTES_TYPE: - field = ParserFieldEnum.note; - break; - case TextHeaderAtom.BODY_TYPE: - case TextHeaderAtom.CENTRE_BODY_TYPE: - case TextHeaderAtom.HALF_BODY_TYPE: - case TextHeaderAtom.QUARTER_BODY_TYPE: - field = ParserFieldEnum.body; - break; - case TextHeaderAtom.OTHER_TYPE: - default: - field = ParserFieldEnum.other; - break; + for (HSLFSlide slide : slides) { + List<List<HSLFTextParagraph>> textLevel0 = slide.getTextParagraphs(); + for (List<HSLFTextParagraph> textLevel1 : textLevel0) { + for (HSLFTextParagraph textPara : textLevel1) { + ParserFieldEnum field; + switch (textPara.getRunType()) { + case TextHeaderAtom.TITLE_TYPE: + case TextHeaderAtom.CENTER_TITLE_TYPE: + field = ParserFieldEnum.title; + break; + case TextHeaderAtom.NOTES_TYPE: + field = ParserFieldEnum.note; + break; + case TextHeaderAtom.BODY_TYPE: + case TextHeaderAtom.CENTRE_BODY_TYPE: + case TextHeaderAtom.HALF_BODY_TYPE: + case TextHeaderAtom.QUARTER_BODY_TYPE: + field = ParserFieldEnum.body; + break; + case TextHeaderAtom.OTHER_TYPE: + default: + field = 
ParserFieldEnum.other; + break; + } + StringBuilder sb = new StringBuilder(); + for (HSLFTextRun textRun : textPara.getTextRuns()) { + sb.append(textRun.getRawText()); + sb.append(' '); + } + result.addField(field, StringUtils.replaceConsecutiveSpaces(sb.toString(), " ")); } - String[] frags = textRun.getText().split("\\n"); - for (String frag : frags) - result.addField(field, - StringUtils.replaceConsecutiveSpaces(frag, " ")); } } result.langDetection(10000, ParserFieldEnum.body); - } }