Skip to content

Commit

Permalink
Fix oracle#2560: recognize Huge Text in gzip or bzip2
Browse files Browse the repository at this point in the history
  • Loading branch information
idodeclare committed Oct 6, 2020
1 parent 8e6e445 commit 03c0528
Show file tree
Hide file tree
Showing 3 changed files with 128 additions and 59 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -27,14 +27,16 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.tools.bzip2.CBZip2InputStream;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;

/**
Expand All @@ -43,17 +45,9 @@
* Created on September 22, 2005
* @author Chandan
*/
public class BZip2Analyzer extends FileAnalyzer {

private Genre g;
public class BZip2Analyzer extends CompressedAnalyzer {

@Override
public Genre getGenre() {
if (g != null) {
return g;
}
return super.getGenre();
}
private static final Logger LOGGER = LoggerFactory.getLogger(BZip2Analyzer.class);

protected BZip2Analyzer(AnalyzerFactory factory) {
super(factory);
Expand All @@ -71,11 +65,11 @@ public String getCtagsLang() {
* Gets a version number to be used to tag processed documents so that
* re-analysis can be re-done later if a stored version number is different
* from the current implementation.
* @return 20180111_00
* @return 20200417_00
*/
@Override
protected int getSpecializedVersionNo() {
return 20180111_00; // Edit comment above too!
return 20200417_00; // Edit comment above too!
}

@Override
Expand All @@ -92,20 +86,12 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
try (InputStream in = bzSrc.getStream()) {
fa = AnalyzerGuru.getAnalyzer(in, newname);
}
if (!(fa instanceof BZip2Analyzer)) {
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
this.g = Genre.XREFABLE;
} else {
this.g = Genre.DATA;
}
fa.analyze(doc, bzSrc, xrefOut);
if (doc.get(QueryBuilder.T) != null) {
doc.removeField(QueryBuilder.T);
if (g == Genre.XREFABLE) {
doc.add(new Field(QueryBuilder.T, g.typeName(),
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
}
}
if (fa == null) {
this.g = Genre.DATA;
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
} else if (!(fa instanceof BZip2Analyzer)) {
analyzeUncompressed(doc, xrefOut, fa, bzSrc);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* See LICENSE.txt included in this distribution for the specific
* language governing permissions and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at LICENSE.txt.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/

/*
* Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright (c) 2017-2020, Chris Fraire <cfraire@me.com>.
*/

package org.opengrok.indexer.analysis.archive;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.analysis.data.HugeTextAnalyzerFactory;
import org.opengrok.indexer.configuration.RuntimeEnvironment;
import org.opengrok.indexer.search.QueryBuilder;

import java.io.IOException;
import java.io.InputStream;
import java.io.Writer;

/**
 * Represents a base for compressed formats (e.g. gzip or bzip2) but not for
 * archive formats that have compression (e.g. Zip or Jar).
 * <p>
 * Subclasses detect an analyzer for the content inside the compressed file and
 * then hand it to {@link #analyzeUncompressed}, which decides the genre to
 * report for the compressed file and runs the analysis.
 * @author Chandan
 */
public abstract class CompressedAnalyzer extends FileAnalyzer {

    /** Size, in bytes, of the scratch buffer used while counting stream bytes. */
    private static final int COUNT_BUFFER_SIZE = 8 * 1024;

    /**
     * Genre decided for the compressed file, or {@code null} if none has been
     * decided yet (in which case {@link #getGenre()} defers to the superclass).
     */
    protected Genre g;

    /**
     * Gets the genre decided for the compressed file, if any.
     * @return {@link #g} when it is non-null; otherwise the superclass's genre
     */
    @Override
    public Genre getGenre() {
        if (g != null) {
            return g;
        }
        return super.getGenre();
    }

    /**
     * Initializes the analyzer with the specified factory.
     * @param factory the factory passed through to the superclass constructor
     */
    protected CompressedAnalyzer(AnalyzerFactory factory) {
        super(factory);
    }

    /**
     * Analyzes the content of a compressed source using the analyzer detected
     * for that content, and records the resulting genre in {@link #g} and in
     * the document's {@link QueryBuilder#T} field.
     * <p>
     * The genre is chosen from the genre of {@code fa}:
     * <ul>
     * <li>{@code PLAIN}: {@code XREFABLE}, unless the source meets the
     * configured huge-text threshold, in which case the analyzer from
     * {@link HugeTextAnalyzerFactory#DEFAULT_INSTANCE} is used instead of
     * {@code fa} and the genre is {@code DATA};</li>
     * <li>{@code XREFABLE}: {@code XREFABLE};</li>
     * <li>anything else: {@code DATA}.</li>
     * </ul>
     * @param doc the document to populate
     * @param xrefOut the xref writer handed on to the selected analyzer
     * @param fa the analyzer detected for the content; must not be null
     * @param compressedSrc the source whose stream the selected analyzer reads
     * @throws IOException if reading the source or analyzing fails
     * @throws InterruptedException if the selected analyzer is interrupted
     */
    protected void analyzeUncompressed(
            Document doc, Writer xrefOut, AbstractAnalyzer fa, StreamSource compressedSrc)
            throws IOException, InterruptedException {

        // Read the genre once; it drives both analyzer selection and this.g.
        final Genre innerGenre = fa.getGenre();
        if (innerGenre == Genre.PLAIN) {
            if (meetsHugeTextThreshold(compressedSrc)) {
                // Too large to treat as ordinary text: swap in the huge-text analyzer.
                fa = HugeTextAnalyzerFactory.DEFAULT_INSTANCE.getAnalyzer();
                g = Genre.DATA;
            } else {
                g = Genre.XREFABLE;
            }
        } else if (innerGenre == Genre.XREFABLE) {
            g = Genre.XREFABLE;
        } else {
            g = Genre.DATA;
        }

        fa.analyze(doc, compressedSrc, xrefOut);

        // Drop a T field left by the selected analyzer (if present) so the
        // document reports the genre chosen above.
        if (doc.get(QueryBuilder.T) != null) {
            doc.removeField(QueryBuilder.T);
        }
        doc.add(new Field(QueryBuilder.T, g.typeName(),
                AnalyzerGuru.string_ft_stored_nanalyzed_norms));
    }

    /**
     * Determines whether the stream of the specified source yields at least
     * as many bytes as the configured huge-text threshold. Reading stops as
     * soon as the threshold is reached.
     * @param compressedSrc the source whose stream is counted
     * @return {@code true} if the threshold was reached; {@code false} if the
     * stream ended first or if the threshold is {@link Integer#MAX_VALUE}
     * @throws IOException if opening or reading the stream fails
     */
    private boolean meetsHugeTextThreshold(StreamSource compressedSrc) throws IOException {
        RuntimeEnvironment env = RuntimeEnvironment.getInstance();
        int hugeTextThresholdBytes = env.getHugeTextThresholdBytes();
        if (Integer.MAX_VALUE == hugeTextThresholdBytes) {
            // Don't bother decompressing to count if the limit is MAX_VALUE.
            return false;
        }

        byte[] buf = new byte[COUNT_BUFFER_SIZE];
        // Accumulate in a long: with a threshold within one buffer length of
        // Integer.MAX_VALUE, an int total could wrap negative before the
        // comparison below ever succeeds.
        long bytesRead = 0;
        int n;
        try (InputStream in = compressedSrc.getStream()) {
            while ((n = in.read(buf, 0, buf.length)) != -1) {
                bytesRead += n;
                if (bytesRead >= hugeTextThresholdBytes) {
                    return true;
                }
            }
        }
        return false;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,9 @@
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.opengrok.indexer.analysis.AbstractAnalyzer;
import org.opengrok.indexer.analysis.AnalyzerFactory;
import org.opengrok.indexer.analysis.AnalyzerGuru;
import org.opengrok.indexer.analysis.FileAnalyzer;
import org.opengrok.indexer.analysis.StreamSource;
import org.opengrok.indexer.logger.LoggerFactory;
import org.opengrok.indexer.search.QueryBuilder;
Expand All @@ -47,20 +45,10 @@
* Created on September 22, 2005
* @author Chandan
*/
public class GZIPAnalyzer extends FileAnalyzer {
public class GZIPAnalyzer extends CompressedAnalyzer {

private static final Logger LOGGER = LoggerFactory.getLogger(GZIPAnalyzer.class);

private Genre g;

@Override
public Genre getGenre() {
if (g != null) {
return g;
}
return super.getGenre();
}

protected GZIPAnalyzer(AnalyzerFactory factory) {
super(factory);
}
Expand All @@ -77,11 +65,11 @@ public String getCtagsLang() {
* Gets a version number to be used to tag processed documents so that
* re-analysis can be re-done later if a stored version number is different
* from the current implementation.
* @return 20180111_00
* @return 20200417_00
*/
@Override
protected int getSpecializedVersionNo() {
return 20180111_00; // Edit comment above too!
return 20200417_00; // Edit comment above too!
}

@Override
Expand All @@ -93,30 +81,16 @@ public void analyze(Document doc, StreamSource src, Writer xrefOut)
String path = doc.get(QueryBuilder.PATH);
if (path != null && path.toLowerCase(Locale.ROOT).endsWith(".gz")) {
String newname = path.substring(0, path.length() - 3);
//System.err.println("GZIPPED OF = " + newname);
try (InputStream gzis = gzSrc.getStream()) {
fa = AnalyzerGuru.getAnalyzer(gzis, newname);
}
if (fa == null) {
this.g = Genre.DATA;
LOGGER.log(Level.WARNING, "Did not analyze {0}, detected as data.", newname);
LOGGER.log(Level.WARNING, "Did not analyze {0} detected as data.", newname);
//TODO we could probably wrap tar analyzer here, need to do research on reader coming from gzis ...
} else { // cant recurse!
//simple file gziped case captured here
if (fa.getGenre() == Genre.PLAIN || fa.getGenre() == Genre.XREFABLE) {
this.g = Genre.XREFABLE;
} else {
this.g = Genre.DATA;
}
fa.analyze(doc, gzSrc, xrefOut);
if (doc.get(QueryBuilder.T) != null) {
doc.removeField(QueryBuilder.T);
if (g == Genre.XREFABLE) {
doc.add(new Field(QueryBuilder.T, g.typeName(),
AnalyzerGuru.string_ft_stored_nanalyzed_norms));
}
}

analyzeUncompressed(doc, xrefOut, fa, gzSrc);
}
}
}
Expand Down

0 comments on commit 03c0528

Please sign in to comment.