Skip to content

Commit

Permalink
HSEARCH-705 - [Lucene 3.1 only] use Lucene 3.1 new IndexWriter config…
Browse files Browse the repository at this point in the history
…uration API
  • Loading branch information
Sanne committed Apr 4, 2011
1 parent 6440c0f commit f2c2946
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 77 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -293,10 +293,6 @@ transaction.commit();</programlisting>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.batch.max_buffered_docs</literal>
</listitem>

<listitem>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.batch.max_field_length</literal>
</listitem>

<listitem>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.batch.max_merge_docs</literal>
</listitem>
Expand All @@ -317,6 +313,9 @@ transaction.commit();</programlisting>
<literal>hibernate.search.batchbackend.concurrent_writers</literal>
</listitem>
</itemizedlist>

<para>Previous versions also had a <literal>max_field_length</literal> setting, but this was removed from Lucene;
a similar effect can be obtained by using a <classname>LimitTokenCountAnalyzer</classname>.</para>

<para>All <literal>.indexwriter</literal> parameters are Lucene specific
and Hibernate Search is just passing these parameters through - see <xref
Expand Down
22 changes: 0 additions & 22 deletions hibernate-search/src/main/docbook/en-US/modules/configuration.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1309,28 +1309,6 @@ hibernate.search.default.indexwriter.batch.max_merge_docs 100</programlisting>
<entry>Disabled (flushes by RAM usage)</entry>
</row>

<row>
<entry><literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.[transaction|batch].max_field_length</literal></entry>

<entry><para>The maximum number of terms that will be indexed for
a single field. This limits the amount of memory required for
indexing so that very large data will not crash the indexing
process by running out of memory. This setting refers to the
number of running terms, not to the number of different
terms.</para> <para>This silently truncates large documents,
excluding from the index all terms that occur further in the
document. If you know your source documents are large, be sure to
set this value high enough to accommodate the expected size. If
you set it to Integer.MAX_VALUE, then the only limit is your
memory, but you should anticipate an OutOfMemoryError. </para>
<para>If setting this value in <literal>batch</literal>
differently than in <literal>transaction</literal> you may get
different data (and results) in your index depending on the
indexing mode.</para></entry>

<entry>10000</entry>
</row>

<row>
<entry><literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.[transaction|batch].max_merge_docs</literal></entry>

Expand Down
4 changes: 0 additions & 4 deletions hibernate-search/src/main/docbook/en-US/modules/optimize.xml
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,6 @@ searchFactory.optimize();</programlisting>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.[batch|transaction].max_buffered_docs</literal>
</listitem>

<listitem>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.[batch|transaction].max_field_length</literal>
</listitem>

<listitem>
<literal>hibernate.search.[default|&lt;indexname&gt;].indexwriter.[batch|transaction].max_merge_docs</literal>
</listitem>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@
import java.util.Map;
import java.util.Properties;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.slf4j.Logger;

import org.hibernate.search.SearchException;
Expand Down Expand Up @@ -152,12 +153,12 @@ public ParameterSet(Properties prop, String paramName) {
* Applies the parameters represented by this to a writer.
* Undefined parameters are not set, leaving the lucene default.
*
* @param writer the IndexWriter whereto the parameters will be applied.
* @param writerConfig the IndexWriter configuration whereto the parameters will be applied.
*/
public void applyToWriter(IndexWriter writer) {
public void applyToWriter(IndexWriterConfig writerConfig) {
for ( Map.Entry<IndexWriterSetting, Integer> entry : parameters.entrySet() ) {
try {
entry.getKey().applySetting( writer, entry.getValue() );
entry.getKey().applySetting( writerConfig, entry.getValue() );
}
catch ( IllegalArgumentException e ) {
//TODO if DirectoryProvider had getDirectoryName() exceptions could tell better
Expand All @@ -168,6 +169,27 @@ public void applyToWriter(IndexWriter writer) {
}
}
}

/**
* Creates a new LogByteSizeMergePolicy as configured by this property set.
* @return a new LogByteSizeMergePolicy instance.
*/
/**
 * Builds a fresh LogByteSizeMergePolicy and applies each parameter of this
 * property set to it. A new instance is returned on every invocation, as Lucene
 * requires a distinct MergePolicy per IndexWriter.
 *
 * @return a newly created, fully configured LogByteSizeMergePolicy.
 * @throws SearchException if any configured value is rejected by the merge policy.
 */
public LogByteSizeMergePolicy getNewMergePolicy() {
	final LogByteSizeMergePolicy mergePolicy = new LogByteSizeMergePolicy();
	for ( Map.Entry<IndexWriterSetting, Integer> setting : parameters.entrySet() ) {
		final IndexWriterSetting key = setting.getKey();
		try {
			key.applySetting( mergePolicy, setting.getValue() );
		}
		catch ( IllegalArgumentException e ) {
			//TODO if DirectoryProvider had getDirectoryName() exceptions could tell better
			throw new SearchException(
					"Illegal IndexWriter setting "
					+ key.getKey() + " " + e.getMessage(), e
			);
		}
	}
	return mergePolicy;
}

public Integer getCurrentValueFor(IndexWriterSetting ws) {
return parameters.get( ws );
Expand Down Expand Up @@ -224,12 +246,12 @@ public String toString() {
}
}

public void applyToWriter(IndexWriter writer, boolean batch) {
public void applyToWriter(IndexWriterConfig writerConfig, boolean batch) {
if ( batch ) {
getBatchIndexParameters().applyToWriter( writer );
getBatchIndexParameters().applyToWriter( writerConfig );
}
else {
getTransactionIndexParameters().applyToWriter( writer );
getTransactionIndexParameters().applyToWriter( writerConfig );
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,16 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.SimpleAnalyzer;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.util.Version;
import org.slf4j.Logger;

import org.hibernate.search.spi.WorkerBuildContext;
import org.hibernate.search.SearchFactory;
import org.hibernate.search.backend.LuceneIndexingParameters.ParameterSet;
import org.hibernate.search.backend.impl.lucene.overrides.ConcurrentMergeScheduler;
import org.hibernate.search.engine.DocumentBuilderIndexedEntity;
import org.hibernate.search.engine.SearchFactoryImplementor;
Expand All @@ -61,9 +65,12 @@
public class Workspace {

private static final Logger log = LoggerFactory.make();
private static final Analyzer SIMPLE_ANALYZER = new SimpleAnalyzer();
private static final IndexWriter.MaxFieldLength maxFieldLength =
new IndexWriter.MaxFieldLength( IndexWriter.DEFAULT_MAX_FIELD_LENGTH );

/**
* This Analyzer is never used in practice: during the Add operation it is overridden.
* So we don't care about the Version, using whatever Lucene thinks is safer.
*/
private static final Analyzer SIMPLE_ANALYZER = new SimpleAnalyzer( Version.LUCENE_31 );

// invariant state:

Expand All @@ -73,8 +80,10 @@ public class Workspace {
private final ReentrantLock lock;
private final Set<Class<?>> entitiesInDirectory;
private final LuceneIndexingParameters indexingParams;
private final Similarity similarity;
private final ErrorHandler errorHandler;

private final IndexWriterConfig writerConfig = new IndexWriterConfig( Version.LUCENE_31, SIMPLE_ANALYZER );
private final IndexWriterConfig batchWriterConfig = new IndexWriterConfig( Version.LUCENE_31, SIMPLE_ANALYZER );

// variable state:

Expand All @@ -93,10 +102,15 @@ public Workspace(WorkerBuildContext context, DirectoryProvider<?> provider, Erro
this.directoryProvider = provider;
this.optimizerStrategy = context.getOptimizerStrategy( directoryProvider );
this.entitiesInDirectory = context.getClassesInDirectoryProvider( provider );
this.indexingParams = context.getIndexingParameters( directoryProvider );
this.lock = context.getDirectoryProviderLock( provider );
this.similarity = context.getSimilarity( directoryProvider );
this.indexingParams = context.getIndexingParameters( directoryProvider );
this.errorHandler = errorHandler;
LuceneIndexingParameters indexingParams = context.getIndexingParameters( directoryProvider );
indexingParams.applyToWriter( writerConfig, false );
indexingParams.applyToWriter( batchWriterConfig, true );
Similarity similarity = context.getSimilarity( directoryProvider );
writerConfig.setSimilarity( similarity );
batchWriterConfig.setSimilarity( similarity );
}

public <T> DocumentBuilderIndexedEntity<T> getDocumentBuilder(Class<T> entity) {
Expand Down Expand Up @@ -156,12 +170,16 @@ public synchronized IndexWriter getIndexWriter(boolean batchmode, ErrorContextBu
if ( writer != null )
return writer;
try {
writer = new IndexWriter( directoryProvider.getDirectory(), SIMPLE_ANALYZER, false, maxFieldLength ); // has been created at init time
indexingParams.applyToWriter( writer, batchmode );
writer.setSimilarity( similarity );
MergeScheduler mergeScheduler = new ConcurrentMergeScheduler( this.errorHandler );
writer.setMergeScheduler( mergeScheduler );
log.trace( "IndexWriter opened" );
if ( batchmode ) {
ParameterSet indexingParameters = indexingParams.getBatchIndexParameters();
writer = createNewIndexWriter( directoryProvider, this.batchWriterConfig, indexingParameters );
log.trace( "IndexWriter opened using batch configuration" );
}
else {
ParameterSet indexingParameters = indexingParams.getTransactionIndexParameters();
writer = createNewIndexWriter( directoryProvider, this.writerConfig, indexingParameters );
log.trace( "IndexWriter opened using default configuration" );
}
}
catch ( IOException ioe ) {
writer = null;
Expand All @@ -170,6 +188,21 @@ public synchronized IndexWriter getIndexWriter(boolean batchmode, ErrorContextBu
return writer;
}

/**
 * Creates a new IndexWriter from the given IndexWriterConfig template, applying the
 * settings which must differ per writer instance: every IndexWriter needs its own
 * MergePolicy, and its own MergeScheduler (overridden so that background merge errors
 * are routed to our ErrorHandler).
 *
 * @param directoryProvider provides the Directory the writer will operate on.
 * @param writerConfig the shared configuration template to base the writer on.
 * @param indexingParameters source of the merge policy configuration.
 * @return a new IndexWriter opened on the provider's Directory.
 * @throws IOException on failure opening the index.
 */
private IndexWriter createNewIndexWriter(DirectoryProvider<?> directoryProvider, IndexWriterConfig writerConfig, ParameterSet indexingParameters) throws IOException {
	//TODO make it possible to configure a different merge policy?
	writerConfig.setMergePolicy( indexingParameters.getNewMergePolicy() );
	writerConfig.setMergeScheduler( new ConcurrentMergeScheduler( this.errorHandler ) );
	return new IndexWriter( directoryProvider.getDirectory(), writerConfig );
}

/**
* @see #getIndexWriter(boolean, ErrorContextBuilder)
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,12 @@

import java.io.Serializable;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogByteSizeMergePolicy;

import org.hibernate.search.SearchException;
import org.hibernate.search.util.LoggerFactory;
import org.slf4j.Logger;

/**
* Represents possible options to be applied to an
Expand All @@ -38,74 +41,80 @@
public enum IndexWriterSetting implements Serializable {

/**
* @see org.apache.lucene.index.IndexWriter#setMaxBufferedDeleteTerms(int)
* @see org.apache.lucene.index.IndexWriterConfig#setMaxBufferedDeleteTerms(int)
*/
MAX_BUFFERED_DELETE_TERMS( "max_buffered_delete_terms" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setMaxBufferedDeleteTerms( value );
public void applySetting(IndexWriterConfig writerConfig, int value) {
writerConfig.setMaxBufferedDeleteTerms( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setMaxBufferedDocs(int)
* @see org.apache.lucene.index.IndexWriterConfig#setMaxBufferedDocs(int)
*/
MAX_BUFFERED_DOCS( "max_buffered_docs" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setMaxBufferedDocs( value );
public void applySetting(IndexWriterConfig writerConfig, int value) {
writerConfig.setMaxBufferedDocs( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setMaxFieldLength(int)
* No longer applied - use a LimitTokenCountAnalyzer.
* @see org.apache.lucene.analysis.LimitTokenCountAnalyzer
* @deprecated
*/
MAX_FIELD_LENGTH( "max_field_length" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setMaxFieldLength( value );
public void applySetting(IndexWriterConfig writerConfig, int value) {
log.warn( "Configuration option 'max_field_length' is no longer applied. Use LimitTokenCountAnalyzer instead" );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setMaxMergeDocs(int)
* @see org.apache.lucene.index.LogByteSizeMergePolicy#setMaxMergeDocs(int)
*/
MAX_MERGE_DOCS( "max_merge_docs" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setMaxMergeDocs( value );
public void applySetting(LogByteSizeMergePolicy logByteSizeMergePolicy, int value) {
logByteSizeMergePolicy.setMaxMergeDocs( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setMergeFactor(int)
* @see org.apache.lucene.index.LogByteSizeMergePolicy#setMergeFactor(int)
*/
MERGE_FACTOR( "merge_factor" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setMergeFactor( value );
public void applySetting(LogByteSizeMergePolicy logByteSizeMergePolicy, int value) {
logByteSizeMergePolicy.setMergeFactor( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setRAMBufferSizeMB(double)
* @see org.apache.lucene.index.IndexWriterConfig#setRAMBufferSizeMB(double)
*/
RAM_BUFFER_SIZE( "ram_buffer_size" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setRAMBufferSizeMB( value );
public void applySetting(IndexWriterConfig writerConfig, int value) {
writerConfig.setRAMBufferSizeMB( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setTermIndexInterval(int)
* @see org.apache.lucene.index.IndexWriterConfig#setTermIndexInterval(int)
*/
TERM_INDEX_INTERVAL( "term_index_interval" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setTermIndexInterval( value );
public void applySetting(IndexWriterConfig writerConfig, int value) {
writerConfig.setTermIndexInterval( value );
}
},
/**
* @see org.apache.lucene.index.IndexWriter#setUseCompoundFile(boolean)
* @see org.apache.lucene.index.LogByteSizeMergePolicy#setUseCompoundFile(boolean)
*/
USE_COMPOUND_FILE( "use_compound_file" ) {
public void applySetting(IndexWriter writer, int value) {
writer.setUseCompoundFile( intToBoolean( value ) );
}

@Override
public Integer parseVal(String value) {
return USE_COMPOUND_FILE.parseBoolean( value );
}

@Override
public void applySetting(LogByteSizeMergePolicy logByteSizeMergePolicy, int value) {
boolean useCompoundFile = intToBoolean( value );
logByteSizeMergePolicy.setUseCompoundFile( useCompoundFile );
}
};

private static final Logger log = LoggerFactory.make();

private static final Integer TRUE = 1;
private static final Integer FALSE = 0;
Expand All @@ -119,7 +128,15 @@ public Integer parseVal(String value) {
/**
* @throws IllegalArgumentException when user selects an invalid value; should be wrapped.
*/
public abstract void applySetting(IndexWriter writer, int value);
public void applySetting(IndexWriterConfig writerConfig, int value) {
// nothing to do unless overridden
}
/**
* @throws IllegalArgumentException when user selects an invalid value; should be wrapped.
*/
public void applySetting(LogByteSizeMergePolicy logByteSizeMergePolicy, int value) {
// nothing to do unless overridden
}

/**
* @return The key used in configuration files to select an option.
Expand Down

0 comments on commit f2c2946

Please sign in to comment.