Add the ability to set a normalizer on keyword fields.
This adds a new `normalizer` property to `keyword` fields that pre-processes the
field value prior to indexing, but without altering the `_source`. Note that
only the normalization components that work on a per-character basis are
applied: stemming filters, for instance, are ignored, while lowercasing or
ASCII folding is applied.
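As a rough illustration of the intended behavior, here is a sketch against the Lucene analysis API rather than the exact Elasticsearch wiring; the class name, field name, and filter choices are assumptions for the example. A keyword-style normalizer keeps the whole value as a single token and applies only character-level filters such as lowercasing and ASCII folding:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter; // org.apache.lucene.analysis.LowerCaseFilter in Lucene 7+
import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
import org.apache.lucene.util.BytesRef;

public final class KeywordNormalizerSketch {
    public static void main(String[] args) {
        Analyzer normalizer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                // Keyword-style: the whole field value stays one token.
                KeywordTokenizer source = new KeywordTokenizer();
                TokenStream result = new ASCIIFoldingFilter(new LowerCaseFilter(source));
                return new TokenStreamComponents(source, result);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                // The same per-character filters are applied to query terms.
                return new ASCIIFoldingFilter(new LowerCaseFilter(in));
            }
        };
        // "Café" is indexed and matched as "cafe", while the stored _source
        // keeps the original "Café".
        BytesRef term = normalizer.normalize("my_keyword", "Café");
        System.out.println(term.utf8ToString()); // -> cafe
    }
}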

Closes elastic#18064
jpountz committed Dec 27, 2016
1 parent e7444f7 commit 39cffee
Showing 20 changed files with 772 additions and 68 deletions.
@@ -145,11 +145,10 @@ public NamedAnalyzer get(Object key) {

@Override
public Set<Entry<String, NamedAnalyzer>> entrySet() {
- // just to ensure we can iterate over this single analzyer
- return Collections.singletonMap(fakeDefault.name(), fakeDefault).entrySet();
+ return Collections.emptySet();
}
};
- try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap)) {
+ try (IndexAnalyzers fakeIndexAnalzyers = new IndexAnalyzers(indexSettings, fakeDefault, fakeDefault, fakeDefault, analyzerMap, analyzerMap)) {
MapperService mapperService = new MapperService(indexSettings, fakeIndexAnalzyers, xContentRegistry, similarityService,
mapperRegistry, () -> null);
mapperService.merge(indexMetaData, MapperService.MergeReason.MAPPING_RECOVERY, false);
@@ -67,17 +67,20 @@ public final class AnalysisRegistry implements Closeable {
private final Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters;
private final Map<String, AnalysisProvider<TokenizerFactory>> tokenizers;
private final Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers;
+ private final Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers;

public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<CharFilterFactory>> charFilters,
Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters,
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
- Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers) {
+ Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
+ Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
+ this.normalizers = unmodifiableMap(normalizers);
}

/**
@@ -151,7 +154,8 @@ public IndexAnalyzers build(IndexSettings indexSettings) throws IOException {
final Map<String, TokenizerFactory> tokenizerFactories = buildTokenizerFactories(indexSettings);
final Map<String, TokenFilterFactory> tokenFilterFactories = buildTokenFilterFactories(indexSettings);
final Map<String, AnalyzerProvider<?>> analyzierFactories = buildAnalyzerFactories(indexSettings);
- return build(indexSettings, analyzierFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories);
+ final Map<String, AnalyzerProvider<?>> normalizerFactories = buildNormalizerFactories(indexSettings);
+ return build(indexSettings, analyzierFactories, normalizerFactories, tokenizerFactories, charFilterFactories, tokenFilterFactories);
}

public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings indexSettings) throws IOException {
@@ -164,22 +168,28 @@ public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings i
*/
tokenFilters.put("synonym", requriesAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings)));
- return buildMapping(false, "tokenfilter", indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
+ return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
}

public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
- return buildMapping(false, "tokenizer", indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
+ return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
}

public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> charFiltersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_CHAR_FILTER);
- return buildMapping(false, "charfilter", indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
+ return buildMapping(Component.CHAR_FILTER, indexSettings, charFiltersSettings, charFilters, prebuiltAnalysis.charFilterFactories);
}

public Map<String, AnalyzerProvider<?>> buildAnalyzerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> analyzersSettings = indexSettings.getSettings().getGroups("index.analysis.analyzer");
- return buildMapping(true, "analyzer", indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
+ return buildMapping(Component.ANALYZER, indexSettings, analyzersSettings, analyzers, prebuiltAnalysis.analyzerProviderFactories);
}

+ public Map<String, AnalyzerProvider<?>> buildNormalizerFactories(IndexSettings indexSettings) throws IOException {
+ final Map<String, Settings> noralizersSettings = indexSettings.getSettings().getGroups("index.analysis.normalizer");
+ // TODO: Have pre-built normalizers
+ return buildMapping(Component.NORMALIZER, indexSettings, noralizersSettings, normalizers, Collections.emptyMap());
+ }
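For reference, `getGroups("index.analysis.normalizer")` slices the flat index settings into one `Settings` object per normalizer name, which is what `buildNormalizerFactories` hands to `buildMapping`. A minimal sketch of that slicing; the normalizer name and filter list are made up for the example:

import java.util.Map;

import org.elasticsearch.common.settings.Settings;

public final class NormalizerSettingsSketch {
    public static void main(String[] args) {
        // Hypothetical definition; "my_normalizer" is illustrative only.
        Settings indexSettings = Settings.builder()
                .put("index.analysis.normalizer.my_normalizer.type", "custom")
                .putArray("index.analysis.normalizer.my_normalizer.filter", "lowercase", "asciifolding")
                .build();
        Map<String, Settings> groups = indexSettings.getGroups("index.analysis.normalizer");
        // One group per normalizer name; each group holds that normalizer's own keys.
        Settings myNormalizer = groups.get("my_normalizer");
        System.out.println(myNormalizer.get("type")); // -> custom
    }
}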

/**
@@ -194,7 +204,7 @@ public AnalysisProvider<TokenizerFactory> getTokenizerProvider(String tokenizer,
final Map<String, Settings> tokenizerSettings = indexSettings.getSettings().getGroups("index.analysis.tokenizer");
if (tokenizerSettings.containsKey(tokenizer)) {
Settings currentSettings = tokenizerSettings.get(tokenizer);
- return getAnalysisProvider("tokenizer", tokenizers, tokenizer, currentSettings.get("type"));
+ return getAnalysisProvider(Component.TOKENIZER, tokenizers, tokenizer, currentSettings.get("type"));
} else {
return getTokenizerProvider(tokenizer);
}
@@ -223,7 +233,7 @@ public AnalysisProvider<TokenFilterFactory> getTokenFilterProvider(String tokenF
} else if ("synonym_graph".equals(typeName)) {
return requriesAnalysisSettings((is, env, name, settings) -> new SynonymGraphFilterFactory(is, env, this, name, settings));
} else {
- return getAnalysisProvider("tokenfilter", tokenFilters, tokenFilter, typeName);
+ return getAnalysisProvider(Component.FILTER, tokenFilters, tokenFilter, typeName);
}
} else {
return getTokenFilterProvider(tokenFilter);
@@ -242,7 +252,7 @@ public AnalysisProvider<CharFilterFactory> getCharFilterProvider(String charFilt
final Map<String, Settings> tokenFilterSettings = indexSettings.getSettings().getGroups("index.analysis.char_filter");
if (tokenFilterSettings.containsKey(charFilter)) {
Settings currentSettings = tokenFilterSettings.get(charFilter);
- return getAnalysisProvider("charfilter", charFilters, charFilter, currentSettings.get("type"));
+ return getAnalysisProvider(Component.CHAR_FILTER, charFilters, charFilter, currentSettings.get("type"));
} else {
return getCharFilterProvider(charFilter);
}
@@ -261,7 +271,40 @@ public boolean requiresAnalysisSettings() {
};
}

- private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexSettings settings, Map<String, Settings> settingsMap,
+ enum Component {
+ ANALYZER {
+ @Override
+ public String toString() {
+ return "analyzer";
+ }
+ },
+ NORMALIZER {
+ @Override
+ public String toString() {
+ return "normalizer";
+ }
+ },
+ CHAR_FILTER {
+ @Override
+ public String toString() {
+ return "char_filter";
+ }
+ },
+ TOKENIZER {
+ @Override
+ public String toString() {
+ return "tokenizer";
+ }
+ },
+ FILTER {
+ @Override
+ public String toString() {
+ return "filter";
+ }
+ };
+ }

+ private <T> Map<String, T> buildMapping(Component component, IndexSettings settings, Map<String, Settings> settingsMap,
Map<String, AnalysisModule.AnalysisProvider<T>> providerMap, Map<String, AnalysisModule.AnalysisProvider<T>> defaultInstance)
throws IOException {
Settings defaultSettings = Settings.builder().put(IndexMetaData.SETTING_VERSION_CREATED, settings.getIndexVersionCreated()).build();
@@ -270,29 +313,34 @@ private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexS
String name = entry.getKey();
Settings currentSettings = entry.getValue();
String typeName = currentSettings.get("type");
- if (analyzer) {
- T factory;
+ if (component == Component.ANALYZER) {
+ T factory = null;
if (typeName == null) {
if (currentSettings.get("tokenizer") != null) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
} else {
- throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer");
+ throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
} else if (typeName.equals("custom")) {
factory = (T) new CustomAnalyzerProvider(settings, name, currentSettings);
- } else {
- AnalysisModule.AnalysisProvider<T> type = providerMap.get(typeName);
- if (type == null) {
- throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]");
- }
- factory = type.get(settings, environment, name, currentSettings);
- }
- factories.put(name, factory);
- } else {
- AnalysisProvider<T> type = getAnalysisProvider(toBuild, providerMap, name, typeName);
- final T factory = type.get(settings, environment, name, currentSettings);
- factories.put(name, factory);
+ if (factory != null) {
+ factories.put(name, factory);
+ continue;
+ }
+ } else if (component == Component.NORMALIZER) {
+ if (typeName == null || typeName.equals("custom")) {
+ T factory = (T) new CustomNormalizerProvider(settings, name, currentSettings);
+ factories.put(name, factory);
+ continue;
+ }
}
+ AnalysisProvider<T> type = getAnalysisProvider(component, providerMap, name, typeName);
+ if (type == null) {
+ throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]");
+ }
+ final T factory = type.get(settings, environment, name, currentSettings);
+ factories.put(name, factory);

}
// go over the char filters in the bindings and register the ones that are not configured
@@ -330,13 +378,13 @@ private <T> Map<String, T> buildMapping(boolean analyzer, String toBuild, IndexS
return factories;
}

- private <T> AnalysisProvider<T> getAnalysisProvider(String toBuild, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
+ private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<String, AnalysisProvider<T>> providerMap, String name, String typeName) {
if (typeName == null) {
- throw new IllegalArgumentException(toBuild + " [" + name + "] must specify either an analyzer type, or a tokenizer");
+ throw new IllegalArgumentException(component + " [" + name + "] must specify either an analyzer type, or a tokenizer");
}
AnalysisProvider<T> type = providerMap.get(typeName);
if (type == null) {
- throw new IllegalArgumentException("Unknown " + toBuild + " type [" + typeName + "] for [" + name + "]");
+ throw new IllegalArgumentException("Unknown " + component + " type [" + typeName + "] for [" + name + "]");
}
return type;
}
@@ -426,6 +474,7 @@ public void close() throws IOException {

public IndexAnalyzers build(IndexSettings indexSettings,
Map<String, AnalyzerProvider<?>> analyzerProviders,
+ Map<String, AnalyzerProvider<?>> normalizerProviders,
Map<String, TokenizerFactory> tokenizerFactoryFactories,
Map<String, CharFilterFactory> charFilterFactoryFactories,
Map<String, TokenFilterFactory> tokenFilterFactoryFactories) {
@@ -436,10 +485,15 @@ public IndexAnalyzers build(IndexSettings indexSettings,
DeprecationLogger deprecationLogger = new DeprecationLogger(logger);
Map<String, NamedAnalyzer> analyzerAliases = new HashMap<>();
Map<String, NamedAnalyzer> analyzers = new HashMap<>();
+ Map<String, NamedAnalyzer> normalizers = new HashMap<>();
for (Map.Entry<String, AnalyzerProvider<?>> entry : analyzerProviders.entrySet()) {
processAnalyzerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), analyzerAliases, analyzers,
tokenFilterFactoryFactories, charFilterFactoryFactories, tokenizerFactoryFactories);
}
+ for (Map.Entry<String, AnalyzerProvider<?>> entry : normalizerProviders.entrySet()) {
+ processNormalizerFactory(deprecationLogger, indexSettings, entry.getKey(), entry.getValue(), normalizers,
+ tokenFilterFactoryFactories, charFilterFactoryFactories);
+ }
for (Map.Entry<String, NamedAnalyzer> entry : analyzerAliases.entrySet()) {
String key = entry.getKey();
if (analyzers.containsKey(key) &&
@@ -485,7 +539,7 @@ public IndexAnalyzers build(IndexSettings indexSettings,
}
}
return new IndexAnalyzers(indexSettings, defaultIndexAnalyzer, defaultSearchAnalyzer, defaultSearchQuoteAnalyzer,
- unmodifiableMap(analyzers));
+ unmodifiableMap(analyzers), unmodifiableMap(normalizers));
}

private void processAnalyzerFactory(DeprecationLogger deprecationLogger,
@@ -551,4 +605,25 @@ private void processAnalyzerFactory(DeprecationLogger deprecationLogger,
}
}
}

+ private void processNormalizerFactory(DeprecationLogger deprecationLogger,
+ IndexSettings indexSettings,
+ String name,
+ AnalyzerProvider<?> normalizerFactory,
+ Map<String, NamedAnalyzer> normalizers,
+ Map<String, TokenFilterFactory> tokenFilters,
+ Map<String, CharFilterFactory> charFilters) {
+ if (normalizerFactory instanceof CustomNormalizerProvider) {
+ ((CustomNormalizerProvider) normalizerFactory).build(charFilters, tokenFilters);
+ }
+ Analyzer normalizerF = normalizerFactory.get();
+ if (normalizerF == null) {
+ throw new IllegalArgumentException("normalizer [" + normalizerFactory.name() + "] created null normalizer");
+ }
+ NamedAnalyzer normalizer = new NamedAnalyzer(name, normalizerFactory.scope(), normalizerF);
+ if (normalizers.containsKey(name)) {
+ throw new IllegalStateException("already registered analyzer with name: " + name);
+ }
+ normalizers.put(name, normalizer);
+ }
}
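Since there are no pre-built normalizers yet (see the TODO above), the registry can be constructed with an empty normalizer provider map, in which case only `custom` normalizers defined in index settings resolve. A hedged wiring sketch against the widened constructor; the wrapper class and helper name are made up:

import java.util.Collections;
import java.util.Map;

import org.elasticsearch.env.Environment;
import org.elasticsearch.index.analysis.AnalysisRegistry;
import org.elasticsearch.index.analysis.AnalyzerProvider;
import org.elasticsearch.index.analysis.CharFilterFactory;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.index.analysis.TokenizerFactory;
import org.elasticsearch.indices.analysis.AnalysisModule.AnalysisProvider;

public final class RegistryWiringSketch {
    public static AnalysisRegistry newRegistry(Environment env,
            Map<String, AnalysisProvider<CharFilterFactory>> charFilters,
            Map<String, AnalysisProvider<TokenFilterFactory>> tokenFilters,
            Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
            Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers) {
        // Passing an empty map is legal: buildNormalizerFactories() likewise
        // passes an empty map for pre-built normalizers, so only the
        // settings-defined "custom" normalizers will be found.
        return new AnalysisRegistry(env, charFilters, tokenFilters, tokenizers,
                analyzers, Collections.emptyMap());
    }
}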
@@ -94,4 +94,27 @@ protected Reader initReader(String fieldName, Reader reader) {
}
return reader;
}

+ @Override
+ protected Reader initReaderForNormalization(String fieldName, Reader reader) {
+ for (CharFilterFactory charFilter : charFilters) {
+ if (charFilter instanceof MultiTermAwareComponent) {
+ charFilter = (CharFilterFactory) ((MultiTermAwareComponent) charFilter).getMultiTermComponent();
+ reader = charFilter.create(reader);
+ }
+ }
+ return reader;
+ }

+ @Override
+ protected TokenStream normalize(String fieldName, TokenStream in) {
+ TokenStream result = in;
+ for (TokenFilterFactory filter : tokenFilters) {
+ if (filter instanceof MultiTermAwareComponent) {
+ filter = (TokenFilterFactory) ((MultiTermAwareComponent) filter).getMultiTermComponent();
+ result = filter.create(result);
+ }
+ }
+ return result;
+ }
}
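The two overrides above are what enforce the per-character restriction from the commit message: a char filter or token filter participates in normalization only if its factory implements `MultiTermAwareComponent`, so lowercasing and ASCII folding make it into the chain while a stemmer does not. A standalone Lucene sketch of that contrast; the analyzer and field name are invented for the example:

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.core.LowerCaseFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.util.BytesRef;

public final class NormalizeVsAnalyzeSketch {
    public static void main(String[] args) {
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                // Full analysis chain: lowercase, then stem.
                KeywordTokenizer source = new KeywordTokenizer();
                TokenStream chain = new PorterStemFilter(new LowerCaseFilter(source));
                return new TokenStreamComponents(source, chain);
            }

            @Override
            protected TokenStream normalize(String fieldName, TokenStream in) {
                // Normalization chain: lowercase only. The stemmer is not a
                // per-character component, so it stays out, mirroring how
                // CustomAnalyzer keeps only MultiTermAwareComponent filters.
                return new LowerCaseFilter(in);
            }
        };
        BytesRef normalized = analyzer.normalize("field", "Running");
        System.out.println(normalized.utf8ToString()); // -> "running", not "run"
    }
}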
