From 32e23b9100b769cf9f45fb7459648422dd473ccb Mon Sep 17 00:00:00 2001 From: Adrien Grand Date: Thu, 7 May 2015 16:46:40 +0200 Subject: [PATCH] Aggs: Make it possible to configure missing values. Most aggregations (terms, histogram, stats, percentiles, geohash-grid) now support a new `missing` option which defines the value to consider when a field does not have a value. This can be handy if you eg. want a terms aggregation to handle the same way documents that have "N/A" or no value for a `tag` field. This works in a very similar way to the `missing` option on the `sort` element. One known issue is that this option sometimes cannot make the right decision in the unmapped case: it needs to replace all values with the `missing` value but might not know what kind of values source should be produced (numerics, strings, geo points?). For this reason, we might want to add an `unmapped_type` option in the future like we did for sorting. Related to #5324 --- .../bucket/datehistogram-aggregation.asciidoc | 23 ++ .../bucket/histogram-aggregation.asciidoc | 23 ++ .../bucket/terms-aggregation.asciidoc | 22 ++ .../metrics/avg-aggregation.asciidoc | 24 +- .../metrics/cardinality-aggregation.asciidoc | 21 ++ .../extendedstats-aggregation.asciidoc | 24 +- .../metrics/max-aggregation.asciidoc | 21 ++ .../metrics/min-aggregation.asciidoc | 22 ++ .../metrics/percentile-aggregation.asciidoc | 22 ++ .../percentile-rank-aggregation.asciidoc | 22 ++ .../metrics/stats-aggregation.asciidoc | 24 +- .../metrics/sum-aggregation.asciidoc | 22 ++ .../elasticsearch/common/geo/GeoUtils.java | 21 +- .../ValuesSourceAggregationBuilder.java | 12 + .../GlobalOrdinalsStringTermsAggregator.java | 6 +- .../bucket/terms/TermsAggregatorFactory.java | 4 +- ...ValuesSourceMetricsAggregationBuilder.java | 13 + .../support/AggregationContext.java | 82 ++++- .../aggregations/support/MissingValues.java | 337 ++++++++++++++++++ .../aggregations/support/ValuesSource.java | 109 ++++-- 
.../ValuesSourceAggregatorFactory.java | 4 +- .../support/ValuesSourceConfig.java | 1 + .../support/ValuesSourceParser.java | 8 + .../aggregations/MissingValueTests.java | 195 ++++++++++ .../support/MissingValuesTests.java | 297 +++++++++++++++ 25 files changed, 1311 insertions(+), 48 deletions(-) create mode 100644 src/main/java/org/elasticsearch/search/aggregations/support/MissingValues.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/MissingValueTests.java create mode 100644 src/test/java/org/elasticsearch/search/aggregations/support/MissingValuesTests.java diff --git a/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc b/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc index 256ef62d7669d..dfff6691af1b7 100644 --- a/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc @@ -123,3 +123,26 @@ settings and filter the returned buckets based on a `min_doc_count` setting (by bucket that matches documents and the last one are returned). This histogram also supports the `extended_bounds` setting, which enables extending the bounds of the histogram beyond the data itself (to read more on why you'd want to do that please refer to the explanation <<search-aggregations-bucket-histogram-aggregation-extended-bounds,here>>). + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "publish_date" : { + "date_histogram" : { + "field" : "publish_date", + "interval": "year", + "missing": "2000-01-01" <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `publish_date` field will fall into the same bucket as documents that have the value `2000-01-01`. 
diff --git a/docs/reference/aggregations/bucket/histogram-aggregation.asciidoc b/docs/reference/aggregations/bucket/histogram-aggregation.asciidoc index 129732c041067..99d7950e7de8b 100644 --- a/docs/reference/aggregations/bucket/histogram-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/histogram-aggregation.asciidoc @@ -317,3 +317,26 @@ Response: } } -------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "quantity" : { + "histogram" : { + "field" : "quantity", + "interval": 10, + "missing": 0 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `quantity` field will fall into the same bucket as documents that have the value `0`. diff --git a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc index 58a6ca2449cda..cf401126c8a53 100644 --- a/docs/reference/aggregations/bucket/terms-aggregation.asciidoc +++ b/docs/reference/aggregations/bucket/terms-aggregation.asciidoc @@ -655,3 +655,25 @@ in inner aggregations. <1> experimental[] the possible values are `map`, `global_ordinals`, `global_ordinals_hash` and `global_ordinals_low_cardinality` Please note that Elasticsearch will ignore this execution hint if it is not applicable and that there is no backward compatibility guarantee on these hints. + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. 
+ +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "tags" : { + "terms" : { + "field" : "tags", + "missing": "N/A" <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `tags` field will fall into the same bucket as documents that have the value `N/A`. diff --git a/docs/reference/aggregations/metrics/avg-aggregation.asciidoc b/docs/reference/aggregations/metrics/avg-aggregation.asciidoc index 3f029984ba83c..8e0d2b4b5e7a6 100644 --- a/docs/reference/aggregations/metrics/avg-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/avg-aggregation.asciidoc @@ -72,4 +72,26 @@ It turned out that the exam was way above the level of the students and a grade } } } --------------------------------------------------- \ No newline at end of file +-------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grade_avg" : { + "avg" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `10`. diff --git a/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc b/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc index 07943a06c2d5b..8e34e16f7a8b5 100644 --- a/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/cardinality-aggregation.asciidoc @@ -155,3 +155,24 @@ however since hashes need to be computed on the fly. TIP: The `script` parameter expects an inline script. 
Use `script_id` for indexed scripts and `script_file` for scripts in the `config/scripts/` directory. +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "tag_cardinality" : { + "cardinality" : { + "field" : "tag", + "missing": "N/A" <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `tag` field will be treated as if they had the value `N/A`. diff --git a/docs/reference/aggregations/metrics/extendedstats-aggregation.asciidoc b/docs/reference/aggregations/metrics/extendedstats-aggregation.asciidoc index 07d25fac65dd9..0f65b7670cf56 100644 --- a/docs/reference/aggregations/metrics/extendedstats-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/extendedstats-aggregation.asciidoc @@ -116,4 +116,26 @@ It turned out that the exam was way above the level of the students and a grade } } } --------------------------------------------------- \ No newline at end of file +-------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grades_stats" : { + "extended_stats" : { + "field" : "grade", + "missing": 0 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `0`. 
diff --git a/docs/reference/aggregations/metrics/max-aggregation.asciidoc b/docs/reference/aggregations/metrics/max-aggregation.asciidoc index facefc1201dd2..856adc4b03d80 100644 --- a/docs/reference/aggregations/metrics/max-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/max-aggregation.asciidoc @@ -67,3 +67,24 @@ Let's say that the prices of the documents in our index are in USD, but we would } -------------------------------------------------- +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grade_max" : { + "max" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `10`. diff --git a/docs/reference/aggregations/metrics/min-aggregation.asciidoc b/docs/reference/aggregations/metrics/min-aggregation.asciidoc index 1383cc0832278..c7424d5570b9c 100644 --- a/docs/reference/aggregations/metrics/min-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/min-aggregation.asciidoc @@ -66,3 +66,25 @@ Let's say that the prices of the documents in our index are in USD, but we would } } -------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. 
+ +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grade_min" : { + "min" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `10`. diff --git a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc index 6bd10110077a6..d5262beb6ef9f 100644 --- a/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/percentile-aggregation.asciidoc @@ -190,3 +190,25 @@ A "node" uses roughly 32 bytes of memory, so under worst-case scenarios (large a of data which arrives sorted and in-order) the default settings will produce a TDigest roughly 64KB in size. In practice data tends to be more random and the TDigest will use less memory. + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grade_percentiles" : { + "percentiles" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `10`. 
diff --git a/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc b/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc index d327fc66303fe..a494a0a5d00ca 100644 --- a/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/percentile-rank-aggregation.asciidoc @@ -86,3 +86,25 @@ script to generate values which percentile ranks are calculated on <2> Scripting supports parameterized input just like any other script TIP: The `script` parameter expects an inline script. Use `script_id` for indexed scripts and `script_file` for scripts in the `config/scripts/` directory. + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grade_ranks" : { + "percentile_ranks" : { + "field" : "grade", + "missing": 10 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `10`. diff --git a/docs/reference/aggregations/metrics/stats-aggregation.asciidoc b/docs/reference/aggregations/metrics/stats-aggregation.asciidoc index 7fbdecd60113e..429be4b8c4dc8 100644 --- a/docs/reference/aggregations/metrics/stats-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/stats-aggregation.asciidoc @@ -78,4 +78,26 @@ It turned out that the exam was way above the level of the students and a grade } } } --------------------------------------------------- \ No newline at end of file +-------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. 
+By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "grades_stats" : { + "stats" : { + "field" : "grade", + "missing": 0 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `grade` field will be treated as if they had the value `0`. diff --git a/docs/reference/aggregations/metrics/sum-aggregation.asciidoc b/docs/reference/aggregations/metrics/sum-aggregation.asciidoc index 8857ff306ee37..2d16129d15f0a 100644 --- a/docs/reference/aggregations/metrics/sum-aggregation.asciidoc +++ b/docs/reference/aggregations/metrics/sum-aggregation.asciidoc @@ -77,3 +77,25 @@ Computing the sum of squares over all stock tick changes: } } -------------------------------------------------- + +==== Missing value + +The `missing` parameter defines how documents that are missing a value should be treated. +By default they will be ignored but it is also possible to treat them as if they +had a value. + +[source,js] +-------------------------------------------------- +{ + "aggs" : { + "total_time" : { + "sum" : { + "field" : "took", + "missing": 100 <1> + } + } + } +} +-------------------------------------------------- + +<1> Documents without a value in the `took` field will be treated as if they had the value `100`. 
diff --git a/src/main/java/org/elasticsearch/common/geo/GeoUtils.java b/src/main/java/org/elasticsearch/common/geo/GeoUtils.java index c4bc51d7bfb82..77ff66db688c9 100644 --- a/src/main/java/org/elasticsearch/common/geo/GeoUtils.java +++ b/src/main/java/org/elasticsearch/common/geo/GeoUtils.java @@ -409,19 +409,24 @@ public static GeoPoint parseGeoPoint(XContentParser parser, GeoPoint point) thro return point.reset(lat, lon); } else if(parser.currentToken() == Token.VALUE_STRING) { String data = parser.text(); - int comma = data.indexOf(','); - if(comma > 0) { - lat = Double.parseDouble(data.substring(0, comma).trim()); - lon = Double.parseDouble(data.substring(comma + 1).trim()); - return point.reset(lat, lon); - } else { - return point.resetFromGeoHash(data); - } + return parseGeoPoint(data, point); } else { throw new ElasticsearchParseException("geo_point expected"); } } + /** parse a {@link GeoPoint} from a String */ + public static GeoPoint parseGeoPoint(String data, GeoPoint point) { + int comma = data.indexOf(','); + if(comma > 0) { + double lat = Double.parseDouble(data.substring(0, comma).trim()); + double lon = Double.parseDouble(data.substring(comma + 1).trim()); + return point.reset(lat, lon); + } else { + return point.resetFromGeoHash(data); + } + } + private GeoUtils() { } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/ValuesSourceAggregationBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/ValuesSourceAggregationBuilder.java index c61164386f867..8d34d68ead3d1 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/ValuesSourceAggregationBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/ValuesSourceAggregationBuilder.java @@ -34,6 +34,7 @@ public abstract class ValuesSourceAggregationBuilder params; + private Object missing; /** * Constructs a new builder. 
@@ -117,6 +118,14 @@ public B params(Map params) { return (B) this; } + /** + * Configure the value to use when documents miss a value. + */ + public B missing(Object missingValue) { + this.missing = missingValue; + return (B) this; + } + @Override protected final XContentBuilder internalXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -132,6 +141,9 @@ protected final XContentBuilder internalXContent(XContentBuilder builder, Params if (this.params != null) { builder.field("params").map(this.params); } + if (missing != null) { + builder.field("missing", missing); + } doInternalXContent(builder, params); return builder.endObject(); diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java index 67766c79c19bd..2d2cebde163f6 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/GlobalOrdinalsStringTermsAggregator.java @@ -58,7 +58,7 @@ */ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggregator { - protected final ValuesSource.Bytes.WithOrdinals.FieldData valuesSource; + protected final ValuesSource.Bytes.WithOrdinals valuesSource; protected final IncludeExclude.OrdinalsFilter includeExclude; // TODO: cache the acceptedglobalValues per aggregation definition. 
@@ -71,7 +71,7 @@ public class GlobalOrdinalsStringTermsAggregator extends AbstractStringTermsAggr protected RandomAccessOrds globalOrds; - public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, + public GlobalOrdinalsStringTermsAggregator(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, IncludeExclude.OrdinalsFilter includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, List reducers, Map metaData) throws IOException { super(name, factories, aggregationContext, parent, order, bucketCountThresholds, collectionMode, showTermDocCountError, reducers, @@ -334,7 +334,7 @@ public static class LowCardinality extends GlobalOrdinalsStringTermsAggregator { private RandomAccessOrds segmentOrds; - public LowCardinality(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals.FieldData valuesSource, + public LowCardinality(String name, AggregatorFactories factories, ValuesSource.Bytes.WithOrdinals valuesSource, Terms.Order order, BucketCountThresholds bucketCountThresholds, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode collectionMode, boolean showTermDocCountError, List reducers, Map metaData) throws IOException { super(name, factories, valuesSource, order, bucketCountThresholds, null, aggregationContext, parent, collectionMode, showTermDocCountError, reducers, metaData); assert factories == null || factories.count() == 0; diff --git a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java index 4056409517ed1..84196067a86df 100644 --- 
a/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/bucket/terms/TermsAggregatorFactory.java @@ -69,7 +69,7 @@ Aggregator create(String name, AggregatorFactories factories, ValuesSource value Terms.Order order, TermsAggregator.BucketCountThresholds bucketCountThresholds, IncludeExclude includeExclude, AggregationContext aggregationContext, Aggregator parent, SubAggCollectionMode subAggCollectMode, boolean showTermDocCountError, List reducers, Map metaData) throws IOException { final IncludeExclude.OrdinalsFilter filter = includeExclude == null ? null : includeExclude.convertToOrdinalsFilter(); - return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, reducers, metaData); + return new GlobalOrdinalsStringTermsAggregator(name, factories, (ValuesSource.Bytes.WithOrdinals) valuesSource, order, bucketCountThresholds, filter, aggregationContext, parent, subAggCollectMode, showTermDocCountError, reducers, metaData); } @Override @@ -103,7 +103,7 @@ Aggregator create(String name, AggregatorFactories factories, ValuesSource value if (includeExclude != null || factories.count() > 0) { return GLOBAL_ORDINALS.create(name, factories, valuesSource, order, bucketCountThresholds, includeExclude, aggregationContext, parent, subAggCollectMode, showTermDocCountError, reducers, metaData); } - return new GlobalOrdinalsStringTermsAggregator.LowCardinality(name, factories, (ValuesSource.Bytes.WithOrdinals.FieldData) valuesSource, order, bucketCountThresholds, aggregationContext, parent, subAggCollectMode, showTermDocCountError, reducers, metaData); + return new GlobalOrdinalsStringTermsAggregator.LowCardinality(name, factories, (ValuesSource.Bytes.WithOrdinals) valuesSource, order, bucketCountThresholds, 
aggregationContext, parent, subAggCollectMode, showTermDocCountError, reducers, metaData); } @Override diff --git a/src/main/java/org/elasticsearch/search/aggregations/metrics/ValuesSourceMetricsAggregationBuilder.java b/src/main/java/org/elasticsearch/search/aggregations/metrics/ValuesSourceMetricsAggregationBuilder.java index dd97b3ccfb6db..cf2d644b70f35 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/metrics/ValuesSourceMetricsAggregationBuilder.java +++ b/src/main/java/org/elasticsearch/search/aggregations/metrics/ValuesSourceMetricsAggregationBuilder.java @@ -36,6 +36,7 @@ public abstract class ValuesSourceMetricsAggregationBuilder params; + private Object missing; protected ValuesSourceMetricsAggregationBuilder(String name, String type) { super(name, type); @@ -84,6 +85,14 @@ public B param(String name, Object value) { return (B) this; } + /** + * Configure the value to use when documents miss a value. + */ + public B missing(Object missingValue) { + this.missing = missingValue; + return (B) this; + } + @Override protected void internalXContent(XContentBuilder builder, Params params) throws IOException { if (field != null) { @@ -105,5 +114,9 @@ protected void internalXContent(XContentBuilder builder, Params params) throws I if (this.params != null && !this.params.isEmpty()) { builder.field("params").map(this.params); } + + if (missing != null) { + builder.field("missing", missing); + } } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/support/AggregationContext.java b/src/main/java/org/elasticsearch/search/aggregations/support/AggregationContext.java index 4165927bc0b70..4591e8392e2cd 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/support/AggregationContext.java +++ b/src/main/java/org/elasticsearch/search/aggregations/support/AggregationContext.java @@ -18,17 +18,24 @@ */ package org.elasticsearch.search.aggregations.support; +import org.apache.lucene.util.BytesRef; import 
org.elasticsearch.cache.recycler.PageCacheRecycler; +import org.elasticsearch.common.Nullable; +import org.elasticsearch.common.geo.GeoPoint; +import org.elasticsearch.common.geo.GeoUtils; import org.elasticsearch.common.util.BigArrays; import org.elasticsearch.index.fielddata.IndexFieldData; import org.elasticsearch.index.fielddata.IndexGeoPointFieldData; import org.elasticsearch.index.fielddata.IndexNumericFieldData; import org.elasticsearch.index.fielddata.IndexOrdinalsFieldData; import org.elasticsearch.index.fielddata.plain.ParentChildIndexFieldData; +import org.elasticsearch.index.mapper.core.DateFieldMapper; +import org.elasticsearch.search.SearchParseException; import org.elasticsearch.search.aggregations.AggregationExecutionException; import org.elasticsearch.search.internal.SearchContext; import java.io.IOException; +import java.util.Date; /** * @@ -53,11 +60,78 @@ public BigArrays bigArrays() { return searchContext.bigArrays(); } - /** Get a value source given its configuration and the depth of the aggregator in the aggregation tree. */ - public VS valuesSource(ValuesSourceConfig config) throws IOException { + /** Get a value source given its configuration. A return value of null indicates that + * no value source could be built. 
*/ + @Nullable + public VS valuesSource(ValuesSourceConfig config, SearchContext context) throws IOException { assert config.valid() : "value source config is invalid - must have either a field context or a script or marked as unmapped"; - assert !config.unmapped : "value source should not be created for unmapped fields"; + final VS vs; + if (config.unmapped) { + if (config.missing == null) { + // otherwise we will have values because of the missing value + vs = null; + } else if (ValuesSource.Numeric.class.isAssignableFrom(config.valueSourceType)) { + vs = (VS) ValuesSource.Numeric.EMPTY; + } else if (ValuesSource.GeoPoint.class.isAssignableFrom(config.valueSourceType)) { + vs = (VS) ValuesSource.GeoPoint.EMPTY; + } else if (ValuesSource.class.isAssignableFrom(config.valueSourceType) + || ValuesSource.Bytes.class.isAssignableFrom(config.valueSourceType) + || ValuesSource.Bytes.WithOrdinals.class.isAssignableFrom(config.valueSourceType)) { + vs = (VS) ValuesSource.Bytes.EMPTY; + } else { + throw new SearchParseException(searchContext, "Can't deal with unmapped ValuesSource type " + config.valueSourceType, null); + } + } else { + vs = originalValuesSource(config); + } + + if (config.missing == null) { + return vs; + } + + if (vs instanceof ValuesSource.Bytes) { + final BytesRef missing = new BytesRef(config.missing.toString()); + if (vs instanceof ValuesSource.Bytes.WithOrdinals) { + return (VS) MissingValues.replaceMissing((ValuesSource.Bytes.WithOrdinals) vs, missing); + } else { + return (VS) MissingValues.replaceMissing((ValuesSource.Bytes) vs, missing); + } + } else if (vs instanceof ValuesSource.Numeric) { + Number missing = null; + if (config.missing instanceof Number) { + missing = (Number) config.missing; + } else { + if (config.fieldContext != null && config.fieldContext.mapper() instanceof DateFieldMapper) { + final DateFieldMapper mapper = (DateFieldMapper) config.fieldContext.mapper(); + try { + missing = 
mapper.dateTimeFormatter().parser().parseDateTime(config.missing.toString()).getMillis(); + } catch (IllegalArgumentException e) { + throw new SearchParseException(context, "Expected a date value in [missing] but got [" + config.missing + "]", null, e); + } + } else { + try { + missing = Double.parseDouble(config.missing.toString()); + } catch (NumberFormatException e) { + throw new SearchParseException(context, "Expected a numeric value in [missing] but got [" + config.missing + "]", null, e); + } + } + } + return (VS) MissingValues.replaceMissing((ValuesSource.Numeric) vs, missing); + } else if (vs instanceof ValuesSource.GeoPoint) { + // TODO: also support the structured formats of geo points + final GeoPoint missing = GeoUtils.parseGeoPoint(config.missing.toString(), new GeoPoint()); + return (VS) MissingValues.replaceMissing((ValuesSource.GeoPoint) vs, missing); + } else { + // Should not happen + throw new SearchParseException(searchContext, "Can't apply missing values on a " + vs.getClass(), null); + } + } + + /** + * Return the original values source, before we apply `missing`. 
+ */ + private VS originalValuesSource(ValuesSourceConfig config) throws IOException { if (config.fieldContext == null) { if (ValuesSource.Numeric.class.isAssignableFrom(config.valueSourceType)) { return (VS) numericScript(config); @@ -111,7 +185,7 @@ private ValuesSource.Bytes bytesScript(ValuesSourceConfig config) throws IOEx } private ValuesSource.GeoPoint geoPointField(ValuesSourceConfig config) throws IOException { - return new ValuesSource.GeoPoint((IndexGeoPointFieldData) config.fieldContext.indexFieldData()); + return new ValuesSource.GeoPoint.Fielddata((IndexGeoPointFieldData) config.fieldContext.indexFieldData()); } } diff --git a/src/main/java/org/elasticsearch/search/aggregations/support/MissingValues.java b/src/main/java/org/elasticsearch/search/aggregations/support/MissingValues.java new file mode 100644 index 0000000000000..28a4bd2567ce4 --- /dev/null +++ b/src/main/java/org/elasticsearch/search/aggregations/support/MissingValues.java @@ -0,0 +1,337 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.aggregations.support; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.RandomAccessOrds; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.BytesRef; +import org.elasticsearch.common.geo.GeoPoint; +import org.elasticsearch.index.fielddata.AbstractRandomAccessOrds; +import org.elasticsearch.index.fielddata.MultiGeoPointValues; +import org.elasticsearch.index.fielddata.SortedBinaryDocValues; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; + +import java.io.IOException; + +/** + * Utility class that allows to return views of {@link ValuesSource}s that + * replace the missing value with a configured value. + */ +public enum MissingValues { + ; + + // TODO: we could specialize the single value case + + public static ValuesSource.Bytes replaceMissing(final ValuesSource.Bytes valuesSource, final BytesRef missing) { + return new ValuesSource.Bytes() { + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + SortedBinaryDocValues values = valuesSource.bytesValues(context); + return replaceMissing(values, missing); + } + }; + } + + static SortedBinaryDocValues replaceMissing(final SortedBinaryDocValues values, final BytesRef missing) { + return new SortedBinaryDocValues() { + + private int count; + + @Override + public BytesRef valueAt(int index) { + if (count > 0) { + return values.valueAt(index); + } else if (index == 0) { + return missing; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public void setDocument(int docId) { + values.setDocument(docId); + count = values.count(); + } + + @Override + public int count() { + return count == 0 ? 
1 : count; + } + }; + } + + public static ValuesSource.Numeric replaceMissing(final ValuesSource.Numeric valuesSource, final Number missing) { + final boolean missingIsFloat = missing.longValue() != (long) missing.doubleValue(); + final boolean isFloatingPoint = valuesSource.isFloatingPoint() || missingIsFloat; + return new ValuesSource.Numeric() { + + @Override + public boolean isFloatingPoint() { + return isFloatingPoint; + } + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + return replaceMissing(valuesSource.bytesValues(context), new BytesRef(missing.toString())); + } + + @Override + public SortedNumericDocValues longValues(LeafReaderContext context) throws IOException { + final SortedNumericDocValues values = valuesSource.longValues(context); + return replaceMissing(values, missing.longValue()); + } + + @Override + public SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException { + final SortedNumericDoubleValues values = valuesSource.doubleValues(context); + return replaceMissing(values, missing.doubleValue()); + } + }; + } + + static SortedNumericDocValues replaceMissing(final SortedNumericDocValues values, final long missing) { + return new SortedNumericDocValues() { + + private int count; + + @Override + public void setDocument(int doc) { + values.setDocument(doc); + count = values.count(); + } + + @Override + public long valueAt(int index) { + if (count > 0) { + return values.valueAt(index); + } else if (index == 0) { + return missing; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public int count() { + return count == 0 ? 
1 : count; + } + + }; + } + + static SortedNumericDoubleValues replaceMissing(final SortedNumericDoubleValues values, final double missing) { + return new SortedNumericDoubleValues() { + + private int count; + + @Override + public void setDocument(int doc) { + values.setDocument(doc); + count = values.count(); + } + + @Override + public double valueAt(int index) { + if (count > 0) { + return values.valueAt(index); + } else if (index == 0) { + return missing; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public int count() { + return count == 0 ? 1 : count; + } + + }; + } + + public static ValuesSource.Bytes replaceMissing(final ValuesSource.Bytes.WithOrdinals valuesSource, final BytesRef missing) { + return new ValuesSource.Bytes.WithOrdinals() { + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + SortedBinaryDocValues values = valuesSource.bytesValues(context); + return replaceMissing(values, missing); + } + + @Override + public RandomAccessOrds ordinalsValues(LeafReaderContext context) { + RandomAccessOrds values = valuesSource.ordinalsValues(context); + return replaceMissing(values, missing); + } + + @Override + public RandomAccessOrds globalOrdinalsValues(LeafReaderContext context) { + RandomAccessOrds values = valuesSource.globalOrdinalsValues(context); + return replaceMissing(values, missing); + } + }; + } + + static RandomAccessOrds replaceMissing(final RandomAccessOrds values, final BytesRef missing) { + final long missingOrd = values.lookupTerm(missing); + if (missingOrd >= 0) { + // The value already exists + return replaceMissingOrd(values, missingOrd); + } else { + final long insertedOrd = -1 - missingOrd; + return insertOrd(values, insertedOrd, missing); + } + } + + static RandomAccessOrds replaceMissingOrd(final RandomAccessOrds values, final long missingOrd) { + return new AbstractRandomAccessOrds() { + + private int cardinality = 0; + + @Override + public void 
doSetDocument(int docID) { + values.setDocument(docID); + cardinality = values.cardinality(); + } + + @Override + public BytesRef lookupOrd(long ord) { + return values.lookupOrd(ord); + } + + @Override + public long getValueCount() { + return values.getValueCount(); + } + + @Override + public long ordAt(int index) { + if (cardinality > 0) { + return values.ordAt(index); + } else if (index == 0) { + return missingOrd; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public int cardinality() { + return cardinality == 0 ? 1 : cardinality; + } + }; + } + + static RandomAccessOrds insertOrd(final RandomAccessOrds values, final long insertedOrd, final BytesRef missingValue) { + return new AbstractRandomAccessOrds() { + + private int cardinality = 0; + + @Override + public void doSetDocument(int docID) { + values.setDocument(docID); + cardinality = values.cardinality(); + } + + @Override + public BytesRef lookupOrd(long ord) { + if (ord < insertedOrd) { + return values.lookupOrd(ord); + } else if (ord > insertedOrd) { + return values.lookupOrd(ord - 1); + } else { + return missingValue; + } + } + + @Override + public long getValueCount() { + return 1 + values.getValueCount(); + } + + @Override + public long ordAt(int index) { + if (cardinality > 0) { + final long ord = values.ordAt(index); + if (ord < insertedOrd) { + return ord; + } else { + return ord + 1; + } + } else if (index == 0) { + return insertedOrd; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public int cardinality() { + return cardinality == 0 ? 
1 : cardinality; + } + }; + } + + public static ValuesSource.GeoPoint replaceMissing(final ValuesSource.GeoPoint valuesSource, final GeoPoint missing) { + return new ValuesSource.GeoPoint() { + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + return replaceMissing(valuesSource.bytesValues(context), new BytesRef(missing.toString())); + } + + @Override + public MultiGeoPointValues geoPointValues(LeafReaderContext context) { + final MultiGeoPointValues values = valuesSource.geoPointValues(context); + return replaceMissing(values, missing); + } + }; + } + + static MultiGeoPointValues replaceMissing(final MultiGeoPointValues values, final GeoPoint missing) { + return new MultiGeoPointValues() { + + private int count; + + @Override + public GeoPoint valueAt(int index) { + if (count > 0) { + return values.valueAt(index); + } else if (index == 0) { + return missing; + } else { + throw new IndexOutOfBoundsException(); + } + } + + @Override + public void setDocument(int docId) { + values.setDocument(docId); + count = values.count(); + } + + @Override + public int count() { + return count == 0 ? 
1 : count; + } + }; + } +} diff --git a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSource.java b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSource.java index 90509f8193e1f..577b3b7a8010a 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSource.java +++ b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSource.java @@ -68,6 +68,25 @@ public boolean needsScores() { public static abstract class Bytes extends ValuesSource { + public static final WithOrdinals EMPTY = new WithOrdinals() { + + @Override + public RandomAccessOrds ordinalsValues(LeafReaderContext context) { + return DocValues.emptySortedSet(); + } + + @Override + public RandomAccessOrds globalOrdinalsValues(LeafReaderContext context) { + return DocValues.emptySortedSet(); + } + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(context.reader().maxDoc()); + } + + }; + @Override public Bits docsWithValue(LeafReaderContext context) throws IOException { final SortedBinaryDocValues bytes = bytesValues(context); @@ -94,7 +113,16 @@ public Bits docsWithValue(LeafReaderContext context) { public abstract RandomAccessOrds globalOrdinalsValues(LeafReaderContext context); - public abstract long globalMaxOrd(IndexSearcher indexSearcher); + public long globalMaxOrd(IndexSearcher indexSearcher) { + IndexReader indexReader = indexSearcher.getIndexReader(); + if (indexReader.leaves().isEmpty()) { + return 0; + } else { + LeafReaderContext atomicReaderContext = indexReader.leaves().get(0); + RandomAccessOrds values = globalOrdinalsValues(atomicReaderContext); + return values.getValueCount(); + } + } public static class FieldData extends WithOrdinals { @@ -122,20 +150,6 @@ public RandomAccessOrds globalOrdinalsValues(LeafReaderContext context) { final AtomicOrdinalsFieldData atomicFieldData = global.load(context); 
return atomicFieldData.getOrdinalsValues(); } - - @Override - public long globalMaxOrd(IndexSearcher indexSearcher) { - IndexReader indexReader = indexSearcher.getIndexReader(); - if (indexReader.leaves().isEmpty()) { - return 0; - } else { - LeafReaderContext atomicReaderContext = indexReader.leaves().get(0); - IndexOrdinalsFieldData globalFieldData = indexFieldData.loadGlobal(indexReader); - AtomicOrdinalsFieldData afd = globalFieldData.load(atomicReaderContext); - RandomAccessOrds values = afd.getOrdinalsValues(); - return values.getValueCount(); - } - } } } @@ -212,6 +226,30 @@ public boolean needsScores() { public static abstract class Numeric extends ValuesSource { + public static final Numeric EMPTY = new Numeric() { + + @Override + public boolean isFloatingPoint() { + return false; + } + + @Override + public SortedNumericDocValues longValues(LeafReaderContext context) { + return DocValues.emptySortedNumeric(context.reader().maxDoc()); + } + + @Override + public SortedNumericDoubleValues doubleValues(LeafReaderContext context) throws IOException { + return org.elasticsearch.index.fielddata.FieldData.emptySortedNumericDoubles(context.reader().maxDoc()); + } + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(context.reader().maxDoc()); + } + + }; + /** Whether the underlying data is floating-point or not. 
*/ public abstract boolean isFloatingPoint(); @@ -452,13 +490,21 @@ public void setScorer(Scorer scorer) { } } - public static class GeoPoint extends ValuesSource { + public static abstract class GeoPoint extends ValuesSource { - protected final IndexGeoPointFieldData indexFieldData; + public static final GeoPoint EMPTY = new GeoPoint() { - public GeoPoint(IndexGeoPointFieldData indexFieldData) { - this.indexFieldData = indexFieldData; - } + @Override + public MultiGeoPointValues geoPointValues(LeafReaderContext context) { + return org.elasticsearch.index.fielddata.FieldData.emptyMultiGeoPoints(context.reader().maxDoc()); + } + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) throws IOException { + return org.elasticsearch.index.fielddata.FieldData.emptySortedBinary(context.reader().maxDoc()); + } + + }; @Override public Bits docsWithValue(LeafReaderContext context) { @@ -470,13 +516,24 @@ public Bits docsWithValue(LeafReaderContext context) { } } - @Override - public SortedBinaryDocValues bytesValues(LeafReaderContext context) { - return indexFieldData.load(context).getBytesValues(); - } + public abstract MultiGeoPointValues geoPointValues(LeafReaderContext context); - public org.elasticsearch.index.fielddata.MultiGeoPointValues geoPointValues(LeafReaderContext context) { - return indexFieldData.load(context).getGeoPointValues(); + public static class Fielddata extends GeoPoint { + + protected final IndexGeoPointFieldData indexFieldData; + + public Fielddata(IndexGeoPointFieldData indexFieldData) { + this.indexFieldData = indexFieldData; + } + + @Override + public SortedBinaryDocValues bytesValues(LeafReaderContext context) { + return indexFieldData.load(context).getBytesValues(); + } + + public org.elasticsearch.index.fielddata.MultiGeoPointValues geoPointValues(LeafReaderContext context) { + return indexFieldData.load(context).getGeoPointValues(); + } } } diff --git 
a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceAggregatorFactory.java b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceAggregatorFactory.java index dbefc2e261242..d083ae306cc35 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceAggregatorFactory.java +++ b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceAggregatorFactory.java @@ -57,10 +57,10 @@ protected ValuesSourceAggregatorFactory(String name, String type, ValuesSourceCo @Override public Aggregator createInternal(AggregationContext context, Aggregator parent, boolean collectsFromSingleBucket, List reducers, Map metaData) throws IOException { - if (config.unmapped()) { + VS vs = context.valuesSource(config, context.searchContext()); + if (vs == null) { return createUnmapped(context, parent, reducers, metaData); } - VS vs = context.valuesSource(config); return doCreateInternal(vs, context, parent, collectsFromSingleBucket, reducers, metaData); } diff --git a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceConfig.java b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceConfig.java index 3ad7559b431ff..d855a909c59e5 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceConfig.java +++ b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceConfig.java @@ -35,6 +35,7 @@ public class ValuesSourceConfig { boolean unmapped = false; String formatPattern; ValueFormat format; + Object missing; public ValuesSourceConfig(Class valueSourceType) { this.valueSourceType = valueSourceType; diff --git a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceParser.java b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceParser.java index 88c3f64b0898b..f7313885cbbaa 100644 --- a/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceParser.java +++ 
b/src/main/java/org/elasticsearch/search/aggregations/support/ValuesSourceParser.java @@ -68,6 +68,7 @@ private static class Input { Map params = null; ValueType valueType = null; String format = null; + Object missing = null; } private final String aggName; @@ -90,6 +91,10 @@ private ValuesSourceParser(String aggName, InternalAggregation.Type aggType, Sea } public boolean token(String currentFieldName, XContentParser.Token token, XContentParser parser) throws IOException { + if ("missing".equals(currentFieldName) && token.isValue()) { + input.missing = parser.objectText(); + return true; + } if (token == XContentParser.Token.VALUE_STRING) { if ("field".equals(currentFieldName)) { input.field = parser.text(); @@ -146,6 +151,7 @@ public ValuesSourceConfig config() { valuesSourceType = ValuesSource.Bytes.class; } ValuesSourceConfig config = new ValuesSourceConfig(valuesSourceType); + config.missing = input.missing; config.format = resolveFormat(input.format, valueType); config.script = createScript(); config.scriptValueType = valueType; @@ -156,6 +162,7 @@ public ValuesSourceConfig config() { if (mapper == null) { Class valuesSourceType = valueType != null ? 
(Class) valueType.getValuesSourceType() : this.valuesSourceType; ValuesSourceConfig config = new ValuesSourceConfig<>(valuesSourceType); + config.missing = input.missing; config.format = resolveFormat(input.format, valueType); config.unmapped = true; if (valueType != null) { @@ -181,6 +188,7 @@ public ValuesSourceConfig config() { } config.fieldContext = new FieldContext(input.field, indexFieldData, mapper); + config.missing = input.missing; config.script = createScript(); config.format = resolveFormat(input.format, mapper); return config; diff --git a/src/test/java/org/elasticsearch/search/aggregations/MissingValueTests.java b/src/test/java/org/elasticsearch/search/aggregations/MissingValueTests.java new file mode 100644 index 0000000000000..157dc528f80c7 --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/MissingValueTests.java @@ -0,0 +1,195 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package org.elasticsearch.search.aggregations; + +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.common.geo.GeoPoint; +import org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval; +import org.elasticsearch.search.aggregations.bucket.histogram.Histogram; +import org.elasticsearch.search.aggregations.bucket.terms.Terms; +import org.elasticsearch.search.aggregations.metrics.cardinality.Cardinality; +import org.elasticsearch.search.aggregations.metrics.geobounds.GeoBounds; +import org.elasticsearch.search.aggregations.metrics.percentiles.Percentiles; +import org.elasticsearch.search.aggregations.metrics.stats.Stats; +import org.elasticsearch.test.ElasticsearchIntegrationTest; + +import static org.elasticsearch.search.aggregations.AggregationBuilders.cardinality; +import static org.elasticsearch.search.aggregations.AggregationBuilders.dateHistogram; +import static org.elasticsearch.search.aggregations.AggregationBuilders.geoBounds; +import static org.elasticsearch.search.aggregations.AggregationBuilders.histogram; +import static org.elasticsearch.search.aggregations.AggregationBuilders.percentiles; +import static org.elasticsearch.search.aggregations.AggregationBuilders.stats; +import static org.elasticsearch.search.aggregations.AggregationBuilders.terms; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse; + +@ElasticsearchIntegrationTest.SuiteScopeTest +public class MissingValueTests extends ElasticsearchIntegrationTest { + + @Override + protected int maximumNumberOfShards() { + return 2; + } + + @Override + protected void setupSuiteScopeCluster() throws Exception { + assertAcked(prepareCreate("idx").addMapping("type", "date", "type=date", "location", "type=geo_point").get()); + indexRandom(true, + client().prepareIndex("idx", "type", "1").setSource(), + 
client().prepareIndex("idx", "type", "2").setSource("str", "foo", "long", 3L, "double", 5.5, "date", "2015-05-07", "location", "1,2")); + } + + public void testUnmappedTerms() { + SearchResponse response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("non_existing_field").missing("bar")).get(); + assertSearchResponse(response); + Terms terms = response.getAggregations().get("my_terms"); + assertEquals(1, terms.getBuckets().size()); + assertEquals(2, terms.getBucketByKey("bar").getDocCount()); + } + + public void testStringTerms() { + SearchResponse response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("str").missing("bar")).get(); + assertSearchResponse(response); + Terms terms = response.getAggregations().get("my_terms"); + assertEquals(2, terms.getBuckets().size()); + assertEquals(1, terms.getBucketByKey("foo").getDocCount()); + assertEquals(1, terms.getBucketByKey("bar").getDocCount()); + + response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("str").missing("foo")).get(); + assertSearchResponse(response); + terms = response.getAggregations().get("my_terms"); + assertEquals(1, terms.getBuckets().size()); + assertEquals(2, terms.getBucketByKey("foo").getDocCount()); + } + + public void testLongTerms() { + SearchResponse response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("long").missing(4)).get(); + assertSearchResponse(response); + Terms terms = response.getAggregations().get("my_terms"); + assertEquals(2, terms.getBuckets().size()); + assertEquals(1, terms.getBucketByKey("3").getDocCount()); + assertEquals(1, terms.getBucketByKey("4").getDocCount()); + + response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("long").missing(3)).get(); + assertSearchResponse(response); + terms = response.getAggregations().get("my_terms"); + assertEquals(1, terms.getBuckets().size()); + assertEquals(2, terms.getBucketByKey("3").getDocCount()); + } + + 
public void testDoubleTerms() { + SearchResponse response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("double").missing(4.5)).get(); + assertSearchResponse(response); + Terms terms = response.getAggregations().get("my_terms"); + assertEquals(2, terms.getBuckets().size()); + assertEquals(1, terms.getBucketByKey("4.5").getDocCount()); + assertEquals(1, terms.getBucketByKey("5.5").getDocCount()); + + response = client().prepareSearch("idx").addAggregation(terms("my_terms").field("double").missing(5.5)).get(); + assertSearchResponse(response); + terms = response.getAggregations().get("my_terms"); + assertEquals(1, terms.getBuckets().size()); + assertEquals(2, terms.getBucketByKey("5.5").getDocCount()); + } + + public void testUnmappedHistogram() { + SearchResponse response = client().prepareSearch("idx").addAggregation(histogram("my_histogram").field("non-existing_field").interval(5).missing(12)).get(); + assertSearchResponse(response); + Histogram histogram = response.getAggregations().get("my_histogram"); + assertEquals(1, histogram.getBuckets().size()); + assertEquals(10L, histogram.getBuckets().get(0).getKey()); + assertEquals(2, histogram.getBuckets().get(0).getDocCount()); + } + + public void testHistogram() { + SearchResponse response = client().prepareSearch("idx").addAggregation(histogram("my_histogram").field("long").interval(5).missing(7)).get(); + assertSearchResponse(response); + Histogram histogram = response.getAggregations().get("my_histogram"); + assertEquals(2, histogram.getBuckets().size()); + assertEquals(0L, histogram.getBuckets().get(0).getKey()); + assertEquals(1, histogram.getBuckets().get(0).getDocCount()); + assertEquals(5L, histogram.getBuckets().get(1).getKey()); + assertEquals(1, histogram.getBuckets().get(1).getDocCount()); + + response = client().prepareSearch("idx").addAggregation(histogram("my_histogram").field("long").interval(5).missing(3)).get(); + assertSearchResponse(response); + histogram = 
response.getAggregations().get("my_histogram"); + assertEquals(1, histogram.getBuckets().size()); + assertEquals(0L, histogram.getBuckets().get(0).getKey()); + assertEquals(2, histogram.getBuckets().get(0).getDocCount()); + } + + public void testDateHistogram() { + SearchResponse response = client().prepareSearch("idx").addAggregation(dateHistogram("my_histogram").field("date").interval(DateHistogramInterval.YEAR).missing("2014-05-07")).get(); + assertSearchResponse(response); + Histogram histogram = response.getAggregations().get("my_histogram"); + assertEquals(2, histogram.getBuckets().size()); + assertEquals("2014-01-01T00:00:00.000Z", histogram.getBuckets().get(0).getKeyAsString()); + assertEquals(1, histogram.getBuckets().get(0).getDocCount()); + assertEquals("2015-01-01T00:00:00.000Z", histogram.getBuckets().get(1).getKeyAsString()); + assertEquals(1, histogram.getBuckets().get(1).getDocCount()); + + response = client().prepareSearch("idx").addAggregation(dateHistogram("my_histogram").field("date").interval(DateHistogramInterval.YEAR).missing("2015-05-07")).get(); + assertSearchResponse(response); + histogram = response.getAggregations().get("my_histogram"); + assertEquals(1, histogram.getBuckets().size()); + assertEquals("2015-01-01T00:00:00.000Z", histogram.getBuckets().get(0).getKeyAsString()); + assertEquals(2, histogram.getBuckets().get(0).getDocCount()); + } + + public void testCardinality() { + SearchResponse response = client().prepareSearch("idx").addAggregation(cardinality("card").field("long").missing(2)).get(); + assertSearchResponse(response); + Cardinality cardinality = response.getAggregations().get("card"); + assertEquals(2, cardinality.getValue()); + } + + public void testPercentiles() { + SearchResponse response = client().prepareSearch("idx").addAggregation(percentiles("percentiles").field("long").missing(1000)).get(); + assertSearchResponse(response); + Percentiles percentiles = response.getAggregations().get("percentiles"); + 
assertEquals(1000, percentiles.percentile(100), 0); + } + + public void testStats() { + SearchResponse response = client().prepareSearch("idx").addAggregation(stats("stats").field("long").missing(5)).get(); + assertSearchResponse(response); + Stats stats = response.getAggregations().get("stats"); + assertEquals(2, stats.getCount()); + assertEquals(4, stats.getAvg(), 0); + } + + public void testUnmappedGeoBounds() { + SearchResponse response = client().prepareSearch("idx").addAggregation(geoBounds("bounds").field("non_existing_field").missing("2,1")).get(); + assertSearchResponse(response); + GeoBounds bounds = response.getAggregations().get("bounds"); + assertEquals(new GeoPoint(2,1), bounds.bottomRight()); + assertEquals(new GeoPoint(2,1), bounds.topLeft()); + } + + public void testGeoBounds() { + SearchResponse response = client().prepareSearch("idx").addAggregation(geoBounds("bounds").field("location").missing("2,1")).get(); + assertSearchResponse(response); + GeoBounds bounds = response.getAggregations().get("bounds"); + assertEquals(new GeoPoint(1,2), bounds.bottomRight()); + assertEquals(new GeoPoint(2,1), bounds.topLeft()); + } + +} diff --git a/src/test/java/org/elasticsearch/search/aggregations/support/MissingValuesTests.java b/src/test/java/org/elasticsearch/search/aggregations/support/MissingValuesTests.java new file mode 100644 index 0000000000000..cd72d7f069b7f --- /dev/null +++ b/src/test/java/org/elasticsearch/search/aggregations/support/MissingValuesTests.java @@ -0,0 +1,297 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.elasticsearch.search.aggregations.support; + +import com.carrotsearch.randomizedtesting.generators.RandomPicks; +import com.carrotsearch.randomizedtesting.generators.RandomStrings; + +import org.apache.lucene.index.RandomAccessOrds; +import org.apache.lucene.index.SortedNumericDocValues; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.TestUtil; +import org.elasticsearch.common.geo.GeoPoint; +import org.elasticsearch.index.fielddata.AbstractRandomAccessOrds; +import org.elasticsearch.index.fielddata.MultiGeoPointValues; +import org.elasticsearch.index.fielddata.SortedBinaryDocValues; +import org.elasticsearch.index.fielddata.SortedNumericDoubleValues; +import org.elasticsearch.test.ElasticsearchTestCase; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +public class MissingValuesTests extends ElasticsearchTestCase { + + public void testMissingBytes() { + final int numDocs = TestUtil.nextInt(random(), 1, 100); + final BytesRef[][] values = new BytesRef[numDocs][]; + for (int i = 0; i < numDocs; ++i) { + values[i] = new BytesRef[random().nextInt(4)]; + for (int j = 0; j < values[i].length; ++j) { + values[i][j] = new BytesRef(RandomStrings.randomAsciiOfLength(random(), 2)); + } + Arrays.sort(values[i]); + } + SortedBinaryDocValues asBinaryValues = new SortedBinaryDocValues() { + + int i = -1; + + @Override + public BytesRef valueAt(int index) { + return values[i][index]; + } + + @Override + public void setDocument(int docId) { + i = docId; + } + + @Override + public int 
count() { + return values[i].length; + } + }; + final BytesRef missing = new BytesRef(RandomStrings.randomAsciiOfLength(random(), 2)); + SortedBinaryDocValues withMissingReplaced = MissingValues.replaceMissing(asBinaryValues, missing); + for (int i = 0; i < numDocs; ++i) { + withMissingReplaced.setDocument(i); + if (values[i].length > 0) { + assertEquals(values[i].length, withMissingReplaced.count()); + for (int j = 0; j < values[i].length; ++j) { + assertEquals(values[i][j], withMissingReplaced.valueAt(j)); + } + } else { + assertEquals(1, withMissingReplaced.count()); + assertEquals(missing, withMissingReplaced.valueAt(0)); + } + } + } + + public void testMissingOrds() { + final int numDocs = TestUtil.nextInt(random(), 1, 100); + final int numOrds = TestUtil.nextInt(random(), 1, 10); + + final Set valueSet = new HashSet<>(); + while (valueSet.size() < numOrds) { + valueSet.add(new BytesRef(RandomStrings.randomAsciiOfLength(random(), 5))); + } + final BytesRef[] values = valueSet.toArray(new BytesRef[numOrds]); + Arrays.sort(values); + + final int[][] ords = new int[numDocs][]; + for (int i = 0; i < numDocs; ++i) { + ords[i] = new int[random().nextInt(numOrds)]; + for (int j = 0; j < ords[i].length; ++j) { + ords[i][j] = j; + } + for (int j = ords[i].length - 1; j >= 0; --j) { + final int maxOrd = j == ords[i].length - 1 ? 
numOrds : ords[i][j+1]; + ords[i][j] = TestUtil.nextInt(random(), ords[i][j], maxOrd - 1); + } + } + RandomAccessOrds asRandomAccessOrds = new AbstractRandomAccessOrds() { + + int i = -1; + + @Override + public void doSetDocument(int docID) { + i = docID; + } + + @Override + public BytesRef lookupOrd(long ord) { + return values[(int) ord]; + } + + @Override + public long getValueCount() { + return values.length; + } + + @Override + public long ordAt(int index) { + return ords[i][index]; + } + + @Override + public int cardinality() { + return ords[i].length; + } + }; + + final BytesRef existingMissing = RandomPicks.randomFrom(random(), values); + final BytesRef missingMissing = new BytesRef(RandomStrings.randomAsciiOfLength(random(), 5)); + + for (BytesRef missing : Arrays.asList(existingMissing, missingMissing)) { + RandomAccessOrds withMissingReplaced = MissingValues.replaceMissing(asRandomAccessOrds, missing); + if (valueSet.contains(missing)) { + assertEquals(values.length, withMissingReplaced.getValueCount()); + } else { + assertEquals(values.length + 1, withMissingReplaced.getValueCount()); + } + for (int i = 0; i < numDocs; ++i) { + withMissingReplaced.setDocument(i); + if (ords[i].length > 0) { + assertEquals(ords[i].length, withMissingReplaced.cardinality()); + for (int j = 0; j < ords[i].length; ++j) { + assertEquals(values[ords[i][j]], withMissingReplaced.lookupOrd(withMissingReplaced.ordAt(j))); + } + } else { + assertEquals(1, withMissingReplaced.cardinality()); + assertEquals(missing, withMissingReplaced.lookupOrd(withMissingReplaced.ordAt(0))); + } + } + } + } + + public void testMissingLongs() { + final int numDocs = TestUtil.nextInt(random(), 1, 100); + final int[][] values = new int[numDocs][]; + for (int i = 0; i < numDocs; ++i) { + values[i] = new int[random().nextInt(4)]; + for (int j = 0; j < values[i].length; ++j) { + values[i][j] = randomInt(); + } + Arrays.sort(values[i]); + } + SortedNumericDocValues asNumericValues = new 
SortedNumericDocValues() { + + int i = -1; + + @Override + public long valueAt(int index) { + return values[i][index]; + } + + @Override + public void setDocument(int docId) { + i = docId; + } + + @Override + public int count() { + return values[i].length; + } + }; + final long missing = randomInt(); + SortedNumericDocValues withMissingReplaced = MissingValues.replaceMissing(asNumericValues, missing); + for (int i = 0; i < numDocs; ++i) { + withMissingReplaced.setDocument(i); + if (values[i].length > 0) { + assertEquals(values[i].length, withMissingReplaced.count()); + for (int j = 0; j < values[i].length; ++j) { + assertEquals(values[i][j], withMissingReplaced.valueAt(j)); + } + } else { + assertEquals(1, withMissingReplaced.count()); + assertEquals(missing, withMissingReplaced.valueAt(0)); + } + } + } + + public void testMissingDoubles() { + final int numDocs = TestUtil.nextInt(random(), 1, 100); + final double[][] values = new double[numDocs][]; + for (int i = 0; i < numDocs; ++i) { + values[i] = new double[random().nextInt(4)]; + for (int j = 0; j < values[i].length; ++j) { + values[i][j] = randomDouble(); + } + Arrays.sort(values[i]); + } + SortedNumericDoubleValues asNumericValues = new SortedNumericDoubleValues() { + + int i = -1; + + @Override + public double valueAt(int index) { + return values[i][index]; + } + + @Override + public void setDocument(int docId) { + i = docId; + } + + @Override + public int count() { + return values[i].length; + } + }; + final long missing = randomInt(); + SortedNumericDoubleValues withMissingReplaced = MissingValues.replaceMissing(asNumericValues, missing); + for (int i = 0; i < numDocs; ++i) { + withMissingReplaced.setDocument(i); + if (values[i].length > 0) { + assertEquals(values[i].length, withMissingReplaced.count()); + for (int j = 0; j < values[i].length; ++j) { + assertEquals(values[i][j], withMissingReplaced.valueAt(j), 0); + } + } else { + assertEquals(1, withMissingReplaced.count()); + assertEquals(missing, 
withMissingReplaced.valueAt(0), 0); + } + } + } + + public void testMissingGeoPoints() { + final int numDocs = TestUtil.nextInt(random(), 1, 100); + final GeoPoint[][] values = new GeoPoint[numDocs][]; + for (int i = 0; i < numDocs; ++i) { + values[i] = new GeoPoint[random().nextInt(4)]; + for (int j = 0; j < values[i].length; ++j) { + values[i][j] = new GeoPoint(randomDouble() * 90, randomDouble() * 180); + } + } + MultiGeoPointValues asGeoValues = new MultiGeoPointValues() { + + int i = -1; + + @Override + public GeoPoint valueAt(int index) { + return values[i][index]; + } + + @Override + public void setDocument(int docId) { + i = docId; + } + + @Override + public int count() { + return values[i].length; + } + }; + final GeoPoint missing = new GeoPoint(randomDouble() * 90, randomDouble() * 180); + MultiGeoPointValues withMissingReplaced = MissingValues.replaceMissing(asGeoValues, missing); + for (int i = 0; i < numDocs; ++i) { + withMissingReplaced.setDocument(i); + if (values[i].length > 0) { + assertEquals(values[i].length, withMissingReplaced.count()); + for (int j = 0; j < values[i].length; ++j) { + assertEquals(values[i][j], withMissingReplaced.valueAt(j)); + } + } else { + assertEquals(1, withMissingReplaced.count()); + assertEquals(missing, withMissingReplaced.valueAt(0)); + } + } + } +}