Skip to content

Commit

Permalink
METRON-1366: Add an entropy stellar function (cstella via mmiklavc) c…
Browse files Browse the repository at this point in the history
…loses apache#872
  • Loading branch information
cstella authored and iraghumitra committed Feb 17, 2018
1 parent d158281 commit cdf3542
Show file tree
Hide file tree
Showing 5 changed files with 159 additions and 0 deletions.
7 changes: 7 additions & 0 deletions metron-analytics/metron-statistics/README.md
Expand Up @@ -217,6 +217,13 @@ functions can be used from everywhere where Stellar is used.
* stats - The Stellar statistics object
* Returns: The variance of the values in the window or NaN if the statistics object is null.

### Information Theory Functions

#### `IT_ENTROPY`
* Description: Computes the base-2 entropy of a multiset
* Input:
* input - a multiset (a map of objects to counts).
* Returns: The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the distribution described by the counts in the multiset. The unit is bits.

### Statistical Outlier Detection

Expand Down
@@ -0,0 +1,53 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.metron.statistics.informationtheory;

import org.apache.metron.stellar.dsl.BaseStellarFunction;
import org.apache.metron.stellar.dsl.Stellar;

import java.util.List;
import java.util.Map;

/**
 * Container for the Stellar functions in the {@code IT} (information theory) namespace.
 */
public class InformationTheoryFunctions {

  /**
   * Stellar function {@code IT_ENTROPY}: takes a multiset, represented as a map from
   * objects to their counts, and delegates to
   * {@link InformationTheoryUtil#bitEntropy(Map)} to compute its entropy in bits.
   *
   * <p>Only the first argument is inspected. A {@code null} first argument yields
   * {@code null}; a first argument that is not a {@link Map} is rejected.
   */
  @Stellar( namespace="IT"
          , name="ENTROPY"
          , description = "Computes the base-2 entropy of a multiset"
          , params = { "input - a multiset (a map of objects to counts)" }
          , returns = "The [base-2 entropy](https://en.wikipedia.org/wiki/Entropy_(information_theory)#Definition) of the count . The unit of this is bits."
          )
  public static class Entropy extends BaseStellarFunction {

    /**
     * @param args the Stellar call arguments; the first one is the multiset
     * @return the entropy in bits as a boxed double, or {@code null} when the first
     *         argument is {@code null}
     * @throws IllegalArgumentException if no argument is supplied or the first
     *         argument is not a {@link Map}
     */
    @Override
    public Object apply(List<Object> args) {
      if (args.size() == 0) {
        throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument.");
      }

      final Object candidate = args.get(0);
      if (candidate == null) {
        // Null in, null out: a missing multiset has no entropy to report.
        return null;
      }

      if (candidate instanceof Map) {
        // The value type cannot be verified at runtime because of erasure; the
        // cast is unchecked and relies on the caller passing integer counts.
        @SuppressWarnings("unchecked")
        final Map<?, Integer> multiset = (Map<?, Integer>) candidate;
        return InformationTheoryUtil.INSTANCE.bitEntropy(multiset);
      }

      throw new IllegalArgumentException("IT_ENTROPY expects exactly one argument and expects it to be a map of counts (e.g. Map<?, Integer>)");
    }
  }
}
@@ -0,0 +1,52 @@
/*
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
package org.apache.metron.statistics.informationtheory;

import java.util.Map;

/**
 * Stateless information-theory helpers, exposed through a single enum instance.
 *
 * <p>All methods treat their input as a multiset: a map from arbitrary keys to the
 * number of times each key was observed. Counts are expected to be non-negative;
 * the result for negative counts is not meaningful.
 */
public enum InformationTheoryUtil {
  INSTANCE;

  /** Natural log of 2, used to convert natural-log entropy into bits. */
  private static final double LOG2 = Math.log(2);

  /**
   * Computes the Shannon entropy of the empirical distribution described by
   * {@code counts}, i.e. {@code -sum(p_i * log(p_i)) / logOfBase} where
   * {@code p_i = count_i / sum(counts)}.
   *
   * @param counts    map of items to their occurrence counts; may be {@code null}
   * @param logOfBase natural logarithm of the desired base (e.g. {@code Math.log(2)}
   *                  for bits)
   * @return the entropy in the requested base; {@code 0.0} when {@code counts} is
   *         {@code null}, empty, or sums to zero
   */
  public double entropy(Map<?, Integer> counts, double logOfBase) {
    if (counts == null || counts.isEmpty()) {
      return 0.0;
    }

    // Accumulate in a long: summing many large int counts in an int silently
    // wraps around and produces negative "probabilities".
    long total = 0L;
    for (Integer count : counts.values()) {
      if (count != null) {
        total += count;
      }
    }
    if (total == 0L) {
      // No observations at all; avoid dividing by zero below.
      return 0.0;
    }

    double ret = 0.0;
    for (Integer count : counts.values()) {
      if (count == null || count == 0) {
        // By convention 0 * log(0) contributes 0 to the entropy. Evaluating it
        // numerically gives 0 * -Infinity = NaN, which would poison the sum.
        continue;
      }
      double p = count.doubleValue() / total;
      ret -= p * Math.log(p) / logOfBase;
    }
    return ret;
  }

  /**
   * Computes the entropy of {@code counts} in an integer base.
   *
   * @param counts map of items to their occurrence counts; may be {@code null}
   * @param base   the logarithm base; should be at least 2 ({@code Math.log(1)} is 0,
   *               which makes the result undefined)
   * @return the entropy in the given base
   */
  public double entropy(Map<?, Integer> counts, int base) {
    return entropy(counts, Math.log(base));
  }

  /**
   * Computes the base-2 entropy of {@code counts}.
   *
   * @param counts map of items to their occurrence counts; may be {@code null}
   * @return the entropy in bits
   */
  public double bitEntropy(Map<?, Integer> counts) {
    return entropy(counts, LOG2);
  }
}
@@ -0,0 +1,46 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.metron.statistics.informationtheory;

import com.google.common.collect.ImmutableMap;
import org.junit.Assert;
import org.junit.Test;

import java.util.HashMap;

import static org.apache.metron.stellar.common.utils.StellarProcessorUtils.run;

/**
 * Exercises the {@code IT_ENTROPY} Stellar function end to end through the
 * Stellar expression runner.
 */
public class EntropyTest {

  /**
   * Checks the entropy of an empty multiset and of a small multiset whose
   * entropy can be worked out by hand.
   */
  @Test
  public void entropyTest() throws Exception {
    // An empty multiset carries no information.
    Double emptyEntropy = (Double) run("IT_ENTROPY({})", new HashMap<>());
    Assert.assertEquals(0.0, emptyEntropy, 0.0);

    /*
     Take the 20-character string aaaaaaaaaabbbbbccccc: 10 a's, 5 b's and 5 c's.
     The character probabilities are
       p(a) = 10/20 = 1/2
       p(b) =  5/20 = 1/4
       p(c) =  5/20 = 1/4
     and therefore the Shannon entropy in bits is
       -p(a)*log_2(p(a)) - p(b)*log_2(p(b)) - p(c)*log_2(p(c))
         = -0.5*(-1) - 0.25*(-2) - 0.25*(-2)
         = 1.5
     */
    Double skewedEntropy =
        (Double) run("IT_ENTROPY({ 'a' : 10, 'b' : 5, 'c' : 5} )", new HashMap<>());
    Assert.assertEquals(1.5, skewedEntropy, 0.0);
  }
}
1 change: 1 addition & 0 deletions metron-stellar/stellar-common/README.md
Expand Up @@ -189,6 +189,7 @@ Where:
| [ `HLLP_INIT`](../../metron-analytics/metron-statistics#hllp_init) |
| [ `HLLP_MERGE`](../../metron-analytics/metron-statistics#hllp_merge) |
| [ `IN_SUBNET`](#in_subnet) |
| [ `IT_ENTROPY`](../../metron-analytics/metron-statistics#it_entropy) |
| [ `IS_DATE`](#is_date) |
| [ `IS_ENCODING`](#is_encoding) |
| [ `IS_DOMAIN`](#is_domain) |
Expand Down

0 comments on commit cdf3542

Please sign in to comment.