In [1]:
from requests import get, post, put, delete

In [7]:
# Loading some accounts data to perform aggregations on it

url = 'http://localhost:9200/accounts/personal/_bulk?pretty&refresh'
data = open('accounts.json', 'rb').read()
headers = {
    "Content-Type":"application/x-ndjson",
}
resp = post(url,data=data,headers=headers)
resp

<Response [200]>

In [9]:
# data columns we have in the above index
# {
#   "account_number" : 1,
#   "balance" : 39225,
#   "firstname" : "Amber",
#   "lastname" : "Duke",
#   "age" : 32,
#   "gender" : "M",
#   "address" : "880 Holmes Lane",
#   "employer" : "Pyrami",
#   "email" : "amberduke@pyrami.com",
#   "city" : "Brogan",
#   "state" : "IL"
# }

There are total 4 types of aggregations
 - Metric
 - Bucketing
 - Matrix
 - Pipeline

### Metric Aggregations

In [10]:
url = 'http://localhost:9200/accounts/_search?pretty'
query = {
    'size': 0,
    'aggs': {
        'total_balance': {
            'sum': {
                'field': 'balance'
            }
        }
    }
}

resp = post(url, json=query)
resp.json()

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'total_balance': {u'value': 25714837.0}},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 1000}},
 u'timed_out': False,
 u'took': 22}

In [11]:
# Stats, Getting all the basic metrics
url = 'http://localhost:9200/accounts/_search?pretty'
query = {
    'size': 0,
    'aggs': {
        'my_stats': {
            'stats': {
                'field': 'balance'
            }
        }
    }
}

resp = post(url, json=query)
resp.json()

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'my_stats': {u'avg': 25714.837,
   u'count': 1000,
   u'max': 49989.0,
   u'min': 1011.0,
   u'sum': 25714837.0}},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 1000}},
 u'timed_out': False,
 u'took': 33}

### Cardinality
The number of unique values in the field

In [13]:
# Also, we need to enable fielddata or use keyword
query = {
    'size': 0,
    'aggs': {
        'city_count': {
            'cardinality': {
                'field': 'city.keyword'
            }
        }
    }
}


resp = post(url, json=query)
resp.json()

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'city_count': {u'value': 999}},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 1000}},
 u'timed_out': False,
 u'took': 73}

#### Turning on fielddata on text field
<img src="turning-on-fielddata.png" />

In [None]:
url

### Bucketing Aggs

In [14]:
# Also, we need to enable fielddata or use keyword
query = {
    'size': 0,
    'aggs': {
        'gender_bucket': {
            'terms': {
                'field': 'gender.keyword'
            }
        }
    }
}


resp = post(url, json=query)
resp.json()

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'gender_bucket': {u'buckets': [{u'doc_count': 507,
     u'key': u'M'},
    {u'doc_count': 493, u'key': u'F'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 1000}},
 u'timed_out': False,
 u'took': 57}

In [17]:
# Multilevel Nested Aggregations
query = {
    'size': 0,
    'aggs': {
        'balance_bucket': {
            'range': {
                'field': 'balance',
                'ranges': [
                    {'to': 1000},
                    {'from': 1001, 'to': 5000},
                    {'from': 5001}
                ]
            }
        }
    }
}


resp = post(url, json=query)
resp.json()

{u'_shards': {u'failed': 0, u'skipped': 0, u'successful': 1, u'total': 1},
 u'aggregations': {u'balance_bucket': {u'buckets': [{u'doc_count': 0,
     u'key': u'*-1000.0',
     u'to': 1000.0},
    {u'doc_count': 79,
     u'from': 1001.0,
     u'key': u'1001.0-5000.0',
     u'to': 5000.0},
    {u'doc_count': 921, u'from': 5001.0, u'key': u'5001.0-*'}]}},
 u'hits': {u'hits': [],
  u'max_score': None,
  u'total': {u'relation': u'eq', u'value': 1000}},
 u'timed_out': False,
 u'took': 10}

### Bucketing, Metric Aggs

In [20]:
# Multilevel nested metric aggs with buckets
query = {
    'size': 0,
    'aggs': {
        'state_bucket': {
            'terms': {
                'field': 'state.keyword'
            },
            'aggs': {
            'gender_bucket': {
                'terms': {
                    'field': 'gender.keyword'
                },
                'aggs': {
                    'min_balance': {
                        'min': {
                            'field': 'balance'
                        }
                    },
                     'max_balance': {
                        'max': {
                            'field': 'balance'
                        }
                    }
                }
            }
        }
        }
    }
        
}


resp = post(url, json=query)
resp.json()

### Filter & Filters

In [22]:
# With Filter
query = {
    'size': 0,
    'aggs': {
        'gender': {
            'filter': {
                'match': {'gender': 'M'}
            },
            'aggs': {
                'avg_balance': {
                    'avg': {
                        'field': 'balance'
                    }
                }
            }
        }
    }
}

resp = post(url, json=query)
resp.json()

In [25]:
query = {
    'size': 0,
    'aggs': {
        'state_bucket': {
            'filters': {
                'other_bucket_key': 'Other States',
                'filters': {
                    'IL': {'match': {'state': 'IL'}},
                    'TN': {'match': {'state': 'TN'}}
                }
            }
        }
        
    }
}

resp = post(url, json=query)
resp.json()