# The definitive guide
## 01 - Tutorial

Start a single-node Elasticsearch 6.2.4 container before running the cells below:

`sudo docker run --name ungol-es_guide -p 9200:9200 -p 9300:9300 -e 'discovery.type=single-node' docker.elastic.co/elasticsearch/elasticsearch:6.2.4`

* https://www.elastic.co/guide/en/elasticsearch/reference/6.0/removal-of-types.html

In [1]:
import json
import pathlib

# Directory (relative to the working directory) holding the tutorial's JSON
# input, e.g. employees.json.
SRC = pathlib.Path('src/')

In [6]:
from pprint import pprint as pp
import elasticsearch as es

# Client for the local node; its HTTP API listens on localhost:9200.
e = es.Elasticsearch(hosts=[dict(host='localhost', port=9200)])

# Sanity check: pretty-print the cluster and version info the server reports.
pp(e.info())

{'cluster_name': 'docker-cluster',
 'cluster_uuid': 'O37AcGfuSMe1_i68NGyEPg',
 'name': 'DbF5o3U',
 'tagline': 'You Know, for Search',
 'version': {'build_date': '2018-04-12T20:37:28.497551Z',
             'build_hash': 'ccec39f',
             'build_snapshot': False,
             'lucene_version': '7.2.1',
             'minimum_index_compatibility_version': '5.0.0',
             'minimum_wire_compatibility_version': '5.6.0',
             'number': '6.2.4'}}


In [9]:
# Public attributes of the client: every name without a leading underscore.
', '.join(name for name in dir(e) if not name.startswith('_'))

'bulk, cat, clear_scroll, cluster, count, create, delete, delete_by_query, delete_script, exists, exists_source, explain, field_caps, get, get_script, get_source, get_template, index, indices, info, ingest, mget, msearch, msearch_template, mtermvectors, nodes, ping, put_script, put_template, reindex, reindex_rethrottle, remote, render_search_template, scroll, search, search_shards, search_template, snapshot, tasks, termvectors, transport, update, update_by_query, xpack'

In [15]:
# No index is passed, so this presumably counts documents across every index
# in the cluster, not just the tutorial's — TODO confirm.
pp(e.count())

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'count': 1674}


In [21]:
# Parse the employee records straight from the open file handle.
with (SRC / 'employees.json').open('r') as fixture:
    employees = json.load(fixture)

# Number of employee records that were loaded.
pp(len(employees))

3


## Populate the index

In [31]:
# A document type still has to be supplied here. It is unclear how the client
# will handle this in ES7, where document types are removed for good:
#
# https://github.com/elastic/elasticsearch-py/issues/646
idx_args = dict(index='employee', doc_type='bullshit')

# e.create(...) corresponds to e.index(..., op_type='create'): it does not
# overwrite an existing document but fails with a 409 conflict instead.
try:
    res = e.create(id=1, body=employees[0], **idx_args)
except es.ConflictError:
    # Document 1 is already indexed (e.g. from an earlier run): fetch it instead.
    pp('employee already exists')
    res = e.get(id=1, **idx_args)

pp(res)

PUT http://localhost:9200/employee/bullshit/1/_create [status:409 request:0.004s]


'employee already exists'
{'_id': '1',
 '_index': 'employee',
 '_source': {'about': 'I love to go rock climbing',
             'age': 25,
             'first_name': 'John',
             'interests': ['sports', 'music'],
             'last_name': 'Smith'},
 '_type': 'bullshit',
 '_version': 1,
 'found': True}


In [32]:
# Index every employee after the first one; document ids continue at 2.
# Documents that already exist are fetched rather than created again.
for doc_id, employee in enumerate(employees[1:], start=2):
    if e.exists(id=doc_id, **idx_args):
        ret = e.get(id=doc_id, **idx_args)
    else:
        ret = e.create(id=doc_id, body=employee, **idx_args)
    pp(ret)

{'_id': '2',
 '_index': 'employee',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': 'bullshit',
 '_version': 1,
 'result': 'created'}
{'_id': '3',
 '_index': 'employee',
 '_primary_term': 1,
 '_seq_no': 0,
 '_shards': {'failed': 0, 'successful': 1, 'total': 2},
 '_type': 'bullshit',
 '_version': 1,
 'result': 'created'}


## Execute a simple search

In [33]:
# Search without a query, scoped to the index/type in idx_args.
# By default, only the top 10 results are returned.
e.search(**idx_args)

{'took': 21,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 3,
  'max_score': 1.0,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '2',
    '_score': 1.0,
    '_source': {'first_name': 'Jane',
     'last_name': 'Smith',
     'age': 32,
     'about': 'I like to collect rock albums',
     'interests': ['music']}},
   {'_index': 'employee',
    '_type': 'bullshit',
    '_id': '1',
    '_score': 1.0,
    '_source': {'first_name': 'John',
     'last_name': 'Smith',
     'age': 25,
     'about': 'I love to go rock climbing',
     'interests': ['sports', 'music']}},
   {'_index': 'employee',
    '_type': 'bullshit',
    '_id': '3',
    '_score': 1.0,
    '_source': {'first_name': 'Douglas',
     'last_name': 'Fir',
     'age': 35,
     'about': 'I like to build cabinets',
     'interests': ['forestry']}}]}}

In [34]:
# Search using the Lucene query-string syntax, passed via the `q` parameter:
# here, documents whose last_name field matches "Smith".
e.search(q='last_name:Smith', **idx_args)

{'took': 22,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 2,
  'max_score': 0.2876821,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '2',
    '_score': 0.2876821,
    '_source': {'first_name': 'Jane',
     'last_name': 'Smith',
     'age': 32,
     'about': 'I like to collect rock albums',
     'interests': ['music']}},
   {'_index': 'employee',
    '_type': 'bullshit',
    '_id': '1',
    '_score': 0.2876821,
    '_source': {'first_name': 'John',
     'last_name': 'Smith',
     'age': 25,
     'about': 'I love to go rock climbing',
     'interests': ['sports', 'music']}}]}}

In [35]:
# The same last-name lookup, expressed with the query DSL as a `match` query
# in the request body.
e.search(
    body={'query': {'match': {'last_name': 'Smith'}}},
    **idx_args
)

{'took': 6,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 2,
  'max_score': 0.2876821,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '2',
    '_score': 0.2876821,
    '_source': {'first_name': 'Jane',
     'last_name': 'Smith',
     'age': 32,
     'about': 'I like to collect rock albums',
     'interests': ['music']}},
   {'_index': 'employee',
    '_type': 'bullshit',
    '_id': '1',
    '_score': 0.2876821,
    '_source': {'first_name': 'John',
     'last_name': 'Smith',
     'age': 25,
     'about': 'I love to go rock climbing',
     'interests': ['sports', 'music']}}]}}

## More complicated searches

In [36]:
# Bool query combining two clauses: a `match` on last_name that every hit
# must satisfy, and a `range` filter restricting hits to age > 30.
query = {
    'query': {
        'bool': {
            'must': {'match': {'last_name': 'smith'}},
            'filter': {'range': {'age': {'gt': 30}}},
        },
    },
}

e.search(body=query, **idx_args)

{'took': 16,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 1,
  'max_score': 0.2876821,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '2',
    '_score': 0.2876821,
    '_source': {'first_name': 'Jane',
     'last_name': 'Smith',
     'age': 32,
     'about': 'I like to collect rock albums',
     'interests': ['music']}}]}}

## Full text searches

In [37]:
# Full-text `match` on the `about` field: documents containing "rock" or
# "climbing" are returned, ranked by relevance score.
# The search is scoped with idx_args like the other searches in this notebook;
# without an index it would run against every index in the cluster.
e.search(body={'query': {'match': {'about': 'rock climbing'}}}, **idx_args)

{'took': 15,
 'timed_out': False,
 '_shards': {'total': 6, 'successful': 6, 'skipped': 0, 'failed': 0},
 'hits': {'total': 2,
  'max_score': 0.5753642,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '1',
    '_score': 0.5753642,
    '_source': {'first_name': 'John',
     'last_name': 'Smith',
     'age': 25,
     'about': 'I love to go rock climbing',
     'interests': ['sports', 'music']}},
   {'_index': 'employee',
    '_type': 'bullshit',
    '_id': '2',
    '_score': 0.2876821,
    '_source': {'first_name': 'Jane',
     'last_name': 'Smith',
     'age': 32,
     'about': 'I like to collect rock albums',
     'interests': ['music']}}]}}

In [38]:
# `match_phrase` on the `about` field: only documents containing the phrase
# "rock climbing" as a whole are returned.
# The search is scoped with idx_args like the other searches in this notebook;
# without an index it would run against every index in the cluster.
e.search(body={'query': {'match_phrase': {'about': 'rock climbing'}}}, **idx_args)

{'took': 35,
 'timed_out': False,
 '_shards': {'total': 6, 'successful': 6, 'skipped': 0, 'failed': 0},
 'hits': {'total': 1,
  'max_score': 0.5753642,
  'hits': [{'_index': 'employee',
    '_type': 'bullshit',
    '_id': '1',
    '_score': 0.5753642,
    '_source': {'first_name': 'John',
     'last_name': 'Smith',
     'age': 25,
     'about': 'I love to go rock climbing',
     'interests': ['sports', 'music']}}]}}

## Aggregations

In [46]:
# Terms aggregation: one bucket per distinct value of interests.keyword.
agg_terms = dict(terms=dict(field='interests.keyword'))
query = dict(aggs=dict(all_interests=agg_terms))

pp('query')
pp(query)

res = e.search(body=query, **idx_args)

# Print each bucket (key and document count) on its own.
pp('response')
interest_buckets = res['aggregations']['all_interests']['buckets']
for bucket in interest_buckets:
    pp(bucket)

'query'
{'aggs': {'all_interests': {'terms': {'field': 'interests.keyword'}}}}
'response'
{'doc_count': 2, 'key': 'music'}
{'doc_count': 1, 'key': 'forestry'}
{'doc_count': 1, 'key': 'sports'}


In [48]:
# Average age of the employees sharing an interest: an `avg` sub-aggregation
# on `age` is nested inside the terms aggregation (agg_terms, defined in the
# previous cell).
agg_age = dict(aggs=dict(avg_age=dict(avg=dict(field='age'))))
query = dict(aggs=dict(all_interests=dict(agg_terms, **agg_age)))

pp('query')
pp(query)

res = e.search(body=query, **idx_args)

# Each bucket now also carries the avg_age value for its members.
pp('response')
interest_buckets = res['aggregations']['all_interests']['buckets']
for bucket in interest_buckets:
    pp(bucket)

'query'
{'aggs': {'all_interests': {'aggs': {'avg_age': {'avg': {'field': 'age'}}},
                            'terms': {'field': 'interests.keyword'}}}}
'response'
{'avg_age': {'value': 28.5}, 'doc_count': 2, 'key': 'music'}
{'avg_age': {'value': 35.0}, 'doc_count': 1, 'key': 'forestry'}
{'avg_age': {'value': 25.0}, 'doc_count': 1, 'key': 'sports'}
