### Overview

This notebook uses the vanilla [elasticsearch-py](https://github.com/elastic/elasticsearch-py) and [elasticsearch-dsl-py](https://github.com/elastic/elasticsearch-dsl-py) clients to interact with Siren, demonstrating that the default Elasticsearch endpoints are unchanged.

You can query, aggregate, inspect indices, and write data to Siren just as if it were a normal Elasticsearch cluster.

#### Notebook setup

In [1]:
# Load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably the source of the Javascript output shown after each cell — confirm.
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import warnings

# Stop the flood of insecure request warnings
warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [3]:
from tqdm.auto import tqdm

<IPython.core.display.Javascript object>

### Connect

In [4]:
import os

import elasticsearch as es
import elasticsearch_dsl as dsl

# Connection settings are read from the environment so real credentials never
# have to be saved in the notebook; the fallbacks are the defaults this
# notebook was written against.
SIREN_URL = os.environ.get("SIREN_URL", "https://siren:9220")
SIREN_USER = os.environ.get("SIREN_USER", "sirenadmin")
SIREN_PASSWORD = os.environ.get("SIREN_PASSWORD", "password")

client = es.Elasticsearch(
    SIREN_URL,
    http_auth=(SIREN_USER, SIREN_PASSWORD),
    # Certificate verification is switched off here.
    # NOTE(review): presumably because the server uses a self-signed
    # certificate — confirm, and enable verification for real deployments.
    verify_certs=False,
)
client

<Elasticsearch([{'host': 'siren', 'port': 9220, 'use_ssl': True}])>

<IPython.core.display.Javascript object>

In [5]:
# Quick connectivity check before issuing real requests.
is_reachable = client.ping()
is_reachable

True

<IPython.core.display.Javascript object>

In [6]:
# Basic cluster metadata as reported by the root endpoint.
cluster_info = client.info()
cluster_info

{'name': 'siren-node',
 'cluster_name': 'siren-distribution',
 'cluster_uuid': 'io6gD5WORPaz1_IqddTtog',
 'version': {'number': '7.10.2',
  'build_flavor': 'oss',
  'build_type': 'tar',
  'build_hash': '747e1cc71def077253878a59143c1f785afa92b9',
  'build_date': '2021-01-13T00:42:12.435326Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

<IPython.core.display.Javascript object>

In [7]:
# The mapping response is keyed by index name, so its keys list every index.
mappings_by_index = client.indices.get_mapping()
mappings_by_index.keys()

dict_keys(['investment_revised', 'company', '.map__shape__world-countries_us_us-states_vermont_rivers_streams-river-geojson', 'searchguard', 'company_revised', '.sirenaccess', '.map__shape__world-countries_us_us-states_us-states-geojson', 'investment', 'investor', 'article_revised', '.map__shape__world-countries_ireland_counties_ireland-counties-geojson', '.map__shape__world-countries_us_us-states_texas_city-polygons_texascitypolygons-geojson', '.map__point__world-countries_us_us-states_california_steelhead-abundance_steelhead-abundance--point-features-geojson', '.map__shape__world-countries_worldcountries-geojson', 'investor_revised', '.siren', '.map__point__world-countries_us_us-states_california_chinook-abundance_chinook-abundance--point-features-geojson', '.map__shape__world-countries_ireland_rail-network_irish-rail-network-geojson', 'article', '.map__shape__world-lakes_ne-10m-lakes-geojson'])

<IPython.core.display.Javascript object>

In [8]:
# Keep only the indices whose name does not start with ".", in sorted order.
indices = sorted(
    index_name
    for index_name in client.indices.get_mapping()
    if not index_name.startswith(".")
)
indices

['article',
 'article_revised',
 'company',
 'company_revised',
 'investment',
 'investment_revised',
 'investor',
 'investor_revised',
 'searchguard']

<IPython.core.display.Javascript object>

### Query

In [9]:
# Print the raw count response for each index collected above.
for index_name in tqdm(indices):
    count_response = client.count(index=index_name)
    print(index_name, count_response)

  0%|          | 0/9 [00:00<?, ?it/s]

article {'count': 646903, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
article_revised {'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
company {'count': 160106, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
company_revised {'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
investment {'count': 41623, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
investment_revised {'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
investor {'count': 14959, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
investor_revised {'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}
searchguard {'count': 0, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}


<IPython.core.display.Javascript object>

In [10]:
# Fetch a single document to see what a raw search response looks like.
sample_response = client.search(index="article", size=1)
sample_response

{'took': 4,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'article',
    '_type': 'Article',
    '_id': 'cSOr2XcBu9D2HElYwL6P',
    '_score': 1.0,
    '_source': {'snippet': '         Some AT&T Wireless customers have voted an emphatic no on a promotion for    American Idol    that popped up on their phones this week.         AT&T, a sponsor of the show, said it sent text messages to a    significant number    of its 75 million customers    ',
     'image': '',
     'pmonth': 1,
     'siren': {'nlp': {'taxonomy_annotated': {'snippet': '         Some AT&T Wireless customers have voted an emphatic no on a promotion for    American Idol    that popped up on their phones this week.         AT&T, a sponsor of the show, said it sent text messages to a    significant number    of its 75 million customers    ',
        'title': "A Text Arrives.   Oh, It

<IPython.core.display.Javascript object>

In [11]:
# Describe a terms aggregation on the "source" field with the DSL, attach it
# to an otherwise empty search under the name "sources", and serialise the
# search to a plain dict that can be sent as a request body.
sources_terms = dsl.A("terms", field="source")

s = dsl.Search()
s.aggs.bucket("sources", sources_terms)

body = s.to_dict()
body

{'aggs': {'sources': {'terms': {'field': 'source'}}}}

<IPython.core.display.Javascript object>

In [12]:
# Send the aggregation body built earlier; size=0 asks for no document hits,
# leaving just the aggregation buckets in the response.
sources_response = client.search(index="article", size=0, body=body)
sources_response

{'took': 127,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'sources': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 44668,
   'buckets': [{'key': 'ZDNet', 'doc_count': 227993},
    {'key': 'Gigaom', 'doc_count': 82080},
    {'key': 'Mashable', 'doc_count': 55248},
    {'key': 'TechCrunch', 'doc_count': 51075},
    {'key': 'Thenextweb', 'doc_count': 50272},
    {'key': 'VentureBeat', 'doc_count': 41186},
    {'key': 'ReadWrite', 'doc_count': 27603},
    {'key': 'The Verge', 'doc_count': 27297},
    {'key': 'GamesBeat', 'doc_count': 21901},
    {'key': 'Ars Technica', 'doc_count': 17580}]}}}

<IPython.core.display.Javascript object>

In [13]:
# Lucene query-string search passed through `q`, limited to five hits by `size`.
lucene_response = client.search(index="article", q="snippet:siren", size=5)
lucene_response

{'took': 28,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 11, 'relation': 'eq'},
  'max_score': 12.269118,
  'hits': [{'_index': 'article',
    '_type': 'Article',
    '_id': 'aCSs2XcBu9D2HElYiVza',
    '_score': 12.269118,
    '_source': {'snippet': 'So far, I have resisted the siren call of Twitter. But I love Twitter search -- especially when a tweet-happy Microsoft group is posting up a storm.',
     'image': '',
     'pmonth': 9,
     'siren': {'nlp': {'taxonomy_annotated': {'snippet': 'So far, I have resisted the siren call of Twitter. But I love Twitter search -- especially when a tweet-happy Microsoft group is posting up a storm.',
        'title': "The 10 best tweets (so far) from Microsoft's MVP Summit"},
       'instances': {'snippet': {'entity/organization': [{'nerType': 'Organization',
           'probability': 0.7769690556821683,
           'start': 110,
           'match': 'Microsoft',
           

<IPython.core.display.Javascript object>

In [14]:
# Lucene query string in `q` combined with the aggregation `body`; size=0
# requests no document hits, so the response carries only the buckets.
lucene_sources_response = client.search(
    index="article",
    q="snippet:siren",
    body=body,
    size=0,
)
lucene_sources_response

{'took': 19,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 11, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'sources': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 0,
   'buckets': [{'key': 'ZDNet', 'doc_count': 8},
    {'key': 'TechCrunch', 'doc_count': 2},
    {'key': 'Forbes', 'doc_count': 1}]}}}

<IPython.core.display.Javascript object>