### Overview

This notebook reviews the Elasticsearch `/<index>/_search` API and introduces the Siren-specific `/siren/<index>/_search` endpoint. All examples here use "hand-crafted" requests to the endpoints rather than convenience libraries like `elasticsearch-py` and `elasticsearch-dsl-py`, because those cannot work with the Siren endpoints.

Elasticsearch DSL docs - https://www.elastic.co/guide/en/elasticsearch/reference/7.10/query-dsl.html
Siren DSL docs - https://docs.siren.io/siren-federate-user-guide/24/siren-federate/search-apis.html#_search_api

#### Notebook setup

In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import os

import httpx

# Connection settings come from the environment so that real credentials
# never have to be saved inside the notebook.  The fallbacks reproduce the
# values this notebook was originally written against, so it still runs
# unchanged when none of the variables are set.
SIREN_URL = os.environ.get("SIREN_URL", "https://siren:9220")
SIREN_USER = os.environ.get("SIREN_USER", "sirenadmin")
SIREN_PASSWORD = os.environ.get("SIREN_PASSWORD", "password")

# One shared client: every request below is a path relative to `base_url`
# and inherits the basic-auth credentials and JSON content type set here.
client = httpx.Client(
    base_url=SIREN_URL,
    # NOTE(review): TLS certificate verification is switched off here --
    # presumably because the server uses a self-signed certificate; confirm
    # before pointing this at anything other than a local/dev cluster.
    verify=False,
    auth=(SIREN_USER, SIREN_PASSWORD),
    headers={"Content-Type": "application/json"},
)
client

<httpx.Client at 0x7fea78376190>

<IPython.core.display.Javascript object>

In [3]:
# Request the server root as a quick connectivity check; the bare `resp`
# on the last line displays the response status.
resp = client.get("/")
resp

<Response [200 OK]>

<IPython.core.display.Javascript object>

In [4]:
# Decode and display the JSON body of the response currently held in `resp`.
resp.json()

{'name': 'siren-node',
 'cluster_name': 'siren-distribution',
 'cluster_uuid': 'io6gD5WORPaz1_IqddTtog',
 'version': {'number': '7.10.2',
  'build_flavor': 'oss',
  'build_type': 'tar',
  'build_hash': '747e1cc71def077253878a59143c1f785afa92b9',
  'build_date': '2021-01-13T00:42:12.435326Z',
  'build_snapshot': False,
  'lucene_version': '8.7.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

<IPython.core.display.Javascript object>

### List indices

In [5]:
# List the cluster's indices through the cat API, asking for the answer
# as JSON via the `format` query parameter.
params = dict(format="json")
endpoint = "/_cat/indices"
resp = client.get(url=endpoint, params=params)
resp

<Response [200 OK]>

<IPython.core.display.Javascript object>

In [6]:
# Decode and display the index listing held in `resp`.
resp.json()

[{'health': 'yellow',
  'status': 'open',
  'index': 'investor_revised',
  'uuid': '_is0d7tiTPy8CvEZUTxcnQ',
  'pri': '1',
  'rep': '1',
  'docs.count': '0',
  'docs.deleted': '0',
  'store.size': '230b',
  'pri.store.size': '230b'},
 {'health': 'yellow',
  'status': 'open',
  'index': '.map__shape__world-countries_us_us-states_vermont_rivers_streams-river-geojson',
  'uuid': '1QGCLOLESVKeaghovLSWTQ',
  'pri': '1',
  'rep': '1',
  'docs.count': '223',
  'docs.deleted': '0',
  'store.size': '1.7mb',
  'pri.store.size': '1.7mb'},
 {'health': 'yellow',
  'status': 'open',
  'index': '.map__shape__world-countries_us_us-states_us-states-geojson',
  'uuid': 'ZRYs4cp6TU69V1zP8UWTsA',
  'pri': '1',
  'rep': '1',
  'docs.count': '57',
  'docs.deleted': '0',
  'store.size': '296.4kb',
  'pri.store.size': '296.4kb'},
 {'health': 'yellow',
  'status': 'open',
  'index': 'investment',
  'uuid': 'Yuv5kDRnRl-85DrozzSjqw',
  'pri': '1',
  'rep': '1',
  'docs.count': '41623',
  'docs.deleted': '0',
  '

<IPython.core.display.Javascript object>

In [7]:
# Reduce the listing to bare index names, dropping every index whose
# name starts with a dot.
index_names = (row["index"] for row in resp.json())
[name for name in index_names if not name.startswith(".")]

['investor_revised',
 'investment',
 'article_revised',
 'article',
 'investment_revised',
 'searchguard',
 'investor',
 'company',
 'company_revised']

<IPython.core.display.Javascript object>

### Vanilla ES

Querying the `company` index

#### Pull one result

In [8]:
# Ask the plain Elasticsearch search endpoint for exactly one document;
# the request body carries nothing but the hit limit.
body = {"size": 1}
endpoint = "/company/_search"

client.post(url=endpoint, json=body).json()

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'company',
    '_type': 'Company',
    '_id': 'BSy32XcBu9D2HElYRKk4',
    '_score': 1.0,
    '_ignored': ['deadpooled_date'],
    '_source': {'webpage': '',
     'phone_number': '4352004532',
     'location': '40.5649781, -111.8389726',
     'email_address': 'social@inspirationengine.com',
     'city': 'Sandy',
     'description': 'Simply *the* way to listen to customers.',
     'homepage_url': 'http://www.inspirationengine.com',
     'founded_month': 8,
     'revenuecurrency': '',
     'hasstatus': '',
     'one_competitor': None,
     'statecode': 'UT',
     'deadpooled_date': '',
     'url': 'http://www.crunchbase.com/company/inspiration-engine',
     'freebaseid': '',
     'number_of_employees': 1,
     'revenue': '',
     'permalink': 'inspiration-engine',
     'founded_year': 2010,


<IPython.core.display.Javascript object>

#### Bucket aggregation

In [9]:
endpoint = "/company/_search"

# Terms aggregation over `countrycode`.  No "query" key is sent, and
# "size": 0 means no individual hits are pulled back -- only the buckets.
country_buckets = {"by_country": {"terms": {"field": "countrycode"}}}
body = {"size": 0, "aggregations": country_buckets}

resp = client.post(url=endpoint, json=body)
resp.json()

{'took': 8,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'by_country': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 10626,
   'buckets': [{'key': 'USA', 'doc_count': 46192},
    {'key': 'GBR', 'doc_count': 6200},
    {'key': 'IND', 'doc_count': 3131},
    {'key': 'CAN', 'doc_count': 3062},
    {'key': 'DEU', 'doc_count': 1607},
    {'key': 'FRA', 'doc_count': 1401},
    {'key': 'AUS', 'doc_count': 1142},
    {'key': 'ESP', 'doc_count': 918},
    {'key': 'ISR', 'doc_count': 852},
    {'key': 'NLD', 'doc_count': 745}]}}}

<IPython.core.display.Javascript object>

In [10]:
endpoint = "/company/_search"

# Same bucketing idea, this time with a query filter: only documents
# matching `countrycode: USA` are aggregated, grouped by `city`.
usa_filter = {"query_string": {"query": "countrycode: USA"}}
city_buckets = {"by_city": {"terms": {"field": "city"}}}
body = {"size": 0, "query": usa_filter, "aggregations": city_buckets}

resp = client.post(url=endpoint, json=body)
resp.json()

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': None,
  'hits': []},
 'aggregations': {'by_city': {'doc_count_error_upper_bound': 0,
   'sum_other_doc_count': 32263,
   'buckets': [{'key': 'New York', 'doc_count': 3472},
    {'key': 'San Francisco', 'doc_count': 3374},
    {'key': 'Seattle', 'doc_count': 877},
    {'key': 'Los Angeles', 'doc_count': 866},
    {'key': 'Chicago', 'doc_count': 853},
    {'key': 'San Diego', 'doc_count': 732},
    {'key': 'Austin', 'doc_count': 731},
    {'key': 'Palo Alto', 'doc_count': 722},
    {'key': 'San Jose', 'doc_count': 588},
    {'key': 'Mountain View', 'doc_count': 560}]}}}

<IPython.core.display.Javascript object>

### Siren

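The `/siren/<index>/_search` endpoint accepts the same requests as vanilla ES and returns the same results (its responses additionally carry a `pit_id`), but it also accepts extra DSL options such as the `join` query clause shown below.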

In [11]:
# showing same behavior as Vanilla ES
endpoint = "/siren/company/_search"

body = {"size": 1}

client.post(endpoint, json=body).json()

{'pit_id': 'ad6f1580-f448-49c5-a909-d4c4959bc5a5',
 'took': 2,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 10000, 'relation': 'gte'},
  'max_score': 1.0,
  'hits': [{'_index': 'company',
    '_type': 'Company',
    '_id': 'BSy32XcBu9D2HElYRKk4',
    '_score': 1.0,
    '_ignored': ['deadpooled_date'],
    '_source': {'webpage': '',
     'phone_number': '4352004532',
     'location': '40.5649781, -111.8389726',
     'email_address': 'social@inspirationengine.com',
     'city': 'Sandy',
     'description': 'Simply *the* way to listen to customers.',
     'homepage_url': 'http://www.inspirationengine.com',
     'founded_month': 8,
     'revenuecurrency': '',
     'hasstatus': '',
     'one_competitor': None,
     'statecode': 'UT',
     'deadpooled_date': '',
     'url': 'http://www.crunchbase.com/company/inspiration-engine',
     'freebaseid': '',
     'number_of_employees': 1,
     'revenue': '',
     'permalink'

<IPython.core.display.Javascript object>

#### Joined query

In [12]:
endpoint = "/siren/company/_search"

# Only companies that have been reported on by NYT.
# The join pairs the `id` field of the searched `company` index with the
# `companies` field of `article`; the nested request selects which
# articles take part in the join.
nyt_articles = {
    "query": {"query_string": {"query": 'article.source:"New York Times"'}},
}
join_clause = {
    "indices": ["article"],
    "on": ["id", "companies"],
    "request": nyt_articles,
}
body = {"size": 1, "query": {"join": join_clause}}

client.post(url=endpoint, json=body).json()

{'pit_id': 'a8b4dbd8-349d-4487-a0b7-fa58cd4cc392',
 'took': 3,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 921, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'company',
    '_type': 'Company',
    '_id': '6Sy32XcBu9D2HElYQ6bx',
    '_score': 1.0,
    '_ignored': ['deadpooled_date'],
    '_source': {'webpage': '',
     'phone_number': '',
     'location': None,
     'email_address': 'info@favorites.bz',
     'city': None,
     'description': 'Visual Web Directory',
     'homepage_url': 'http://www.favorites.bz',
     'founded_month': 10,
     'revenuecurrency': '',
     'hasstatus': '',
     'one_competitor': 'company/allmyfaves',
     'statecode': None,
     'deadpooled_date': '',
     'url': 'http://www.crunchbase.com/company/favorites',
     'freebaseid': '',
     'number_of_employees': 2,
     'revenue': '',
     'permalink': 'favorites',
     'founded_year': 2012,
     'id': 'company/favorite

<IPython.core.display.Javascript object>

#### Sanity check the join

If we wanted to do "manually" what Siren offers in the previous join query, we could collect all companies reported on by NYT with a bucket aggregation on the `article` index, then query the `company` index for those ids with a terms query. Both approaches should report the same number of matching companies.

In [13]:
endpoint = "/siren/article/_search"

# Upper bound on how many distinct company buckets the terms aggregation
# may return.  It is only meant to be "large enough to get all of them".
MAX_COMPANY_BUCKETS = 50000

# No hits are requested ("size": 0); we only want one bucket per distinct
# value of `companies` among the articles matching the NYT filter.
body = {
    "size": 0,
    "query": {"query_string": {"query": 'article.source:"New York Times"'}},
    "aggregations": {
        "companies": {"terms": {"field": "companies", "size": MAX_COMPANY_BUCKETS}}
    },
}

resp = client.post(endpoint, json=body)
js = resp.json()
company_agg = js["aggregations"]["companies"]

# `sum_other_doc_count` is the document count of the buckets that did NOT
# fit in the response.  Anything above zero means the company list below
# would be silently truncated and the sanity check would be meaningless.
assert company_agg["sum_other_doc_count"] == 0, (
    "terms aggregation was truncated; raise MAX_COMPANY_BUCKETS"
)

buckets = company_agg["buckets"]
companies = [row["key"] for row in buckets]
len(companies)

921

<IPython.core.display.Javascript object>

In [14]:
# Peek at the first five entries of the `companies` list.
companies[:5]

['company/google',
 'company/apple',
 'company/microsoft',
 'company/facebook',
 'company/yahoo']

<IPython.core.display.Javascript object>

In [15]:
endpoint = "/siren/company/_search"

# Manual counterpart of the join: match companies whose `id` is one of
# the values held in the `companies` list, returning a single hit.
id_filter = {"terms": {"id": companies}}
body = {"size": 1, "query": id_filter}

resp = client.post(url=endpoint, json=body)
resp.json()

{'pit_id': '063aa619-4bc4-4304-b5a3-c9b972663ff9',
 'took': 22,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 921, 'relation': 'eq'},
  'max_score': 1.0,
  'hits': [{'_index': 'company',
    '_type': 'Company',
    '_id': '6Sy32XcBu9D2HElYQ6bx',
    '_score': 1.0,
    '_ignored': ['deadpooled_date'],
    '_source': {'webpage': '',
     'phone_number': '',
     'location': None,
     'email_address': 'info@favorites.bz',
     'city': None,
     'description': 'Visual Web Directory',
     'homepage_url': 'http://www.favorites.bz',
     'founded_month': 10,
     'revenuecurrency': '',
     'hasstatus': '',
     'one_competitor': 'company/allmyfaves',
     'statecode': None,
     'deadpooled_date': '',
     'url': 'http://www.crunchbase.com/company/favorites',
     'freebaseid': '',
     'number_of_employees': 2,
     'revenue': '',
     'permalink': 'favorites',
     'founded_year': 2012,
     'id': 'company/favorit

<IPython.core.display.Javascript object>

#### Filter on original and joined index

In [16]:
endpoint = "/siren/company/_search"

# Companies located in MD that have been reported on by NYT.
# Both conditions sit in a bool/must list, so a company has to satisfy
# the filter on its own index AND the join against `article`.
located_in_md = {"query_string": {"query": "statecode:MD"}}
reported_by_nyt = {
    "join": {
        "indices": ["article"],
        "on": ["id", "companies"],
        "request": {
            "query": {"query_string": {"query": 'article.source:"New York Times"'}},
        },
    },
}
body = {
    "size": 1,
    "query": {"bool": {"must": [located_in_md, reported_by_nyt]}},
}

client.post(url=endpoint, json=body).json()

{'pit_id': 'cdf084e7-7ac0-4422-9d29-3ca168552bf0',
 'took': 6,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 5.270296,
  'hits': [{'_index': 'company',
    '_type': 'Company',
    '_id': '2S232XcBu9D2HElYgbAu',
    '_score': 5.270296,
    '_ignored': ['deadpooled_date'],
    '_source': {'webpage': '',
     'phone_number': '',
     'location': '39.2894651, -76.6166397',
     'email_address': '',
     'city': 'Baltimore',
     'description': 'New Times Demand New Journalism',
     'homepage_url': 'http://www.thedaily.com',
     'founded_month': 0,
     'revenuecurrency': '',
     'hasstatus': '',
     'one_competitor': None,
     'statecode': 'MD',
     'deadpooled_date': '',
     'url': 'http://www.crunchbase.com/company/the-daily',
     'freebaseid': '',
     'number_of_employees': None,
     'revenue': '',
     'permalink': 'the-daily',
     'founded_year': 1888,
     'id': '

<IPython.core.display.Javascript object>