# Elasticsearch

In this notebook we will setup an Elasticsearch server, read in Shakespeares works, and analyze them to unerstand term vectors.

You may mix direct API calls, the Python API, or url calls from Python. Whatever gives you access to the data.



### Install the necessary elasticsearch Python packages

In [161]:
!pip install 'elasticsearch<7.14.0'

# docs are here https://elasticsearch-py.readthedocs.io/en/v7.13.4/#



### Import packages

In [162]:
import os
import time
from elasticsearch import Elasticsearch
import numpy as np
import pandas as pd
import json
from pprint import pprint
import requests

## Setup Elasticsearch Instance


In [163]:
%%bash

wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
wget -q https://artifacts.elastic.co/downloads/elasticsearch/elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512
tar -xzf elasticsearch-oss-7.9.2-linux-x86_64.tar.gz
sudo chown -R daemon:daemon elasticsearch-7.9.2/
shasum -a 512 -c elasticsearch-oss-7.9.2-linux-x86_64.tar.gz.sha512 

elasticsearch-oss-7.9.2-linux-x86_64.tar.gz: OK


Run the instance as a daemon (background) process

In [164]:
%%bash --bg

sudo -H -u daemon elasticsearch-7.9.2/bin/elasticsearch

Starting job # 4 in a separate thread.


In [165]:
# Sleep for few seconds to let the instance start.  - here in case you are running end-to-end
time.sleep(20)

query the base endpoint to retrieve information about the cluster.

In [166]:
%%bash

curl -sX GET "localhost:9200/"

{
  "name" : "7fe582970eb2",
  "cluster_name" : "elasticsearch",
  "cluster_uuid" : "N5oHNfTkTBuA26MPE9RUeA",
  "version" : {
    "number" : "7.9.2",
    "build_flavor" : "oss",
    "build_type" : "tar",
    "build_hash" : "d34da0ea4a966c4e49417f2da2f244e3e97b4e6e",
    "build_date" : "2020-09-23T00:45:33.626720Z",
    "build_snapshot" : false,
    "lucene_version" : "8.6.2",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


### Data

Get the Shakespeare data 

In [167]:
%%bash 

wget 'https://download.elastic.co/demos/kibana/gettingstarted/shakespeare_6.0.json' -q

In [168]:
%%bash

head -5 shakespeare_6.0.json

{"index":{"_index":"shakespeare","_id":0}}
{"type":"act","line_id":1,"play_name":"Henry IV", "speech_number":"","line_number":"","speaker":"","text_entry":"ACT I"}
{"index":{"_index":"shakespeare","_id":1}}
{"type":"scene","line_id":2,"play_name":"Henry IV","speech_number":"","line_number":"","speaker":"","text_entry":"SCENE I. London. The palace."}
{"index":{"_index":"shakespeare","_id":2}}


In [169]:
from elasticsearch import helpers, Elasticsearch
import csv

ES_NODES = "http://localhost:9200"

es = Elasticsearch(hosts = [ES_NODES])
index_name = 'shakespeare'
doctype = 'shakespeare_works'
es.indices.delete(index=index_name, ignore=[400, 404])
es.indices.create(index=index_name, ignore=400, 
      body={
              "mappings": {
                  "properties" : {
                  "speaker": 
                    {"type": "keyword"},
                  "play_name": 
                    {"type": "keyword"},
                  "line_id": 
                    {"type": "integer"},
                  "speech_number": 
                    {"type": "integer"}, 
                  "text_entry":
                    {"term_vector": "with_positions_offsets",
                     "type": "text", 
                     "fielddata": True}
            }
      }}
  )

body={
              "mappings": {
                  "properties" : {
                  "speaker": 
                    {"type": "keyword"},
                  "play_name": 
                    {"type": "keyword"},
                  "line_id": 
                    {"type": "integer"},
                  "speech_number": 
                    {"type": "integer"}, 
                  "text_entry":
                    {"term_vector": "with_positions_offsets",
                     "type": "text", 
                     "fielddata": True}
            }
      }}

Bulk upload the data

In [170]:
! curl -s -q -H 'Content-Type: application/x-ndjson' -XPOST 'localhost:9200/shakespeare/_bulk?pretty' --data-binary @shakespeare_6.0.json 

{
  "took" : 8365,
  "errors" : false,
  "items" : [
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "0",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 0,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "1",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 1,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "2",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
   

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[1;30;43mStreaming output truncated to the last 5000 lines.[0m
        },
        "_seq_no" : 78470,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "78471",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 78471,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "78472",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 78472,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "7

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



 : "shakespeare",
        "_type" : "_doc",
        "_id" : "98580",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 98580,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "98581",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 98581,
        "_primary_term" : 1,
        "status" : 201
      }
    },
    {
      "index" : {
        "_index" : "shakespeare",
        "_type" : "_doc",
        "_id" : "98582",
        "_version" : 1,
        "result" : "created",
        "_shards" : {
          "total" : 2,
          "successful" : 1,
          "failed" : 0
        },
        "_seq_no" : 98582,
        "_prima

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [171]:
! curl http://localhost:9200/_cat/indices

yellow open test-index  fDe1Fw7zTcimu1m4AwG2Ww 1 1      4 0   30kb   30kb
yellow open shakespeare zNWXZydlQaaSi6g-VlHrPA 1 1 111396 0 15.9mb 15.9mb


### Extract term vectors

In [172]:
#res = es.index(index="shakespeare",body=body,id=1)
#res = es.get(index="shakespeare",id=1)


In [174]:
def search(es_object,index_name,search):
  res = es_object.search(index=index_name,body=search)
  return res

In [175]:
thing = {'query':{'match':{'text_entry':'trenching'}}}
res = search(es,'shakespeare',json.dumps(thing))
pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '9',
                    '_index': 'shakespeare',
                    '_score': 10.831611,
                    '_source': {'line_id': 10,
                                'line_number': '1.1.7',
                                'play_name': 'Henry IV',
                                'speaker': 'KING HENRY IV',
                                'speech_number': 1,
                                'text_entry': 'Nor more shall trenching war '
                                              'channel her fields,',
                                'type': 'line'},
                    '_type': '_doc'}],
          'max_score': 10.831611,
          'total': {'relation': 'eq', 'value': 1}},
 'timed_out': False,
 'took': 2}


In [176]:
query ={"size":200,"query": {"match_all": {}}} #query matches everything
result = es.search(
    index="shakespeare",body=query)  
ids = []
for res in result['hits']['hits']:
    ids.append(res['_id'])
print(ids) #collect all the ids, there are 9 apparently
fields = ['term_vectors']
res = es.mtermvectors(index='shakespeare',ids=ids,term_statistics=True,fields=None)
#pprint(res)

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '32', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50', '51', '52', '53', '54', '55', '56', '57', '58', '59', '60', '61', '62', '63', '64', '65', '66', '67', '68', '69', '70', '71', '72', '73', '74', '75', '76', '77', '78', '79', '80', '81', '82', '83', '84', '85', '86', '87', '88', '89', '90', '91', '92', '93', '94', '95', '96', '97', '98', '99', '100', '101', '102', '103', '104', '105', '106', '107', '108', '109', '110', '111', '112', '113', '114', '115', '116', '117', '118', '119', '120', '121', '122', '123', '124', '125', '126', '127', '128', '129', '130', '131', '132', '133', '134', '135', '136', '137', '138', '139', '140', '141', '142', '143', '144', '145', '146', '147', '148', '149', '150', '151', '152', '153', '154', '155', '156', '157', '15

### Find a rare term

In [177]:
fields = {}
keys = {}
for doc in res['docs']:
  fields=doc['term_vectors']
  terms = fields['text_entry']['terms']
  for word, freqs in terms.items():
    keys[word] = freqs['doc_freq']
  
#pprint(keys)
minval = min(keys.values())
inter = [k for k, v in keys.items() if v==minval]
inter

['unbuttoning',
 'strands',
 'trenching',
 'impressed',
 'naild',
 'forwarding',
 'snatched',
 'welshwomen',
 'ridge',
 'archibald',
 'balkd',
 'holmedons',
 'athol',
 'murray',
 'lugged',
 'lincolnshire',
 'rascalliest',
 'straightest',
 'malevolent']

### Search for the term


In [178]:
thing = {'query':{'match':{'text_entry':'rascalliest'}}}
res = search(es,'shakespeare',json.dumps(thing))
pprint(res)

{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': '188',
                    '_index': 'shakespeare',
                    '_score': 12.133916,
                    '_source': {'line_id': 189,
                                'line_number': '1.2.75',
                                'play_name': 'Henry IV',
                                'speaker': 'FALSTAFF',
                                'speech_number': 27,
                                'text_entry': 'the most comparative, '
                                              'rascalliest, sweet young',
                                'type': 'line'},
                    '_type': '_doc'}],
          'max_score': 12.133916,
          'total': {'relation': 'eq', 'value': 1}},
 'timed_out': False,
 'took': 2}
