In [1]:
import json
from elasticsearch import Elasticsearch

In [4]:
# Check that Elasticsearch is up: request the root endpoint on localhost:9200
# before creating the Python client. `!` hands the command to the system shell;
# the JSON banner recorded below is the node's reply.
!curl localhost:9200

{
  "name" : "es01",
  "cluster_name" : "es-docker-cluster",
  "cluster_uuid" : "esmw9d6xTkay_4OD0qAXAA",
  "version" : {
    "number" : "7.14.0",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "dd5a0a2acaa2045ff9624f3729fc8a6f40835aa1",
    "build_date" : "2021-07-29T20:49:32.864135063Z",
    "build_snapshot" : false,
    "lucene_version" : "8.9.0",
    "minimum_wire_compatibility_version" : "6.8.0",
    "minimum_index_compatibility_version" : "6.0.0-beta1"
  },
  "tagline" : "You Know, for Search"
}


In [5]:
# Connect to the same node the curl check hit. The address is spelled out
# instead of relying on the client's implicit default: elasticsearch-py 7.x
# falls back to localhost:9200 on its own, but 8.x refuses to build a client
# when no hosts are given, so the bare `Elasticsearch()` call is not portable.
ES_HOST = "http://localhost:9200"
es = Elasticsearch(ES_HOST)
# Last expression of the cell: shows the cluster banner, proving the client
# can actually reach the node.
es.info()



{'name': 'es01',
 'cluster_name': 'es-docker-cluster',
 'cluster_uuid': 'esmw9d6xTkay_4OD0qAXAA',
 'version': {'number': '7.14.0',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': 'dd5a0a2acaa2045ff9624f3729fc8a6f40835aa1',
  'build_date': '2021-07-29T20:49:32.864135063Z',
  'build_snapshot': False,
  'lucene_version': '8.9.0',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [8]:
# Verify that the documents loaded in milestone 1 are still in the "cdc" index
# by running a match_all search and reading the reported hit total.
match_all_body = {"query": {"match_all": {}}}
response = es.search(index="cdc", body=match_all_body)

total_in_index = response['hits']['total']['value']
print("number of documents in index = ", total_in_index)

number of documents in index =  401


In [10]:
# Run a full-text `match` query against the `text` field and report how many
# documents it hits.
match_body = {
    "query": {
        "match": {"text": "World Health Organization"},
    },
}
response = es.search(index="cdc", body=match_body)

matched_total = response['hits']['total']['value']
print("number of documents satisfied query = ", matched_total)

number of documents satisfied query =  166


In [18]:
# Combine a match query on `text` with a term clause that excludes sections
# titled "External links".
must_match_text = {"match": {"text": "World Health Organization"}}
exclude_external_links = {
    # section_title was imported as text, so the term query has to target
    # its '.keyword' sub-field
    "term": {"section_title.keyword": "External links"},
}
bool_body = {
    "query": {
        "bool": {
            "must": must_match_text,
            "must_not": exclude_external_links,
        },
    },
    # only the section title is read from each hit below
    "_source": "section_title",
}
response = es.search(index="cdc", body=bool_body)

print("number of documents satisfied query = ", response['hits']['total']['value'])
# print the returned hits (top 10 results)
for hit in response['hits']['hits']:
    print("score =", hit['_score'])
    print("section_title=", hit['_source']['section_title'])

number of documents satisfied query =  158
score = 7.7753215
section_title= Summary
score = 7.602386
section_title= Further reading
score = 7.26242
section_title= Data and graphs
score = 7.253867
section_title= Health agencies
score = 6.9860735
section_title= Zika virus
score = 6.6351795
section_title= History of Human diseases
score = 6.5220504
section_title= Severe acute respiratory syndrome
score = 5.9988565
section_title= Antiviral therapy
score = 5.942991
section_title= Mother-to-child
score = 5.792868
section_title= Vaccination


In [23]:
# Same bool query shape as before (match on `text`, exclude "External links"),
# now with an optional `should` clause meant to boost documents whose
# section_title is 'Summary'.
must_match_text = {"match": {"text": "Spanish flu"}}
exclude_external_links = {
    # section_title was imported as text, so the term query has to target
    # its '.keyword' sub-field
    "term": {"section_title.keyword": "External links"},
}
# NOTE(review): this is a `match` on the analyzed section_title field, not a
# term query on '.keyword' — confirm that a non-exact title match is intended.
prefer_summary = {"match": {"section_title": "Summary"}}
boosted_body = {
    "query": {
        "bool": {
            "must": must_match_text,
            "must_not": exclude_external_links,
            "should": prefer_summary,
        },
    },
    # only the section title is read from each hit below
    "_source": "section_title",
}
response = es.search(index="cdc", body=boosted_body)

print("number of documents satisfied query = ", response['hits']['total']['value'])
# print the returned hits (top 10 results)
for hit in response['hits']['hits']:
    print("score =", hit['_score'])
    print("section_title=", hit['_source']['section_title'])

number of documents satisfied query =  43
score = 11.069967
section_title= Summary
score = 10.229466
section_title= Summary
score = 9.171614
section_title= Influenza
score = 8.41239
section_title= In popular culture
score = 7.022901
section_title= H5N1 (Avian flu)
score = 6.203658
section_title= Guidelines
score = 5.7174244
section_title= Summary
score = 5.680986
section_title= Later
score = 4.6167145
section_title= Typhus
score = 4.179197
section_title= Humans


In [24]:
# Re-run the boosted "Spanish flu" query and additionally ask Elasticsearch
# for highlighted fragments from the `text` and `section_title` fields.
response = es.search(
    index="cdc",
    body= {
        "query": {
            "bool": {
              "must": {
                "match": { "text": "Spanish flu" }
              },
              "must_not": { 
                  # section_title imported as text, need to add '.keyword' for term query
                  "term": { "section_title.keyword" : "External links" }
              },
              # optional clause: raises the score of hits whose section_title
              # matches 'Summary'
              "should": {
                  "match": {
                      "section_title": 'Summary'
                  }
              }
            }
        },
        "_source": ["section_title", "text"],
        # empty option dicts: use the default highlighter settings per field
        "highlight": {
            "fields": {
              "text": {},
              "section_title": {}
            }
        }
    }
)

print("number of documents satisfied query = ", response['hits']['total']['value'])
# print top 10 results
for doc in response['hits']['hits']:
    print("score =", doc['_score'])
    # Elasticsearch leaves the 'highlight' key out of a hit when no fragment
    # was produced for it, so read it defensively instead of indexing, which
    # would abort the whole loop with a KeyError on the first such hit.
    print("highlight = ", doc.get('highlight', {}))
    print("section_title=", doc['_source']['section_title'])

number of documents satisfied query =  43
score = 11.069967
highlight =  {'section_title': ['<em>Summary</em>'], 'text': ['If transmission does cause human <em>flu</em>, it is called zoonotic swine <em>flu</em>.', 'People with regular exposure to pigs are at increased risk of swine <em>flu</em> infection.', 'These strains of swine <em>flu</em> rarely pass from human to human.', 'or around 700 million to 1.4 billion people, contracted the illness—more in absolute terms than the <em>Spanish</em>', '<em>flu</em> pandemic.']}
section_title= Summary
score = 10.229466
highlight =  {'section_title': ['<em>Summary</em>'], 'text': ['The term was not used yet but was for later pandemics including the 1918 influenza pandemic (<em>Spanish</em>', '<em>flu</em>).']}
section_title= Summary
score = 9.171614
highlight =  {'text': ['The 1889–1890 <em>flu</em> pandemic, also known as Russian <em>Flu</em> or Asiatic <em>Flu</em>, was first reported in May 1889', 'The "<em>Spanish</em> <em>flu</em>", 1918–