# Import Libraries

In [1]:
try:
    from elasticsearch import Elasticsearch, RequestsHttpConnection
    from elasticsearch.client import IndicesClient
except:
    !pip install 'elasticsearch<7.14.0'
    from elasticsearch import Elasticsearch, RequestsHttpConnection
    from elasticsearch.client import IndicesClient
    
try:
    from requests_aws4auth import AWS4Auth
except:
    !pip install requests_aws4auth
    from requests_aws4auth import AWS4Auth
    
import boto3
import os
import pandas as pd
import numpy as np
import re

## Send HTTPS request to ES

In [2]:
# Take credentials and region from the default boto3 session.
ss = boto3.Session()
region = ss.region_name
credentials = ss.get_credentials()
service = 'es'

# ES domain
host = "vpc-dusstac-dussta-1n8niblaemqb-otcfzpck6s7czlgni6gxcb4rru.us-east-1.es.amazonaws.com"

# Request signer built from the session's access key, secret key and session token
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    service,
    session_token=credentials.token,
)

# ES client reused by every later API call: HTTPS on port 443 with certificate verification
es = Elasticsearch(
    hosts=[{'host': host, 'port': 443}],
    connection_class=RequestsHttpConnection,
    http_auth=awsauth,
    verify_certs=True,
    use_ssl=True,
)

### Test Connection to AWS ES Service
Test the connection to ES by sending an HTTP request with a body and parameters. `searchBody` below is the same as the search body in dusstackapiprocessor. The index in DUS is called `textract`.

In [3]:
# Display the client's repr to confirm the host and port it was configured with.
es

<Elasticsearch([{'host': 'vpc-dusstac-dussta-1n8niblaemqb-otcfzpck6s7czlgni6gxcb4rru.us-east-1.es.amazonaws.com', 'port': 443}])>

In [4]:
!curl -H 'Content-Type: application/json' -X GET https://vpc-dusstac-dussta-1n8niblaemqb-otcfzpck6s7czlgni6gxcb4rru.us-east-1.es.amazonaws.com/textract?pretty

{
  "textract" : {
    "aliases" : { },
    "mappings" : {
      "properties" : {
        "5" : {
          "properties" : {
            " this authorization shall be in force and effect until 12/31/2020 (date event), at which time this authorization expires" : {
              "type" : "text",
              "fields" : {
                "keyword" : {
                  "type" : "keyword",
                  "ignore_above" : 256
                }
              }
            }
          }
        },
        "anatomy" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "bucket" : {
          "type" : "text",
          "fields" : {
            "keyword" : {
              "type" : "keyword",
              "ignore_above" : 256
            }
          }
        },
        "by" : {
          "type" : "text",
          "fields" : {
            "keyword" : {


In [16]:
# search query
# keyword = "What is diabetes\\?"
# keyword = "What are the types of diabetes\\?"
# keyword = "What are the types of contracts\\?"
keyword = "contracts"

# size of the result fragment returned by ES
ES_HIGHLIGHT_FRAGMENT_SIZE = 200

# HTTP request parameters that will be sent to ES API
searchBody = {
    # set up for string based queries
    "query" : {
        "query_string": {
            "query": keyword
        }
    },
    # set up for result fragments that are returned by ES.
    # ES caps the fragments per field with the highlight option `number_of_fragments`
    # (default 5); that option, not the request timeout, is why each source
    # yields at most five fragments here.
    "highlight" : {
        "fields" : {
            "content" : { "pre_tags" : [""], "post_tags" : [""] },
        },
        "fragment_size" : ES_HIGHLIGHT_FRAGMENT_SIZE,
        "require_field_match": False
    }
}

# make a query against existing index in ES via call to API
output = es.search(
    index='textract',
    size=1000,
    body=searchBody,
    _source = True,
    filter_path=[
        'hits.hits._id',
        'hits.hits._source',
        'hits.hits.highlight',
        'hits.hits._score',
        'hits.hits.date'
    ],
    request_timeout=5 # seconds to wait for the response; unrelated to the fragment count
)

# With `filter_path`, a query that matches nothing comes back without a "hits"
# key at all, so read it defensively instead of indexing straight into it.
hits = output.get("hits", {}).get("hits", [])
n_results = len(hits)
print(f"The Elasticsearch query returned {n_results} results.\n")

print("Example output:\n\n")

# Collect one record per highlight fragment and build the frame once at the end,
# rather than growing the DataFrame row by row.
rows = []
for i, x in enumerate(hits):
    doc_path, doc_name = os.path.split(x['_source']['name'])
    # A hit may carry no "content" highlight; treat that as zero fragments
    # rather than raising KeyError.
    for j, highlight in enumerate(x.get('highlight', {}).get('content', [])):
        print(f"SOURCE {i}: HIGHLIGHT {j}")
        highlight = highlight.replace("\n", " ")
        print(f"{highlight}\n")
        rows.append({'doc_path': doc_path,
                     'doc_name': doc_name,
                     'score': x['_score'],
                     'highlight': highlight})

# Download and save
df_res = pd.DataFrame(rows, columns=['doc_path', 'doc_name', 'score', 'highlight'])

# save the file
# flnm = 'query_' + re.sub(r'\W+','', keyword.replace(" ","_").lower()) + '_1.csv'
# df_res.to_csv(flnm, index=False)

The Elasticsearch query returned 18 results.

Example output:


SOURCE 0: HIGHLIGHT 0
Different types diabetic nephropathy, is kidney disease of nerve damage affect different parts of caused by diabetes. You can help protect your body.

SOURCE 0: HIGHLIGHT 1
& Dementia & Sleep Apnea & Diabetes Diabetes Diabetes Diabetes Depression NIH Diabetes is linked High blood People who have is common among to some types of glucose increases sleep apnea NIH people with

SOURCE 0: HIGHLIGHT 2
Many the chance of - -when you stop chronic, or long- risk factors for developing breathing for term, illness such cancer and for dementia NIHG. short periods as diabetes. diabetes are the Tell your doctor

SOURCE 0: HIGHLIGHT 3
This content is provided as a service of the National Institute of Diabetes and Digestive and Kidney Diseases (NIDDK), part of the National Institutes of Health.

SOURCE 0: HIGHLIGHT 4
Contact Us The National Institute of Diabetes and Digestive and Kidney Diseases Health Information Ce

In [17]:
# Number of hits held in `output`, the response of the most recent search.
len(output['hits']['hits'])

18

## Download Test

In [103]:
# set up a query and download the contents
# use output from previous query
# why does highlight only return 5?

# Rebuild the highlight table from `output` (the response of the previous query).
# Each source contributes at most five fragments because the ES highlight option
# `number_of_fragments` defaults to 5; set it in the request body to get more.

# With `filter_path`, a query that matches nothing comes back without a "hits"
# key, so read it defensively.
hits = output.get("hits", {}).get("hits", [])
n_results = len(hits)
print(f"The Elasticsearch query returned {n_results} results.\n")

print("Example output:\n\n")

# One record per highlight fragment; the frame is built once from the list
# instead of being enlarged row by row.
rows = []
for i, x in enumerate(hits):
    doc_path, doc_name = os.path.split(x['_source']['name'])
    # A hit may carry no "content" highlight; treat that as zero fragments.
    for j, highlight in enumerate(x.get('highlight', {}).get('content', [])):
        rows.append({'doc_path': doc_path,
                     'doc_name': doc_name,
                     'score': x['_score'],
                     'highlight': highlight.replace("\n", " ")})

df_res = pd.DataFrame(rows, columns=['doc_path', 'doc_name', 'score', 'highlight'])

The Elasticsearch query returned 17 results.

Example output:




In [104]:
# Write the highlight table `df_res` to a CSV file in the working directory, omitting the row index.
df_res.to_csv('save_query_types_of_diabetes.csv', index=False)

# Examine the Index from stackoverflow

In [4]:
# Request every document in `textract`, asking for up to 10,000 hits in one response.
match_all_body = {
    'size': 10000,
    'query': {'match_all': {}},
}
res = es.search(index='textract', body=match_all_body)

In [12]:
# Print the position and `name` field of each document returned by the match_all search.
for position, hit in enumerate(res['hits']['hits']):
    print(position, hit['_source']['name'])

0 public/85f5715b-c972-41e7-a554-9174d9799fca/32100355.pdf
1 public/a3744816-6066-4f34-b066-5411a4bf4555/Creatine_Kinase_-_MedlinePlus_Medical_Test.pdf
2 public/23d1a9d2-4fd3-40b0-869c-f5cb8c9a1958/KS United Healthcare Contract.pdf
3 public/5cd8a466-0e0c-474e-b552-2f9d0da90e7b/KS Aetna Contract.pdf
4 public/c0338aad-4f80-4148-aa20-6e05c0a515ca/KS Sunflower Health Plan Contract.pdf
5 public/916355dd-5c9d-4f74-b13b-1af12f06d037/PA Health Choices PH Contract - Network Exhibit.pdf
6 public/2e3895c9-23c9-46d5-952b-acd9d91c693a/pfizer-inc-covid-19-vaccine-contract-page1.pdf
7 public/6dbbce3a-3e64-4dd5-8ff7-c112e0eff636/pfizer-inc-covid-19-vaccine-contract(5).pdf
8 public/b2e9ee40-d7c5-4a1e-a10e-b13c024e207f/2110.10476.pdf
9 public/65b27304-1f38-4227-af57-ead1b1d27266/Preventing_Diabetes_Problems_NIDDK.pdf
10 public/a55db906-83f2-45e2-99de-5d80293f009d/KS Sunflower Health Plan Contract.pdf
11 public/4c9040b9-cf33-4a81-84cd-ec057c328afe/KS Aetna Contract.pdf
12 public/eb605dec-427e-4c3d-b532-6

In [7]:
# Show the raw hit records (metadata plus the full `_source` of each document) held in `res`.
res['hits']['hits']

[{'_index': 'textract',
  '_type': '_doc',
  '_id': '85f5715b-c972-41e7-a554-9174d9799fca',
  '_score': 1.0,
  '_source': {'documentId': '85f5715b-c972-41e7-a554-9174d9799fca',
   'name': 'public/85f5715b-c972-41e7-a554-9174d9799fca/32100355.pdf',
   'bucket': 'dusstack-dusstackdocumentss3bucketbpxgdwfrjv578tc-1vu4fuke61l7n',
   'content': 'DEPARTMENT OF HEALTH AND HUMAN SERVICES\nOFFICE OF INSPECTOR GENERAL\nof\nTOR\nGE\nWASHINGTON, DC 20201\nUM\nJanuary 8, 2021\nTO:\nDebra Houry, MD, MPH\nDirector, National Center for Injury Prevention and Control\nCenters for Disease Control and Prevention\nChrista Capozzola\nChief Financial Officer\nCenters for Disease Control and Prevention\nFROM:\n/Amy J. Frontz/\nDeputy Inspector General for Audit Services\nSUBJECT:\nIndependent Attestation Review: Centers for Disease Control and Prevention\n2020 Detailed Accounting Report, Performance Summary Report for National\nDrug Control Activities, Budget Formulation Compliance Report, and\nAccompanying R

### Examine Current ES Index
This index was created when the DUS application was initially deployed. Under "properties" are the fields by which the documents are indexed. In this example, there is a mix of numbers, authors, and sentence fragments. The numbers correspond to ICD10 numbers, since those were used when building the index. The other properties are extracted by ES to best optimize searches across the documents in the index.

In [6]:
import json

# Index-level client wrapped around the shared ES connection.
ind_client = IndicesClient(es)

# Report whether `textract` exists and, if it does, pretty-print its full
# definition with keys sorted.
if not ind_client.exists(index="textract"):
    print("The index textract does not exist...")
else:
    print("The index textract exists...\n")
    index = ind_client.get(index="textract")
    print(json.dumps(index, indent=2, sort_keys=True))

The index textract exists...

{
  "textract": {
    "aliases": {},
    "mappings": {
      "properties": {
        "5": {
          "properties": {
            " this authorization shall be in force and effect until 12/31/2020 (date event), at which time this authorization expires": {
              "fields": {
                "keyword": {
                  "ignore_above": 256,
                  "type": "keyword"
                }
              },
              "type": "text"
            }
          }
        },
        "anatomy": {
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "bucket": {
          "fields": {
            "keyword": {
              "ignore_above": 256,
              "type": "keyword"
            }
          },
          "type": "text"
        },
        "by": {
          "fields": {
            "keyword": {
              "ignore_abo

In [21]:
# Names of all top-level mapped fields in the `textract` index definition fetched above.
index['textract']['mappings']['properties'].keys()

dict_keys(['5', 'anatomy', 'bucket', 'by', 'christa capozzola', 'circular', 'commercial_item', 'content', 'current pharmacy name', 'current physician name', 'date', 'date of last update', 'description', 'documentId', 'event', 'from', 'heart attack [https', 'link to treatment plan problem', 'location', 'medical_condition', 'medication', 'muscle disorders [https', 'muscular dystrophy [https', 'name', 'on', 'organization', 'other', 'patient name', 'person', 'phone', 'printed name of patient or personal representative and his or her relationship to patient', 'protected_health_information', 'quantity', 'signature of patient or personal representative', 'subject', 'target date', 'test_treatment_procedure', 'time_expression', 'title', 'to', 'treatment notes'])

In [27]:
# Settings section of the `textract` index definition fetched above.
index['textract']['settings']

{'index': {'number_of_shards': '2',
  'provided_name': 'textract',
  'max_result_window': '10000',
  'creation_date': '1631581097237',
  'analysis': {'filter': {'custom_stopword_filter': {'type': 'stop',
     'updateable': 'true',
     'stopwords_path': 'analyzers/F220113380'},
    'english_stopwords': {'type': 'stop', 'stopwords': '_english_'},
    'remove_extra_spaces': {'pattern': '\\s+',
     'type': 'pattern_replace',
     'replacement': ' '},
    'custom_shingle_filter': {'max_shingle_size': '3',
     'min_shingle_size': '2',
     'output_unigrams_if_no_shingles': 'true',
     'output_unigrams': 'true',
     'type': 'shingle',
     'filler_token': ''}},
   'analyzer': {'custom_analyzer': {'filter': ['asciifolding',
      'lowercase',
      'english_stopwords',
      'custom_stopword_filter',
      'custom_shingle_filter',
      'trim',
      'remove_extra_spaces',
      'unique'],
     'type': 'custom',
     'tokenizer': 'classic'}}},
  'number_of_replicas': '1',
  'uuid': 'ZvrS7

### Store Current Textract as New Index
The following cell will store the current textract index as `originaltextractindex` for safe keeping (if it doesn't already exist). We will use this copy to reindex the updated textract index.

In [None]:
from elasticsearch.helpers import reindex

# Step 1: create the backup index if it is missing, with two shards and an
# explicit multi-format `date` mapping.
if not ind_client.exists(index="originaltextractindex"):

    print("Index 'originaltextractindex' does not exist, creating...")
    ind_client.create(
        index="originaltextractindex",
        body={
            "settings": {
                "index": {
                    "number_of_shards": 2
                }
            },
            "mappings":{
                "properties":{
                "date":{
                    "type": "date",
                    "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                    }
                }
            }
        }
    )

# Step 2: copy the documents whenever the backup is still empty, not only right
# after creating it. If an earlier run created the index but the copy failed,
# re-running this cell now completes the backup instead of leaving an empty
# index that later cells would treat as a valid copy.
if es.count(index="originaltextractindex")["count"] == 0:
    reindex(
        client = es, # original ES client specified in the beginning
        source_index = "textract", # original index with documents we want to copy
        target_index = "originaltextractindex" # new index where documents will be copied to
    )
    # Refresh so the copied documents are visible to searches and counts right away.
    ind_client.refresh(index="originaltextractindex")

# Build and Demonstrate Filters for Custom Analyzer
The following cells will demonstrate each filter that will be added to a custom text analyzer. The analyzer is applied to the index and will be manually specified for search queries.

For our index, we will use the classic tokenizer and the asciifolding, lowercase, stopwords, and shingle filters. Below, the impact of each filter is demonstrated on the same query string. The result of each filter will be a list of tokens.

In [28]:
# Sample text with mixed case, punctuation and common English words; the analyzer cells below pass it as "text".
query_string = "Quick Brown Foxes is. today the tESt!! but i Will not be blAh!?"

### Lowercase Filter

In [None]:
# Analyze request: classic tokenizer followed by ASCII folding and lowercasing.
body = {
    "tokenizer": "classic",
    "filter": [
        "asciifolding",  # converts characters to ASCII
        "lowercase",     # transforms characters to lowercase
    ],
    "text": query_string,
}

# index-level client used for the _analyze calls
ind_client = IndicesClient(es)

# run the sample text through the ES analyze API against the backup index
token_dict = ind_client.analyze(index="originaltextractindex", body=body)

# keep only the token text from each entry of the response
tokens = [entry["token"] for entry in token_dict["tokens"]]
tokens

### Custom and Existing Stopword Filter
In this example, a custom and existing stopword filter are applied to the text. With a custom stopword filter, words that are specific to the use case can be added to a custom dictionary for removal from index and query text.

Before moving onto the next step, import a stopwords dictionary into AWS ES. First, add a TXT-file that contains your list of stopwords to an S3 bucket. Each stopword has to be on a separate line within the TXT-file. Next, navigate to the ES console and select "Packages" within the left-hand panel. Select "Import" and point to the location of your stopwords file in the S3 bucket. Below is a screenshot of the import package dashboard.

<img src="assets/stopwords_package.png" /><br><br>


Once your package is loaded into ES, AWS ES will generate an ID for your package. Take note of this ID since it will be used to reference your list of stopwords in future API calls.

<img src="assets/stopwords_package_dashboard.png" />

In [None]:
# Built-in English stopword list offered by ES
english_stop = {"type": "stop", "stopwords": "_english_"}

# Stopwords read from the custom dictionary package uploaded to AWS ES,
# referenced by its package ID
custom_stop = {
    "type": "stop",
    "stopwords_path": "analyzers/F220113380",
    "updateable": True,
}

# Analyze request: classic tokenizer, then both stopword filters in order.
body = {
    "tokenizer": "classic",
    "filter": [english_stop, custom_stop],
    "text": query_string,
}

# run the sample text through the ES analyze API against the backup index
token_dict = ind_client.analyze(index="originaltextractindex", body=body)

# keep only the token text from each entry of the response
tokens = [entry["token"] for entry in token_dict["tokens"]]
tokens

### Shingles Filter
Shingles are synonymous with the popular definition of n-grams. In ElasticSearch, the ngram filter is used for predictive-text purposes and creates n-grams at the character level instead of the word level. Shingles in ElasticSearch create n-grams at the word level, which is what we want.

In [15]:
# Word-level n-grams: bigrams (min size 2) and trigrams (max size 3), with the
# single-word tokens kept as well.
shingle_filter = {
    "type": "shingle",
    "min_shingle_size": 2,
    "max_shingle_size": 3,
    "output_unigrams": True,
    "output_unigrams_if_no_shingles": True,
}

# Analyze request: classic tokenizer followed by the shingle filter only.
body = {
    "tokenizer": "classic",
    "filter": [shingle_filter],
    "text": query_string,
}

# run the sample text through the ES analyze API against the backup index
token_dict = ind_client.analyze(index="originaltextractindex", body=body)

# keep only the token text from each entry of the response
tokens = [entry["token"] for entry in token_dict["tokens"]]
tokens

['Quick',
 'Quick Brown',
 'Quick Brown Foxes',
 'Brown',
 'Brown Foxes',
 'Brown Foxes is',
 'Foxes',
 'Foxes is',
 'Foxes is today',
 'is',
 'is today',
 'is today the',
 'today',
 'today the',
 'today the tESt',
 'the',
 'the tESt',
 'the tESt but',
 'tESt',
 'tESt but',
 'tESt but i',
 'but',
 'but i',
 'but i Will',
 'i',
 'i Will',
 'i Will not',
 'Will',
 'Will not',
 'Will not be',
 'not',
 'not be',
 'not be blAh',
 'be',
 'be blAh',
 'blAh']

### Combining all Filters
Below, all filters are combined to create a single custom filter. Each individual filter will be executed in the order in which it appears below.

Note: The shingles filter appears after the stopword filter. In ES, stopwords are replaced by an underscore by default and are not explicitly removed. Hence, there are underscores within the bi- and tri-grams below. In the following section, we will replace the underscore with an empty string.

In [16]:
# Individual filter definitions; the list inside `body` fixes the order in which they run.

# built-in English stopword list offered by ES
english_stop = {"type": "stop", "stopwords": "_english_"}

# stopwords read from the custom dictionary package uploaded to AWS ES
custom_stop = {
    "type": "stop",
    "stopwords_path": "analyzers/F220113380",
    "updateable": True,
}

# word-level bigrams and trigrams, keeping the single-word tokens too
shingle_filter = {
    "type": "shingle",
    "min_shingle_size": 2,
    "max_shingle_size": 3,
    "output_unigrams": True,
    "output_unigrams_if_no_shingles": True,
}

# regex replacement: collapse each run of whitespace into a single space
collapse_spaces = {
    "type": "pattern_replace",
    "pattern": "\\s+",
    "replacement": " ",
}

body = {
    "tokenizer": "classic",
    "filter": [
        "asciifolding",   # converts characters to ASCII
        "lowercase",      # transforms characters to lowercase
        english_stop,
        custom_stop,
        shingle_filter,
        "trim",           # strip leading/trailing spaces left by the shingle filter
        collapse_spaces,
        "unique",         # drop duplicate tokens produced by shingling and cleaning
    ],
    "text": query_string,
}

# run the sample text through the ES analyze API against the backup index
token_dict = ind_client.analyze(index="originaltextractindex", body=body)

# keep only the token text from each entry of the response
tokens = [entry["token"] for entry in token_dict["tokens"]]
tokens

['quick',
 'quick brown',
 'quick brown foxes',
 'brown',
 'brown foxes',
 'brown foxes _',
 'foxes',
 'foxes _',
 'foxes _ today',
 '_ today',
 '_ today _',
 'today',
 'today _',
 'today _ test',
 '_ test',
 '_ test _',
 'test',
 'test _',
 'test _ _',
 '_ _ blah',
 '_ blah',
 'blah']

# Create New Index

### Delete "textract" Index
Since the `textract` index name is used throughout multiple Lambda functions, it is easier to create a new index called `textract` than it is to change the index name in all of the Lambda functions that require it. The following cells will delete the existing index, recreate it, and then repopulate it by reindexing from the `originaltextractindex` backup.

The following cell confirms that `textract` exists.

In [17]:
# List every index on the domain together with its aliases.
ind_client.get_alias("*")

{'originaltextractindex': {'aliases': {}},
 '.kibana_1': {'aliases': {'.kibana': {}}},
 'textract': {'aliases': {}}}

In [18]:
# Deleting `textract` is irreversible, so first verify that the backup index
# exists and holds at least as many documents as the index about to be removed.
if not ind_client.exists(index="originaltextractindex"):
    raise RuntimeError(
        "Refusing to delete 'textract': backup index 'originaltextractindex' does not exist."
    )

# Refresh so the backup's document count reflects everything copied into it.
ind_client.refresh(index="originaltextractindex")
backup_count = es.count(index="originaltextractindex")["count"]
live_count = es.count(index="textract")["count"]
if backup_count < live_count:
    raise RuntimeError(
        f"Refusing to delete 'textract': backup holds {backup_count} documents "
        f"but 'textract' holds {live_count}."
    )

ind_client.delete(index="textract")

{'acknowledged': True}

The next cell will confirm that `textract` is no longer an index in our ES domain.

In [19]:
# List every index on the domain again; `textract` should no longer appear.
ind_client.get_alias("*")

{'originaltextractindex': {'aliases': {}},
 '.kibana_1': {'aliases': {'.kibana': {}}}}

### Create Index
Using the index client that we specified earlier, we will create a new index with the `textract` name space.

In [21]:
# Recreate the `textract` index with a named custom analyzer (`custom_analyzer`)
# and an explicit multi-format `date` mapping.
# NOTE(review): no field under "mappings" references `custom_analyzer`, so it presumably
# only takes effect when a request names it explicitly -- confirm this is intended.
ind_client.create(
    index="textract",
    body={
        "settings": {
            "index": {
                "number_of_shards": 2, # number of shards the index is split into
                "max_result_window" : 10000, # max number of results to return, if applicable
                "analysis" : {
                    "analyzer":{
                        # the analyzer lists its components by name; custom ones are defined under "filter" below
                        "custom_analyzer":{
                            "type":"custom",
                            "tokenizer":"classic",# splits on whitespace, removes punctuation
                            "filter":[
                                 # converts characters to ASCII
                                "asciifolding",
                                
                                # makes characters lowercase
                                "lowercase",
                                
                                # ES filter: English stopwords; configured below
                                "english_stopwords",
                                
                                # custom filter: stop words based upon custom dictionary uploaded onto AWS ES
                                "custom_stopword_filter",
                                
                                # ES filter: shingles (word-level n-grams); configured below
                                "custom_shingle_filter",
                                
                                # ES filter: remove extra spaces from the beginning and end created by shingle filter
                                "trim",
                                
                                # ES filter: collapse extra whitespace in between words caused by shingle filter; configured below
                                "remove_extra_spaces",
                                
                                # ES filter: remove duplicates that are introduced by shingles processing and cleaning
                                "unique"
                            ]
                        }
                    },
                    # definitions of the named filters referenced by custom_analyzer above
                    "filter":{
                        "custom_stopword_filter":{
                            "type":"stop",
                            "stopwords_path":"analyzers/F220113380", # ID of the custom dictionary package in AWS ES
                            "updateable":True
                        },
                        "english_stopwords":{
                            "type":"stop",
                            "stopwords":"_english_"
                        },
                        "custom_shingle_filter":{
                            "type":"shingle",
                            "min_shingle_size":2, # bigrams
                            "max_shingle_size":3, # trigrams
                            "output_unigrams": True,
                            "output_unigrams_if_no_shingles":True,
                            "filler_token":"" # replace the default underscore placeholder with an empty string
                        },
                        "remove_extra_spaces":{
                            "type":"pattern_replace",
                            "pattern":"\\s+", # one or more consecutive whitespace characters
                            "replacement":" " # replaced by a single space
                        }
                    }
                }
            }
        },
        # explicit field mappings for the new index
        "mappings":{
            "properties":{
                # add a data field to index
                "date":{
                    # specify mapping type as date
                    "type": "date",
                    
                    # accepted date formats, separated by "||"
                    "format": "M'/'dd'/'yyyy||date||year||year_month||dd MMM yyyy||dd'/'MM'/'yyyy||yyyy'/'MM'/'dd||dd'/'MM'/'YY||year_month_day||MM'/'dd'/'yy||dd MMM||MM'/'yyyy||M-dd-yyyy||MM'/'dd'/'yyyy||M||d'/'MM'/'yyyy||MM'/'dd'/'yy"
                }
            }
        }
    }
)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'textract'}

The following cell confirms that `textract` is listed as an index again.

In [None]:
# List every index on the domain once more; `textract` should be present again.
ind_client.get_alias("*")

# Add Documents to New Index
Documents can be added to a new index by copying documents from another index. This is achieved via `reindex`.

In [23]:
from elasticsearch.helpers import reindex

# Copy the documents held in the backup index into the freshly created `textract`
# index, using the ES client from the connection cell. The helper's return value
# is shown as the cell output.
reindex(client=es, source_index="originaltextractindex", target_index="textract")

(7, 0)

# Verify Changes to Index
When creating the index above, we specified our collection of text processors with the name `custom_analyzer`. In the cells below, we can reference this analyzer by name when using the `textract` index.

In [24]:
# Invoke the analyzer stored in the index settings by name instead of spelling out each filter.
body = {"analyzer": "custom_analyzer", "text": query_string}

# run the sample text through the ES analyze API against the new `textract` index
token_dict = ind_client.analyze(index="textract", body=body)

# keep only the token text from each entry of the response
tokens = [entry["token"] for entry in token_dict["tokens"]]
tokens

['quick',
 'quick brown',
 'quick brown foxes',
 'brown',
 'brown foxes',
 'foxes',
 'foxes today',
 'today',
 'today test',
 'test',
 'blah']

Now, let's test the new index on an old query. The output is a list of highlighted fragments from each source. You can see that the results have improved significantly and that there are fewer references to kidney-related documents.

In [25]:
# The trailing "?" is backslash-escaped so query_string treats it literally rather than as
# a wildcard. "\\?" spells the same two characters as the former "\?" without relying on an
# unrecognised Python escape sequence (which triggers a warning).
keyword = "What is diabetes\\?"

ES_HIGHLIGHT_FRAGMENT_SIZE = 200
# NOTE(review): this reassignment does not affect `es`, which was built from the earlier
# `host` value; the search below still goes to the domain the client was created with.
host = "vpc-dusstac-dussta-h8vo3o6f68eb-knopiij3g5xsdg42i2sxvarih4.us-east-1.es.amazonaws.com" #ES domain

# Weighted multi-field query analysed with the index's custom analyzer, plus
# untagged highlight fragments taken from `content`.
searchBody = {
     "query" : {
        "query_string": {
            "query": keyword,
            "fields":["content^3","name","keyword^2","title^4","summary"],
            "fuzziness":"AUTO",
            "analyzer":"custom_analyzer",
           }
      },
      "highlight" : {
        "fields" : {
            "content" : { "pre_tags" : [""], "post_tags" : [""] },
          },
          "fragment_size" : ES_HIGHLIGHT_FRAGMENT_SIZE,
          "require_field_match": False
     }
}


output = es.search(
    index='textract',
    size=1000,
    body=searchBody,
    _source = True,
    filter_path=['hits.hits._id', 'hits.hits._source','hits.hits.highlight','hits.hits._score'],
    request_timeout=5
)


# With `filter_path`, a query that matches nothing comes back without a "hits"
# key, so read it defensively instead of indexing straight into it.
hits = output.get("hits", {}).get("hits", [])
n_results = len(hits)
print(f"The Elasticsearch query returned {n_results} results.")

print("Example output:\n\n")

for x in hits:
    # A hit matched only on a non-content field may carry no "content" highlight;
    # print an empty list for it rather than raising KeyError.
    print(x.get("highlight", {}).get("content", []),"\n")

The Elasticsearch query returned 1 results.
Example output:


['Preventing Diabetes Problems I NIDDK\n10/23/20,3127 PM\nPreventing Diabetes Problems\nView or Print All Sections\n>\nHeart Disease & Stroke\nLow Blood Glucose\n(Hypoglycemia)\nDiabetes can damage blood vessels', 'Managing your diabetes can\nyour kidneys by managing your diabetes\nhelp prevent nerve damage that affects\nand meeting your blood pressure goals.\nyour feet and limbs, and organs such as\nyour heart.', 'Disease\nDiabetes can cause nerve damage and\nDiabetes can damage your eyes and lead\npoor blood flow, which can lead to\nto low vision and blindness.', '&\nDementia &\nSleep Apnea &\nDiabetes\nDiabetes\nDiabetes\nDiabetes\nDepression NIH\nDiabetes is linked\nHigh blood\nPeople who have\nis common among\nto some types of\nglucose increases\nsleep apnea NIH\npeople with', 'https://www.niddk.nih.gov/health-information/diabetes/overview/preventing-problems\nPage 3 of 4\n\nPreventing Diabetes Problems I NIDDK\n10/23/2

With another search, you can see how specific the results are to the phrases in the query string.

In [26]:
# NOTE(review): "dibetes" is misspelled; in query_string syntax fuzzy matching is presumably
# only applied to terms marked with "~", so the "fuzziness" setting below may not rescue
# this term -- confirm whether "dibetes~" was intended.
keyword = "dibetes in older patients"

ES_HIGHLIGHT_FRAGMENT_SIZE = 200
# NOTE(review): this reassignment does not affect `es`, which was built from the earlier
# `host` value; the search below still goes to the domain the client was created with.
host = "vpc-dusstac-dussta-h8vo3o6f68eb-knopiij3g5xsdg42i2sxvarih4.us-east-1.es.amazonaws.com" #ES domain

# Weighted multi-field query analysed with the index's custom analyzer, plus
# untagged highlight fragments taken from `content`.
searchBody = {
     "query" : {
        "query_string": {
            "query": keyword,
            "fields":["content^3","name","keyword^2","title^4","summary"],
            "fuzziness":"AUTO",
            "analyzer":"custom_analyzer",
           }
      },
      "highlight" : {
        "fields" : {
            "content" : { "pre_tags" : [""], "post_tags" : [""] },
          },
          "fragment_size" : ES_HIGHLIGHT_FRAGMENT_SIZE,
          "require_field_match": False
     }
}


output = es.search(
    index='textract',
    size=1000,
    body=searchBody,
    _source = True,
    filter_path=['hits.hits._id', 'hits.hits._source','hits.hits.highlight','hits.hits._score'],
    request_timeout=5
)


# With `filter_path`, a query that matches nothing comes back without a "hits"
# key, so read it defensively instead of indexing straight into it.
hits = output.get("hits", {}).get("hits", [])
n_results = len(hits)
print(f"The Elasticsearch query returned {n_results} results.")

print("Example output:\n\n")

for x in hits:
    # A hit matched only on a non-content field may carry no "content" highlight;
    # print an empty list for it rather than raising KeyError.
    print(x.get("highlight", {}).get("content", []),"\n")

The Elasticsearch query returned 2 results.
Example output:


['The NIDDK translates and disseminates research findings to\nincrease knowledge and understanding about health and disease among patients, health professionals, and the\npublic.'] 

['Available\nfrom:\nittps://www.cedars-sinai.edu/Patients/Health-Conditions/Neuromuscular-Disorders.aspx [https://www.cedars-sinai.edu/Patients/\nealth-Conditions/Neuromuscular-Disorders.aspx]\n2.'] 

