```
Summary by:
- A41316 - Nguyễn Hữu Khoa
- A42718 - Lê Thảo Quyên
```

> **Note:** Many source code is outdated, some parameters, function usage have been changed according to elasticsearch version 7.17.10

## Elasticsearch  Search Queries , Filters & Aggregations

### STEP 4: DATA EXPLORATION



In [1]:
from datetime import datetime
from elasticsearch import Elasticsearch
import wikipedia
import wikipediaapi
import requests

In [54]:
# Elasticsearch client used to communicate with database
client = Elasticsearch('http://localhost:9200')
indexName = "medical" #index name
# client.indices.create(index=indexName) # create index
docType="diseases2"
searchFrom = 0
searchSize= 10

#### Finding Lupus first trial

In [7]:
searchBody={
    "fields":["name"],
    "query":{
        "simple_query_string" : {
            "query": '+fatigue+fever+"joint pain"',
            "fields": ["fulltext","title^5","name^10"]
        }
    }
}
client.search(index=indexName, doc_type=docType, body=searchBody, from_=searchFrom, size=searchSize)

  client.search(index=indexName, doc_type=docType, body=searchBody, from_=searchFrom, size=searchSize)


{'took': 5,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 3, 'relation': 'eq'},
  'max_score': 6.2221518,
  'hits': [{'_index': 'medical',
    '_type': 'diseases',
    '_id': 'Acute gouty arthritis',
    '_score': 6.2221518,
    '_source': {'name': 'Acute gouty arthritis',
     'title': 'Gout',
     'fulltext': 'Gout ( GOWT) is a form of inflammatory arthritis characterized by recurrent attacks of a red, tender, hot and swollen joint, caused by the deposition of needle-like crystals of uric acid known as monosodium urate crystals. Pain typically comes on rapidly, reaching maximal intensity in less than 12 hours. The joint at the base of the big toe is affected (Podagra) in about half of cases. It may also result in tophi, kidney stones, or kidney damage.Gout is due to persistently elevated levels of uric acid (urate) in the blood (hyperuricemia). This occurs from a combination of diet, other health problems, and 

In [35]:
client.search(
    index=indexName,
    doc_type=docType,
    body={
        "query": {
            "simple_query_string" : {
                "query": 'thirst "weight loss"',
                "fields": ["fulltext","title^5","name^10"]
            }
        }
    }, 
    from_=searchFrom, 
    size=searchSize)

  client.search(


{'took': 48,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 35, 'relation': 'eq'},
  'max_score': 6.6612787,
  'hits': [{'_index': 'medical',
    '_type': 'diseases',
    '_id': "Addison's disease",
    '_score': 6.6612787,
    '_source': {'name': "Addison's disease",
     'title': "Addison's disease",
     'fulltext': 'Addison\'s disease, also known as primary adrenal insufficiency, is a rare long-term endocrine disorder characterized by inadequate production of the steroid hormones cortisol and aldosterone by the two outer layers of the cells of the adrenal glands (adrenal cortex), causing adrenal insufficiency. Symptoms generally come on slowly and insidiously and may include abdominal pain and gastrointestinal abnormalities, weakness, and weight loss. Darkening of the skin in certain areas may also occur. Under certain circumstances, an adrenal crisis may occur with low blood pressure, vomiting, lower back pai

### Filters & Aggregations

#### Diabetes KeyWords

In [45]:
# elasticsearch==7.17.10
searchBody={
    "fields":["name"],
    "query":{
        "bool": {
            "filter": {
                'term': {'name':'diabetes'}
            }
        }
    },  
    "aggregations" : {
        "DiseaseKeywords" : {
            "significant_terms" : { "field" : "fulltext", "size" : 30 }
        }
    }
}
mapping = {
    "properties": {
        "fulltext": {
            "type": "text",
            "fielddata": True
        }
    }
}
client.indices.put_mapping(index=indexName, doc_type=docType, body=mapping, include_type_name=True)
client.search(index=indexName,doc_type=docType, body=searchBody, from_ = searchFrom, size=searchSize)

  client.search(index=indexName,doc_type=docType, body=searchBody, from_ = searchFrom, size=searchSize)


{'took': 1,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'DiseaseKeywords': {'doc_count': 0,
   'bg_count': 326,
   'buckets': []}}}

### Step 3 revisited: Data preparation for disease profiling

#### Updating Elasticsearch index settings

In [46]:
settings={
    "analysis": {
            "filter": {
                "my_shingle_filter": {
                    "type":             "shingle",
                    "min_shingle_size": 2, 
                    "max_shingle_size": 2, 
                    "output_unigrams":  False   
                }
            },
            "analyzer": {
                "my_shingle_analyzer": {
                    "type":             "custom",
                    "tokenizer":        "standard",
                    "filter": [
                        "lowercase",
                        "my_shingle_filter" 
                    ]
                }
            }
        }
    }
#before we can change certain settings the index needs to be closed. After changing
#the settings we can reopen the index
client.indices.close(index=indexName)
client.indices.put_settings(index=indexName , body = settings)
client.indices.open(index=indexName)



{'acknowledged': True, 'shards_acknowledged': True}

#### Create more advanced Elasticsearch doctype mapping

In [52]:
client.indices.delete(index=indexName)

{'acknowledged': True}

In [60]:
docType = 'diseases2' #document type we will index

diseaseMapping = {
    'properties': {
        'name': {'type': 'keyword'},
        'title': {'type': 'text'},
        'fulltext': {
            "type": "text",
            "fields": {
                "shingles": {
                    "type":     "text",
                    "analyzer": "my_shingle_analyzer"
                }
            },
            "analyzer": "my_shingle_analyzer"
        }
    },
    'settings': {
        'analysis': {
            'filter': {
                'my_shingle_filter': {
                    'type': 'shingle',
                    'min_shingle_size': 2, 
                    'max_shingle_size': 2, 
                    'output_unigrams': False   
                }
            },
            'analyzer': {
                'my_shingle_analyzer': {
                    'type': 'custom',
                    'tokenizer': 'standard',
                    'filter': [
                        'lowercase',
                        'my_shingle_filter' 
                    ]
                }
            }
        }
    }
}

# client.indices.create(index=indexName)
client.indices.put_mapping(index=indexName, doc_type=docType, body=diseaseMapping, include_type_name=True)


RequestError: RequestError(400, 'mapper_parsing_exception', 'Failed to parse mapping [diseases2]: analyzer [my_shingle_analyzer] has not been configured in mappings')

In [None]:
dl = wikipedia.page("Lists_of_diseases")
diseaseListArray = []
for link in dl.links[15:42]:
    try:
        diseaseListArray.append(wikipedia.page(link))
    except Exception as e: 
        print(str(e))

In [None]:
checkList = [["0","1","2","3","4","5","6","7","8","9"],["A"],["B"],["C"],["D"],["E"],["F"],["G"],["H"],["I"],["J"],["K"],["L"],["M"],["N"],["O"],["P"],["Q"],["R"],["S"],["T"],["U"],["V"],["W"],["X"],["Y"],["Z"]]
for diseaselistNumber, diseaselist in enumerate(diseaseListArray):  #loop through disease lists
    for disease in diseaselist.links: #loop through lists of links for every disease list
        try:
            #first check if it is a disease, then index it
            if disease[0] in checkList[diseaselistNumber] and disease[0:3] !="List":
                currentPage = wikipedia.page(disease) 
                client.index(index=indexName, doc_type=docType,id = disease, document={"name": disease, "title":currentPage.title , "fulltext":currentPage.content})
        except Exception as e: 
            print(str(e))
            pass

## Diabetes KeyWords + significant Bigrams

In [49]:
searchBody={
"fields":["name"],
"query":{
    "bool": {
        "filter": {
            'term': {'name':'diabetes'}
        }
    }
},  
"aggregations" : {
        "DiseaseKeywords" : {
            "significant_terms" : { "field" : "fulltext", "size" : 30 }
        },
        "DiseaseBigrams": {
            "significant_terms" : { "field" : "fulltext.shingles", "size" : 30 }
        }
    }
}
client.search(index=indexName,doc_type=docType, body=searchBody, from_ = searchFrom, size=searchSize)

  client.search(index=indexName,doc_type=docType, body=searchBody, from_ = searchFrom, size=searchSize)


{'took': 56,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'aggregations': {'DiseaseKeywords': {'doc_count': 0,
   'bg_count': 326,
   'buckets': []},
  'DiseaseBigrams': {'buckets': []}}}