# Index settings and analyzers

## Import modules

In [1]:
from elasticsearch import Elasticsearch
import wikipedia

## Initialize global variables

In [2]:
# Shared configuration used by every cell below.

# Elasticsearch client; no hosts are passed, so the library's default connection is used.
client = Elasticsearch()

# Target index and the document type stored in it.
indexName, docType = "medical", 'diseases2'

# Paging for search requests: offset of the first hit and number of hits returned.
searchFrom, searchSize = 0, 3

## Update Settings

In [16]:
# Analysis settings that register a bigram ("shingle") analyzer on the index.
settings = {
    "analysis": {
        "filter": {
            # min == max == 2 with unigrams switched off: only two-word shingles are emitted.
            "my_shingle_filter": {
                "type": "shingle",
                "min_shingle_size": 2,
                "max_shingle_size": 2,
                "output_unigrams": False,
            }
        },
        "analyzer": {
            # Standard tokenizer, then lowercase the tokens, then build the bigrams.
            "my_shingle_analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": [
                    "lowercase",
                    "my_shingle_filter",
                ],
            }
        },
    }
}

# Certain settings (such as analyzers) can only be changed while the index is
# closed, so close it, apply the settings, and reopen it afterwards.
client.indices.close(index=indexName)
try:
    client.indices.put_settings(index=indexName, body=settings)
finally:
    # Reopen even when put_settings raises; otherwise a rejected settings body
    # would leave the index closed and unusable for the following cells.
    reopenResponse = client.indices.open(index=indexName)

# Last expression of the cell, so a notebook still displays the acknowledgement.
reopenResponse

{u'acknowledged': True}

In [17]:
# Mapping for the disease documents. Compared with the earlier mapping, the
# "fulltext" field gains a "shingles" sub-field, addressed as fulltext.shingles,
# which is analyzed with my_shingle_analyzer and therefore holds the bigrams.
shinglesField = {
    "type": "string",
    "analyzer": "my_shingle_analyzer",
}
fulltextField = {
    "type": "string",
    "fields": {"shingles": shinglesField},
}
diseaseMapping = {
    'properties': {
        'name': {'type': 'string'},
        'title': {'type': 'string'},
        'fulltext': fulltextField,
    }
}

# Left disabled, as in the original; presumably for dropping an older mapping first.
#client.indices.delete_mapping(index=indexName,doc_type=docType)
client.indices.put_mapping(index=indexName, doc_type=docType, body=diseaseMapping)

{u'acknowledged': True}

In [3]:
# Load the "Lists of diseases" overview page and fetch the list pages it links to.
dl = wikipedia.page("Lists_of_diseases")
diseaseListArray = []
# NOTE(review): the slice is a hard-coded window of 27 links; presumably these
# are the "0-9" and "A".."Z" list pages -- verify against the current page layout.
for link in dl.links[15:42]:
    try:
        diseaseListArray.append(wikipedia.page(link))
    except Exception as e:
        # Best effort: report the page that could not be loaded and carry on.
        # "except ... as" and print(...) parse on both Python 2.6+ and Python 3.
        print(str(e))

In [4]:
# checkList[i] holds the first characters allowed for links found on the i-th
# disease list: the ten digits for the first list, then one capital letter per
# list. A link that does not start with an allowed character is skipped.
# NOTE(review): lists are matched to checkList by position, so this assumes
# diseaseListArray holds the 0-9 and A..Z lists in order with none missing.
checkList = [list("0123456789")] + [[letter] for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ"]

# (link title, exception) for every page that could not be fetched or indexed,
# so failures can be inspected afterwards instead of vanishing silently.
failedDiseases = []

# zip() stops at the shorter sequence, so a disease list without a matching
# checkList entry is ignored rather than raising IndexError.
for diseaselistNumber, (allowedFirstChars, diseaselist) in enumerate(zip(checkList, diseaseListArray)):
    for disease in diseaselist.links:  # every link on the current disease list
        # Only index links that look like a disease of this list: the right
        # first character and not another "List of ..." page. The original
        # compared disease[0:3] with "List", which could never match.
        if disease[:1] not in allowedFirstChars or disease.startswith("List"):
            continue
        try:
            currentPage = wikipedia.page(disease)
            client.index(index=indexName, doc_type=docType, id=disease,
                         body={"name": disease, "title": currentPage.title, "fulltext": currentPage.content})
        except Exception as e:
            # Best effort: one bad page must not stop the whole import.
            failedDiseases.append((disease, e))

## Diabetes keywords + significant bigrams

In [15]:
# Term filter on the "name" field for the term 'diabetes'.
diabetesFilter = {'term': {'name': 'diabetes'}}

# Two significant_terms aggregations over the filtered documents, 30 buckets each:
# single terms from "fulltext" and bigrams from the "fulltext.shingles" sub-field.
significantAggregations = {
    "DiseaseKeywords": {
        "significant_terms": {"field": "fulltext", "size": 30},
    },
    "DiseaseBigrams": {
        "significant_terms": {"field": "fulltext.shingles", "size": 30},
    },
}

searchBody = {
    "fields": ["name"],  # only the name field is requested for each hit
    "query": {
        "filtered": {
            "filter": diabetesFilter,
        }
    },
    "aggregations": significantAggregations,
}

# Last expression of the cell, so a notebook displays the search response.
client.search(
    index=indexName,
    doc_type=docType,
    body=searchBody,
    from_=searchFrom,
    size=searchSize,
)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'DiseaseBigrams': {u'buckets': [{u'bg_count': 22,
     u'doc_count': 10,
     u'key': u'the diabetes',
     u'score': 63.434343434343425},
    {u'bg_count': 18,
     u'doc_count': 9,
     u'key': u'a passer',
     u'score': 62.833333333333314},
    {u'bg_count': 18,
     u'doc_count': 9,
     u'key': u'excessive discharge',
     u'score': 62.833333333333314},
    {u'bg_count': 18,
     u'doc_count': 9,
     u'key': u'passer through',
     u'score': 62.833333333333314},
    {u'bg_count': 20,
     u'doc_count': 9,
     u'key': u'from diabetes',
     u'score': 56.51666666666667},
    {u'bg_count': 16,
     u'doc_count': 8,
     u'key': u'neurogenic diabetes',
     u'score': 55.85185185185184},
    {u'bg_count': 16,
     u'doc_count': 8,
     u'key': u'without taste',
     u'score': 55.85185185185184},
    {u'bg_count': 16,
     u'doc_count': 8,
     u'key': u'polyuria in',
     u'score': 55.85185185185184},
  