## Data retrieval & preparation

## Import modules

In [9]:
from elasticsearch import Elasticsearch
import wikipedia
import re

## Create the Elasticsearch index

In [11]:
client = Elasticsearch() #elasticsearch client used to communicate with the database
indexName = "medical" #the index name
#client.indices.delete(index=indexName) #delete an index
#only create the index when it does not exist yet, so re-running this cell
#(e.g. Restart & Run All) does not fail because the index is already there
if not client.indices.exists(index=indexName):
    client.indices.create(index=indexName) #create an index

## Create Document Type

In [5]:
#build the mapping for the 'diseases' doc type: each listed field is declared with type 'string'
#NOTE(review): the 'string' field type is presumably only accepted by older Elasticsearch versions - confirm against the cluster in use
diseaseMapping = {
    'properties': dict(
        (fieldName, {'type': 'string'})
        for fieldName in ('name', 'title', 'fulltext')
    )
}
#client.indices.delete_mapping(index=indexName,doc_type='diseases')
#register the mapping on the index; the server response is the cell output
client.indices.put_mapping(body=diseaseMapping, doc_type='diseases', index=indexName)

{u'acknowledged': True}

## Wikipedia API

In [6]:
#fetch the "Lists_of_diseases" Wikipedia page; its links are the starting point for the crawl
dl = wikipedia.page(title="Lists_of_diseases")
#display the titles of all pages this page links to
dl.links

[u'Airborne disease',
 u'Contagious disease',
 u'Cryptogenic disease',
 u'Disease',
 u'Disseminated disease',
 u'Endocrine disease',
 u'Environmental disease',
 u'Eye disease',
 u'Lifestyle disease',
 u'List of abbreviations for diseases and disorders',
 u'List of autism-related topics',
 u'List of basic exercise topics',
 u'List of cancer types',
 u'List of communication disorders',
 u'List of cutaneous conditions',
 u'List of diseases (0\u20139)',
 u'List of diseases (A)',
 u'List of diseases (B)',
 u'List of diseases (C)',
 u'List of diseases (D)',
 u'List of diseases (E)',
 u'List of diseases (F)',
 u'List of diseases (G)',
 u'List of diseases (H)',
 u'List of diseases (I)',
 u'List of diseases (J)',
 u'List of diseases (K)',
 u'List of diseases (L)',
 u'List of diseases (M)',
 u'List of diseases (N)',
 u'List of diseases (O)',
 u'List of diseases (P)',
 u'List of diseases (Q)',
 u'List of diseases (R)',
 u'List of diseases (S)',
 u'List of diseases (T)',
 u'List of diseases (U)',


### Hard-coded version

In [17]:
#fetch the per-letter disease list pages by their position in dl.links
#NOTE: the 15:42 slice depends on the link order shown in the output above; if the
#Wikipedia page gains or loses links the slice silently selects the wrong pages
#(the regex version below selects by title instead)
diseaseListArray = []
for link in dl.links[15:42]:
    try:
        diseaseListArray.append(wikipedia.page(link))
    except Exception as e: #"except ... as" and print(...) are valid on both Python 2.6+ and Python 3
        print(str(e))

### Regex version

In [19]:
#select the disease list pages by title instead of by position
#(re is already imported in the first code cell of the notebook)
diseaseListArray = []
#match the literal prefix "List of diseases (" - in a regex a trailing "*" is a
#quantifier on the preceding "s", not a wildcard, so the previous pattern
#"List of diseases*" accepted every title that starts with "List of disease"
check = re.compile(r"List of diseases \(")
for link in dl.links:
    if check.match(link): #match() anchors at the start of the title
        try:
            diseaseListArray.append(wikipedia.page(link))
        except Exception as e:
            #report the failure and continue with the next link
            print(str(e))


In [20]:
#display the list pages collected by the previous cell
diseaseListArray

[<WikipediaPage 'List of diseases (0–9)'>,
 <WikipediaPage 'List of diseases (A)'>,
 <WikipediaPage 'List of diseases (B)'>,
 <WikipediaPage 'List of diseases (C)'>,
 <WikipediaPage 'List of diseases (D)'>,
 <WikipediaPage 'List of diseases (E)'>,
 <WikipediaPage 'List of diseases (F)'>,
 <WikipediaPage 'List of diseases (G)'>,
 <WikipediaPage 'List of diseases (H)'>,
 <WikipediaPage 'List of diseases (I)'>,
 <WikipediaPage 'List of diseases (J)'>,
 <WikipediaPage 'List of diseases (K)'>,
 <WikipediaPage 'List of diseases (L)'>,
 <WikipediaPage 'List of diseases (M)'>,
 <WikipediaPage 'List of diseases (N)'>,
 <WikipediaPage 'List of diseases (O)'>,
 <WikipediaPage 'List of diseases (P)'>,
 <WikipediaPage 'List of diseases (Q)'>,
 <WikipediaPage 'List of diseases (R)'>,
 <WikipediaPage 'List of diseases (S)'>,
 <WikipediaPage 'List of diseases (T)'>,
 <WikipediaPage 'List of diseases (U)'>,
 <WikipediaPage 'List of diseases (V)'>,
 <WikipediaPage 'List of diseases (W)'>,
 <WikipediaPag

In [8]:
#display the link titles found on the first collected list page
diseaseListArray[0].links

[u'11 beta hydroxylase deficiency',
 u'11 beta hydroxysteroid dehydrogenase type 2 deficiency',
 u'17-beta-hydroxysteroid dehydrogenase deficiency',
 u'17 alpha hydroxylase deficiency',
 u'17 beta hydroxysteroide dehydrogenase deficiency',
 u'17q21.31 microdeletion syndrome',
 u'18-Hydroxylase deficiency',
 u'18p deletion syndrome',
 u'1p36 deletion syndrome',
 u'2,8 dihydroxy-adenine urolithiasis',
 u'2-Hydroxyglutaricaciduria',
 u'2-Methylacetoacetyl CoA thiolase deficiency',
 u'2-hydroxyethyl methacrylate sensitization',
 u'2-hydroxyglutaricaciduria',
 u'21 hydroxylase deficiency',
 u'22q11.2 deletion syndrome',
 u'3-M syndrome',
 u'3-hydroxy 3-methyl glutaryl-coa lyase deficiency',
 u'3-hydroxyacyl-coa dehydrogenase deficiency',
 u'3-methyl crotonyl-coa carboxylase deficiency',
 u'3-methyl glutaconic aciduria',
 u'3C syndrome',
 u'3 alpha methylcrotonyl-Coa carboxylase 1 deficiency',
 u'3 alpha methylcrotonyl-coa carboxylase 2 deficiency',
 u'3 alpha methylglutaconic aciduria, type

## Indexing the disease pages

In [12]:
#the checklist is an array containing an array of allowed "first characters". If a disease does not comply, we skip it
#checkList[i] belongs to diseaseListArray[i], so the list pages must be in the order 0-9, A, B, ..., Z
checkList = [["0","1","2","3","4","5","6","7","8","9"],["A"],["B"],["C"],["D"],["E"],["F"],["G"],["H"],["I"],["J"],["K"],["L"],["M"],["N"],["O"],["P"],["Q"],["R"],["S"],["T"],["U"],["V"],["W"],["X"],["Y"],["Z"]]
docType = 'diseases' #document type we will index
failedDiseases = [] #(link title, exception) for every link that could not be fetched or indexed
for diseaselistNumber, diseaselist in enumerate(diseaseListArray):  #loop through disease lists
    for disease in diseaselist.links: #loop through lists of links for every disease list
        try:
            #first check if it is a disease, then index it
            #startswith("List") skips links to other list pages; the previous test
            #disease[0:3] != "List" compared 3 characters with a 4-character string and was always True
            if disease[0] in checkList[diseaselistNumber] and not disease.startswith("List"):
                currentPage = wikipedia.page(disease)
                #the link title is used as the document id, so indexing the same title again targets the same id
                client.index(index=indexName, doc_type=docType,id = disease, body={"name": disease, "title":currentPage.title , "fulltext":currentPage.content})
        except Exception as e:
            #best effort: keep going, but remember what was skipped instead of hiding it
            failedDiseases.append((disease, e))
#one-line summary; inspect failedDiseases for the individual errors
print("%d links could not be fetched or indexed" % len(failedDiseases))