In [24]:
from elasticsearch import Elasticsearch

In [25]:
# Name of the Elasticsearch index used throughout this notebook.
INDEX_NAME = "netant-terms"

# Client created with no arguments, i.e. the library's default connection
# settings (presumably a local node -- confirm for your environment).
es = Elasticsearch()

# Request body for creating the term index: index settings plus field mappings.
create_index_body = {
    # just one shard, no replicas for testing
    "settings": dict(number_of_shards=1, number_of_replicas=0),
    "mappings": {
        "properties": {
            # Exact-match copy of the term.
            "term": dict(type="keyword"),
            # Completion-suggester field; the "keyword" analyzer is used both
            # at index time and at search time.
            "term_suggest": dict(
                type="completion",
                analyzer="keyword",
                search_analyzer="keyword",
            ),
            "term_type": dict(type="text"),
            "source": dict(type="text"),
        },
    },
}

In [26]:
# Start from a clean index so this cell can be re-run at any time.
# ignore=[400, 404]: on a fresh cluster the index does not exist yet and the
# delete would otherwise raise NotFoundError, breaking "Restart & Run All".
es.indices.delete(index=INDEX_NAME, ignore=[400, 404])
# ignore=400: do not raise if the index already exists.
es.indices.create(index=INDEX_NAME, ignore=400, body=create_index_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'netant-terms'}

In [27]:
# Read the whole term file into memory. The context manager closes the handle
# as soon as the read finishes instead of leaving it open for the kernel's
# lifetime.
with open("./all_words_type_freq_gt_15.txt", "r") as f:
    all_words = f.read()

In [28]:
# One list entry per line of the input file, line terminators removed.
lines = all_words.splitlines(keepends=False)
# Show the number of records as the cell output.
len(lines)

234545

In [29]:
# Peek at the first ten raw records to check the tab-separated layout.
lines[:10]

['#270400 smith-lemli-opitz syndrome; slos;;slo syndrome;;rsh syndrome;;rutledge lethal multiple congenital anomaly syndrome;;polydactyly, sex reversal, renal hypoplasia, and unilobar lung;;lethal acrodysgenital syndrome\tdisease\thpo',
 'rnf7p1\tgene\tncbi',
 'cd120b\tgene\tncbi',
 'cd120a\tgene\tncbi',
 'cd257\tgene\tncbi',
 'establishment of mitotic spindle localization\tfunction\tGO',
 'mll1 complex\tfunction\tGO',
 'cd256\tgene\tncbi',
 'regulation of production of sirna involved in chromatin silencing by small rna\tfunction\tGO',
 'neurl1\tgene\tmonarch']

In [30]:
def batch_insert_data(lines, index=None):
    """Yield one bulk-index action per tab-separated record.

    Each non-blank line is split on tab characters; the first three fields
    are used as the term, the term type and the source. The term is stored
    twice: in ``term`` and in ``term_suggest``.

    Args:
        lines: Iterable of strings, one record per string.
        index: Name of the target index. Defaults to the module-level
            ``INDEX_NAME`` when omitted.

    Yields:
        dict: An action with the keys ``_index``, ``term``, ``term_suggest``,
        ``term_type`` and ``source``.

    Raises:
        IndexError: If a non-blank line has fewer than three
            tab-separated fields.
    """
    target_index = INDEX_NAME if index is None else index
    for line in lines:
        # Blank or whitespace-only lines carry no record; skip them rather
        # than aborting the bulk load part-way through with an IndexError.
        if not line.strip():
            continue
        row = line.split('\t')
        yield {
            "_index": target_index,
            "term": row[0],
            "term_suggest": row[0],
            "term_type": row[1],
            "source": row[2]
        }

In [31]:
from elasticsearch import helpers

# Lazily generated stream of index actions, one per input record.
index_actions = batch_insert_data(lines)
# Send the actions through the bulk helper; the call's return value is the
# cell output.
helpers.bulk(es, index_actions)

(234545, [])

In [32]:
# Full-text query_string search for either term, capped at 100 hits.
# Defined here but not sent by this cell.
query1 = {
    "query": {"query_string": {"query": "brca OR tp53"}},
    "size": 100,
}

# Completion suggestion on the "term_suggest" field for the prefix "brca":
# fuzzy matching with automatic fuzziness, duplicates skipped, up to 20 options.
query2 = {
    "suggest": {
        "term-suggest": {
            "text": "brca",
            "completion": {
                "field": "term_suggest",
                "skip_duplicates": True,
                "size": 20,
                "fuzzy": {"fuzziness": "AUTO"},
            },
        },
    },
}

# Run the suggestion request and display the raw response.
res = es.search(index=INDEX_NAME, body=query2)
res

{'took': 21,
 'timed_out': False,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0},
 'hits': {'total': {'value': 0, 'relation': 'eq'},
  'max_score': None,
  'hits': []},
 'suggest': {'term-suggest': [{'text': 'brca',
    'offset': 0,
    'length': 4,
    'options': [{'text': 'brca1',
      '_index': 'netant-terms',
      '_type': '_doc',
      '_id': 'ivn6i2sBNIhWNO-7JxFu',
      '_score': 3.0,
      '_source': {'term': 'brca1',
       'term_suggest': 'brca1',
       'term_type': 'gene',
       'source': 'CCLE'}},
     {'text': 'brca1 protein',
      '_index': 'netant-terms',
      '_type': '_doc',
      '_id': 'n_j6i2sBNIhWNO-7HHeJ',
      '_score': 3.0,
      '_source': {'term': 'brca1 protein',
       'term_suggest': 'brca1 protein',
       'term_type': 'drug',
       'source': 'mesh'}},
     {'text': 'brca1-a complex',
      '_index': 'netant-terms',
      '_type': '_doc',
      '_id': '-vr6i2sBNIhWNO-7PStG',
      '_score': 3.0,
      '_source': {'term': 'brca