## This notebook contains the code for creating the search index with ElasticSearch using the entity labels 

#### Required Files: 
relations_labels.csv

In [1]:
import pandas as pd

In [2]:
import re
def remove_emojis(data):
    emoj = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
    return re.search(emoj, data)

In [3]:
RELATIONS_FILE = "relations_labels.csv"

In [4]:
labels_df = pd.read_csv(RELATIONS_FILE)

print("Starting: ", len(labels_df))


Starting:  964


In [5]:
labels_df.head()

Unnamed: 0,uri,label
0,<http://www.wikidata.org/entity/P4882>,segmental innervation
1,<http://www.wikidata.org/entity/P1339>,number injured
2,<http://www.wikidata.org/entity/P1339>,injury toll
3,<http://www.wikidata.org/entity/P1339>,number of injured
4,<http://www.wikidata.org/entity/P1339>,injured


### Connecting to elasticsearch runtime

In [6]:
import elasticsearch
config = {
    'host': 'localhost'
}
es = elasticsearch.Elasticsearch([config,], timeout=300)

### Creating a new index in elasticsearch and populating it with the predicates' labels

In [7]:
RELATION_INDEX = "wikidata_bio_relation_index"

if es.indices.exists(RELATION_INDEX):
    es.indices.delete(index=RELATION_INDEX)

es.indices.create(index=RELATION_INDEX)

  if es.indices.exists(RELATION_INDEX):


{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'wikidata_bio_relation_index'}

In [8]:
from elasticsearch import helpers
import tqdm.notebook as tq

def doc_generator(df):
    df_iter = df.iterrows()
    for index, document in tq.tqdm(df_iter):
#         print(document.to_dict())
        yield {
                "_index": 'wikidata_bio_relation_index',
                "_type": "_doc",
                "_id" : f"{index+1}",
                "_source": document.to_dict(),
            }
    print("Complete")

elasticsearch.helpers.bulk(es, doc_generator(labels_df))

0it [00:00, ?it/s]

Complete




(964, [])

### Sanity Check: Is the Index working

In [9]:
indexName="wikidata_bio_relation_index"
elasticResults=es.search(index=indexName, body={
        "from": 0,
        "size": 100,
        "query": {
            "match": {
                "label":"associated"
            }
        }
    })

print(elasticResults)

{'took': 0, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 5.972643, 'hits': [{'_index': 'wikidata_bio_relation_index', '_type': '_doc', '_id': '281', '_score': 5.972643, '_source': {'uri': '<http://www.wikidata.org/entity/P3335>', 'label': 'associated hazard'}}, {'_index': 'wikidata_bio_relation_index', '_type': '_doc', '_id': '706', '_score': 5.037065, '_source': {'uri': '<http://www.wikidata.org/entity/P868>', 'label': 'foods traditionally associated'}}, {'_index': 'wikidata_bio_relation_index', '_type': '_doc', '_id': '848', '_score': 5.037065, '_source': {'uri': '<http://www.wikidata.org/entity/P2293>', 'label': 'genetically associated with'}}]}}


  elasticResults=es.search(index=indexName, body={
